[2026-03-23 06:22:22,947] [WARNING] [huggingface_hub.utils._http._warn_on_warning_headers:916] [PID:180] Warning: You are sending unauthenticated requests to the HF Hub. Please set a HF_TOKEN to enable higher rate limits and faster downloads.
config.json:   0%|                                                | 0.00/675 [00:00<?, ?B/s]config.json: 100%|█████████████████████████████████████████| 675/675 [00:00<00:00, 6.18MB/s]
[2026-03-23 06:22:22,974] [DEBUG] [axolotl.utils.config.log_gpu_memory_usage:127] [PID:180] baseline 0.000GB (+0.000GB allocated, +0.002GB reserved)
[2026-03-23 06:22:22,975] [INFO] [axolotl.cli.config.load_cfg:341] [PID:180] config:
{
  "activation_offloading": false,
  "adapter": "lora",
  "auto_resume_from_checkpoints": true,
  "axolotl_config_path": "train.yml",
  "base_model": "PicoKittens/PicoMistral-23M",
  "base_model_config": "PicoKittens/PicoMistral-23M",
  "batch_size": 4,
  "bf16": false,
  "capabilities": {
    "bf16": true,
    "compute_capability": "sm_75",
    "fp8": false,
    "n_gpu": 1,
    "n_node": 1,
    "tf32": false
  },
  "context_parallel_size": 1,
  "dataloader_num_workers": 1,
  "dataloader_pin_memory": true,
  "dataloader_prefetch_factor": 256,
  "dataset_num_proc": 16,
  "datasets": [
    {
      "message_property_mappings": {
        "content": "content",
        "role": "role"
      },
      "path": "psychopenguin/indian_legal_dataset_qna",
      "split": "train[:2%]",
      "trust_remote_code": false,
      "type": "alpaca"
    }
  ],
  "ddp": false,
  "device": "cuda:0",
  "dion_rank_fraction": 1.0,
  "dion_rank_multiple_of": 1,
  "eaft_alpha": 1.0,
  "eaft_k": 20,
  "early_stopping_patience": 3,
  "env_capabilities": {
    "torch_version": "2.9.1"
  },
  "eval_batch_size": 2,
  "eval_causal_lm_metrics": [
    "sacrebleu",
    "comet",
    "ter",
    "chrf"
  ],
  "eval_max_new_tokens": 128,
  "eval_steps": 100,
  "eval_strategy": "steps",
  "eval_table_size": 0,
  "experimental_skip_move_to_device": true,
  "fp16": true,
  "generate_samples": false,
  "generation_do_sample": true,
  "generation_max_new_tokens": 50,
  "generation_prompt_ratio": 0.5,
  "generation_temperature": 0.7,
  "gradient_accumulation_steps": 2,
  "gradient_checkpointing": false,
  "hf_use_auth_token": true,
  "hub_model_id": "psychopenguin/t1",
  "include_tkps": true,
  "is_falcon_derived_model": false,
  "is_llama_derived_model": false,
  "is_mistral_derived_model": true,
  "learning_rate": 0.0002,
  "lisa_layers_attribute": "model.layers",
  "load_best_model_at_end": true,
  "load_in_4bit": false,
  "load_in_8bit": false,
  "local_rank": 0,
  "lora_alpha": 8,
  "lora_dropout": 0.05,
  "lora_r": 4,
  "lora_target_linear": true,
  "loraplus_lr_embedding": 1e-06,
  "lr_scheduler": "cosine",
  "mean_resizing_embeddings": false,
  "merge_lora": true,
  "micro_batch_size": 2,
  "model_config_type": "mistral",
  "neftune_noise_alpha": 5.0,
  "num_epochs": 3.0,
  "num_generation_samples": 3,
  "optimizer": "adamw_bnb_8bit",
  "otel_metrics_host": "localhost",
  "otel_metrics_port": 8000,
  "output_dir": "./final_model",
  "pretrain_multipack_attn": true,
  "profiler_steps_start": 0,
  "qlora_sharded_model_loading": false,
  "quantize_moe_experts": false,
  "ray_num_workers": 1,
  "resources_per_worker": {
    "GPU": 1
  },
  "sample_packing_bin_size": 200,
  "sample_packing_group_size": 100000,
  "save_only_model": false,
  "save_safetensors": true,
  "save_steps": 100,
  "sdp_attention": true,
  "seed": 9,
  "sequence_len": 256,
  "shuffle_before_merging_datasets": false,
  "shuffle_merged_datasets": true,
  "skip_prepare_dataset": false,
  "streaming_multipack_buffer_size": 10000,
  "strict": false,
  "tensor_parallel_size": 1,
  "tf32": false,
  "tiled_mlp_use_original_mlp": true,
  "tokenizer_config": "PicoKittens/PicoMistral-23M",
  "tokenizer_save_jinja_files": true,
  "tokenizer_type": "AutoTokenizer",
  "tokenizer_use_fast": true,
  "torch_dtype": "torch.float16",
  "train_on_inputs": false,
  "trl": {
    "async_prefetch": false,
    "log_completions": false,
    "mask_truncated_completions": false,
    "ref_model_mixup_alpha": 0.9,
    "ref_model_sync_steps": 64,
    "replay_buffer_size": 0,
    "replay_recompute_logps": true,
    "reroll_max_groups": 1,
    "reroll_start_fraction": 1.0,
    "reward_num_workers": 1,
    "scale_rewards": true,
    "skip_zero_advantage_batches": true,
    "sync_ref_model": false,
    "use_data_producer": false,
    "use_vllm": false,
    "vllm_lora_sync": false,
    "vllm_server_host": "0.0.0.0",
    "vllm_server_port": 8000
  },
  "trust_remote_code": true,
  "type_of_model": "AutoModelForCausalLM",
  "use_otel_metrics": false,
  "use_ray": false,
  "use_wandb": true,
  "val_set_size": 0.3,
  "vllm": {
    "device": "auto",
    "dtype": "auto",
    "gpu_memory_utilization": 0.9,
    "host": "0.0.0.0",
    "port": 8000
  },
  "wandb_name": "t1",
  "wandb_project": "tttt",
  "weight_decay": 0.0,
  "world_size": 1
}
[2026-03-23 06:22:22,976] [WARNING] [axolotl.cli.checks.check_user_token:46] [PID:180] Error verifying HuggingFace token. Remember to log in using `hf auth login` and get your access token from https://huggingface.co/settings/tokens if you want to use gated models or datasets.
tokenizer_config.json:   0%|                                      | 0.00/307 [00:00<?, ?B/s]tokenizer_config.json: 100%|███████████████████████████████| 307/307 [00:00<00:00, 3.62MB/s]
tokenizer.json: 0.00B [00:00, ?B/s]tokenizer.json: 1.14MB [00:00, 105MB/s]
[2026-03-23 06:22:26,592] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:299] [PID:180] EOS: 2 / [EOS]
[2026-03-23 06:22:26,592] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:300] [PID:180] BOS: 1 / [BOS]
[2026-03-23 06:22:26,592] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:301] [PID:180] PAD: 0 / [PAD]
[2026-03-23 06:22:26,592] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:302] [PID:180] UNK: 3 / [UNK]
[2026-03-23 06:22:26,592] [INFO] [axolotl.loaders.tokenizer.load_tokenizer:316] [PID:180] No Chat template selected. Consider adding a chat template for easier inference.
[2026-03-23 06:22:26,592] [INFO] [axolotl.utils.data.shared.load_preprocessed_dataset:480] [PID:180] Unable to find prepared dataset in last_run_prepared/d5a17e3528edff9ec955ba25ed6f7604
[2026-03-23 06:22:26,592] [INFO] [axolotl.utils.data.sft._load_raw_datasets:320] [PID:180] Loading raw datasets...
[2026-03-23 06:22:26,593] [WARNING] [axolotl.utils.data.sft._load_raw_datasets:322] [PID:180] Processing datasets during training can lead to VRAM instability. Please pre-process your dataset using `axolotl preprocess path/to/config.yml`.
[2026-03-23 06:22:26,597] [ERROR] [axolotl.telemetry.errors.wrapper:158] [PID:180] Error captured in telemetry. Run ID: 14d7850f-db08-45a0-ac88-328f852e04fa
Traceback (most recent call last):
  File "<frozen runpy>", line 198, in _run_module_as_main
  File "<frozen runpy>", line 88, in _run_code
  File "/workspace/axolotl/src/axolotl/cli/train.py", line 121, in <module>
    fire.Fire(do_cli)
  File "/root/miniconda3/envs/py3.11/lib/python3.11/site-packages/fire/core.py", line 135, in Fire
    component_trace = _Fire(component, args, parsed_flag_args, context, name)
                      ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/root/miniconda3/envs/py3.11/lib/python3.11/site-packages/fire/core.py", line 468, in _Fire
    component, remaining_args = _CallAndUpdateTrace(
                                ^^^^^^^^^^^^^^^^^^^^
  File "/root/miniconda3/envs/py3.11/lib/python3.11/site-packages/fire/core.py", line 684, in _CallAndUpdateTrace
    component = fn(*varargs, **kwargs)
                ^^^^^^^^^^^^^^^^^^^^^^
  File "/workspace/axolotl/src/axolotl/cli/train.py", line 88, in do_cli
    return do_train(parsed_cfg, parsed_cli_args)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/workspace/axolotl/src/axolotl/cli/train.py", line 43, in do_train
    dataset_meta = load_datasets(cfg=cfg, cli_args=cli_args)
                   ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/workspace/axolotl/src/axolotl/telemetry/errors.py", line 127, in wrapper
    return func(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^
  File "/workspace/axolotl/src/axolotl/common/datasets.py", line 61, in load_datasets
    train_dataset, eval_dataset, total_num_steps, prompters = prepare_datasets(
                                                              ^^^^^^^^^^^^^^^^^
  File "/workspace/axolotl/src/axolotl/utils/data/utils.py", line 50, in wrapper
    return func(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^
  File "/workspace/axolotl/src/axolotl/utils/data/sft.py", line 65, in prepare_datasets
    return _prepare_standard_dataset(cfg, tokenizer, processor)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/workspace/axolotl/src/axolotl/utils/data/sft.py", line 98, in _prepare_standard_dataset
    train_dataset, eval_dataset, prompters = loader.load(_load_datasets)
                                             ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/workspace/axolotl/src/axolotl/utils/data/lock.py", line 38, in load
    result = load_fn()
             ^^^^^^^^^
  File "/workspace/axolotl/src/axolotl/utils/data/sft.py", line 77, in _load_datasets
    train_dataset, eval_dataset, prompters = _load_and_prepare_datasets(
                                             ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/workspace/axolotl/src/axolotl/utils/data/sft.py", line 492, in _load_and_prepare_datasets
    dataset, prompters = _load_tokenized_prepared_datasets(
                         ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/workspace/axolotl/src/axolotl/utils/data/sft.py", line 299, in _load_tokenized_prepared_datasets
    dataset, prompters = _load_raw_datasets(
                         ^^^^^^^^^^^^^^^^^^^
  File "/workspace/axolotl/src/axolotl/utils/data/sft.py", line 331, in _load_raw_datasets
    dataset_wrapper, dataset_prompter = _load_and_process_single_dataset(
                                        ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/workspace/axolotl/src/axolotl/utils/data/sft.py", line 380, in _load_and_process_single_dataset
    dataset = load_dataset_with_config(
              ^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/workspace/axolotl/src/axolotl/utils/data/shared.py", line 121, in load_dataset_with_config
    is_hub_dataset = _check_if_hub_dataset(dataset_config, use_auth_token)
                     ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/workspace/axolotl/src/axolotl/utils/data/shared.py", line 154, in _check_if_hub_dataset
    snapshot_download(
  File "/root/miniconda3/envs/py3.11/lib/python3.11/site-packages/huggingface_hub/utils/_validators.py", line 89, in _inner_fn
    return fn(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^
  File "/root/miniconda3/envs/py3.11/lib/python3.11/site-packages/huggingface_hub/_snapshot_download.py", line 240, in snapshot_download
    repo_info = api.repo_info(repo_id=repo_id, repo_type=repo_type, revision=revision)
                ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/root/miniconda3/envs/py3.11/lib/python3.11/site-packages/huggingface_hub/utils/_validators.py", line 89, in _inner_fn
    return fn(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^
  File "/root/miniconda3/envs/py3.11/lib/python3.11/site-packages/huggingface_hub/hf_api.py", line 3146, in repo_info
    return method(
           ^^^^^^^
  File "/root/miniconda3/envs/py3.11/lib/python3.11/site-packages/huggingface_hub/utils/_validators.py", line 89, in _inner_fn
    return fn(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^
  File "/root/miniconda3/envs/py3.11/lib/python3.11/site-packages/huggingface_hub/hf_api.py", line 2997, in dataset_info
    headers = self._build_hf_headers(token=token)
              ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/root/miniconda3/envs/py3.11/lib/python3.11/site-packages/huggingface_hub/hf_api.py", line 10083, in _build_hf_headers
    return build_hf_headers(
           ^^^^^^^^^^^^^^^^^
  File "/root/miniconda3/envs/py3.11/lib/python3.11/site-packages/huggingface_hub/utils/_validators.py", line 89, in _inner_fn
    return fn(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^
  File "/root/miniconda3/envs/py3.11/lib/python3.11/site-packages/huggingface_hub/utils/_headers.py", line 110, in build_hf_headers
    token_to_send = get_token_to_send(token)
                    ^^^^^^^^^^^^^^^^^^^^^^^^
  File "/root/miniconda3/envs/py3.11/lib/python3.11/site-packages/huggingface_hub/utils/_headers.py", line 143, in get_token_to_send
    raise LocalTokenNotFoundError(
huggingface_hub.errors.LocalTokenNotFoundError: Token is required (`token=True`), but no token found. You need to provide a token or be logged in to Hugging Face with `hf auth login` or `huggingface_hub.login`. See https://huggingface.co/settings/tokens.
[2026-03-23 06:23:31,392] [DEBUG] [axolotl.utils.config.log_gpu_memory_usage:127] [PID:323] baseline 0.000GB (+0.000GB allocated, +0.002GB reserved)
[2026-03-23 06:23:31,393] [INFO] [axolotl.cli.config.load_cfg:341] [PID:323] config:
{
  "activation_offloading": false,
  "adapter": "lora",
  "auto_resume_from_checkpoints": true,
  "axolotl_config_path": "train.yml",
  "base_model": "PicoKittens/PicoMistral-23M",
  "base_model_config": "PicoKittens/PicoMistral-23M",
  "batch_size": 4,
  "bf16": false,
  "capabilities": {
    "bf16": true,
    "compute_capability": "sm_75",
    "fp8": false,
    "n_gpu": 1,
    "n_node": 1,
    "tf32": false
  },
  "context_parallel_size": 1,
  "dataloader_num_workers": 1,
  "dataloader_pin_memory": true,
  "dataloader_prefetch_factor": 256,
  "dataset_num_proc": 16,
  "datasets": [
    {
      "message_property_mappings": {
        "content": "content",
        "role": "role"
      },
      "path": "psychopenguin/indian_legal_dataset_qna",
      "split": "train[:2%]",
      "trust_remote_code": false,
      "type": "alpaca"
    }
  ],
  "ddp": false,
  "device": "cuda:0",
  "dion_rank_fraction": 1.0,
  "dion_rank_multiple_of": 1,
  "eaft_alpha": 1.0,
  "eaft_k": 20,
  "early_stopping_patience": 3,
  "env_capabilities": {
    "torch_version": "2.9.1"
  },
  "eval_batch_size": 2,
  "eval_causal_lm_metrics": [
    "sacrebleu",
    "comet",
    "ter",
    "chrf"
  ],
  "eval_max_new_tokens": 128,
  "eval_steps": 100,
  "eval_strategy": "steps",
  "eval_table_size": 0,
  "experimental_skip_move_to_device": true,
  "fp16": true,
  "generate_samples": false,
  "generation_do_sample": true,
  "generation_max_new_tokens": 50,
  "generation_prompt_ratio": 0.5,
  "generation_temperature": 0.7,
  "gradient_accumulation_steps": 2,
  "gradient_checkpointing": false,
  "hf_use_auth_token": true,
  "hub_model_id": "psychopenguin/t1",
  "include_tkps": true,
  "is_falcon_derived_model": false,
  "is_llama_derived_model": false,
  "is_mistral_derived_model": true,
  "learning_rate": 0.0002,
  "lisa_layers_attribute": "model.layers",
  "load_best_model_at_end": true,
  "load_in_4bit": false,
  "load_in_8bit": false,
  "local_rank": 0,
  "lora_alpha": 8,
  "lora_dropout": 0.05,
  "lora_r": 4,
  "lora_target_linear": true,
  "loraplus_lr_embedding": 1e-06,
  "lr_scheduler": "cosine",
  "mean_resizing_embeddings": false,
  "merge_lora": true,
  "micro_batch_size": 2,
  "model_config_type": "mistral",
  "neftune_noise_alpha": 5.0,
  "num_epochs": 3.0,
  "num_generation_samples": 3,
  "optimizer": "adamw_bnb_8bit",
  "otel_metrics_host": "localhost",
  "otel_metrics_port": 8000,
  "output_dir": "./final_model",
  "pretrain_multipack_attn": true,
  "profiler_steps_start": 0,
  "qlora_sharded_model_loading": false,
  "quantize_moe_experts": false,
  "ray_num_workers": 1,
  "resources_per_worker": {
    "GPU": 1
  },
  "sample_packing_bin_size": 200,
  "sample_packing_group_size": 100000,
  "save_only_model": false,
  "save_safetensors": true,
  "save_steps": 100,
  "sdp_attention": true,
  "seed": 9,
  "sequence_len": 256,
  "shuffle_before_merging_datasets": false,
  "shuffle_merged_datasets": true,
  "skip_prepare_dataset": false,
  "streaming_multipack_buffer_size": 10000,
  "strict": false,
  "tensor_parallel_size": 1,
  "tf32": false,
  "tiled_mlp_use_original_mlp": true,
  "tokenizer_config": "PicoKittens/PicoMistral-23M",
  "tokenizer_save_jinja_files": true,
  "tokenizer_type": "AutoTokenizer",
  "tokenizer_use_fast": true,
  "torch_dtype": "torch.float16",
  "train_on_inputs": false,
  "trl": {
    "async_prefetch": false,
    "log_completions": false,
    "mask_truncated_completions": false,
    "ref_model_mixup_alpha": 0.9,
    "ref_model_sync_steps": 64,
    "replay_buffer_size": 0,
    "replay_recompute_logps": true,
    "reroll_max_groups": 1,
    "reroll_start_fraction": 1.0,
    "reward_num_workers": 1,
    "scale_rewards": true,
    "skip_zero_advantage_batches": true,
    "sync_ref_model": false,
    "use_data_producer": false,
    "use_vllm": false,
    "vllm_lora_sync": false,
    "vllm_server_host": "0.0.0.0",
    "vllm_server_port": 8000
  },
  "trust_remote_code": true,
  "type_of_model": "AutoModelForCausalLM",
  "use_otel_metrics": false,
  "use_ray": false,
  "use_wandb": true,
  "val_set_size": 0.3,
  "vllm": {
    "device": "auto",
    "dtype": "auto",
    "gpu_memory_utilization": 0.9,
    "host": "0.0.0.0",
    "port": 8000
  },
  "wandb_name": "t1",
  "wandb_project": "tttt",
  "weight_decay": 0.0,
  "world_size": 1
}
[2026-03-23 06:23:33,247] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:299] [PID:323] EOS: 2 / [EOS]
[2026-03-23 06:23:33,247] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:300] [PID:323] BOS: 1 / [BOS]
[2026-03-23 06:23:33,247] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:301] [PID:323] PAD: 0 / [PAD]
[2026-03-23 06:23:33,247] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:302] [PID:323] UNK: 3 / [UNK]
[2026-03-23 06:23:33,247] [INFO] [axolotl.loaders.tokenizer.load_tokenizer:316] [PID:323] No Chat template selected. Consider adding a chat template for easier inference.
[2026-03-23 06:23:33,248] [INFO] [axolotl.utils.data.shared.load_preprocessed_dataset:480] [PID:323] Unable to find prepared dataset in last_run_prepared/d5a17e3528edff9ec955ba25ed6f7604
[2026-03-23 06:23:33,248] [INFO] [axolotl.utils.data.sft._load_raw_datasets:320] [PID:323] Loading raw datasets...
[2026-03-23 06:23:33,248] [WARNING] [axolotl.utils.data.sft._load_raw_datasets:322] [PID:323] Processing datasets during training can lead to VRAM instability. Please pre-process your dataset using `axolotl preprocess path/to/config.yml`.
Downloading (incomplete total...): 0.00B [00:00, ?B/s]
Fetching 0 files: 0it [00:00, ?it/s][AFetching 0 files: 0it [00:00, ?it/s]
Download complete: : 0.00B [00:00, ?B/s]              Download complete: : 0.00B [00:01, ?B/s]
train.json:   0%|                                               | 0.00/16.9M [00:00<?, ?B/s]train.json:   0%|                                               | 0.00/16.9M [00:00<?, ?B/s]train.json:   0%|                                               | 0.00/16.9M [00:00<?, ?B/s]train.json:   0%|                                               | 0.00/16.9M [00:00<?, ?B/s]train.json:   0%|                                               | 0.00/16.9M [00:00<?, ?B/s]train.json:   0%|                                               | 0.00/16.9M [00:01<?, ?B/s]train.json:   0%|                                               | 0.00/16.9M [00:01<?, ?B/s]train.json: 100%|██████████████████████████████████████| 16.9M/16.9M [00:01<00:00, 84.3MB/s]train.json: 100%|██████████████████████████████████████| 16.9M/16.9M [00:01<00:00, 11.7MB/s]
Generating train split:   0%|                              | 0/37048 [00:00<?, ? examples/s]Generating train split: 100%|██████████████| 37048/37048 [00:00<00:00, 371365.76 examples/s]
[2026-03-23 06:23:38,788] [INFO] [axolotl.utils.data.wrappers.get_dataset_wrapper:87] [PID:323] Loading dataset: psychopenguin/indian_legal_dataset_qna with base_type: alpaca and prompt_style: None
Tokenizing Prompts (num_proc=16):   0%|                      | 0/741 [00:00<?, ? examples/s]Tokenizing Prompts (num_proc=16):   6%|▊           | 47/741 [00:00<00:03, 174.10 examples/s]Tokenizing Prompts (num_proc=16):  81%|████████▏ | 603/741 [00:00<00:00, 2026.23 examples/s]Tokenizing Prompts (num_proc=16): 100%|██████████| 741/741 [00:00<00:00, 1577.59 examples/s]
[2026-03-23 06:23:39,283] [INFO] [axolotl.utils.data.utils._log_dataset_stats:212] [PID:323] min_input_len: 46
[2026-03-23 06:23:39,284] [INFO] [axolotl.utils.data.utils._log_dataset_stats:213] [PID:323] max_input_len: 333
Dropping Invalid Sequences (<None or >256) (num_proc=16):   0%| | 0/741 [00:00<?, ? examplesDropping Invalid Sequences (<None or >256) (num_proc=16):   6%| | 47/741 [00:00<00:02, 286.9Dropping Invalid Sequences (<None or >256) (num_proc=16): 100%|█| 741/741 [00:00<00:00, 2738
[2026-03-23 06:23:39,577] [INFO] [axolotl.utils.data.utils._drop_outside_range:306] [PID:323] Dropped 13 sequences outside valid range ([None, 256])
Saving the dataset (0/2 shards):   0%|                       | 0/728 [00:00<?, ? examples/s]Saving the dataset (1/2 shards):  50%|█████▌     | 364/728 [00:00<00:00, 6687.90 examples/s]Saving the dataset (2/2 shards): 100%|██████████| 728/728 [00:00<00:00, 12972.22 examples/s]Saving the dataset (2/2 shards): 100%|███████████| 728/728 [00:00<00:00, 6115.10 examples/s]
[2026-03-23 06:23:39,722] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:420] [PID:323] total_num_tokens: 61_212
[2026-03-23 06:23:39,725] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:438] [PID:323] `total_supervised_tokens: 41_580`
[2026-03-23 06:23:39,725] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:521] [PID:323] total_num_steps: 382
[2026-03-23 06:23:39,725] [INFO] [axolotl.utils.data.sft._prepare_standard_dataset:121] [PID:323] Maximum number of steps set at 382
[2026-03-23 06:23:39,726] [DEBUG] [axolotl.train.setup_model_and_tokenizer:70] [PID:323] loading tokenizer... PicoKittens/PicoMistral-23M
[2026-03-23 06:23:41,243] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:299] [PID:323] EOS: 2 / [EOS]
[2026-03-23 06:23:41,244] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:300] [PID:323] BOS: 1 / [BOS]
[2026-03-23 06:23:41,244] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:301] [PID:323] PAD: 0 / [PAD]
[2026-03-23 06:23:41,244] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:302] [PID:323] UNK: 3 / [UNK]
[2026-03-23 06:23:41,244] [INFO] [axolotl.loaders.tokenizer.load_tokenizer:316] [PID:323] No Chat template selected. Consider adding a chat template for easier inference.
[2026-03-23 06:23:41,244] [DEBUG] [axolotl.train.setup_model_and_tokenizer:81] [PID:323] Loading model
[2026-03-23 06:23:41,611] [DEBUG] [axolotl.monkeypatch.transformers.trainer_loss_calc.patch_evaluation_loop:91] [PID:323] Patched Trainer.evaluation_loop with nanmean loss calculation
[2026-03-23 06:23:41,612] [DEBUG] [axolotl.monkeypatch.transformers.trainer_loss_calc.patch_maybe_log_save_evaluate:142] [PID:323] Patched Trainer._maybe_log_save_evaluate with nanmean loss calculation
model.safetensors:   0%|                                        | 0.00/94.4M [00:00<?, ?B/s]model.safetensors:   0%|                                        | 0.00/94.4M [00:00<?, ?B/s]model.safetensors:   0%|                                        | 0.00/94.4M [00:00<?, ?B/s]model.safetensors:   0%|                                        | 0.00/94.4M [00:00<?, ?B/s]model.safetensors:   0%|                                        | 0.00/94.4M [00:00<?, ?B/s]model.safetensors:   0%|                                        | 0.00/94.4M [00:01<?, ?B/s]model.safetensors:   0%|                                        | 0.00/94.4M [00:01<?, ?B/s]model.safetensors:   0%|                                        | 0.00/94.4M [00:01<?, ?B/s]model.safetensors:   0%|                                        | 0.00/94.4M [00:01<?, ?B/s]model.safetensors:   0%|                                        | 0.00/94.4M [00:01<?, ?B/s]model.safetensors:   0%|                                        | 0.00/94.4M [00:02<?, ?B/s]model.safetensors:   0%|                                        | 0.00/94.4M [00:02<?, ?B/s]model.safetensors:   0%|                                        | 0.00/94.4M [00:02<?, ?B/s]model.safetensors:   0%|                                        | 0.00/94.4M [00:02<?, ?B/s]model.safetensors: 100%|████████████████████████████████| 94.4M/94.4M [00:02<00:00, 471MB/s]model.safetensors: 100%|███████████████████████████████| 94.4M/94.4M [00:02<00:00, 33.7MB/s]
Loading weights:   0%|                                               | 0/74 [00:00<?, ?it/s]Loading weights: 100%|████████████████████████████████████| 74/74 [00:00<00:00, 7797.87it/s]
generation_config.json:   0%|                                     | 0.00/215 [00:00<?, ?B/s]generation_config.json: 100%|██████████████████████████████| 215/215 [00:00<00:00, 1.24MB/s]
[2026-03-23 06:23:46,645] [INFO] [axolotl.loaders.model._configure_embedding_dtypes:361] [PID:323] Converting modules to torch.float16
[2026-03-23 06:23:46,650] [DEBUG] [axolotl.loaders.model.log_gpu_memory_usage:127] [PID:323] Memory usage after model load 0.000GB (+0.002GB reserved)
[2026-03-23 06:23:46,650] [INFO] [axolotl.loaders.adapter.load_lora:81] [PID:323] found linear modules: ['down_proj', 'gate_proj', 'k_proj', 'o_proj', 'q_proj', 'up_proj', 'v_proj']
trainable params: 266,240 || all params: 23,865,728 || trainable%: 1.1156
[2026-03-23 06:23:46,679] [DEBUG] [axolotl.loaders.model.log_gpu_memory_usage:127] [PID:323] after adapters 0.000GB ()
[2026-03-23 06:23:48,758] [INFO] [axolotl.train.save_initial_configs:413] [PID:323] Pre-saving adapter config to ./final_model...
[2026-03-23 06:23:48,758] [INFO] [axolotl.train.save_initial_configs:417] [PID:323] Pre-saving tokenizer to ./final_model...
[2026-03-23 06:23:48,766] [INFO] [axolotl.train.save_initial_configs:422] [PID:323] Pre-saving model config to ./final_model...
[2026-03-23 06:23:48,767] [INFO] [axolotl.train.execute_training:218] [PID:323] Starting trainer...
[34m[1mwandb[0m: [wandb.login()] Loaded credentials for https://api.wandb.ai from /root/.netrc.
[34m[1mwandb[0m: Currently logged in as: [33mpsychopenguin0001[0m ([33mpsychopenguin0001-none[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: [38;5;178m⢿[0m Waiting for wandb.init()...
[Am[2K[34m[1mwandb[0m: [38;5;178m⣻[0m setting up run yksh2ail (0.1s)
[Am[2K[34m[1mwandb[0m: [38;5;178m⣽[0m setting up run yksh2ail (0.1s)
[Am[2K[34m[1mwandb[0m: [38;5;178m⣾[0m setting up run yksh2ail (0.1s)
[Am[2K[34m[1mwandb[0m: [38;5;178m⣷[0m setting up run yksh2ail (0.1s)
[Am[2K[34m[1mwandb[0m: [38;5;178m⣯[0m setting up run yksh2ail (0.6s)
[Am[2K[34m[1mwandb[0m: Tracking run with wandb version 0.25.1
[34m[1mwandb[0m: Run data is saved locally in [35m[1m/workspace/axolotl/ttttttttttttttttttttt/wandb/run-20260323_062349-yksh2ail[0m
[34m[1mwandb[0m: Run [1m`wandb offline`[0m to turn off syncing.
[34m[1mwandb[0m: Syncing run [33mt1[0m
[34m[1mwandb[0m: ⭐️ View project at [34m[4mhttps://wandb.ai/psychopenguin0001-none/tttt[0m
[34m[1mwandb[0m: 🚀 View run at [34m[4mhttps://wandb.ai/psychopenguin0001-none/tttt/runs/yksh2ail[0m
[34m[1mwandb[0m: [33mWARNING[0m Saving files without folders. If you want to preserve subdirectories pass base_path to wandb.save, i.e. wandb.save("/mnt/folder/file.h5", base_path="/mnt")
[34m[1mwandb[0m: [33mWARNING[0m Symlinked 1 file into the W&B run directory; call wandb.save again to sync new files.
[2026-03-23 06:23:52,754] [INFO] [axolotl.utils.callbacks.on_train_begin:757] [PID:323] The Axolotl config has been saved to the WandB run under files.
  0%|                                                               | 0/382 [00:00<?, ?it/s][2026-03-23 06:23:52,756] [INFO] [axolotl.core.trainers.base.evaluate:400] [PID:323] Running evaluation step...

  0%|                                                               | 0/110 [00:00<?, ?it/s][A
  3%|█▌                                                     | 3/110 [00:00<00:03, 29.36it/s][A
  5%|███                                                    | 6/110 [00:00<00:05, 17.60it/s][A
  8%|████▌                                                  | 9/110 [00:00<00:06, 15.67it/s][A
 10%|█████▍                                                | 11/110 [00:00<00:06, 14.96it/s][A
 12%|██████▍                                               | 13/110 [00:00<00:06, 15.47it/s][A
 14%|███████▎                                              | 15/110 [00:00<00:06, 14.94it/s][A
 15%|████████▎                                             | 17/110 [00:01<00:06, 14.58it/s][A
 17%|█████████▎                                            | 19/110 [00:01<00:06, 15.16it/s][A
 19%|██████████▎                                           | 21/110 [00:01<00:06, 14.76it/s][A
 21%|███████████▎                                          | 23/110 [00:01<00:06, 14.49it/s][A
 23%|████████████▎                                         | 25/110 [00:01<00:05, 14.30it/s][A
 25%|█████████████▎                                        | 27/110 [00:01<00:05, 14.19it/s][A
 26%|██████████████▏                                       | 29/110 [00:01<00:05, 14.07it/s][A
 29%|███████████████▋                                      | 32/110 [00:02<00:05, 15.37it/s][A
 31%|████████████████▋                                     | 34/110 [00:02<00:05, 14.28it/s][A
 33%|█████████████████▋                                    | 36/110 [00:02<00:04, 14.88it/s][A
 35%|███████████████████▏                                  | 39/110 [00:02<00:04, 15.87it/s][A
 37%|████████████████████▏                                 | 41/110 [00:02<00:04, 16.07it/s][A
 39%|█████████████████████                                 | 43/110 [00:02<00:04, 16.24it/s][A
 41%|██████████████████████                                | 45/110 [00:02<00:04, 15.48it/s][A
 43%|███████████████████████                               | 47/110 [00:03<00:03, 15.80it/s][A
 45%|████████████████████████                              | 49/110 [00:03<00:03, 16.04it/s][A
 47%|█████████████████████████▌                            | 52/110 [00:03<00:03, 16.57it/s][A
 50%|███████████████████████████                           | 55/110 [00:03<00:03, 17.02it/s][A
 52%|███████████████████████████▉                          | 57/110 [00:03<00:03, 16.11it/s][A
 54%|████████████████████████████▉                         | 59/110 [00:03<00:03, 16.25it/s][A
 55%|█████████████████████████████▉                        | 61/110 [00:03<00:03, 14.81it/s][A
 57%|██████████████████████████████▉                       | 63/110 [00:04<00:03, 13.88it/s][A
 60%|████████████████████████████████▍                     | 66/110 [00:04<00:02, 15.89it/s][A
 62%|█████████████████████████████████▍                    | 68/110 [00:04<00:02, 15.30it/s][A
 64%|██████████████████████████████████▎                   | 70/110 [00:04<00:02, 15.64it/s][A
 65%|███████████████████████████████████▎                  | 72/110 [00:04<00:02, 15.94it/s][A
 67%|████████████████████████████████████▎                 | 74/110 [00:04<00:02, 16.14it/s][A
 69%|█████████████████████████████████████▎                | 76/110 [00:04<00:02, 16.30it/s][A
 71%|██████████████████████████████████████▎               | 78/110 [00:05<00:01, 16.40it/s][A
 73%|███████████████████████████████████████▎              | 80/110 [00:05<00:01, 15.60it/s][A
 75%|████████████████████████████████████████▋             | 83/110 [00:05<00:01, 17.31it/s][A
 77%|█████████████████████████████████████████▋            | 85/110 [00:05<00:01, 17.13it/s][A
 79%|██████████████████████████████████████████▋           | 87/110 [00:05<00:01, 16.08it/s][A
 82%|████████████████████████████████████████████▏         | 90/110 [00:05<00:01, 16.72it/s][A
 84%|█████████████████████████████████████████████▏        | 92/110 [00:05<00:01, 16.71it/s][A
 85%|██████████████████████████████████████████████▏       | 94/110 [00:05<00:01, 15.84it/s][A
 87%|███████████████████████████████████████████████▏      | 96/110 [00:06<00:00, 15.24it/s][A
 89%|████████████████████████████████████████████████      | 98/110 [00:06<00:00, 14.83it/s][A
 92%|████████████████████████████████████████████████▋    | 101/110 [00:06<00:00, 16.67it/s][A
 94%|█████████████████████████████████████████████████▋   | 103/110 [00:06<00:00, 15.12it/s][A
 95%|██████████████████████████████████████████████████▌  | 105/110 [00:06<00:00, 14.77it/s][A
 97%|███████████████████████████████████████████████████▌ | 107/110 [00:06<00:00, 14.57it/s][A
100%|█████████████████████████████████████████████████████| 110/110 [00:07<00:00, 15.86it/s][A                                                                                            
                                                                                            [A{'eval_loss': '4.401', 'eval_runtime': '7.489', 'eval_samples_per_second': '29.24', 'eval_steps_per_second': '14.69', 'eval_ppl': '81.52', 'memory/max_active (GiB)': '0.13', 'memory/max_allocated (GiB)': '0.13', 'memory/device_reserved (GiB)': '0.19', 'epoch': 0}
  0%|                                                               | 0/382 [00:07<?, ?it/s]
100%|█████████████████████████████████████████████████████| 110/110 [00:07<00:00, 15.86it/s][A
                                                                                            [A  0%|▏                                                      | 1/382 [00:07<50:39,  7.98s/it]                                                                                            {'loss': '4.248', 'grad_norm': '0.3851', 'learning_rate': '0', 'ppl': '69.94', 'memory/max_active (GiB)': '0.21', 'memory/max_allocated (GiB)': '0.21', 'memory/device_reserved (GiB)': '0.25', 'tokens/train_per_sec_per_gpu': '362.4', 'tokens/total': 768, 'tokens/trainable': 235, 'epoch': '0.007843'}
  0%|▏                                                      | 1/382 [00:07<50:39,  7.98s/it]  1%|▎                                                      | 2/382 [00:08<22:03,  3.48s/it]                                                                                            {'loss': '4.954', 'grad_norm': '0.307', 'learning_rate': '1.818e-05', 'ppl': '141.8', 'memory/max_active (GiB)': '0.22', 'memory/max_allocated (GiB)': '0.22', 'memory/device_reserved (GiB)': '0.25', 'tokens/train_per_sec_per_gpu': '492.4', 'tokens/total': 1536, 'tokens/trainable': 597, 'epoch': '0.01569'}
  1%|▎                                                      | 2/382 [00:08<22:03,  3.48s/it]  1%|▍                                                      | 3/382 [00:08<12:54,  2.04s/it]                                                                                            {'loss': '3.94', 'grad_norm': '0.3305', 'learning_rate': '3.636e-05', 'ppl': '51.42', 'memory/max_active (GiB)': '0.27', 'memory/max_allocated (GiB)': '0.27', 'memory/device_reserved (GiB)': '0.37', 'tokens/train_per_sec_per_gpu': '567', 'tokens/total': 2304, 'tokens/trainable': 803, 'epoch': '0.02353'}
  1%|▍                                                      | 3/382 [00:08<12:54,  2.04s/it]  1%|▌                                                      | 4/382 [00:08<08:29,  1.35s/it]                                                                                            {'loss': '4.496', 'grad_norm': '0.5261', 'learning_rate': '5.455e-05', 'ppl': '89.66', 'memory/max_active (GiB)': '0.22', 'memory/max_allocated (GiB)': '0.22', 'memory/device_reserved (GiB)': '0.37', 'tokens/train_per_sec_per_gpu': '423.7', 'tokens/total': 2944, 'tokens/trainable': 958, 'epoch': '0.03137'}
  1%|▌                                                      | 4/382 [00:08<08:29,  1.35s/it]  1%|▋                                                      | 5/382 [00:09<06:10,  1.02it/s]                                                                                            {'loss': '4.738', 'grad_norm': '0.4116', 'learning_rate': '7.273e-05', 'ppl': '114.2', 'memory/max_active (GiB)': '0.27', 'memory/max_allocated (GiB)': '0.27', 'memory/device_reserved (GiB)': '0.37', 'tokens/train_per_sec_per_gpu': '467.1', 'tokens/total': 3712, 'tokens/trainable': 1162, 'epoch': '0.03922'}
  1%|▋                                                      | 5/382 [00:09<06:10,  1.02it/s]  2%|▊                                                      | 6/382 [00:09<04:53,  1.28it/s]                                                                                            {'loss': '3.481', 'grad_norm': '0.3488', 'learning_rate': '9.091e-05', 'ppl': '32.49', 'memory/max_active (GiB)': '0.27', 'memory/max_allocated (GiB)': '0.27', 'memory/device_reserved (GiB)': '0.37', 'tokens/train_per_sec_per_gpu': '379.6', 'tokens/total': 4608, 'tokens/trainable': 1387, 'epoch': '0.04706'}
  2%|▊                                                      | 6/382 [00:09<04:53,  1.28it/s]  2%|█                                                      | 7/382 [00:09<03:57,  1.58it/s]                                                                                            {'loss': '5.233', 'grad_norm': '0.5332', 'learning_rate': '0.0001091', 'ppl': '187.3', 'memory/max_active (GiB)': '0.22', 'memory/max_allocated (GiB)': '0.22', 'memory/device_reserved (GiB)': '0.37', 'tokens/train_per_sec_per_gpu': '186.4', 'tokens/total': 5376, 'tokens/trainable': 1503, 'epoch': '0.0549'}
  2%|█                                                      | 7/382 [00:09<03:57,  1.58it/s]  2%|█▏                                                     | 8/382 [00:10<03:08,  1.98it/s]                                                                                            {'loss': '3.743', 'grad_norm': '0.4161', 'learning_rate': '0.0001273', 'ppl': '42.21', 'memory/max_active (GiB)': '0.16', 'memory/max_allocated (GiB)': '0.16', 'memory/device_reserved (GiB)': '0.37', 'tokens/train_per_sec_per_gpu': '392.5', 'tokens/total': 5888, 'tokens/trainable': 1677, 'epoch': '0.06275'}
  2%|█▏                                                     | 8/382 [00:10<03:08,  1.98it/s]  2%|█▎                                                     | 9/382 [00:10<02:48,  2.21it/s]                                                                                            {'loss': '4.189', 'grad_norm': '0.3465', 'learning_rate': '0.0001455', 'ppl': '65.99', 'memory/max_active (GiB)': '0.22', 'memory/max_allocated (GiB)': '0.22', 'memory/device_reserved (GiB)': '0.37', 'tokens/train_per_sec_per_gpu': '468.4', 'tokens/total': 6656, 'tokens/trainable': 1914, 'epoch': '0.07059'}
  2%|█▎                                                     | 9/382 [00:10<02:48,  2.21it/s]  3%|█▍                                                    | 10/382 [00:10<02:34,  2.40it/s]                                                                                            {'loss': '4.22', 'grad_norm': '0.3', 'learning_rate': '0.0001636', 'ppl': '68.06', 'memory/max_active (GiB)': '0.22', 'memory/max_allocated (GiB)': '0.22', 'memory/device_reserved (GiB)': '0.37', 'tokens/train_per_sec_per_gpu': '684.7', 'tokens/total': 7424, 'tokens/trainable': 2354, 'epoch': '0.07843'}
  3%|█▍                                                    | 10/382 [00:10<02:34,  2.40it/s]  3%|█▌                                                    | 11/382 [00:11<02:12,  2.79it/s]                                                                                            {'loss': '4.16', 'grad_norm': '0.6687', 'learning_rate': '0.0001818', 'ppl': '64.06', 'memory/max_active (GiB)': '0.16', 'memory/max_allocated (GiB)': '0.16', 'memory/device_reserved (GiB)': '0.37', 'tokens/train_per_sec_per_gpu': '227', 'tokens/total': 7936, 'tokens/trainable': 2464, 'epoch': '0.08627'}
  3%|█▌                                                    | 11/382 [00:11<02:12,  2.79it/s]  3%|█▋                                                    | 12/382 [00:11<02:10,  2.84it/s]                                                                                            {'loss': '3.694', 'grad_norm': '0.3338', 'learning_rate': '0.0002', 'ppl': '40.19', 'memory/max_active (GiB)': '0.22', 'memory/max_allocated (GiB)': '0.22', 'memory/device_reserved (GiB)': '0.37', 'tokens/train_per_sec_per_gpu': '432.4', 'tokens/total': 8704, 'tokens/trainable': 2725, 'epoch': '0.09412'}
  3%|█▋                                                    | 12/382 [00:11<02:10,  2.84it/s]  3%|█▊                                                    | 13/382 [00:11<02:08,  2.88it/s]                                                                                            {'loss': '4.339', 'grad_norm': '0.33', 'learning_rate': '0.0002', 'ppl': '76.66', 'memory/max_active (GiB)': '0.22', 'memory/max_allocated (GiB)': '0.22', 'memory/device_reserved (GiB)': '0.37', 'tokens/train_per_sec_per_gpu': '552.8', 'tokens/total': 9472, 'tokens/trainable': 3052, 'epoch': '0.102'}
  3%|█▊                                                    | 13/382 [00:11<02:08,  2.88it/s]  4%|█▉                                                    | 14/382 [00:12<02:00,  3.06it/s]                                                                                            {'loss': '3.712', 'grad_norm': '0.4532', 'learning_rate': '0.0002', 'ppl': '40.95', 'memory/max_active (GiB)': '0.21', 'memory/max_allocated (GiB)': '0.21', 'memory/device_reserved (GiB)': '0.37', 'tokens/train_per_sec_per_gpu': '129.6', 'tokens/total': 10112, 'tokens/trainable': 3197, 'epoch': '0.1098'}
  4%|█▉                                                    | 14/382 [00:12<02:00,  3.06it/s]  4%|██                                                    | 15/382 [00:12<02:06,  2.89it/s]                                                                                            {'loss': '3.9', 'grad_norm': '0.3631', 'learning_rate': '0.0002', 'ppl': '49.42', 'memory/max_active (GiB)': '0.27', 'memory/max_allocated (GiB)': '0.27', 'memory/device_reserved (GiB)': '0.37', 'tokens/train_per_sec_per_gpu': '721.8', 'tokens/total': 11008, 'tokens/trainable': 3609, 'epoch': '0.1176'}
  4%|██                                                    | 15/382 [00:12<02:06,  2.89it/s]  4%|██▎                                                   | 16/382 [00:12<01:59,  3.06it/s]                                                                                            {'loss': '4.074', 'grad_norm': '0.442', 'learning_rate': '0.0001999', 'ppl': '58.78', 'memory/max_active (GiB)': '0.22', 'memory/max_allocated (GiB)': '0.22', 'memory/device_reserved (GiB)': '0.37', 'tokens/train_per_sec_per_gpu': '377.7', 'tokens/total': 11648, 'tokens/trainable': 3796, 'epoch': '0.1255'}
  4%|██▎                                                   | 16/382 [00:12<01:59,  3.06it/s]  4%|██▍                                                   | 17/382 [00:12<01:48,  3.37it/s]                                                                                            {'loss': '4.659', 'grad_norm': '0.6571', 'learning_rate': '0.0001999', 'ppl': '105.5', 'memory/max_active (GiB)': '0.16', 'memory/max_allocated (GiB)': '0.16', 'memory/device_reserved (GiB)': '0.37', 'tokens/train_per_sec_per_gpu': '334.7', 'tokens/total': 12160, 'tokens/trainable': 3950, 'epoch': '0.1333'}
  4%|██▍                                                   | 17/382 [00:12<01:48,  3.37it/s]  5%|██▌                                                   | 18/382 [00:13<01:46,  3.43it/s]                                                                                            {'loss': '4.588', 'grad_norm': '0.4725', 'learning_rate': '0.0001999', 'ppl': '98.29', 'memory/max_active (GiB)': '0.22', 'memory/max_allocated (GiB)': '0.22', 'memory/device_reserved (GiB)': '0.37', 'tokens/train_per_sec_per_gpu': '640.2', 'tokens/total': 12800, 'tokens/trainable': 4188, 'epoch': '0.1412'}
  5%|██▌                                                   | 18/382 [00:13<01:46,  3.43it/s]  5%|██▋                                                   | 19/382 [00:13<01:33,  3.88it/s]                                                                                            {'loss': '4.698', 'grad_norm': '0.7742', 'learning_rate': '0.0001998', 'ppl': '109.7', 'memory/max_active (GiB)': '0.16', 'memory/max_allocated (GiB)': '0.16', 'memory/device_reserved (GiB)': '0.37', 'tokens/train_per_sec_per_gpu': '330.2', 'tokens/total': 13184, 'tokens/trainable': 4276, 'epoch': '0.149'}
  5%|██▋                                                   | 19/382 [00:13<01:33,  3.88it/s]  5%|██▊                                                   | 20/382 [00:13<01:35,  3.78it/s]                                                                                            {'loss': '4.524', 'grad_norm': '0.6916', 'learning_rate': '0.0001998', 'ppl': '92.25', 'memory/max_active (GiB)': '0.22', 'memory/max_allocated (GiB)': '0.22', 'memory/device_reserved (GiB)': '0.37', 'tokens/train_per_sec_per_gpu': '284', 'tokens/total': 13824, 'tokens/trainable': 4377, 'epoch': '0.1569'}
  5%|██▊                                                   | 20/382 [00:13<01:35,  3.78it/s]  5%|██▉                                                   | 21/382 [00:14<01:48,  3.32it/s]                                                                                            {'loss': '3.903', 'grad_norm': '0.4196', 'learning_rate': '0.0001997', 'ppl': '49.54', 'memory/max_active (GiB)': '0.27', 'memory/max_allocated (GiB)': '0.27', 'memory/device_reserved (GiB)': '0.37', 'tokens/train_per_sec_per_gpu': '490.4', 'tokens/total': 14720, 'tokens/trainable': 4700, 'epoch': '0.1647'}
  5%|██▉                                                   | 21/382 [00:14<01:48,  3.32it/s]  6%|███                                                   | 22/382 [00:14<01:57,  3.06it/s]                                                                                            {'loss': '3.72', 'grad_norm': '0.4048', 'learning_rate': '0.0001996', 'ppl': '41.28', 'memory/max_active (GiB)': '0.27', 'memory/max_allocated (GiB)': '0.27', 'memory/device_reserved (GiB)': '0.37', 'tokens/train_per_sec_per_gpu': '441', 'tokens/total': 15616, 'tokens/trainable': 4988, 'epoch': '0.1725'}
  6%|███                                                   | 22/382 [00:14<01:57,  3.06it/s]  6%|███▎                                                  | 23/382 [00:14<01:52,  3.19it/s]                                                                                            {'loss': '3.313', 'grad_norm': '0.4152', 'learning_rate': '0.0001996', 'ppl': '27.46', 'memory/max_active (GiB)': '0.22', 'memory/max_allocated (GiB)': '0.22', 'memory/device_reserved (GiB)': '0.37', 'tokens/train_per_sec_per_gpu': '580.3', 'tokens/total': 16256, 'tokens/trainable': 5232, 'epoch': '0.1804'}
  6%|███▎                                                  | 23/382 [00:14<01:52,  3.19it/s]  6%|███▍                                                  | 24/382 [00:14<01:42,  3.48it/s]                                                                                            {'loss': '5.641', 'grad_norm': '0.9761', 'learning_rate': '0.0001995', 'ppl': '281.7', 'memory/max_active (GiB)': '0.16', 'memory/max_allocated (GiB)': '0.16', 'memory/device_reserved (GiB)': '0.37', 'tokens/train_per_sec_per_gpu': '241.2', 'tokens/total': 16768, 'tokens/trainable': 5333, 'epoch': '0.1882'}
  6%|███▍                                                  | 24/382 [00:14<01:42,  3.48it/s]  7%|███▌                                                  | 25/382 [00:15<01:42,  3.50it/s]                                                                                            {'loss': '3.434', 'grad_norm': '0.4904', 'learning_rate': '0.0001994', 'ppl': '31.02', 'memory/max_active (GiB)': '0.21', 'memory/max_allocated (GiB)': '0.21', 'memory/device_reserved (GiB)': '0.37', 'tokens/train_per_sec_per_gpu': '290.2', 'tokens/total': 17408, 'tokens/trainable': 5520, 'epoch': '0.1961'}
  7%|███▌                                                  | 25/382 [00:15<01:42,  3.50it/s]  7%|███▋                                                  | 26/382 [00:15<01:41,  3.52it/s]                                                                                            {'loss': '4.711', 'grad_norm': '0.7715', 'learning_rate': '0.0001993', 'ppl': '111.2', 'memory/max_active (GiB)': '0.22', 'memory/max_allocated (GiB)': '0.22', 'memory/device_reserved (GiB)': '0.37', 'tokens/train_per_sec_per_gpu': '233.9', 'tokens/total': 18048, 'tokens/trainable': 5654, 'epoch': '0.2039'}
  7%|███▋                                                  | 26/382 [00:15<01:41,  3.52it/s]  7%|███▊                                                  | 27/382 [00:15<01:47,  3.30it/s]                                                                                            {'loss': '4.399', 'grad_norm': '0.593', 'learning_rate': '0.0001992', 'ppl': '81.4', 'memory/max_active (GiB)': '0.22', 'memory/max_allocated (GiB)': '0.22', 'memory/device_reserved (GiB)': '0.37', 'tokens/train_per_sec_per_gpu': '301.2', 'tokens/total': 18816, 'tokens/trainable': 5841, 'epoch': '0.2118'}
  7%|███▊                                                  | 27/382 [00:15<01:47,  3.30it/s]  7%|███▉                                                  | 28/382 [00:16<01:50,  3.20it/s]                                                                                            {'loss': '4.198', 'grad_norm': '0.4924', 'learning_rate': '0.0001991', 'ppl': '66.58', 'memory/max_active (GiB)': '0.22', 'memory/max_allocated (GiB)': '0.22', 'memory/device_reserved (GiB)': '0.37', 'tokens/train_per_sec_per_gpu': '503', 'tokens/total': 19584, 'tokens/trainable': 6112, 'epoch': '0.2196'}
  7%|███▉                                                  | 28/382 [00:16<01:50,  3.20it/s]  8%|████                                                  | 29/382 [00:16<01:52,  3.13it/s]                                                                                            {'loss': '4.088', 'grad_norm': '0.5578', 'learning_rate': '0.000199', 'ppl': '59.64', 'memory/max_active (GiB)': '0.22', 'memory/max_allocated (GiB)': '0.22', 'memory/device_reserved (GiB)': '0.37', 'tokens/train_per_sec_per_gpu': '210.4', 'tokens/total': 20352, 'tokens/trainable': 6373, 'epoch': '0.2275'}
  8%|████                                                  | 29/382 [00:16<01:52,  3.13it/s]  8%|████▏                                                 | 30/382 [00:16<01:42,  3.43it/s]                                                                                            {'loss': '4.141', 'grad_norm': '0.5591', 'learning_rate': '0.0001988', 'ppl': '62.9', 'memory/max_active (GiB)': '0.16', 'memory/max_allocated (GiB)': '0.16', 'memory/device_reserved (GiB)': '0.37', 'tokens/train_per_sec_per_gpu': '465.4', 'tokens/total': 20864, 'tokens/trainable': 6535, 'epoch': '0.2353'}
  8%|████▏                                                 | 30/382 [00:16<01:42,  3.43it/s]  8%|████▍                                                 | 31/382 [00:17<01:35,  3.68it/s]                                                                                            {'loss': '4.482', 'grad_norm': '0.7028', 'learning_rate': '0.0001987', 'ppl': '88.37', 'memory/max_active (GiB)': '0.16', 'memory/max_allocated (GiB)': '0.16', 'memory/device_reserved (GiB)': '0.37', 'tokens/train_per_sec_per_gpu': '362.3', 'tokens/total': 21376, 'tokens/trainable': 6680, 'epoch': '0.2431'}
  8%|████▍                                                 | 31/382 [00:17<01:35,  3.68it/s]  8%|████▌                                                 | 32/382 [00:17<01:30,  3.88it/s]                                                                                            {'loss': '4.884', 'grad_norm': '0.785', 'learning_rate': '0.0001986', 'ppl': '132.2', 'memory/max_active (GiB)': '0.16', 'memory/max_allocated (GiB)': '0.16', 'memory/device_reserved (GiB)': '0.37', 'tokens/train_per_sec_per_gpu': '214.4', 'tokens/total': 21888, 'tokens/trainable': 6814, 'epoch': '0.251'}
  8%|████▌                                                 | 32/382 [00:17<01:30,  3.88it/s]  9%|████▋                                                 | 33/382 [00:17<01:32,  3.77it/s]                                                                                            {'loss': '4.679', 'grad_norm': '0.552', 'learning_rate': '0.0001984', 'ppl': '107.6', 'memory/max_active (GiB)': '0.21', 'memory/max_allocated (GiB)': '0.21', 'memory/device_reserved (GiB)': '0.37', 'tokens/train_per_sec_per_gpu': '275.8', 'tokens/total': 22528, 'tokens/trainable': 7050, 'epoch': '0.2588'}
  9%|████▋                                                 | 33/382 [00:17<01:32,  3.77it/s]  9%|████▊                                                 | 34/382 [00:17<01:33,  3.70it/s]                                                                                            {'loss': '4.424', 'grad_norm': '0.7013', 'learning_rate': '0.0001983', 'ppl': '83.45', 'memory/max_active (GiB)': '0.21', 'memory/max_allocated (GiB)': '0.21', 'memory/device_reserved (GiB)': '0.37', 'tokens/train_per_sec_per_gpu': '233.1', 'tokens/total': 23168, 'tokens/trainable': 7239, 'epoch': '0.2667'}
  9%|████▊                                                 | 34/382 [00:17<01:33,  3.70it/s]  9%|████▉                                                 | 35/382 [00:18<01:40,  3.47it/s]                                                                                            {'loss': '5.214', 'grad_norm': '0.6762', 'learning_rate': '0.0001981', 'ppl': '183.9', 'memory/max_active (GiB)': '0.27', 'memory/max_allocated (GiB)': '0.27', 'memory/device_reserved (GiB)': '0.37', 'tokens/train_per_sec_per_gpu': '405.1', 'tokens/total': 23936, 'tokens/trainable': 7407, 'epoch': '0.2745'}
  9%|████▉                                                 | 35/382 [00:18<01:40,  3.47it/s]  9%|█████                                                 | 36/382 [00:18<01:44,  3.31it/s]                                                                                            {'loss': '4.641', 'grad_norm': '0.7065', 'learning_rate': '0.0001979', 'ppl': '103.7', 'memory/max_active (GiB)': '0.27', 'memory/max_allocated (GiB)': '0.27', 'memory/device_reserved (GiB)': '0.37', 'tokens/train_per_sec_per_gpu': '136.6', 'tokens/total': 24704, 'tokens/trainable': 7582, 'epoch': '0.2824'}
  9%|█████                                                 | 36/382 [00:18<01:44,  3.31it/s] 10%|█████▏                                                | 37/382 [00:18<01:41,  3.38it/s]                                                                                            {'loss': '4.024', 'grad_norm': '0.603', 'learning_rate': '0.0001978', 'ppl': '55.94', 'memory/max_active (GiB)': '0.22', 'memory/max_allocated (GiB)': '0.22', 'memory/device_reserved (GiB)': '0.37', 'tokens/train_per_sec_per_gpu': '675.2', 'tokens/total': 25344, 'tokens/trainable': 7854, 'epoch': '0.2902'}
 10%|█████▏                                                | 37/382 [00:18<01:41,  3.38it/s] 10%|█████▎                                                | 38/382 [00:19<01:40,  3.44it/s]                                                                                            {'loss': '4.443', 'grad_norm': '0.696', 'learning_rate': '0.0001976', 'ppl': '85.04', 'memory/max_active (GiB)': '0.21', 'memory/max_allocated (GiB)': '0.21', 'memory/device_reserved (GiB)': '0.37', 'tokens/train_per_sec_per_gpu': '61.1', 'tokens/total': 25984, 'tokens/trainable': 7981, 'epoch': '0.298'}
 10%|█████▎                                                | 38/382 [00:19<01:40,  3.44it/s] 10%|█████▌                                                | 39/382 [00:19<01:33,  3.68it/s]                                                                                            {'loss': '3.97', 'grad_norm': '0.8297', 'learning_rate': '0.0001974', 'ppl': '53', 'memory/max_active (GiB)': '0.16', 'memory/max_allocated (GiB)': '0.16', 'memory/device_reserved (GiB)': '0.37', 'tokens/train_per_sec_per_gpu': '266.9', 'tokens/total': 26496, 'tokens/trainable': 8094, 'epoch': '0.3059'}
 10%|█████▌                                                | 39/382 [00:19<01:33,  3.68it/s] 10%|█████▋                                                | 40/382 [00:19<01:39,  3.45it/s]                                                                                            {'loss': '3.991', 'grad_norm': '0.6391', 'learning_rate': '0.0001972', 'ppl': '54.12', 'memory/max_active (GiB)': '0.27', 'memory/max_allocated (GiB)': '0.27', 'memory/device_reserved (GiB)': '0.37', 'tokens/train_per_sec_per_gpu': '167.4', 'tokens/total': 27264, 'tokens/trainable': 8320, 'epoch': '0.3137'}
 10%|█████▋                                                | 40/382 [00:19<01:39,  3.45it/s] 11%|█████▊                                                | 41/382 [00:19<01:32,  3.69it/s]                                                                                            {'loss': '3.614', 'grad_norm': '1.389', 'learning_rate': '0.000197', 'ppl': '37.13', 'memory/max_active (GiB)': '0.16', 'memory/max_allocated (GiB)': '0.16', 'memory/device_reserved (GiB)': '0.37', 'tokens/train_per_sec_per_gpu': '344.3', 'tokens/total': 27776, 'tokens/trainable': 8449, 'epoch': '0.3216'}
 11%|█████▊                                                | 41/382 [00:19<01:32,  3.69it/s] 11%|█████▉                                                | 42/382 [00:20<01:43,  3.28it/s]                                                                                            {'loss': '3.896', 'grad_norm': '0.5063', 'learning_rate': '0.0001968', 'ppl': '49.18', 'memory/max_active (GiB)': '0.27', 'memory/max_allocated (GiB)': '0.27', 'memory/device_reserved (GiB)': '0.37', 'tokens/train_per_sec_per_gpu': '374', 'tokens/total': 28672, 'tokens/trainable': 8681, 'epoch': '0.3294'}
 11%|█████▉                                                | 42/382 [00:20<01:43,  3.28it/s] 11%|██████                                                | 43/382 [00:20<01:40,  3.36it/s]                                                                                            {'loss': '3.93', 'grad_norm': '0.7373', 'learning_rate': '0.0001966', 'ppl': '50.92', 'memory/max_active (GiB)': '0.21', 'memory/max_allocated (GiB)': '0.21', 'memory/device_reserved (GiB)': '0.37', 'tokens/train_per_sec_per_gpu': '240.6', 'tokens/total': 29312, 'tokens/trainable': 8905, 'epoch': '0.3373'}
 11%|██████                                                | 43/382 [00:20<01:40,  3.36it/s] 12%|██████▏                                               | 44/382 [00:20<01:44,  3.24it/s]                                                                                            {'loss': '3.95', 'grad_norm': '0.6125', 'learning_rate': '0.0001964', 'ppl': '51.95', 'memory/max_active (GiB)': '0.22', 'memory/max_allocated (GiB)': '0.22', 'memory/device_reserved (GiB)': '0.37', 'tokens/train_per_sec_per_gpu': '279.7', 'tokens/total': 30080, 'tokens/trainable': 9136, 'epoch': '0.3451'}
 12%|██████▏                                               | 44/382 [00:20<01:44,  3.24it/s] 12%|██████▎                                               | 45/382 [00:21<01:51,  3.01it/s]                                                                                            {'loss': '4.124', 'grad_norm': '0.7797', 'learning_rate': '0.0001961', 'ppl': '61.79', 'memory/max_active (GiB)': '0.27', 'memory/max_allocated (GiB)': '0.27', 'memory/device_reserved (GiB)': '0.37', 'tokens/train_per_sec_per_gpu': '284', 'tokens/total': 30976, 'tokens/trainable': 9351, 'epoch': '0.3529'}
 12%|██████▎                                               | 45/382 [00:21<01:51,  3.01it/s] 12%|██████▌                                               | 46/382 [00:21<01:51,  3.01it/s]                                                                                            {'loss': '3.779', 'grad_norm': '0.6263', 'learning_rate': '0.0001959', 'ppl': '43.77', 'memory/max_active (GiB)': '0.27', 'memory/max_allocated (GiB)': '0.27', 'memory/device_reserved (GiB)': '0.37', 'tokens/train_per_sec_per_gpu': '695', 'tokens/total': 31744, 'tokens/trainable': 9631, 'epoch': '0.3608'}
 12%|██████▌                                               | 46/382 [00:21<01:51,  3.01it/s] 12%|██████▋                                               | 47/382 [00:21<01:46,  3.16it/s]                                                                                            {'loss': '4.41', 'grad_norm': '0.8065', 'learning_rate': '0.0001956', 'ppl': '82.23', 'memory/max_active (GiB)': '0.21', 'memory/max_allocated (GiB)': '0.21', 'memory/device_reserved (GiB)': '0.37', 'tokens/train_per_sec_per_gpu': '161.3', 'tokens/total': 32384, 'tokens/trainable': 9747, 'epoch': '0.3686'}
 12%|██████▋                                               | 47/382 [00:21<01:46,  3.16it/s] 13%|███████████████████                                                                                                                                     | 48/382 [00:22<01:52,  2.96it/s]                                                                                                                                                                                              {'loss': '4.708', 'grad_norm': '0.4926', 'learning_rate': '0.0001954', 'ppl': '110.9', 'memory/max_active (GiB)': '0.27', 'memory/max_allocated (GiB)': '0.27', 'memory/device_reserved (GiB)': '0.37', 'tokens/train_per_sec_per_gpu': '363.9', 'tokens/total': 33280, 'tokens/trainable': 10127, 'epoch': '0.3765'}
 13%|███████████████████                                                                                                                                     | 48/382 [00:22<01:52,  2.96it/s] 13%|███████████████████▍                                                                                                                                    | 49/382 [00:22<01:52,  2.97it/s]                                                                                                                                                                                              {'loss': '3.96', 'grad_norm': '0.702', 'learning_rate': '0.0001951', 'ppl': '52.44', 'memory/max_active (GiB)': '0.22', 'memory/max_allocated (GiB)': '0.22', 'memory/device_reserved (GiB)': '0.37', 'tokens/train_per_sec_per_gpu': '384.6', 'tokens/total': 34048, 'tokens/trainable': 10364, 'epoch': '0.3843'}
 13%|███████████████████▍                                                                                                                                    | 49/382 [00:22<01:52,  2.97it/s] 13%|███████████████████▉                                                                                                                                    | 50/382 [00:22<01:56,  2.84it/s]                                                                                                                                                                                              {'loss': '3.078', 'grad_norm': '0.7404', 'learning_rate': '0.0001949', 'ppl': '21.72', 'memory/max_active (GiB)': '0.27', 'memory/max_allocated (GiB)': '0.27', 'memory/device_reserved (GiB)': '0.37', 'tokens/train_per_sec_per_gpu': '339.2', 'tokens/total': 34944, 'tokens/trainable': 10549, 'epoch': '0.3922'}
 13%|███████████████████▉                                                                                                                                    | 50/382 [00:22<01:56,  2.84it/s] 13%|████████████████████▎                                                                                                                                   | 51/382 [00:23<01:54,  2.88it/s]                                                                                                                                                                                              {'loss': '4.204', 'grad_norm': '0.6599', 'learning_rate': '0.0001946', 'ppl': '66.92', 'memory/max_active (GiB)': '0.22', 'memory/max_allocated (GiB)': '0.22', 'memory/device_reserved (GiB)': '0.37', 'tokens/train_per_sec_per_gpu': '320.9', 'tokens/total': 35712, 'tokens/trainable': 10783, 'epoch': '0.4'}
 13%|████████████████████▎                                                                                                                                   | 51/382 [00:23<01:54,  2.88it/s] 14%|████████████████████▋                                                                                                                                   | 52/382 [00:23<01:53,  2.90it/s]                                                                                                                                                                                              {'loss': '3.925', 'grad_norm': '0.7829', 'learning_rate': '0.0001943', 'ppl': '50.66', 'memory/max_active (GiB)': '0.22', 'memory/max_allocated (GiB)': '0.22', 'memory/device_reserved (GiB)': '0.37', 'tokens/train_per_sec_per_gpu': '212.3', 'tokens/total': 36480, 'tokens/trainable': 10929, 'epoch': '0.4078'}
 14%|████████████████████▋                                                                                                                                   | 52/382 [00:23<01:53,  2.90it/s] 14%|█████████████████████                                                                                                                                   | 53/382 [00:23<01:52,  2.93it/s]                                                                                                                                                                                              {'loss': '3.752', 'grad_norm': '0.5946', 'learning_rate': '0.000194', 'ppl': '42.6', 'memory/max_active (GiB)': '0.22', 'memory/max_allocated (GiB)': '0.22', 'memory/device_reserved (GiB)': '0.37', 'tokens/train_per_sec_per_gpu': '346', 'tokens/total': 37248, 'tokens/trainable': 11201, 'epoch': '0.4157'}
 14%|█████████████████████                                                                                                                                   | 53/382 [00:23<01:52,  2.93it/s] 14%|█████████████████████▍                                                                                                                                  | 54/382 [00:24<01:46,  3.09it/s]                                                                                                                                                                                              {'loss': '3.682', 'grad_norm': '0.7741', 'learning_rate': '0.0001937', 'ppl': '39.73', 'memory/max_active (GiB)': '0.22', 'memory/max_allocated (GiB)': '0.22', 'memory/device_reserved (GiB)': '0.37', 'tokens/train_per_sec_per_gpu': '446.2', 'tokens/total': 37888, 'tokens/trainable': 11377, 'epoch': '0.4235'}
 14%|█████████████████████▍                                                                                                                                  | 54/382 [00:24<01:46,  3.09it/s] 14%|█████████████████████▉                                                                                                                                  | 55/382 [00:24<01:41,  3.22it/s]                                                                                                                                                                                              {'loss': '3.983', 'grad_norm': '0.7811', 'learning_rate': '0.0001934', 'ppl': '53.7', 'memory/max_active (GiB)': '0.22', 'memory/max_allocated (GiB)': '0.22', 'memory/device_reserved (GiB)': '0.37', 'tokens/train_per_sec_per_gpu': '369.1', 'tokens/total': 38528, 'tokens/trainable': 11529, 'epoch': '0.4314'}
 14%|█████████████████████▉                                                                                                                                  | 55/382 [00:24<01:41,  3.22it/s] 15%|██████████████████████▎                                                                                                                                 | 56/382 [00:24<01:43,  3.14it/s]                                                                                                                                                                                              {'loss': '4.018', 'grad_norm': '0.6148', 'learning_rate': '0.0001931', 'ppl': '55.59', 'memory/max_active (GiB)': '0.22', 'memory/max_allocated (GiB)': '0.22', 'memory/device_reserved (GiB)': '0.37', 'tokens/train_per_sec_per_gpu': '445', 'tokens/total': 39296, 'tokens/trainable': 11762, 'epoch': '0.4392'}
 15%|██████████████████████▎                                                                                                                                 | 56/382 [00:24<01:43,  3.14it/s] 15%|██████████████████████▋                                                                                                                                 | 57/382 [00:25<01:45,  3.09it/s]                                                                                                                                                                                              {'loss': '3.293', 'grad_norm': '0.5738', 'learning_rate': '0.0001928', 'ppl': '26.91', 'memory/max_active (GiB)': '0.22', 'memory/max_allocated (GiB)': '0.22', 'memory/device_reserved (GiB)': '0.37', 'tokens/train_per_sec_per_gpu': '488.6', 'tokens/total': 40064, 'tokens/trainable': 11998, 'epoch': '0.4471'}
 15%|██████████████████████▋                                                                                                                                 | 57/382 [00:25<01:45,  3.09it/s] 15%|███████████████████████                                                                                                                                 | 58/382 [00:25<01:50,  2.92it/s]                                                                                                                                                                                              {'loss': '3.672', 'grad_norm': '0.5715', 'learning_rate': '0.0001925', 'ppl': '39.32', 'memory/max_active (GiB)': '0.27', 'memory/max_allocated (GiB)': '0.27', 'memory/device_reserved (GiB)': '0.37', 'tokens/train_per_sec_per_gpu': '242.6', 'tokens/total': 40960, 'tokens/trainable': 12259, 'epoch': '0.4549'}
 15%|███████████████████████                                                                                                                                 | 58/382 [00:25<01:50,  2.92it/s] 15%|███████████████████████▍                                                                                                                                | 59/382 [00:25<01:54,  2.81it/s]                                                                                                                                                                                              {'loss': '3.732', 'grad_norm': '0.5216', 'learning_rate': '0.0001922', 'ppl': '41.77', 'memory/max_active (GiB)': '0.27', 'memory/max_allocated (GiB)': '0.27', 'memory/device_reserved (GiB)': '0.37', 'tokens/train_per_sec_per_gpu': '576.5', 'tokens/total': 41856, 'tokens/trainable': 12577, 'epoch': '0.4627'}
 15%|███████████████████████▍                                                                                                                                | 59/382 [00:25<01:54,  2.81it/s] 16%|███████████████████████▊                                                                                                                                | 60/382 [00:26<02:02,  2.63it/s]                                                                                                                                                                                              {'loss': '3.876', 'grad_norm': '0.6081', 'learning_rate': '0.0001919', 'ppl': '48.21', 'memory/max_active (GiB)': '0.27', 'memory/max_allocated (GiB)': '0.27', 'memory/device_reserved (GiB)': '0.37', 'tokens/train_per_sec_per_gpu': '322.7', 'tokens/total': 42880, 'tokens/trainable': 12916, 'epoch': '0.4706'}
 16%|███████████████████████▊                                                                                                                                | 60/382 [00:26<02:02,  2.63it/s] 16%|████████████████████████▎                                                                                                                               | 61/382 [00:26<01:52,  2.85it/s]                                                                                                                                                                                              {'loss': '3.735', 'grad_norm': '0.6942', 'learning_rate': '0.0001915', 'ppl': '41.89', 'memory/max_active (GiB)': '0.21', 'memory/max_allocated (GiB)': '0.21', 'memory/device_reserved (GiB)': '0.37', 'tokens/train_per_sec_per_gpu': '305.3', 'tokens/total': 43520, 'tokens/trainable': 13093, 'epoch': '0.4784'}
 16%|████████████████████████▎                                                                                                                               | 61/382 [00:26<01:52,  2.85it/s] 16%|████████████████████████▋                                                                                                                               | 62/382 [00:26<01:45,  3.03it/s]                                                                                                                                                                                              {'loss': '4.187', 'grad_norm': '0.6836', 'learning_rate': '0.0001912', 'ppl': '65.83', 'memory/max_active (GiB)': '0.22', 'memory/max_allocated (GiB)': '0.22', 'memory/device_reserved (GiB)': '0.37', 'tokens/train_per_sec_per_gpu': '502', 'tokens/total': 44160, 'tokens/trainable': 13348, 'epoch': '0.4863'}
 16%|████████████████████████▋                                                                                                                               | 62/382 [00:26<01:45,  3.03it/s] 16%|█████████████████████████                                                                                                                               | 63/382 [00:27<01:40,  3.17it/s]                                                                                                                                                                                              {'loss': '4.785', 'grad_norm': '0.9891', 'learning_rate': '0.0001908', 'ppl': '119.7', 'memory/max_active (GiB)': '0.22', 'memory/max_allocated (GiB)': '0.22', 'memory/device_reserved (GiB)': '0.37', 'tokens/train_per_sec_per_gpu': '154.7', 'tokens/total': 44800, 'tokens/trainable': 13454, 'epoch': '0.4941'}
 16%|█████████████████████████                                                                                                                               | 63/382 [00:27<01:40,  3.17it/s] 17%|█████████████████████████▍                                                                                                                              | 64/382 [00:27<01:37,  3.27it/s]                                                                                                                                                                                              {'loss': '3.675', 'grad_norm': '0.7927', 'learning_rate': '0.0001905', 'ppl': '39.46', 'memory/max_active (GiB)': '0.22', 'memory/max_allocated (GiB)': '0.22', 'memory/device_reserved (GiB)': '0.37', 'tokens/train_per_sec_per_gpu': '469.7', 'tokens/total': 45440, 'tokens/trainable': 13650, 'epoch': '0.502'}
 17%|█████████████████████████▍                                                                                                                              | 64/382 [00:27<01:37,  3.27it/s] 17%|█████████████████████████▊                                                                                                                              | 65/382 [00:27<01:34,  3.35it/s]                                                                                                                                                                                              {'loss': '3.882', 'grad_norm': '0.7067', 'learning_rate': '0.0001901', 'ppl': '48.52', 'memory/max_active (GiB)': '0.22', 'memory/max_allocated (GiB)': '0.22', 'memory/device_reserved (GiB)': '0.37', 'tokens/train_per_sec_per_gpu': '412.6', 'tokens/total': 46080, 'tokens/trainable': 13819, 'epoch': '0.5098'}
 17%|█████████████████████████▊                                                                                                                              | 65/382 [00:27<01:34,  3.35it/s] 17%|██████████████████████████▎                                                                                                                             | 66/382 [00:28<01:32,  3.41it/s]                                                                                                                                                                                              {'loss': '4.452', 'grad_norm': '0.7593', 'learning_rate': '0.0001897', 'ppl': '85.77', 'memory/max_active (GiB)': '0.21', 'memory/max_allocated (GiB)': '0.21', 'memory/device_reserved (GiB)': '0.37', 'tokens/train_per_sec_per_gpu': '240.5', 'tokens/total': 46720, 'tokens/trainable': 14020, 'epoch': '0.5176'}
 17%|██████████████████████████▎                                                                                                                             | 66/382 [00:28<01:32,  3.41it/s] 18%|██████████████████████████▋                                                                                                                             | 67/382 [00:28<01:36,  3.28it/s]                                                                                                                                                                                              {'loss': '3.771', 'grad_norm': '0.6231', 'learning_rate': '0.0001893', 'ppl': '43.44', 'memory/max_active (GiB)': '0.27', 'memory/max_allocated (GiB)': '0.27', 'memory/device_reserved (GiB)': '0.37', 'tokens/train_per_sec_per_gpu': '565', 'tokens/total': 47488, 'tokens/trainable': 14284, 'epoch': '0.5255'}
 18%|██████████████████████████▋                                                                                                                             | 67/382 [00:28<01:36,  3.28it/s] 18%|███████████████████████████                                                                                                                             | 68/382 [00:28<01:33,  3.36it/s]                                                                                                                                                                                              {'loss': '3.949', 'grad_norm': '0.8237', 'learning_rate': '0.000189', 'ppl': '51.86', 'memory/max_active (GiB)': '0.22', 'memory/max_allocated (GiB)': '0.22', 'memory/device_reserved (GiB)': '0.37', 'tokens/train_per_sec_per_gpu': '362.5', 'tokens/total': 48128, 'tokens/trainable': 14446, 'epoch': '0.5333'}
 18%|███████████████████████████                                                                                                                             | 68/382 [00:28<01:33,  3.36it/s] 18%|███████████████████████████▍                                                                                                                            | 69/382 [00:29<01:36,  3.23it/s]                                                                                                                                                                                              {'loss': '4.214', 'grad_norm': '0.8316', 'learning_rate': '0.0001886', 'ppl': '67.65', 'memory/max_active (GiB)': '0.22', 'memory/max_allocated (GiB)': '0.22', 'memory/device_reserved (GiB)': '0.37', 'tokens/train_per_sec_per_gpu': '294.7', 'tokens/total': 48896, 'tokens/trainable': 14658, 'epoch': '0.5412'}
 18%|███████████████████████████▍                                                                                                                            | 69/382 [00:29<01:36,  3.23it/s] 18%|███████████████████████████▊                                                                                                                            | 70/382 [00:29<01:43,  3.01it/s]                                                                                                                                                                                              {'loss': '3.749', 'grad_norm': '0.5747', 'learning_rate': '0.0001882', 'ppl': '42.5', 'memory/max_active (GiB)': '0.27', 'memory/max_allocated (GiB)': '0.27', 'memory/device_reserved (GiB)': '0.37', 'tokens/train_per_sec_per_gpu': '650.1', 'tokens/total': 49792, 'tokens/trainable': 14986, 'epoch': '0.549'}
 18%|███████████████████████████▊                                                                                                                            | 70/382 [00:29<01:43,  3.01it/s] 19%|████████████████████████████▎                                                                                                                           | 71/382 [00:29<01:48,  2.87it/s]                                                                                                                                                                                              {'loss': '3.233', 'grad_norm': '0.5122', 'learning_rate': '0.0001878', 'ppl': '25.36', 'memory/max_active (GiB)': '0.27', 'memory/max_allocated (GiB)': '0.27', 'memory/device_reserved (GiB)': '0.37', 'tokens/train_per_sec_per_gpu': '363.3', 'tokens/total': 50688, 'tokens/trainable': 15303, 'epoch': '0.5569'}
 19%|████████████████████████████▎                                                                                                                           | 71/382 [00:29<01:48,  2.87it/s] 19%|████████████████████████████▋                                                                                                                           | 72/382 [00:30<01:41,  3.05it/s]                                                                                                                                                                                              {'loss': '4.152', 'grad_norm': '0.8388', 'learning_rate': '0.0001874', 'ppl': '63.56', 'memory/max_active (GiB)': '0.21', 'memory/max_allocated (GiB)': '0.21', 'memory/device_reserved (GiB)': '0.37', 'tokens/train_per_sec_per_gpu': '333.5', 'tokens/total': 51328, 'tokens/trainable': 15474, 'epoch': '0.5647'}
 19%|████████████████████████████▋                                                                                                                           | 72/382 [00:30<01:41,  3.05it/s] 19%|█████████████████████████████                                                                                                                           | 73/382 [00:30<01:46,  2.89it/s]                                                                                                                                                                                              {'loss': '3.592', 'grad_norm': '0.6725', 'learning_rate': '0.000187', 'ppl': '36.32', 'memory/max_active (GiB)': '0.27', 'memory/max_allocated (GiB)': '0.27', 'memory/device_reserved (GiB)': '0.37', 'tokens/train_per_sec_per_gpu': '242.5', 'tokens/total': 52224, 'tokens/trainable': 15795, 'epoch': '0.5725'}
 19%|█████████████████████████████                                                                                                                           | 73/382 [00:30<01:46,  2.89it/s] 19%|█████████████████████████████▍                                                                                                                          | 74/382 [00:30<01:40,  3.07it/s]                                                                                                                                                                                              {'loss': '3.61', 'grad_norm': '0.61', 'learning_rate': '0.0001865', 'ppl': '36.97', 'memory/max_active (GiB)': '0.22', 'memory/max_allocated (GiB)': '0.22', 'memory/device_reserved (GiB)': '0.37', 'tokens/train_per_sec_per_gpu': '434.4', 'tokens/total': 52864, 'tokens/trainable': 16036, 'epoch': '0.5804'}
 19%|█████████████████████████████▍                                                                                                                          | 74/382 [00:30<01:40,  3.07it/s] 20%|█████████████████████████████▊                                                                                                                          | 75/382 [00:31<01:35,  3.20it/s]                                                                                                                                                                                              {'loss': '4.264', 'grad_norm': '0.6799', 'learning_rate': '0.0001861', 'ppl': '71.12', 'memory/max_active (GiB)': '0.21', 'memory/max_allocated (GiB)': '0.21', 'memory/device_reserved (GiB)': '0.37', 'tokens/train_per_sec_per_gpu': '407.1', 'tokens/total': 53504, 'tokens/trainable': 16282, 'epoch': '0.5882'}
 20%|█████████████████████████████▊                                                                                                                          | 75/382 [00:31<01:35,  3.20it/s] 20%|██████████████████████████████▏                                                                                                                         | 76/382 [00:31<01:37,  3.13it/s]                                                                                                                                                                                              {'loss': '3.405', 'grad_norm': '0.5984', 'learning_rate': '0.0001857', 'ppl': '30.1', 'memory/max_active (GiB)': '0.22', 'memory/max_allocated (GiB)': '0.22', 'memory/device_reserved (GiB)': '0.37', 'tokens/train_per_sec_per_gpu': '548.2', 'tokens/total': 54272, 'tokens/trainable': 16537, 'epoch': '0.5961'}
 20%|██████████████████████████████▏                                                                                                                         | 76/382 [00:31<01:37,  3.13it/s] 20%|██████████████████████████████▋                                                                                                                         | 77/382 [00:31<01:39,  3.08it/s]                                                                                                                                                                                              {'loss': '4.653', 'grad_norm': '1.031', 'learning_rate': '0.0001852', 'ppl': '104.9', 'memory/max_active (GiB)': '0.22', 'memory/max_allocated (GiB)': '0.22', 'memory/device_reserved (GiB)': '0.37', 'tokens/train_per_sec_per_gpu': '143.8', 'tokens/total': 55040, 'tokens/trainable': 16661, 'epoch': '0.6039'}
 20%|██████████████████████████████▋                                                                                                                         | 77/382 [00:31<01:39,  3.08it/s] 20%|███████████████████████████████                                                                                                                         | 78/382 [00:32<01:39,  3.04it/s]                                                                                                                                                                                              {'loss': '4.365', 'grad_norm': '0.7759', 'learning_rate': '0.0001848', 'ppl': '78.64', 'memory/max_active (GiB)': '0.22', 'memory/max_allocated (GiB)': '0.22', 'memory/device_reserved (GiB)': '0.37', 'tokens/train_per_sec_per_gpu': '269.1', 'tokens/total': 55808, 'tokens/trainable': 16834, 'epoch': '0.6118'}
 20%|███████████████████████████████                                                                                                                         | 78/382 [00:32<01:39,  3.04it/s] 21%|███████████████████████████████▍                                                                                                                        | 79/382 [00:32<01:35,  3.18it/s]                                                                                                                                                                                              {'loss': '3.468', 'grad_norm': '0.7899', 'learning_rate': '0.0001843', 'ppl': '32.06', 'memory/max_active (GiB)': '0.21', 'memory/max_allocated (GiB)': '0.21', 'memory/device_reserved (GiB)': '0.37', 'tokens/train_per_sec_per_gpu': '308.7', 'tokens/total': 56448, 'tokens/trainable': 17007, 'epoch': '0.6196'}
 21%|███████████████████████████████▍                                                                                                                        | 79/382 [00:32<01:35,  3.18it/s] 21%|███████████████████████████████▊                                                                                                                        | 80/382 [00:32<01:27,  3.47it/s]                                                                                                                                                                                              {'loss': '4.171', 'grad_norm': '1.11', 'learning_rate': '0.0001839', 'ppl': '64.81', 'memory/max_active (GiB)': '0.16', 'memory/max_allocated (GiB)': '0.16', 'memory/device_reserved (GiB)': '0.37', 'tokens/train_per_sec_per_gpu': '303.3', 'tokens/total': 56960, 'tokens/trainable': 17128, 'epoch': '0.6275'}
 21%|███████████████████████████████▊                                                                                                                        | 80/382 [00:32<01:27,  3.47it/s] 21%|████████████████████████████████▏                                                                                                                       | 81/382 [00:32<01:26,  3.49it/s]                                                                                                                                                                                              {'loss': '3.501', 'grad_norm': '0.9644', 'learning_rate': '0.0001834', 'ppl': '33.14', 'memory/max_active (GiB)': '0.22', 'memory/max_allocated (GiB)': '0.22', 'memory/device_reserved (GiB)': '0.37', 'tokens/train_per_sec_per_gpu': '349.6', 'tokens/total': 57600, 'tokens/trainable': 17308, 'epoch': '0.6353'}
 21%|████████████████████████████████▏                                                                                                                       | 81/382 [00:32<01:26,  3.49it/s] 21%|████████████████████████████████▋                                                                                                                       | 82/382 [00:33<01:25,  3.50it/s]                                                                                                                                                                                              {'loss': '3.558', 'grad_norm': '0.6201', 'learning_rate': '0.0001829', 'ppl': '35.09', 'memory/max_active (GiB)': '0.21', 'memory/max_allocated (GiB)': '0.21', 'memory/device_reserved (GiB)': '0.37', 'tokens/train_per_sec_per_gpu': '347.5', 'tokens/total': 58240, 'tokens/trainable': 17546, 'epoch': '0.6431'}
 21%|████████████████████████████████▋                                                                                                                       | 82/382 [00:33<01:25,  3.50it/s] 22%|█████████████████████████████████                                                                                                                       | 83/382 [00:33<01:25,  3.52it/s]                                                                                                                                                                                              {'loss': '4.18', 'grad_norm': '0.7564', 'learning_rate': '0.0001825', 'ppl': '65.38', 'memory/max_active (GiB)': '0.22', 'memory/max_allocated (GiB)': '0.22', 'memory/device_reserved (GiB)': '0.37', 'tokens/train_per_sec_per_gpu': '433.6', 'tokens/total': 58880, 'tokens/trainable': 17746, 'epoch': '0.651'}
 22%|█████████████████████████████████                                                                                                                       | 83/382 [00:33<01:25,  3.52it/s] 22%|█████████████████████████████████▍                                                                                                                      | 84/382 [00:33<01:19,  3.74it/s]                                                                                                                                                                                              {'loss': '3.953', 'grad_norm': '0.8324', 'learning_rate': '0.000182', 'ppl': '52.09', 'memory/max_active (GiB)': '0.16', 'memory/max_allocated (GiB)': '0.16', 'memory/device_reserved (GiB)': '0.37', 'tokens/train_per_sec_per_gpu': '253.4', 'tokens/total': 59392, 'tokens/trainable': 17911, 'epoch': '0.6588'}
 22%|█████████████████████████████████▍                                                                                                                      | 84/382 [00:33<01:19,  3.74it/s] 22%|█████████████████████████████████▊                                                                                                                      | 85/382 [00:33<01:20,  3.68it/s]                                                                                                                                                                                              {'loss': '4.723', 'grad_norm': '0.8145', 'learning_rate': '0.0001815', 'ppl': '112.5', 'memory/max_active (GiB)': '0.22', 'memory/max_allocated (GiB)': '0.22', 'memory/device_reserved (GiB)': '0.37', 'tokens/train_per_sec_per_gpu': '484.9', 'tokens/total': 60032, 'tokens/trainable': 18079, 'epoch': '0.6667'}
 22%|█████████████████████████████████▊                                                                                                                      | 85/382 [00:33<01:20,  3.68it/s] 23%|██████████████████████████████████▏                                                                                                                     | 86/382 [00:34<01:26,  3.42it/s]                                                                                                                                                                                              {'loss': '3.749', 'grad_norm': '0.6989', 'learning_rate': '0.000181', 'ppl': '42.47', 'memory/max_active (GiB)': '0.22', 'memory/max_allocated (GiB)': '0.22', 'memory/device_reserved (GiB)': '0.37', 'tokens/train_per_sec_per_gpu': '190.2', 'tokens/total': 60800, 'tokens/trainable': 18280, 'epoch': '0.6745'}
 23%|██████████████████████████████████▏                                                                                                                     | 86/382 [00:34<01:26,  3.42it/s] 23%|██████████████████████████████████▌                                                                                                                     | 87/382 [00:34<01:34,  3.12it/s]                                                                                                                                                                                              {'loss': '4.397', 'grad_norm': '0.6168', 'learning_rate': '0.0001805', 'ppl': '81.21', 'memory/max_active (GiB)': '0.27', 'memory/max_allocated (GiB)': '0.27', 'memory/device_reserved (GiB)': '0.37', 'tokens/train_per_sec_per_gpu': '627.7', 'tokens/total': 61696, 'tokens/trainable': 18621, 'epoch': '0.6824'}
 23%|██████████████████████████████████▌                                                                                                                     | 87/382 [00:34<01:34,  3.12it/s] 23%|███████████████████████████████████                                                                                                                     | 88/382 [00:34<01:35,  3.07it/s]                                                                                                                                                                                              {'loss': '4.026', 'grad_norm': '0.7414', 'learning_rate': '0.00018', 'ppl': '56.06', 'memory/max_active (GiB)': '0.22', 'memory/max_allocated (GiB)': '0.22', 'memory/device_reserved (GiB)': '0.37', 'tokens/train_per_sec_per_gpu': '325.1', 'tokens/total': 62464, 'tokens/trainable': 18851, 'epoch': '0.6902'}
 23%|███████████████████████████████████                                                                                                                     | 88/382 [00:34<01:35,  3.07it/s] 23%|███████████████████████████████████▍                                                                                                                    | 89/382 [00:35<01:40,  2.90it/s]                                                                                                                                                                                              {'loss': '3.66', 'grad_norm': '0.849', 'learning_rate': '0.0001795', 'ppl': '38.87', 'memory/max_active (GiB)': '0.27', 'memory/max_allocated (GiB)': '0.27', 'memory/device_reserved (GiB)': '0.37', 'tokens/train_per_sec_per_gpu': '294.1', 'tokens/total': 63360, 'tokens/trainable': 19174, 'epoch': '0.698'}
 23%|███████████████████████████████████▍                                                                                                                    | 89/382 [00:35<01:40,  2.90it/s] 24%|███████████████████████████████████▊                                                                                                                    | 90/382 [00:35<01:35,  3.07it/s]                                                                                                                                                                                              {'loss': '4.334', 'grad_norm': '0.7557', 'learning_rate': '0.000179', 'ppl': '76.23', 'memory/max_active (GiB)': '0.21', 'memory/max_allocated (GiB)': '0.21', 'memory/device_reserved (GiB)': '0.37', 'tokens/train_per_sec_per_gpu': '282.4', 'tokens/total': 64000, 'tokens/trainable': 19411, 'epoch': '0.7059'}
 24%|███████████████████████████████████▊                                                                                                                    | 90/382 [00:35<01:35,  3.07it/s] 24%|████████████████████████████████████▏                                                                                                                   | 91/382 [00:35<01:31,  3.19it/s]                                                                                                                                                                                              {'loss': '4.041', 'grad_norm': '0.777', 'learning_rate': '0.0001784', 'ppl': '56.9', 'memory/max_active (GiB)': '0.21', 'memory/max_allocated (GiB)': '0.21', 'memory/device_reserved (GiB)': '0.37', 'tokens/train_per_sec_per_gpu': '327.9', 'tokens/total': 64640, 'tokens/trainable': 19649, 'epoch': '0.7137'}
 24%|████████████████████████████████████▏                                                                                                                   | 91/382 [00:35<01:31,  3.19it/s] 24%|████████████████████████████████████▌                                                                                                                   | 92/382 [00:36<01:33,  3.12it/s]                                                                                                                                                                                              {'loss': '3.641', 'grad_norm': '0.7316', 'learning_rate': '0.0001779', 'ppl': '38.12', 'memory/max_active (GiB)': '0.22', 'memory/max_allocated (GiB)': '0.22', 'memory/device_reserved (GiB)': '0.37', 'tokens/train_per_sec_per_gpu': '326', 'tokens/total': 65408, 'tokens/trainable': 19851, 'epoch': '0.7216'}
 24%|████████████████████████████████████▌                                                                                                                   | 92/382 [00:36<01:33,  3.12it/s] 24%|█████████████████████████████████████                                                                                                                   | 93/382 [00:36<01:29,  3.23it/s]                                                                                                                                                                                              {'loss': '4.235', 'grad_norm': '0.7902', 'learning_rate': '0.0001774', 'ppl': '69.09', 'memory/max_active (GiB)': '0.22', 'memory/max_allocated (GiB)': '0.22', 'memory/device_reserved (GiB)': '0.37', 'tokens/train_per_sec_per_gpu': '584.3', 'tokens/total': 66048, 'tokens/trainable': 20040, 'epoch': '0.7294'}
 24%|█████████████████████████████████████                                                                                                                   | 93/382 [00:36<01:29,  3.23it/s] 25%|█████████████████████████████████████▍                                                                                                                  | 94/382 [00:36<01:31,  3.15it/s]                                                                                                                                                                                              {'loss': '3.488', 'grad_norm': '0.5928', 'learning_rate': '0.0001768', 'ppl': '32.71', 'memory/max_active (GiB)': '0.22', 'memory/max_allocated (GiB)': '0.22', 'memory/device_reserved (GiB)': '0.37', 'tokens/train_per_sec_per_gpu': '321.5', 'tokens/total': 66816, 'tokens/trainable': 20305, 'epoch': '0.7373'}
 25%|█████████████████████████████████████▍                                                                                                                  | 94/382 [00:36<01:31,  3.15it/s] 25%|█████████████████████████████████████▊                                                                                                                  | 95/382 [00:37<01:27,  3.27it/s]                                                                                                                                                                                              {'loss': '4.748', 'grad_norm': '0.9082', 'learning_rate': '0.0001763', 'ppl': '115.3', 'memory/max_active (GiB)': '0.22', 'memory/max_allocated (GiB)': '0.22', 'memory/device_reserved (GiB)': '0.37', 'tokens/train_per_sec_per_gpu': '463.4', 'tokens/total': 67456, 'tokens/trainable': 20465, 'epoch': '0.7451'}
 25%|█████████████████████████████████████▊                                                                                                                  | 95/382 [00:37<01:27,  3.27it/s] 25%|██████████████████████████████████████▏                                                                                                                 | 96/382 [00:37<01:34,  3.03it/s]                                                                                                                                                                                              {'loss': '3.506', 'grad_norm': '0.4668', 'learning_rate': '0.0001758', 'ppl': '33.31', 'memory/max_active (GiB)': '0.27', 'memory/max_allocated (GiB)': '0.27', 'memory/device_reserved (GiB)': '0.37', 'tokens/train_per_sec_per_gpu': '438.2', 'tokens/total': 68352, 'tokens/trainable': 20839, 'epoch': '0.7529'}
 25%|██████████████████████████████████████▏                                                                                                                 | 96/382 [00:37<01:34,  3.03it/s] 25%|██████████████████████████████████████▌                                                                                                                 | 97/382 [00:37<01:26,  3.31it/s]                                                                                                                                                                                              {'loss': '3.064', 'grad_norm': '0.9255', 'learning_rate': '0.0001752', 'ppl': '21.41', 'memory/max_active (GiB)': '0.16', 'memory/max_allocated (GiB)': '0.16', 'memory/device_reserved (GiB)': '0.37', 'tokens/train_per_sec_per_gpu': '379.3', 'tokens/total': 68864, 'tokens/trainable': 20960, 'epoch': '0.7608'}
 25%|██████████████████████████████████████▌                                                                                                                 | 97/382 [00:37<01:26,  3.31it/s] 26%|██████████████████████████████████████▉                                                                                                                 | 98/382 [00:38<01:24,  3.38it/s]                                                                                                                                                                                              {'loss': '2.963', 'grad_norm': '0.8747', 'learning_rate': '0.0001746', 'ppl': '19.35', 'memory/max_active (GiB)': '0.21', 'memory/max_allocated (GiB)': '0.21', 'memory/device_reserved (GiB)': '0.37', 'tokens/train_per_sec_per_gpu': '140.4', 'tokens/total': 69504, 'tokens/trainable': 21176, 'epoch': '0.7686'}
 26%|██████████████████████████████████████▉                                                                                                                 | 98/382 [00:38<01:24,  3.38it/s] 26%|███████████████████████████████████████▍                                                                                                                | 99/382 [00:38<01:27,  3.25it/s]                                                                                                                                                                                              {'loss': '3.967', 'grad_norm': '0.672', 'learning_rate': '0.0001741', 'ppl': '52.82', 'memory/max_active (GiB)': '0.22', 'memory/max_allocated (GiB)': '0.22', 'memory/device_reserved (GiB)': '0.37', 'tokens/train_per_sec_per_gpu': '400', 'tokens/total': 70272, 'tokens/trainable': 21443, 'epoch': '0.7765'}
 26%|███████████████████████████████████████▍                                                                                                                | 99/382 [00:38<01:27,  3.25it/s] 26%|███████████████████████████████████████▌                                                                                                               | 100/382 [00:38<01:24,  3.33it/s]                                                                                                                                                                                              {'loss': '4.245', 'grad_norm': '0.7716', 'learning_rate': '0.0001735', 'ppl': '69.76', 'memory/max_active (GiB)': '0.21', 'memory/max_allocated (GiB)': '0.21', 'memory/device_reserved (GiB)': '0.37', 'tokens/train_per_sec_per_gpu': '347.9', 'tokens/total': 70912, 'tokens/trainable': 21728, 'epoch': '0.7843'}
 26%|███████████████████████████████████████▌                                                                                                               | 100/382 [00:38<01:24,  3.33it/s][2026-03-23 06:24:31,429] [INFO] [axolotl.core.trainers.base.evaluate:400] [PID:323] Running evaluation step...

  0%|                                                                                                                                                                 | 0/110 [00:00<?, ?it/s][A
  3%|████▏                                                                                                                                                    | 3/110 [00:00<00:05, 18.67it/s][A
  5%|██████▉                                                                                                                                                  | 5/110 [00:00<00:06, 15.98it/s][A
  6%|█████████▋                                                                                                                                               | 7/110 [00:00<00:06, 15.06it/s][A
  8%|████████████▌                                                                                                                                            | 9/110 [00:00<00:06, 14.62it/s][A
 10%|███████████████▏                                                                                                                                        | 11/110 [00:00<00:06, 14.36it/s][A
 12%|█████████████████▉                                                                                                                                      | 13/110 [00:00<00:06, 15.04it/s][A
 14%|████████████████████▋                                                                                                                                   | 15/110 [00:01<00:06, 14.66it/s][A
 15%|███████████████████████▍                                                                                                                                | 17/110 [00:01<00:06, 14.41it/s][A
 17%|██████████████████████████▎                                                                                                                             | 19/110 [00:01<00:06, 15.05it/s][A
 19%|█████████████████████████████                                                                                                                           | 21/110 [00:01<00:06, 14.68it/s][A
 21%|███████████████████████████████▊                                                                                                                        | 23/110 [00:01<00:06, 14.42it/s][A
 23%|██████████████████████████████████▌                                                                                                                     | 25/110 [00:01<00:05, 14.27it/s][A
 25%|█████████████████████████████████████▎                                                                                                                  | 27/110 [00:01<00:05, 14.16it/s][A
 26%|████████████████████████████████████████                                                                                                                | 29/110 [00:01<00:05, 14.09it/s][A
 29%|████████████████████████████████████████████▏                                                                                                           | 32/110 [00:02<00:05, 15.40it/s][A
 31%|██████████████████████████████████████████████▉                                                                                                         | 34/110 [00:02<00:05, 14.26it/s][A
 33%|█████████████████████████████████████████████████▋                                                                                                      | 36/110 [00:02<00:04, 14.86it/s][A
 35%|█████████████████████████████████████████████████████▉                                                                                                  | 39/110 [00:02<00:04, 15.86it/s][A
 37%|████████████████████████████████████████████████████████▋                                                                                               | 41/110 [00:02<00:04, 16.08it/s][A
 39%|███████████████████████████████████████████████████████████▍                                                                                            | 43/110 [00:02<00:04, 16.24it/s][A
 41%|██████████████████████████████████████████████████████████████▏                                                                                         | 45/110 [00:02<00:04, 15.50it/s][A
 43%|████████████████████████████████████████████████████████████████▉                                                                                       | 47/110 [00:03<00:03, 15.80it/s][A
 45%|███████████████████████████████████████████████████████████████████▋                                                                                    | 49/110 [00:03<00:03, 16.00it/s][A
 46%|██████████████████████████████████████████████████████████████████████▍                                                                                 | 51/110 [00:03<00:03, 17.00it/s][A
 48%|█████████████████████████████████████████████████████████████████████████▏                                                                              | 53/110 [00:03<00:03, 16.95it/s][A
 50%|████████████████████████████████████████████████████████████████████████████                                                                            | 55/110 [00:03<00:03, 16.83it/s][A
 52%|██████████████████████████████████████████████████████████████████████████████▊                                                                         | 57/110 [00:03<00:03, 15.87it/s][A
 54%|█████████████████████████████████████████████████████████████████████████████████▌                                                                      | 59/110 [00:03<00:03, 16.10it/s][A
 55%|████████████████████████████████████████████████████████████████████████████████████▎                                                                   | 61/110 [00:04<00:03, 14.55it/s][A
 57%|███████████████████████████████████████████████████████████████████████████████████████                                                                 | 63/110 [00:04<00:03, 13.67it/s][A
 60%|███████████████████████████████████████████████████████████████████████████████████████████▏                                                            | 66/110 [00:04<00:02, 15.83it/s][A
 62%|█████████████████████████████████████████████████████████████████████████████████████████████▉                                                          | 68/110 [00:04<00:02, 15.27it/s][A
 64%|████████████████████████████████████████████████████████████████████████████████████████████████▋                                                       | 70/110 [00:04<00:02, 15.64it/s][A
 65%|███████████████████████████████████████████████████████████████████████████████████████████████████▍                                                    | 72/110 [00:04<00:02, 15.88it/s][A
 67%|██████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                 | 74/110 [00:04<00:02, 16.06it/s][A
 69%|█████████████████████████████████████████████████████████████████████████████████████████████████████████                                               | 76/110 [00:04<00:02, 16.24it/s][A
 71%|███████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                            | 78/110 [00:05<00:01, 16.34it/s][A
 73%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                         | 80/110 [00:05<00:01, 15.56it/s][A
 75%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                     | 83/110 [00:05<00:01, 17.29it/s][A
 77%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                  | 85/110 [00:05<00:01, 17.12it/s][A
 79%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                               | 87/110 [00:05<00:01, 16.07it/s][A
 82%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                           | 90/110 [00:05<00:01, 16.72it/s][A
 84%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                        | 92/110 [00:05<00:01, 16.69it/s][A
 85%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                      | 94/110 [00:06<00:01, 15.84it/s][A
 87%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                   | 96/110 [00:06<00:00, 15.25it/s][A
 89%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                | 98/110 [00:06<00:00, 14.85it/s][A
 92%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋            | 101/110 [00:06<00:00, 16.69it/s][A
 94%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍         | 103/110 [00:06<00:00, 15.12it/s][A
 95%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏      | 105/110 [00:06<00:00, 14.77it/s][A
 97%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉    | 107/110 [00:06<00:00, 14.54it/s][A
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 110/110 [00:07<00:00, 15.93it/s][A                                                                                                                                                                                              
                                                                                                                                                                                              [A{'eval_loss': '3.83', 'eval_runtime': '7.192', 'eval_samples_per_second': '30.45', 'eval_steps_per_second': '15.29', 'eval_ppl': '46.08', 'memory/max_active (GiB)': '0.14', 'memory/max_allocated (GiB)': '0.14', 'memory/device_reserved (GiB)': '0.37', 'epoch': '0.7843', 'tokens/train_per_sec_per_gpu': '0'}
 26%|███████████████████████████████████████▌                                                                                                               | 100/382 [00:45<01:24,  3.33it/s]
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 110/110 [00:07<00:00, 15.93it/s][A
                                                                                                                                                                                              [A[2026-03-23 06:24:38,629] [INFO] [axolotl.core.trainers.base._save:721] [PID:323] Saving model checkpoint to ./final_model/checkpoint-100
 26%|███████████████████████████████████████▉                                                                                                               | 101/382 [00:46<12:14,  2.61s/it]                                                                                                                                                                                              {'loss': '3.787', 'grad_norm': '1.024', 'learning_rate': '0.0001729', 'ppl': '44.11', 'memory/max_active (GiB)': '0.21', 'memory/max_allocated (GiB)': '0.21', 'memory/device_reserved (GiB)': '0.26', 'tokens/train_per_sec_per_gpu': '363.2', 'tokens/total': 71552, 'tokens/trainable': 21859, 'epoch': '0.7922'}
 26%|███████████████████████████████████████▉                                                                                                               | 101/382 [00:46<12:14,  2.61s/it] 27%|████████████████████████████████████████▎                                                                                                              | 102/382 [00:47<09:00,  1.93s/it]                                                                                                                                                                                              {'loss': '3.475', 'grad_norm': '0.8803', 'learning_rate': '0.0001723', 'ppl': '32.28', 'memory/max_active (GiB)': '0.21', 'memory/max_allocated (GiB)': '0.21', 'memory/device_reserved (GiB)': '0.26', 'tokens/train_per_sec_per_gpu': '309.5', 'tokens/total': 72320, 'tokens/trainable': 22058, 'epoch': '0.8'}
 27%|████████████████████████████████████████▎                                                                                                              | 102/382 [00:47<09:00,  1.93s/it] 27%|████████████████████████████████████████▋                                                                                                              | 103/382 [00:47<06:36,  1.42s/it]                                                                                                                                                                                              {'loss': '4.56', 'grad_norm': '1.136', 'learning_rate': '0.0001718', 'ppl': '95.57', 'memory/max_active (GiB)': '0.16', 'memory/max_allocated (GiB)': '0.16', 'memory/device_reserved (GiB)': '0.26', 'tokens/train_per_sec_per_gpu': '303.1', 'tokens/total': 72832, 'tokens/trainable': 22166, 'epoch': '0.8078'}
 27%|████████████████████████████████████████▋                                                                                                              | 103/382 [00:47<06:36,  1.42s/it] 27%|█████████████████████████████████████████                                                                                                              | 104/382 [00:47<04:59,  1.08s/it]                                                                                                                                                                                              {'loss': '4.676', 'grad_norm': '1.059', 'learning_rate': '0.0001712', 'ppl': '107.3', 'memory/max_active (GiB)': '0.21', 'memory/max_allocated (GiB)': '0.21', 'memory/device_reserved (GiB)': '0.26', 'tokens/train_per_sec_per_gpu': '365.6', 'tokens/total': 73472, 'tokens/trainable': 22310, 'epoch': '0.8157'}
 27%|█████████████████████████████████████████                                                                                                              | 104/382 [00:47<04:59,  1.08s/it] 27%|█████████████████████████████████████████▌                                                                                                             | 105/382 [00:47<03:48,  1.21it/s]                                                                                                                                                                                              {'loss': '3.415', 'grad_norm': '1.008', 'learning_rate': '0.0001706', 'ppl': '30.42', 'memory/max_active (GiB)': '0.16', 'memory/max_allocated (GiB)': '0.16', 'memory/device_reserved (GiB)': '0.26', 'tokens/train_per_sec_per_gpu': '261.4', 'tokens/total': 73984, 'tokens/trainable': 22433, 'epoch': '0.8235'}
 27%|█████████████████████████████████████████▌                                                                                                             | 105/382 [00:47<03:48,  1.21it/s] 28%|█████████████████████████████████████████▉                                                                                                             | 106/382 [00:48<03:02,  1.51it/s]                                                                                                                                                                                              {'loss': '4.555', 'grad_norm': '0.6613', 'learning_rate': '0.00017', 'ppl': '95.14', 'memory/max_active (GiB)': '0.21', 'memory/max_allocated (GiB)': '0.21', 'memory/device_reserved (GiB)': '0.26', 'tokens/train_per_sec_per_gpu': '296.9', 'tokens/total': 74624, 'tokens/trainable': 22658, 'epoch': '0.8314'}
 28%|█████████████████████████████████████████▉                                                                                                             | 106/382 [00:48<03:02,  1.51it/s] 28%|██████████████████████████████████████████▎                                                                                                            | 107/382 [00:48<02:30,  1.83it/s]                                                                                                                                                                                              {'loss': '3.615', 'grad_norm': '0.5976', 'learning_rate': '0.0001694', 'ppl': '37.16', 'memory/max_active (GiB)': '0.21', 'memory/max_allocated (GiB)': '0.21', 'memory/device_reserved (GiB)': '0.26', 'tokens/train_per_sec_per_gpu': '537.8', 'tokens/total': 75264, 'tokens/trainable': 22916, 'epoch': '0.8392'}
 28%|██████████████████████████████████████████▎                                                                                                            | 107/382 [00:48<02:30,  1.83it/s] 28%|██████████████████████████████████████████▋                                                                                                            | 108/382 [00:48<02:12,  2.07it/s]                                                                                                                                                                                              {'loss': '3.575', 'grad_norm': '0.5665', 'learning_rate': '0.0001687', 'ppl': '35.7', 'memory/max_active (GiB)': '0.27', 'memory/max_allocated (GiB)': '0.27', 'memory/device_reserved (GiB)': '0.37', 'tokens/train_per_sec_per_gpu': '182.4', 'tokens/total': 76032, 'tokens/trainable': 23212, 'epoch': '0.8471'}
 28%|██████████████████████████████████████████▋                                                                                                            | 108/382 [00:48<02:12,  2.07it/s] 29%|███████████████████████████████████████████                                                                                                            | 109/382 [00:48<01:55,  2.37it/s]                                                                                                                                                                                              {'loss': '4.244', 'grad_norm': '0.6433', 'learning_rate': '0.0001681', 'ppl': '69.7', 'memory/max_active (GiB)': '0.21', 'memory/max_allocated (GiB)': '0.21', 'memory/device_reserved (GiB)': '0.37', 'tokens/train_per_sec_per_gpu': '404.6', 'tokens/total': 76672, 'tokens/trainable': 23455, 'epoch': '0.8549'}
 29%|███████████████████████████████████████████                                                                                                            | 109/382 [00:48<01:55,  2.37it/s] 29%|███████████████████████████████████████████▍                                                                                                           | 110/382 [00:49<01:43,  2.62it/s]                                                                                                                                                                                              {'loss': '3.922', 'grad_norm': '0.7572', 'learning_rate': '0.0001675', 'ppl': '50.49', 'memory/max_active (GiB)': '0.21', 'memory/max_allocated (GiB)': '0.21', 'memory/device_reserved (GiB)': '0.37', 'tokens/train_per_sec_per_gpu': '349.5', 'tokens/total': 77312, 'tokens/trainable': 23675, 'epoch': '0.8627'}
 29%|███████████████████████████████████████████▍                                                                                                           | 110/382 [00:49<01:43,  2.62it/s] 29%|███████████████████████████████████████████▉                                                                                                           | 111/382 [00:49<01:30,  2.98it/s]                                                                                                                                                                                              {'loss': '4.176', 'grad_norm': '1.112', 'learning_rate': '0.0001669', 'ppl': '65.13', 'memory/max_active (GiB)': '0.16', 'memory/max_allocated (GiB)': '0.16', 'memory/device_reserved (GiB)': '0.37', 'tokens/train_per_sec_per_gpu': '173', 'tokens/total': 77824, 'tokens/trainable': 23794, 'epoch': '0.8706'}
 29%|███████████████████████████████████████████▉                                                                                                           | 111/382 [00:49<01:30,  2.98it/s] 29%|████████████████████████████████████████████▎                                                                                                          | 112/382 [00:49<01:26,  3.13it/s]                                                                                                                                                                                              {'loss': '3.911', 'grad_norm': '0.7148', 'learning_rate': '0.0001662', 'ppl': '49.96', 'memory/max_active (GiB)': '0.21', 'memory/max_allocated (GiB)': '0.21', 'memory/device_reserved (GiB)': '0.37', 'tokens/train_per_sec_per_gpu': '432.9', 'tokens/total': 78464, 'tokens/trainable': 24007, 'epoch': '0.8784'}
 29%|████████████████████████████████████████████▎                                                                                                          | 112/382 [00:49<01:26,  3.13it/s] 30%|████████████████████████████████████████████▋                                                                                                          | 113/382 [00:50<01:22,  3.25it/s]                                                                                                                                                                                              {'loss': '4.394', 'grad_norm': '0.6833', 'learning_rate': '0.0001656', 'ppl': '80.95', 'memory/max_active (GiB)': '0.21', 'memory/max_allocated (GiB)': '0.21', 'memory/device_reserved (GiB)': '0.37', 'tokens/train_per_sec_per_gpu': '279.5', 'tokens/total': 79104, 'tokens/trainable': 24266, 'epoch': '0.8863'}
 30%|████████████████████████████████████████████▋                                                                                                          | 113/382 [00:50<01:22,  3.25it/s] 30%|█████████████████████████████████████████████                                                                                                          | 114/382 [00:50<01:24,  3.16it/s]                                                                                                                                                                                              {'loss': '3.494', 'grad_norm': '0.8494', 'learning_rate': '0.000165', 'ppl': '32.9', 'memory/max_active (GiB)': '0.21', 'memory/max_allocated (GiB)': '0.21', 'memory/device_reserved (GiB)': '0.37', 'tokens/train_per_sec_per_gpu': '224.8', 'tokens/total': 79872, 'tokens/trainable': 24454, 'epoch': '0.8941'}
 30%|█████████████████████████████████████████████                                                                                                          | 114/382 [00:50<01:24,  3.16it/s] 30%|█████████████████████████████████████████████▍                                                                                                         | 115/382 [00:50<01:18,  3.42it/s]                                                                                                                                                                                              {'loss': '4.72', 'grad_norm': '0.8796', 'learning_rate': '0.0001643', 'ppl': '112.1', 'memory/max_active (GiB)': '0.21', 'memory/max_allocated (GiB)': '0.21', 'memory/device_reserved (GiB)': '0.37', 'tokens/train_per_sec_per_gpu': '943.6', 'tokens/total': 80384, 'tokens/trainable': 24684, 'epoch': '0.902'}
 30%|█████████████████████████████████████████████▍                                                                                                         | 115/382 [00:50<01:18,  3.42it/s] 30%|█████████████████████████████████████████████▊                                                                                                         | 116/382 [00:50<01:08,  3.87it/s]                                                                                                                                                                                              {'loss': '3.621', 'grad_norm': '1.559', 'learning_rate': '0.0001637', 'ppl': '37.38', 'memory/max_active (GiB)': '0.16', 'memory/max_allocated (GiB)': '0.16', 'memory/device_reserved (GiB)': '0.37', 'tokens/train_per_sec_per_gpu': '108.3', 'tokens/total': 80768, 'tokens/trainable': 24767, 'epoch': '0.9098'}
 30%|█████████████████████████████████████████████▊                                                                                                         | 116/382 [00:50<01:08,  3.87it/s] 31%|██████████████████████████████████████████████▏                                                                                                        | 117/382 [00:51<01:10,  3.77it/s]                                                                                                                                                                                              {'loss': '3.571', 'grad_norm': '0.8166', 'learning_rate': '0.000163', 'ppl': '35.55', 'memory/max_active (GiB)': '0.21', 'memory/max_allocated (GiB)': '0.21', 'memory/device_reserved (GiB)': '0.37', 'tokens/train_per_sec_per_gpu': '316.1', 'tokens/total': 81408, 'tokens/trainable': 24914, 'epoch': '0.9176'}
 31%|██████████████████████████████████████████████▏                                                                                                        | 117/382 [00:51<01:10,  3.77it/s] 31%|██████████████████████████████████████████████▋                                                                                                        | 118/382 [00:51<01:19,  3.30it/s]                                                                                                                                                                                              {'loss': '4.219', 'grad_norm': '0.6314', 'learning_rate': '0.0001623', 'ppl': '67.94', 'memory/max_active (GiB)': '0.27', 'memory/max_allocated (GiB)': '0.27', 'memory/device_reserved (GiB)': '0.37', 'tokens/train_per_sec_per_gpu': '483.6', 'tokens/total': 82304, 'tokens/trainable': 25206, 'epoch': '0.9255'}
 31%|██████████████████████████████████████████████▋                                                                                                        | 118/382 [00:51<01:19,  3.30it/s] 31%|███████████████████████████████████████████████                                                                                                        | 119/382 [00:51<01:21,  3.21it/s]                                                                                                                                                                                              {'loss': '3.808', 'grad_norm': '0.5624', 'learning_rate': '0.0001617', 'ppl': '45.08', 'memory/max_active (GiB)': '0.27', 'memory/max_allocated (GiB)': '0.27', 'memory/device_reserved (GiB)': '0.37', 'tokens/train_per_sec_per_gpu': '154.7', 'tokens/total': 83072, 'tokens/trainable': 25522, 'epoch': '0.9333'}
 31%|███████████████████████████████████████████████                                                                                                        | 119/382 [00:51<01:21,  3.21it/s] 31%|███████████████████████████████████████████████▍                                                                                                       | 120/382 [00:52<01:23,  3.14it/s]                                                                                                                                                                                              {'loss': '3.747', 'grad_norm': '0.6616', 'learning_rate': '0.000161', 'ppl': '42.39', 'memory/max_active (GiB)': '0.21', 'memory/max_allocated (GiB)': '0.21', 'memory/device_reserved (GiB)': '0.37', 'tokens/train_per_sec_per_gpu': '549.8', 'tokens/total': 83840, 'tokens/trainable': 25841, 'epoch': '0.9412'}
 31%|███████████████████████████████████████████████▍                                                                                                       | 120/382 [00:52<01:23,  3.14it/s] 32%|███████████████████████████████████████████████▊                                                                                                       | 121/382 [00:52<01:20,  3.25it/s]                                                                                                                                                                                              {'loss': '3.483', 'grad_norm': '0.893', 'learning_rate': '0.0001603', 'ppl': '32.55', 'memory/max_active (GiB)': '0.21', 'memory/max_allocated (GiB)': '0.21', 'memory/device_reserved (GiB)': '0.37', 'tokens/train_per_sec_per_gpu': '283.5', 'tokens/total': 84480, 'tokens/trainable': 26015, 'epoch': '0.949'}
 32%|███████████████████████████████████████████████▊                                                                                                       | 121/382 [00:52<01:20,  3.25it/s] 32%|████████████████████████████████████████████████▏                                                                                                      | 122/382 [00:52<01:22,  3.16it/s]                                                                                                                                                                                              {'loss': '3.905', 'grad_norm': '0.6041', 'learning_rate': '0.0001597', 'ppl': '49.65', 'memory/max_active (GiB)': '0.21', 'memory/max_allocated (GiB)': '0.21', 'memory/device_reserved (GiB)': '0.37', 'tokens/train_per_sec_per_gpu': '478', 'tokens/total': 85248, 'tokens/trainable': 26348, 'epoch': '0.9569'}
 32%|████████████████████████████████████████████████▏                                                                                                      | 122/382 [00:52<01:22,  3.16it/s] 32%|████████████████████████████████████████████████▌                                                                                                      | 123/382 [00:53<01:23,  3.10it/s]                                                                                                                                                                                              {'loss': '2.982', 'grad_norm': '0.707', 'learning_rate': '0.000159', 'ppl': '19.74', 'memory/max_active (GiB)': '0.21', 'memory/max_allocated (GiB)': '0.21', 'memory/device_reserved (GiB)': '0.37', 'tokens/train_per_sec_per_gpu': '474.4', 'tokens/total': 86016, 'tokens/trainable': 26626, 'epoch': '0.9647'}
 32%|████████████████████████████████████████████████▌                                                                                                      | 123/382 [00:53<01:23,  3.10it/s] 32%|█████████████████████████████████████████████████                                                                                                      | 124/382 [00:53<01:20,  3.22it/s]                                                                                                                                                                                              {'loss': '4.64', 'grad_norm': '0.9411', 'learning_rate': '0.0001583', 'ppl': '103.5', 'memory/max_active (GiB)': '0.21', 'memory/max_allocated (GiB)': '0.21', 'memory/device_reserved (GiB)': '0.37', 'tokens/train_per_sec_per_gpu': '258.5', 'tokens/total': 86656, 'tokens/trainable': 26782, 'epoch': '0.9725'}
 32%|█████████████████████████████████████████████████                                                                                                      | 124/382 [00:53<01:20,  3.22it/s] 33%|█████████████████████████████████████████████████▍                                                                                                     | 125/382 [00:53<01:17,  3.31it/s]                                                                                                                                                                                              {'loss': '3.986', 'grad_norm': '0.6305', 'learning_rate': '0.0001576', 'ppl': '53.82', 'memory/max_active (GiB)': '0.21', 'memory/max_allocated (GiB)': '0.21', 'memory/device_reserved (GiB)': '0.37', 'tokens/train_per_sec_per_gpu': '653.5', 'tokens/total': 87296, 'tokens/trainable': 27039, 'epoch': '0.9804'}
 33%|█████████████████████████████████████████████████▍                                                                                                     | 125/382 [00:53<01:17,  3.31it/s] 33%|█████████████████████████████████████████████████▊                                                                                                     | 126/382 [00:54<01:23,  3.06it/s]                                                                                                                                                                                              {'loss': '3.662', 'grad_norm': '0.804', 'learning_rate': '0.0001569', 'ppl': '38.95', 'memory/max_active (GiB)': '0.27', 'memory/max_allocated (GiB)': '0.27', 'memory/device_reserved (GiB)': '0.37', 'tokens/train_per_sec_per_gpu': '456.8', 'tokens/total': 88192, 'tokens/trainable': 27272, 'epoch': '0.9882'}
 33%|█████████████████████████████████████████████████▊                                                                                                     | 126/382 [00:54<01:23,  3.06it/s] 33%|██████████████████████████████████████████████████▏                                                                                                    | 127/382 [00:54<01:19,  3.19it/s]                                                                                                                                                                                              {'loss': '3.956', 'grad_norm': '0.7883', 'learning_rate': '0.0001562', 'ppl': '52.26', 'memory/max_active (GiB)': '0.21', 'memory/max_allocated (GiB)': '0.21', 'memory/device_reserved (GiB)': '0.37', 'tokens/train_per_sec_per_gpu': '620.5', 'tokens/total': 88832, 'tokens/trainable': 27507, 'epoch': '0.9961'}
 33%|██████████████████████████████████████████████████▏                                                                                                    | 127/382 [00:54<01:19,  3.19it/s]                                                                                                                                                                                              {'loss': '4.51', 'grad_norm': '3.251', 'learning_rate': '0.0001555', 'ppl': '90.88', 'memory/max_active (GiB)': '0.11', 'memory/max_allocated (GiB)': '0.11', 'memory/device_reserved (GiB)': '0.37', 'tokens/train_per_sec_per_gpu': '269', 'tokens/total': 88960, 'tokens/trainable': 27525, 'epoch': '1'}
 34%|██████████████████████████████████████████████████▌                                                                                                    | 128/382 [00:54<01:19,  3.19it/s] 34%|██████████████████████████████████████████████████▉                                                                                                    | 129/382 [00:54<01:09,  3.62it/s]                                                                                                                                                                                              {'loss': '3.015', 'grad_norm': '0.7341', 'learning_rate': '0.0001548', 'ppl': '20.39', 'memory/max_active (GiB)': '0.16', 'memory/max_allocated (GiB)': '0.16', 'memory/device_reserved (GiB)': '0.37', 'tokens/train_per_sec_per_gpu': '492', 'tokens/total': 89472, 'tokens/trainable': 27726, 'epoch': '1.008'}
 34%|██████████████████████████████████████████████████▉                                                                                                    | 129/382 [00:54<01:09,  3.62it/s] 34%|███████████████████████████████████████████████████▍                                                                                                   | 130/382 [00:55<01:10,  3.60it/s]                                                                                                                                                                                              {'loss': '4.486', 'grad_norm': '1.012', 'learning_rate': '0.0001541', 'ppl': '88.81', 'memory/max_active (GiB)': '0.21', 'memory/max_allocated (GiB)': '0.21', 'memory/device_reserved (GiB)': '0.37', 'tokens/train_per_sec_per_gpu': '171.9', 'tokens/total': 90112, 'tokens/trainable': 27879, 'epoch': '1.016'}
 34%|███████████████████████████████████████████████████▍                                                                                                   | 130/382 [00:55<01:10,  3.60it/s] 34%|███████████████████████████████████████████████████▊                                                                                                   | 131/382 [00:55<01:09,  3.59it/s]                                                                                                                                                                                              {'loss': '3.349', 'grad_norm': '0.8405', 'learning_rate': '0.0001534', 'ppl': '28.49', 'memory/max_active (GiB)': '0.21', 'memory/max_allocated (GiB)': '0.21', 'memory/device_reserved (GiB)': '0.37', 'tokens/train_per_sec_per_gpu': '613.3', 'tokens/total': 90752, 'tokens/trainable': 28096, 'epoch': '1.024'}
 34%|███████████████████████████████████████████████████▊                                                                                                   | 131/382 [00:55<01:09,  3.59it/s] 35%|████████████████████████████████████████████████████▏                                                                                                  | 132/382 [00:55<01:09,  3.58it/s]                                                                                                                                                                                              {'loss': '4.225', 'grad_norm': '0.7329', 'learning_rate': '0.0001527', 'ppl': '68.37', 'memory/max_active (GiB)': '0.21', 'memory/max_allocated (GiB)': '0.21', 'memory/device_reserved (GiB)': '0.37', 'tokens/train_per_sec_per_gpu': '279.6', 'tokens/total': 91392, 'tokens/trainable': 28314, 'epoch': '1.031'}
 35%|████████████████████████████████████████████████████▏                                                                                                  | 132/382 [00:55<01:09,  3.58it/s] 35%|████████████████████████████████████████████████████▌                                                                                                  | 133/382 [00:55<01:09,  3.57it/s]                                                                                                                                                                                              {'loss': '4.266', 'grad_norm': '1.041', 'learning_rate': '0.0001519', 'ppl': '71.23', 'memory/max_active (GiB)': '0.21', 'memory/max_allocated (GiB)': '0.21', 'memory/device_reserved (GiB)': '0.37', 'tokens/train_per_sec_per_gpu': '294.2', 'tokens/total': 92032, 'tokens/trainable': 28441, 'epoch': '1.039'}
 35%|████████████████████████████████████████████████████▌                                                                                                  | 133/382 [00:55<01:09,  3.57it/s] 35%|████████████████████████████████████████████████████▉                                                                                                  | 134/382 [00:56<01:13,  3.38it/s]                                                                                                                                                                                              {'loss': '4.203', 'grad_norm': '0.6629', 'learning_rate': '0.0001512', 'ppl': '66.92', 'memory/max_active (GiB)': '0.21', 'memory/max_allocated (GiB)': '0.21', 'memory/device_reserved (GiB)': '0.37', 'tokens/train_per_sec_per_gpu': '456.9', 'tokens/total': 92800, 'tokens/trainable': 28757, 'epoch': '1.047'}
 35%|████████████████████████████████████████████████████▉                                                                                                  | 134/382 [00:56<01:13,  3.38it/s] 35%|█████████████████████████████████████████████████████▎                                                                                                 | 135/382 [00:56<01:15,  3.25it/s]                                                                                                                                                                                              {'loss': '3.007', 'grad_norm': '1.169', 'learning_rate': '0.0001505', 'ppl': '20.23', 'memory/max_active (GiB)': '0.21', 'memory/max_allocated (GiB)': '0.21', 'memory/device_reserved (GiB)': '0.37', 'tokens/train_per_sec_per_gpu': '201.1', 'tokens/total': 93568, 'tokens/trainable': 28965, 'epoch': '1.055'}
 35%|█████████████████████████████████████████████████████▎                                                                                                 | 135/382 [00:56<01:15,  3.25it/s] 36%|█████████████████████████████████████████████████████▊                                                                                                 | 136/382 [00:56<01:13,  3.33it/s]                                                                                                                                                                                              {'loss': '3.135', 'grad_norm': '0.8071', 'learning_rate': '0.0001498', 'ppl': '23', 'memory/max_active (GiB)': '0.21', 'memory/max_allocated (GiB)': '0.21', 'memory/device_reserved (GiB)': '0.37', 'tokens/train_per_sec_per_gpu': '559', 'tokens/total': 94208, 'tokens/trainable': 29176, 'epoch': '1.063'}
 36%|█████████████████████████████████████████████████████▊                                                                                                 | 136/382 [00:56<01:13,  3.33it/s] 36%|██████████████████████████████████████████████████████▏                                                                                                | 137/382 [00:57<01:16,  3.22it/s]                                                                                                                                                                                              {'loss': '3.481', 'grad_norm': '0.9441', 'learning_rate': '0.000149', 'ppl': '32.51', 'memory/max_active (GiB)': '0.21', 'memory/max_allocated (GiB)': '0.21', 'memory/device_reserved (GiB)': '0.37', 'tokens/train_per_sec_per_gpu': '201.5', 'tokens/total': 94976, 'tokens/trainable': 29327, 'epoch': '1.071'}
 36%|██████████████████████████████████████████████████████▏                                                                                                | 137/382 [00:57<01:16,  3.22it/s] 36%|██████████████████████████████████████████████████████▌                                                                                                | 138/382 [00:57<01:17,  3.15it/s]                                                                                                                                                                                              {'loss': '3.731', 'grad_norm': '0.6692', 'learning_rate': '0.0001483', 'ppl': '41.72', 'memory/max_active (GiB)': '0.27', 'memory/max_allocated (GiB)': '0.27', 'memory/device_reserved (GiB)': '0.37', 'tokens/train_per_sec_per_gpu': '542.6', 'tokens/total': 95744, 'tokens/trainable': 29640, 'epoch': '1.078'}
 36%|██████████████████████████████████████████████████████▌                                                                                                | 138/382 [00:57<01:17,  3.15it/s] 36%|██████████████████████████████████████████████████████▉                                                                                                | 139/382 [00:57<01:18,  3.10it/s]                                                                                                                                                                                              {'loss': '3.855', 'grad_norm': '0.65', 'learning_rate': '0.0001475', 'ppl': '47.22', 'memory/max_active (GiB)': '0.21', 'memory/max_allocated (GiB)': '0.21', 'memory/device_reserved (GiB)': '0.37', 'tokens/train_per_sec_per_gpu': '468.4', 'tokens/total': 96512, 'tokens/trainable': 29912, 'epoch': '1.086'}
 36%|██████████████████████████████████████████████████████▉                                                                                                | 139/382 [00:57<01:18,  3.10it/s] 37%|███████████████████████████████████████████████████████▎                                                                                               | 140/382 [00:58<01:19,  3.05it/s]                                                                                                                                                                                              {'loss': '3.72', 'grad_norm': '0.6522', 'learning_rate': '0.0001468', 'ppl': '41.27', 'memory/max_active (GiB)': '0.21', 'memory/max_allocated (GiB)': '0.21', 'memory/device_reserved (GiB)': '0.37', 'tokens/train_per_sec_per_gpu': '400.9', 'tokens/total': 97280, 'tokens/trainable': 30158, 'epoch': '1.094'}
 37%|███████████████████████████████████████████████████████▎                                                                                               | 140/382 [00:58<01:19,  3.05it/s] 37%|███████████████████████████████████████████████████████▋                                                                                               | 141/382 [00:58<01:19,  3.03it/s]                                                                                                                                                                                              {'loss': '4.155', 'grad_norm': '1.096', 'learning_rate': '0.000146', 'ppl': '63.77', 'memory/max_active (GiB)': '0.21', 'memory/max_allocated (GiB)': '0.21', 'memory/device_reserved (GiB)': '0.37', 'tokens/train_per_sec_per_gpu': '291.3', 'tokens/total': 98048, 'tokens/trainable': 30322, 'epoch': '1.102'}
 37%|███████████████████████████████████████████████████████▋                                                                                               | 141/382 [00:58<01:19,  3.03it/s] 37%|████████████████████████████████████████████████████████▏                                                                                              | 142/382 [00:58<01:19,  3.02it/s]                                                                                                                                                                                              {'loss': '5.032', 'grad_norm': '0.8192', 'learning_rate': '0.0001453', 'ppl': '153.3', 'memory/max_active (GiB)': '0.27', 'memory/max_allocated (GiB)': '0.27', 'memory/device_reserved (GiB)': '0.37', 'tokens/train_per_sec_per_gpu': '626.2', 'tokens/total': 98816, 'tokens/trainable': 30584, 'epoch': '1.11'}
 37%|████████████████████████████████████████████████████████▏                                                                                              | 142/382 [00:58<01:19,  3.02it/s] 37%|████████████████████████████████████████████████████████▌                                                                                              | 143/382 [00:59<01:19,  3.00it/s]                                                                                                                                                                                              {'loss': '3.368', 'grad_norm': '0.8732', 'learning_rate': '0.0001445', 'ppl': '29.02', 'memory/max_active (GiB)': '0.21', 'memory/max_allocated (GiB)': '0.21', 'memory/device_reserved (GiB)': '0.37', 'tokens/train_per_sec_per_gpu': '248.4', 'tokens/total': 99584, 'tokens/trainable': 30765, 'epoch': '1.118'}
 37%|████████████████████████████████████████████████████████▌                                                                                              | 143/382 [00:59<01:19,  3.00it/s] 38%|████████████████████████████████████████████████████████▉                                                                                              | 144/382 [00:59<01:15,  3.15it/s]                                                                                                                                                                                              {'loss': '4.807', 'grad_norm': '0.9719', 'learning_rate': '0.0001438', 'ppl': '122.4', 'memory/max_active (GiB)': '0.21', 'memory/max_allocated (GiB)': '0.21', 'memory/device_reserved (GiB)': '0.37', 'tokens/train_per_sec_per_gpu': '315.9', 'tokens/total': 100224, 'tokens/trainable': 30925, 'epoch': '1.125'}
 38%|████████████████████████████████████████████████████████▉                                                                                              | 144/382 [00:59<01:15,  3.15it/s] 38%|█████████████████████████████████████████████████████████▎                                                                                             | 145/382 [00:59<01:12,  3.26it/s]                                                                                                                                                                                              {'loss': '3.641', 'grad_norm': '0.7061', 'learning_rate': '0.000143', 'ppl': '38.14', 'memory/max_active (GiB)': '0.21', 'memory/max_allocated (GiB)': '0.21', 'memory/device_reserved (GiB)': '0.37', 'tokens/train_per_sec_per_gpu': '337.2', 'tokens/total': 100864, 'tokens/trainable': 31163, 'epoch': '1.133'}
 38%|█████████████████████████████████████████████████████████▎                                                                                             | 145/382 [00:59<01:12,  3.26it/s] 38%|█████████████████████████████████████████████████████████▋                                                                                             | 146/382 [01:00<01:10,  3.34it/s]                                                                                                                                                                                              {'loss': '3.772', 'grad_norm': '1.188', 'learning_rate': '0.0001422', 'ppl': '43.49', 'memory/max_active (GiB)': '0.21', 'memory/max_allocated (GiB)': '0.21', 'memory/device_reserved (GiB)': '0.37', 'tokens/train_per_sec_per_gpu': '214.7', 'tokens/total': 101504, 'tokens/trainable': 31282, 'epoch': '1.141'}
 38%|█████████████████████████████████████████████████████████▋                                                                                             | 146/382 [01:00<01:10,  3.34it/s] 38%|██████████████████████████████████████████████████████████                                                                                             | 147/382 [01:00<01:09,  3.40it/s]                                                                                                                                                                                              {'loss': '3.599', 'grad_norm': '0.9165', 'learning_rate': '0.0001415', 'ppl': '36.55', 'memory/max_active (GiB)': '0.21', 'memory/max_allocated (GiB)': '0.21', 'memory/device_reserved (GiB)': '0.37', 'tokens/train_per_sec_per_gpu': '243.8', 'tokens/total': 102144, 'tokens/trainable': 31445, 'epoch': '1.149'}
 38%|██████████████████████████████████████████████████████████                                                                                             | 147/382 [01:00<01:09,  3.40it/s] 39%|██████████████████████████████████████████████████████████▌                                                                                            | 148/382 [01:00<01:11,  3.26it/s]                                                                                                                                                                                              {'loss': '3.088', 'grad_norm': '0.9147', 'learning_rate': '0.0001407', 'ppl': '21.94', 'memory/max_active (GiB)': '0.21', 'memory/max_allocated (GiB)': '0.21', 'memory/device_reserved (GiB)': '0.37', 'tokens/train_per_sec_per_gpu': '279.5', 'tokens/total': 102912, 'tokens/trainable': 31603, 'epoch': '1.157'}
 39%|██████████████████████████████████████████████████████████▌                                                                                            | 148/382 [01:00<01:11,  3.26it/s] 39%|██████████████████████████████████████████████████████████▉                                                                                            | 149/382 [01:00<01:13,  3.17it/s]                                                                                                                                                                                              {'loss': '4.082', 'grad_norm': '0.8946', 'learning_rate': '0.0001399', 'ppl': '59.28', 'memory/max_active (GiB)': '0.21', 'memory/max_allocated (GiB)': '0.21', 'memory/device_reserved (GiB)': '0.37', 'tokens/train_per_sec_per_gpu': '243.4', 'tokens/total': 103680, 'tokens/trainable': 31831, 'epoch': '1.165'}
 39%|██████████████████████████████████████████████████████████▉                                                                                            | 149/382 [01:00<01:13,  3.17it/s] 39%|███████████████████████████████████████████████████████████▎                                                                                           | 150/382 [01:01<01:07,  3.46it/s]                                                                                                                                                                                              {'loss': '3.465', 'grad_norm': '1.506', 'learning_rate': '0.0001391', 'ppl': '31.97', 'memory/max_active (GiB)': '0.16', 'memory/max_allocated (GiB)': '0.16', 'memory/device_reserved (GiB)': '0.37', 'tokens/train_per_sec_per_gpu': '191.5', 'tokens/total': 104192, 'tokens/trainable': 31915, 'epoch': '1.173'}
 39%|███████████████████████████████████████████████████████████▎                                                                                           | 150/382 [01:01<01:07,  3.46it/s] 40%|███████████████████████████████████████████████████████████▋                                                                                           | 151/382 [01:01<01:06,  3.49it/s]                                                                                                                                                                                              {'loss': '2.998', 'grad_norm': '0.8444', 'learning_rate': '0.0001384', 'ppl': '20.04', 'memory/max_active (GiB)': '0.21', 'memory/max_allocated (GiB)': '0.21', 'memory/device_reserved (GiB)': '0.37', 'tokens/train_per_sec_per_gpu': '412.6', 'tokens/total': 104832, 'tokens/trainable': 32084, 'epoch': '1.18'}
 40%|███████████████████████████████████████████████████████████▋                                                                                           | 151/382 [01:01<01:06,  3.49it/s] 40%|████████████████████████████████████████████████████████████                                                                                           | 152/382 [01:01<01:05,  3.51it/s]                                                                                                                                                                                              {'loss': '3.413', 'grad_norm': '0.8544', 'learning_rate': '0.0001376', 'ppl': '30.34', 'memory/max_active (GiB)': '0.21', 'memory/max_allocated (GiB)': '0.21', 'memory/device_reserved (GiB)': '0.37', 'tokens/train_per_sec_per_gpu': '351', 'tokens/total': 105472, 'tokens/trainable': 32258, 'epoch': '1.188'}
 40%|████████████████████████████████████████████████████████████                                                                                           | 152/382 [01:01<01:05,  3.51it/s] 40%|████████████████████████████████████████████████████████████▍                                                                                          | 153/382 [01:02<01:05,  3.52it/s]                                                                                                                                                                                              {'loss': '3.087', 'grad_norm': '0.8419', 'learning_rate': '0.0001368', 'ppl': '21.9', 'memory/max_active (GiB)': '0.21', 'memory/max_allocated (GiB)': '0.21', 'memory/device_reserved (GiB)': '0.37', 'tokens/train_per_sec_per_gpu': '229.5', 'tokens/total': 106112, 'tokens/trainable': 32444, 'epoch': '1.196'}
 40%|████████████████████████████████████████████████████████████▍                                                                                          | 153/382 [01:02<01:05,  3.52it/s] 40%|████████████████████████████████████████████████████████████▊                                                                                          | 154/382 [01:02<01:04,  3.53it/s]                                                                                                                                                                                              {'loss': '3.964', 'grad_norm': '0.9033', 'learning_rate': '0.000136', 'ppl': '52.65', 'memory/max_active (GiB)': '0.21', 'memory/max_allocated (GiB)': '0.21', 'memory/device_reserved (GiB)': '0.37', 'tokens/train_per_sec_per_gpu': '114.8', 'tokens/total': 106752, 'tokens/trainable': 32578, 'epoch': '1.204'}
 40%|████████████████████████████████████████████████████████████▊                                                                                          | 154/382 [01:02<01:04,  3.53it/s] 41%|█████████████████████████████████████████████████████████████▎                                                                                         | 155/382 [01:02<01:04,  3.54it/s]                                                                                                                                                                                              {'loss': '3.899', 'grad_norm': '0.8942', 'learning_rate': '0.0001352', 'ppl': '49.34', 'memory/max_active (GiB)': '0.21', 'memory/max_allocated (GiB)': '0.21', 'memory/device_reserved (GiB)': '0.37', 'tokens/train_per_sec_per_gpu': '280', 'tokens/total': 107392, 'tokens/trainable': 32794, 'epoch': '1.212'}
 41%|█████████████████████████████████████████████████████████████▎                                                                                         | 155/382 [01:02<01:04,  3.54it/s] 41%|█████████████████████████████████████████████████████████████▋                                                                                         | 156/382 [01:03<01:14,  3.03it/s]                                                                                                                                                                                              {'loss': '3.531', 'grad_norm': '0.6046', 'learning_rate': '0.0001344', 'ppl': '34.14', 'memory/max_active (GiB)': '0.27', 'memory/max_allocated (GiB)': '0.27', 'memory/device_reserved (GiB)': '0.37', 'tokens/train_per_sec_per_gpu': '630.2', 'tokens/total': 108416, 'tokens/trainable': 33249, 'epoch': '1.22'}
 41%|█████████████████████████████████████████████████████████████▋                                                                                         | 156/382 [01:03<01:14,  3.03it/s] 41%|██████████████████████████████████████████████████████████████                                                                                         | 157/382 [01:03<01:10,  3.17it/s]                                                                                                                                                                                              {'loss': '3.656', 'grad_norm': '0.7922', 'learning_rate': '0.0001336', 'ppl': '38.69', 'memory/max_active (GiB)': '0.21', 'memory/max_allocated (GiB)': '0.21', 'memory/device_reserved (GiB)': '0.37', 'tokens/train_per_sec_per_gpu': '641.7', 'tokens/total': 109056, 'tokens/trainable': 33506, 'epoch': '1.227'}
 41%|██████████████████████████████████████████████████████████████                                                                                         | 157/382 [01:03<01:10,  3.17it/s] 41%|██████████████████████████████████████████████████████████████▍                                                                                        | 158/382 [01:03<01:12,  3.11it/s]                                                                                                                                                                                              {'loss': '3.902', 'grad_norm': '0.673', 'learning_rate': '0.0001328', 'ppl': '49.49', 'memory/max_active (GiB)': '0.21', 'memory/max_allocated (GiB)': '0.21', 'memory/device_reserved (GiB)': '0.37', 'tokens/train_per_sec_per_gpu': '341.8', 'tokens/total': 109824, 'tokens/trainable': 33780, 'epoch': '1.235'}
 41%|██████████████████████████████████████████████████████████████▍                                                                                        | 158/382 [01:03<01:12,  3.11it/s] 42%|██████████████████████████████████████████████████████████████▊                                                                                        | 159/382 [01:04<01:16,  2.93it/s]                                                                                                                                                                                              {'loss': '3.491', 'grad_norm': '0.6462', 'learning_rate': '0.000132', 'ppl': '32.83', 'memory/max_active (GiB)': '0.27', 'memory/max_allocated (GiB)': '0.27', 'memory/device_reserved (GiB)': '0.37', 'tokens/train_per_sec_per_gpu': '612', 'tokens/total': 110720, 'tokens/trainable': 34119, 'epoch': '1.243'}
 42%|██████████████████████████████████████████████████████████████▊                                                                                        | 159/382 [01:04<01:16,  2.93it/s] 42%|███████████████████████████████████████████████████████████████▏                                                                                       | 160/382 [01:04<01:15,  2.94it/s]                                                                                                                                                                                              {'loss': '4.612', 'grad_norm': '1.04', 'learning_rate': '0.0001312', 'ppl': '100.7', 'memory/max_active (GiB)': '0.21', 'memory/max_allocated (GiB)': '0.21', 'memory/device_reserved (GiB)': '0.37', 'tokens/train_per_sec_per_gpu': '326.7', 'tokens/total': 111488, 'tokens/trainable': 34376, 'epoch': '1.251'}
 42%|███████████████████████████████████████████████████████████████▏                                                                                       | 160/382 [01:04<01:15,  2.94it/s] 42%|███████████████████████████████████████████████████████████████▋                                                                                       | 161/382 [01:04<01:11,  3.10it/s]                                                                                                                                                                                              {'loss': '4.47', 'grad_norm': '0.8244', 'learning_rate': '0.0001304', 'ppl': '87.35', 'memory/max_active (GiB)': '0.21', 'memory/max_allocated (GiB)': '0.21', 'memory/device_reserved (GiB)': '0.37', 'tokens/train_per_sec_per_gpu': '579.4', 'tokens/total': 112128, 'tokens/trainable': 34613, 'epoch': '1.259'}
 42%|███████████████████████████████████████████████████████████████▋                                                                                       | 161/382 [01:04<01:11,  3.10it/s] 42%|████████████████████████████████████████████████████████████████                                                                                       | 162/382 [01:04<01:04,  3.40it/s]                                                                                                                                                                                              {'loss': '3.015', 'grad_norm': '0.8032', 'learning_rate': '0.0001296', 'ppl': '20.4', 'memory/max_active (GiB)': '0.16', 'memory/max_allocated (GiB)': '0.16', 'memory/device_reserved (GiB)': '0.37', 'tokens/train_per_sec_per_gpu': '346.7', 'tokens/total': 112640, 'tokens/trainable': 34775, 'epoch': '1.267'}
 42%|████████████████████████████████████████████████████████████████                                                                                       | 162/382 [01:04<01:04,  3.40it/s] 43%|████████████████████████████████████████████████████████████████▍                                                                                      | 163/382 [01:05<01:03,  3.44it/s]                                                                                                                                                                                              {'loss': '3.913', 'grad_norm': '1.014', 'learning_rate': '0.0001288', 'ppl': '50.05', 'memory/max_active (GiB)': '0.21', 'memory/max_allocated (GiB)': '0.21', 'memory/device_reserved (GiB)': '0.37', 'tokens/train_per_sec_per_gpu': '210.7', 'tokens/total': 113280, 'tokens/trainable': 34915, 'epoch': '1.275'}
 43%|████████████████████████████████████████████████████████████████▍                                                                                      | 163/382 [01:05<01:03,  3.44it/s] 43%|████████████████████████████████████████████████████████████████▊                                                                                      | 164/382 [01:05<01:09,  3.12it/s]                                                                                                                                                                                              {'loss': '2.224', 'grad_norm': '0.6388', 'learning_rate': '0.000128', 'ppl': '9.24', 'memory/max_active (GiB)': '0.27', 'memory/max_allocated (GiB)': '0.27', 'memory/device_reserved (GiB)': '0.37', 'tokens/train_per_sec_per_gpu': '204.9', 'tokens/total': 114176, 'tokens/trainable': 35141, 'epoch': '1.282'}
 43%|████████████████████████████████████████████████████████████████▊                                                                                      | 164/382 [01:05<01:09,  3.12it/s] 43%|█████████████████████████████████████████████████████████████████▏                                                                                     | 165/382 [01:05<01:10,  3.08it/s]                                                                                                                                                                                              {'loss': '3.568', 'grad_norm': '0.985', 'learning_rate': '0.0001272', 'ppl': '35.43', 'memory/max_active (GiB)': '0.21', 'memory/max_allocated (GiB)': '0.21', 'memory/device_reserved (GiB)': '0.37', 'tokens/train_per_sec_per_gpu': '281.1', 'tokens/total': 114944, 'tokens/trainable': 35323, 'epoch': '1.29'}
 43%|█████████████████████████████████████████████████████████████████▏                                                                                     | 165/382 [01:05<01:10,  3.08it/s] 43%|█████████████████████████████████████████████████████████████████▌                                                                                     | 166/382 [01:06<01:07,  3.20it/s]                                                                                                                                                                                              {'loss': '3.882', 'grad_norm': '0.8512', 'learning_rate': '0.0001264', 'ppl': '48.52', 'memory/max_active (GiB)': '0.21', 'memory/max_allocated (GiB)': '0.21', 'memory/device_reserved (GiB)': '0.37', 'tokens/train_per_sec_per_gpu': '293.4', 'tokens/total': 115584, 'tokens/trainable': 35485, 'epoch': '1.298'}
 43%|█████████████████████████████████████████████████████████████████▌                                                                                     | 166/382 [01:06<01:07,  3.20it/s] 44%|██████████████████████████████████████████████████████████████████                                                                                     | 167/382 [01:06<01:12,  2.98it/s]                                                                                                                                                                                              {'loss': '3.826', 'grad_norm': '0.8955', 'learning_rate': '0.0001255', 'ppl': '45.87', 'memory/max_active (GiB)': '0.27', 'memory/max_allocated (GiB)': '0.27', 'memory/device_reserved (GiB)': '0.37', 'tokens/train_per_sec_per_gpu': '293.4', 'tokens/total': 116480, 'tokens/trainable': 35774, 'epoch': '1.306'}
 44%|██████████████████████████████████████████████████████████████████                                                                                     | 167/382 [01:06<01:12,  2.98it/s] 44%|██████████████████████████████████████████████████████████████████▍                                                                                    | 168/382 [01:06<01:08,  3.13it/s]                                                                                                                                                                                              {'loss': '4.161', 'grad_norm': '0.8737', 'learning_rate': '0.0001247', 'ppl': '64.14', 'memory/max_active (GiB)': '0.21', 'memory/max_allocated (GiB)': '0.21', 'memory/device_reserved (GiB)': '0.37', 'tokens/train_per_sec_per_gpu': '153.6', 'tokens/total': 117120, 'tokens/trainable': 36006, 'epoch': '1.314'}
 44%|██████████████████████████████████████████████████████████████████▍                                                                                    | 168/382 [01:06<01:08,  3.13it/s] 44%|██████████████████████████████████████████████████████████████████▊                                                                                    | 169/382 [01:07<01:09,  3.09it/s]                                                                                                                                                                                              {'loss': '3.911', 'grad_norm': '0.6413', 'learning_rate': '0.0001239', 'ppl': '49.95', 'memory/max_active (GiB)': '0.27', 'memory/max_allocated (GiB)': '0.27', 'memory/device_reserved (GiB)': '0.37', 'tokens/train_per_sec_per_gpu': '257', 'tokens/total': 117888, 'tokens/trainable': 36292, 'epoch': '1.322'}
 44%|██████████████████████████████████████████████████████████████████▊                                                                                    | 169/382 [01:07<01:09,  3.09it/s] 45%|███████████████████████████████████████████████████████████████████▏                                                                                   | 170/382 [01:07<01:09,  3.05it/s]                                                                                                                                                                                              {'loss': '4.175', 'grad_norm': '0.7705', 'learning_rate': '0.0001231', 'ppl': '65.04', 'memory/max_active (GiB)': '0.21', 'memory/max_allocated (GiB)': '0.21', 'memory/device_reserved (GiB)': '0.37', 'tokens/train_per_sec_per_gpu': '616', 'tokens/total': 118656, 'tokens/trainable': 36587, 'epoch': '1.329'}
 45%|███████████████████████████████████████████████████████████████████▏                                                                                   | 170/382 [01:07<01:09,  3.05it/s] 45%|███████████████████████████████████████████████████████████████████▌                                                                                   | 171/382 [01:07<01:06,  3.18it/s]                                                                                                                                                                                              {'loss': '4.246', 'grad_norm': '0.7701', 'learning_rate': '0.0001223', 'ppl': '69.79', 'memory/max_active (GiB)': '0.21', 'memory/max_allocated (GiB)': '0.21', 'memory/device_reserved (GiB)': '0.37', 'tokens/train_per_sec_per_gpu': '550.6', 'tokens/total': 119296, 'tokens/trainable': 36836, 'epoch': '1.337'}
 45%|███████████████████████████████████████████████████████████████████▌                                                                                   | 171/382 [01:07<01:06,  3.18it/s] 45%|███████████████████████████████████████████████████████████████████▉                                                                                   | 172/382 [01:08<01:04,  3.28it/s]                                                                                                                                                                                              {'loss': '3.356', 'grad_norm': '0.9297', 'learning_rate': '0.0001214', 'ppl': '28.68', 'memory/max_active (GiB)': '0.21', 'memory/max_allocated (GiB)': '0.21', 'memory/device_reserved (GiB)': '0.37', 'tokens/train_per_sec_per_gpu': '246.6', 'tokens/total': 119936, 'tokens/trainable': 37002, 'epoch': '1.345'}
 45%|███████████████████████████████████████████████████████████████████▉                                                                                   | 172/382 [01:08<01:04,  3.28it/s] 45%|████████████████████████████████████████████████████████████████████▍                                                                                  | 173/382 [01:08<01:08,  3.03it/s]                                                                                                                                                                                              {'loss': '4.471', 'grad_norm': '0.7731', 'learning_rate': '0.0001206', 'ppl': '87.49', 'memory/max_active (GiB)': '0.27', 'memory/max_allocated (GiB)': '0.27', 'memory/device_reserved (GiB)': '0.37', 'tokens/train_per_sec_per_gpu': '524.3', 'tokens/total': 120832, 'tokens/trainable': 37262, 'epoch': '1.353'}
 45%|████████████████████████████████████████████████████████████████████▍                                                                                  | 173/382 [01:08<01:08,  3.03it/s] 46%|████████████████████████████████████████████████████████████████████▊                                                                                  | 174/382 [01:08<01:08,  3.02it/s]                                                                                                                                                                                              {'loss': '3.533', 'grad_norm': '0.7589', 'learning_rate': '0.0001198', 'ppl': '34.21', 'memory/max_active (GiB)': '0.27', 'memory/max_allocated (GiB)': '0.27', 'memory/device_reserved (GiB)': '0.37', 'tokens/train_per_sec_per_gpu': '834.3', 'tokens/total': 121600, 'tokens/trainable': 37623, 'epoch': '1.361'}
 46%|████████████████████████████████████████████████████████████████████▊                                                                                  | 174/382 [01:08<01:08,  3.02it/s] 46%|█████████████████████████████████████████████████████████████████████▏                                                                                 | 175/382 [01:09<01:02,  3.32it/s]                                                                                                                                                                                              {'loss': '4.222', 'grad_norm': '1.076', 'learning_rate': '0.0001189', 'ppl': '68.16', 'memory/max_active (GiB)': '0.16', 'memory/max_allocated (GiB)': '0.16', 'memory/device_reserved (GiB)': '0.37', 'tokens/train_per_sec_per_gpu': '433.8', 'tokens/total': 122112, 'tokens/trainable': 37752, 'epoch': '1.369'}
 46%|█████████████████████████████████████████████████████████████████████▏                                                                                 | 175/382 [01:09<01:02,  3.32it/s] 46%|█████████████████████████████████████████████████████████████████████▌                                                                                 | 176/382 [01:09<01:07,  3.06it/s]                                                                                                                                                                                              {'loss': '4.284', 'grad_norm': '0.6732', 'learning_rate': '0.0001181', 'ppl': '72.56', 'memory/max_active (GiB)': '0.27', 'memory/max_allocated (GiB)': '0.27', 'memory/device_reserved (GiB)': '0.37', 'tokens/train_per_sec_per_gpu': '332.5', 'tokens/total': 123008, 'tokens/trainable': 38011, 'epoch': '1.376'}
 46%|█████████████████████████████████████████████████████████████████████▌                                                                                 | 176/382 [01:09<01:07,  3.06it/s] 46%|█████████████████████████████████████████████████████████████████████▉                                                                                 | 177/382 [01:09<01:04,  3.18it/s]                                                                                                                                                                                              {'loss': '3.924', 'grad_norm': '0.8281', 'learning_rate': '0.0001173', 'ppl': '50.63', 'memory/max_active (GiB)': '0.21', 'memory/max_allocated (GiB)': '0.21', 'memory/device_reserved (GiB)': '0.37', 'tokens/train_per_sec_per_gpu': '588.6', 'tokens/total': 123648, 'tokens/trainable': 38206, 'epoch': '1.384'}
 46%|█████████████████████████████████████████████████████████████████████▉                                                                                 | 177/382 [01:09<01:04,  3.18it/s] 47%|██████████████████████████████████████████████████████████████████████▎                                                                                | 178/382 [01:10<01:08,  2.97it/s]                                                                                                                                                                                              {'loss': '3.941', 'grad_norm': '0.7856', 'learning_rate': '0.0001164', 'ppl': '51.45', 'memory/max_active (GiB)': '0.27', 'memory/max_allocated (GiB)': '0.27', 'memory/device_reserved (GiB)': '0.37', 'tokens/train_per_sec_per_gpu': '326.6', 'tokens/total': 124544, 'tokens/trainable': 38489, 'epoch': '1.392'}
 47%|██████████████████████████████████████████████████████████████████████▎                                                                                | 178/382 [01:10<01:08,  2.97it/s] 47%|██████████████████████████████████████████████████████████████████████▊                                                                                | 179/382 [01:10<01:01,  3.29it/s]                                                                                                                                                                                              {'loss': '4.2', 'grad_norm': '1.593', 'learning_rate': '0.0001156', 'ppl': '66.69', 'memory/max_active (GiB)': '0.16', 'memory/max_allocated (GiB)': '0.16', 'memory/device_reserved (GiB)': '0.37', 'tokens/train_per_sec_per_gpu': '115.5', 'tokens/total': 125056, 'tokens/trainable': 38553, 'epoch': '1.4'}
 47%|██████████████████████████████████████████████████████████████████████▊                                                                                | 179/382 [01:10<01:01,  3.29it/s] 47%|███████████████████████████████████████████████████████████████████████▏                                                                               | 180/382 [01:10<01:00,  3.36it/s]                                                                                                                                                                                              {'loss': '4.159', 'grad_norm': '0.8187', 'learning_rate': '0.0001148', 'ppl': '64.01', 'memory/max_active (GiB)': '0.21', 'memory/max_allocated (GiB)': '0.21', 'memory/device_reserved (GiB)': '0.37', 'tokens/train_per_sec_per_gpu': '229.1', 'tokens/total': 125696, 'tokens/trainable': 38803, 'epoch': '1.408'}
 47%|███████████████████████████████████████████████████████████████████████▏                                                                               | 180/382 [01:10<01:00,  3.36it/s] 47%|███████████████████████████████████████████████████████████████████████▌                                                                               | 181/382 [01:10<00:59,  3.40it/s]                                                                                                                                                                                              {'loss': '3.965', 'grad_norm': '0.7317', 'learning_rate': '0.0001139', 'ppl': '52.73', 'memory/max_active (GiB)': '0.21', 'memory/max_allocated (GiB)': '0.21', 'memory/device_reserved (GiB)': '0.37', 'tokens/train_per_sec_per_gpu': '334', 'tokens/total': 126336, 'tokens/trainable': 39039, 'epoch': '1.416'}
 47%|███████████████████████████████████████████████████████████████████████▌                                                                               | 181/382 [01:10<00:59,  3.40it/s] 48%|███████████████████████████████████████████████████████████████████████▉                                                                               | 182/382 [01:11<00:54,  3.65it/s]                                                                                                                                                                                              {'loss': '4.011', 'grad_norm': '1.04', 'learning_rate': '0.0001131', 'ppl': '55.2', 'memory/max_active (GiB)': '0.16', 'memory/max_allocated (GiB)': '0.16', 'memory/device_reserved (GiB)': '0.37', 'tokens/train_per_sec_per_gpu': '337.7', 'tokens/total': 126848, 'tokens/trainable': 39161, 'epoch': '1.424'}
 48%|███████████████████████████████████████████████████████████████████████▉                                                                               | 182/382 [01:11<00:54,  3.65it/s] 48%|████████████████████████████████████████████████████████████████████████▎                                                                              | 183/382 [01:11<01:01,  3.24it/s]                                                                                                                                                                                              {'loss': '3.895', 'grad_norm': '0.655', 'learning_rate': '0.0001122', 'ppl': '49.15', 'memory/max_active (GiB)': '0.27', 'memory/max_allocated (GiB)': '0.27', 'memory/device_reserved (GiB)': '0.37', 'tokens/train_per_sec_per_gpu': '532.5', 'tokens/total': 127744, 'tokens/trainable': 39452, 'epoch': '1.431'}
 48%|████████████████████████████████████████████████████████████████████████▎                                                                              | 183/382 [01:11<01:01,  3.24it/s] 48%|████████████████████████████████████████████████████████████████████████▋                                                                              | 184/382 [01:11<01:02,  3.15it/s]                                                                                                                                                                                              {'loss': '3.439', 'grad_norm': '0.6263', 'learning_rate': '0.0001114', 'ppl': '31.14', 'memory/max_active (GiB)': '0.21', 'memory/max_allocated (GiB)': '0.21', 'memory/device_reserved (GiB)': '0.37', 'tokens/train_per_sec_per_gpu': '424.4', 'tokens/total': 128512, 'tokens/trainable': 39757, 'epoch': '1.439'}
 48%|████████████████████████████████████████████████████████████████████████▋                                                                              | 184/382 [01:11<01:02,  3.15it/s] 48%|█████████████████████████████████████████████████████████████████████████▏                                                                             | 185/382 [01:12<01:00,  3.26it/s]                                                                                                                                                                                              {'loss': '4.002', 'grad_norm': '1', 'learning_rate': '0.0001106', 'ppl': '54.71', 'memory/max_active (GiB)': '0.21', 'memory/max_allocated (GiB)': '0.21', 'memory/device_reserved (GiB)': '0.37', 'tokens/train_per_sec_per_gpu': '314.6', 'tokens/total': 129152, 'tokens/trainable': 39925, 'epoch': '1.447'}
 48%|█████████████████████████████████████████████████████████████████████████▏                                                                             | 185/382 [01:12<01:00,  3.26it/s] 49%|█████████████████████████████████████████████████████████████████████████▌                                                                             | 186/382 [01:12<01:01,  3.17it/s]                                                                                                                                                                                              {'loss': '3.214', 'grad_norm': '0.7704', 'learning_rate': '0.0001097', 'ppl': '24.87', 'memory/max_active (GiB)': '0.21', 'memory/max_allocated (GiB)': '0.21', 'memory/device_reserved (GiB)': '0.37', 'tokens/train_per_sec_per_gpu': '395.3', 'tokens/total': 129920, 'tokens/trainable': 40240, 'epoch': '1.455'}
 49%|█████████████████████████████████████████████████████████████████████████▌                                                                             | 186/382 [01:12<01:01,  3.17it/s] 49%|█████████████████████████████████████████████████████████████████████████▉                                                                             | 187/382 [01:12<00:56,  3.46it/s]                                                                                                                                                                                              {'loss': '4.854', 'grad_norm': '1.584', 'learning_rate': '0.0001089', 'ppl': '128.2', 'memory/max_active (GiB)': '0.16', 'memory/max_allocated (GiB)': '0.16', 'memory/device_reserved (GiB)': '0.37', 'tokens/train_per_sec_per_gpu': '297.3', 'tokens/total': 130432, 'tokens/trainable': 40322, 'epoch': '1.463'}
 49%|█████████████████████████████████████████████████████████████████████████▉                                                                             | 187/382 [01:12<00:56,  3.46it/s] 49%|██████████████████████████████████████████████████████████████████████████▎                                                                            | 188/382 [01:13<00:58,  3.30it/s]                                                                                                                                                                                              {'loss': '3.546', 'grad_norm': '0.7946', 'learning_rate': '0.000108', 'ppl': '34.69', 'memory/max_active (GiB)': '0.27', 'memory/max_allocated (GiB)': '0.27', 'memory/device_reserved (GiB)': '0.37', 'tokens/train_per_sec_per_gpu': '244.9', 'tokens/total': 131200, 'tokens/trainable': 40635, 'epoch': '1.471'}
 49%|██████████████████████████████████████████████████████████████████████████▎                                                                            | 188/382 [01:13<00:58,  3.30it/s] 49%|██████████████████████████████████████████████████████████████████████████▋                                                                            | 189/382 [01:13<01:00,  3.20it/s]                                                                                                                                                                                              {'loss': '3.853', 'grad_norm': '0.6695', 'learning_rate': '0.0001072', 'ppl': '47.13', 'memory/max_active (GiB)': '0.21', 'memory/max_allocated (GiB)': '0.21', 'memory/device_reserved (GiB)': '0.37', 'tokens/train_per_sec_per_gpu': '413.3', 'tokens/total': 131968, 'tokens/trainable': 40871, 'epoch': '1.478'}
 49%|██████████████████████████████████████████████████████████████████████████▋                                                                            | 189/382 [01:13<01:00,  3.20it/s] 50%|███████████████████████████████████████████████████████████████████████████                                                                            | 190/382 [01:13<01:01,  3.12it/s]                                                                                                                                                                                              {'loss': '3.341', 'grad_norm': '0.5896', 'learning_rate': '0.0001063', 'ppl': '28.23', 'memory/max_active (GiB)': '0.21', 'memory/max_allocated (GiB)': '0.21', 'memory/device_reserved (GiB)': '0.37', 'tokens/train_per_sec_per_gpu': '585.8', 'tokens/total': 132736, 'tokens/trainable': 41224, 'epoch': '1.486'}
 50%|███████████████████████████████████████████████████████████████████████████                                                                            | 190/382 [01:13<01:01,  3.12it/s] 50%|███████████████████████████████████████████████████████████████████████████▌                                                                           | 191/382 [01:14<01:05,  2.94it/s]                                                                                                                                                                                              {'loss': '3.719', 'grad_norm': '0.7028', 'learning_rate': '0.0001055', 'ppl': '41.23', 'memory/max_active (GiB)': '0.27', 'memory/max_allocated (GiB)': '0.27', 'memory/device_reserved (GiB)': '0.37', 'tokens/train_per_sec_per_gpu': '503.2', 'tokens/total': 133632, 'tokens/trainable': 41605, 'epoch': '1.494'}
 50%|███████████████████████████████████████████████████████████████████████████▌                                                                           | 191/382 [01:14<01:05,  2.94it/s] 50%|███████████████████████████████████████████████████████████████████████████▉                                                                           | 192/382 [01:14<01:01,  3.09it/s]                                                                                                                                                                                              {'loss': '4.456', 'grad_norm': '0.7975', 'learning_rate': '0.0001047', 'ppl': '86.18', 'memory/max_active (GiB)': '0.21', 'memory/max_allocated (GiB)': '0.21', 'memory/device_reserved (GiB)': '0.37', 'tokens/train_per_sec_per_gpu': '174.8', 'tokens/total': 134272, 'tokens/trainable': 41807, 'epoch': '1.502'}
 50%|███████████████████████████████████████████████████████████████████████████▉                                                                           | 192/382 [01:14<01:01,  3.09it/s] 51%|████████████████████████████████████████████████████████████████████████████▎                                                                          | 193/382 [01:14<01:01,  3.05it/s]                                                                                                                                                                                              {'loss': '3.656', 'grad_norm': '0.6768', 'learning_rate': '0.0001038', 'ppl': '38.71', 'memory/max_active (GiB)': '0.21', 'memory/max_allocated (GiB)': '0.21', 'memory/device_reserved (GiB)': '0.37', 'tokens/train_per_sec_per_gpu': '456.8', 'tokens/total': 135040, 'tokens/trainable': 42063, 'epoch': '1.51'}
 51%|████████████████████████████████████████████████████████████████████████████▎                                                                          | 193/382 [01:14<01:01,  3.05it/s] 51%|████████████████████████████████████████████████████████████████████████████▋                                                                          | 194/382 [01:14<00:56,  3.35it/s]                                                                                                                                                                                              {'loss': '3.641', 'grad_norm': '1.365', 'learning_rate': '0.000103', 'ppl': '38.14', 'memory/max_active (GiB)': '0.16', 'memory/max_allocated (GiB)': '0.16', 'memory/device_reserved (GiB)': '0.37', 'tokens/train_per_sec_per_gpu': '225.5', 'tokens/total': 135552, 'tokens/trainable': 42150, 'epoch': '1.518'}
 51%|████████████████████████████████████████████████████████████████████████████▋                                                                          | 194/382 [01:14<00:56,  3.35it/s] 51%|█████████████████████████████████████████████████████████████████████████████                                                                          | 195/382 [01:15<00:51,  3.61it/s]                                                                                                                                                                                              {'loss': '4.311', 'grad_norm': '0.9933', 'learning_rate': '0.0001021', 'ppl': '74.54', 'memory/max_active (GiB)': '0.16', 'memory/max_allocated (GiB)': '0.16', 'memory/device_reserved (GiB)': '0.37', 'tokens/train_per_sec_per_gpu': '314.7', 'tokens/total': 136064, 'tokens/trainable': 42305, 'epoch': '1.525'}
 51%|█████████████████████████████████████████████████████████████████████████████                                                                          | 195/382 [01:15<00:51,  3.61it/s] 51%|█████████████████████████████████████████████████████████████████████████████▍                                                                         | 196/382 [01:15<00:54,  3.39it/s]                                                                                                                                                                                              {'loss': '4.166', 'grad_norm': '0.8887', 'learning_rate': '0.0001013', 'ppl': '64.48', 'memory/max_active (GiB)': '0.27', 'memory/max_allocated (GiB)': '0.27', 'memory/device_reserved (GiB)': '0.37', 'tokens/train_per_sec_per_gpu': '474.3', 'tokens/total': 136832, 'tokens/trainable': 42520, 'epoch': '1.533'}
 51%|█████████████████████████████████████████████████████████████████████████████▍                                                                         | 196/382 [01:15<00:54,  3.39it/s] 52%|█████████████████████████████████████████████████████████████████████████████▊                                                                         | 197/382 [01:15<00:53,  3.44it/s]                                                                                                                                                                                              {'loss': '3.699', 'grad_norm': '0.7978', 'learning_rate': '0.0001004', 'ppl': '40.39', 'memory/max_active (GiB)': '0.21', 'memory/max_allocated (GiB)': '0.21', 'memory/device_reserved (GiB)': '0.37', 'tokens/train_per_sec_per_gpu': '142.7', 'tokens/total': 137472, 'tokens/trainable': 42700, 'epoch': '1.541'}
 52%|█████████████████████████████████████████████████████████████████████████████▊                                                                         | 197/382 [01:15<00:53,  3.44it/s] 52%|██████████████████████████████████████████████████████████████████████████████▎                                                                        | 198/382 [01:16<00:53,  3.46it/s]                                                                                                                                                                                              {'loss': '3.825', 'grad_norm': '0.8202', 'learning_rate': '9.958e-05', 'ppl': '45.81', 'memory/max_active (GiB)': '0.21', 'memory/max_allocated (GiB)': '0.21', 'memory/device_reserved (GiB)': '0.37', 'tokens/train_per_sec_per_gpu': '513.7', 'tokens/total': 138112, 'tokens/trainable': 42932, 'epoch': '1.549'}
 52%|██████████████████████████████████████████████████████████████████████████████▎                                                                        | 198/382 [01:16<00:53,  3.46it/s] 52%|██████████████████████████████████████████████████████████████████████████████▋                                                                        | 199/382 [01:16<00:49,  3.67it/s]                                                                                                                                                                                              {'loss': '3.544', 'grad_norm': '0.8667', 'learning_rate': '9.873e-05', 'ppl': '34.6', 'memory/max_active (GiB)': '0.21', 'memory/max_allocated (GiB)': '0.21', 'memory/device_reserved (GiB)': '0.37', 'tokens/train_per_sec_per_gpu': '612.5', 'tokens/total': 138624, 'tokens/trainable': 43084, 'epoch': '1.557'}
 52%|██████████████████████████████████████████████████████████████████████████████▋                                                                        | 199/382 [01:16<00:49,  3.67it/s] 52%|███████████████████████████████████████████████████████████████████████████████                                                                        | 200/382 [01:16<00:52,  3.44it/s]                                                                                                                                                                                              {'loss': '3.803', 'grad_norm': '0.8771', 'learning_rate': '9.788e-05', 'ppl': '44.83', 'memory/max_active (GiB)': '0.27', 'memory/max_allocated (GiB)': '0.27', 'memory/device_reserved (GiB)': '0.37', 'tokens/train_per_sec_per_gpu': '217.5', 'tokens/total': 139392, 'tokens/trainable': 43305, 'epoch': '1.565'}
 52%|███████████████████████████████████████████████████████████████████████████████                                                                        | 200/382 [01:16<00:52,  3.44it/s][2026-03-23 06:25:09,434] [INFO] [axolotl.core.trainers.base.evaluate:400] [PID:323] Running evaluation step...

  0%|                                                                                                                                                                 | 0/110 [00:00<?, ?it/s][A
  3%|████▏                                                                                                                                                    | 3/110 [00:00<00:05, 18.34it/s][A
  5%|██████▉                                                                                                                                                  | 5/110 [00:00<00:06, 15.81it/s][A
  6%|█████████▋                                                                                                                                               | 7/110 [00:00<00:06, 14.95it/s][A
  8%|████████████▌                                                                                                                                            | 9/110 [00:00<00:06, 14.53it/s][A
 10%|███████████████▏                                                                                                                                        | 11/110 [00:00<00:06, 14.28it/s][A
 12%|█████████████████▉                                                                                                                                      | 13/110 [00:00<00:06, 14.95it/s][A
 14%|████████████████████▋                                                                                                                                   | 15/110 [00:01<00:06, 14.59it/s][A
 15%|███████████████████████▍                                                                                                                                | 17/110 [00:01<00:06, 14.34it/s][A
 17%|██████████████████████████▎                                                                                                                             | 19/110 [00:01<00:06, 14.96it/s][A
 19%|█████████████████████████████                                                                                                                           | 21/110 [00:01<00:06, 14.60it/s][A
 21%|███████████████████████████████▊                                                                                                                        | 23/110 [00:01<00:06, 14.36it/s][A
 23%|██████████████████████████████████▌                                                                                                                     | 25/110 [00:01<00:05, 14.19it/s][A
 25%|█████████████████████████████████████▎                                                                                                                  | 27/110 [00:01<00:05, 14.07it/s][A
 26%|████████████████████████████████████████                                                                                                                | 29/110 [00:01<00:05, 14.01it/s][A
 29%|████████████████████████████████████████████▏                                                                                                           | 32/110 [00:02<00:05, 15.29it/s][A
 31%|██████████████████████████████████████████████▉                                                                                                         | 34/110 [00:02<00:05, 14.22it/s][A
 33%|█████████████████████████████████████████████████▋                                                                                                      | 36/110 [00:02<00:04, 14.81it/s][A
 35%|█████████████████████████████████████████████████████▉                                                                                                  | 39/110 [00:02<00:04, 15.81it/s][A
 37%|████████████████████████████████████████████████████████▋                                                                                               | 41/110 [00:02<00:04, 16.01it/s][A
 39%|███████████████████████████████████████████████████████████▍                                                                                            | 43/110 [00:02<00:04, 16.17it/s][A
 41%|██████████████████████████████████████████████████████████████▏                                                                                         | 45/110 [00:03<00:04, 15.44it/s][A
 43%|████████████████████████████████████████████████████████████████▉                                                                                       | 47/110 [00:03<00:03, 15.77it/s][A
 45%|███████████████████████████████████████████████████████████████████▋                                                                                    | 49/110 [00:03<00:03, 16.01it/s][A
 46%|██████████████████████████████████████████████████████████████████████▍                                                                                 | 51/110 [00:03<00:03, 17.00it/s][A
 48%|█████████████████████████████████████████████████████████████████████████▏                                                                              | 53/110 [00:03<00:03, 16.88it/s][A
 50%|████████████████████████████████████████████████████████████████████████████                                                                            | 55/110 [00:03<00:03, 16.80it/s][A
 52%|██████████████████████████████████████████████████████████████████████████████▊                                                                         | 57/110 [00:03<00:03, 15.79it/s][A
 54%|█████████████████████████████████████████████████████████████████████████████████▌                                                                      | 59/110 [00:03<00:03, 16.04it/s][A
 55%|████████████████████████████████████████████████████████████████████████████████████▎                                                                   | 61/110 [00:04<00:03, 14.55it/s][A
 57%|███████████████████████████████████████████████████████████████████████████████████████                                                                 | 63/110 [00:04<00:03, 13.66it/s][A
 60%|███████████████████████████████████████████████████████████████████████████████████████████▏                                                            | 66/110 [00:04<00:02, 15.78it/s][A
 62%|█████████████████████████████████████████████████████████████████████████████████████████████▉                                                          | 68/110 [00:04<00:02, 15.21it/s][A
 64%|████████████████████████████████████████████████████████████████████████████████████████████████▋                                                       | 70/110 [00:04<00:02, 15.58it/s][A
 65%|███████████████████████████████████████████████████████████████████████████████████████████████████▍                                                    | 72/110 [00:04<00:02, 15.87it/s][A
 67%|██████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                 | 74/110 [00:04<00:02, 16.05it/s][A
 69%|█████████████████████████████████████████████████████████████████████████████████████████████████████████                                               | 76/110 [00:04<00:02, 16.22it/s][A
 71%|███████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                            | 78/110 [00:05<00:01, 16.32it/s][A
 73%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                         | 80/110 [00:05<00:01, 15.52it/s][A
 75%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                     | 83/110 [00:05<00:01, 17.21it/s][A
 77%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                  | 85/110 [00:05<00:01, 17.02it/s][A
 79%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                               | 87/110 [00:05<00:01, 16.01it/s][A
 82%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                           | 90/110 [00:05<00:01, 16.65it/s][A
 84%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                        | 92/110 [00:05<00:01, 16.64it/s][A
 85%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                      | 94/110 [00:06<00:01, 15.77it/s][A
 87%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                   | 96/110 [00:06<00:00, 15.17it/s][A
 89%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                | 98/110 [00:06<00:00, 14.76it/s][A
 92%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋            | 101/110 [00:06<00:00, 16.57it/s][A
 94%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍         | 103/110 [00:06<00:00, 15.02it/s][A
 95%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏      | 105/110 [00:06<00:00, 14.68it/s][A
 97%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉    | 107/110 [00:06<00:00, 14.48it/s][A
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 110/110 [00:07<00:00, 15.88it/s][A                                                                                                                                                                                              
                                                                                                                                                                                              [A{'eval_loss': '3.749', 'eval_runtime': '7.208', 'eval_samples_per_second': '30.38', 'eval_steps_per_second': '15.26', 'eval_ppl': '42.46', 'memory/max_active (GiB)': '0.14', 'memory/max_allocated (GiB)': '0.14', 'memory/device_reserved (GiB)': '0.37', 'epoch': '1.565', 'tokens/train_per_sec_per_gpu': '0'}
 52%|███████████████████████████████████████████████████████████████████████████████                                                                        | 200/382 [01:23<00:52,  3.44it/s]
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 110/110 [00:07<00:00, 15.88it/s][A
                                                                                                                                                                                              [A[2026-03-23 06:25:16,652] [INFO] [axolotl.core.trainers.base._save:721] [PID:323] Saving model checkpoint to ./final_model/checkpoint-200
 53%|███████████████████████████████████████████████████████████████████████████████▍                                                                       | 201/382 [01:34<17:08,  5.68s/it]                                                                                                                                                                                              {'loss': '3.93', 'grad_norm': '0.735', 'learning_rate': '9.704e-05', 'ppl': '50.92', 'memory/max_active (GiB)': '0.22', 'memory/max_allocated (GiB)': '0.22', 'memory/device_reserved (GiB)': '0.24', 'tokens/train_per_sec_per_gpu': '330.9', 'tokens/total': 140160, 'tokens/trainable': 43574, 'epoch': '1.573'}
 53%|███████████████████████████████████████████████████████████████████████████████▍                                                                       | 201/382 [01:34<17:08,  5.68s/it] 53%|███████████████████████████████████████████████████████████████████████████████▊                                                                       | 202/382 [01:35<12:08,  4.05s/it]                                                                                                                                                                                              {'loss': '3.469', 'grad_norm': '0.9033', 'learning_rate': '9.619e-05', 'ppl': '32.1', 'memory/max_active (GiB)': '0.16', 'memory/max_allocated (GiB)': '0.16', 'memory/device_reserved (GiB)': '0.25', 'tokens/train_per_sec_per_gpu': '421.5', 'tokens/total': 140672, 'tokens/trainable': 43767, 'epoch': '1.58'}
 53%|███████████████████████████████████████████████████████████████████████████████▊                                                                       | 202/382 [01:35<12:08,  4.05s/it] 53%|████████████████████████████████████████████████████████████████████████████████▏                                                                      | 203/382 [01:35<08:42,  2.92s/it]                                                                                                                                                                                              {'loss': '3.789', 'grad_norm': '0.8384', 'learning_rate': '9.534e-05', 'ppl': '44.2', 'memory/max_active (GiB)': '0.21', 'memory/max_allocated (GiB)': '0.21', 'memory/device_reserved (GiB)': '0.25', 'tokens/train_per_sec_per_gpu': '203.8', 'tokens/total': 141312, 'tokens/trainable': 43959, 'epoch': '1.588'}
 53%|████████████████████████████████████████████████████████████████████████████████▏                                                                      | 203/382 [01:35<08:42,  2.92s/it] 53%|████████████████████████████████████████████████████████████████████████████████▋                                                                      | 204/382 [01:35<06:24,  2.16s/it]                                                                                                                                                                                              {'loss': '3.441', 'grad_norm': '0.8443', 'learning_rate': '9.45e-05', 'ppl': '31.21', 'memory/max_active (GiB)': '0.27', 'memory/max_allocated (GiB)': '0.27', 'memory/device_reserved (GiB)': '0.36', 'tokens/train_per_sec_per_gpu': '499.1', 'tokens/total': 142208, 'tokens/trainable': 44293, 'epoch': '1.596'}
 53%|████████████████████████████████████████████████████████████████████████████████▋                                                                      | 204/382 [01:35<06:24,  2.16s/it] 54%|█████████████████████████████████████████████████████████████████████████████████                                                                      | 205/382 [01:36<04:45,  1.61s/it]                                                                                                                                                                                              {'loss': '4.233', 'grad_norm': '1.049', 'learning_rate': '9.365e-05', 'ppl': '68.94', 'memory/max_active (GiB)': '0.22', 'memory/max_allocated (GiB)': '0.22', 'memory/device_reserved (GiB)': '0.36', 'tokens/train_per_sec_per_gpu': '209.3', 'tokens/total': 142976, 'tokens/trainable': 44455, 'epoch': '1.604'}
 54%|█████████████████████████████████████████████████████████████████████████████████                                                                      | 205/382 [01:36<04:45,  1.61s/it] 54%|█████████████████████████████████████████████████████████████████████████████████▍                                                                     | 206/382 [01:36<03:33,  1.21s/it]                                                                                                                                                                                              {'loss': '3.301', 'grad_norm': '0.9992', 'learning_rate': '9.281e-05', 'ppl': '27.14', 'memory/max_active (GiB)': '0.22', 'memory/max_allocated (GiB)': '0.22', 'memory/device_reserved (GiB)': '0.36', 'tokens/train_per_sec_per_gpu': '260.5', 'tokens/total': 143616, 'tokens/trainable': 44589, 'epoch': '1.612'}
 54%|█████████████████████████████████████████████████████████████████████████████████▍                                                                     | 206/382 [01:36<03:33,  1.21s/it] 54%|█████████████████████████████████████████████████████████████████████████████████▊                                                                     | 207/382 [01:36<02:46,  1.05it/s]                                                                                                                                                                                              {'loss': '3.619', 'grad_norm': '0.7493', 'learning_rate': '9.196e-05', 'ppl': '37.3', 'memory/max_active (GiB)': '0.22', 'memory/max_allocated (GiB)': '0.22', 'memory/device_reserved (GiB)': '0.36', 'tokens/train_per_sec_per_gpu': '242.3', 'tokens/total': 144384, 'tokens/trainable': 44761, 'epoch': '1.62'}
 54%|█████████████████████████████████████████████████████████████████████████████████▊                                                                     | 207/382 [01:36<02:46,  1.05it/s] 54%|██████████████████████████████████████████████████████████████████████████████████▏                                                                    | 208/382 [01:37<02:10,  1.33it/s]                                                                                                                                                                                              {'loss': '3.946', 'grad_norm': '0.7331', 'learning_rate': '9.112e-05', 'ppl': '51.72', 'memory/max_active (GiB)': '0.21', 'memory/max_allocated (GiB)': '0.21', 'memory/device_reserved (GiB)': '0.36', 'tokens/train_per_sec_per_gpu': '303.9', 'tokens/total': 145024, 'tokens/trainable': 44985, 'epoch': '1.627'}
 54%|██████████████████████████████████████████████████████████████████████████████████▏                                                                    | 208/382 [01:37<02:10,  1.33it/s] 55%|██████████████████████████████████████████████████████████████████████████████████▌                                                                    | 209/382 [01:37<01:50,  1.56it/s]                                                                                                                                                                                              {'loss': '3.754', 'grad_norm': '0.6662', 'learning_rate': '9.028e-05', 'ppl': '42.69', 'memory/max_active (GiB)': '0.27', 'memory/max_allocated (GiB)': '0.27', 'memory/device_reserved (GiB)': '0.36', 'tokens/train_per_sec_per_gpu': '405', 'tokens/total': 145920, 'tokens/trainable': 45302, 'epoch': '1.635'}
 55%|██████████████████████████████████████████████████████████████████████████████████▌                                                                    | 209/382 [01:37<01:50,  1.56it/s] 55%|███████████████████████████████████████████████████████████████████████████████████                                                                    | 210/382 [01:37<01:29,  1.93it/s]                                                                                                                                                                                              {'loss': '3.838', 'grad_norm': '0.9333', 'learning_rate': '8.943e-05', 'ppl': '46.44', 'memory/max_active (GiB)': '0.16', 'memory/max_allocated (GiB)': '0.16', 'memory/device_reserved (GiB)': '0.36', 'tokens/train_per_sec_per_gpu': '394.8', 'tokens/total': 146432, 'tokens/trainable': 45499, 'epoch': '1.643'}
 55%|███████████████████████████████████████████████████████████████████████████████████                                                                    | 210/382 [01:37<01:29,  1.93it/s] 55%|███████████████████████████████████████████████████████████████████████████████████▍                                                                   | 211/382 [01:38<01:19,  2.14it/s]                                                                                                                                                                                              {'loss': '3.495', 'grad_norm': '0.831', 'learning_rate': '8.859e-05', 'ppl': '32.94', 'memory/max_active (GiB)': '0.22', 'memory/max_allocated (GiB)': '0.22', 'memory/device_reserved (GiB)': '0.36', 'tokens/train_per_sec_per_gpu': '307.9', 'tokens/total': 147200, 'tokens/trainable': 45697, 'epoch': '1.651'}
 55%|███████████████████████████████████████████████████████████████████████████████████▍                                                                   | 211/382 [01:38<01:19,  2.14it/s] 55%|███████████████████████████████████████████████████████████████████████████████████▊                                                                   | 212/382 [01:38<01:07,  2.53it/s]                                                                                                                                                                                              {'loss': '3.405', 'grad_norm': '1.08', 'learning_rate': '8.775e-05', 'ppl': '30.12', 'memory/max_active (GiB)': '0.16', 'memory/max_allocated (GiB)': '0.16', 'memory/device_reserved (GiB)': '0.36', 'tokens/train_per_sec_per_gpu': '308.9', 'tokens/total': 147712, 'tokens/trainable': 45819, 'epoch': '1.659'}
 55%|███████████████████████████████████████████████████████████████████████████████████▊                                                                   | 212/382 [01:38<01:07,  2.53it/s] 56%|████████████████████████████████████████████████████████████████████████████████████▏                                                                  | 213/382 [01:38<01:01,  2.76it/s]                                                                                                                                                                                              {'loss': '4.167', 'grad_norm': '1.228', 'learning_rate': '8.691e-05', 'ppl': '64.55', 'memory/max_active (GiB)': '0.22', 'memory/max_allocated (GiB)': '0.22', 'memory/device_reserved (GiB)': '0.36', 'tokens/train_per_sec_per_gpu': '249.5', 'tokens/total': 148352, 'tokens/trainable': 45956, 'epoch': '1.667'}
 56%|████████████████████████████████████████████████████████████████████████████████████▏                                                                  | 213/382 [01:38<01:01,  2.76it/s] 56%|████████████████████████████████████████████████████████████████████████████████████▌                                                                  | 214/382 [01:38<00:59,  2.82it/s]                                                                                                                                                                                              {'loss': '3.42', 'grad_norm': '0.9593', 'learning_rate': '8.607e-05', 'ppl': '30.57', 'memory/max_active (GiB)': '0.22', 'memory/max_allocated (GiB)': '0.22', 'memory/device_reserved (GiB)': '0.36', 'tokens/train_per_sec_per_gpu': '370.7', 'tokens/total': 149120, 'tokens/trainable': 46168, 'epoch': '1.675'}
 56%|████████████████████████████████████████████████████████████████████████████████████▌                                                                  | 214/382 [01:38<00:59,  2.82it/s] 56%|████████████████████████████████████████████████████████████████████████████████████▉                                                                  | 215/382 [01:39<00:58,  2.86it/s]                                                                                                                                                                                              {'loss': '4.583', 'grad_norm': '0.9577', 'learning_rate': '8.524e-05', 'ppl': '97.83', 'memory/max_active (GiB)': '0.22', 'memory/max_allocated (GiB)': '0.22', 'memory/device_reserved (GiB)': '0.36', 'tokens/train_per_sec_per_gpu': '176.3', 'tokens/total': 149888, 'tokens/trainable': 46342, 'epoch': '1.682'}
 56%|████████████████████████████████████████████████████████████████████████████████████▉                                                                  | 215/382 [01:39<00:58,  2.86it/s] 57%|█████████████████████████████████████████████████████████████████████████████████████▍                                                                 | 216/382 [01:39<01:02,  2.66it/s]                                                                                                                                                                                              {'loss': '3.166', 'grad_norm': '0.5692', 'learning_rate': '8.44e-05', 'ppl': '23.71', 'memory/max_active (GiB)': '0.27', 'memory/max_allocated (GiB)': '0.27', 'memory/device_reserved (GiB)': '0.36', 'tokens/train_per_sec_per_gpu': '488.2', 'tokens/total': 150912, 'tokens/trainable': 46710, 'epoch': '1.69'}
 57%|█████████████████████████████████████████████████████████████████████████████████████▍                                                                 | 216/382 [01:39<01:02,  2.66it/s] 57%|█████████████████████████████████████████████████████████████████████████████████████▊                                                                 | 217/382 [01:39<00:57,  2.87it/s]                                                                                                                                                                                              {'loss': '3.542', 'grad_norm': '1.463', 'learning_rate': '8.356e-05', 'ppl': '34.53', 'memory/max_active (GiB)': '0.22', 'memory/max_allocated (GiB)': '0.22', 'memory/device_reserved (GiB)': '0.36', 'tokens/train_per_sec_per_gpu': '364.6', 'tokens/total': 151552, 'tokens/trainable': 46859, 'epoch': '1.698'}
 57%|█████████████████████████████████████████████████████████████████████████████████████▊                                                                 | 217/382 [01:39<00:57,  2.87it/s] 57%|██████████████████████████████████████████████████████████████████████████████████████▏                                                                | 218/382 [01:40<00:53,  3.04it/s]                                                                                                                                                                                              {'loss': '3.811', 'grad_norm': '0.8724', 'learning_rate': '8.273e-05', 'ppl': '45.21', 'memory/max_active (GiB)': '0.21', 'memory/max_allocated (GiB)': '0.21', 'memory/device_reserved (GiB)': '0.36', 'tokens/train_per_sec_per_gpu': '225.2', 'tokens/total': 152192, 'tokens/trainable': 47072, 'epoch': '1.706'}
 57%|██████████████████████████████████████████████████████████████████████████████████████▏                                                                | 218/382 [01:40<00:53,  3.04it/s] 57%|██████████████████████████████████████████████████████████████████████████████████████▌                                                                | 219/382 [01:40<00:48,  3.35it/s]                                                                                                                                                                                              {'loss': '4.252', 'grad_norm': '1.401', 'learning_rate': '8.189e-05', 'ppl': '70.28', 'memory/max_active (GiB)': '0.16', 'memory/max_allocated (GiB)': '0.16', 'memory/device_reserved (GiB)': '0.36', 'tokens/train_per_sec_per_gpu': '212.5', 'tokens/total': 152704, 'tokens/trainable': 47174, 'epoch': '1.714'}
 57%|██████████████████████████████████████████████████████████████████████████████████████▌                                                                | 219/382 [01:40<00:48,  3.35it/s] 58%|██████████████████████████████████████████████████████████████████████████████████████▉                                                                | 220/382 [01:40<00:50,  3.23it/s]                                                                                                                                                                                              {'loss': '3.26', 'grad_norm': '0.7367', 'learning_rate': '8.106e-05', 'ppl': '26.06', 'memory/max_active (GiB)': '0.27', 'memory/max_allocated (GiB)': '0.27', 'memory/device_reserved (GiB)': '0.36', 'tokens/train_per_sec_per_gpu': '217.8', 'tokens/total': 153472, 'tokens/trainable': 47431, 'epoch': '1.722'}
 58%|██████████████████████████████████████████████████████████████████████████████████████▉                                                                | 220/382 [01:40<00:50,  3.23it/s] 58%|███████████████████████████████████████████████████████████████████████████████████████▎                                                               | 221/382 [01:41<00:48,  3.32it/s]                                                                                                                                                                                              {'loss': '3.676', 'grad_norm': '0.8766', 'learning_rate': '8.023e-05', 'ppl': '39.48', 'memory/max_active (GiB)': '0.21', 'memory/max_allocated (GiB)': '0.21', 'memory/device_reserved (GiB)': '0.36', 'tokens/train_per_sec_per_gpu': '296.3', 'tokens/total': 154112, 'tokens/trainable': 47633, 'epoch': '1.729'}
 58%|███████████████████████████████████████████████████████████████████████████████████████▎                                                               | 221/382 [01:41<00:48,  3.32it/s] 58%|███████████████████████████████████████████████████████████████████████████████████████▊                                                               | 222/382 [01:41<00:47,  3.38it/s]                                                                                                                                                                                              {'loss': '3.674', 'grad_norm': '0.8514', 'learning_rate': '7.94e-05', 'ppl': '39.41', 'memory/max_active (GiB)': '0.21', 'memory/max_allocated (GiB)': '0.21', 'memory/device_reserved (GiB)': '0.36', 'tokens/train_per_sec_per_gpu': '160.5', 'tokens/total': 154752, 'tokens/trainable': 47832, 'epoch': '1.737'}
 58%|███████████████████████████████████████████████████████████████████████████████████████▊                                                               | 222/382 [01:41<00:47,  3.38it/s] 58%|████████████████████████████████████████████████████████████████████████████████████████▏                                                              | 223/382 [01:41<00:46,  3.42it/s]                                                                                                                                                                                              {'loss': '3.324', 'grad_norm': '1.133', 'learning_rate': '7.857e-05', 'ppl': '27.76', 'memory/max_active (GiB)': '0.21', 'memory/max_allocated (GiB)': '0.21', 'memory/device_reserved (GiB)': '0.36', 'tokens/train_per_sec_per_gpu': '299.3', 'tokens/total': 155392, 'tokens/trainable': 48035, 'epoch': '1.745'}
 58%|████████████████████████████████████████████████████████████████████████████████████████▏                                                              | 223/382 [01:41<00:46,  3.42it/s] 59%|████████████████████████████████████████████████████████████████████████████████████████▌                                                              | 224/382 [01:41<00:48,  3.27it/s]                                                                                                                                                                                              {'loss': '3.742', 'grad_norm': '0.7458', 'learning_rate': '7.775e-05', 'ppl': '42.16', 'memory/max_active (GiB)': '0.22', 'memory/max_allocated (GiB)': '0.22', 'memory/device_reserved (GiB)': '0.36', 'tokens/train_per_sec_per_gpu': '278', 'tokens/total': 156160, 'tokens/trainable': 48302, 'epoch': '1.753'}
 59%|████████████████████████████████████████████████████████████████████████████████████████▌                                                              | 224/382 [01:41<00:48,  3.27it/s] 59%|████████████████████████████████████████████████████████████████████████████████████████▉                                                              | 225/382 [01:42<00:46,  3.35it/s]                                                                                                                                                                                              {'loss': '2.54', 'grad_norm': '0.9569', 'learning_rate': '7.692e-05', 'ppl': '12.69', 'memory/max_active (GiB)': '0.22', 'memory/max_allocated (GiB)': '0.22', 'memory/device_reserved (GiB)': '0.36', 'tokens/train_per_sec_per_gpu': '371', 'tokens/total': 156800, 'tokens/trainable': 48487, 'epoch': '1.761'}
 59%|████████████████████████████████████████████████████████████████████████████████████████▉                                                              | 225/382 [01:42<00:46,  3.35it/s] 59%|█████████████████████████████████████████████████████████████████████████████████████████▎                                                             | 226/382 [01:42<00:45,  3.40it/s]                                                                                                                                                                                              {'loss': '4.015', 'grad_norm': '1.035', 'learning_rate': '7.61e-05', 'ppl': '55.43', 'memory/max_active (GiB)': '0.22', 'memory/max_allocated (GiB)': '0.22', 'memory/device_reserved (GiB)': '0.36', 'tokens/train_per_sec_per_gpu': '642.8', 'tokens/total': 157440, 'tokens/trainable': 48700, 'epoch': '1.769'}
 59%|█████████████████████████████████████████████████████████████████████████████████████████▎                                                             | 226/382 [01:42<00:45,  3.40it/s] 59%|█████████████████████████████████████████████████████████████████████████████████████████▋                                                             | 227/382 [01:42<00:47,  3.24it/s]                                                                                                                                                                                              {'loss': '3.788', 'grad_norm': '0.8082', 'learning_rate': '7.528e-05', 'ppl': '44.17', 'memory/max_active (GiB)': '0.22', 'memory/max_allocated (GiB)': '0.22', 'memory/device_reserved (GiB)': '0.36', 'tokens/train_per_sec_per_gpu': '266.3', 'tokens/total': 158208, 'tokens/trainable': 48910, 'epoch': '1.776'}
 59%|█████████████████████████████████████████████████████████████████████████████████████████▋                                                             | 227/382 [01:42<00:47,  3.24it/s] 60%|██████████████████████████████████████████████████████████████████████████████████████████▏                                                            | 228/382 [01:43<00:46,  3.32it/s]                                                                                                                                                                                              {'loss': '3.372', 'grad_norm': '0.9381', 'learning_rate': '7.446e-05', 'ppl': '29.15', 'memory/max_active (GiB)': '0.21', 'memory/max_allocated (GiB)': '0.21', 'memory/device_reserved (GiB)': '0.36', 'tokens/train_per_sec_per_gpu': '132.1', 'tokens/total': 158848, 'tokens/trainable': 49072, 'epoch': '1.784'}
 60%|██████████████████████████████████████████████████████████████████████████████████████████▏                                                            | 228/382 [01:43<00:46,  3.32it/s] 60%|██████████████████████████████████████████████████████████████████████████████████████████▌                                                            | 229/382 [01:43<00:47,  3.22it/s]                                                                                                                                                                                              {'loss': '2.622', 'grad_norm': '0.9714', 'learning_rate': '7.364e-05', 'ppl': '13.76', 'memory/max_active (GiB)': '0.27', 'memory/max_allocated (GiB)': '0.27', 'memory/device_reserved (GiB)': '0.36', 'tokens/train_per_sec_per_gpu': '154.2', 'tokens/total': 159616, 'tokens/trainable': 49250, 'epoch': '1.792'}
 60%|██████████████████████████████████████████████████████████████████████████████████████████▌                                                            | 229/382 [01:43<00:47,  3.22it/s] 60%|██████████████████████████████████████████████████████████████████████████████████████████▉                                                            | 230/382 [01:43<00:45,  3.31it/s]                                                                                                                                                                                              {'loss': '4.046', 'grad_norm': '0.9722', 'learning_rate': '7.283e-05', 'ppl': '57.18', 'memory/max_active (GiB)': '0.21', 'memory/max_allocated (GiB)': '0.21', 'memory/device_reserved (GiB)': '0.36', 'tokens/train_per_sec_per_gpu': '250.2', 'tokens/total': 160256, 'tokens/trainable': 49432, 'epoch': '1.8'}
 60%|██████████████████████████████████████████████████████████████████████████████████████████▉                                                            | 230/382 [01:43<00:45,  3.31it/s] 60%|███████████████████████████████████████████████████████████████████████████████████████████▎                                                           | 231/382 [01:44<00:49,  3.05it/s]                                                                                                                                                                                              {'loss': '3.577', 'grad_norm': '0.7995', 'learning_rate': '7.201e-05', 'ppl': '35.78', 'memory/max_active (GiB)': '0.27', 'memory/max_allocated (GiB)': '0.27', 'memory/device_reserved (GiB)': '0.36', 'tokens/train_per_sec_per_gpu': '254.5', 'tokens/total': 161152, 'tokens/trainable': 49662, 'epoch': '1.808'}
 60%|███████████████████████████████████████████████████████████████████████████████████████████▎                                                           | 231/382 [01:44<00:49,  3.05it/s] 61%|███████████████████████████████████████████████████████████████████████████████████████████▋                                                           | 232/382 [01:44<00:49,  3.02it/s]                                                                                                                                                                                              {'loss': '3.591', 'grad_norm': '0.7812', 'learning_rate': '7.12e-05', 'ppl': '36.26', 'memory/max_active (GiB)': '0.22', 'memory/max_allocated (GiB)': '0.22', 'memory/device_reserved (GiB)': '0.36', 'tokens/train_per_sec_per_gpu': '440.5', 'tokens/total': 161920, 'tokens/trainable': 49879, 'epoch': '1.816'}
 61%|███████████████████████████████████████████████████████████████████████████████████████████▋                                                           | 232/382 [01:44<00:49,  3.02it/s] 61%|████████████████████████████████████████████████████████████████████████████████████████████                                                           | 233/382 [01:44<00:47,  3.16it/s]                                                                                                                                                                                              {'loss': '3.496', 'grad_norm': '0.7489', 'learning_rate': '7.039e-05', 'ppl': '32.97', 'memory/max_active (GiB)': '0.22', 'memory/max_allocated (GiB)': '0.22', 'memory/device_reserved (GiB)': '0.36', 'tokens/train_per_sec_per_gpu': '503.5', 'tokens/total': 162560, 'tokens/trainable': 50076, 'epoch': '1.824'}
 61%|████████████████████████████████████████████████████████████████████████████████████████████                                                           | 233/382 [01:44<00:47,  3.16it/s] 61%|████████████████████████████████████████████████████████████████████████████████████████████▍                                                          | 234/382 [01:45<00:47,  3.10it/s]                                                                                                                                                                                              {'loss': '3.508', 'grad_norm': '0.6969', 'learning_rate': '6.958e-05', 'ppl': '33.4', 'memory/max_active (GiB)': '0.22', 'memory/max_allocated (GiB)': '0.22', 'memory/device_reserved (GiB)': '0.36', 'tokens/train_per_sec_per_gpu': '523.2', 'tokens/total': 163328, 'tokens/trainable': 50372, 'epoch': '1.831'}
 61%|████████████████████████████████████████████████████████████████████████████████████████████▍                                                          | 234/382 [01:45<00:47,  3.10it/s] 62%|████████████████████████████████████████████████████████████████████████████████████████████▉                                                          | 235/382 [01:45<00:48,  3.05it/s]                                                                                                                                                                                              {'loss': '3.287', 'grad_norm': '0.9481', 'learning_rate': '6.878e-05', 'ppl': '26.77', 'memory/max_active (GiB)': '0.22', 'memory/max_allocated (GiB)': '0.22', 'memory/device_reserved (GiB)': '0.36', 'tokens/train_per_sec_per_gpu': '406.2', 'tokens/total': 164096, 'tokens/trainable': 50536, 'epoch': '1.839'}
 62%|████████████████████████████████████████████████████████████████████████████████████████████▉                                                          | 235/382 [01:45<00:48,  3.05it/s] 62%|█████████████████████████████████████████████████████████████████████████████████████████████▎                                                         | 236/382 [01:45<00:50,  2.89it/s]                                                                                                                                                                                              {'loss': '3.802', 'grad_norm': '0.7349', 'learning_rate': '6.797e-05', 'ppl': '44.78', 'memory/max_active (GiB)': '0.27', 'memory/max_allocated (GiB)': '0.27', 'memory/device_reserved (GiB)': '0.36', 'tokens/train_per_sec_per_gpu': '517.6', 'tokens/total': 164992, 'tokens/trainable': 50863, 'epoch': '1.847'}
 62%|█████████████████████████████████████████████████████████████████████████████████████████████▎                                                         | 236/382 [01:45<00:50,  2.89it/s] 62%|█████████████████████████████████████████████████████████████████████████████████████████████▋                                                         | 237/382 [01:46<00:47,  3.06it/s]                                                                                                                                                                                              {'loss': '4.314', 'grad_norm': '0.9062', 'learning_rate': '6.717e-05', 'ppl': '74.71', 'memory/max_active (GiB)': '0.22', 'memory/max_allocated (GiB)': '0.22', 'memory/device_reserved (GiB)': '0.36', 'tokens/train_per_sec_per_gpu': '482', 'tokens/total': 165632, 'tokens/trainable': 51088, 'epoch': '1.855'}
 62%|█████████████████████████████████████████████████████████████████████████████████████████████▋                                                         | 237/382 [01:46<00:47,  3.06it/s] 62%|██████████████████████████████████████████████████████████████████████████████████████████████                                                         | 238/382 [01:46<00:47,  3.04it/s]                                                                                                                                                                                              {'loss': '3.708', 'grad_norm': '0.8057', 'learning_rate': '6.637e-05', 'ppl': '40.77', 'memory/max_active (GiB)': '0.27', 'memory/max_allocated (GiB)': '0.27', 'memory/device_reserved (GiB)': '0.36', 'tokens/train_per_sec_per_gpu': '629.8', 'tokens/total': 166400, 'tokens/trainable': 51355, 'epoch': '1.863'}
 62%|██████████████████████████████████████████████████████████████████████████████████████████████                                                         | 238/382 [01:46<00:47,  3.04it/s] 63%|██████████████████████████████████████████████████████████████████████████████████████████████▍                                                        | 239/382 [01:46<00:49,  2.88it/s]                                                                                                                                                                                              {'loss': '3.186', 'grad_norm': '0.6233', 'learning_rate': '6.558e-05', 'ppl': '24.19', 'memory/max_active (GiB)': '0.27', 'memory/max_allocated (GiB)': '0.27', 'memory/device_reserved (GiB)': '0.36', 'tokens/train_per_sec_per_gpu': '304.1', 'tokens/total': 167296, 'tokens/trainable': 51706, 'epoch': '1.871'}
 63%|██████████████████████████████████████████████████████████████████████████████████████████████▍                                                        | 239/382 [01:46<00:49,  2.88it/s] 63%|██████████████████████████████████████████████████████████████████████████████████████████████▊                                                        | 240/382 [01:47<00:46,  3.05it/s]                                                                                                                                                                                              {'loss': '3.225', 'grad_norm': '0.892', 'learning_rate': '6.478e-05', 'ppl': '25.16', 'memory/max_active (GiB)': '0.21', 'memory/max_allocated (GiB)': '0.21', 'memory/device_reserved (GiB)': '0.36', 'tokens/train_per_sec_per_gpu': '177.9', 'tokens/total': 167936, 'tokens/trainable': 51886, 'epoch': '1.878'}
 63%|██████████████████████████████████████████████████████████████████████████████████████████████▊                                                        | 240/382 [01:47<00:46,  3.05it/s] 63%|███████████████████████████████████████████████████████████████████████████████████████████████▎                                                       | 241/382 [01:47<00:40,  3.52it/s]                                                                                                                                                                                              {'loss': '3.94', 'grad_norm': '1.155', 'learning_rate': '6.399e-05', 'ppl': '51.4', 'memory/max_active (GiB)': '0.16', 'memory/max_allocated (GiB)': '0.16', 'memory/device_reserved (GiB)': '0.36', 'tokens/train_per_sec_per_gpu': '604.7', 'tokens/total': 168320, 'tokens/trainable': 52013, 'epoch': '1.886'}
 63%|███████████████████████████████████████████████████████████████████████████████████████████████▎                                                       | 241/382 [01:47<00:40,  3.52it/s] 63%|███████████████████████████████████████████████████████████████████████████████████████████████▋                                                       | 242/382 [01:47<00:39,  3.53it/s]                                                                                                                                                                                              {'loss': '4.401', 'grad_norm': '1.031', 'learning_rate': '6.32e-05', 'ppl': '81.54', 'memory/max_active (GiB)': '0.21', 'memory/max_allocated (GiB)': '0.21', 'memory/device_reserved (GiB)': '0.36', 'tokens/train_per_sec_per_gpu': '325.1', 'tokens/total': 168960, 'tokens/trainable': 52212, 'epoch': '1.894'}
 63%|███████████████████████████████████████████████████████████████████████████████████████████████▋                                                       | 242/382 [01:47<00:39,  3.53it/s] 64%|████████████████████████████████████████████████████████████████████████████████████████████████                                                       | 243/382 [01:47<00:41,  3.34it/s]                                                                                                                                                                                              {'loss': '3.132', 'grad_norm': '0.7158', 'learning_rate': '6.242e-05', 'ppl': '22.93', 'memory/max_active (GiB)': '0.22', 'memory/max_allocated (GiB)': '0.22', 'memory/device_reserved (GiB)': '0.36', 'tokens/train_per_sec_per_gpu': '451.8', 'tokens/total': 169728, 'tokens/trainable': 52478, 'epoch': '1.902'}
 64%|████████████████████████████████████████████████████████████████████████████████████████████████                                                       | 243/382 [01:47<00:41,  3.34it/s] 64%|████████████████████████████████████████████████████████████████████████████████████████████████▍                                                      | 244/382 [01:48<00:40,  3.39it/s]                                                                                                                                                                                              {'loss': '3.684', 'grad_norm': '0.8872', 'learning_rate': '6.163e-05', 'ppl': '39.79', 'memory/max_active (GiB)': '0.21', 'memory/max_allocated (GiB)': '0.21', 'memory/device_reserved (GiB)': '0.36', 'tokens/train_per_sec_per_gpu': '253.9', 'tokens/total': 170368, 'tokens/trainable': 52639, 'epoch': '1.91'}
 64%|████████████████████████████████████████████████████████████████████████████████████████████████▍                                                      | 244/382 [01:48<00:40,  3.39it/s] 64%|████████████████████████████████████████████████████████████████████████████████████████████████▊                                                      | 245/382 [01:48<00:39,  3.43it/s]                                                                                                                                                                                              {'loss': '3.953', 'grad_norm': '1.012', 'learning_rate': '6.085e-05', 'ppl': '52.1', 'memory/max_active (GiB)': '0.21', 'memory/max_allocated (GiB)': '0.21', 'memory/device_reserved (GiB)': '0.36', 'tokens/train_per_sec_per_gpu': '221.5', 'tokens/total': 171008, 'tokens/trainable': 52795, 'epoch': '1.918'}
 64%|████████████████████████████████████████████████████████████████████████████████████████████████▊                                                      | 245/382 [01:48<00:39,  3.43it/s] 64%|█████████████████████████████████████████████████████████████████████████████████████████████████▏                                                     | 246/382 [01:48<00:43,  3.12it/s]                                                                                                                                                                                              {'loss': '3.47', 'grad_norm': '0.716', 'learning_rate': '6.008e-05', 'ppl': '32.13', 'memory/max_active (GiB)': '0.27', 'memory/max_allocated (GiB)': '0.27', 'memory/device_reserved (GiB)': '0.36', 'tokens/train_per_sec_per_gpu': '306.2', 'tokens/total': 171904, 'tokens/trainable': 53109, 'epoch': '1.925'}
 64%|█████████████████████████████████████████████████████████████████████████████████████████████████▏                                                     | 246/382 [01:48<00:43,  3.12it/s] 65%|█████████████████████████████████████████████████████████████████████████████████████████████████▋                                                     | 247/382 [01:49<00:39,  3.42it/s]                                                                                                                                                                                              {'loss': '3.555', 'grad_norm': '1.947', 'learning_rate': '5.93e-05', 'ppl': '34.99', 'memory/max_active (GiB)': '0.16', 'memory/max_allocated (GiB)': '0.16', 'memory/device_reserved (GiB)': '0.36', 'tokens/train_per_sec_per_gpu': '106.5', 'tokens/total': 172416, 'tokens/trainable': 53167, 'epoch': '1.933'}
 65%|█████████████████████████████████████████████████████████████████████████████████████████████████▋                                                     | 247/382 [01:49<00:39,  3.42it/s] 65%|██████████████████████████████████████████████████████████████████████████████████████████████████                                                     | 248/382 [01:49<00:41,  3.26it/s]                                                                                                                                                                                              {'loss': '3.778', 'grad_norm': '0.722', 'learning_rate': '5.853e-05', 'ppl': '43.72', 'memory/max_active (GiB)': '0.22', 'memory/max_allocated (GiB)': '0.22', 'memory/device_reserved (GiB)': '0.36', 'tokens/train_per_sec_per_gpu': '387.7', 'tokens/total': 173184, 'tokens/trainable': 53456, 'epoch': '1.941'}
 65%|██████████████████████████████████████████████████████████████████████████████████████████████████                                                     | 248/382 [01:49<00:41,  3.26it/s] 65%|██████████████████████████████████████████████████████████████████████████████████████████████████▍                                                    | 249/382 [01:49<00:39,  3.34it/s]                                                                                                                                                                                              {'loss': '3.82', 'grad_norm': '0.8653', 'learning_rate': '5.776e-05', 'ppl': '45.6', 'memory/max_active (GiB)': '0.21', 'memory/max_allocated (GiB)': '0.21', 'memory/device_reserved (GiB)': '0.36', 'tokens/train_per_sec_per_gpu': '485.4', 'tokens/total': 173824, 'tokens/trainable': 53702, 'epoch': '1.949'}
 65%|██████████████████████████████████████████████████████████████████████████████████████████████████▍                                                    | 249/382 [01:49<00:39,  3.34it/s] 65%|██████████████████████████████████████████████████████████████████████████████████████████████████▊                                                    | 250/382 [01:50<00:41,  3.21it/s]                                                                                                                                                                                              {'loss': '3.622', 'grad_norm': '0.8184', 'learning_rate': '5.699e-05', 'ppl': '37.43', 'memory/max_active (GiB)': '0.22', 'memory/max_allocated (GiB)': '0.22', 'memory/device_reserved (GiB)': '0.36', 'tokens/train_per_sec_per_gpu': '480.7', 'tokens/total': 174592, 'tokens/trainable': 53990, 'epoch': '1.957'}
 65%|██████████████████████████████████████████████████████████████████████████████████████████████████▊                                                    | 250/382 [01:50<00:41,  3.21it/s] 66%|███████████████████████████████████████████████████████████████████████████████████████████████████▏                                                   | 251/382 [01:50<00:43,  2.99it/s]                                                                                                                                                                                              {'loss': '3.557', 'grad_norm': '0.802', 'learning_rate': '5.623e-05', 'ppl': '35.07', 'memory/max_active (GiB)': '0.27', 'memory/max_allocated (GiB)': '0.27', 'memory/device_reserved (GiB)': '0.36', 'tokens/train_per_sec_per_gpu': '303.9', 'tokens/total': 175488, 'tokens/trainable': 54219, 'epoch': '1.965'}
 66%|███████████████████████████████████████████████████████████████████████████████████████████████████▏                                                   | 251/382 [01:50<00:43,  2.99it/s] 66%|███████████████████████████████████████████████████████████████████████████████████████████████████▌                                                   | 252/382 [01:50<00:43,  2.98it/s]                                                                                                                                                                                              {'loss': '3.789', 'grad_norm': '0.8915', 'learning_rate': '5.547e-05', 'ppl': '44.2', 'memory/max_active (GiB)': '0.22', 'memory/max_allocated (GiB)': '0.22', 'memory/device_reserved (GiB)': '0.36', 'tokens/train_per_sec_per_gpu': '343.8', 'tokens/total': 176256, 'tokens/trainable': 54458, 'epoch': '1.973'}
 66%|███████████████████████████████████████████████████████████████████████████████████████████████████▌                                                   | 252/382 [01:50<00:43,  2.98it/s] 66%|████████████████████████████████████████████████████████████████████████████████████████████████████                                                   | 253/382 [01:51<00:41,  3.13it/s]                                                                                                                                                                                              {'loss': '4.297', 'grad_norm': '1.211', 'learning_rate': '5.471e-05', 'ppl': '73.45', 'memory/max_active (GiB)': '0.22', 'memory/max_allocated (GiB)': '0.22', 'memory/device_reserved (GiB)': '0.36', 'tokens/train_per_sec_per_gpu': '466.3', 'tokens/total': 176896, 'tokens/trainable': 54624, 'epoch': '1.98'}
 66%|████████████████████████████████████████████████████████████████████████████████████████████████████                                                   | 253/382 [01:51<00:41,  3.13it/s] 66%|████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                  | 254/382 [01:51<00:41,  3.08it/s]                                                                                                                                                                                              {'loss': '3.588', 'grad_norm': '0.757', 'learning_rate': '5.396e-05', 'ppl': '36.15', 'memory/max_active (GiB)': '0.22', 'memory/max_allocated (GiB)': '0.22', 'memory/device_reserved (GiB)': '0.36', 'tokens/train_per_sec_per_gpu': '212.1', 'tokens/total': 177664, 'tokens/trainable': 54864, 'epoch': '1.988'}
 66%|████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                  | 254/382 [01:51<00:41,  3.08it/s] 67%|████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                  | 255/382 [01:51<00:37,  3.37it/s]                                                                                                                                                                                              {'loss': '3.101', 'grad_norm': '1.278', 'learning_rate': '5.321e-05', 'ppl': '22.21', 'memory/max_active (GiB)': '0.16', 'memory/max_allocated (GiB)': '0.16', 'memory/device_reserved (GiB)': '0.36', 'tokens/train_per_sec_per_gpu': '279.4', 'tokens/total': 178176, 'tokens/trainable': 55023, 'epoch': '1.996'}
 67%|████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                  | 255/382 [01:51<00:37,  3.37it/s]                                                                                                                                                                                              {'loss': '4.313', 'grad_norm': '2.705', 'learning_rate': '5.246e-05', 'ppl': '74.64', 'memory/max_active (GiB)': '0.11', 'memory/max_allocated (GiB)': '0.11', 'memory/device_reserved (GiB)': '0.36', 'tokens/train_per_sec_per_gpu': '402.9', 'tokens/total': 178304, 'tokens/trainable': 55050, 'epoch': '2'}
 67%|█████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                 | 256/382 [01:51<00:37,  3.37it/s] 67%|█████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                 | 257/382 [01:52<00:35,  3.50it/s]                                                                                                                                                                                              {'loss': '3.661', 'grad_norm': '0.7558', 'learning_rate': '5.172e-05', 'ppl': '38.92', 'memory/max_active (GiB)': '0.27', 'memory/max_allocated (GiB)': '0.27', 'memory/device_reserved (GiB)': '0.36', 'tokens/train_per_sec_per_gpu': '645.6', 'tokens/total': 179072, 'tokens/trainable': 55341, 'epoch': '2.008'}
 67%|█████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                 | 257/382 [01:52<00:35,  3.50it/s] 68%|█████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                 | 258/382 [01:52<00:37,  3.35it/s]                                                                                                                                                                                              {'loss': '3.792', 'grad_norm': '0.8073', 'learning_rate': '5.098e-05', 'ppl': '44.35', 'memory/max_active (GiB)': '0.22', 'memory/max_allocated (GiB)': '0.22', 'memory/device_reserved (GiB)': '0.36', 'tokens/train_per_sec_per_gpu': '298.8', 'tokens/total': 179840, 'tokens/trainable': 55587, 'epoch': '2.016'}
 68%|█████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                 | 258/382 [01:52<00:37,  3.35it/s] 68%|██████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                | 259/382 [01:52<00:36,  3.39it/s]                                                                                                                                                                                              {'loss': '3.209', 'grad_norm': '0.7161', 'learning_rate': '5.024e-05', 'ppl': '24.76', 'memory/max_active (GiB)': '0.21', 'memory/max_allocated (GiB)': '0.21', 'memory/device_reserved (GiB)': '0.36', 'tokens/train_per_sec_per_gpu': '286.1', 'tokens/total': 180480, 'tokens/trainable': 55813, 'epoch': '2.024'}
 68%|██████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                | 259/382 [01:52<00:36,  3.39it/s] 68%|██████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                | 260/382 [01:53<00:39,  3.12it/s]                                                                                                                                                                                              {'loss': '3.333', 'grad_norm': '0.7891', 'learning_rate': '4.951e-05', 'ppl': '28.01', 'memory/max_active (GiB)': '0.27', 'memory/max_allocated (GiB)': '0.27', 'memory/device_reserved (GiB)': '0.36', 'tokens/train_per_sec_per_gpu': '301.4', 'tokens/total': 181376, 'tokens/trainable': 56113, 'epoch': '2.031'}
 68%|██████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                | 260/382 [01:53<00:39,  3.12it/s] 68%|███████████████████████████████████████████████████████████████████████████████████████████████████████▏                                               | 261/382 [01:53<00:37,  3.23it/s]                                                                                                                                                                                              {'loss': '4.053', 'grad_norm': '0.8391', 'learning_rate': '4.878e-05', 'ppl': '57.57', 'memory/max_active (GiB)': '0.22', 'memory/max_allocated (GiB)': '0.22', 'memory/device_reserved (GiB)': '0.36', 'tokens/train_per_sec_per_gpu': '414.2', 'tokens/total': 182016, 'tokens/trainable': 56332, 'epoch': '2.039'}
 68%|███████████████████████████████████████████████████████████████████████████████████████████████████████▏                                               | 261/382 [01:53<00:37,  3.23it/s] 69%|███████████████████████████████████████████████████████████████████████████████████████████████████████▌                                               | 262/382 [01:53<00:38,  3.16it/s]                                                                                                                                                                                              {'loss': '3.414', 'grad_norm': '0.8275', 'learning_rate': '4.806e-05', 'ppl': '30.38', 'memory/max_active (GiB)': '0.27', 'memory/max_allocated (GiB)': '0.27', 'memory/device_reserved (GiB)': '0.36', 'tokens/train_per_sec_per_gpu': '199.4', 'tokens/total': 182784, 'tokens/trainable': 56620, 'epoch': '2.047'}
 69%|███████████████████████████████████████████████████████████████████████████████████████████████████████▌                                               | 262/382 [01:53<00:38,  3.16it/s] 69%|███████████████████████████████████████████████████████████████████████████████████████████████████████▉                                               | 263/382 [01:54<00:40,  2.96it/s]                                                                                                                                                                                              {'loss': '3.544', 'grad_norm': '0.627', 'learning_rate': '4.734e-05', 'ppl': '34.6', 'memory/max_active (GiB)': '0.27', 'memory/max_allocated (GiB)': '0.27', 'memory/device_reserved (GiB)': '0.36', 'tokens/train_per_sec_per_gpu': '550.3', 'tokens/total': 183680, 'tokens/trainable': 57048, 'epoch': '2.055'}
 69%|███████████████████████████████████████████████████████████████████████████████████████████████████████▉                                               | 263/382 [01:54<00:40,  2.96it/s] 69%|████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                              | 264/382 [01:54<00:37,  3.11it/s]                                                                                                                                                                                              {'loss': '4.021', 'grad_norm': '0.6815', 'learning_rate': '4.662e-05', 'ppl': '55.75', 'memory/max_active (GiB)': '0.21', 'memory/max_allocated (GiB)': '0.21', 'memory/device_reserved (GiB)': '0.36', 'tokens/train_per_sec_per_gpu': '250.2', 'tokens/total': 184320, 'tokens/trainable': 57336, 'epoch': '2.063'}
 69%|████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                              | 264/382 [01:54<00:37,  3.11it/s] 69%|████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                              | 265/382 [01:54<00:34,  3.40it/s]                                                                                                                                                                                              {'loss': '3.423', 'grad_norm': '1.336', 'learning_rate': '4.59e-05', 'ppl': '30.67', 'memory/max_active (GiB)': '0.16', 'memory/max_allocated (GiB)': '0.16', 'memory/device_reserved (GiB)': '0.36', 'tokens/train_per_sec_per_gpu': '292.5', 'tokens/total': 184832, 'tokens/trainable': 57437, 'epoch': '2.071'}
 69%|████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                              | 265/382 [01:54<00:34,  3.40it/s] 70%|█████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                             | 266/382 [01:55<00:35,  3.25it/s]                                                                                                                                                                                              {'loss': '3.413', 'grad_norm': '0.6534', 'learning_rate': '4.519e-05', 'ppl': '30.35', 'memory/max_active (GiB)': '0.22', 'memory/max_allocated (GiB)': '0.22', 'memory/device_reserved (GiB)': '0.36', 'tokens/train_per_sec_per_gpu': '779.4', 'tokens/total': 185600, 'tokens/trainable': 57820, 'epoch': '2.078'}
 70%|█████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                             | 266/382 [01:55<00:35,  3.25it/s] 70%|█████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                             | 267/382 [01:55<00:36,  3.17it/s]                                                                                                                                                                                              {'loss': '4.367', 'grad_norm': '0.8549', 'learning_rate': '4.449e-05', 'ppl': '78.77', 'memory/max_active (GiB)': '0.27', 'memory/max_allocated (GiB)': '0.27', 'memory/device_reserved (GiB)': '0.36', 'tokens/train_per_sec_per_gpu': '308.5', 'tokens/total': 186368, 'tokens/trainable': 58030, 'epoch': '2.086'}
 70%|█████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                             | 267/382 [01:55<00:36,  3.17it/s] 70%|█████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                             | 268/382 [01:55<00:34,  3.27it/s]                                                                                                                                                                                              {'loss': '4.412', 'grad_norm': '0.911', 'learning_rate': '4.378e-05', 'ppl': '82.42', 'memory/max_active (GiB)': '0.21', 'memory/max_allocated (GiB)': '0.21', 'memory/device_reserved (GiB)': '0.36', 'tokens/train_per_sec_per_gpu': '224.9', 'tokens/total': 187008, 'tokens/trainable': 58244, 'epoch': '2.094'}
 70%|█████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                             | 268/382 [01:55<00:34,  3.27it/s] 70%|██████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                            | 269/382 [01:56<00:35,  3.17it/s]                                                                                                                                                                                              {'loss': '4.513', 'grad_norm': '0.8701', 'learning_rate': '4.309e-05', 'ppl': '91.19', 'memory/max_active (GiB)': '0.22', 'memory/max_allocated (GiB)': '0.22', 'memory/device_reserved (GiB)': '0.36', 'tokens/train_per_sec_per_gpu': '496.6', 'tokens/total': 187776, 'tokens/trainable': 58464, 'epoch': '2.102'}
 70%|██████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                            | 269/382 [01:56<00:35,  3.17it/s] 71%|██████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                            | 270/382 [01:56<00:39,  2.84it/s]                                                                                                                                                                                              {'loss': '3.676', 'grad_norm': '0.7719', 'learning_rate': '4.239e-05', 'ppl': '39.49', 'memory/max_active (GiB)': '0.27', 'memory/max_allocated (GiB)': '0.27', 'memory/device_reserved (GiB)': '0.36', 'tokens/train_per_sec_per_gpu': '544.4', 'tokens/total': 188800, 'tokens/trainable': 58896, 'epoch': '2.11'}
 71%|██████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                            | 270/382 [01:56<00:39,  2.84it/s] 71%|███████████████████████████████████████████████████████████████████████████████████████████████████████████                                            | 271/382 [01:56<00:36,  3.02it/s]                                                                                                                                                                                              {'loss': '4.231', 'grad_norm': '0.8486', 'learning_rate': '4.17e-05', 'ppl': '68.8', 'memory/max_active (GiB)': '0.21', 'memory/max_allocated (GiB)': '0.21', 'memory/device_reserved (GiB)': '0.36', 'tokens/train_per_sec_per_gpu': '410.8', 'tokens/total': 189440, 'tokens/trainable': 59123, 'epoch': '2.118'}
 71%|███████████████████████████████████████████████████████████████████████████████████████████████████████████                                            | 271/382 [01:56<00:36,  3.02it/s] 71%|███████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                           | 272/382 [01:57<00:34,  3.16it/s]                                                                                                                                                                                              {'loss': '4.415', 'grad_norm': '0.9051', 'learning_rate': '4.102e-05', 'ppl': '82.67', 'memory/max_active (GiB)': '0.21', 'memory/max_allocated (GiB)': '0.21', 'memory/device_reserved (GiB)': '0.36', 'tokens/train_per_sec_per_gpu': '246.4', 'tokens/total': 190080, 'tokens/trainable': 59336, 'epoch': '2.125'}
 71%|███████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                           | 272/382 [01:57<00:34,  3.16it/s] 71%|███████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                           | 273/382 [01:57<00:33,  3.26it/s]                                                                                                                                                                                              {'loss': '4.211', 'grad_norm': '1.157', 'learning_rate': '4.033e-05', 'ppl': '67.41', 'memory/max_active (GiB)': '0.22', 'memory/max_allocated (GiB)': '0.22', 'memory/device_reserved (GiB)': '0.36', 'tokens/train_per_sec_per_gpu': '321.1', 'tokens/total': 190720, 'tokens/trainable': 59477, 'epoch': '2.133'}
 71%|███████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                           | 273/382 [01:57<00:33,  3.26it/s] 72%|████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                          | 274/382 [01:57<00:32,  3.34it/s]                                                                                                                                                                                              {'loss': '3.93', 'grad_norm': '0.8333', 'learning_rate': '3.966e-05', 'ppl': '50.92', 'memory/max_active (GiB)': '0.21', 'memory/max_allocated (GiB)': '0.21', 'memory/device_reserved (GiB)': '0.36', 'tokens/train_per_sec_per_gpu': '199.4', 'tokens/total': 191360, 'tokens/trainable': 59691, 'epoch': '2.141'}
 72%|████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                          | 274/382 [01:57<00:32,  3.34it/s] 72%|████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                          | 275/382 [01:57<00:31,  3.39it/s]                                                                                                                                                                                              {'loss': '3.599', 'grad_norm': '1.154', 'learning_rate': '3.898e-05', 'ppl': '36.54', 'memory/max_active (GiB)': '0.21', 'memory/max_allocated (GiB)': '0.21', 'memory/device_reserved (GiB)': '0.36', 'tokens/train_per_sec_per_gpu': '160.6', 'tokens/total': 192000, 'tokens/trainable': 59825, 'epoch': '2.149'}
 72%|████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                          | 275/382 [01:57<00:31,  3.39it/s] 72%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████                                          | 276/382 [01:58<00:34,  3.09it/s]                                                                                                                                                                                              {'loss': '3.477', 'grad_norm': '0.7399', 'learning_rate': '3.832e-05', 'ppl': '32.36', 'memory/max_active (GiB)': '0.27', 'memory/max_allocated (GiB)': '0.27', 'memory/device_reserved (GiB)': '0.36', 'tokens/train_per_sec_per_gpu': '238.2', 'tokens/total': 192896, 'tokens/trainable': 60081, 'epoch': '2.157'}
 72%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████                                          | 276/382 [01:58<00:34,  3.09it/s] 73%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                         | 277/382 [01:58<00:36,  2.91it/s]                                                                                                                                                                                              {'loss': '3.306', 'grad_norm': '0.8333', 'learning_rate': '3.765e-05', 'ppl': '27.28', 'memory/max_active (GiB)': '0.27', 'memory/max_allocated (GiB)': '0.27', 'memory/device_reserved (GiB)': '0.36', 'tokens/train_per_sec_per_gpu': '383.3', 'tokens/total': 193792, 'tokens/trainable': 60455, 'epoch': '2.165'}
 73%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                         | 277/382 [01:58<00:36,  2.91it/s] 73%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                         | 278/382 [01:59<00:37,  2.80it/s]                                                                                                                                                                                              {'loss': '3.984', 'grad_norm': '0.859', 'learning_rate': '3.699e-05', 'ppl': '53.75', 'memory/max_active (GiB)': '0.27', 'memory/max_allocated (GiB)': '0.27', 'memory/device_reserved (GiB)': '0.36', 'tokens/train_per_sec_per_gpu': '181.8', 'tokens/total': 194688, 'tokens/trainable': 60657, 'epoch': '2.173'}
 73%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                         | 278/382 [01:59<00:37,  2.80it/s] 73%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                        | 279/382 [01:59<00:33,  3.11it/s]                                                                                                                                                                                              {'loss': '4.1', 'grad_norm': '1.192', 'learning_rate': '3.634e-05', 'ppl': '60.31', 'memory/max_active (GiB)': '0.17', 'memory/max_allocated (GiB)': '0.17', 'memory/device_reserved (GiB)': '0.36', 'tokens/train_per_sec_per_gpu': '202.3', 'tokens/total': 195200, 'tokens/trainable': 60818, 'epoch': '2.18'}
 73%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                        | 279/382 [01:59<00:33,  3.11it/s] 73%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                        | 280/382 [01:59<00:33,  3.08it/s]                                                                                                                                                                                              {'loss': '2.903', 'grad_norm': '0.7297', 'learning_rate': '3.569e-05', 'ppl': '18.23', 'memory/max_active (GiB)': '0.27', 'memory/max_allocated (GiB)': '0.27', 'memory/device_reserved (GiB)': '0.36', 'tokens/train_per_sec_per_gpu': '211.8', 'tokens/total': 195968, 'tokens/trainable': 61097, 'epoch': '2.188'}
 73%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                        | 280/382 [01:59<00:33,  3.08it/s] 74%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████                                        | 281/382 [01:59<00:33,  3.05it/s]                                                                                                                                                                                              {'loss': '3.848', 'grad_norm': '0.7529', 'learning_rate': '3.504e-05', 'ppl': '46.89', 'memory/max_active (GiB)': '0.27', 'memory/max_allocated (GiB)': '0.27', 'memory/device_reserved (GiB)': '0.36', 'tokens/train_per_sec_per_gpu': '181.6', 'tokens/total': 196736, 'tokens/trainable': 61355, 'epoch': '2.196'}
 74%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████                                        | 281/382 [01:59<00:33,  3.05it/s] 74%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                       | 282/382 [02:00<00:31,  3.18it/s]                                                                                                                                                                                              {'loss': '3.414', 'grad_norm': '1.144', 'learning_rate': '3.44e-05', 'ppl': '30.39', 'memory/max_active (GiB)': '0.22', 'memory/max_allocated (GiB)': '0.22', 'memory/device_reserved (GiB)': '0.36', 'tokens/train_per_sec_per_gpu': '306.7', 'tokens/total': 197376, 'tokens/trainable': 61471, 'epoch': '2.204'}
 74%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                       | 282/382 [02:00<00:31,  3.18it/s] 74%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                       | 283/382 [02:00<00:33,  2.97it/s]                                                                                                                                                                                              {'loss': '2.944', 'grad_norm': '0.8082', 'learning_rate': '3.376e-05', 'ppl': '18.99', 'memory/max_active (GiB)': '0.27', 'memory/max_allocated (GiB)': '0.27', 'memory/device_reserved (GiB)': '0.36', 'tokens/train_per_sec_per_gpu': '349.9', 'tokens/total': 198272, 'tokens/trainable': 61738, 'epoch': '2.212'}
 74%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                       | 283/382 [02:00<00:33,  2.97it/s] 74%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                      | 284/382 [02:00<00:29,  3.29it/s]                                                                                                                                                                                              {'loss': '3.338', 'grad_norm': '1.474', 'learning_rate': '3.313e-05', 'ppl': '28.15', 'memory/max_active (GiB)': '0.16', 'memory/max_allocated (GiB)': '0.16', 'memory/device_reserved (GiB)': '0.36', 'tokens/train_per_sec_per_gpu': '230.8', 'tokens/total': 198784, 'tokens/trainable': 61817, 'epoch': '2.22'}
 74%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                      | 284/382 [02:00<00:29,  3.29it/s] 75%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                      | 285/382 [02:01<00:28,  3.36it/s]                                                                                                                                                                                              {'loss': '3.547', 'grad_norm': '0.8831', 'learning_rate': '3.25e-05', 'ppl': '34.7', 'memory/max_active (GiB)': '0.21', 'memory/max_allocated (GiB)': '0.21', 'memory/device_reserved (GiB)': '0.36', 'tokens/train_per_sec_per_gpu': '228.8', 'tokens/total': 199424, 'tokens/trainable': 62018, 'epoch': '2.227'}
 75%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                      | 285/382 [02:01<00:28,  3.36it/s] 75%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                      | 286/382 [02:01<00:26,  3.61it/s]                                                                                                                                                                                              {'loss': '3.823', 'grad_norm': '1.224', 'learning_rate': '3.188e-05', 'ppl': '45.72', 'memory/max_active (GiB)': '0.16', 'memory/max_allocated (GiB)': '0.16', 'memory/device_reserved (GiB)': '0.36', 'tokens/train_per_sec_per_gpu': '269.7', 'tokens/total': 199936, 'tokens/trainable': 62131, 'epoch': '2.235'}
 75%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                      | 286/382 [02:01<00:26,  3.61it/s] 75%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                     | 287/382 [02:01<00:27,  3.40it/s]                                                                                                                                                                                              {'loss': '2.877', 'grad_norm': '0.9245', 'learning_rate': '3.126e-05', 'ppl': '17.76', 'memory/max_active (GiB)': '0.27', 'memory/max_allocated (GiB)': '0.27', 'memory/device_reserved (GiB)': '0.36', 'tokens/train_per_sec_per_gpu': '466.1', 'tokens/total': 200704, 'tokens/trainable': 62339, 'epoch': '2.243'}
 75%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                     | 287/382 [02:01<00:27,  3.40it/s] 75%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                     | 288/382 [02:02<00:30,  3.10it/s]                                                                                                                                                                                              {'loss': '3.68', 'grad_norm': '0.6547', 'learning_rate': '3.065e-05', 'ppl': '39.65', 'memory/max_active (GiB)': '0.27', 'memory/max_allocated (GiB)': '0.27', 'memory/device_reserved (GiB)': '0.36', 'tokens/train_per_sec_per_gpu': '409.6', 'tokens/total': 201600, 'tokens/trainable': 62669, 'epoch': '2.251'}
 75%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                     | 288/382 [02:02<00:30,  3.10it/s] 76%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                    | 289/382 [02:02<00:28,  3.21it/s]                                                                                                                                                                                              {'loss': '3.191', 'grad_norm': '1.237', 'learning_rate': '3.004e-05', 'ppl': '24.32', 'memory/max_active (GiB)': '0.21', 'memory/max_allocated (GiB)': '0.21', 'memory/device_reserved (GiB)': '0.36', 'tokens/train_per_sec_per_gpu': '270.3', 'tokens/total': 202240, 'tokens/trainable': 62823, 'epoch': '2.259'}
 76%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                    | 289/382 [02:02<00:28,  3.21it/s] 76%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                    | 290/382 [02:02<00:27,  3.30it/s]                                                                                                                                                                                              {'loss': '4.068', 'grad_norm': '1.132', 'learning_rate': '2.944e-05', 'ppl': '58.45', 'memory/max_active (GiB)': '0.22', 'memory/max_allocated (GiB)': '0.22', 'memory/device_reserved (GiB)': '0.36', 'tokens/train_per_sec_per_gpu': '446.5', 'tokens/total': 202880, 'tokens/trainable': 62974, 'epoch': '2.267'}
 76%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                    | 290/382 [02:02<00:27,  3.30it/s] 76%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                    | 291/382 [02:02<00:27,  3.37it/s]                                                                                                                                                                                              {'loss': '3.819', 'grad_norm': '1.294', 'learning_rate': '2.884e-05', 'ppl': '45.56', 'memory/max_active (GiB)': '0.22', 'memory/max_allocated (GiB)': '0.22', 'memory/device_reserved (GiB)': '0.36', 'tokens/train_per_sec_per_gpu': '271.1', 'tokens/total': 203520, 'tokens/trainable': 63086, 'epoch': '2.275'}
 76%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                    | 291/382 [02:02<00:27,  3.37it/s] 76%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                   | 292/382 [02:03<00:24,  3.62it/s]                                                                                                                                                                                              {'loss': '3.341', 'grad_norm': '1.305', 'learning_rate': '2.825e-05', 'ppl': '28.24', 'memory/max_active (GiB)': '0.16', 'memory/max_allocated (GiB)': '0.16', 'memory/device_reserved (GiB)': '0.36', 'tokens/train_per_sec_per_gpu': '168.7', 'tokens/total': 204032, 'tokens/trainable': 63181, 'epoch': '2.282'}
 76%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                   | 292/382 [02:03<00:24,  3.62it/s] 77%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                   | 293/382 [02:03<00:24,  3.60it/s]                                                                                                                                                                                              {'loss': '4.25', 'grad_norm': '1.233', 'learning_rate': '2.766e-05', 'ppl': '70.09', 'memory/max_active (GiB)': '0.21', 'memory/max_allocated (GiB)': '0.21', 'memory/device_reserved (GiB)': '0.36', 'tokens/train_per_sec_per_gpu': '89.4', 'tokens/total': 204672, 'tokens/trainable': 63374, 'epoch': '2.29'}
 77%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                   | 293/382 [02:03<00:24,  3.60it/s] 77%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                  | 294/382 [02:03<00:24,  3.58it/s]                                                                                                                                                                                              {'loss': '3.336', 'grad_norm': '1.114', 'learning_rate': '2.708e-05', 'ppl': '28.11', 'memory/max_active (GiB)': '0.21', 'memory/max_allocated (GiB)': '0.21', 'memory/device_reserved (GiB)': '0.36', 'tokens/train_per_sec_per_gpu': '125.1', 'tokens/total': 205312, 'tokens/trainable': 63504, 'epoch': '2.298'}
 77%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                  | 294/382 [02:03<00:24,  3.58it/s] 77%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                  | 295/382 [02:04<00:25,  3.37it/s]                                                                                                                                                                                              {'loss': '4.083', 'grad_norm': '0.8554', 'learning_rate': '2.65e-05', 'ppl': '59.31', 'memory/max_active (GiB)': '0.22', 'memory/max_allocated (GiB)': '0.22', 'memory/device_reserved (GiB)': '0.36', 'tokens/train_per_sec_per_gpu': '284.1', 'tokens/total': 206080, 'tokens/trainable': 63723, 'epoch': '2.306'}
 77%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                  | 295/382 [02:04<00:25,  3.37it/s] 77%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                  | 296/382 [02:04<00:25,  3.42it/s]                                                                                                                                                                                              {'loss': '4.01', 'grad_norm': '0.8707', 'learning_rate': '2.593e-05', 'ppl': '55.14', 'memory/max_active (GiB)': '0.21', 'memory/max_allocated (GiB)': '0.21', 'memory/device_reserved (GiB)': '0.36', 'tokens/train_per_sec_per_gpu': '368', 'tokens/total': 206720, 'tokens/trainable': 63972, 'epoch': '2.314'}
 77%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                  | 296/382 [02:04<00:25,  3.42it/s] 78%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                 | 297/382 [02:04<00:27,  3.11it/s]                                                                                                                                                                                              {'loss': '3.496', 'grad_norm': '0.6354', 'learning_rate': '2.537e-05', 'ppl': '33', 'memory/max_active (GiB)': '0.27', 'memory/max_allocated (GiB)': '0.27', 'memory/device_reserved (GiB)': '0.36', 'tokens/train_per_sec_per_gpu': '446.7', 'tokens/total': 207616, 'tokens/trainable': 64391, 'epoch': '2.322'}
 78%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                 | 297/382 [02:04<00:27,  3.11it/s] 78%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                 | 298/382 [02:05<00:27,  3.07it/s]                                                                                                                                                                                              {'loss': '3.61', 'grad_norm': '0.846', 'learning_rate': '2.48e-05', 'ppl': '36.96', 'memory/max_active (GiB)': '0.22', 'memory/max_allocated (GiB)': '0.22', 'memory/device_reserved (GiB)': '0.36', 'tokens/train_per_sec_per_gpu': '290', 'tokens/total': 208384, 'tokens/trainable': 64606, 'epoch': '2.329'}
 78%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                 | 298/382 [02:05<00:27,  3.07it/s] 78%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                | 299/382 [02:05<00:28,  2.90it/s]                                                                                                                                                                                              {'loss': '3.641', 'grad_norm': '0.7699', 'learning_rate': '2.425e-05', 'ppl': '38.12', 'memory/max_active (GiB)': '0.27', 'memory/max_allocated (GiB)': '0.27', 'memory/device_reserved (GiB)': '0.36', 'tokens/train_per_sec_per_gpu': '479.7', 'tokens/total': 209280, 'tokens/trainable': 64924, 'epoch': '2.337'}
 78%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                | 299/382 [02:05<00:28,  2.90it/s] 79%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                | 300/382 [02:05<00:26,  3.06it/s]                                                                                                                                                                                              {'loss': '3.776', 'grad_norm': '1.337', 'learning_rate': '2.37e-05', 'ppl': '43.63', 'memory/max_active (GiB)': '0.21', 'memory/max_allocated (GiB)': '0.21', 'memory/device_reserved (GiB)': '0.36', 'tokens/train_per_sec_per_gpu': '67.74', 'tokens/total': 209920, 'tokens/trainable': 65014, 'epoch': '2.345'}
 79%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                | 300/382 [02:05<00:26,  3.06it/s][2026-03-23 06:25:58,523] [INFO] [axolotl.core.trainers.base.evaluate:400] [PID:323] Running evaluation step...

  0%|                                                                                                                                                                 | 0/110 [00:00<?, ?it/s][A
  3%|████▏                                                                                                                                                    | 3/110 [00:00<00:05, 18.48it/s][A
  5%|██████▉                                                                                                                                                  | 5/110 [00:00<00:06, 15.87it/s][A
  6%|█████████▋                                                                                                                                               | 7/110 [00:00<00:06, 14.95it/s][A
  8%|████████████▌                                                                                                                                            | 9/110 [00:00<00:06, 14.51it/s][A
 10%|███████████████▏                                                                                                                                        | 11/110 [00:00<00:06, 14.29it/s][A
 12%|█████████████████▉                                                                                                                                      | 13/110 [00:00<00:06, 14.97it/s][A
 14%|████████████████████▋                                                                                                                                   | 15/110 [00:01<00:06, 14.59it/s][A
 15%|███████████████████████▍                                                                                                                                | 17/110 [00:01<00:06, 14.33it/s][A
 17%|██████████████████████████▎                                                                                                                             | 19/110 [00:01<00:06, 14.99it/s][A
 19%|█████████████████████████████                                                                                                                           | 21/110 [00:01<00:06, 14.60it/s][A
 21%|███████████████████████████████▊                                                                                                                        | 23/110 [00:01<00:06, 14.35it/s][A
 23%|██████████████████████████████████▌                                                                                                                     | 25/110 [00:01<00:05, 14.19it/s][A
 25%|█████████████████████████████████████▎                                                                                                                  | 27/110 [00:01<00:05, 14.09it/s][A
 26%|████████████████████████████████████████                                                                                                                | 29/110 [00:01<00:05, 14.01it/s][A
 29%|████████████████████████████████████████████▏                                                                                                           | 32/110 [00:02<00:05, 15.32it/s][A
 31%|██████████████████████████████████████████████▉                                                                                                         | 34/110 [00:02<00:05, 14.24it/s][A
 33%|█████████████████████████████████████████████████▋                                                                                                      | 36/110 [00:02<00:04, 14.81it/s][A
 35%|█████████████████████████████████████████████████████▉                                                                                                  | 39/110 [00:02<00:04, 15.80it/s][A
 37%|████████████████████████████████████████████████████████▋                                                                                               | 41/110 [00:02<00:04, 16.00it/s][A
 39%|███████████████████████████████████████████████████████████▍                                                                                            | 43/110 [00:02<00:04, 16.17it/s][A
 41%|██████████████████████████████████████████████████████████████▏                                                                                         | 45/110 [00:03<00:04, 15.44it/s][A
 43%|████████████████████████████████████████████████████████████████▉                                                                                       | 47/110 [00:03<00:04, 15.73it/s][A
 45%|███████████████████████████████████████████████████████████████████▋                                                                                    | 49/110 [00:03<00:03, 15.98it/s][A
 46%|██████████████████████████████████████████████████████████████████████▍                                                                                 | 51/110 [00:03<00:03, 16.98it/s][A
 48%|█████████████████████████████████████████████████████████████████████████▏                                                                              | 53/110 [00:03<00:03, 16.87it/s][A
 50%|████████████████████████████████████████████████████████████████████████████                                                                            | 55/110 [00:03<00:03, 16.79it/s][A
 52%|██████████████████████████████████████████████████████████████████████████████▊                                                                         | 57/110 [00:03<00:03, 15.63it/s][A
 54%|█████████████████████████████████████████████████████████████████████████████████▌                                                                      | 59/110 [00:03<00:03, 15.94it/s][A
 55%|████████████████████████████████████████████████████████████████████████████████████▎                                                                   | 61/110 [00:04<00:03, 14.52it/s][A
 57%|███████████████████████████████████████████████████████████████████████████████████████                                                                 | 63/110 [00:04<00:03, 13.66it/s][A
 60%|███████████████████████████████████████████████████████████████████████████████████████████▏                                                            | 66/110 [00:04<00:02, 15.79it/s][A
 62%|█████████████████████████████████████████████████████████████████████████████████████████████▉                                                          | 68/110 [00:04<00:02, 15.18it/s][A
 64%|████████████████████████████████████████████████████████████████████████████████████████████████▋                                                       | 70/110 [00:04<00:02, 15.58it/s][A
 65%|███████████████████████████████████████████████████████████████████████████████████████████████████▍                                                    | 72/110 [00:04<00:02, 15.88it/s][A
 67%|██████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                 | 74/110 [00:04<00:02, 16.09it/s][A
 69%|█████████████████████████████████████████████████████████████████████████████████████████████████████████                                               | 76/110 [00:04<00:02, 16.24it/s][A
 71%|███████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                            | 78/110 [00:05<00:01, 16.35it/s][A
 73%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                         | 80/110 [00:05<00:01, 15.56it/s][A
 75%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                     | 83/110 [00:05<00:01, 17.26it/s][A
 77%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                  | 85/110 [00:05<00:01, 17.05it/s][A
 79%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                               | 87/110 [00:05<00:01, 16.03it/s][A
 82%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                           | 90/110 [00:05<00:01, 16.64it/s][A
 84%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                        | 92/110 [00:05<00:01, 16.65it/s][A
 85%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                      | 94/110 [00:06<00:01, 15.75it/s][A
 87%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                   | 96/110 [00:06<00:00, 15.18it/s][A
 89%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                | 98/110 [00:06<00:00, 14.77it/s][A
 92%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋            | 101/110 [00:06<00:00, 16.59it/s][A
 94%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍         | 103/110 [00:06<00:00, 15.03it/s][A
 95%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏      | 105/110 [00:06<00:00, 14.70it/s][A
 97%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉    | 107/110 [00:06<00:00, 14.49it/s][A
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 110/110 [00:07<00:00, 15.83it/s][A                                                                                                                                                                                              
                                                                                                                                                                                              [A{'eval_loss': '3.717', 'eval_runtime': '7.202', 'eval_samples_per_second': '30.41', 'eval_steps_per_second': '15.27', 'eval_ppl': '41.14', 'memory/max_active (GiB)': '0.14', 'memory/max_allocated (GiB)': '0.14', 'memory/device_reserved (GiB)': '0.36', 'epoch': '2.345', 'tokens/train_per_sec_per_gpu': '0'}
 79%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                | 300/382 [02:12<00:26,  3.06it/s]
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 110/110 [00:07<00:00, 15.83it/s][A
                                                                                                                                                                                              [A[2026-03-23 06:26:05,733] [INFO] [axolotl.core.trainers.base._save:721] [PID:323] Saving model checkpoint to ./final_model/checkpoint-300
 79%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                | 301/382 [02:13<03:33,  2.64s/it]                                                                                                                                                                                              {'loss': '3.431', 'grad_norm': '0.735', 'learning_rate': '2.315e-05', 'ppl': '30.92', 'memory/max_active (GiB)': '0.22', 'memory/max_allocated (GiB)': '0.22', 'memory/device_reserved (GiB)': '0.24', 'tokens/train_per_sec_per_gpu': '352.6', 'tokens/total': 210688, 'tokens/trainable': 65274, 'epoch': '2.353'}
 79%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                | 301/382 [02:13<03:33,  2.64s/it] 79%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                               | 302/382 [02:14<02:35,  1.95s/it]                                                                                                                                                                                              {'loss': '4.029', 'grad_norm': '0.7489', 'learning_rate': '2.262e-05', 'ppl': '56.2', 'memory/max_active (GiB)': '0.22', 'memory/max_allocated (GiB)': '0.22', 'memory/device_reserved (GiB)': '0.24', 'tokens/train_per_sec_per_gpu': '400.2', 'tokens/total': 211456, 'tokens/trainable': 65592, 'epoch': '2.361'}
 79%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                               | 302/382 [02:14<02:35,  1.95s/it] 79%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                               | 303/382 [02:14<01:54,  1.45s/it]                                                                                                                                                                                              {'loss': '3.571', 'grad_norm': '1.08', 'learning_rate': '2.208e-05', 'ppl': '35.56', 'memory/max_active (GiB)': '0.21', 'memory/max_allocated (GiB)': '0.21', 'memory/device_reserved (GiB)': '0.25', 'tokens/train_per_sec_per_gpu': '206.9', 'tokens/total': 212096, 'tokens/trainable': 65751, 'epoch': '2.369'}
 79%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                               | 303/382 [02:14<01:54,  1.45s/it] 80%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                              | 304/382 [02:14<01:28,  1.13s/it]                                                                                                                                                                                              {'loss': '3.855', 'grad_norm': '0.8106', 'learning_rate': '2.155e-05', 'ppl': '47.23', 'memory/max_active (GiB)': '0.27', 'memory/max_allocated (GiB)': '0.27', 'memory/device_reserved (GiB)': '0.36', 'tokens/train_per_sec_per_gpu': '484.5', 'tokens/total': 212992, 'tokens/trainable': 66011, 'epoch': '2.376'}
 80%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                              | 304/382 [02:14<01:28,  1.13s/it] 80%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                              | 305/382 [02:15<01:06,  1.16it/s]                                                                                                                                                                                              {'loss': '4.775', 'grad_norm': '1.584', 'learning_rate': '2.103e-05', 'ppl': '118.5', 'memory/max_active (GiB)': '0.16', 'memory/max_allocated (GiB)': '0.16', 'memory/device_reserved (GiB)': '0.36', 'tokens/train_per_sec_per_gpu': '339.7', 'tokens/total': 213504, 'tokens/trainable': 66140, 'epoch': '2.384'}
 80%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                              | 305/382 [02:15<01:06,  1.16it/s] 80%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                              | 306/382 [02:15<00:50,  1.49it/s]                                                                                                                                                                                              {'loss': '4', 'grad_norm': '0.9137', 'learning_rate': '2.051e-05', 'ppl': '54.57', 'memory/max_active (GiB)': '0.16', 'memory/max_allocated (GiB)': '0.16', 'memory/device_reserved (GiB)': '0.36', 'tokens/train_per_sec_per_gpu': '424.7', 'tokens/total': 214016, 'tokens/trainable': 66322, 'epoch': '2.392'}
 80%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                              | 306/382 [02:15<00:50,  1.49it/s] 80%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                             | 307/382 [02:15<00:41,  1.80it/s]                                                                                                                                                                                              {'loss': '3.718', 'grad_norm': '0.8652', 'learning_rate': '2e-05', 'ppl': '41.18', 'memory/max_active (GiB)': '0.21', 'memory/max_allocated (GiB)': '0.21', 'memory/device_reserved (GiB)': '0.36', 'tokens/train_per_sec_per_gpu': '256.6', 'tokens/total': 214656, 'tokens/trainable': 66550, 'epoch': '2.4'}
 80%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                             | 307/382 [02:15<00:41,  1.80it/s] 81%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                             | 308/382 [02:15<00:36,  2.04it/s]                                                                                                                                                                                              {'loss': '3.417', 'grad_norm': '1.046', 'learning_rate': '1.95e-05', 'ppl': '30.49', 'memory/max_active (GiB)': '0.22', 'memory/max_allocated (GiB)': '0.22', 'memory/device_reserved (GiB)': '0.36', 'tokens/train_per_sec_per_gpu': '322.8', 'tokens/total': 215424, 'tokens/trainable': 66778, 'epoch': '2.408'}
 81%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                             | 308/382 [02:15<00:36,  2.04it/s] 81%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                            | 309/382 [02:16<00:31,  2.34it/s]                                                                                                                                                                                              {'loss': '3.895', 'grad_norm': '0.9536', 'learning_rate': '1.9e-05', 'ppl': '49.17', 'memory/max_active (GiB)': '0.22', 'memory/max_allocated (GiB)': '0.22', 'memory/device_reserved (GiB)': '0.36', 'tokens/train_per_sec_per_gpu': '546.1', 'tokens/total': 216064, 'tokens/trainable': 66976, 'epoch': '2.416'}
 81%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                            | 309/382 [02:16<00:31,  2.34it/s] 81%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                            | 310/382 [02:16<00:27,  2.60it/s]                                                                                                                                                                                              {'loss': '4.153', 'grad_norm': '1.398', 'learning_rate': '1.851e-05', 'ppl': '63.63', 'memory/max_active (GiB)': '0.21', 'memory/max_allocated (GiB)': '0.21', 'memory/device_reserved (GiB)': '0.36', 'tokens/train_per_sec_per_gpu': '335.2', 'tokens/total': 216704, 'tokens/trainable': 67155, 'epoch': '2.424'}
 81%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                            | 310/382 [02:16<00:27,  2.60it/s] 81%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                            | 311/382 [02:16<00:26,  2.70it/s]                                                                                                                                                                                              {'loss': '3.306', 'grad_norm': '0.7946', 'learning_rate': '1.802e-05', 'ppl': '27.29', 'memory/max_active (GiB)': '0.22', 'memory/max_allocated (GiB)': '0.22', 'memory/device_reserved (GiB)': '0.36', 'tokens/train_per_sec_per_gpu': '289.4', 'tokens/total': 217472, 'tokens/trainable': 67456, 'epoch': '2.431'}
 81%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                            | 311/382 [02:16<00:26,  2.70it/s] 82%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                           | 312/382 [02:17<00:23,  3.03it/s]                                                                                                                                                                                              {'loss': '3.919', 'grad_norm': '1.405', 'learning_rate': '1.754e-05', 'ppl': '50.38', 'memory/max_active (GiB)': '0.16', 'memory/max_allocated (GiB)': '0.16', 'memory/device_reserved (GiB)': '0.36', 'tokens/train_per_sec_per_gpu': '324.4', 'tokens/total': 217984, 'tokens/trainable': 67567, 'epoch': '2.439'}
 82%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                           | 312/382 [02:17<00:23,  3.03it/s] 82%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                           | 313/382 [02:17<00:20,  3.32it/s]                                                                                                                                                                                              {'loss': '3.483', 'grad_norm': '0.9307', 'learning_rate': '1.706e-05', 'ppl': '32.55', 'memory/max_active (GiB)': '0.21', 'memory/max_allocated (GiB)': '0.21', 'memory/device_reserved (GiB)': '0.36', 'tokens/train_per_sec_per_gpu': '129.6', 'tokens/total': 218496, 'tokens/trainable': 67761, 'epoch': '2.447'}
 82%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                           | 313/382 [02:17<00:20,  3.32it/s] 82%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                           | 314/382 [02:17<00:21,  3.20it/s]                                                                                                                                                                                              {'loss': '4.19', 'grad_norm': '1.185', 'learning_rate': '1.659e-05', 'ppl': '66', 'memory/max_active (GiB)': '0.22', 'memory/max_allocated (GiB)': '0.22', 'memory/device_reserved (GiB)': '0.36', 'tokens/train_per_sec_per_gpu': '214.9', 'tokens/total': 219264, 'tokens/trainable': 67918, 'epoch': '2.455'}
 82%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                           | 314/382 [02:17<00:21,  3.20it/s] 82%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                          | 315/382 [02:17<00:19,  3.48it/s]                                                                                                                                                                                              {'loss': '3.676', 'grad_norm': '0.9765', 'learning_rate': '1.613e-05', 'ppl': '39.5', 'memory/max_active (GiB)': '0.16', 'memory/max_allocated (GiB)': '0.16', 'memory/device_reserved (GiB)': '0.36', 'tokens/train_per_sec_per_gpu': '433.8', 'tokens/total': 219776, 'tokens/trainable': 68089, 'epoch': '2.463'}
 82%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                          | 315/382 [02:17<00:19,  3.48it/s] 83%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                          | 316/382 [02:18<00:18,  3.49it/s]                                                                                                                                                                                              {'loss': '3.877', 'grad_norm': '1.089', 'learning_rate': '1.567e-05', 'ppl': '48.29', 'memory/max_active (GiB)': '0.21', 'memory/max_allocated (GiB)': '0.21', 'memory/device_reserved (GiB)': '0.36', 'tokens/train_per_sec_per_gpu': '185.2', 'tokens/total': 220416, 'tokens/trainable': 68217, 'epoch': '2.471'}
 83%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                          | 316/382 [02:18<00:18,  3.49it/s] 83%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                         | 317/382 [02:18<00:18,  3.51it/s]                                                                                                                                                                                              {'loss': '3.044', 'grad_norm': '0.9688', 'learning_rate': '1.522e-05', 'ppl': '21', 'memory/max_active (GiB)': '0.22', 'memory/max_allocated (GiB)': '0.22', 'memory/device_reserved (GiB)': '0.36', 'tokens/train_per_sec_per_gpu': '338.7', 'tokens/total': 221056, 'tokens/trainable': 68382, 'epoch': '2.478'}
 83%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                         | 317/382 [02:18<00:18,  3.51it/s] 83%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                         | 318/382 [02:18<00:19,  3.32it/s]                                                                                                                                                                                              {'loss': '3.332', 'grad_norm': '0.96', 'learning_rate': '1.477e-05', 'ppl': '28', 'memory/max_active (GiB)': '0.22', 'memory/max_allocated (GiB)': '0.22', 'memory/device_reserved (GiB)': '0.36', 'tokens/train_per_sec_per_gpu': '209.1', 'tokens/total': 221824, 'tokens/trainable': 68575, 'epoch': '2.486'}
 83%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                         | 318/382 [02:18<00:19,  3.32it/s] 84%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                         | 319/382 [02:19<00:18,  3.38it/s]                                                                                                                                                                                              {'loss': '2.857', 'grad_norm': '0.9582', 'learning_rate': '1.433e-05', 'ppl': '17.4', 'memory/max_active (GiB)': '0.21', 'memory/max_allocated (GiB)': '0.21', 'memory/device_reserved (GiB)': '0.36', 'tokens/train_per_sec_per_gpu': '164.2', 'tokens/total': 222464, 'tokens/trainable': 68752, 'epoch': '2.494'}
 84%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                         | 319/382 [02:19<00:18,  3.38it/s] 84%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                        | 320/382 [02:19<00:19,  3.24it/s]                                                                                                                                                                                              {'loss': '3.638', 'grad_norm': '1.043', 'learning_rate': '1.39e-05', 'ppl': '38.03', 'memory/max_active (GiB)': '0.22', 'memory/max_allocated (GiB)': '0.22', 'memory/device_reserved (GiB)': '0.36', 'tokens/train_per_sec_per_gpu': '417.6', 'tokens/total': 223232, 'tokens/trainable': 69007, 'epoch': '2.502'}
 84%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                        | 320/382 [02:19<00:19,  3.24it/s] 84%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                        | 321/382 [02:19<00:18,  3.32it/s]                                                                                                                                                                                              {'loss': '4.012', 'grad_norm': '1.324', 'learning_rate': '1.347e-05', 'ppl': '55.23', 'memory/max_active (GiB)': '0.22', 'memory/max_allocated (GiB)': '0.22', 'memory/device_reserved (GiB)': '0.36', 'tokens/train_per_sec_per_gpu': '245.8', 'tokens/total': 223872, 'tokens/trainable': 69127, 'epoch': '2.51'}
 84%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                        | 321/382 [02:19<00:18,  3.32it/s] 84%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                       | 322/382 [02:19<00:17,  3.38it/s]                                                                                                                                                                                              {'loss': '4.076', 'grad_norm': '0.9566', 'learning_rate': '1.305e-05', 'ppl': '58.89', 'memory/max_active (GiB)': '0.21', 'memory/max_allocated (GiB)': '0.21', 'memory/device_reserved (GiB)': '0.36', 'tokens/train_per_sec_per_gpu': '139.4', 'tokens/total': 224512, 'tokens/trainable': 69299, 'epoch': '2.518'}
 84%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                       | 322/382 [02:19<00:17,  3.38it/s] 85%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                       | 323/382 [02:20<00:17,  3.43it/s]                                                                                                                                                                                              {'loss': '3.607', 'grad_norm': '1.473', 'learning_rate': '1.263e-05', 'ppl': '36.84', 'memory/max_active (GiB)': '0.22', 'memory/max_allocated (GiB)': '0.22', 'memory/device_reserved (GiB)': '0.36', 'tokens/train_per_sec_per_gpu': '267.9', 'tokens/total': 225152, 'tokens/trainable': 69410, 'epoch': '2.525'}
 85%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                       | 323/382 [02:20<00:17,  3.43it/s] 85%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                       | 324/382 [02:20<00:16,  3.46it/s]                                                                                                                                                                                              {'loss': '4.329', 'grad_norm': '1.234', 'learning_rate': '1.222e-05', 'ppl': '75.9', 'memory/max_active (GiB)': '0.21', 'memory/max_allocated (GiB)': '0.21', 'memory/device_reserved (GiB)': '0.36', 'tokens/train_per_sec_per_gpu': '325.1', 'tokens/total': 225792, 'tokens/trainable': 69557, 'epoch': '2.533'}
 85%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                       | 324/382 [02:20<00:16,  3.46it/s] 85%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                      | 325/382 [02:20<00:16,  3.48it/s]                                                                                                                                                                                              {'loss': '3.589', 'grad_norm': '0.8417', 'learning_rate': '1.182e-05', 'ppl': '36.18', 'memory/max_active (GiB)': '0.21', 'memory/max_allocated (GiB)': '0.21', 'memory/device_reserved (GiB)': '0.36', 'tokens/train_per_sec_per_gpu': '379.1', 'tokens/total': 226432, 'tokens/trainable': 69770, 'epoch': '2.541'}
 85%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                      | 325/382 [02:20<00:16,  3.48it/s] 85%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                      | 326/382 [02:21<00:15,  3.50it/s]                                                                                                                                                                                              {'loss': '2.867', 'grad_norm': '1.152', 'learning_rate': '1.142e-05', 'ppl': '17.59', 'memory/max_active (GiB)': '0.21', 'memory/max_allocated (GiB)': '0.21', 'memory/device_reserved (GiB)': '0.36', 'tokens/train_per_sec_per_gpu': '168.4', 'tokens/total': 227072, 'tokens/trainable': 69914, 'epoch': '2.549'}
 85%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                      | 326/382 [02:21<00:15,  3.50it/s] 86%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                     | 327/382 [02:21<00:16,  3.32it/s]                                                                                                                                                                                              {'loss': '3.486', 'grad_norm': '0.8504', 'learning_rate': '1.103e-05', 'ppl': '32.64', 'memory/max_active (GiB)': '0.22', 'memory/max_allocated (GiB)': '0.22', 'memory/device_reserved (GiB)': '0.36', 'tokens/train_per_sec_per_gpu': '254.4', 'tokens/total': 227840, 'tokens/trainable': 70122, 'epoch': '2.557'}
 86%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                     | 327/382 [02:21<00:16,  3.32it/s] 86%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                     | 328/382 [02:21<00:15,  3.38it/s]                                                                                                                                                                                              {'loss': '3.708', 'grad_norm': '1.215', 'learning_rate': '1.065e-05', 'ppl': '40.77', 'memory/max_active (GiB)': '0.21', 'memory/max_allocated (GiB)': '0.21', 'memory/device_reserved (GiB)': '0.36', 'tokens/train_per_sec_per_gpu': '260.1', 'tokens/total': 228480, 'tokens/trainable': 70266, 'epoch': '2.565'}
 86%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                     | 328/382 [02:21<00:15,  3.38it/s] 86%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                     | 329/382 [02:22<00:16,  3.24it/s]                                                                                                                                                                                              {'loss': '3.25', 'grad_norm': '0.8267', 'learning_rate': '1.027e-05', 'ppl': '25.78', 'memory/max_active (GiB)': '0.22', 'memory/max_allocated (GiB)': '0.22', 'memory/device_reserved (GiB)': '0.36', 'tokens/train_per_sec_per_gpu': '633.5', 'tokens/total': 229248, 'tokens/trainable': 70654, 'epoch': '2.573'}
 86%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                     | 329/382 [02:22<00:16,  3.24it/s] 86%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                    | 330/382 [02:22<00:16,  3.14it/s]                                                                                                                                                                                              {'loss': '3.513', 'grad_norm': '0.7664', 'learning_rate': '9.903e-06', 'ppl': '33.54', 'memory/max_active (GiB)': '0.22', 'memory/max_allocated (GiB)': '0.22', 'memory/device_reserved (GiB)': '0.36', 'tokens/train_per_sec_per_gpu': '327.7', 'tokens/total': 230016, 'tokens/trainable': 70881, 'epoch': '2.58'}
 86%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                    | 330/382 [02:22<00:16,  3.14it/s] 87%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                    | 331/382 [02:22<00:14,  3.43it/s]                                                                                                                                                                                              {'loss': '2.823', 'grad_norm': '1.302', 'learning_rate': '9.539e-06', 'ppl': '16.83', 'memory/max_active (GiB)': '0.16', 'memory/max_allocated (GiB)': '0.16', 'memory/device_reserved (GiB)': '0.36', 'tokens/train_per_sec_per_gpu': '376.8', 'tokens/total': 230528, 'tokens/trainable': 71015, 'epoch': '2.588'}
 87%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                    | 331/382 [02:22<00:14,  3.43it/s] 87%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                   | 332/382 [02:22<00:14,  3.46it/s]                                                                                                                                                                                              {'loss': '3.939', 'grad_norm': '1.066', 'learning_rate': '9.181e-06', 'ppl': '51.35', 'memory/max_active (GiB)': '0.21', 'memory/max_allocated (GiB)': '0.21', 'memory/device_reserved (GiB)': '0.36', 'tokens/train_per_sec_per_gpu': '307.1', 'tokens/total': 231168, 'tokens/trainable': 71203, 'epoch': '2.596'}
 87%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                   | 332/382 [02:22<00:14,  3.46it/s] 87%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                   | 333/382 [02:23<00:14,  3.29it/s]                                                                                                                                                                                              {'loss': '3.878', 'grad_norm': '0.8538', 'learning_rate': '8.83e-06', 'ppl': '48.35', 'memory/max_active (GiB)': '0.22', 'memory/max_allocated (GiB)': '0.22', 'memory/device_reserved (GiB)': '0.36', 'tokens/train_per_sec_per_gpu': '307.8', 'tokens/total': 231936, 'tokens/trainable': 71387, 'epoch': '2.604'}
 87%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                   | 333/382 [02:23<00:14,  3.29it/s] 87%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                   | 334/382 [02:23<00:15,  3.20it/s]                                                                                                                                                                                              {'loss': '3.418', 'grad_norm': '0.7094', 'learning_rate': '8.485e-06', 'ppl': '30.49', 'memory/max_active (GiB)': '0.27', 'memory/max_allocated (GiB)': '0.27', 'memory/device_reserved (GiB)': '0.36', 'tokens/train_per_sec_per_gpu': '151.3', 'tokens/total': 232704, 'tokens/trainable': 71676, 'epoch': '2.612'}
 87%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                   | 334/382 [02:23<00:15,  3.20it/s] 88%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                  | 335/382 [02:23<00:13,  3.48it/s]                                                                                                                                                                                              {'loss': '3.441', 'grad_norm': '1.096', 'learning_rate': '8.147e-06', 'ppl': '31.22', 'memory/max_active (GiB)': '0.16', 'memory/max_allocated (GiB)': '0.16', 'memory/device_reserved (GiB)': '0.36', 'tokens/train_per_sec_per_gpu': '173.2', 'tokens/total': 233216, 'tokens/trainable': 71824, 'epoch': '2.62'}
 88%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                  | 335/382 [02:23<00:13,  3.48it/s] 88%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                  | 336/382 [02:24<00:13,  3.31it/s]                                                                                                                                                                                              {'loss': '3.793', 'grad_norm': '0.7229', 'learning_rate': '7.816e-06', 'ppl': '44.38', 'memory/max_active (GiB)': '0.22', 'memory/max_allocated (GiB)': '0.22', 'memory/device_reserved (GiB)': '0.36', 'tokens/train_per_sec_per_gpu': '535', 'tokens/total': 233984, 'tokens/trainable': 72150, 'epoch': '2.627'}
 88%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                  | 336/382 [02:24<00:13,  3.31it/s] 88%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                 | 337/382 [02:24<00:14,  3.17it/s]                                                                                                                                                                                              {'loss': '3.541', 'grad_norm': '0.8475', 'learning_rate': '7.491e-06', 'ppl': '34.49', 'memory/max_active (GiB)': '0.22', 'memory/max_allocated (GiB)': '0.22', 'memory/device_reserved (GiB)': '0.36', 'tokens/train_per_sec_per_gpu': '281.2', 'tokens/total': 234752, 'tokens/trainable': 72348, 'epoch': '2.635'}
 88%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                 | 337/382 [02:24<00:14,  3.17it/s] 88%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                 | 338/382 [02:24<00:14,  2.96it/s]                                                                                                                                                                                              {'loss': '4.409', 'grad_norm': '0.7078', 'learning_rate': '7.173e-06', 'ppl': '82.19', 'memory/max_active (GiB)': '0.27', 'memory/max_allocated (GiB)': '0.27', 'memory/device_reserved (GiB)': '0.36', 'tokens/train_per_sec_per_gpu': '264.7', 'tokens/total': 235648, 'tokens/trainable': 72697, 'epoch': '2.643'}
 88%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                 | 338/382 [02:24<00:14,  2.96it/s] 89%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                 | 339/382 [02:25<00:15,  2.72it/s]                                                                                                                                                                                              {'loss': '3.324', 'grad_norm': '0.6823', 'learning_rate': '6.861e-06', 'ppl': '27.76', 'memory/max_active (GiB)': '0.27', 'memory/max_allocated (GiB)': '0.27', 'memory/device_reserved (GiB)': '0.36', 'tokens/train_per_sec_per_gpu': '422', 'tokens/total': 236672, 'tokens/trainable': 73031, 'epoch': '2.651'}
 89%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                 | 339/382 [02:25<00:15,  2.72it/s] 89%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                | 340/382 [02:25<00:15,  2.79it/s]                                                                                                                                                                                              {'loss': '4.106', 'grad_norm': '0.893', 'learning_rate': '6.556e-06', 'ppl': '60.69', 'memory/max_active (GiB)': '0.22', 'memory/max_allocated (GiB)': '0.22', 'memory/device_reserved (GiB)': '0.36', 'tokens/train_per_sec_per_gpu': '286.7', 'tokens/total': 237440, 'tokens/trainable': 73219, 'epoch': '2.659'}
 89%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                | 340/382 [02:25<00:15,  2.79it/s] 89%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                | 341/382 [02:26<00:15,  2.72it/s]                                                                                                                                                                                              {'loss': '3.859', 'grad_norm': '0.6602', 'learning_rate': '6.258e-06', 'ppl': '47.41', 'memory/max_active (GiB)': '0.27', 'memory/max_allocated (GiB)': '0.27', 'memory/device_reserved (GiB)': '0.36', 'tokens/train_per_sec_per_gpu': '373.8', 'tokens/total': 238336, 'tokens/trainable': 73566, 'epoch': '2.667'}
 89%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                | 341/382 [02:26<00:15,  2.72it/s] 90%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏               | 342/382 [02:26<00:14,  2.79it/s]                                                                                                                                                                                              {'loss': '3.273', 'grad_norm': '0.7559', 'learning_rate': '5.967e-06', 'ppl': '26.39', 'memory/max_active (GiB)': '0.27', 'memory/max_allocated (GiB)': '0.27', 'memory/device_reserved (GiB)': '0.36', 'tokens/train_per_sec_per_gpu': '271.8', 'tokens/total': 239104, 'tokens/trainable': 73899, 'epoch': '2.675'}
 90%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏               | 342/382 [02:26<00:14,  2.79it/s] 90%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌               | 343/382 [02:26<00:13,  2.84it/s]                                                                                                                                                                                              {'loss': '3.495', 'grad_norm': '0.7813', 'learning_rate': '5.682e-06', 'ppl': '32.94', 'memory/max_active (GiB)': '0.22', 'memory/max_allocated (GiB)': '0.22', 'memory/device_reserved (GiB)': '0.36', 'tokens/train_per_sec_per_gpu': '400.4', 'tokens/total': 239872, 'tokens/trainable': 74139, 'epoch': '2.682'}
 90%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌               | 343/382 [02:26<00:13,  2.84it/s] 90%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉               | 344/382 [02:27<00:13,  2.87it/s]                                                                                                                                                                                              {'loss': '3.525', 'grad_norm': '0.8053', 'learning_rate': '5.404e-06', 'ppl': '33.96', 'memory/max_active (GiB)': '0.22', 'memory/max_allocated (GiB)': '0.22', 'memory/device_reserved (GiB)': '0.36', 'tokens/train_per_sec_per_gpu': '259.7', 'tokens/total': 240640, 'tokens/trainable': 74366, 'epoch': '2.69'}
 90%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉               | 344/382 [02:27<00:13,  2.87it/s] 90%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎              | 345/382 [02:27<00:11,  3.20it/s]                                                                                                                                                                                              {'loss': '3.178', 'grad_norm': '1.053', 'learning_rate': '5.133e-06', 'ppl': '24', 'memory/max_active (GiB)': '0.16', 'memory/max_allocated (GiB)': '0.16', 'memory/device_reserved (GiB)': '0.36', 'tokens/train_per_sec_per_gpu': '297.2', 'tokens/total': 241152, 'tokens/trainable': 74481, 'epoch': '2.698'}
 90%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎              | 345/382 [02:27<00:11,  3.20it/s] 91%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊              | 346/382 [02:27<00:11,  3.12it/s]                                                                                                                                                                                              {'loss': '3.505', 'grad_norm': '0.817', 'learning_rate': '4.868e-06', 'ppl': '33.27', 'memory/max_active (GiB)': '0.22', 'memory/max_allocated (GiB)': '0.22', 'memory/device_reserved (GiB)': '0.36', 'tokens/train_per_sec_per_gpu': '325.5', 'tokens/total': 241920, 'tokens/trainable': 74695, 'epoch': '2.706'}
 91%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊              | 346/382 [02:27<00:11,  3.12it/s] 91%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏             | 347/382 [02:27<00:10,  3.24it/s]                                                                                                                                                                                              {'loss': '3.926', 'grad_norm': '0.8433', 'learning_rate': '4.611e-06', 'ppl': '50.73', 'memory/max_active (GiB)': '0.22', 'memory/max_allocated (GiB)': '0.22', 'memory/device_reserved (GiB)': '0.36', 'tokens/train_per_sec_per_gpu': '528.8', 'tokens/total': 242560, 'tokens/trainable': 74905, 'epoch': '2.714'}
 91%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏             | 347/382 [02:27<00:10,  3.24it/s] 91%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌             | 348/382 [02:28<00:10,  3.32it/s]                                                                                                                                                                                              {'loss': '3.476', 'grad_norm': '1.056', 'learning_rate': '4.36e-06', 'ppl': '32.33', 'memory/max_active (GiB)': '0.21', 'memory/max_allocated (GiB)': '0.21', 'memory/device_reserved (GiB)': '0.36', 'tokens/train_per_sec_per_gpu': '182', 'tokens/total': 243200, 'tokens/trainable': 75050, 'epoch': '2.722'}
 91%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌             | 348/382 [02:28<00:10,  3.32it/s] 91%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉             | 349/382 [02:28<00:10,  3.21it/s]                                                                                                                                                                                              {'loss': '3.265', 'grad_norm': '0.7648', 'learning_rate': '4.116e-06', 'ppl': '26.17', 'memory/max_active (GiB)': '0.27', 'memory/max_allocated (GiB)': '0.27', 'memory/device_reserved (GiB)': '0.36', 'tokens/train_per_sec_per_gpu': '162.8', 'tokens/total': 243968, 'tokens/trainable': 75356, 'epoch': '2.729'}
 91%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉             | 349/382 [02:28<00:10,  3.21it/s] 92%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎            | 350/382 [02:28<00:10,  3.13it/s]                                                                                                                                                                                              {'loss': '4.287', 'grad_norm': '0.8828', 'learning_rate': '3.879e-06', 'ppl': '72.75', 'memory/max_active (GiB)': '0.22', 'memory/max_allocated (GiB)': '0.22', 'memory/device_reserved (GiB)': '0.36', 'tokens/train_per_sec_per_gpu': '481.3', 'tokens/total': 244736, 'tokens/trainable': 75619, 'epoch': '2.737'}
 92%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎            | 350/382 [02:28<00:10,  3.13it/s] 92%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋            | 351/382 [02:29<00:10,  3.08it/s]                                                                                                                                                                                              {'loss': '3.788', 'grad_norm': '0.8247', 'learning_rate': '3.649e-06', 'ppl': '44.19', 'memory/max_active (GiB)': '0.22', 'memory/max_allocated (GiB)': '0.22', 'memory/device_reserved (GiB)': '0.36', 'tokens/train_per_sec_per_gpu': '331.7', 'tokens/total': 245504, 'tokens/trainable': 75839, 'epoch': '2.745'}
 92%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋            | 351/382 [02:29<00:10,  3.08it/s] 92%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏           | 352/382 [02:29<00:09,  3.04it/s]                                                                                                                                                                                              {'loss': '3.34', 'grad_norm': '0.8649', 'learning_rate': '3.426e-06', 'ppl': '28.22', 'memory/max_active (GiB)': '0.22', 'memory/max_allocated (GiB)': '0.22', 'memory/device_reserved (GiB)': '0.36', 'tokens/train_per_sec_per_gpu': '487', 'tokens/total': 246272, 'tokens/trainable': 76079, 'epoch': '2.753'}
 92%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏           | 352/382 [02:29<00:09,  3.04it/s] 92%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌           | 353/382 [02:29<00:09,  3.02it/s]                                                                                                                                                                                              {'loss': '3.003', 'grad_norm': '0.8977', 'learning_rate': '3.209e-06', 'ppl': '20.15', 'memory/max_active (GiB)': '0.22', 'memory/max_allocated (GiB)': '0.22', 'memory/device_reserved (GiB)': '0.36', 'tokens/train_per_sec_per_gpu': '269', 'tokens/total': 247040, 'tokens/trainable': 76289, 'epoch': '2.761'}
 92%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌           | 353/382 [02:29<00:09,  3.02it/s] 93%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉           | 354/382 [02:30<00:08,  3.15it/s]                                                                                                                                                                                              {'loss': '3.751', 'grad_norm': '0.8489', 'learning_rate': '3e-06', 'ppl': '42.57', 'memory/max_active (GiB)': '0.22', 'memory/max_allocated (GiB)': '0.22', 'memory/device_reserved (GiB)': '0.36', 'tokens/train_per_sec_per_gpu': '585.3', 'tokens/total': 247680, 'tokens/trainable': 76497, 'epoch': '2.769'}
 93%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉           | 354/382 [02:30<00:08,  3.15it/s] 93%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎          | 355/382 [02:30<00:08,  3.26it/s]                                                                                                                                                                                              {'loss': '3.445', 'grad_norm': '1.089', 'learning_rate': '2.798e-06', 'ppl': '31.36', 'memory/max_active (GiB)': '0.21', 'memory/max_allocated (GiB)': '0.21', 'memory/device_reserved (GiB)': '0.36', 'tokens/train_per_sec_per_gpu': '182', 'tokens/total': 248320, 'tokens/trainable': 76684, 'epoch': '2.776'}
 93%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎          | 355/382 [02:30<00:08,  3.26it/s] 93%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋          | 356/382 [02:30<00:07,  3.34it/s]                                                                                                                                                                                              {'loss': '3.806', 'grad_norm': '0.9037', 'learning_rate': '2.602e-06', 'ppl': '44.98', 'memory/max_active (GiB)': '0.22', 'memory/max_allocated (GiB)': '0.22', 'memory/device_reserved (GiB)': '0.36', 'tokens/train_per_sec_per_gpu': '556.8', 'tokens/total': 248960, 'tokens/trainable': 76916, 'epoch': '2.784'}
 93%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋          | 356/382 [02:30<00:07,  3.34it/s] 93%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████          | 357/382 [02:31<00:08,  3.06it/s]                                                                                                                                                                                              {'loss': '3.773', 'grad_norm': '0.683', 'learning_rate': '2.414e-06', 'ppl': '43.52', 'memory/max_active (GiB)': '0.27', 'memory/max_allocated (GiB)': '0.27', 'memory/device_reserved (GiB)': '0.36', 'tokens/train_per_sec_per_gpu': '581.1', 'tokens/total': 249856, 'tokens/trainable': 77266, 'epoch': '2.792'}
 93%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████          | 357/382 [02:31<00:08,  3.06it/s] 94%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌         | 358/382 [02:31<00:07,  3.19it/s]                                                                                                                                                                                              {'loss': '3.295', 'grad_norm': '0.9091', 'learning_rate': '2.232e-06', 'ppl': '26.97', 'memory/max_active (GiB)': '0.21', 'memory/max_allocated (GiB)': '0.21', 'memory/device_reserved (GiB)': '0.36', 'tokens/train_per_sec_per_gpu': '367.7', 'tokens/total': 250496, 'tokens/trainable': 77473, 'epoch': '2.8'}
 94%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌         | 358/382 [02:31<00:07,  3.19it/s] 94%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉         | 359/382 [02:31<00:07,  3.28it/s]                                                                                                                                                                                              {'loss': '2.905', 'grad_norm': '1.053', 'learning_rate': '2.058e-06', 'ppl': '18.26', 'memory/max_active (GiB)': '0.22', 'memory/max_allocated (GiB)': '0.22', 'memory/device_reserved (GiB)': '0.36', 'tokens/train_per_sec_per_gpu': '246.1', 'tokens/total': 251136, 'tokens/trainable': 77632, 'epoch': '2.808'}
 94%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉         | 359/382 [02:31<00:07,  3.28it/s] 94%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎        | 360/382 [02:31<00:06,  3.35it/s]                                                                                                                                                                                              {'loss': '4.302', 'grad_norm': '1.374', 'learning_rate': '1.891e-06', 'ppl': '73.82', 'memory/max_active (GiB)': '0.21', 'memory/max_allocated (GiB)': '0.21', 'memory/device_reserved (GiB)': '0.36', 'tokens/train_per_sec_per_gpu': '153.3', 'tokens/total': 251776, 'tokens/trainable': 77791, 'epoch': '2.816'}
 94%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎        | 360/382 [02:31<00:06,  3.35it/s] 95%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋        | 361/382 [02:32<00:06,  3.22it/s]                                                                                                                                                                                              {'loss': '3.534', 'grad_norm': '0.9552', 'learning_rate': '1.73e-06', 'ppl': '34.27', 'memory/max_active (GiB)': '0.22', 'memory/max_allocated (GiB)': '0.22', 'memory/device_reserved (GiB)': '0.36', 'tokens/train_per_sec_per_gpu': '235.7', 'tokens/total': 252544, 'tokens/trainable': 77984, 'epoch': '2.824'}
 95%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋        | 361/382 [02:32<00:06,  3.22it/s] 95%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████        | 362/382 [02:32<00:05,  3.49it/s]                                                                                                                                                                                              {'loss': '3.527', 'grad_norm': '1.156', 'learning_rate': '1.577e-06', 'ppl': '34.03', 'memory/max_active (GiB)': '0.16', 'memory/max_allocated (GiB)': '0.16', 'memory/device_reserved (GiB)': '0.36', 'tokens/train_per_sec_per_gpu': '269.7', 'tokens/total': 253056, 'tokens/trainable': 78093, 'epoch': '2.831'}
 95%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████        | 362/382 [02:32<00:05,  3.49it/s] 95%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍       | 363/382 [02:32<00:05,  3.50it/s]                                                                                                                                                                                              {'loss': '3.985', 'grad_norm': '0.8407', 'learning_rate': '1.431e-06', 'ppl': '53.78', 'memory/max_active (GiB)': '0.21', 'memory/max_allocated (GiB)': '0.21', 'memory/device_reserved (GiB)': '0.36', 'tokens/train_per_sec_per_gpu': '436', 'tokens/total': 253696, 'tokens/trainable': 78346, 'epoch': '2.839'}
 95%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍       | 363/382 [02:32<00:05,  3.50it/s] 95%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉       | 364/382 [02:33<00:04,  3.72it/s]                                                                                                                                                                                              {'loss': '3.434', 'grad_norm': '1.078', 'learning_rate': '1.291e-06', 'ppl': '31.01', 'memory/max_active (GiB)': '0.16', 'memory/max_allocated (GiB)': '0.16', 'memory/device_reserved (GiB)': '0.36', 'tokens/train_per_sec_per_gpu': '296.6', 'tokens/total': 254208, 'tokens/trainable': 78476, 'epoch': '2.847'}
 95%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉       | 364/382 [02:33<00:04,  3.72it/s] 96%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎      | 365/382 [02:33<00:04,  3.47it/s]                                                                                                                                                                                              {'loss': '3.441', 'grad_norm': '0.8911', 'learning_rate': '1.159e-06', 'ppl': '31.21', 'memory/max_active (GiB)': '0.27', 'memory/max_allocated (GiB)': '0.27', 'memory/device_reserved (GiB)': '0.36', 'tokens/train_per_sec_per_gpu': '540.7', 'tokens/total': 254976, 'tokens/trainable': 78743, 'epoch': '2.855'}
 96%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎      | 365/382 [02:33<00:04,  3.47it/s] 96%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋      | 366/382 [02:33<00:04,  3.49it/s]                                                                                                                                                                                              {'loss': '3.497', 'grad_norm': '0.9519', 'learning_rate': '1.034e-06', 'ppl': '33.02', 'memory/max_active (GiB)': '0.21', 'memory/max_allocated (GiB)': '0.21', 'memory/device_reserved (GiB)': '0.36', 'tokens/train_per_sec_per_gpu': '235.3', 'tokens/total': 255616, 'tokens/trainable': 79003, 'epoch': '2.863'}
 96%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋      | 366/382 [02:33<00:04,  3.49it/s] 96%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████      | 367/382 [02:33<00:04,  3.31it/s]                                                                                                                                                                                              {'loss': '4.068', 'grad_norm': '0.8022', 'learning_rate': '9.164e-07', 'ppl': '58.44', 'memory/max_active (GiB)': '0.22', 'memory/max_allocated (GiB)': '0.22', 'memory/device_reserved (GiB)': '0.36', 'tokens/train_per_sec_per_gpu': '468.3', 'tokens/total': 256384, 'tokens/trainable': 79246, 'epoch': '2.871'}
 96%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████      | 367/382 [02:33<00:04,  3.31it/s] 96%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍     | 368/382 [02:34<00:04,  3.38it/s]                                                                                                                                                                                              {'loss': '3.182', 'grad_norm': '1.044', 'learning_rate': '8.056e-07', 'ppl': '24.09', 'memory/max_active (GiB)': '0.21', 'memory/max_allocated (GiB)': '0.21', 'memory/device_reserved (GiB)': '0.36', 'tokens/train_per_sec_per_gpu': '185.6', 'tokens/total': 257024, 'tokens/trainable': 79403, 'epoch': '2.878'}
 96%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍     | 368/382 [02:34<00:04,  3.38it/s] 97%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊     | 369/382 [02:34<00:03,  3.59it/s]                                                                                                                                                                                              {'loss': '3.817', 'grad_norm': '1.044', 'learning_rate': '7.019e-07', 'ppl': '45.47', 'memory/max_active (GiB)': '0.16', 'memory/max_allocated (GiB)': '0.16', 'memory/device_reserved (GiB)': '0.36', 'tokens/train_per_sec_per_gpu': '284.3', 'tokens/total': 257536, 'tokens/trainable': 79561, 'epoch': '2.886'}
 97%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊     | 369/382 [02:34<00:03,  3.59it/s] 97%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎    | 370/382 [02:34<00:03,  3.37it/s]                                                                                                                                                                                              {'loss': '3.54', 'grad_norm': '0.6841', 'learning_rate': '6.053e-07', 'ppl': '34.47', 'memory/max_active (GiB)': '0.22', 'memory/max_allocated (GiB)': '0.22', 'memory/device_reserved (GiB)': '0.36', 'tokens/train_per_sec_per_gpu': '509.1', 'tokens/total': 258304, 'tokens/trainable': 79843, 'epoch': '2.894'}
 97%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎    | 370/382 [02:34<00:03,  3.37it/s] 97%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋    | 371/382 [02:35<00:03,  3.62it/s]                                                                                                                                                                                              {'loss': '3.705', 'grad_norm': '1.176', 'learning_rate': '5.158e-07', 'ppl': '40.66', 'memory/max_active (GiB)': '0.16', 'memory/max_allocated (GiB)': '0.16', 'memory/device_reserved (GiB)': '0.36', 'tokens/train_per_sec_per_gpu': '217.6', 'tokens/total': 258816, 'tokens/trainable': 79964, 'epoch': '2.902'}
 97%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋    | 371/382 [02:35<00:03,  3.62it/s] 97%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████    | 372/382 [02:35<00:02,  3.59it/s]                                                                                                                                                                                              {'loss': '3.472', 'grad_norm': '1.009', 'learning_rate': '4.335e-07', 'ppl': '32.2', 'memory/max_active (GiB)': '0.21', 'memory/max_allocated (GiB)': '0.21', 'memory/device_reserved (GiB)': '0.36', 'tokens/train_per_sec_per_gpu': '284.5', 'tokens/total': 259456, 'tokens/trainable': 80139, 'epoch': '2.91'}
 97%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████    | 372/382 [02:35<00:02,  3.59it/s] 98%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍   | 373/382 [02:35<00:02,  3.21it/s]                                                                                                                                                                                              {'loss': '3.784', 'grad_norm': '0.8152', 'learning_rate': '3.583e-07', 'ppl': '43.99', 'memory/max_active (GiB)': '0.27', 'memory/max_allocated (GiB)': '0.27', 'memory/device_reserved (GiB)': '0.36', 'tokens/train_per_sec_per_gpu': '399.5', 'tokens/total': 260352, 'tokens/trainable': 80477, 'epoch': '2.918'}
 98%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍   | 373/382 [02:35<00:02,  3.21it/s] 98%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊   | 374/382 [02:35<00:02,  3.49it/s]                                                                                                                                                                                              {'loss': '4.074', 'grad_norm': '1.42', 'learning_rate': '2.903e-07', 'ppl': '58.78', 'memory/max_active (GiB)': '0.16', 'memory/max_allocated (GiB)': '0.16', 'memory/device_reserved (GiB)': '0.36', 'tokens/train_per_sec_per_gpu': '181.6', 'tokens/total': 260864, 'tokens/trainable': 80572, 'epoch': '2.925'}
 98%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊   | 374/382 [02:35<00:02,  3.49it/s] 98%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏  | 375/382 [02:36<00:02,  3.31it/s]                                                                                                                                                                                              {'loss': '4.469', 'grad_norm': '1.051', 'learning_rate': '2.294e-07', 'ppl': '87.3', 'memory/max_active (GiB)': '0.22', 'memory/max_allocated (GiB)': '0.22', 'memory/device_reserved (GiB)': '0.36', 'tokens/train_per_sec_per_gpu': '313.4', 'tokens/total': 261632, 'tokens/trainable': 80769, 'epoch': '2.933'}
 98%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏  | 375/382 [02:36<00:02,  3.31it/s] 98%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋  | 376/382 [02:36<00:01,  3.37it/s]                                                                                                                                                                                              {'loss': '4.525', 'grad_norm': '1.148', 'learning_rate': '1.756e-07', 'ppl': '92.26', 'memory/max_active (GiB)': '0.21', 'memory/max_allocated (GiB)': '0.21', 'memory/device_reserved (GiB)': '0.36', 'tokens/train_per_sec_per_gpu': '289.3', 'tokens/total': 262272, 'tokens/trainable': 80898, 'epoch': '2.941'}
 98%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋  | 376/382 [02:36<00:01,  3.37it/s] 99%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████  | 377/382 [02:36<00:01,  3.24it/s]                                                                                                                                                                                              {'loss': '4.046', 'grad_norm': '0.8811', 'learning_rate': '1.29e-07', 'ppl': '57.19', 'memory/max_active (GiB)': '0.22', 'memory/max_allocated (GiB)': '0.22', 'memory/device_reserved (GiB)': '0.36', 'tokens/train_per_sec_per_gpu': '208.9', 'tokens/total': 263040, 'tokens/trainable': 81107, 'epoch': '2.949'}
 99%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████  | 377/382 [02:36<00:01,  3.24it/s] 99%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍ | 378/382 [02:37<00:01,  3.15it/s]                                                                                                                                                                                              {'loss': '3.377', 'grad_norm': '0.7525', 'learning_rate': '8.962e-08', 'ppl': '29.28', 'memory/max_active (GiB)': '0.22', 'memory/max_allocated (GiB)': '0.22', 'memory/device_reserved (GiB)': '0.36', 'tokens/train_per_sec_per_gpu': '384.8', 'tokens/total': 263808, 'tokens/trainable': 81318, 'epoch': '2.957'}
 99%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍ | 378/382 [02:37<00:01,  3.15it/s] 99%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊ | 379/382 [02:37<00:00,  3.09it/s]                                                                                                                                                                                              {'loss': '3.55', 'grad_norm': '0.7345', 'learning_rate': '5.736e-08', 'ppl': '34.83', 'memory/max_active (GiB)': '0.22', 'memory/max_allocated (GiB)': '0.22', 'memory/device_reserved (GiB)': '0.36', 'tokens/train_per_sec_per_gpu': '372.8', 'tokens/total': 264576, 'tokens/trainable': 81612, 'epoch': '2.965'}
 99%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊ | 379/382 [02:37<00:00,  3.09it/s] 99%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏| 380/382 [02:37<00:00,  2.91it/s]                                                                                                                                                                                              {'loss': '2.94', 'grad_norm': '0.6685', 'learning_rate': '3.227e-08', 'ppl': '18.92', 'memory/max_active (GiB)': '0.27', 'memory/max_allocated (GiB)': '0.27', 'memory/device_reserved (GiB)': '0.36', 'tokens/train_per_sec_per_gpu': '583.5', 'tokens/total': 265472, 'tokens/trainable': 81950, 'epoch': '2.973'}
 99%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏| 380/382 [02:37<00:00,  2.91it/s]100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌| 381/382 [02:38<00:00,  2.93it/s]                                                                                                                                                                                              {'loss': '3.181', 'grad_norm': '0.9305', 'learning_rate': '1.434e-08', 'ppl': '24.07', 'memory/max_active (GiB)': '0.22', 'memory/max_allocated (GiB)': '0.22', 'memory/device_reserved (GiB)': '0.36', 'tokens/train_per_sec_per_gpu': '164.2', 'tokens/total': 266240, 'tokens/trainable': 82138, 'epoch': '2.98'}
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌| 381/382 [02:38<00:00,  2.93it/s]100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 382/382 [02:38<00:00,  3.25it/s]                                                                                                                                                                                              {'loss': '3.704', 'grad_norm': '1.14', 'learning_rate': '3.585e-09', 'ppl': '40.62', 'memory/max_active (GiB)': '0.16', 'memory/max_allocated (GiB)': '0.16', 'memory/device_reserved (GiB)': '0.36', 'tokens/train_per_sec_per_gpu': '359.1', 'tokens/total': 266752, 'tokens/trainable': 82265, 'epoch': '2.988'}
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 382/382 [02:38<00:00,  3.25it/s][2026-03-23 06:26:31,300] [INFO] [axolotl.core.trainers.base._save:721] [PID:323] Saving model checkpoint to ./final_model/checkpoint-382
                                                                                                                                                                                              {'train_runtime': '167.9', 'train_samples_per_second': '9.1', 'train_steps_per_second': '2.275', 'train_loss': '3.82', 'memory/max_active (GiB)': '0.06', 'memory/max_allocated (GiB)': '0.06', 'memory/device_reserved (GiB)': '0.36', 'epoch': '2.988', 'tokens/train_per_sec_per_gpu': '0'}
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 382/382 [02:44<00:00,  3.25it/s]100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 382/382 [02:44<00:00,  2.33it/s]
[2026-03-23 06:26:42,574] [INFO] [axolotl.train.save_trained_model:237] [PID:323] Training completed! Saving trained model to ./final_model.
[2026-03-23 06:26:43,067] [INFO] [axolotl.train.save_trained_model:351] [PID:323] Model successfully saved to ./final_model
[2026-03-23 06:26:43,072] [INFO] [axolotl.core.trainers.base._save:721] [PID:323] Saving model checkpoint to ./final_model
Processing Files (0 / 0)      : |                                                                                                                                |  0.00B /  0.00B            
New Data Upload               : |                                                                                                                                |  0.00B /  0.00B            [A

  ...l_model/training_args.bin: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 6.67kB / 6.67kB            [A[A


  ...adapter_model.safetensors: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1.08MB / 1.08MB            [A[A[A

  ...l_model/training_args.bin: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 6.67kB / 6.67kB            [A[A


  ...adapter_model.safetensors: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1.08MB / 1.08MB            [A[A[AProcessing Files (2 / 2)      : 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1.09MB / 1.09MB,   ???B/s  

  ...l_model/training_args.bin: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 6.67kB / 6.67kB            [A[A


  ...adapter_model.safetensors: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1.08MB / 1.08MB            [A[A[A

  ...l_model/training_args.bin: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 6.67kB / 6.67kB            [A[A


  ...adapter_model.safetensors: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1.08MB / 1.08MB            [A[A[AProcessing Files (2 / 2)      : 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1.09MB / 1.09MB,  0.00B/s  
New Data Upload               : |                                                                                                                                |  0.00B /  0.00B,  0.00B/s  
  ...l_model/training_args.bin: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 6.67kB / 6.67kB            
  ...adapter_model.safetensors: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1.08MB / 1.08MB            
Exception ignored in atexit callback: <function _start_and_connect_service.<locals>.teardown_atexit at 0x7724f5a9bce0>
Traceback (most recent call last):
  File "/root/miniconda3/envs/py3.11/lib/python3.11/site-packages/wandb/sdk/lib/service/service_connection.py", line 73, in teardown_atexit
[2026-03-23 06:34:33,895] [WARNING] [asyncio.write:1054] [PID:323] socket.send() raised exception.
    conn.teardown(hooks.exit_code)
  File "/root/miniconda3/envs/py3.11/lib/python3.11/site-packages/wandb/sdk/lib/service/service_connection.py", line 346, in teardown
    return self._proc.join()
[2026-03-23 06:34:33,897] [WARNING] [asyncio.write:1054] [PID:323] socket.send() raised exception.
           ^^^^^^^^^^^^^^^^^
[2026-03-23 06:34:33,898] [WARNING] [asyncio.write:1054] [PID:323] socket.send() raised exception.
  File "/root/miniconda3/envs/py3.11/lib/python3.11/site-packages/wandb/sdk/lib/service/service_process.py", line 57, in join
[2026-03-23 06:34:33,899] [WARNING] [asyncio.write:1054] [PID:323] socket.send() raised exception.
[2026-03-23 06:34:33,900] [WARNING] [asyncio.write:1054] [PID:323] socket.send() raised exception.
[2026-03-23 06:34:33,900] [WARNING] [asyncio.write:1054] [PID:323] socket.send() raised exception.
[2026-03-23 06:34:33,900] [WARNING] [asyncio.write:1054] [PID:323] socket.send() raised exception.
[2026-03-23 06:34:33,900] [WARNING] [asyncio.write:1054] [PID:323] socket.send() raised exception.
    return self._process.wait()
           ^^^^^^^^^^^^^^^^^^^^
  File "/root/miniconda3/envs/py3.11/lib/python3.11/subprocess.py", line 1264, in wait
    return self._wait(timeout=timeout)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/root/miniconda3/envs/py3.11/lib/python3.11/subprocess.py", line 2053, in _wait
    (pid, sts) = self._try_wait(0)
                 ^^^^^^^^^^^^^^^^^
  File "/root/miniconda3/envs/py3.11/lib/python3.11/subprocess.py", line 2011, in _try_wait
    (pid, sts) = os.waitpid(self.pid, wait_flags)
                 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/workspace/axolotl/src/axolotl/train.py", line 171, in <lambda>
    lambda signum, frame: terminate_handler(signum, frame, _model_weakref),
                          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/workspace/axolotl/src/axolotl/train.py", line 166, in terminate_handler
    sys.exit(0)
  File "/root/miniconda3/envs/py3.11/lib/python3.11/site-packages/wandb/sdk/lib/exit_hooks.py", line 38, in exit
    self._orig_exit(orig_code)  # type: ignore
    ^^^^^^^^^^^^^^^^^^^^^^^^^^
SystemExit: 0
[2026-03-23 06:35:21,063] [DEBUG] [axolotl.utils.config.log_gpu_memory_usage:127] [PID:774] baseline 0.000GB (+0.000GB allocated, +0.002GB reserved)
[2026-03-23 06:35:21,064] [INFO] [axolotl.cli.config.load_cfg:341] [PID:774] config:
{
  "activation_offloading": false,
  "adapter": "lora",
  "auto_resume_from_checkpoints": true,
  "axolotl_config_path": "train.yml",
  "base_model": "PicoKittens/PicoMistral-23M",
  "base_model_config": "PicoKittens/PicoMistral-23M",
  "batch_size": 4,
  "bf16": false,
  "capabilities": {
    "bf16": true,
    "compute_capability": "sm_75",
    "fp8": false,
    "n_gpu": 1,
    "n_node": 1,
    "tf32": false
  },
  "context_parallel_size": 1,
  "dataloader_num_workers": 1,
  "dataloader_pin_memory": true,
  "dataloader_prefetch_factor": 256,
  "dataset_num_proc": 16,
  "datasets": [
    {
      "message_property_mappings": {
        "content": "content",
        "role": "role"
      },
      "path": "psychopenguin/indian_legal_dataset_qna",
      "split": "train[:2%]",
      "trust_remote_code": false,
      "type": "alpaca"
    }
  ],
  "ddp": false,
  "device": "cuda:0",
  "dion_rank_fraction": 1.0,
  "dion_rank_multiple_of": 1,
  "eaft_alpha": 1.0,
  "eaft_k": 20,
  "early_stopping_patience": 3,
  "env_capabilities": {
    "torch_version": "2.9.1"
  },
  "eval_batch_size": 2,
  "eval_causal_lm_metrics": [
    "sacrebleu",
    "comet",
    "ter",
    "chrf"
  ],
  "eval_max_new_tokens": 128,
  "eval_steps": 100,
  "eval_strategy": "steps",
  "eval_table_size": 0,
  "experimental_skip_move_to_device": true,
  "fp16": true,
  "generate_samples": false,
  "generation_do_sample": true,
  "generation_max_new_tokens": 50,
  "generation_prompt_ratio": 0.5,
  "generation_temperature": 0.7,
  "gradient_accumulation_steps": 2,
  "gradient_checkpointing": false,
  "hf_use_auth_token": true,
  "hub_model_id": "psychopenguin/t1",
  "include_tkps": true,
  "is_falcon_derived_model": false,
  "is_llama_derived_model": false,
  "is_mistral_derived_model": true,
  "learning_rate": 0.0002,
  "lisa_layers_attribute": "model.layers",
  "load_best_model_at_end": true,
  "load_in_4bit": false,
  "load_in_8bit": false,
  "local_rank": 0,
  "lora_alpha": 8,
  "lora_dropout": 0.05,
  "lora_r": 4,
  "lora_target_linear": true,
  "loraplus_lr_embedding": 1e-06,
  "lr_scheduler": "cosine",
  "mean_resizing_embeddings": false,
  "merge_lora": true,
  "micro_batch_size": 2,
  "model_config_type": "mistral",
  "neftune_noise_alpha": 5.0,
  "num_epochs": 3.0,
  "num_generation_samples": 3,
  "optimizer": "adamw_bnb_8bit",
  "otel_metrics_host": "localhost",
  "otel_metrics_port": 8000,
  "output_dir": "./final_model",
  "pretrain_multipack_attn": true,
  "profiler_steps_start": 0,
  "qlora_sharded_model_loading": false,
  "quantize_moe_experts": false,
  "ray_num_workers": 1,
  "resources_per_worker": {
    "GPU": 1
  },
  "sample_packing_bin_size": 200,
  "sample_packing_group_size": 100000,
  "save_only_model": false,
  "save_safetensors": true,
  "save_steps": 100,
  "sdp_attention": true,
  "seed": 9,
  "sequence_len": 256,
  "shuffle_before_merging_datasets": false,
  "shuffle_merged_datasets": true,
  "skip_prepare_dataset": false,
  "streaming_multipack_buffer_size": 10000,
  "strict": false,
  "tensor_parallel_size": 1,
  "tf32": false,
  "tiled_mlp_use_original_mlp": true,
  "tokenizer_config": "PicoKittens/PicoMistral-23M",
  "tokenizer_save_jinja_files": true,
  "tokenizer_type": "AutoTokenizer",
  "tokenizer_use_fast": true,
  "torch_dtype": "torch.float16",
  "train_on_inputs": false,
  "trl": {
    "async_prefetch": false,
    "log_completions": false,
    "mask_truncated_completions": false,
    "ref_model_mixup_alpha": 0.9,
    "ref_model_sync_steps": 64,
    "replay_buffer_size": 0,
    "replay_recompute_logps": true,
    "reroll_max_groups": 1,
    "reroll_start_fraction": 1.0,
    "reward_num_workers": 1,
    "scale_rewards": true,
    "skip_zero_advantage_batches": true,
    "sync_ref_model": false,
    "use_data_producer": false,
    "use_vllm": false,
    "vllm_lora_sync": false,
    "vllm_server_host": "0.0.0.0",
    "vllm_server_port": 8000
  },
  "trust_remote_code": true,
  "type_of_model": "AutoModelForCausalLM",
  "use_otel_metrics": false,
  "use_ray": false,
  "use_wandb": true,
  "val_set_size": 0.3,
  "vllm": {
    "device": "auto",
    "dtype": "auto",
    "gpu_memory_utilization": 0.9,
    "host": "0.0.0.0",
    "port": 8000
  },
  "wandb_name": "t1",
  "wandb_project": "tttt",
  "weight_decay": 0.0,
  "world_size": 1
}
[2026-03-23 06:35:22,707] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:299] [PID:774] EOS: 2 / [EOS]
[2026-03-23 06:35:22,707] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:300] [PID:774] BOS: 1 / [BOS]
[2026-03-23 06:35:22,707] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:301] [PID:774] PAD: 0 / [PAD]
[2026-03-23 06:35:22,707] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:302] [PID:774] UNK: 3 / [UNK]
[2026-03-23 06:35:22,707] [INFO] [axolotl.loaders.tokenizer.load_tokenizer:316] [PID:774] No Chat template selected. Consider adding a chat template for easier inference.
[2026-03-23 06:35:22,707] [INFO] [axolotl.utils.data.shared.load_preprocessed_dataset:480] [PID:774] Unable to find prepared dataset in last_run_prepared/d5a17e3528edff9ec955ba25ed6f7604
[2026-03-23 06:35:22,708] [INFO] [axolotl.utils.data.sft._load_raw_datasets:320] [PID:774] Loading raw datasets...
[2026-03-23 06:35:22,708] [WARNING] [axolotl.utils.data.sft._load_raw_datasets:322] [PID:774] Processing datasets during training can lead to VRAM instability. Please pre-process your dataset using `axolotl preprocess path/to/config.yml`.
Downloading (incomplete total...): 0.00B [00:00, ?B/s]
Fetching 0 files: 0it [00:00, ?it/s][AFetching 0 files: 0it [00:00, ?it/s]
Download complete: : 0.00B [00:00, ?B/s]              Download complete: : 0.00B [00:01, ?B/s]
[2026-03-23 06:35:26,131] [INFO] [axolotl.utils.data.wrappers.get_dataset_wrapper:87] [PID:774] Loading dataset: psychopenguin/indian_legal_dataset_qna with base_type: alpaca and prompt_style: None
[2026-03-23 06:35:26,161] [INFO] [axolotl.utils.data.utils._log_dataset_stats:212] [PID:774] min_input_len: 46
[2026-03-23 06:35:26,161] [INFO] [axolotl.utils.data.utils._log_dataset_stats:213] [PID:774] max_input_len: 333
[2026-03-23 06:35:26,182] [INFO] [axolotl.utils.data.utils._drop_outside_range:306] [PID:774] Dropped 13 sequences outside valid range ([None, 256])
Saving the dataset (0/2 shards):   0%|                                                                                                                         | 0/728 [00:00<?, ? examples/s]Saving the dataset (1/2 shards):  50%|██████████████████████████████████████████████████████▌                                                      | 364/728 [00:00<00:00, 5154.22 examples/s]Saving the dataset (2/2 shards): 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████| 728/728 [00:00<00:00, 10017.46 examples/s]Saving the dataset (2/2 shards): 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████| 728/728 [00:00<00:00, 5420.19 examples/s]
[2026-03-23 06:35:26,342] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:420] [PID:774] total_num_tokens: 61_212
[2026-03-23 06:35:26,344] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:438] [PID:774] `total_supervised_tokens: 41_580`
[2026-03-23 06:35:26,345] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:521] [PID:774] total_num_steps: 382
[2026-03-23 06:35:26,345] [INFO] [axolotl.utils.data.sft._prepare_standard_dataset:121] [PID:774] Maximum number of steps set at 382
[2026-03-23 06:35:26,346] [DEBUG] [axolotl.train.setup_model_and_tokenizer:70] [PID:774] loading tokenizer... PicoKittens/PicoMistral-23M
[2026-03-23 06:35:27,894] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:299] [PID:774] EOS: 2 / [EOS]
[2026-03-23 06:35:27,894] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:300] [PID:774] BOS: 1 / [BOS]
[2026-03-23 06:35:27,894] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:301] [PID:774] PAD: 0 / [PAD]
[2026-03-23 06:35:27,894] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:302] [PID:774] UNK: 3 / [UNK]
[2026-03-23 06:35:27,894] [INFO] [axolotl.loaders.tokenizer.load_tokenizer:316] [PID:774] No Chat template selected. Consider adding a chat template for easier inference.
[2026-03-23 06:35:27,894] [DEBUG] [axolotl.train.setup_model_and_tokenizer:81] [PID:774] Loading model
[2026-03-23 06:35:28,115] [DEBUG] [axolotl.monkeypatch.transformers.trainer_loss_calc.patch_evaluation_loop:91] [PID:774] Patched Trainer.evaluation_loop with nanmean loss calculation
[2026-03-23 06:35:28,116] [DEBUG] [axolotl.monkeypatch.transformers.trainer_loss_calc.patch_maybe_log_save_evaluate:142] [PID:774] Patched Trainer._maybe_log_save_evaluate with nanmean loss calculation
Loading weights:   0%|                                                                                                                                                 | 0/74 [00:00<?, ?it/s]Loading weights: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 74/74 [00:00<00:00, 7178.21it/s]
[2026-03-23 06:35:28,987] [INFO] [axolotl.loaders.model._configure_embedding_dtypes:361] [PID:774] Converting modules to torch.float16
[2026-03-23 06:35:28,993] [DEBUG] [axolotl.loaders.model.log_gpu_memory_usage:127] [PID:774] Memory usage after model load 0.000GB (+0.002GB reserved)
[2026-03-23 06:35:28,994] [INFO] [axolotl.loaders.adapter.load_lora:81] [PID:774] found linear modules: ['down_proj', 'gate_proj', 'k_proj', 'o_proj', 'q_proj', 'up_proj', 'v_proj']
trainable params: 266,240 || all params: 23,865,728 || trainable%: 1.1156
[2026-03-23 06:35:29,022] [DEBUG] [axolotl.loaders.model.log_gpu_memory_usage:127] [PID:774] after adapters 0.000GB ()
[2026-03-23 06:35:30,851] [INFO] [axolotl.train.save_initial_configs:413] [PID:774] Pre-saving adapter config to ./final_model...
[2026-03-23 06:35:30,852] [INFO] [axolotl.train.save_initial_configs:417] [PID:774] Pre-saving tokenizer to ./final_model...
[2026-03-23 06:35:30,858] [INFO] [axolotl.train.save_initial_configs:422] [PID:774] Pre-saving model config to ./final_model...
[2026-03-23 06:35:30,859] [INFO] [axolotl.utils.train.determine_last_checkpoint:43] [PID:774] Using auto-resume functionality to resume from checkpoint at final_model/checkpoint-382
[2026-03-23 06:35:30,859] [INFO] [axolotl.train.execute_training:218] [PID:774] Starting trainer...
[34m[1mwandb[0m: [wandb.login()] Loaded credentials for https://api.wandb.ai from /root/.netrc.
[34m[1mwandb[0m: Currently logged in as: [33mpsychopenguin0001[0m ([33mpsychopenguin0001-none[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: [38;5;178m⢿[0m setting up run 56yh00lh (0.1s)
[Am[2K[34m[1mwandb[0m: [38;5;178m⣻[0m setting up run 56yh00lh (0.1s)
[Am[2K[34m[1mwandb[0m: [38;5;178m⣽[0m setting up run 56yh00lh (0.1s)
[Am[2K[34m[1mwandb[0m: [38;5;178m⣾[0m setting up run 56yh00lh (0.1s)
[Am[2K[34m[1mwandb[0m: [38;5;178m⣷[0m setting up run 56yh00lh (0.1s)
[Am[2K[34m[1mwandb[0m: [38;5;178m⣯[0m setting up run 56yh00lh (0.6s)
[Am[2K[34m[1mwandb[0m: Tracking run with wandb version 0.25.1
[34m[1mwandb[0m: Run data is saved locally in [35m[1m/workspace/axolotl/ttttttttttttttttttttt/wandb/run-20260323_063532-56yh00lh[0m
[34m[1mwandb[0m: Run [1m`wandb offline`[0m to turn off syncing.
[34m[1mwandb[0m: Syncing run [33mt1[0m
[34m[1mwandb[0m: ⭐️ View project at [34m[4mhttps://wandb.ai/psychopenguin0001-none/tttt[0m
[34m[1mwandb[0m: 🚀 View run at [34m[4mhttps://wandb.ai/psychopenguin0001-none/tttt/runs/56yh00lh[0m
[34m[1mwandb[0m: [33mWARNING[0m Saving files without folders. If you want to preserve subdirectories pass base_path to wandb.save, i.e. wandb.save("/mnt/folder/file.h5", base_path="/mnt")
[34m[1mwandb[0m: [33mWARNING[0m Symlinked 1 file into the W&B run directory; call wandb.save again to sync new files.
[2026-03-23 06:35:35,522] [INFO] [axolotl.utils.callbacks.on_train_begin:757] [PID:774] The Axolotl config has been saved to the WandB run under files.
  0%|                                                                                                                                                                 | 0/382 [00:00<?, ?it/s][2026-03-23 06:35:35,527] [INFO] [axolotl.core.trainers.base.evaluate:400] [PID:774] Running evaluation step...

  0%|                                                                                                                                                                 | 0/110 [00:00<?, ?it/s][A
  4%|█████▌                                                                                                                                                   | 4/110 [00:00<00:04, 24.51it/s][A
  6%|█████████▋                                                                                                                                               | 7/110 [00:00<00:06, 17.17it/s][A
  8%|████████████▌                                                                                                                                            | 9/110 [00:00<00:06, 16.07it/s][A
 10%|███████████████▏                                                                                                                                        | 11/110 [00:00<00:06, 15.28it/s][A
 12%|█████████████████▉                                                                                                                                      | 13/110 [00:00<00:06, 15.69it/s][A
 14%|████████████████████▋                                                                                                                                   | 15/110 [00:00<00:06, 15.08it/s][A
 15%|███████████████████████▍                                                                                                                                | 17/110 [00:01<00:06, 14.70it/s][A
 17%|██████████████████████████▎                                                                                                                             | 19/110 [00:01<00:05, 15.26it/s][A
 19%|█████████████████████████████                                                                                                                           | 21/110 [00:01<00:06, 14.76it/s][A
 21%|███████████████████████████████▊                                                                                                                        | 23/110 [00:01<00:06, 14.48it/s][A
 23%|██████████████████████████████████▌                                                                                                                     | 25/110 [00:01<00:05, 14.31it/s][A
 25%|█████████████████████████████████████▎                                                                                                                  | 27/110 [00:01<00:05, 14.17it/s][A
 26%|████████████████████████████████████████                                                                                                                | 29/110 [00:01<00:05, 14.08it/s][A
 29%|████████████████████████████████████████████▏                                                                                                           | 32/110 [00:02<00:05, 15.36it/s][A
 31%|██████████████████████████████████████████████▉                                                                                                         | 34/110 [00:02<00:05, 14.29it/s][A
 33%|█████████████████████████████████████████████████▋                                                                                                      | 36/110 [00:02<00:04, 14.86it/s][A
 35%|█████████████████████████████████████████████████████▉                                                                                                  | 39/110 [00:02<00:04, 15.85it/s][A
 37%|████████████████████████████████████████████████████████▋                                                                                               | 41/110 [00:02<00:04, 16.06it/s][A
 39%|███████████████████████████████████████████████████████████▍                                                                                            | 43/110 [00:02<00:04, 16.22it/s][A
 41%|██████████████████████████████████████████████████████████████▏                                                                                         | 45/110 [00:02<00:04, 15.50it/s][A
 43%|████████████████████████████████████████████████████████████████▉                                                                                       | 47/110 [00:03<00:03, 15.82it/s][A
 45%|███████████████████████████████████████████████████████████████████▋                                                                                    | 49/110 [00:03<00:03, 16.03it/s][A
 47%|███████████████████████████████████████████████████████████████████████▊                                                                                | 52/110 [00:03<00:03, 16.58it/s][A
 50%|████████████████████████████████████████████████████████████████████████████                                                                            | 55/110 [00:03<00:03, 16.99it/s][A
 52%|██████████████████████████████████████████████████████████████████████████████▊                                                                         | 57/110 [00:03<00:03, 16.08it/s][A
 54%|█████████████████████████████████████████████████████████████████████████████████▌                                                                      | 59/110 [00:03<00:03, 16.23it/s][A
 55%|████████████████████████████████████████████████████████████████████████████████████▎                                                                   | 61/110 [00:03<00:03, 14.78it/s][A
 57%|███████████████████████████████████████████████████████████████████████████████████████                                                                 | 63/110 [00:04<00:03, 13.89it/s][A
 60%|███████████████████████████████████████████████████████████████████████████████████████████▏                                                            | 66/110 [00:04<00:02, 15.90it/s][A
 62%|█████████████████████████████████████████████████████████████████████████████████████████████▉                                                          | 68/110 [00:04<00:02, 15.33it/s][A
 64%|████████████████████████████████████████████████████████████████████████████████████████████████▋                                                       | 70/110 [00:04<00:02, 15.67it/s][A
 65%|███████████████████████████████████████████████████████████████████████████████████████████████████▍                                                    | 72/110 [00:04<00:02, 15.94it/s][A
 67%|██████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                 | 74/110 [00:04<00:02, 16.13it/s][A
 69%|█████████████████████████████████████████████████████████████████████████████████████████████████████████                                               | 76/110 [00:04<00:02, 16.27it/s][A
 71%|███████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                            | 78/110 [00:04<00:01, 16.38it/s][A
 73%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                         | 80/110 [00:05<00:01, 15.60it/s][A
 75%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                     | 83/110 [00:05<00:01, 17.26it/s][A
 77%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                  | 85/110 [00:05<00:01, 17.10it/s][A
 79%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                               | 87/110 [00:05<00:01, 16.05it/s][A
 82%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                           | 90/110 [00:05<00:01, 16.69it/s][A
 84%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                        | 92/110 [00:05<00:01, 16.69it/s][A
 85%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                      | 94/110 [00:05<00:01, 15.81it/s][A
 87%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                   | 96/110 [00:06<00:00, 15.21it/s][A
 89%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                | 98/110 [00:06<00:00, 14.79it/s][A
 92%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋            | 101/110 [00:06<00:00, 16.59it/s][A
 94%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍         | 103/110 [00:06<00:00, 15.05it/s][A
 95%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏      | 105/110 [00:06<00:00, 14.69it/s][A
 97%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉    | 107/110 [00:06<00:00, 14.51it/s][A
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 110/110 [00:07<00:00, 15.98it/s][A                                                                                                                                                                                              
                                                                                                                                                                                              [A{'eval_loss': '3.713', 'eval_runtime': '7.423', 'eval_samples_per_second': '29.5', 'eval_steps_per_second': '14.82', 'eval_ppl': '40.99', 'memory/max_active (GiB)': '0.13', 'memory/max_allocated (GiB)': '0.13', 'memory/device_reserved (GiB)': '0.19', 'epoch': '2.988'}
  0%|                                                                                                                                                                 | 0/382 [00:07<?, ?it/s]
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 110/110 [00:07<00:00, 15.98it/s][A
                                                                                                                                                                                              [A383it [00:07, 48.84it/s]                                                                                                                                                                                              {'loss': '3.939', 'grad_norm': '0.9242', 'learning_rate': '0', 'ppl': '51.36', 'memory/max_active (GiB)': '0.22', 'memory/max_allocated (GiB)': '0.22', 'memory/device_reserved (GiB)': '0.25', 'tokens/train_per_sec_per_gpu': '311.6', 'tokens/total': 768, 'tokens/trainable': 276, 'epoch': '2.996'}
383it [00:07, 48.84it/s][2026-03-23 06:35:43,374] [INFO] [axolotl.core.trainers.base._save:721] [PID:774] Saving model checkpoint to ./final_model/checkpoint-383
                        {'train_runtime': '12.87', 'train_samples_per_second': '118.7', 'train_steps_per_second': '29.68', 'train_loss': '0.01028', 'memory/max_active (GiB)': '0.06', 'memory/max_allocated (GiB)': '0.06', 'memory/device_reserved (GiB)': '0.25', 'epoch': '2.996', 'tokens/train_per_sec_per_gpu': '0'}
383it [00:08, 48.84it/s]383it [00:08, 45.48it/s]
[2026-03-23 06:35:49,270] [INFO] [axolotl.train.save_trained_model:237] [PID:774] Training completed! Saving trained model to ./final_model.
[2026-03-23 06:35:49,717] [INFO] [axolotl.train.save_trained_model:351] [PID:774] Model successfully saved to ./final_model
[2026-03-23 06:35:49,722] [INFO] [axolotl.core.trainers.base._save:721] [PID:774] Saving model checkpoint to ./final_model
Processing Files (0 / 0)      : |                                                                                                                                |  0.00B /  0.00B            
New Data Upload               : |                                                                                                                                |  0.00B /  0.00B            [A

  ...l_model/training_args.bin: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 6.67kB / 6.67kB            [A[A


  ...adapter_model.safetensors: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1.08MB / 1.08MB            [A[A[A

  ...l_model/training_args.bin: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 6.67kB / 6.67kB            [A[A


  ...adapter_model.safetensors: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1.08MB / 1.08MB            [A[A[AProcessing Files (2 / 2)      : 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1.09MB / 1.09MB,   ???B/s  

  ...l_model/training_args.bin: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 6.67kB / 6.67kB            [A[A


  ...adapter_model.safetensors: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1.08MB / 1.08MB            [A[A[A

  ...l_model/training_args.bin: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 6.67kB / 6.67kB            [A[A


  ...adapter_model.safetensors: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1.08MB / 1.08MB            [A[A[A

  ...l_model/training_args.bin: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 6.67kB / 6.67kB            [A[A


  ...adapter_model.safetensors: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1.08MB / 1.08MB            [A[A[AProcessing Files (2 / 2)      : 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1.09MB / 1.09MB,  0.00B/s  
New Data Upload               : |                                                                                                                                |  0.00B /  0.00B,  0.00B/s  
  ...l_model/training_args.bin: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 6.67kB / 6.67kB            
  ...adapter_model.safetensors: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1.08MB / 1.08MB