[2026-01-06 06:31:33,733] [DEBUG] [axolotl.utils.config.log_gpu_memory_usage:127] [PID:5347] baseline 0.000GB ()
[2026-01-06 06:31:33,734] [INFO] [axolotl.cli.config.load_cfg:248] [PID:5347] config:
{
  "activation_offloading": false,
  "axolotl_config_path": "sft-axolotl-olmo3-7b-think.yaml",
  "base_model": "allenai/Olmo-3-1025-7B",
  "base_model_config": "allenai/Olmo-3-1025-7B",
  "batch_size": 8,
  "bf16": true,
  "capabilities": {
    "bf16": true,
    "compute_capability": "sm_90",
    "fp8": false,
    "n_gpu": 2,
    "n_node": 1
  },
  "chat_template": "jinja",
  "chat_template_jinja": "/workspace/data/model-output/chat_template.jinja",
  "context_parallel_size": 2,
  "dataloader_num_workers": 2,
  "dataloader_pin_memory": true,
  "dataloader_prefetch_factor": 256,
  "dataset_prepared_path": "last_run_prepared",
  "dataset_processes": 48,
  "datasets": [
    {
      "chat_template": "tokenizer_default",
      "field_messages": "messages",
      "message_property_mappings": {
        "content": "content",
        "role": "role"
      },
      "path": "dataset-tfs-mk-IMP-SOS-processed-olmo3-think.jsonl",
      "roles": {
        "assistant": [
          "assistant"
        ],
        "system": [
          "system"
        ],
        "user": [
          "user"
        ]
      },
      "roles_to_train": [
        "assistant"
      ],
      "trust_remote_code": false,
      "type": "chat_template"
    }
  ],
  "ddp": true,
  "device": "cuda:0",
  "device_map": {
    "": 0
  },
  "dion_rank_fraction": 1.0,
  "dion_rank_multiple_of": 1,
  "env_capabilities": {
    "torch_version": "2.7.1"
  },
  "eval_batch_size": 1,
  "eval_causal_lm_metrics": [
    "sacrebleu",
    "comet",
    "ter",
    "chrf"
  ],
  "eval_max_new_tokens": 128,
  "eval_sample_packing": true,
  "eval_steps": 0.5,
  "eval_table_size": 0,
  "evals_per_epoch": 1,
  "experimental_skip_move_to_device": true,
  "flash_attention": true,
  "fp16": false,
  "gradient_accumulation_steps": 4,
  "gradient_checkpointing": true,
  "gradient_checkpointing_kwargs": {
    "use_reentrant": true
  },
  "group_by_length": false,
  "hub_model_id": "Auditt/O37BB",
  "include_tkps": true,
  "learning_rate": 1e-05,
  "lisa_layers_attribute": "model.layers",
  "load_best_model_at_end": false,
  "load_in_4bit": false,
  "load_in_8bit": false,
  "local_rank": 0,
  "logging_steps": 1,
  "lora_dropout": 0.0,
  "loraplus_lr_embedding": 1e-06,
  "lr_scheduler": "cosine",
  "mean_resizing_embeddings": false,
  "micro_batch_size": 1,
  "model_config_type": "olmo3",
  "num_epochs": 2.0,
  "optimizer": "adamw_torch",
  "output_dir": "/workspace/data/model-output-base",
  "pad_to_sequence_len": true,
  "pretrain_multipack_attn": true,
  "profiler_steps_start": 0,
  "qlora_sharded_model_loading": false,
  "ray_num_workers": 1,
  "resources_per_worker": {
    "GPU": 1
  },
  "ring_attn_func": "varlen_llama3",
  "sample_packing": true,
  "sample_packing_bin_size": 200,
  "sample_packing_group_size": 100000,
  "save_only_model": false,
  "save_safetensors": true,
  "sequence_len": 60000,
  "shuffle_before_merging_datasets": false,
  "shuffle_merged_datasets": true,
  "skip_prepare_dataset": false,
  "streaming_multipack_buffer_size": 10000,
  "strict": false,
  "tensor_parallel_size": 1,
  "tf32": true,
  "tiled_mlp_use_original_mlp": true,
  "tokenizer_config": "allenai/Olmo-3-1025-7B",
  "tokenizer_save_jinja_files": true,
  "tokens": [
    "\ud801\udd32",
    "\ud801\udd3e",
    "\u3009",
    "\ud835\udf0e",
    "\u22c1",
    "\ud801\udd60",
    "\ud801\udd5c",
    "\ud801\udd38",
    "\u2227",
    "\u2265",
    "\ud801\udd5f",
    "\ud801\udd56",
    "\u27c2",
    "\ud801\udd4f",
    "\u22c0",
    "\ud801\udd63",
    "\ud801\udd43",
    "\ud801\udd59",
    "\ud801\udd55",
    "\u03c7",
    "\ud801\udd4a",
    "\u3008",
    "\ud801\udd50",
    "\ud801\udd3b",
    "\ud801\udd40",
    "\ud801\udd33",
    "\u2260",
    "\ud801\udd37",
    "\u2264",
    "\ud801\udd5e",
    "\ud801\udd31",
    "\ud801\udd42",
    "\u21a6",
    "\ud801\udd4e",
    "\u2192",
    "\ud801\udd5b",
    "\ud801\udd30",
    "\u03b5"
  ],
  "torch_dtype": "torch.bfloat16",
  "train_on_inputs": false,
  "trl": {
    "log_completions": false,
    "mask_truncated_completions": false,
    "ref_model_mixup_alpha": 0.9,
    "ref_model_sync_steps": 64,
    "scale_rewards": true,
    "sync_ref_model": false,
    "use_vllm": false,
    "vllm_server_host": "0.0.0.0",
    "vllm_server_port": 8000
  },
  "trust_remote_code": true,
  "use_ray": false,
  "val_set_size": 0.1,
  "vllm": {
    "device": "auto",
    "dtype": "auto",
    "gpu_memory_utilization": 0.9,
    "host": "0.0.0.0",
    "port": 8000
  },
  "weight_decay": 0.0,
  "world_size": 2
}
[2026-01-06 06:31:34,116] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:278] [PID:5347] EOS: 100257 / <|endoftext|>
[2026-01-06 06:31:34,116] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:279] [PID:5347] BOS: None / None
[2026-01-06 06:31:34,117] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:280] [PID:5347] PAD: 100277 / <|pad|>
[2026-01-06 06:31:34,117] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:281] [PID:5347] UNK: 100257 / <|endoftext|>
[2026-01-06 06:31:48,563] [INFO] [axolotl.utils.data.shared.load_preprocessed_dataset:470] [PID:5347] Loading prepared dataset from disk at last_run_prepared/521442581534a9837f30b55bdde4d057...
[2026-01-06 06:31:48,578] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:404] [PID:5347] total_num_tokens: 4_377_664
[2026-01-06 06:31:48,586] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:422] [PID:5347] `total_supervised_tokens: 3_345_873`
[2026-01-06 06:31:49,583] [DEBUG] [axolotl.utils.samplers.multipack.__len__:458] [PID:5347] generate_batches time: 0.4595468044281006
[2026-01-06 06:31:50,030] [DEBUG] [axolotl.utils.samplers.multipack.__len__:458] [PID:5347] generate_batches time: 0.44755101203918457
[2026-01-06 06:31:50,477] [DEBUG] [axolotl.utils.samplers.multipack.__len__:458] [PID:5347] generate_batches time: 0.44657182693481445
[2026-01-06 06:31:50,952] [DEBUG] [axolotl.utils.samplers.multipack.__len__:458] [PID:5347] generate_batches time: 0.4744563102722168
[2026-01-06 06:31:52,230] [INFO] [axolotl.utils.samplers.multipack.calc_min_len:434] [PID:5347] gather_len_batches: [78, 78]
[2026-01-06 06:31:52,231] [WARNING] [py.warnings._showwarnmsg:110] [PID:5347] /root/miniconda3/envs/py3.11/lib/python3.11/site-packages/torch/distributed/distributed_c10d.py:4631: UserWarning: No device id is provided via `init_process_group` or `barrier `. Using the current device set by the user. 
  warnings.warn(  # warn only once

[2026-01-06 06:31:52,413] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:481] [PID:5347] data_loader_len: 9
[2026-01-06 06:31:52,429] [INFO] [axolotl.utils.trainer.calc_sample_packing_eff_est:497] [PID:5347] sample_packing_eff_est across ranks: [0.9235578179359436, 0.9235578179359436]
[2026-01-06 06:31:52,430] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:509] [PID:5347] sample_packing_eff_est: None
[2026-01-06 06:31:52,430] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:520] [PID:5347] total_num_steps: 36
[2026-01-06 06:31:52,542] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:404] [PID:5347] total_num_tokens: 39_860_654
[2026-01-06 06:31:53,267] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:422] [PID:5347] `total_supervised_tokens: 30_599_793`
[2026-01-06 06:31:54,333] [DEBUG] [axolotl.utils.samplers.multipack.__len__:458] [PID:5347] generate_batches time: 0.5050613880157471
[2026-01-06 06:31:54,830] [DEBUG] [axolotl.utils.samplers.multipack.__len__:458] [PID:5347] generate_batches time: 0.49697208404541016
[2026-01-06 06:31:55,317] [DEBUG] [axolotl.utils.samplers.multipack.__len__:458] [PID:5347] generate_batches time: 0.48656487464904785
[2026-01-06 06:31:55,795] [DEBUG] [axolotl.utils.samplers.multipack.__len__:458] [PID:5347] generate_batches time: 0.47708654403686523
[2026-01-06 06:31:55,796] [INFO] [axolotl.utils.samplers.multipack.calc_min_len:434] [PID:5347] gather_len_batches: [699, 697]
[2026-01-06 06:31:55,797] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:481] [PID:5347] data_loader_len: 87
[2026-01-06 06:31:55,798] [INFO] [axolotl.utils.trainer.calc_sample_packing_eff_est:497] [PID:5347] sample_packing_eff_est across ranks: [0.9517825841903687, 0.947709321975708]
[2026-01-06 06:31:55,798] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:509] [PID:5347] sample_packing_eff_est: 0.96
[2026-01-06 06:31:55,802] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:520] [PID:5347] total_num_steps: 348
[2026-01-06 06:31:55,802] [INFO] [axolotl.utils.data.sft._prepare_standard_dataset:121] [PID:5347] Maximum number of steps set at 348
[2026-01-06 06:31:55,821] [DEBUG] [axolotl.train.setup_model_and_tokenizer:65] [PID:5347] Loading tokenizer... allenai/Olmo-3-1025-7B
[2026-01-06 06:31:56,142] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:278] [PID:5347] EOS: 100257 / <|endoftext|>
[2026-01-06 06:31:56,142] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:279] [PID:5347] BOS: None / None
[2026-01-06 06:31:56,142] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:280] [PID:5347] PAD: 100277 / <|pad|>
[2026-01-06 06:31:56,142] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:281] [PID:5347] UNK: 100257 / <|endoftext|>
[2026-01-06 06:31:56,144] [DEBUG] [axolotl.train.setup_model_and_tokenizer:74] [PID:5347] Loading model
[2026-01-06 06:31:56,194] [DEBUG] [axolotl.monkeypatch.transformers.trainer_loss_calc.patch_evaluation_loop:87] [PID:5347] Patched Trainer.evaluation_loop with nanmean loss calculation
[2026-01-06 06:31:56,195] [DEBUG] [axolotl.monkeypatch.transformers.trainer_loss_calc.patch_maybe_log_save_evaluate:138] [PID:5347] Patched Trainer._maybe_log_save_evaluate with nanmean loss calculation
[2026-01-06 06:31:56,197] [DEBUG] [axolotl.monkeypatch.transformers.trainer_context_parallel.patch_prepare_context_parallel_inputs:66] [PID:5347] Patched Trainer._prepare_context_parallel_inputs for FlashAttention + CP
[2026-01-06 06:31:56,198] [INFO] [axolotl.loaders.patch_manager._apply_multipack_patches:301] [PID:5347] Applying multipack dataloader patch for sample packing...
Loading checkpoint shards:   0%|                                                                                               | 0/3 [00:00<?, ?it/s]Loading checkpoint shards:  33%|█████████████████████████████                                                          | 1/3 [00:01<00:02,  1.06s/it]Loading checkpoint shards:  67%|██████████████████████████████████████████████████████████                             | 2/3 [00:02<00:01,  1.03s/it]Loading checkpoint shards: 100%|███████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:03<00:00,  1.00it/s]Loading checkpoint shards: 100%|███████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:03<00:00,  1.01s/it]
generation_config.json:   0%|                                                                                             | 0.00/69.0 [00:00<?, ?B/s]generation_config.json: 100%|██████████████████████████████████████████████████████████████████████████████████████| 69.0/69.0 [00:00<00:00, 789kB/s]
[2026-01-06 06:32:35,096] [INFO] [axolotl.loaders.model._configure_embedding_dtypes:345] [PID:5347] Converting modules to torch.bfloat16
[2026-01-06 06:32:35,099] [DEBUG] [axolotl.loaders.model.log_gpu_memory_usage:127] [PID:5347] Memory usage after model load 15.893GB (+15.893GB allocated, +18.191GB reserved)
[2026-01-06 06:32:39,828] [INFO] [axolotl.train.save_initial_configs:402] [PID:5347] Pre-saving tokenizer to /workspace/data/model-output-base...
[2026-01-06 06:32:39,939] [INFO] [axolotl.train.save_initial_configs:407] [PID:5347] Pre-saving model config to /workspace/data/model-output-base...
[2026-01-06 06:32:39,943] [INFO] [axolotl.monkeypatch.ring_attn.patch.register_ring_attn_from_device_mesh:154] [PID:5347] Enabling ring attention sequence parallelism using DeviceMesh dimension '('cp',)'
[2026-01-06 06:32:39,943] [INFO] [axolotl.monkeypatch.ring_attn.patch.register_ring_attn_from_device_mesh:174] [PID:5347] Sequence parallel degree: 2, mesh shape: torch.Size([2])
[2026-01-06 06:32:39,943] [INFO] [axolotl.train.execute_training:196] [PID:5347] Starting trainer...
[2026-01-06 06:32:50,347] [DEBUG] [axolotl.utils.samplers.multipack.__len__:458] [PID:5347] generate_batches time: 0.6783885955810547
[2026-01-06 06:32:50,988] [DEBUG] [axolotl.utils.samplers.multipack.__len__:458] [PID:5347] generate_batches time: 0.6399080753326416
[2026-01-06 06:32:51,625] [DEBUG] [axolotl.utils.samplers.multipack.__len__:458] [PID:5347] generate_batches time: 0.6367971897125244
[2026-01-06 06:32:52,267] [DEBUG] [axolotl.utils.samplers.multipack.__len__:458] [PID:5347] generate_batches time: 0.6410634517669678
[2026-01-06 06:32:53,405] [INFO] [axolotl.utils.samplers.multipack.calc_min_len:434] [PID:5347] gather_len_batches: [700, 700]
[2026-01-06 06:32:53,406] [WARNING] [py.warnings._showwarnmsg:110] [PID:5347] /root/miniconda3/envs/py3.11/lib/python3.11/site-packages/torch/distributed/distributed_c10d.py:4631: UserWarning: No device id is provided via `init_process_group` or `barrier `. Using the current device set by the user. 
  warnings.warn(  # warn only once

  0%|                                                                                                                        | 0/348 [00:00<?, ?it/s][2026-01-06 06:32:53,498] [INFO] [axolotl.core.trainers.base.evaluate:376] [PID:5347] Running evaluation step...
[2026-01-06 06:32:55,925] [DEBUG] [axolotl.utils.samplers.multipack.__len__:458] [PID:5347] generate_batches time: 0.6825177669525146
[2026-01-06 06:32:56,556] [DEBUG] [axolotl.utils.samplers.multipack.__len__:458] [PID:5347] generate_batches time: 0.6304805278778076
[2026-01-06 06:32:57,222] [DEBUG] [axolotl.utils.samplers.multipack.__len__:458] [PID:5347] generate_batches time: 0.6658525466918945
[2026-01-06 06:32:57,878] [DEBUG] [axolotl.utils.samplers.multipack.__len__:458] [PID:5347] generate_batches time: 0.6549110412597656
[2026-01-06 06:32:58,138] [INFO] [axolotl.utils.samplers.multipack.calc_min_len:434] [PID:5347] gather_len_batches: [78, 78]

  0%|                                                                                                                         | 0/78 [00:00<?, ?it/s][A
  3%|██▉                                                                                                              | 2/78 [00:01<00:49,  1.54it/s][A
  4%|████▎                                                                                                            | 3/78 [00:02<01:17,  1.04s/it][A
  5%|█████▊                                                                                                           | 4/78 [00:04<01:31,  1.24s/it][A
  6%|███████▏                                                                                                         | 5/78 [00:06<01:43,  1.42s/it][A
  8%|████████▋                                                                                                        | 6/78 [00:07<01:49,  1.52s/it][A
  9%|██████████▏                                                                                                      | 7/78 [00:09<01:49,  1.54s/it][A
 10%|███████████▌                                                                                                     | 8/78 [00:11<01:51,  1.59s/it][A
 12%|█████████████                                                                                                    | 9/78 [00:12<01:50,  1.61s/it][A
 13%|██████████████▎                                                                                                 | 10/78 [00:14<01:50,  1.62s/it][A
 14%|███████████████▊                                                                                                | 11/78 [00:16<01:48,  1.62s/it][A
 15%|█████████████████▏                                                                                              | 12/78 [00:17<01:47,  1.63s/it][A
 17%|██████████████████▋                                                                                             | 13/78 [00:19<01:46,  1.64s/it][A
 18%|████████████████████                                                                                            | 14/78 [00:21<01:43,  1.62s/it][A
 19%|█████████████████████▌                                                                                          | 15/78 [00:22<01:41,  1.61s/it][A
 21%|██████████████████████▉                                                                                         | 16/78 [00:24<01:39,  1.60s/it][A
 22%|████████████████████████▍                                                                                       | 17/78 [00:25<01:40,  1.65s/it][A
 23%|█████████████████████████▊                                                                                      | 18/78 [00:27<01:37,  1.62s/it][A
 24%|███████████████████████████▎                                                                                    | 19/78 [00:29<01:36,  1.64s/it][A
 26%|████████████████████████████▋                                                                                   | 20/78 [00:30<01:34,  1.63s/it][A
 27%|██████████████████████████████▏                                                                                 | 21/78 [00:32<01:33,  1.64s/it][A
 28%|███████████████████████████████▌                                                                                | 22/78 [00:34<01:30,  1.62s/it][A
 29%|█████████████████████████████████                                                                               | 23/78 [00:35<01:27,  1.59s/it][A
 31%|██████████████████████████████████▍                                                                             | 24/78 [00:37<01:25,  1.58s/it][A
 32%|███████████████████████████████████▉                                                                            | 25/78 [00:38<01:23,  1.58s/it][A
 33%|█████████████████████████████████████▎                                                                          | 26/78 [00:40<01:22,  1.58s/it][A
 35%|██████████████████████████████████████▊                                                                         | 27/78 [00:41<01:20,  1.58s/it][A
 36%|████████████████████████████████████████▏                                                                       | 28/78 [00:43<01:19,  1.59s/it][A
 37%|█████████████████████████████████████████▋                                                                      | 29/78 [00:45<01:19,  1.62s/it][A
 38%|███████████████████████████████████████████                                                                     | 30/78 [00:46<01:17,  1.61s/it][A
 40%|████████████████████████████████████████████▌                                                                   | 31/78 [00:48<01:14,  1.59s/it][A
 41%|█████████████████████████████████████████████▉                                                                  | 32/78 [00:49<01:13,  1.59s/it][A
 42%|███████████████████████████████████████████████▍                                                                | 33/78 [00:51<01:12,  1.62s/it][A
 44%|████████████████████████████████████████████████▊                                                               | 34/78 [00:53<01:12,  1.64s/it][A
 45%|██████████████████████████████████████████████████▎                                                             | 35/78 [00:54<01:10,  1.65s/it][A
 46%|███████████████████████████████████████████████████▋                                                            | 36/78 [00:56<01:08,  1.62s/it][A
 47%|█████████████████████████████████████████████████████▏                                                          | 37/78 [00:58<01:06,  1.63s/it][A
 49%|██████████████████████████████████████████████████████▌                                                         | 38/78 [00:59<01:04,  1.62s/it][A
 50%|████████████████████████████████████████████████████████                                                        | 39/78 [01:01<01:02,  1.61s/it][A
 51%|█████████████████████████████████████████████████████████▍                                                      | 40/78 [01:02<01:01,  1.61s/it][A
 53%|██████████████████████████████████████████████████████████▊                                                     | 41/78 [01:04<01:01,  1.66s/it][A
 54%|████████████████████████████████████████████████████████████▎                                                   | 42/78 [01:06<00:59,  1.65s/it][A
 55%|█████████████████████████████████████████████████████████████▋                                                  | 43/78 [01:08<00:58,  1.67s/it][A
 56%|███████████████████████████████████████████████████████████████▏                                                | 44/78 [01:09<00:57,  1.69s/it][A
 58%|████████████████████████████████████████████████████████████████▌                                               | 45/78 [01:11<00:56,  1.72s/it][A
 59%|██████████████████████████████████████████████████████████████████                                              | 46/78 [01:13<00:55,  1.72s/it][A
 60%|███████████████████████████████████████████████████████████████████▍                                            | 47/78 [01:15<00:53,  1.73s/it][A
 62%|████████████████████████████████████████████████████████████████████▉                                           | 48/78 [01:16<00:51,  1.71s/it][A
 63%|██████████████████████████████████████████████████████████████████████▎                                         | 49/78 [01:18<00:48,  1.69s/it][A
 64%|███████████████████████████████████████████████████████████████████████▊                                        | 50/78 [01:20<00:46,  1.68s/it][A
 65%|█████████████████████████████████████████████████████████████████████████▏                                      | 51/78 [01:21<00:45,  1.67s/it][A
 67%|██████████████████████████████████████████████████████████████████████████▋                                     | 52/78 [01:23<00:43,  1.66s/it][A
 68%|████████████████████████████████████████████████████████████████████████████                                    | 53/78 [01:25<00:42,  1.69s/it][A
 69%|█████████████████████████████████████████████████████████████████████████████▌                                  | 54/78 [01:26<00:39,  1.66s/it][A
 71%|██████████████████████████████████████████████████████████████████████████████▉                                 | 55/78 [01:28<00:37,  1.65s/it][A
 72%|████████████████████████████████████████████████████████████████████████████████▍                               | 56/78 [01:29<00:35,  1.62s/it][A
 73%|█████████████████████████████████████████████████████████████████████████████████▊                              | 57/78 [01:31<00:33,  1.60s/it][A
 74%|███████████████████████████████████████████████████████████████████████████████████▎                            | 58/78 [01:32<00:31,  1.60s/it][A
 76%|████████████████████████████████████████████████████████████████████████████████████▋                           | 59/78 [01:34<00:30,  1.58s/it][A
 77%|██████████████████████████████████████████████████████████████████████████████████████▏                         | 60/78 [01:36<00:28,  1.58s/it][A
 78%|███████████████████████████████████████████████████████████████████████████████████████▌                        | 61/78 [01:37<00:26,  1.57s/it][A
 79%|█████████████████████████████████████████████████████████████████████████████████████████                       | 62/78 [01:39<00:25,  1.57s/it][A
 81%|██████████████████████████████████████████████████████████████████████████████████████████▍                     | 63/78 [01:40<00:24,  1.61s/it][A
 82%|███████████████████████████████████████████████████████████████████████████████████████████▉                    | 64/78 [01:42<00:22,  1.61s/it][A
 83%|█████████████████████████████████████████████████████████████████████████████████████████████▎                  | 65/78 [01:44<00:21,  1.63s/it][A
 85%|██████████████████████████████████████████████████████████████████████████████████████████████▊                 | 66/78 [01:45<00:19,  1.60s/it][A
 86%|████████████████████████████████████████████████████████████████████████████████████████████████▏               | 67/78 [01:47<00:17,  1.61s/it][A
 87%|█████████████████████████████████████████████████████████████████████████████████████████████████▋              | 68/78 [01:49<00:16,  1.63s/it][A
 88%|███████████████████████████████████████████████████████████████████████████████████████████████████             | 69/78 [01:50<00:14,  1.64s/it][A
 90%|████████████████████████████████████████████████████████████████████████████████████████████████████▌           | 70/78 [01:52<00:13,  1.66s/it][A
 91%|█████████████████████████████████████████████████████████████████████████████████████████████████████▉          | 71/78 [01:54<00:11,  1.67s/it][A
 92%|███████████████████████████████████████████████████████████████████████████████████████████████████████▍        | 72/78 [01:55<00:10,  1.67s/it][A
 94%|████████████████████████████████████████████████████████████████████████████████████████████████████████▊       | 73/78 [01:57<00:08,  1.66s/it][A
 95%|██████████████████████████████████████████████████████████████████████████████████████████████████████████▎     | 74/78 [01:58<00:06,  1.64s/it][A
 96%|███████████████████████████████████████████████████████████████████████████████████████████████████████████▋    | 75/78 [02:00<00:04,  1.64s/it][A
 97%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████▏  | 76/78 [02:02<00:03,  1.65s/it][A
 99%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████▌ | 77/78 [02:03<00:01,  1.63s/it][A
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 78/78 [02:05<00:00,  1.62s/it][A                                                                                                                                                     
                                                                                                                                                     [A{'eval_loss': 1.0679770708084106, 'eval_runtime': 129.1457, 'eval_samples_per_second': 1.44, 'eval_steps_per_second': 0.72, 'memory/max_active (GiB)': 58.72, 'memory/max_allocated (GiB)': 55.5, 'memory/device_reserved (GiB)': 65.44, 'epoch': 0}
  0%|                                                                                                                        | 0/348 [02:13<?, ?it/s]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 78/78 [02:05<00:00,  1.62s/it][A
                                                                                                                                                     [A  0%|▎                                                                                                           | 1/348 [02:29<14:24:11, 149.43s/it]                                                                                                                                                     {'loss': 4.9629, 'grad_norm': 57.0, 'learning_rate': 0.0, 'memory/max_active (GiB)': 81.64, 'memory/max_allocated (GiB)': 81.64, 'memory/device_reserved (GiB)': 83.59, 'tokens_per_second_per_gpu': 124835.68, 'epoch': 0.01}
  0%|▎                                                                                                           | 1/348 [02:29<14:24:11, 149.43s/it]  1%|▋                                                                                                             | 2/348 [02:43<6:42:55, 69.87s/it]                                                                                                                                                     {'loss': 4.4324, 'grad_norm': 45.25, 'learning_rate': 1.0000000000000002e-06, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 94.82, 'tokens_per_second_per_gpu': 2715.0, 'epoch': 0.01}
  1%|▋                                                                                                             | 2/348 [02:43<6:42:55, 69.87s/it]  1%|▉                                                                                                             | 3/348 [02:57<4:13:55, 44.16s/it]                                                                                                                                                     {'loss': 4.3548, 'grad_norm': 63.5, 'learning_rate': 2.0000000000000003e-06, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 94.82, 'tokens_per_second_per_gpu': 2697.0, 'epoch': 0.02}
  1%|▉                                                                                                             | 3/348 [02:57<4:13:55, 44.16s/it]  1%|█▎                                                                                                            | 4/348 [03:10<3:03:18, 31.97s/it]                                                                                                                                                     {'loss': 4.2327, 'grad_norm': 48.0, 'learning_rate': 3e-06, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 94.82, 'tokens_per_second_per_gpu': 2655.0, 'epoch': 0.02}
  1%|█▎                                                                                                            | 4/348 [03:10<3:03:18, 31.97s/it]  1%|█▌                                                                                                            | 5/348 [03:23<2:24:33, 25.29s/it]                                                                                                                                                     {'loss': 4.218, 'grad_norm': 46.0, 'learning_rate': 4.000000000000001e-06, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 94.82, 'tokens_per_second_per_gpu': 2937.2, 'epoch': 0.03}
  1%|█▌                                                                                                            | 5/348 [03:23<2:24:33, 25.29s/it]  2%|█▉                                                                                                            | 6/348 [03:37<2:02:04, 21.42s/it]                                                                                                                                                     {'loss': 4.0187, 'grad_norm': 39.75, 'learning_rate': 5e-06, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 94.82, 'tokens_per_second_per_gpu': 2416.65, 'epoch': 0.03}
  2%|█▉                                                                                                            | 6/348 [03:37<2:02:04, 21.42s/it]  2%|██▏                                                                                                           | 7/348 [03:51<1:47:01, 18.83s/it]                                                                                                                                                     {'loss': 3.7985, 'grad_norm': 47.25, 'learning_rate': 6e-06, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 94.82, 'tokens_per_second_per_gpu': 3094.42, 'epoch': 0.04}
  2%|██▏                                                                                                           | 7/348 [03:51<1:47:01, 18.83s/it]  2%|██▌                                                                                                           | 8/348 [04:05<1:38:15, 17.34s/it]                                                                                                                                                     {'loss': 3.6982, 'grad_norm': 39.5, 'learning_rate': 7e-06, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 94.82, 'tokens_per_second_per_gpu': 2475.28, 'epoch': 0.05}
  2%|██▌                                                                                                           | 8/348 [04:05<1:38:15, 17.34s/it]  3%|██▊                                                                                                           | 9/348 [04:18<1:31:02, 16.11s/it]                                                                                                                                                     {'loss': 3.5344, 'grad_norm': 42.75, 'learning_rate': 8.000000000000001e-06, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 94.82, 'tokens_per_second_per_gpu': 2863.05, 'epoch': 0.05}
  3%|██▊                                                                                                           | 9/348 [04:18<1:31:02, 16.11s/it]  3%|███▏                                                                                                         | 10/348 [04:32<1:26:45, 15.40s/it]                                                                                                                                                     {'loss': 2.9472, 'grad_norm': 37.5, 'learning_rate': 9e-06, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 94.82, 'tokens_per_second_per_gpu': 2346.54, 'epoch': 0.06}
  3%|███▏                                                                                                         | 10/348 [04:32<1:26:45, 15.40s/it]  3%|███▍                                                                                                         | 11/348 [04:46<1:23:36, 14.89s/it]                                                                                                                                                     {'loss': 2.1595, 'grad_norm': 30.875, 'learning_rate': 1e-05, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 94.82, 'tokens_per_second_per_gpu': 2992.9, 'epoch': 0.06}
  3%|███▍                                                                                                         | 11/348 [04:46<1:23:36, 14.89s/it]  3%|███▊                                                                                                         | 12/348 [05:00<1:21:15, 14.51s/it]                                                                                                                                                     {'loss': 1.5164, 'grad_norm': 24.0, 'learning_rate': 9.999784025127187e-06, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 94.82, 'tokens_per_second_per_gpu': 2840.54, 'epoch': 0.07}
  3%|███▊                                                                                                         | 12/348 [05:00<1:21:15, 14.51s/it]  4%|████                                                                                                         | 13/348 [05:13<1:19:31, 14.24s/it]                                                                                                                                                     {'loss': 1.4181, 'grad_norm': 12.8125, 'learning_rate': 9.999136119166803e-06, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 94.82, 'tokens_per_second_per_gpu': 2374.7, 'epoch': 0.07}
  4%|████                                                                                                         | 13/348 [05:13<1:19:31, 14.24s/it]  4%|████▍                                                                                                        | 14/348 [05:27<1:18:20, 14.07s/it]                                                                                                                                                     {'loss': 1.1872, 'grad_norm': 7.84375, 'learning_rate': 9.998056338091415e-06, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 94.82, 'tokens_per_second_per_gpu': 2537.33, 'epoch': 0.08}
  4%|████▍                                                                                                        | 14/348 [05:27<1:18:20, 14.07s/it]  4%|████▋                                                                                                        | 15/348 [05:40<1:17:16, 13.92s/it]                                                                                                                                                     {'loss': 1.1644, 'grad_norm': 6.0, 'learning_rate': 9.99654477518325e-06, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 94.82, 'tokens_per_second_per_gpu': 2530.44, 'epoch': 0.09}
  4%|████▋                                                                                                        | 15/348 [05:40<1:17:16, 13.92s/it]  5%|█████                                                                                                        | 16/348 [05:54<1:16:58, 13.91s/it]                                                                                                                                                     {'loss': 1.045, 'grad_norm': 4.9375, 'learning_rate': 9.994601561026156e-06, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 94.82, 'tokens_per_second_per_gpu': 2575.3, 'epoch': 0.09}
  5%|█████                                                                                                        | 16/348 [05:54<1:16:58, 13.91s/it]  5%|█████▎                                                                                                       | 17/348 [06:08<1:16:29, 13.86s/it]                                                                                                                                                     {'loss': 1.1482, 'grad_norm': 5.09375, 'learning_rate': 9.9922268634943e-06, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 94.82, 'tokens_per_second_per_gpu': 2933.78, 'epoch': 0.1}
  5%|█████▎                                                                                                       | 17/348 [06:08<1:16:29, 13.86s/it]  5%|█████▋                                                                                                       | 18/348 [06:22<1:15:43, 13.77s/it]                                                                                                                                                     {'loss': 1.0253, 'grad_norm': 5.71875, 'learning_rate': 9.989420887737684e-06, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 94.82, 'tokens_per_second_per_gpu': 2700.81, 'epoch': 0.1}
  5%|█████▋                                                                                                       | 18/348 [06:22<1:15:43, 13.77s/it]  5%|█████▉                                                                                                       | 19/348 [06:35<1:14:59, 13.68s/it]                                                                                                                                                     {'loss': 0.8181, 'grad_norm': 4.96875, 'learning_rate': 9.986183876164412e-06, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 94.82, 'tokens_per_second_per_gpu': 2656.52, 'epoch': 0.11}
  5%|█████▉                                                                                                       | 19/348 [06:35<1:14:59, 13.68s/it]  6%|██████▎                                                                                                      | 20/348 [06:48<1:14:08, 13.56s/it]                                                                                                                                                     {'loss': 0.7374, 'grad_norm': 3.15625, 'learning_rate': 9.982516108419746e-06, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 94.82, 'tokens_per_second_per_gpu': 2864.5, 'epoch': 0.11}
  6%|██████▎                                                                                                      | 20/348 [06:48<1:14:08, 13.56s/it]  6%|██████▌                                                                                                      | 21/348 [07:02<1:13:43, 13.53s/it]                                                                                                                                                     {'loss': 0.7305, 'grad_norm': 6.75, 'learning_rate': 9.978417901361958e-06, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 94.82, 'tokens_per_second_per_gpu': 2728.43, 'epoch': 0.12}
  6%|██████▌                                                                                                      | 21/348 [07:02<1:13:43, 13.53s/it]  6%|██████▉                                                                                                      | 22/348 [07:16<1:13:47, 13.58s/it]                                                                                                                                                     {'loss': 0.8355, 'grad_norm': 3.328125, 'learning_rate': 9.973889609034945e-06, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 94.82, 'tokens_per_second_per_gpu': 2616.55, 'epoch': 0.13}
  6%|██████▉                                                                                                      | 22/348 [07:16<1:13:47, 13.58s/it]  7%|███████▏                                                                                                     | 23/348 [07:29<1:13:49, 13.63s/it]                                                                                                                                                     {'loss': 0.73, 'grad_norm': 3.703125, 'learning_rate': 9.968931622637652e-06, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 94.82, 'tokens_per_second_per_gpu': 2806.47, 'epoch': 0.13}
  7%|███████▏                                                                                                     | 23/348 [07:29<1:13:49, 13.63s/it]  7%|███████▌                                                                                                     | 24/348 [07:43<1:14:15, 13.75s/it]                                                                                                                                                     {'loss': 0.6998, 'grad_norm': 2.921875, 'learning_rate': 9.96354437049027e-06, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 94.82, 'tokens_per_second_per_gpu': 2495.19, 'epoch': 0.14}
  7%|███████▌                                                                                                     | 24/348 [07:43<1:14:15, 13.75s/it]  7%|███████▊                                                                                                     | 25/348 [07:57<1:13:53, 13.73s/it]                                                                                                                                                     {'loss': 0.6858, 'grad_norm': 2.734375, 'learning_rate': 9.95772831799724e-06, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 94.82, 'tokens_per_second_per_gpu': 2663.83, 'epoch': 0.14}
  7%|███████▊                                                                                                     | 25/348 [07:57<1:13:53, 13.73s/it]  7%|████████▏                                                                                                    | 26/348 [08:11<1:13:31, 13.70s/it]                                                                                                                                                     {'loss': 0.6134, 'grad_norm': 2.703125, 'learning_rate': 9.95148396760704e-06, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 94.82, 'tokens_per_second_per_gpu': 2848.04, 'epoch': 0.15}
  7%|████████▏                                                                                                    | 26/348 [08:11<1:13:31, 13.70s/it]  8%|████████▍                                                                                                    | 27/348 [08:25<1:13:36, 13.76s/it]                                                                                                                                                     {'loss': 0.588, 'grad_norm': 2.453125, 'learning_rate': 9.944811858768782e-06, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 94.82, 'tokens_per_second_per_gpu': 2746.78, 'epoch': 0.15}
  8%|████████▍                                                                                                    | 27/348 [08:25<1:13:36, 13.76s/it]  8%|████████▊                                                                                                    | 28/348 [08:38<1:13:11, 13.72s/it]                                                                                                                                                     {'loss': 0.5296, 'grad_norm': 1.9453125, 'learning_rate': 9.93771256788561e-06, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 94.82, 'tokens_per_second_per_gpu': 2550.39, 'epoch': 0.16}
  8%|████████▊                                                                                                    | 28/348 [08:38<1:13:11, 13.72s/it]  8%|█████████                                                                                                    | 29/348 [08:52<1:12:42, 13.67s/it]                                                                                                                                                     {'loss': 0.5367, 'grad_norm': 2.390625, 'learning_rate': 9.930186708264902e-06, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 94.82, 'tokens_per_second_per_gpu': 2874.74, 'epoch': 0.17}
  8%|█████████                                                                                                    | 29/348 [08:52<1:12:42, 13.67s/it]  9%|█████████▍                                                                                                   | 30/348 [09:05<1:12:10, 13.62s/it]                                                                                                                                                     {'loss': 0.4341, 'grad_norm': 2.28125, 'learning_rate': 9.922234930065286e-06, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 94.82, 'tokens_per_second_per_gpu': 3108.48, 'epoch': 0.17}
  9%|█████████▍                                                                                                   | 30/348 [09:05<1:12:10, 13.62s/it]  9%|█████████▋                                                                                                   | 31/348 [09:19<1:11:58, 13.62s/it]                                                                                                                                                     {'loss': 0.4675, 'grad_norm': 1.8046875, 'learning_rate': 9.913857920240481e-06, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 94.82, 'tokens_per_second_per_gpu': 2722.71, 'epoch': 0.18}
  9%|█████████▋                                                                                                   | 31/348 [09:19<1:11:58, 13.62s/it]  9%|██████████                                                                                                   | 32/348 [09:33<1:12:15, 13.72s/it]                                                                                                                                                     {'loss': 0.3299, 'grad_norm': 1.4609375, 'learning_rate': 9.905056402479933e-06, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 94.82, 'tokens_per_second_per_gpu': 2522.27, 'epoch': 0.18}
  9%|██████████                                                                                                   | 32/348 [09:33<1:12:15, 13.72s/it]  9%|██████████▎                                                                                                  | 33/348 [09:46<1:11:44, 13.66s/it]                                                                                                                                                     {'loss': 0.4499, 'grad_norm': 2.171875, 'learning_rate': 9.895831137146319e-06, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 94.82, 'tokens_per_second_per_gpu': 2755.38, 'epoch': 0.19}
  9%|██████████▎                                                                                                  | 33/348 [09:46<1:11:44, 13.66s/it] 10%|██████████▋                                                                                                  | 34/348 [10:00<1:11:27, 13.65s/it]                                                                                                                                                     {'loss': 0.5226, 'grad_norm': 2.4375, 'learning_rate': 9.88618292120984e-06, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 94.82, 'tokens_per_second_per_gpu': 2203.91, 'epoch': 0.19}
 10%|██████████▋                                                                                                  | 34/348 [10:00<1:11:27, 13.65s/it] 10%|██████████▉                                                                                                  | 35/348 [10:13<1:10:49, 13.58s/it]                                                                                                                                                     {'loss': 0.3125, 'grad_norm': 1.15625, 'learning_rate': 9.876112588179378e-06, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 94.82, 'tokens_per_second_per_gpu': 2790.6, 'epoch': 0.2}
 10%|██████████▉                                                                                                  | 35/348 [10:13<1:10:49, 13.58s/it] 10%|███████████▎                                                                                                 | 36/348 [10:27<1:11:02, 13.66s/it]                                                                                                                                                     {'loss': 0.3661, 'grad_norm': 1.84375, 'learning_rate': 9.865621008030492e-06, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 94.82, 'tokens_per_second_per_gpu': 3064.27, 'epoch': 0.21}
 10%|███████████▎                                                                                                 | 36/348 [10:27<1:11:02, 13.66s/it] 11%|███████████▌                                                                                                 | 37/348 [10:41<1:11:05, 13.72s/it]                                                                                                                                                     {'loss': 0.2538, 'grad_norm': 1.0625, 'learning_rate': 9.854709087130261e-06, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 94.82, 'tokens_per_second_per_gpu': 2660.37, 'epoch': 0.21}
 11%|███████████▌                                                                                                 | 37/348 [10:41<1:11:05, 13.72s/it] 11%|███████████▉                                                                                                 | 38/348 [10:55<1:11:16, 13.80s/it]                                                                                                                                                     {'loss': 0.3445, 'grad_norm': 1.359375, 'learning_rate': 9.843377768158972e-06, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 94.82, 'tokens_per_second_per_gpu': 2517.7, 'epoch': 0.22}
 11%|███████████▉                                                                                                 | 38/348 [10:55<1:11:16, 13.80s/it] 11%|████████████▏                                                                                                | 39/348 [11:09<1:10:36, 13.71s/it]                                                                                                                                                     {'loss': 0.3786, 'grad_norm': 1.421875, 'learning_rate': 9.831628030028698e-06, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 94.82, 'tokens_per_second_per_gpu': 2826.59, 'epoch': 0.22}
 11%|████████████▏                                                                                                | 39/348 [11:09<1:10:36, 13.71s/it] 11%|████████████▌                                                                                                | 40/348 [11:22<1:10:32, 13.74s/it]                                                                                                                                                     {'loss': 0.3688, 'grad_norm': 1.2265625, 'learning_rate': 9.819460887798714e-06, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 94.82, 'tokens_per_second_per_gpu': 2734.91, 'epoch': 0.23}
 11%|████████████▌                                                                                                | 40/348 [11:22<1:10:32, 13.74s/it] 12%|████████████▊                                                                                                | 41/348 [11:36<1:10:08, 13.71s/it]                                                                                                                                                     {'loss': 0.3712, 'grad_norm': 1.3984375, 'learning_rate': 9.80687739258782e-06, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 94.82, 'tokens_per_second_per_gpu': 2603.09, 'epoch': 0.23}
 12%|████████████▊                                                                                                | 41/348 [11:36<1:10:08, 13.71s/it] 12%|█████████████▏                                                                                               | 42/348 [11:50<1:10:14, 13.77s/it]                                                                                                                                                     {'loss': 0.3584, 'grad_norm': 1.171875, 'learning_rate': 9.79387863148353e-06, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 94.82, 'tokens_per_second_per_gpu': 2706.94, 'epoch': 0.24}
 12%|█████████████▏                                                                                               | 42/348 [11:50<1:10:14, 13.77s/it] 12%|█████████████▍                                                                                               | 43/348 [12:04<1:09:52, 13.75s/it]                                                                                                                                                     {'loss': 0.3005, 'grad_norm': 1.03125, 'learning_rate': 9.78046572744815e-06, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 94.82, 'tokens_per_second_per_gpu': 2375.4, 'epoch': 0.25}
 12%|█████████████▍                                                                                               | 43/348 [12:04<1:09:52, 13.75s/it] 13%|█████████████▊                                                                                               | 44/348 [12:17<1:09:09, 13.65s/it]                                                                                                                                                     {'loss': 0.2191, 'grad_norm': 1.3125, 'learning_rate': 9.76663983922178e-06, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 94.82, 'tokens_per_second_per_gpu': 2784.8, 'epoch': 0.25}
 13%|█████████████▊                                                                                               | 44/348 [12:17<1:09:09, 13.65s/it] 13%|██████████████                                                                                               | 45/348 [12:31<1:09:00, 13.67s/it]                                                                                                                                                     {'loss': 0.2729, 'grad_norm': 1.484375, 'learning_rate': 9.7524021612222e-06, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 94.82, 'tokens_per_second_per_gpu': 2618.22, 'epoch': 0.26}
 13%|██████████████                                                                                               | 45/348 [12:31<1:09:00, 13.67s/it] 13%|██████████████▍                                                                                              | 46/348 [12:45<1:09:03, 13.72s/it]                                                                                                                                                     {'loss': 0.2154, 'grad_norm': 1.296875, 'learning_rate': 9.737753923441689e-06, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 94.82, 'tokens_per_second_per_gpu': 2498.29, 'epoch': 0.26}
 13%|██████████████▍                                                                                              | 46/348 [12:45<1:09:03, 13.72s/it] 14%|██████████████▋                                                                                              | 47/348 [12:58<1:08:25, 13.64s/it]                                                                                                                                                     {'loss': 0.1515, 'grad_norm': 0.9921875, 'learning_rate': 9.722696391340762e-06, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 94.82, 'tokens_per_second_per_gpu': 3038.62, 'epoch': 0.27}
 14%|██████████████▋                                                                                              | 47/348 [12:58<1:08:25, 13.64s/it] 14%|███████████████                                                                                              | 48/348 [13:12<1:08:53, 13.78s/it]                                                                                                                                                     {'loss': 0.1541, 'grad_norm': 0.8671875, 'learning_rate': 9.70723086573885e-06, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 94.82, 'tokens_per_second_per_gpu': 2398.35, 'epoch': 0.27}
 14%|███████████████                                                                                              | 48/348 [13:12<1:08:53, 13.78s/it] 14%|███████████████▎                                                                                             | 49/348 [13:26<1:08:32, 13.76s/it]                                                                                                                                                     {'loss': 0.1502, 'grad_norm': 0.9375, 'learning_rate': 9.691358682701927e-06, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 94.82, 'tokens_per_second_per_gpu': 2829.14, 'epoch': 0.28}
 14%|███████████████▎                                                                                             | 49/348 [13:26<1:08:32, 13.76s/it] 14%|███████████████▋                                                                                             | 50/348 [13:40<1:08:26, 13.78s/it]                                                                                                                                                     {'loss': 0.1268, 'grad_norm': 0.7890625, 'learning_rate': 9.675081213427076e-06, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 94.82, 'tokens_per_second_per_gpu': 2848.71, 'epoch': 0.29}
 14%|███████████████▋                                                                                             | 50/348 [13:40<1:08:26, 13.78s/it] 15%|███████████████▉                                                                                             | 51/348 [13:53<1:07:59, 13.74s/it]                                                                                                                                                     {'loss': 0.1389, 'grad_norm': 0.87109375, 'learning_rate': 9.658399864124037e-06, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 94.82, 'tokens_per_second_per_gpu': 2699.26, 'epoch': 0.29}
 15%|███████████████▉                                                                                             | 51/348 [13:53<1:07:59, 13.74s/it] 15%|████████████████▎                                                                                            | 52/348 [14:07<1:07:51, 13.76s/it]                                                                                                                                                     {'loss': 0.1829, 'grad_norm': 0.89453125, 'learning_rate': 9.641316075893731e-06, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 94.82, 'tokens_per_second_per_gpu': 2923.15, 'epoch': 0.3}
 15%|████████████████▎                                                                                            | 52/348 [14:07<1:07:51, 13.76s/it] 15%|████████████████▌                                                                                            | 53/348 [14:21<1:07:10, 13.66s/it]                                                                                                                                                     {'loss': 0.1099, 'grad_norm': 0.62890625, 'learning_rate': 9.623831324603755e-06, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 94.82, 'tokens_per_second_per_gpu': 2939.37, 'epoch': 0.3}
 15%|████████████████▌                                                                                            | 53/348 [14:21<1:07:10, 13.66s/it] 16%|████████████████▉                                                                                            | 54/348 [14:34<1:06:37, 13.60s/it]                                                                                                                                                     {'loss': 0.1941, 'grad_norm': 0.98046875, 'learning_rate': 9.605947120760878e-06, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 94.82, 'tokens_per_second_per_gpu': 2832.64, 'epoch': 0.31}
 16%|████████████████▉                                                                                            | 54/348 [14:34<1:06:37, 13.60s/it] 16%|█████████████████▏                                                                                           | 55/348 [14:48<1:07:05, 13.74s/it]                                                                                                                                                     {'loss': 0.1666, 'grad_norm': 1.0, 'learning_rate': 9.587665009380565e-06, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 94.82, 'tokens_per_second_per_gpu': 2217.25, 'epoch': 0.31}
 16%|█████████████████▏                                                                                           | 55/348 [14:48<1:07:05, 13.74s/it] 16%|█████████████████▌                                                                                           | 56/348 [15:02<1:06:50, 13.73s/it]                                                                                                                                                     {'loss': 0.1247, 'grad_norm': 0.86328125, 'learning_rate': 9.568986569853487e-06, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 94.82, 'tokens_per_second_per_gpu': 2741.51, 'epoch': 0.32}
 16%|█████████████████▌                                                                                           | 56/348 [15:02<1:06:50, 13.73s/it] 16%|█████████████████▊                                                                                           | 57/348 [15:16<1:06:36, 13.73s/it]                                                                                                                                                     {'loss': 0.1752, 'grad_norm': 1.046875, 'learning_rate': 9.549913415809084e-06, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 94.82, 'tokens_per_second_per_gpu': 2814.19, 'epoch': 0.33}
 16%|█████████████████▊                                                                                           | 57/348 [15:16<1:06:36, 13.73s/it] 17%|██████████████████▏                                                                                          | 58/348 [15:29<1:06:45, 13.81s/it]                                                                                                                                                     {'loss': 0.1254, 'grad_norm': 0.8046875, 'learning_rate': 9.530447194976164e-06, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 94.82, 'tokens_per_second_per_gpu': 2592.9, 'epoch': 0.33}
 17%|██████████████████▏                                                                                          | 58/348 [15:30<1:06:45, 13.81s/it] 17%|██████████████████▍                                                                                          | 59/348 [15:43<1:06:16, 13.76s/it]                                                                                                                                                     {'loss': 0.1293, 'grad_norm': 0.73828125, 'learning_rate': 9.510589589040554e-06, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 94.82, 'tokens_per_second_per_gpu': 2650.89, 'epoch': 0.34}
 17%|██████████████████▍                                                                                          | 59/348 [15:43<1:06:16, 13.76s/it] 17%|██████████████████▊                                                                                          | 60/348 [15:57<1:05:49, 13.71s/it]                                                                                                                                                     {'loss': 0.2146, 'grad_norm': 0.96484375, 'learning_rate': 9.49034231349982e-06, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 94.82, 'tokens_per_second_per_gpu': 2992.86, 'epoch': 0.34}
 17%|██████████████████▊                                                                                          | 60/348 [15:57<1:05:49, 13.71s/it] 18%|███████████████████                                                                                          | 61/348 [16:10<1:05:22, 13.67s/it]                                                                                                                                                     {'loss': 0.1197, 'grad_norm': 0.87890625, 'learning_rate': 9.469707117515068e-06, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 94.82, 'tokens_per_second_per_gpu': 2502.36, 'epoch': 0.35}
 18%|███████████████████                                                                                          | 61/348 [16:10<1:05:22, 13.67s/it] 18%|███████████████████▍                                                                                         | 62/348 [16:24<1:05:23, 13.72s/it]                                                                                                                                                     {'loss': 0.0597, 'grad_norm': 0.37109375, 'learning_rate': 9.448685783759825e-06, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 94.82, 'tokens_per_second_per_gpu': 2860.21, 'epoch': 0.35}
 18%|███████████████████▍                                                                                         | 62/348 [16:24<1:05:23, 13.72s/it] 18%|███████████████████▋                                                                                         | 63/348 [16:38<1:05:05, 13.70s/it]                                                                                                                                                     {'loss': 0.1111, 'grad_norm': 0.7109375, 'learning_rate': 9.427280128266049e-06, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 94.82, 'tokens_per_second_per_gpu': 2664.12, 'epoch': 0.36}
 18%|███████████████████▋                                                                                         | 63/348 [16:38<1:05:05, 13.70s/it] 18%|████████████████████                                                                                         | 64/348 [16:51<1:04:25, 13.61s/it]                                                                                                                                                     {'loss': 0.0786, 'grad_norm': 0.53515625, 'learning_rate': 9.405492000267228e-06, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 94.82, 'tokens_per_second_per_gpu': 3019.66, 'epoch': 0.37}
 18%|████████████████████                                                                                         | 64/348 [16:51<1:04:25, 13.61s/it] 19%|████████████████████▎                                                                                        | 65/348 [17:05<1:04:56, 13.77s/it]                                                                                                                                                     {'loss': 0.0974, 'grad_norm': 0.60546875, 'learning_rate': 9.383323282038632e-06, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 94.82, 'tokens_per_second_per_gpu': 2576.77, 'epoch': 0.37}
 19%|████████████████████▎                                                                                        | 65/348 [17:05<1:04:56, 13.77s/it] 19%|████████████████████▋                                                                                        | 66/348 [17:19<1:04:52, 13.80s/it]                                                                                                                                                     {'loss': 0.1319, 'grad_norm': 0.77734375, 'learning_rate': 9.360775888734699e-06, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 94.82, 'tokens_per_second_per_gpu': 2465.95, 'epoch': 0.38}
 19%|████████████████████▋                                                                                        | 66/348 [17:19<1:04:52, 13.80s/it] 19%|████████████████████▉                                                                                        | 67/348 [17:33<1:04:21, 13.74s/it]                                                                                                                                                     {'loss': 0.0652, 'grad_norm': 0.53515625, 'learning_rate': 9.337851768223589e-06, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 94.82, 'tokens_per_second_per_gpu': 2894.11, 'epoch': 0.38}
 19%|████████████████████▉                                                                                        | 67/348 [17:33<1:04:21, 13.74s/it] 20%|█████████████████████▎                                                                                       | 68/348 [17:47<1:04:44, 13.87s/it]                                                                                                                                                     {'loss': 0.0583, 'grad_norm': 0.52734375, 'learning_rate': 9.31455290091891e-06, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 94.82, 'tokens_per_second_per_gpu': 2574.91, 'epoch': 0.39}
 20%|█████████████████████▎                                                                                       | 68/348 [17:47<1:04:44, 13.87s/it] 20%|█████████████████████▌                                                                                       | 69/348 [18:01<1:04:42, 13.92s/it]                                                                                                                                                     {'loss': 0.0442, 'grad_norm': 0.408203125, 'learning_rate': 9.29088129960862e-06, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 94.82, 'tokens_per_second_per_gpu': 2519.51, 'epoch': 0.39}
 20%|█████████████████████▌                                                                                       | 69/348 [18:01<1:04:42, 13.92s/it] 20%|█████████████████████▉                                                                                       | 70/348 [18:15<1:04:19, 13.88s/it]                                                                                                                                                     {'loss': 0.1398, 'grad_norm': 0.80078125, 'learning_rate': 9.266839009281154e-06, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 94.82, 'tokens_per_second_per_gpu': 2783.16, 'epoch': 0.4}
 20%|█████████████████████▉                                                                                       | 70/348 [18:15<1:04:19, 13.88s/it] 20%|██████████████████████▏                                                                                      | 71/348 [18:29<1:03:57, 13.85s/it]                                                                                                                                                     {'loss': 0.075, 'grad_norm': 0.46875, 'learning_rate': 9.242428106948748e-06, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 94.82, 'tokens_per_second_per_gpu': 2950.36, 'epoch': 0.41}
 20%|██████████████████████▏                                                                                      | 71/348 [18:29<1:03:57, 13.85s/it] 21%|██████████████████████▌                                                                                      | 72/348 [18:42<1:02:59, 13.69s/it]                                                                                                                                                     {'loss': 0.1491, 'grad_norm': 0.81640625, 'learning_rate': 9.217650701468016e-06, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 94.82, 'tokens_per_second_per_gpu': 2875.92, 'epoch': 0.41}
 21%|██████████████████████▌                                                                                      | 72/348 [18:42<1:02:59, 13.69s/it] 21%|██████████████████████▊                                                                                      | 73/348 [18:55<1:02:34, 13.65s/it]                                                                                                                                                     {'loss': 0.1263, 'grad_norm': 0.82421875, 'learning_rate': 9.192508933357753e-06, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 94.82, 'tokens_per_second_per_gpu': 2569.93, 'epoch': 0.42}
 21%|██████████████████████▊                                                                                      | 73/348 [18:56<1:02:34, 13.65s/it] 21%|███████████████████████▏                                                                                     | 74/348 [19:09<1:02:25, 13.67s/it]                                                                                                                                                     {'loss': 0.0605, 'grad_norm': 0.43359375, 'learning_rate': 9.16700497461403e-06, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 94.82, 'tokens_per_second_per_gpu': 2438.7, 'epoch': 0.42}
 21%|███████████████████████▏                                                                                     | 74/348 [19:09<1:02:25, 13.67s/it] 22%|███████████████████████▍                                                                                     | 75/348 [19:23<1:02:40, 13.77s/it]                                                                                                                                                     {'loss': 0.1145, 'grad_norm': 0.69921875, 'learning_rate': 9.141141028522544e-06, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 94.82, 'tokens_per_second_per_gpu': 2611.75, 'epoch': 0.43}
 22%|███████████████████████▍                                                                                     | 75/348 [19:23<1:02:40, 13.77s/it] 22%|███████████████████████▊                                                                                     | 76/348 [19:37<1:02:05, 13.70s/it]                                                                                                                                                     {'loss': 0.0514, 'grad_norm': 0.37890625, 'learning_rate': 9.114919329468283e-06, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 94.82, 'tokens_per_second_per_gpu': 3101.25, 'epoch': 0.43}
 22%|███████████████████████▊                                                                                     | 76/348 [19:37<1:02:05, 13.70s/it] 22%|████████████████████████                                                                                     | 77/348 [19:50<1:01:47, 13.68s/it]                                                                                                                                                     {'loss': 0.0562, 'grad_norm': 0.34765625, 'learning_rate': 9.088342142742493e-06, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 94.82, 'tokens_per_second_per_gpu': 2733.41, 'epoch': 0.44}
 22%|████████████████████████                                                                                     | 77/348 [19:50<1:01:47, 13.68s/it] 22%|████████████████████████▍                                                                                    | 78/348 [20:04<1:01:41, 13.71s/it]                                                                                                                                                     {'loss': 0.0333, 'grad_norm': 0.2890625, 'learning_rate': 9.061411764346983e-06, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 94.82, 'tokens_per_second_per_gpu': 2626.95, 'epoch': 0.45}
 22%|████████████████████████▍                                                                                    | 78/348 [20:04<1:01:41, 13.71s/it] 23%|████████████████████████▋                                                                                    | 79/348 [20:18<1:01:14, 13.66s/it]                                                                                                                                                     {'loss': 0.0568, 'grad_norm': 0.3984375, 'learning_rate': 9.034130520795774e-06, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 94.82, 'tokens_per_second_per_gpu': 3008.89, 'epoch': 0.45}
 23%|████████████████████████▋                                                                                    | 79/348 [20:18<1:01:14, 13.66s/it] 23%|█████████████████████████                                                                                    | 80/348 [20:31<1:00:49, 13.62s/it]                                                                                                                                                     {'loss': 0.0463, 'grad_norm': 0.359375, 'learning_rate': 9.006500768914106e-06, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 94.82, 'tokens_per_second_per_gpu': 3027.76, 'epoch': 0.46}
 23%|█████████████████████████                                                                                    | 80/348 [20:31<1:00:49, 13.62s/it] 23%|█████████████████████████▎                                                                                   | 81/348 [20:45<1:00:23, 13.57s/it]                                                                                                                                                     {'loss': 0.1023, 'grad_norm': 1.25, 'learning_rate': 8.978524895634842e-06, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 94.82, 'tokens_per_second_per_gpu': 2492.67, 'epoch': 0.46}
 23%|█████████████████████████▎                                                                                   | 81/348 [20:45<1:00:23, 13.57s/it] 24%|█████████████████████████▋                                                                                   | 82/348 [20:58<1:00:03, 13.55s/it]                                                                                                                                                     {'loss': 0.1399, 'grad_norm': 0.875, 'learning_rate': 8.95020531779225e-06, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 94.82, 'tokens_per_second_per_gpu': 2912.47, 'epoch': 0.47}
 24%|█████████████████████████▋                                                                                   | 82/348 [20:58<1:00:03, 13.55s/it] 24%|█████████████████████████▉                                                                                   | 83/348 [21:12<1:00:02, 13.59s/it]                                                                                                                                                     {'loss': 0.0874, 'grad_norm': 0.6328125, 'learning_rate': 8.921544481913218e-06, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 94.82, 'tokens_per_second_per_gpu': 2789.59, 'epoch': 0.47}
 24%|█████████████████████████▉                                                                                   | 83/348 [21:12<1:00:02, 13.59s/it] 24%|██████████████████████████▊                                                                                    | 84/348 [21:25<59:48, 13.59s/it]                                                                                                                                                     {'loss': 0.0489, 'grad_norm': 0.341796875, 'learning_rate': 8.892544864005899e-06, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 94.82, 'tokens_per_second_per_gpu': 2560.69, 'epoch': 0.48}
 24%|██████████████████████████▊                                                                                    | 84/348 [21:25<59:48, 13.59s/it] 24%|███████████████████████████                                                                                    | 85/348 [21:39<59:47, 13.64s/it]                                                                                                                                                     {'loss': 0.058, 'grad_norm': 0.49609375, 'learning_rate': 8.86320896934581e-06, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 94.82, 'tokens_per_second_per_gpu': 2585.27, 'epoch': 0.49}
 24%|███████████████████████████                                                                                    | 85/348 [21:39<59:47, 13.64s/it] 25%|███████████████████████████▍                                                                                   | 86/348 [21:53<59:23, 13.60s/it]                                                                                                                                                     {'loss': 0.0867, 'grad_norm': 0.7109375, 'learning_rate': 8.833539332259398e-06, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 94.82, 'tokens_per_second_per_gpu': 2720.59, 'epoch': 0.49}
 25%|███████████████████████████▍                                                                                   | 86/348 [21:53<59:23, 13.60s/it] 25%|███████████████████████████▊                                                                                   | 87/348 [22:06<59:21, 13.64s/it]                                                                                                                                                     {'loss': 0.033, 'grad_norm': 0.283203125, 'learning_rate': 8.803538515905102e-06, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 94.82, 'tokens_per_second_per_gpu': 2553.95, 'epoch': 0.5}
 25%|███████████████████████████▊                                                                                   | 87/348 [22:06<59:21, 13.64s/it] 25%|████████████████████████████                                                                                   | 88/348 [22:20<59:05, 13.64s/it]                                                                                                                                                     {'loss': 0.0489, 'grad_norm': 0.4609375, 'learning_rate': 8.773209112051919e-06, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 94.82, 'tokens_per_second_per_gpu': 2679.76, 'epoch': 0.5}
 25%|████████████████████████████                                                                                   | 88/348 [22:20<59:05, 13.64s/it] 26%|████████████████████████████▍                                                                                  | 89/348 [22:34<58:58, 13.66s/it]                                                                                                                                                     {'loss': 0.0393, 'grad_norm': 0.375, 'learning_rate': 8.742553740855507e-06, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 94.82, 'tokens_per_second_per_gpu': 2581.67, 'epoch': 0.51}
 26%|████████████████████████████▍                                                                                  | 89/348 [22:34<58:58, 13.66s/it] 26%|████████████████████████████▋                                                                                  | 90/348 [22:48<58:49, 13.68s/it]                                                                                                                                                     {'loss': 0.04, 'grad_norm': 0.345703125, 'learning_rate': 8.711575050631823e-06, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 94.82, 'tokens_per_second_per_gpu': 2911.56, 'epoch': 0.51}
 26%|████████████████████████████▋                                                                                  | 90/348 [22:48<58:49, 13.68s/it] 26%|█████████████████████████████                                                                                  | 91/348 [23:02<58:59, 13.77s/it]                                                                                                                                                     {'loss': 0.0431, 'grad_norm': 0.3671875, 'learning_rate': 8.680275717628336e-06, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 94.82, 'tokens_per_second_per_gpu': 2518.89, 'epoch': 0.52}
 26%|█████████████████████████████                                                                                  | 91/348 [23:02<58:59, 13.77s/it] 26%|█████████████████████████████▎                                                                                 | 92/348 [23:15<58:32, 13.72s/it]                                                                                                                                                     {'loss': 0.0298, 'grad_norm': 0.296875, 'learning_rate': 8.64865844579284e-06, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 94.82, 'tokens_per_second_per_gpu': 2680.58, 'epoch': 0.53}
 26%|█████████████████████████████▎                                                                                 | 92/348 [23:15<58:32, 13.72s/it] 27%|█████████████████████████████▋                                                                                 | 93/348 [23:29<58:17, 13.71s/it]                                                                                                                                                     {'loss': 0.0265, 'grad_norm': 0.275390625, 'learning_rate': 8.616725966539831e-06, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 94.82, 'tokens_per_second_per_gpu': 3005.02, 'epoch': 0.53}
 27%|█████████████████████████████▋                                                                                 | 93/348 [23:29<58:17, 13.71s/it] 27%|█████████████████████████████▉                                                                                 | 94/348 [23:42<57:44, 13.64s/it]                                                                                                                                                     {'loss': 0.0373, 'grad_norm': 0.306640625, 'learning_rate': 8.584481038514573e-06, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 94.82, 'tokens_per_second_per_gpu': 3016.43, 'epoch': 0.54}
 27%|█████████████████████████████▉                                                                                 | 94/348 [23:42<57:44, 13.64s/it] 27%|██████████████████████████████▎                                                                                | 95/348 [23:56<57:22, 13.61s/it]                                                                                                                                                     {'loss': 0.0458, 'grad_norm': 0.478515625, 'learning_rate': 8.551926447354759e-06, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 94.82, 'tokens_per_second_per_gpu': 2795.81, 'epoch': 0.54}
 27%|██████████████████████████████▎                                                                                | 95/348 [23:56<57:22, 13.61s/it] 28%|██████████████████████████████▌                                                                                | 96/348 [24:09<56:56, 13.56s/it]                                                                                                                                                     {'loss': 0.0232, 'grad_norm': 0.27734375, 'learning_rate': 8.519065005449858e-06, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 94.82, 'tokens_per_second_per_gpu': 2878.8, 'epoch': 0.55}
 28%|██████████████████████████████▌                                                                                | 96/348 [24:09<56:56, 13.56s/it] 28%|██████████████████████████████▉                                                                                | 97/348 [24:23<56:43, 13.56s/it]                                                                                                                                                     {'loss': 0.0453, 'grad_norm': 0.29296875, 'learning_rate': 8.485899551698166e-06, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 94.82, 'tokens_per_second_per_gpu': 2725.95, 'epoch': 0.55}
 28%|██████████████████████████████▉                                                                                | 97/348 [24:23<56:43, 13.56s/it] 28%|███████████████████████████████▎                                                                               | 98/348 [24:36<56:36, 13.59s/it]                                                                                                                                                     {'loss': 0.0938, 'grad_norm': 0.6484375, 'learning_rate': 8.452432951261549e-06, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 94.82, 'tokens_per_second_per_gpu': 2575.12, 'epoch': 0.56}
 28%|███████████████████████████████▎                                                                               | 98/348 [24:36<56:36, 13.59s/it] 28%|███████████████████████████████▌                                                                               | 99/348 [24:50<56:18, 13.57s/it]                                                                                                                                                     {'loss': 0.0753, 'grad_norm': 0.59765625, 'learning_rate': 8.418668095317912e-06, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 94.82, 'tokens_per_second_per_gpu': 3209.7, 'epoch': 0.57}
 28%|███████████████████████████████▌                                                                               | 99/348 [24:50<56:18, 13.57s/it] 29%|███████████████████████████████▌                                                                              | 100/348 [25:04<56:07, 13.58s/it]                                                                                                                                                     {'loss': 0.1434, 'grad_norm': 1.2421875, 'learning_rate': 8.384607900811442e-06, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 94.82, 'tokens_per_second_per_gpu': 2850.4, 'epoch': 0.57}
 29%|███████████████████████████████▌                                                                              | 100/348 [25:04<56:07, 13.58s/it] 29%|███████████████████████████████▉                                                                              | 101/348 [25:17<55:38, 13.52s/it]                                                                                                                                                     {'loss': 0.0273, 'grad_norm': 0.26953125, 'learning_rate': 8.350255310200611e-06, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 94.82, 'tokens_per_second_per_gpu': 2721.27, 'epoch': 0.58}
 29%|███████████████████████████████▉                                                                              | 101/348 [25:17<55:38, 13.52s/it] 29%|████████████████████████████████▏                                                                             | 102/348 [25:31<56:00, 13.66s/it]                                                                                                                                                     {'loss': 0.0736, 'grad_norm': 0.546875, 'learning_rate': 8.315613291203977e-06, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 94.82, 'tokens_per_second_per_gpu': 2675.92, 'epoch': 0.58}
 29%|████████████████████████████████▏                                                                             | 102/348 [25:31<56:00, 13.66s/it] 30%|████████████████████████████████▌                                                                             | 103/348 [25:45<55:40, 13.63s/it]                                                                                                                                                     {'loss': 0.058, 'grad_norm': 0.49609375, 'learning_rate': 8.280684836543794e-06, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 94.82, 'tokens_per_second_per_gpu': 2559.37, 'epoch': 0.59}
 30%|████████████████████████████████▌                                                                             | 103/348 [25:45<55:40, 13.63s/it] 30%|████████████████████████████████▊                                                                             | 104/348 [25:59<55:55, 13.75s/it]                                                                                                                                                     {'loss': 0.0298, 'grad_norm': 0.25390625, 'learning_rate': 8.245472963687484e-06, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 94.82, 'tokens_per_second_per_gpu': 2787.37, 'epoch': 0.59}
 30%|████████████████████████████████▊                                                                             | 104/348 [25:59<55:55, 13.75s/it] 30%|█████████████████████████████████▏                                                                            | 105/348 [26:12<55:36, 13.73s/it]                                                                                                                                                     {'loss': 0.0485, 'grad_norm': 0.400390625, 'learning_rate': 8.209980714586955e-06, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 94.82, 'tokens_per_second_per_gpu': 2944.0, 'epoch': 0.6}
 30%|█████████████████████████████████▏                                                                            | 105/348 [26:12<55:36, 13.73s/it] 30%|█████████████████████████████████▌                                                                            | 106/348 [26:26<55:07, 13.67s/it]                                                                                                                                                     {'loss': 0.0967, 'grad_norm': 0.84375, 'learning_rate': 8.1742111554158e-06, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 94.82, 'tokens_per_second_per_gpu': 2716.67, 'epoch': 0.61}
 30%|█████████████████████████████████▌                                                                            | 106/348 [26:26<55:07, 13.67s/it] 31%|█████████████████████████████████▊                                                                            | 107/348 [26:39<54:57, 13.68s/it]                                                                                                                                                     {'loss': 0.0364, 'grad_norm': 0.275390625, 'learning_rate': 8.138167376304411e-06, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 94.82, 'tokens_per_second_per_gpu': 3009.11, 'epoch': 0.61}
 31%|█████████████████████████████████▊                                                                            | 107/348 [26:39<54:57, 13.68s/it] 31%|██████████████████████████████████▏                                                                           | 108/348 [26:53<54:28, 13.62s/it]                                                                                                                                                     {'loss': 0.0341, 'grad_norm': 0.271484375, 'learning_rate': 8.101852491073036e-06, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 94.82, 'tokens_per_second_per_gpu': 3138.5, 'epoch': 0.62}
 31%|██████████████████████████████████▏                                                                           | 108/348 [26:53<54:28, 13.62s/it] 31%|██████████████████████████████████▍                                                                           | 109/348 [27:07<54:24, 13.66s/it]                                                                                                                                                     {'loss': 0.0313, 'grad_norm': 0.2734375, 'learning_rate': 8.065269636962765e-06, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 94.82, 'tokens_per_second_per_gpu': 2719.79, 'epoch': 0.62}
 31%|██████████████████████████████████▍                                                                           | 109/348 [27:07<54:24, 13.66s/it] 32%|██████████████████████████████████▊                                                                           | 110/348 [27:21<54:29, 13.74s/it]                                                                                                                                                     {'loss': 0.0643, 'grad_norm': 0.439453125, 'learning_rate': 8.0284219743645e-06, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 94.82, 'tokens_per_second_per_gpu': 2799.12, 'epoch': 0.63}
 32%|██████████████████████████████████▊                                                                           | 110/348 [27:21<54:29, 13.74s/it] 32%|███████████████████████████████████                                                                           | 111/348 [27:34<54:09, 13.71s/it]                                                                                                                                                     {'loss': 0.0418, 'grad_norm': 0.310546875, 'learning_rate': 7.991312686545939e-06, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 94.82, 'tokens_per_second_per_gpu': 2738.02, 'epoch': 0.63}
 32%|███████████████████████████████████                                                                           | 111/348 [27:34<54:09, 13.71s/it] 32%|███████████████████████████████████▍                                                                          | 112/348 [27:48<53:45, 13.67s/it]                                                                                                                                                     {'loss': 0.0289, 'grad_norm': 0.310546875, 'learning_rate': 7.953944979376567e-06, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 94.82, 'tokens_per_second_per_gpu': 2974.26, 'epoch': 0.64}
 32%|███████████████████████████████████▍                                                                          | 112/348 [27:48<53:45, 13.67s/it] 32%|███████████████████████████████████▋                                                                          | 113/348 [28:02<53:50, 13.75s/it]                                                                                                                                                     {'loss': 0.0832, 'grad_norm': 0.5625, 'learning_rate': 7.916322081050708e-06, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 94.82, 'tokens_per_second_per_gpu': 2340.37, 'epoch': 0.65}
 32%|███████████████████████████████████▋                                                                          | 113/348 [28:02<53:50, 13.75s/it] 33%|████████████████████████████████████                                                                          | 114/348 [28:16<53:37, 13.75s/it]                                                                                                                                                     {'loss': 0.0861, 'grad_norm': 0.53515625, 'learning_rate': 7.878447241808634e-06, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 94.82, 'tokens_per_second_per_gpu': 2676.66, 'epoch': 0.65}
 33%|████████████████████████████████████                                                                          | 114/348 [28:16<53:37, 13.75s/it] 33%|████████████████████████████████████▎                                                                         | 115/348 [28:30<53:52, 13.87s/it]                                                                                                                                                     {'loss': 0.0463, 'grad_norm': 0.41015625, 'learning_rate': 7.84032373365578e-06, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 94.82, 'tokens_per_second_per_gpu': 2801.52, 'epoch': 0.66}
 33%|████████████████████████████████████▎                                                                         | 115/348 [28:30<53:52, 13.87s/it] 33%|████████████████████████████████████▋                                                                         | 116/348 [28:43<53:19, 13.79s/it]                                                                                                                                                     {'loss': 0.0391, 'grad_norm': 0.3359375, 'learning_rate': 7.801954850080075e-06, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 94.82, 'tokens_per_second_per_gpu': 2962.88, 'epoch': 0.66}
 33%|████████████████████████████████████▋                                                                         | 116/348 [28:43<53:19, 13.79s/it] 34%|████████████████████████████████████▉                                                                         | 117/348 [28:57<52:55, 13.75s/it]                                                                                                                                                     {'loss': 0.0404, 'grad_norm': 0.46875, 'learning_rate': 7.76334390576742e-06, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 94.82, 'tokens_per_second_per_gpu': 3018.36, 'epoch': 0.67}
 34%|████████████████████████████████████▉                                                                         | 117/348 [28:57<52:55, 13.75s/it] 34%|█████████████████████████████████████▎                                                                        | 118/348 [29:11<52:47, 13.77s/it]                                                                                                                                                     {'loss': 0.0464, 'grad_norm': 0.375, 'learning_rate': 7.724494236315327e-06, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 94.82, 'tokens_per_second_per_gpu': 2427.37, 'epoch': 0.67}
 34%|█████████████████████████████████████▎                                                                        | 118/348 [29:11<52:47, 13.77s/it] 34%|█████████████████████████████████████▌                                                                        | 119/348 [29:25<52:48, 13.84s/it]                                                                                                                                                     {'loss': 0.0333, 'grad_norm': 0.32421875, 'learning_rate': 7.685409197944768e-06, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 94.82, 'tokens_per_second_per_gpu': 2232.12, 'epoch': 0.68}
 34%|█████████████████████████████████████▌                                                                        | 119/348 [29:25<52:48, 13.84s/it] 34%|█████████████████████████████████████▉                                                                        | 120/348 [29:39<52:49, 13.90s/it]                                                                                                                                                     {'loss': 0.0861, 'grad_norm': 0.64453125, 'learning_rate': 7.646092167210217e-06, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 94.82, 'tokens_per_second_per_gpu': 2813.2, 'epoch': 0.69}
 34%|█████████████████████████████████████▉                                                                        | 120/348 [29:39<52:49, 13.90s/it] 35%|██████████████████████████████████████▏                                                                       | 121/348 [29:53<52:45, 13.94s/it]                                                                                                                                                     {'loss': 0.021, 'grad_norm': 0.3046875, 'learning_rate': 7.60654654070796e-06, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 94.82, 'tokens_per_second_per_gpu': 2352.58, 'epoch': 0.69}
 35%|██████████████████████████████████████▏                                                                       | 121/348 [29:53<52:45, 13.94s/it] 35%|██████████████████████████████████████▌                                                                       | 122/348 [30:07<53:02, 14.08s/it]                                                                                                                                                     {'loss': 0.0577, 'grad_norm': 0.46484375, 'learning_rate': 7.566775734782656e-06, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 94.82, 'tokens_per_second_per_gpu': 2475.15, 'epoch': 0.7}
 35%|██████████████████████████████████████▌                                                                       | 122/348 [30:07<53:02, 14.08s/it] 35%|██████████████████████████████████████▉                                                                       | 123/348 [30:21<52:12, 13.92s/it]                                                                                                                                                     {'loss': 0.0475, 'grad_norm': 0.3984375, 'learning_rate': 7.526783185232208e-06, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 94.82, 'tokens_per_second_per_gpu': 3012.45, 'epoch': 0.7}
 35%|██████████████████████████████████████▉                                                                       | 123/348 [30:21<52:12, 13.92s/it] 36%|███████████████████████████████████████▏                                                                      | 124/348 [30:35<52:01, 13.94s/it]                                                                                                                                                     {'loss': 0.0258, 'grad_norm': 0.228515625, 'learning_rate': 7.486572347010937e-06, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 94.82, 'tokens_per_second_per_gpu': 2496.1, 'epoch': 0.71}
 36%|███████████████████████████████████████▏                                                                      | 124/348 [30:35<52:01, 13.94s/it] 36%|███████████████████████████████████████▌                                                                      | 125/348 [30:49<51:37, 13.89s/it]                                                                                                                                                     {'loss': 0.0246, 'grad_norm': 0.2236328125, 'learning_rate': 7.446146693931111e-06, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 94.82, 'tokens_per_second_per_gpu': 2444.31, 'epoch': 0.71}
 36%|███████████████████████████████████████▌                                                                      | 125/348 [30:49<51:37, 13.89s/it] 36%|███████████████████████████████████████▊                                                                      | 126/348 [31:02<50:53, 13.76s/it]                                                                                                                                                     {'loss': 0.0754, 'grad_norm': 0.71484375, 'learning_rate': 7.405509718362842e-06, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 94.82, 'tokens_per_second_per_gpu': 2953.81, 'epoch': 0.72}
 36%|███████████████████████████████████████▊                                                                      | 126/348 [31:02<50:53, 13.76s/it] 36%|████████████████████████████████████████▏                                                                     | 127/348 [31:16<50:59, 13.84s/it]                                                                                                                                                     {'loss': 0.0189, 'grad_norm': 0.216796875, 'learning_rate': 7.364664930932385e-06, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 94.82, 'tokens_per_second_per_gpu': 2673.89, 'epoch': 0.73}
 36%|████████████████████████████████████████▏                                                                     | 127/348 [31:16<50:59, 13.84s/it] 37%|████████████████████████████████████████▍                                                                     | 128/348 [31:30<50:22, 13.74s/it]                                                                                                                                                     {'loss': 0.0288, 'grad_norm': 0.322265625, 'learning_rate': 7.323615860218844e-06, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 94.82, 'tokens_per_second_per_gpu': 2798.32, 'epoch': 0.73}
 37%|████████████████████████████████████████▍                                                                     | 128/348 [31:30<50:22, 13.74s/it] 37%|████████████████████████████████████████▊                                                                     | 129/348 [31:43<49:56, 13.68s/it]                                                                                                                                                     {'loss': 0.0181, 'grad_norm': 0.232421875, 'learning_rate': 7.282366052449351e-06, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 94.82, 'tokens_per_second_per_gpu': 2975.23, 'epoch': 0.74}
 37%|████████████████████████████████████████▊                                                                     | 129/348 [31:43<49:56, 13.68s/it] 37%|█████████████████████████████████████████                                                                     | 130/348 [31:57<50:01, 13.77s/it]                                                                                                                                                     {'loss': 0.013, 'grad_norm': 0.1787109375, 'learning_rate': 7.2409190711927015e-06, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 94.82, 'tokens_per_second_per_gpu': 2568.09, 'epoch': 0.74}
 37%|█████████████████████████████████████████                                                                     | 130/348 [31:57<50:01, 13.77s/it] 38%|█████████████████████████████████████████▍                                                                    | 131/348 [32:11<49:54, 13.80s/it]                                                                                                                                                     {'loss': 0.0173, 'grad_norm': 0.22265625, 'learning_rate': 7.199278497051498e-06, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 94.82, 'tokens_per_second_per_gpu': 2476.61, 'epoch': 0.75}
 38%|█████████████████████████████████████████▍                                                                    | 131/348 [32:11<49:54, 13.80s/it] 38%|█████████████████████████████████████████▋                                                                    | 132/348 [32:25<49:38, 13.79s/it]                                                                                                                                                     {'loss': 0.0213, 'grad_norm': 0.291015625, 'learning_rate': 7.157447927352821e-06, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 94.82, 'tokens_per_second_per_gpu': 2714.73, 'epoch': 0.75}
 38%|█████████████████████████████████████████▋                                                                    | 132/348 [32:25<49:38, 13.79s/it] 38%|██████████████████████████████████████████                                                                    | 133/348 [32:38<49:20, 13.77s/it]                                                                                                                                                     {'loss': 0.0218, 'grad_norm': 0.248046875, 'learning_rate': 7.115430975837457e-06, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 94.82, 'tokens_per_second_per_gpu': 2486.93, 'epoch': 0.76}
 38%|██████████████████████████████████████████                                                                    | 133/348 [32:38<49:20, 13.77s/it] 39%|██████████████████████████████████████████▎                                                                   | 134/348 [32:52<49:25, 13.86s/it]                                                                                                                                                     {'loss': 0.0264, 'grad_norm': 0.205078125, 'learning_rate': 7.073231272347714e-06, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 94.82, 'tokens_per_second_per_gpu': 2564.86, 'epoch': 0.77}
 39%|██████████████████████████████████████████▎                                                                   | 134/348 [32:52<49:25, 13.86s/it] 39%|██████████████████████████████████████████▋                                                                   | 135/348 [33:06<49:14, 13.87s/it]                                                                                                                                                     {'loss': 0.0154, 'grad_norm': 0.2060546875, 'learning_rate': 7.030852462513827e-06, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 94.82, 'tokens_per_second_per_gpu': 2190.97, 'epoch': 0.77}
 39%|██████████████████████████████████████████▋                                                                   | 135/348 [33:06<49:14, 13.87s/it] 39%|██████████████████████████████████████████▉                                                                   | 136/348 [33:20<48:27, 13.72s/it]                                                                                                                                                     {'loss': 0.0298, 'grad_norm': 0.455078125, 'learning_rate': 6.988298207439022e-06, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 94.82, 'tokens_per_second_per_gpu': 3016.94, 'epoch': 0.78}
 39%|██████████████████████████████████████████▉                                                                   | 136/348 [33:20<48:27, 13.72s/it] 39%|███████████████████████████████████████████▎                                                                  | 137/348 [33:33<48:03, 13.67s/it]                                                                                                                                                     {'loss': 0.084, 'grad_norm': 1.09375, 'learning_rate': 6.945572183383229e-06, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 94.82, 'tokens_per_second_per_gpu': 2727.31, 'epoch': 0.78}
 39%|███████████████████████████████████████████▎                                                                  | 137/348 [33:33<48:03, 13.67s/it] 40%|███████████████████████████████████████████▌                                                                  | 138/348 [33:47<47:50, 13.67s/it]                                                                                                                                                     {'loss': 0.0272, 'grad_norm': 0.2392578125, 'learning_rate': 6.902678081445495e-06, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 94.82, 'tokens_per_second_per_gpu': 2607.28, 'epoch': 0.79}
 40%|███████████████████████████████████████████▌                                                                  | 138/348 [33:47<47:50, 13.67s/it] 40%|███████████████████████████████████████████▉                                                                  | 139/348 [34:01<47:31, 13.64s/it]                                                                                                                                                     {'loss': 0.0556, 'grad_norm': 0.5546875, 'learning_rate': 6.859619607245102e-06, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 94.82, 'tokens_per_second_per_gpu': 2771.64, 'epoch': 0.79}
 40%|███████████████████████████████████████████▉                                                                  | 139/348 [34:01<47:31, 13.64s/it] 40%|████████████████████████████████████████████▎                                                                 | 140/348 [34:14<47:24, 13.68s/it]                                                                                                                                                     {'loss': 0.0207, 'grad_norm': 0.2265625, 'learning_rate': 6.816400480601445e-06, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 94.82, 'tokens_per_second_per_gpu': 2401.32, 'epoch': 0.8}
 40%|████████████████████████████████████████████▎                                                                 | 140/348 [34:14<47:24, 13.68s/it] 41%|████████████████████████████████████████████▌                                                                 | 141/348 [34:28<46:53, 13.59s/it]                                                                                                                                                     {'loss': 0.0703, 'grad_norm': 0.66796875, 'learning_rate': 6.773024435212678e-06, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 94.82, 'tokens_per_second_per_gpu': 3091.92, 'epoch': 0.81}
 41%|████████████████████████████████████████████▌                                                                 | 141/348 [34:28<46:53, 13.59s/it] 41%|████████████████████████████████████████████▉                                                                 | 142/348 [34:41<46:42, 13.61s/it]                                                                                                                                                     {'loss': 0.0279, 'grad_norm': 0.2373046875, 'learning_rate': 6.729495218333157e-06, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 94.82, 'tokens_per_second_per_gpu': 2746.16, 'epoch': 0.81}
 41%|████████████████████████████████████████████▉                                                                 | 142/348 [34:41<46:42, 13.61s/it] 41%|█████████████████████████████████████████████▏                                                                | 143/348 [34:56<47:05, 13.79s/it]                                                                                                                                                     {'loss': 0.0265, 'grad_norm': 0.259765625, 'learning_rate': 6.685816590449708e-06, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 94.82, 'tokens_per_second_per_gpu': 2282.45, 'epoch': 0.82}
 41%|█████████████████████████████████████████████▏                                                                | 143/348 [34:56<47:05, 13.79s/it] 41%|█████████████████████████████████████████████▌                                                                | 144/348 [35:09<46:44, 13.75s/it]                                                                                                                                                     {'loss': 0.0278, 'grad_norm': 0.2265625, 'learning_rate': 6.641992324956776e-06, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 94.82, 'tokens_per_second_per_gpu': 3038.36, 'epoch': 0.82}
 41%|█████████████████████████████████████████████▌                                                                | 144/348 [35:09<46:44, 13.75s/it] 42%|█████████████████████████████████████████████▊                                                                | 145/348 [35:23<46:25, 13.72s/it]                                                                                                                                                     {'loss': 0.022, 'grad_norm': 0.2236328125, 'learning_rate': 6.598026207830428e-06, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 94.82, 'tokens_per_second_per_gpu': 2970.55, 'epoch': 0.83}
 42%|█████████████████████████████████████████████▊                                                                | 145/348 [35:23<46:25, 13.72s/it] 42%|██████████████████████████████████████████████▏                                                               | 146/348 [35:36<46:04, 13.69s/it]                                                                                                                                                     {'loss': 0.0247, 'grad_norm': 0.2421875, 'learning_rate': 6.553922037301283e-06, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 94.82, 'tokens_per_second_per_gpu': 2920.1, 'epoch': 0.83}
 42%|██████████████████████████████████████████████▏                                                               | 146/348 [35:36<46:04, 13.69s/it] 42%|██████████████████████████████████████████████▍                                                               | 147/348 [35:50<45:42, 13.64s/it]                                                                                                                                                     {'loss': 0.0285, 'grad_norm': 0.24609375, 'learning_rate': 6.5096836235263904e-06, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 94.82, 'tokens_per_second_per_gpu': 2520.75, 'epoch': 0.84}
 42%|██████████████████████████████████████████████▍                                                               | 147/348 [35:50<45:42, 13.64s/it] 43%|██████████████████████████████████████████████▊                                                               | 148/348 [36:04<45:46, 13.73s/it]                                                                                                                                                     {'loss': 0.0354, 'grad_norm': 0.29296875, 'learning_rate': 6.465314788260067e-06, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 94.82, 'tokens_per_second_per_gpu': 2114.22, 'epoch': 0.85}
 43%|██████████████████████████████████████████████▊                                                               | 148/348 [36:04<45:46, 13.73s/it] 43%|███████████████████████████████████████████████                                                               | 149/348 [36:17<45:19, 13.67s/it]                                                                                                                                                     {'loss': 0.0203, 'grad_norm': 0.259765625, 'learning_rate': 6.4208193645237314e-06, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 94.82, 'tokens_per_second_per_gpu': 3177.33, 'epoch': 0.85}
 43%|███████████████████████████████████████████████                                                               | 149/348 [36:17<45:19, 13.67s/it] 43%|███████████████████████████████████████████████▍                                                              | 150/348 [36:31<44:48, 13.58s/it]                                                                                                                                                     {'loss': 0.0413, 'grad_norm': 0.333984375, 'learning_rate': 6.376201196274778e-06, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 94.82, 'tokens_per_second_per_gpu': 2889.73, 'epoch': 0.86}
 43%|███████████████████████████████████████████████▍                                                              | 150/348 [36:31<44:48, 13.58s/it] 43%|███████████████████████████████████████████████▋                                                              | 151/348 [36:44<44:35, 13.58s/it]                                                                                                                                                     {'loss': 0.0287, 'grad_norm': 0.275390625, 'learning_rate': 6.331464138074493e-06, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 94.82, 'tokens_per_second_per_gpu': 3087.28, 'epoch': 0.86}
 43%|███████████████████████████████████████████████▋                                                              | 151/348 [36:44<44:35, 13.58s/it] 44%|████████████████████████████████████████████████                                                              | 152/348 [36:58<44:47, 13.71s/it]                                                                                                                                                     {'loss': 0.0682, 'grad_norm': 0.7890625, 'learning_rate': 6.286612054755056e-06, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 94.82, 'tokens_per_second_per_gpu': 2073.56, 'epoch': 0.87}
 44%|████████████████████████████████████████████████                                                              | 152/348 [36:58<44:47, 13.71s/it] 44%|████████████████████████████████████████████████▎                                                             | 153/348 [37:12<44:31, 13.70s/it]                                                                                                                                                     {'loss': 0.0603, 'grad_norm': 0.458984375, 'learning_rate': 6.241648821085666e-06, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 94.82, 'tokens_per_second_per_gpu': 2501.33, 'epoch': 0.87}
 44%|████████████████████████████████████████████████▎                                                             | 153/348 [37:12<44:31, 13.70s/it] 44%|████████████████████████████████████████████████▋                                                             | 154/348 [37:26<44:38, 13.81s/it]                                                                                                                                                     {'loss': 0.0738, 'grad_norm': 0.546875, 'learning_rate': 6.1965783214377895e-06, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 94.82, 'tokens_per_second_per_gpu': 2625.96, 'epoch': 0.88}
 44%|████████████████████████████████████████████████▋                                                             | 154/348 [37:26<44:38, 13.81s/it] 45%|████████████████████████████████████████████████▉                                                             | 155/348 [37:40<44:32, 13.85s/it]                                                                                                                                                     {'loss': 0.0474, 'grad_norm': 0.361328125, 'learning_rate': 6.1514044494496e-06, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 94.82, 'tokens_per_second_per_gpu': 2711.4, 'epoch': 0.89}
 45%|████████████████████████████████████████████████▉                                                             | 155/348 [37:40<44:32, 13.85s/it] 45%|█████████████████████████████████████████████████▎                                                            | 156/348 [37:54<44:11, 13.81s/it]                                                                                                                                                     {'loss': 0.0393, 'grad_norm': 0.400390625, 'learning_rate': 6.106131107689599e-06, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 94.82, 'tokens_per_second_per_gpu': 2709.86, 'epoch': 0.89}
 45%|█████████████████████████████████████████████████▎                                                            | 156/348 [37:54<44:11, 13.81s/it] 45%|█████████████████████████████████████████████████▋                                                            | 157/348 [38:07<43:39, 13.71s/it]                                                                                                                                                     {'loss': 0.0365, 'grad_norm': 0.333984375, 'learning_rate': 6.060762207319479e-06, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 94.82, 'tokens_per_second_per_gpu': 2546.78, 'epoch': 0.9}
 45%|█████████████████████████████████████████████████▋                                                            | 157/348 [38:07<43:39, 13.71s/it] 45%|█████████████████████████████████████████████████▉                                                            | 158/348 [38:21<43:47, 13.83s/it]                                                                                                                                                     {'loss': 0.0474, 'grad_norm': 0.357421875, 'learning_rate': 6.015301667756234e-06, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 94.82, 'tokens_per_second_per_gpu': 2539.39, 'epoch': 0.9}
 45%|█████████████████████████████████████████████████▉                                                            | 158/348 [38:21<43:47, 13.83s/it] 46%|██████████████████████████████████████████████████▎                                                           | 159/348 [38:35<43:33, 13.83s/it]                                                                                                                                                     {'loss': 0.0204, 'grad_norm': 0.21875, 'learning_rate': 5.9697534163335645e-06, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 94.82, 'tokens_per_second_per_gpu': 2512.72, 'epoch': 0.91}
 46%|██████████████████████████████████████████████████▎                                                           | 159/348 [38:35<43:33, 13.83s/it] 46%|██████████████████████████████████████████████████▌                                                           | 160/348 [38:49<43:18, 13.82s/it]                                                                                                                                                     {'loss': 0.0652, 'grad_norm': 0.48046875, 'learning_rate': 5.924121387962594e-06, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 94.82, 'tokens_per_second_per_gpu': 2464.49, 'epoch': 0.91}
 46%|██████████████████████████████████████████████████▌                                                           | 160/348 [38:49<43:18, 13.82s/it] 46%|██████████████████████████████████████████████████▉                                                           | 161/348 [39:03<42:50, 13.75s/it]                                                                                                                                                     {'loss': 0.0327, 'grad_norm': 0.28515625, 'learning_rate': 5.878409524791931e-06, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 94.82, 'tokens_per_second_per_gpu': 2703.38, 'epoch': 0.92}
 46%|██████████████████████████████████████████████████▉                                                           | 161/348 [39:03<42:50, 13.75s/it] 47%|███████████████████████████████████████████████████▏                                                          | 162/348 [39:16<42:39, 13.76s/it]                                                                                                                                                     {'loss': 0.0409, 'grad_norm': 0.298828125, 'learning_rate': 5.83262177586711e-06, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 94.82, 'tokens_per_second_per_gpu': 2593.95, 'epoch': 0.93}
 47%|███████████████████████████████████████████████████▏                                                          | 162/348 [39:16<42:39, 13.76s/it] 47%|███████████████████████████████████████████████████▌                                                          | 163/348 [39:30<42:19, 13.73s/it]                                                                                                                                                     {'loss': 0.0189, 'grad_norm': 0.2314453125, 'learning_rate': 5.786762096789431e-06, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 94.82, 'tokens_per_second_per_gpu': 2815.0, 'epoch': 0.93}
 47%|███████████████████████████████████████████████████▌                                                          | 163/348 [39:30<42:19, 13.73s/it] 47%|███████████████████████████████████████████████████▊                                                          | 164/348 [39:44<41:56, 13.68s/it]                                                                                                                                                     {'loss': 0.0154, 'grad_norm': 0.208984375, 'learning_rate': 5.740834449374237e-06, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 94.82, 'tokens_per_second_per_gpu': 2975.02, 'epoch': 0.94}
 47%|███████████████████████████████████████████████████▊                                                          | 164/348 [39:44<41:56, 13.68s/it] 47%|████████████████████████████████████████████████████▏                                                         | 165/348 [39:57<41:34, 13.63s/it]                                                                                                                                                     {'loss': 0.0199, 'grad_norm': 0.265625, 'learning_rate': 5.694842801308651e-06, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 94.82, 'tokens_per_second_per_gpu': 2929.8, 'epoch': 0.94}
 47%|████████████████████████████████████████████████████▏                                                         | 165/348 [39:57<41:34, 13.63s/it] 48%|████████████████████████████████████████████████████▍                                                         | 166/348 [40:11<41:56, 13.83s/it]                                                                                                                                                     {'loss': 0.0346, 'grad_norm': 0.2890625, 'learning_rate': 5.648791125808809e-06, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 94.82, 'tokens_per_second_per_gpu': 2386.34, 'epoch': 0.95}
 48%|████████████████████████████████████████████████████▍                                                         | 166/348 [40:11<41:56, 13.83s/it] 48%|████████████████████████████████████████████████████▊                                                         | 167/348 [40:25<41:33, 13.77s/it]                                                                                                                                                     {'loss': 0.0882, 'grad_norm': 1.09375, 'learning_rate': 5.6026834012766155e-06, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 94.82, 'tokens_per_second_per_gpu': 2692.37, 'epoch': 0.95}
 48%|████████████████████████████████████████████████████▊                                                         | 167/348 [40:25<41:33, 13.77s/it] 48%|█████████████████████████████████████████████████████                                                         | 168/348 [40:38<40:58, 13.66s/it]                                                                                                                                                     {'loss': 0.0211, 'grad_norm': 0.2001953125, 'learning_rate': 5.556523610956049e-06, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 94.82, 'tokens_per_second_per_gpu': 3084.81, 'epoch': 0.96}
 48%|█████████████████████████████████████████████████████                                                         | 168/348 [40:38<40:58, 13.66s/it] 49%|█████████████████████████████████████████████████████▍                                                        | 169/348 [40:52<40:26, 13.56s/it]                                                                                                                                                     {'loss': 0.022, 'grad_norm': 0.2392578125, 'learning_rate': 5.510315742589042e-06, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 94.82, 'tokens_per_second_per_gpu': 3122.01, 'epoch': 0.97}
 49%|█████████████████████████████████████████████████████▍                                                        | 169/348 [40:52<40:26, 13.56s/it] 49%|█████████████████████████████████████████████████████▋                                                        | 170/348 [41:05<40:13, 13.56s/it]                                                                                                                                                     {'loss': 0.0761, 'grad_norm': 0.6484375, 'learning_rate': 5.464063788070996e-06, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 94.82, 'tokens_per_second_per_gpu': 2822.8, 'epoch': 0.97}
 49%|█████████████████████████████████████████████████████▋                                                        | 170/348 [41:05<40:13, 13.56s/it] 49%|██████████████████████████████████████████████████████                                                        | 171/348 [41:19<39:40, 13.45s/it]                                                                                                                                                     {'loss': 0.0615, 'grad_norm': 0.498046875, 'learning_rate': 5.417771743105908e-06, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 94.82, 'tokens_per_second_per_gpu': 2564.38, 'epoch': 0.98}
 49%|██████████████████████████████████████████████████████                                                        | 171/348 [41:19<39:40, 13.45s/it] 49%|██████████████████████████████████████████████████████▎                                                       | 172/348 [41:32<39:28, 13.46s/it]                                                                                                                                                     {'loss': 0.1264, 'grad_norm': 0.91796875, 'learning_rate': 5.371443606861186e-06, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 94.82, 'tokens_per_second_per_gpu': 2787.49, 'epoch': 0.98}
 49%|██████████████████████████████████████████████████████▎                                                       | 172/348 [41:32<39:28, 13.46s/it] 50%|██████████████████████████████████████████████████████▋                                                       | 173/348 [41:46<39:36, 13.58s/it]                                                                                                                                                     {'loss': 0.0841, 'grad_norm': 0.72265625, 'learning_rate': 5.325083381622165e-06, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 94.82, 'tokens_per_second_per_gpu': 2257.17, 'epoch': 0.99}
 50%|██████████████████████████████████████████████████████▋                                                       | 173/348 [41:46<39:36, 13.58s/it] 50%|███████████████████████████████████████████████████████                                                       | 174/348 [42:00<39:56, 13.77s/it]                                                                                                                                                     {'loss': 0.0647, 'grad_norm': 0.55859375, 'learning_rate': 5.278695072446342e-06, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 94.82, 'tokens_per_second_per_gpu': 2282.09, 'epoch': 0.99}
 50%|███████████████████████████████████████████████████████                                                       | 174/348 [42:00<39:56, 13.77s/it][2026-01-06 07:14:54,106] [INFO] [axolotl.core.trainers.base.evaluate:376] [PID:5347] Running evaluation step...
[2026-01-06 07:14:56,604] [DEBUG] [axolotl.utils.samplers.multipack.__len__:458] [PID:5347] generate_batches time: 0.7122600078582764
[2026-01-06 07:14:57,314] [DEBUG] [axolotl.utils.samplers.multipack.__len__:458] [PID:5347] generate_batches time: 0.7101740837097168
[2026-01-06 07:14:58,070] [DEBUG] [axolotl.utils.samplers.multipack.__len__:458] [PID:5347] generate_batches time: 0.755955696105957
[2026-01-06 07:14:58,820] [DEBUG] [axolotl.utils.samplers.multipack.__len__:458] [PID:5347] generate_batches time: 0.7490048408508301
[2026-01-06 07:14:59,097] [INFO] [axolotl.utils.samplers.multipack.calc_min_len:434] [PID:5347] gather_len_batches: [78, 78]

  0%|                                                                                                                         | 0/78 [00:00<?, ?it/s][A
  3%|██▉                                                                                                              | 2/78 [00:01<01:06,  1.15it/s][A
  4%|████▎                                                                                                            | 3/78 [00:03<01:27,  1.17s/it][A
  5%|█████▊                                                                                                           | 4/78 [00:04<01:37,  1.32s/it][A
  6%|███████▏                                                                                                         | 5/78 [00:06<01:47,  1.48s/it][A
  8%|████████▋                                                                                                        | 6/78 [00:08<01:52,  1.56s/it][A
  9%|██████████▏                                                                                                      | 7/78 [00:09<01:51,  1.57s/it][A
 10%|███████████▌                                                                                                     | 8/78 [00:11<01:52,  1.61s/it][A
 12%|█████████████                                                                                                    | 9/78 [00:13<01:51,  1.61s/it][A
 13%|██████████████▎                                                                                                 | 10/78 [00:14<01:50,  1.62s/it][A
 14%|███████████████▊                                                                                                | 11/78 [00:16<01:49,  1.63s/it][A
 15%|█████████████████▏                                                                                              | 12/78 [00:18<01:47,  1.63s/it][A
 17%|██████████████████▋                                                                                             | 13/78 [00:19<01:46,  1.64s/it][A
 18%|████████████████████                                                                                            | 14/78 [00:21<01:43,  1.62s/it][A
 19%|█████████████████████▌                                                                                          | 15/78 [00:23<01:41,  1.61s/it][A
 21%|██████████████████████▉                                                                                         | 16/78 [00:24<01:39,  1.61s/it][A
 22%|████████████████████████▍                                                                                       | 17/78 [00:26<01:40,  1.65s/it][A
 23%|█████████████████████████▊                                                                                      | 18/78 [00:27<01:37,  1.62s/it][A
 24%|███████████████████████████▎                                                                                    | 19/78 [00:29<01:36,  1.63s/it][A
 26%|████████████████████████████▋                                                                                   | 20/78 [00:31<01:34,  1.62s/it][A
 27%|██████████████████████████████▏                                                                                 | 21/78 [00:32<01:33,  1.64s/it][A
 28%|███████████████████████████████▌                                                                                | 22/78 [00:34<01:30,  1.62s/it][A
 29%|█████████████████████████████████                                                                               | 23/78 [00:35<01:27,  1.59s/it][A
 31%|██████████████████████████████████▍                                                                             | 24/78 [00:37<01:25,  1.58s/it][A
 32%|███████████████████████████████████▉                                                                            | 25/78 [00:39<01:23,  1.58s/it][A
 33%|█████████████████████████████████████▎                                                                          | 26/78 [00:40<01:22,  1.58s/it][A
 35%|██████████████████████████████████████▊                                                                         | 27/78 [00:42<01:20,  1.58s/it][A
 36%|████████████████████████████████████████▏                                                                       | 28/78 [00:43<01:19,  1.59s/it][A
 37%|█████████████████████████████████████████▋                                                                      | 29/78 [00:45<01:19,  1.62s/it][A
 38%|███████████████████████████████████████████                                                                     | 30/78 [00:47<01:17,  1.61s/it][A
 40%|████████████████████████████████████████████▌                                                                   | 31/78 [00:48<01:14,  1.58s/it][A
 41%|█████████████████████████████████████████████▉                                                                  | 32/78 [00:50<01:13,  1.59s/it][A
 42%|███████████████████████████████████████████████▍                                                                | 33/78 [00:51<01:12,  1.61s/it][A
 44%|████████████████████████████████████████████████▊                                                               | 34/78 [00:53<01:12,  1.64s/it][A
 45%|██████████████████████████████████████████████████▎                                                             | 35/78 [00:55<01:10,  1.65s/it][A
 46%|███████████████████████████████████████████████████▋                                                            | 36/78 [00:56<01:08,  1.62s/it][A
 47%|█████████████████████████████████████████████████████▏                                                          | 37/78 [00:58<01:06,  1.63s/it][A
 49%|██████████████████████████████████████████████████████▌                                                         | 38/78 [01:00<01:04,  1.62s/it][A
 50%|████████████████████████████████████████████████████████                                                        | 39/78 [01:01<01:02,  1.60s/it][A
 51%|█████████████████████████████████████████████████████████▍                                                      | 40/78 [01:03<01:01,  1.61s/it][A
 53%|██████████████████████████████████████████████████████████▊                                                     | 41/78 [01:05<01:00,  1.65s/it][A
 54%|████████████████████████████████████████████████████████████▎                                                   | 42/78 [01:06<00:58,  1.64s/it][A
 55%|█████████████████████████████████████████████████████████████▋                                                  | 43/78 [01:08<00:57,  1.66s/it][A
 56%|███████████████████████████████████████████████████████████████▏                                                | 44/78 [01:10<00:57,  1.68s/it][A
 58%|████████████████████████████████████████████████████████████████▌                                               | 45/78 [01:11<00:56,  1.71s/it][A
 59%|██████████████████████████████████████████████████████████████████                                              | 46/78 [01:13<00:54,  1.71s/it][A
 60%|███████████████████████████████████████████████████████████████████▍                                            | 47/78 [01:15<00:53,  1.72s/it][A
 62%|████████████████████████████████████████████████████████████████████▉                                           | 48/78 [01:17<00:51,  1.70s/it][A
 63%|██████████████████████████████████████████████████████████████████████▎                                         | 49/78 [01:18<00:48,  1.68s/it][A
 64%|███████████████████████████████████████████████████████████████████████▊                                        | 50/78 [01:20<00:46,  1.67s/it][A
 65%|█████████████████████████████████████████████████████████████████████████▏                                      | 51/78 [01:21<00:45,  1.67s/it][A
 67%|██████████████████████████████████████████████████████████████████████████▋                                     | 52/78 [01:23<00:43,  1.66s/it][A
 68%|████████████████████████████████████████████████████████████████████████████                                    | 53/78 [01:25<00:42,  1.68s/it][A
 69%|█████████████████████████████████████████████████████████████████████████████▌                                  | 54/78 [01:26<00:39,  1.65s/it][A
 71%|██████████████████████████████████████████████████████████████████████████████▉                                 | 55/78 [01:28<00:37,  1.64s/it][A
 72%|████████████████████████████████████████████████████████████████████████████████▍                               | 56/78 [01:30<00:35,  1.61s/it][A
 73%|█████████████████████████████████████████████████████████████████████████████████▊                              | 57/78 [01:31<00:33,  1.60s/it][A
 74%|███████████████████████████████████████████████████████████████████████████████████▎                            | 58/78 [01:33<00:31,  1.59s/it][A
 76%|████████████████████████████████████████████████████████████████████████████████████▋                           | 59/78 [01:34<00:29,  1.58s/it][A
 77%|██████████████████████████████████████████████████████████████████████████████████████▏                         | 60/78 [01:36<00:28,  1.58s/it][A
 78%|███████████████████████████████████████████████████████████████████████████████████████▌                        | 61/78 [01:37<00:26,  1.57s/it][A
 79%|█████████████████████████████████████████████████████████████████████████████████████████                       | 62/78 [01:39<00:25,  1.57s/it][A
 81%|██████████████████████████████████████████████████████████████████████████████████████████▍                     | 63/78 [01:41<00:24,  1.61s/it][A
 82%|███████████████████████████████████████████████████████████████████████████████████████████▉                    | 64/78 [01:42<00:22,  1.61s/it][A
 83%|█████████████████████████████████████████████████████████████████████████████████████████████▎                  | 65/78 [01:44<00:21,  1.62s/it][A
 85%|██████████████████████████████████████████████████████████████████████████████████████████████▊                 | 66/78 [01:45<00:19,  1.60s/it][A
 86%|████████████████████████████████████████████████████████████████████████████████████████████████▏               | 67/78 [01:47<00:17,  1.62s/it][A
 87%|█████████████████████████████████████████████████████████████████████████████████████████████████▋              | 68/78 [01:49<00:16,  1.63s/it][A
 88%|███████████████████████████████████████████████████████████████████████████████████████████████████             | 69/78 [01:50<00:14,  1.64s/it][A
 90%|████████████████████████████████████████████████████████████████████████████████████████████████████▌           | 70/78 [01:52<00:13,  1.66s/it][A
 91%|█████████████████████████████████████████████████████████████████████████████████████████████████████▉          | 71/78 [01:54<00:11,  1.67s/it][A
 92%|███████████████████████████████████████████████████████████████████████████████████████████████████████▍        | 72/78 [01:56<00:09,  1.67s/it][A
 94%|████████████████████████████████████████████████████████████████████████████████████████████████████████▊       | 73/78 [01:57<00:08,  1.65s/it][A
 95%|██████████████████████████████████████████████████████████████████████████████████████████████████████████▎     | 74/78 [01:59<00:06,  1.63s/it][A
 96%|███████████████████████████████████████████████████████████████████████████████████████████████████████████▋    | 75/78 [02:00<00:04,  1.63s/it][A
 97%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████▏  | 76/78 [02:02<00:03,  1.64s/it][A
 99%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████▌ | 77/78 [02:04<00:01,  1.63s/it][A
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 78/78 [02:05<00:00,  1.63s/it][A                                                                                                                                                     
                                                                                                                                                     [A{'eval_loss': 0.002111113630235195, 'eval_runtime': 128.6218, 'eval_samples_per_second': 1.446, 'eval_steps_per_second': 0.723, 'memory/max_active (GiB)': 85.95, 'memory/max_allocated (GiB)': 82.72, 'memory/device_reserved (GiB)': 106.04, 'epoch': 0.99}
 50%|███████████████████████████████████████████████████████                                                       | 174/348 [44:14<39:56, 13.77s/it]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 78/78 [02:06<00:00,  1.63s/it][A
                                                                                                                                                     [A[2026-01-06 07:17:14,516] [INFO] [axolotl.core.trainers.base._save:671] [PID:5347] Saving model checkpoint to /workspace/data/model-output-base/checkpoint-174
 50%|██████████████████████████████████████████████████████▎                                                     | 175/348 [46:10<4:04:17, 84.72s/it]                                                                                                                                                     {'loss': 0.0854, 'grad_norm': 0.5546875, 'learning_rate': 5.232282686817392e-06, 'memory/max_active (GiB)': 90.74, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 93.35, 'tokens_per_second_per_gpu': 3813.99, 'epoch': 1.01}
 50%|██████████████████████████████████████████████████████▎                                                     | 175/348 [46:10<4:04:17, 84.72s/it] 51%|██████████████████████████████████████████████████████▌                                                     | 176/348 [46:24<3:01:30, 63.32s/it]                                                                                                                                                     {'loss': 0.057, 'grad_norm': 0.63671875, 'learning_rate': 5.185850234298943e-06, 'memory/max_active (GiB)': 90.74, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 93.36, 'tokens_per_second_per_gpu': 3182.1, 'epoch': 1.01}
 51%|██████████████████████████████████████████████████████▌                                                     | 176/348 [46:24<3:01:30, 63.32s/it] 51%|██████████████████████████████████████████████████████▉                                                     | 177/348 [46:37<2:17:56, 48.40s/it]                                                                                                                                                     {'loss': 0.0188, 'grad_norm': 0.224609375, 'learning_rate': 5.139401726188208e-06, 'memory/max_active (GiB)': 90.74, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 93.36, 'tokens_per_second_per_gpu': 2576.31, 'epoch': 1.02}
 51%|██████████████████████████████████████████████████████▉                                                     | 177/348 [46:37<2:17:56, 48.40s/it] 51%|███████████████████████████████████████████████████████▏                                                    | 178/348 [46:51<1:47:24, 37.91s/it]                                                                                                                                                     {'loss': 0.1055, 'grad_norm': 1.6328125, 'learning_rate': 5.09294117516944e-06, 'memory/max_active (GiB)': 90.74, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 93.36, 'tokens_per_second_per_gpu': 2746.03, 'epoch': 1.02}
 51%|███████████████████████████████████████████████████████▏                                                    | 178/348 [46:51<1:47:24, 37.91s/it] 51%|███████████████████████████████████████████████████████▌                                                    | 179/348 [47:04<1:26:05, 30.56s/it]                                                                                                                                                     {'loss': 0.0244, 'grad_norm': 0.2392578125, 'learning_rate': 5.046472594967279e-06, 'memory/max_active (GiB)': 90.74, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 93.36, 'tokens_per_second_per_gpu': 3003.96, 'epoch': 1.03}
 51%|███████████████████████████████████████████████████████▌                                                    | 179/348 [47:04<1:26:05, 30.56s/it] 52%|███████████████████████████████████████████████████████▊                                                    | 180/348 [47:18<1:11:30, 25.54s/it]                                                                                                                                                     {'loss': 0.022, 'grad_norm': 0.224609375, 'learning_rate': 5e-06, 'memory/max_active (GiB)': 90.74, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 93.36, 'tokens_per_second_per_gpu': 2422.53, 'epoch': 1.03}
 52%|███████████████████████████████████████████████████████▊                                                    | 180/348 [47:18<1:11:30, 25.54s/it] 52%|████████████████████████████████████████████████████████▏                                                   | 181/348 [47:31<1:00:56, 21.90s/it]                                                                                                                                                     {'loss': 0.0251, 'grad_norm': 0.255859375, 'learning_rate': 4.953527405032723e-06, 'memory/max_active (GiB)': 90.74, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 93.36, 'tokens_per_second_per_gpu': 2945.71, 'epoch': 1.04}
 52%|████████████████████████████████████████████████████████▏                                                   | 181/348 [47:31<1:00:56, 21.90s/it] 52%|█████████████████████████████████████████████████████████▌                                                    | 182/348 [47:45<53:41, 19.41s/it]                                                                                                                                                     {'loss': 0.0141, 'grad_norm': 0.1953125, 'learning_rate': 4.90705882483056e-06, 'memory/max_active (GiB)': 90.74, 'memory/max_allocated (GiB)': 89.15, 'memory/device_reserved (GiB)': 93.36, 'tokens_per_second_per_gpu': 2853.32, 'epoch': 1.05}
 52%|█████████████████████████████████████████████████████████▌                                                    | 182/348 [47:45<53:41, 19.41s/it] 53%|█████████████████████████████████████████████████████████▊                                                    | 183/348 [47:58<48:27, 17.62s/it]                                                                                                                                                     {'loss': 0.0264, 'grad_norm': 0.232421875, 'learning_rate': 4.860598273811793e-06, 'memory/max_active (GiB)': 90.74, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 93.36, 'tokens_per_second_per_gpu': 2963.96, 'epoch': 1.05}
 53%|█████████████████████████████████████████████████████████▊                                                    | 183/348 [47:58<48:27, 17.62s/it] 53%|██████████████████████████████████████████████████████████▏                                                   | 184/348 [48:12<44:46, 16.38s/it]                                                                                                                                                     {'loss': 0.0546, 'grad_norm': 0.7890625, 'learning_rate': 4.814149765701059e-06, 'memory/max_active (GiB)': 90.74, 'memory/max_allocated (GiB)': 89.15, 'memory/device_reserved (GiB)': 93.36, 'tokens_per_second_per_gpu': 2697.48, 'epoch': 1.06}
 53%|██████████████████████████████████████████████████████████▏                                                   | 184/348 [48:12<44:46, 16.38s/it] 53%|██████████████████████████████████████████████████████████▍                                                   | 185/348 [48:25<41:52, 15.41s/it]                                                                                                                                                     {'loss': 0.0652, 'grad_norm': 0.48046875, 'learning_rate': 4.767717313182611e-06, 'memory/max_active (GiB)': 90.74, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 93.36, 'tokens_per_second_per_gpu': 2900.77, 'epoch': 1.06}
 53%|██████████████████████████████████████████████████████████▍                                                   | 185/348 [48:25<41:52, 15.41s/it] 53%|██████████████████████████████████████████████████████████▊                                                   | 186/348 [48:39<40:36, 15.04s/it]                                                                                                                                                     {'loss': 0.0462, 'grad_norm': 0.40625, 'learning_rate': 4.721304927553659e-06, 'memory/max_active (GiB)': 90.74, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 93.36, 'tokens_per_second_per_gpu': 2355.42, 'epoch': 1.07}
 53%|██████████████████████████████████████████████████████████▊                                                   | 186/348 [48:39<40:36, 15.04s/it] 54%|███████████████████████████████████████████████████████████                                                   | 187/348 [48:53<39:03, 14.55s/it]                                                                                                                                                     {'loss': 0.0749, 'grad_norm': 0.62109375, 'learning_rate': 4.6749166183778375e-06, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 93.36, 'tokens_per_second_per_gpu': 2675.04, 'epoch': 1.07}
 54%|███████████████████████████████████████████████████████████                                                   | 187/348 [48:53<39:03, 14.55s/it] 54%|███████████████████████████████████████████████████████████▍                                                  | 188/348 [49:07<38:28, 14.43s/it]                                                                                                                                                     {'loss': 0.0212, 'grad_norm': 0.197265625, 'learning_rate': 4.628556393138816e-06, 'memory/max_active (GiB)': 90.74, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 93.36, 'tokens_per_second_per_gpu': 2738.01, 'epoch': 1.08}
 54%|███████████████████████████████████████████████████████████▍                                                  | 188/348 [49:07<38:28, 14.43s/it] 54%|███████████████████████████████████████████████████████████▋                                                  | 189/348 [49:21<37:45, 14.25s/it]                                                                                                                                                     {'loss': 0.0585, 'grad_norm': 0.439453125, 'learning_rate': 4.582228256894093e-06, 'memory/max_active (GiB)': 90.74, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 93.36, 'tokens_per_second_per_gpu': 2383.44, 'epoch': 1.09}
 54%|███████████████████████████████████████████████████████████▋                                                  | 189/348 [49:21<37:45, 14.25s/it] 55%|████████████████████████████████████████████████████████████                                                  | 190/348 [49:35<37:32, 14.26s/it]                                                                                                                                                     {'loss': 0.0315, 'grad_norm': 0.283203125, 'learning_rate': 4.535936211929005e-06, 'memory/max_active (GiB)': 90.74, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 93.36, 'tokens_per_second_per_gpu': 2199.03, 'epoch': 1.09}
 55%|████████████████████████████████████████████████████████████                                                  | 190/348 [49:35<37:32, 14.26s/it] 55%|████████████████████████████████████████████████████████████▎                                                 | 191/348 [49:49<36:58, 14.13s/it]                                                                                                                                                     {'loss': 0.016, 'grad_norm': 0.22265625, 'learning_rate': 4.489684257410959e-06, 'memory/max_active (GiB)': 90.74, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 93.36, 'tokens_per_second_per_gpu': 2496.83, 'epoch': 1.1}
 55%|████████████████████████████████████████████████████████████▎                                                 | 191/348 [49:49<36:58, 14.13s/it] 55%|████████████████████████████████████████████████████████████▋                                                 | 192/348 [50:02<36:18, 13.97s/it]                                                                                                                                                     {'loss': 0.0695, 'grad_norm': 0.71484375, 'learning_rate': 4.443476389043955e-06, 'memory/max_active (GiB)': 90.74, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 93.36, 'tokens_per_second_per_gpu': 2501.77, 'epoch': 1.1}
 55%|████████████████████████████████████████████████████████████▋                                                 | 192/348 [50:02<36:18, 13.97s/it] 55%|█████████████████████████████████████████████████████████████                                                 | 193/348 [50:17<36:14, 14.03s/it]                                                                                                                                                     {'loss': 0.0585, 'grad_norm': 0.484375, 'learning_rate': 4.397316598723385e-06, 'memory/max_active (GiB)': 90.74, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 93.36, 'tokens_per_second_per_gpu': 2558.66, 'epoch': 1.11}
 55%|█████████████████████████████████████████████████████████████                                                 | 193/348 [50:17<36:14, 14.03s/it] 56%|█████████████████████████████████████████████████████████████▎                                                | 194/348 [50:30<35:58, 14.01s/it]                                                                                                                                                     {'loss': 0.085, 'grad_norm': 0.67578125, 'learning_rate': 4.351208874191192e-06, 'memory/max_active (GiB)': 90.74, 'memory/max_allocated (GiB)': 89.15, 'memory/device_reserved (GiB)': 93.36, 'tokens_per_second_per_gpu': 2466.52, 'epoch': 1.11}
 56%|█████████████████████████████████████████████████████████████▎                                                | 194/348 [50:31<35:58, 14.01s/it] 56%|█████████████████████████████████████████████████████████████▋                                                | 195/348 [50:44<35:21, 13.87s/it]                                                                                                                                                     {'loss': 0.072, 'grad_norm': 0.625, 'learning_rate': 4.305157198691351e-06, 'memory/max_active (GiB)': 90.74, 'memory/max_allocated (GiB)': 89.15, 'memory/device_reserved (GiB)': 93.36, 'tokens_per_second_per_gpu': 2468.15, 'epoch': 1.12}
 56%|█████████████████████████████████████████████████████████████▋                                                | 195/348 [50:44<35:21, 13.87s/it] 56%|█████████████████████████████████████████████████████████████▉                                                | 196/348 [50:58<35:12, 13.90s/it]                                                                                                                                                     {'loss': 0.0215, 'grad_norm': 0.2314453125, 'learning_rate': 4.259165550625765e-06, 'memory/max_active (GiB)': 90.74, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 93.36, 'tokens_per_second_per_gpu': 2683.97, 'epoch': 1.13}
 56%|█████████████████████████████████████████████████████████████▉                                                | 196/348 [50:58<35:12, 13.90s/it] 57%|██████████████████████████████████████████████████████████████▎                                               | 197/348 [51:12<34:42, 13.79s/it]                                                                                                                                                     {'loss': 0.0198, 'grad_norm': 0.2255859375, 'learning_rate': 4.2132379032105695e-06, 'memory/max_active (GiB)': 90.74, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 93.36, 'tokens_per_second_per_gpu': 3125.87, 'epoch': 1.13}
 57%|██████████████████████████████████████████████████████████████▎                                               | 197/348 [51:12<34:42, 13.79s/it] 57%|██████████████████████████████████████████████████████████████▌                                               | 198/348 [51:25<34:02, 13.61s/it]                                                                                                                                                     {'loss': 0.0176, 'grad_norm': 0.2265625, 'learning_rate': 4.167378224132891e-06, 'memory/max_active (GiB)': 90.74, 'memory/max_allocated (GiB)': 89.15, 'memory/device_reserved (GiB)': 93.36, 'tokens_per_second_per_gpu': 3378.56, 'epoch': 1.14}
 57%|██████████████████████████████████████████████████████████████▌                                               | 198/348 [51:25<34:02, 13.61s/it] 57%|██████████████████████████████████████████████████████████████▉                                               | 199/348 [51:38<33:42, 13.57s/it]                                                                                                                                                     {'loss': 0.0313, 'grad_norm': 0.306640625, 'learning_rate': 4.121590475208071e-06, 'memory/max_active (GiB)': 90.74, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 93.36, 'tokens_per_second_per_gpu': 3219.03, 'epoch': 1.14}
 57%|██████████████████████████████████████████████████████████████▉                                               | 199/348 [51:38<33:42, 13.57s/it] 57%|███████████████████████████████████████████████████████████████▏                                              | 200/348 [51:52<33:41, 13.66s/it]                                                                                                                                                     {'loss': 0.0173, 'grad_norm': 0.1826171875, 'learning_rate': 4.075878612037408e-06, 'memory/max_active (GiB)': 90.74, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 93.36, 'tokens_per_second_per_gpu': 2676.58, 'epoch': 1.15}
 57%|███████████████████████████████████████████████████████████████▏                                              | 200/348 [51:52<33:41, 13.66s/it] 58%|███████████████████████████████████████████████████████████████▌                                              | 201/348 [52:05<33:14, 13.57s/it]                                                                                                                                                     {'loss': 0.0241, 'grad_norm': 0.259765625, 'learning_rate': 4.030246583666437e-06, 'memory/max_active (GiB)': 90.74, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 93.36, 'tokens_per_second_per_gpu': 3267.56, 'epoch': 1.15}
 58%|███████████████████████████████████████████████████████████████▌                                              | 201/348 [52:05<33:14, 13.57s/it] 58%|███████████████████████████████████████████████████████████████▊                                              | 202/348 [52:19<33:20, 13.70s/it]                                                                                                                                                     {'loss': 0.0159, 'grad_norm': 0.1875, 'learning_rate': 3.984698332243767e-06, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 93.36, 'tokens_per_second_per_gpu': 2655.01, 'epoch': 1.16}
 58%|███████████████████████████████████████████████████████████████▊                                              | 202/348 [52:19<33:20, 13.70s/it] 58%|████████████████████████████████████████████████████████████████▏                                             | 203/348 [52:33<32:53, 13.61s/it]                                                                                                                                                     {'loss': 0.0275, 'grad_norm': 0.259765625, 'learning_rate': 3.9392377926805226e-06, 'memory/max_active (GiB)': 90.75, 'memory/max_allocated (GiB)': 89.15, 'memory/device_reserved (GiB)': 93.36, 'tokens_per_second_per_gpu': 2698.83, 'epoch': 1.17}
 58%|████████████████████████████████████████████████████████████████▏                                             | 203/348 [52:33<32:53, 13.61s/it] 59%|████████████████████████████████████████████████████████████████▍                                             | 204/348 [52:46<32:42, 13.63s/it]                                                                                                                                                     {'loss': 0.0565, 'grad_norm': 0.53125, 'learning_rate': 3.8938688923104015e-06, 'memory/max_active (GiB)': 90.74, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 93.36, 'tokens_per_second_per_gpu': 2555.03, 'epoch': 1.17}
 59%|████████████████████████████████████████████████████████████████▍                                             | 204/348 [52:47<32:42, 13.63s/it] 59%|████████████████████████████████████████████████████████████████▊                                             | 205/348 [53:01<32:50, 13.78s/it]                                                                                                                                                     {'loss': 0.0431, 'grad_norm': 0.380859375, 'learning_rate': 3.848595550550401e-06, 'memory/max_active (GiB)': 90.74, 'memory/max_allocated (GiB)': 89.15, 'memory/device_reserved (GiB)': 93.36, 'tokens_per_second_per_gpu': 2488.85, 'epoch': 1.18}
 59%|████████████████████████████████████████████████████████████████▊                                             | 205/348 [53:01<32:50, 13.78s/it] 59%|█████████████████████████████████████████████████████████████████                                             | 206/348 [53:15<32:42, 13.82s/it]                                                                                                                                                     {'loss': 0.0158, 'grad_norm': 0.21875, 'learning_rate': 3.803421678562213e-06, 'memory/max_active (GiB)': 90.74, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 93.36, 'tokens_per_second_per_gpu': 2637.72, 'epoch': 1.18}
 59%|█████████████████████████████████████████████████████████████████                                             | 206/348 [53:15<32:42, 13.82s/it] 59%|█████████████████████████████████████████████████████████████████▍                                            | 207/348 [53:28<32:25, 13.80s/it]                                                                                                                                                     {'loss': 0.0585, 'grad_norm': 0.466796875, 'learning_rate': 3.758351178914336e-06, 'memory/max_active (GiB)': 90.74, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 93.36, 'tokens_per_second_per_gpu': 2809.21, 'epoch': 1.19}
 59%|█████████████████████████████████████████████████████████████████▍                                            | 207/348 [53:28<32:25, 13.80s/it] 60%|█████████████████████████████████████████████████████████████████▋                                            | 208/348 [53:42<32:17, 13.84s/it]                                                                                                                                                     {'loss': 0.0462, 'grad_norm': 0.365234375, 'learning_rate': 3.713387945244945e-06, 'memory/max_active (GiB)': 90.74, 'memory/max_allocated (GiB)': 89.15, 'memory/device_reserved (GiB)': 93.36, 'tokens_per_second_per_gpu': 2882.43, 'epoch': 1.19}
 60%|█████████████████████████████████████████████████████████████████▋                                            | 208/348 [53:42<32:17, 13.84s/it] 60%|██████████████████████████████████████████████████████████████████                                            | 209/348 [53:56<32:02, 13.83s/it]                                                                                                                                                     {'loss': 0.024, 'grad_norm': 0.234375, 'learning_rate': 3.668535861925509e-06, 'memory/max_active (GiB)': 90.74, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 93.36, 'tokens_per_second_per_gpu': 2794.92, 'epoch': 1.2}
 60%|██████████████████████████████████████████████████████████████████                                            | 209/348 [53:56<32:02, 13.83s/it] 60%|██████████████████████████████████████████████████████████████████▍                                           | 210/348 [54:10<31:44, 13.80s/it]                                                                                                                                                     {'loss': 0.0242, 'grad_norm': 0.244140625, 'learning_rate': 3.623798803725223e-06, 'memory/max_active (GiB)': 90.74, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 93.36, 'tokens_per_second_per_gpu': 2481.22, 'epoch': 1.21}
 60%|██████████████████████████████████████████████████████████████████▍                                           | 210/348 [54:10<31:44, 13.80s/it] 61%|██████████████████████████████████████████████████████████████████▋                                           | 211/348 [54:24<31:30, 13.80s/it]                                                                                                                                                     {'loss': 0.0413, 'grad_norm': 0.392578125, 'learning_rate': 3.5791806354762702e-06, 'memory/max_active (GiB)': 90.74, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 93.36, 'tokens_per_second_per_gpu': 2771.5, 'epoch': 1.21}
 61%|██████████████████████████████████████████████████████████████████▋                                           | 211/348 [54:24<31:30, 13.80s/it] 61%|███████████████████████████████████████████████████████████████████                                           | 212/348 [54:37<31:04, 13.71s/it]                                                                                                                                                     {'loss': 0.0509, 'grad_norm': 0.59375, 'learning_rate': 3.534685211739935e-06, 'memory/max_active (GiB)': 90.74, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 93.36, 'tokens_per_second_per_gpu': 2543.46, 'epoch': 1.22}
 61%|███████████████████████████████████████████████████████████████████                                           | 212/348 [54:37<31:04, 13.71s/it] 61%|███████████████████████████████████████████████████████████████████▎                                          | 213/348 [54:50<30:36, 13.61s/it]                                                                                                                                                     {'loss': 0.0365, 'grad_norm': 0.39453125, 'learning_rate': 3.4903163764736104e-06, 'memory/max_active (GiB)': 90.74, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 93.36, 'tokens_per_second_per_gpu': 2874.76, 'epoch': 1.22}
 61%|███████████████████████████████████████████████████████████████████▎                                          | 213/348 [54:50<30:36, 13.61s/it] 61%|███████████████████████████████████████████████████████████████████▋                                          | 214/348 [55:04<30:23, 13.61s/it]                                                                                                                                                     {'loss': 0.0238, 'grad_norm': 0.2421875, 'learning_rate': 3.4460779626987186e-06, 'memory/max_active (GiB)': 90.74, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 93.36, 'tokens_per_second_per_gpu': 2439.67, 'epoch': 1.23}
 61%|███████████████████████████████████████████████████████████████████▋                                          | 214/348 [55:04<30:23, 13.61s/it] 62%|███████████████████████████████████████████████████████████████████▉                                          | 215/348 [55:18<30:06, 13.58s/it]                                                                                                                                                     {'loss': 0.0221, 'grad_norm': 0.3203125, 'learning_rate': 3.401973792169574e-06, 'memory/max_active (GiB)': 90.74, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 93.36, 'tokens_per_second_per_gpu': 3236.53, 'epoch': 1.23}
 62%|███████████████████████████████████████████████████████████████████▉                                          | 215/348 [55:18<30:06, 13.58s/it] 62%|████████████████████████████████████████████████████████████████████▎                                         | 216/348 [55:32<30:08, 13.70s/it]                                                                                                                                                     {'loss': 0.0157, 'grad_norm': 0.2119140625, 'learning_rate': 3.3580076750432244e-06, 'memory/max_active (GiB)': 90.74, 'memory/max_allocated (GiB)': 89.15, 'memory/device_reserved (GiB)': 93.36, 'tokens_per_second_per_gpu': 2548.34, 'epoch': 1.24}
 62%|████████████████████████████████████████████████████████████████████▎                                         | 216/348 [55:32<30:08, 13.70s/it] 62%|████████████████████████████████████████████████████████████████████▌                                         | 217/348 [55:45<29:56, 13.72s/it]                                                                                                                                                     {'loss': 0.0253, 'grad_norm': 0.24609375, 'learning_rate': 3.314183409550293e-06, 'memory/max_active (GiB)': 90.74, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 93.36, 'tokens_per_second_per_gpu': 2592.69, 'epoch': 1.25}
 62%|████████████████████████████████████████████████████████████████████▌                                         | 217/348 [55:45<29:56, 13.72s/it] 63%|████████████████████████████████████████████████████████████████████▉                                         | 218/348 [55:59<29:43, 13.72s/it]                                                                                                                                                     {'loss': 0.0142, 'grad_norm': 0.265625, 'learning_rate': 3.270504781666845e-06, 'memory/max_active (GiB)': 90.74, 'memory/max_allocated (GiB)': 89.15, 'memory/device_reserved (GiB)': 93.36, 'tokens_per_second_per_gpu': 2646.25, 'epoch': 1.25}
 63%|████████████████████████████████████████████████████████████████████▉                                         | 218/348 [55:59<29:43, 13.72s/it] 63%|█████████████████████████████████████████████████████████████████████▏                                        | 219/348 [56:13<29:50, 13.88s/it]                                                                                                                                                     {'loss': 0.0242, 'grad_norm': 0.25, 'learning_rate': 3.226975564787322e-06, 'memory/max_active (GiB)': 90.74, 'memory/max_allocated (GiB)': 89.15, 'memory/device_reserved (GiB)': 93.36, 'tokens_per_second_per_gpu': 2563.16, 'epoch': 1.26}
 63%|█████████████████████████████████████████████████████████████████████▏                                        | 219/348 [56:13<29:50, 13.88s/it] 63%|█████████████████████████████████████████████████████████████████████▌                                        | 220/348 [56:27<29:39, 13.90s/it]                                                                                                                                                     {'loss': 0.0385, 'grad_norm': 0.3046875, 'learning_rate': 3.1835995193985548e-06, 'memory/max_active (GiB)': 90.74, 'memory/max_allocated (GiB)': 89.15, 'memory/device_reserved (GiB)': 93.36, 'tokens_per_second_per_gpu': 2652.55, 'epoch': 1.26}
 63%|█████████████████████████████████████████████████████████████████████▌                                        | 220/348 [56:27<29:39, 13.90s/it] 64%|█████████████████████████████████████████████████████████████████████▊                                        | 221/348 [56:41<29:07, 13.76s/it]                                                                                                                                                     {'loss': 0.0171, 'grad_norm': 0.2265625, 'learning_rate': 3.140380392754901e-06, 'memory/max_active (GiB)': 90.74, 'memory/max_allocated (GiB)': 89.15, 'memory/device_reserved (GiB)': 93.36, 'tokens_per_second_per_gpu': 2995.49, 'epoch': 1.27}
 64%|█████████████████████████████████████████████████████████████████████▊                                        | 221/348 [56:41<29:07, 13.76s/it] 64%|██████████████████████████████████████████████████████████████████████▏                                       | 222/348 [56:54<28:44, 13.69s/it]                                                                                                                                                     {'loss': 0.0121, 'grad_norm': 0.1865234375, 'learning_rate': 3.0973219185545077e-06, 'memory/max_active (GiB)': 90.74, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 93.36, 'tokens_per_second_per_gpu': 2742.55, 'epoch': 1.27}
 64%|██████████████████████████████████████████████████████████████████████▏                                       | 222/348 [56:54<28:44, 13.69s/it] 64%|██████████████████████████████████████████████████████████████████████▍                                       | 223/348 [57:08<28:43, 13.79s/it]                                                                                                                                                     {'loss': 0.0118, 'grad_norm': 0.2041015625, 'learning_rate': 3.054427816616773e-06, 'memory/max_active (GiB)': 90.74, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 93.36, 'tokens_per_second_per_gpu': 2829.54, 'epoch': 1.28}
 64%|██████████████████████████████████████████████████████████████████████▍                                       | 223/348 [57:08<28:43, 13.79s/it] 64%|██████████████████████████████████████████████████████████████████████▊                                       | 224/348 [57:22<28:32, 13.81s/it]                                                                                                                                                     {'loss': 0.0221, 'grad_norm': 0.2265625, 'learning_rate': 3.0117017925609802e-06, 'memory/max_active (GiB)': 90.74, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 93.36, 'tokens_per_second_per_gpu': 2400.83, 'epoch': 1.29}
 64%|██████████████████████████████████████████████████████████████████████▊                                       | 224/348 [57:22<28:32, 13.81s/it] 65%|███████████████████████████████████████████████████████████████████████                                       | 225/348 [57:36<28:32, 13.92s/it]                                                                                                                                                     {'loss': 0.0491, 'grad_norm': 0.388671875, 'learning_rate': 2.969147537486175e-06, 'memory/max_active (GiB)': 90.74, 'memory/max_allocated (GiB)': 89.15, 'memory/device_reserved (GiB)': 93.36, 'tokens_per_second_per_gpu': 2440.06, 'epoch': 1.29}
 65%|███████████████████████████████████████████████████████████████████████                                       | 225/348 [57:36<28:32, 13.92s/it] 65%|███████████████████████████████████████████████████████████████████████▍                                      | 226/348 [57:50<28:14, 13.89s/it]                                                                                                                                                     {'loss': 0.0447, 'grad_norm': 0.625, 'learning_rate': 2.9267687276522876e-06, 'memory/max_active (GiB)': 90.74, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 93.36, 'tokens_per_second_per_gpu': 2592.61, 'epoch': 1.3}
 65%|███████████████████████████████████████████████████████████████████████▍                                      | 226/348 [57:50<28:14, 13.89s/it] 65%|███████████████████████████████████████████████████████████████████████▊                                      | 227/348 [58:04<27:47, 13.78s/it]                                                                                                                                                     {'loss': 0.0413, 'grad_norm': 0.357421875, 'learning_rate': 2.8845690241625437e-06, 'memory/max_active (GiB)': 90.74, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 93.36, 'tokens_per_second_per_gpu': 2911.69, 'epoch': 1.3}
 65%|███████████████████████████████████████████████████████████████████████▊                                      | 227/348 [58:04<27:47, 13.78s/it] 66%|████████████████████████████████████████████████████████████████████████                                      | 228/348 [58:17<27:29, 13.74s/it]                                                                                                                                                     {'loss': 0.0451, 'grad_norm': 0.640625, 'learning_rate': 2.842552072647182e-06, 'memory/max_active (GiB)': 90.74, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 93.36, 'tokens_per_second_per_gpu': 2555.15, 'epoch': 1.31}
 66%|████████████████████████████████████████████████████████████████████████                                      | 228/348 [58:17<27:29, 13.74s/it] 66%|████████████████████████████████████████████████████████████████████████▍                                     | 229/348 [58:31<27:03, 13.64s/it]                                                                                                                                                     {'loss': 0.0237, 'grad_norm': 0.236328125, 'learning_rate': 2.800721502948506e-06, 'memory/max_active (GiB)': 90.75, 'memory/max_allocated (GiB)': 89.15, 'memory/device_reserved (GiB)': 93.36, 'tokens_per_second_per_gpu': 2626.84, 'epoch': 1.31}
 66%|████████████████████████████████████████████████████████████████████████▍                                     | 229/348 [58:31<27:03, 13.64s/it] 66%|████████████████████████████████████████████████████████████████████████▋                                     | 230/348 [58:44<26:43, 13.59s/it]                                                                                                                                                     {'loss': 0.0232, 'grad_norm': 0.259765625, 'learning_rate': 2.7590809288073e-06, 'memory/max_active (GiB)': 90.74, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 93.36, 'tokens_per_second_per_gpu': 2907.9, 'epoch': 1.32}
 66%|████████████████████████████████████████████████████████████████████████▋                                     | 230/348 [58:44<26:43, 13.59s/it] 66%|█████████████████████████████████████████████████████████████████████████                                     | 231/348 [58:58<26:35, 13.64s/it]                                                                                                                                                     {'loss': 0.0195, 'grad_norm': 0.2177734375, 'learning_rate': 2.7176339475506515e-06, 'memory/max_active (GiB)': 90.74, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 93.36, 'tokens_per_second_per_gpu': 2619.7, 'epoch': 1.33}
 66%|█████████████████████████████████████████████████████████████████████████                                     | 231/348 [58:58<26:35, 13.64s/it] 67%|█████████████████████████████████████████████████████████████████████████▎                                    | 232/348 [59:11<26:15, 13.58s/it]                                                                                                                                                     {'loss': 0.0153, 'grad_norm': 0.1796875, 'learning_rate': 2.6763841397811576e-06, 'memory/max_active (GiB)': 90.74, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 93.36, 'tokens_per_second_per_gpu': 2877.92, 'epoch': 1.33}
 67%|█████████████████████████████████████████████████████████████████████████▎                                    | 232/348 [59:11<26:15, 13.58s/it] 67%|█████████████████████████████████████████████████████████████████████████▋                                    | 233/348 [59:25<26:01, 13.58s/it]                                                                                                                                                     {'loss': 0.0139, 'grad_norm': 0.2099609375, 'learning_rate': 2.635335069067617e-06, 'memory/max_active (GiB)': 90.75, 'memory/max_allocated (GiB)': 89.15, 'memory/device_reserved (GiB)': 93.36, 'tokens_per_second_per_gpu': 2648.02, 'epoch': 1.34}
 67%|█████████████████████████████████████████████████████████████████████████▋                                    | 233/348 [59:25<26:01, 13.58s/it] 67%|█████████████████████████████████████████████████████████████████████████▉                                    | 234/348 [59:39<26:08, 13.76s/it]                                                                                                                                                     {'loss': 0.0787, 'grad_norm': 0.7109375, 'learning_rate': 2.5944902816371573e-06, 'memory/max_active (GiB)': 90.74, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 93.36, 'tokens_per_second_per_gpu': 2561.52, 'epoch': 1.34}
 67%|█████████████████████████████████████████████████████████████████████████▉                                    | 234/348 [59:39<26:08, 13.76s/it] 68%|██████████████████████████████████████████████████████████████████████████▎                                   | 235/348 [59:53<25:49, 13.71s/it]                                                                                                                                                     {'loss': 0.0827, 'grad_norm': 0.6640625, 'learning_rate': 2.553853306068888e-06, 'memory/max_active (GiB)': 90.74, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 93.36, 'tokens_per_second_per_gpu': 2911.94, 'epoch': 1.35}
 68%|██████████████████████████████████████████████████████████████████████████▎                                   | 235/348 [59:53<25:49, 13.71s/it] 68%|█████████████████████████████████████████████████████████████████████████▏                                  | 236/348 [1:00:06<25:24, 13.61s/it]                                                                                                                                                     {'loss': 0.0717, 'grad_norm': 0.63671875, 'learning_rate': 2.5134276529890646e-06, 'memory/max_active (GiB)': 90.74, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 93.36, 'tokens_per_second_per_gpu': 2807.38, 'epoch': 1.35}
 68%|█████████████████████████████████████████████████████████████████████████▏                                  | 236/348 [1:00:06<25:24, 13.61s/it] 68%|█████████████████████████████████████████████████████████████████████████▌                                  | 237/348 [1:00:20<25:08, 13.59s/it]                                                                                                                                                     {'loss': 0.0883, 'grad_norm': 0.72265625, 'learning_rate': 2.4732168147677927e-06, 'memory/max_active (GiB)': 90.74, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 93.36, 'tokens_per_second_per_gpu': 2650.86, 'epoch': 1.36}
 68%|█████████████████████████████████████████████████████████████████████████▌                                  | 237/348 [1:00:20<25:08, 13.59s/it] 68%|█████████████████████████████████████████████████████████████████████████▊                                  | 238/348 [1:00:33<24:52, 13.57s/it]                                                                                                                                                     {'loss': 0.0176, 'grad_norm': 0.197265625, 'learning_rate': 2.433224265217346e-06, 'memory/max_active (GiB)': 90.74, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 93.36, 'tokens_per_second_per_gpu': 2731.79, 'epoch': 1.37}
 68%|█████████████████████████████████████████████████████████████████████████▊                                  | 238/348 [1:00:33<24:52, 13.57s/it] 69%|██████████████████████████████████████████████████████████████████████████▏                                 | 239/348 [1:00:46<24:33, 13.52s/it]                                                                                                                                                     {'loss': 0.0293, 'grad_norm': 0.263671875, 'learning_rate': 2.3934534592920416e-06, 'memory/max_active (GiB)': 90.74, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 93.36, 'tokens_per_second_per_gpu': 2750.5, 'epoch': 1.37}
 69%|██████████████████████████████████████████████████████████████████████████▏                                 | 239/348 [1:00:47<24:33, 13.52s/it] 69%|██████████████████████████████████████████████████████████████████████████▍                                 | 240/348 [1:01:00<24:27, 13.58s/it]                                                                                                                                                     {'loss': 0.0198, 'grad_norm': 0.2490234375, 'learning_rate': 2.3539078327897846e-06, 'memory/max_active (GiB)': 90.74, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 93.36, 'tokens_per_second_per_gpu': 2764.31, 'epoch': 1.38}
 69%|██████████████████████████████████████████████████████████████████████████▍                                 | 240/348 [1:01:00<24:27, 13.58s/it] 69%|██████████████████████████████████████████████████████████████████████████▊                                 | 241/348 [1:01:14<24:16, 13.61s/it]                                                                                                                                                     {'loss': 0.0264, 'grad_norm': 0.24609375, 'learning_rate': 2.314590802055232e-06, 'memory/max_active (GiB)': 90.74, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 93.36, 'tokens_per_second_per_gpu': 2621.88, 'epoch': 1.38}
 69%|██████████████████████████████████████████████████████████████████████████▊                                 | 241/348 [1:01:14<24:16, 13.61s/it] 70%|███████████████████████████████████████████████████████████████████████████                                 | 242/348 [1:01:28<24:08, 13.66s/it]                                                                                                                                                     {'loss': 0.0138, 'grad_norm': 0.2158203125, 'learning_rate': 2.275505763684674e-06, 'memory/max_active (GiB)': 90.74, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 93.36, 'tokens_per_second_per_gpu': 2509.41, 'epoch': 1.39}
 70%|███████████████████████████████████████████████████████████████████████████                                 | 242/348 [1:01:28<24:08, 13.66s/it] 70%|███████████████████████████████████████████████████████████████████████████▍                                | 243/348 [1:01:42<23:59, 13.71s/it]                                                                                                                                                     {'loss': 0.0145, 'grad_norm': 0.1943359375, 'learning_rate': 2.2366560942325833e-06, 'memory/max_active (GiB)': 90.74, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 93.36, 'tokens_per_second_per_gpu': 3038.27, 'epoch': 1.39}
 70%|███████████████████████████████████████████████████████████████████████████▍                                | 243/348 [1:01:42<23:59, 13.71s/it] 70%|███████████████████████████████████████████████████████████████████████████▋                                | 244/348 [1:01:55<23:51, 13.76s/it]                                                                                                                                                     {'loss': 0.0444, 'grad_norm': 0.3671875, 'learning_rate': 2.1980451499199262e-06, 'memory/max_active (GiB)': 90.74, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 93.36, 'tokens_per_second_per_gpu': 2756.67, 'epoch': 1.4}
 70%|███████████████████████████████████████████████████████████████████████████▋                                | 244/348 [1:01:55<23:51, 13.76s/it] 70%|████████████████████████████████████████████████████████████████████████████                                | 245/348 [1:02:09<23:38, 13.77s/it]                                                                                                                                                     {'loss': 0.0569, 'grad_norm': 0.474609375, 'learning_rate': 2.159676266344222e-06, 'memory/max_active (GiB)': 90.74, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 93.36, 'tokens_per_second_per_gpu': 2487.73, 'epoch': 1.41}
 70%|████████████████████████████████████████████████████████████████████████████                                | 245/348 [1:02:09<23:38, 13.77s/it] 71%|████████████████████████████████████████████████████████████████████████████▎                               | 246/348 [1:02:23<23:13, 13.66s/it]                                                                                                                                                     {'loss': 0.0125, 'grad_norm': 0.220703125, 'learning_rate': 2.121552758191366e-06, 'memory/max_active (GiB)': 90.74, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 93.36, 'tokens_per_second_per_gpu': 2861.25, 'epoch': 1.41}
 71%|████████████████████████████████████████████████████████████████████████████▎                               | 246/348 [1:02:23<23:13, 13.66s/it] 71%|████████████████████████████████████████████████████████████████████████████▋                               | 247/348 [1:02:37<23:22, 13.88s/it]                                                                                                                                                     {'loss': 0.0291, 'grad_norm': 0.314453125, 'learning_rate': 2.0836779189492925e-06, 'memory/max_active (GiB)': 90.74, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 93.36, 'tokens_per_second_per_gpu': 2666.84, 'epoch': 1.42}
 71%|████████████████████████████████████████████████████████████████████████████▋                               | 247/348 [1:02:37<23:22, 13.88s/it] 71%|████████████████████████████████████████████████████████████████████████████▉                               | 248/348 [1:02:51<23:02, 13.83s/it]                                                                                                                                                     {'loss': 0.0673, 'grad_norm': 0.54296875, 'learning_rate': 2.0460550206234324e-06, 'memory/max_active (GiB)': 90.74, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 93.36, 'tokens_per_second_per_gpu': 2660.68, 'epoch': 1.42}
 71%|████████████████████████████████████████████████████████████████████████████▉                               | 248/348 [1:02:51<23:02, 13.83s/it] 72%|█████████████████████████████████████████████████████████████████████████████▎                              | 249/348 [1:03:04<22:45, 13.79s/it]                                                                                                                                                     {'loss': 0.018, 'grad_norm': 0.23828125, 'learning_rate': 2.0086873134540626e-06, 'memory/max_active (GiB)': 90.74, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 93.36, 'tokens_per_second_per_gpu': 2869.73, 'epoch': 1.43}
 72%|█████████████████████████████████████████████████████████████████████████████▎                              | 249/348 [1:03:04<22:45, 13.79s/it] 72%|█████████████████████████████████████████████████████████████████████████████▌                              | 250/348 [1:03:18<22:25, 13.73s/it]                                                                                                                                                     {'loss': 0.022, 'grad_norm': 0.22265625, 'learning_rate': 1.9715780256355014e-06, 'memory/max_active (GiB)': 90.74, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 93.36, 'tokens_per_second_per_gpu': 2686.63, 'epoch': 1.43}
 72%|█████████████████████████████████████████████████████████████████████████████▌                              | 250/348 [1:03:18<22:25, 13.73s/it] 72%|█████████████████████████████████████████████████████████████████████████████▉                              | 251/348 [1:03:31<22:05, 13.66s/it]                                                                                                                                                     {'loss': 0.1629, 'grad_norm': 1.9453125, 'learning_rate': 1.9347303630372373e-06, 'memory/max_active (GiB)': 90.74, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 93.36, 'tokens_per_second_per_gpu': 3002.91, 'epoch': 1.44}
 72%|█████████████████████████████████████████████████████████████████████████████▉                              | 251/348 [1:03:32<22:05, 13.66s/it] 72%|██████████████████████████████████████████████████████████████████████████████▏                             | 252/348 [1:03:45<21:52, 13.68s/it]                                                                                                                                                     {'loss': 0.0791, 'grad_norm': 1.296875, 'learning_rate': 1.8981475089269641e-06, 'memory/max_active (GiB)': 90.74, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 93.36, 'tokens_per_second_per_gpu': 2701.57, 'epoch': 1.45}
 72%|██████████████████████████████████████████████████████████████████████████████▏                             | 252/348 [1:03:45<21:52, 13.68s/it] 73%|██████████████████████████████████████████████████████████████████████████████▌                             | 253/348 [1:03:59<21:39, 13.68s/it]                                                                                                                                                     {'loss': 0.0286, 'grad_norm': 0.30078125, 'learning_rate': 1.8618326236955908e-06, 'memory/max_active (GiB)': 90.74, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 93.36, 'tokens_per_second_per_gpu': 2251.46, 'epoch': 1.45}
 73%|██████████████████████████████████████████████████████████████████████████████▌                             | 253/348 [1:03:59<21:39, 13.68s/it] 73%|██████████████████████████████████████████████████████████████████████████████▊                             | 254/348 [1:04:12<21:17, 13.59s/it]                                                                                                                                                     {'loss': 0.1294, 'grad_norm': 1.7578125, 'learning_rate': 1.8257888445842026e-06, 'memory/max_active (GiB)': 90.74, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 93.36, 'tokens_per_second_per_gpu': 2526.56, 'epoch': 1.46}
 73%|██████████████████████████████████████████████████████████████████████████████▊                             | 254/348 [1:04:12<21:17, 13.59s/it] 73%|███████████████████████████████████████████████████████████████████████████████▏                            | 255/348 [1:04:26<21:16, 13.73s/it]                                                                                                                                                     {'loss': 0.0132, 'grad_norm': 0.1689453125, 'learning_rate': 1.7900192854130465e-06, 'memory/max_active (GiB)': 90.74, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 93.36, 'tokens_per_second_per_gpu': 2636.65, 'epoch': 1.46}
 73%|███████████████████████████████████████████████████████████████████████████████▏                            | 255/348 [1:04:26<21:16, 13.73s/it] 74%|███████████████████████████████████████████████████████████████████████████████▍                            | 256/348 [1:04:40<21:12, 13.83s/it]                                                                                                                                                     {'loss': 0.0358, 'grad_norm': 0.46484375, 'learning_rate': 1.7545270363125155e-06, 'memory/max_active (GiB)': 90.74, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 93.36, 'tokens_per_second_per_gpu': 2174.36, 'epoch': 1.47}
 74%|███████████████████████████████████████████████████████████████████████████████▍                            | 256/348 [1:04:40<21:12, 13.83s/it] 74%|███████████████████████████████████████████████████████████████████████████████▊                            | 257/348 [1:04:54<20:56, 13.81s/it]                                                                                                                                                     {'loss': 0.011, 'grad_norm': 0.181640625, 'learning_rate': 1.7193151634562071e-06, 'memory/max_active (GiB)': 90.74, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 93.36, 'tokens_per_second_per_gpu': 2578.4, 'epoch': 1.47}
 74%|███████████████████████████████████████████████████████████████████████████████▊                            | 257/348 [1:04:54<20:56, 13.81s/it] 74%|████████████████████████████████████████████████████████████████████████████████                            | 258/348 [1:05:08<20:36, 13.74s/it]                                                                                                                                                     {'loss': 0.0142, 'grad_norm': 0.2197265625, 'learning_rate': 1.6843867087960252e-06, 'memory/max_active (GiB)': 90.74, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 93.36, 'tokens_per_second_per_gpu': 2640.0, 'epoch': 1.48}
 74%|████████████████████████████████████████████████████████████████████████████████                            | 258/348 [1:05:08<20:36, 13.74s/it] 74%|████████████████████████████████████████████████████████████████████████████████▍                           | 259/348 [1:05:21<20:17, 13.68s/it]                                                                                                                                                     {'loss': 0.0263, 'grad_norm': 0.2353515625, 'learning_rate': 1.6497446897993885e-06, 'memory/max_active (GiB)': 90.74, 'memory/max_allocated (GiB)': 89.15, 'memory/device_reserved (GiB)': 93.36, 'tokens_per_second_per_gpu': 2645.37, 'epoch': 1.49}
 74%|████████████████████████████████████████████████████████████████████████████████▍                           | 259/348 [1:05:21<20:17, 13.68s/it] 75%|████████████████████████████████████████████████████████████████████████████████▋                           | 260/348 [1:05:35<20:00, 13.64s/it]                                                                                                                                                     {'loss': 0.0195, 'grad_norm': 0.2099609375, 'learning_rate': 1.6153920991885591e-06, 'memory/max_active (GiB)': 90.74, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 93.36, 'tokens_per_second_per_gpu': 2810.61, 'epoch': 1.49}
 75%|████████████████████████████████████████████████████████████████████████████████▋                           | 260/348 [1:05:35<20:00, 13.64s/it] 75%|█████████████████████████████████████████████████████████████████████████████████                           | 261/348 [1:05:48<19:46, 13.64s/it]                                                                                                                                                     {'loss': 0.0154, 'grad_norm': 0.20703125, 'learning_rate': 1.581331904682089e-06, 'memory/max_active (GiB)': 90.74, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 93.36, 'tokens_per_second_per_gpu': 2882.95, 'epoch': 1.5}
 75%|█████████████████████████████████████████████████████████████████████████████████                           | 261/348 [1:05:48<19:46, 13.64s/it] 75%|█████████████████████████████████████████████████████████████████████████████████▎                          | 262/348 [1:06:03<19:47, 13.80s/it]                                                                                                                                                     {'loss': 0.0349, 'grad_norm': 0.294921875, 'learning_rate': 1.547567048738452e-06, 'memory/max_active (GiB)': 90.74, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 93.36, 'tokens_per_second_per_gpu': 2605.38, 'epoch': 1.5}
 75%|█████████████████████████████████████████████████████████████████████████████████▎                          | 262/348 [1:06:03<19:47, 13.80s/it] 76%|█████████████████████████████████████████████████████████████████████████████████▌                          | 263/348 [1:06:16<19:18, 13.63s/it]                                                                                                                                                     {'loss': 0.045, 'grad_norm': 0.40625, 'learning_rate': 1.5141004483018323e-06, 'memory/max_active (GiB)': 90.74, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 93.36, 'tokens_per_second_per_gpu': 2748.26, 'epoch': 1.51}
 76%|█████████████████████████████████████████████████████████████████████████████████▌                          | 263/348 [1:06:16<19:18, 13.63s/it] 76%|█████████████████████████████████████████████████████████████████████████████████▉                          | 264/348 [1:06:30<19:06, 13.65s/it]                                                                                                                                                     {'loss': 0.0277, 'grad_norm': 0.2490234375, 'learning_rate': 1.4809349945501422e-06, 'memory/max_active (GiB)': 90.74, 'memory/max_allocated (GiB)': 89.15, 'memory/device_reserved (GiB)': 93.36, 'tokens_per_second_per_gpu': 2672.19, 'epoch': 1.51}
 76%|█████████████████████████████████████████████████████████████████████████████████▉                          | 264/348 [1:06:30<19:06, 13.65s/it] 76%|██████████████████████████████████████████████████████████████████████████████████▏                         | 265/348 [1:06:43<18:54, 13.67s/it]                                                                                                                                                     {'loss': 0.0452, 'grad_norm': 0.4375, 'learning_rate': 1.4480735526452427e-06, 'memory/max_active (GiB)': 90.74, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 93.36, 'tokens_per_second_per_gpu': 2391.45, 'epoch': 1.52}
 76%|██████████████████████████████████████████████████████████████████████████████████▏                         | 265/348 [1:06:43<18:54, 13.67s/it] 76%|██████████████████████████████████████████████████████████████████████████████████▌                         | 266/348 [1:06:57<18:34, 13.59s/it]                                                                                                                                                     {'loss': 0.0459, 'grad_norm': 0.490234375, 'learning_rate': 1.4155189614854275e-06, 'memory/max_active (GiB)': 90.74, 'memory/max_allocated (GiB)': 89.15, 'memory/device_reserved (GiB)': 93.36, 'tokens_per_second_per_gpu': 2847.16, 'epoch': 1.53}
 76%|██████████████████████████████████████████████████████████████████████████████████▌                         | 266/348 [1:06:57<18:34, 13.59s/it] 77%|██████████████████████████████████████████████████████████████████████████████████▊                         | 267/348 [1:07:10<18:24, 13.63s/it]                                                                                                                                                     {'loss': 0.0173, 'grad_norm': 0.1943359375, 'learning_rate': 1.3832740334601692e-06, 'memory/max_active (GiB)': 90.74, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 93.36, 'tokens_per_second_per_gpu': 2813.13, 'epoch': 1.53}
 77%|██████████████████████████████████████████████████████████████████████████████████▊                         | 267/348 [1:07:10<18:24, 13.63s/it] 77%|███████████████████████████████████████████████████████████████████████████████████▏                        | 268/348 [1:07:24<18:04, 13.56s/it]                                                                                                                                                     {'loss': 0.0412, 'grad_norm': 0.41796875, 'learning_rate': 1.351341554207163e-06, 'memory/max_active (GiB)': 90.74, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 93.36, 'tokens_per_second_per_gpu': 2568.07, 'epoch': 1.54}
 77%|███████████████████████████████████████████████████████████████████████████████████▏                        | 268/348 [1:07:24<18:04, 13.56s/it] 77%|███████████████████████████████████████████████████████████████████████████████████▍                        | 269/348 [1:07:37<17:40, 13.42s/it]                                                                                                                                                     {'loss': 0.028, 'grad_norm': 0.40234375, 'learning_rate': 1.319724282371664e-06, 'memory/max_active (GiB)': 90.74, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 93.36, 'tokens_per_second_per_gpu': 3087.0, 'epoch': 1.54}
 77%|███████████████████████████████████████████████████████████████████████████████████▍                        | 269/348 [1:07:37<17:40, 13.42s/it] 78%|███████████████████████████████████████████████████████████████████████████████████▊                        | 270/348 [1:07:51<17:32, 13.49s/it]                                                                                                                                                     {'loss': 0.0604, 'grad_norm': 0.66796875, 'learning_rate': 1.28842494936818e-06, 'memory/max_active (GiB)': 90.74, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 93.36, 'tokens_per_second_per_gpu': 2227.42, 'epoch': 1.55}
 78%|███████████████████████████████████████████████████████████████████████████████████▊                        | 270/348 [1:07:51<17:32, 13.49s/it] 78%|████████████████████████████████████████████████████████████████████████████████████                        | 271/348 [1:08:05<17:32, 13.67s/it]                                                                                                                                                     {'loss': 0.0587, 'grad_norm': 0.515625, 'learning_rate': 1.257446259144494e-06, 'memory/max_active (GiB)': 90.74, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 93.36, 'tokens_per_second_per_gpu': 2371.78, 'epoch': 1.55}
 78%|████████████████████████████████████████████████████████████████████████████████████                        | 271/348 [1:08:05<17:32, 13.67s/it] 78%|████████████████████████████████████████████████████████████████████████████████████▍                       | 272/348 [1:08:18<17:19, 13.68s/it]                                                                                                                                                     {'loss': 0.0211, 'grad_norm': 0.2275390625, 'learning_rate': 1.2267908879480822e-06, 'memory/max_active (GiB)': 90.74, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 93.36, 'tokens_per_second_per_gpu': 2781.74, 'epoch': 1.56}
 78%|████████████████████████████████████████████████████████████████████████████████████▍                       | 272/348 [1:08:18<17:19, 13.68s/it] 78%|████████████████████████████████████████████████████████████████████████████████████▋                       | 273/348 [1:08:32<17:00, 13.60s/it]                                                                                                                                                     {'loss': 0.0361, 'grad_norm': 0.361328125, 'learning_rate': 1.1964614840949002e-06, 'memory/max_active (GiB)': 90.74, 'memory/max_allocated (GiB)': 89.15, 'memory/device_reserved (GiB)': 93.36, 'tokens_per_second_per_gpu': 3028.4, 'epoch': 1.57}
 78%|████████████████████████████████████████████████████████████████████████████████████▋                       | 273/348 [1:08:32<17:00, 13.60s/it] 79%|█████████████████████████████████████████████████████████████████████████████████████                       | 274/348 [1:08:45<16:47, 13.62s/it]                                                                                                                                                     {'loss': 0.015, 'grad_norm': 0.23046875, 'learning_rate': 1.1664606677406025e-06, 'memory/max_active (GiB)': 90.74, 'memory/max_allocated (GiB)': 89.15, 'memory/device_reserved (GiB)': 93.36, 'tokens_per_second_per_gpu': 2652.96, 'epoch': 1.57}
 79%|█████████████████████████████████████████████████████████████████████████████████████                       | 274/348 [1:08:45<16:47, 13.62s/it] 79%|█████████████████████████████████████████████████████████████████████████████████████▎                      | 275/348 [1:08:59<16:38, 13.67s/it]                                                                                                                                                     {'loss': 0.0673, 'grad_norm': 0.54296875, 'learning_rate': 1.1367910306541918e-06, 'memory/max_active (GiB)': 90.74, 'memory/max_allocated (GiB)': 89.15, 'memory/device_reserved (GiB)': 93.36, 'tokens_per_second_per_gpu': 2704.26, 'epoch': 1.58}
 79%|█████████████████████████████████████████████████████████████████████████████████████▎                      | 275/348 [1:08:59<16:38, 13.67s/it] 79%|█████████████████████████████████████████████████████████████████████████████████████▋                      | 276/348 [1:09:13<16:21, 13.64s/it]                                                                                                                                                     {'loss': 0.022, 'grad_norm': 0.21875, 'learning_rate': 1.1074551359941022e-06, 'memory/max_active (GiB)': 90.74, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 93.36, 'tokens_per_second_per_gpu': 2888.74, 'epoch': 1.58}
 79%|█████████████████████████████████████████████████████████████████████████████████████▋                      | 276/348 [1:09:13<16:21, 13.64s/it] 80%|█████████████████████████████████████████████████████████████████████████████████████▉                      | 277/348 [1:09:26<16:09, 13.66s/it]                                                                                                                                                     {'loss': 0.0236, 'grad_norm': 0.2392578125, 'learning_rate': 1.078455518086784e-06, 'memory/max_active (GiB)': 90.74, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 93.36, 'tokens_per_second_per_gpu': 2710.51, 'epoch': 1.59}
 80%|█████████████████████████████████████████████████████████████████████████████████████▉                      | 277/348 [1:09:27<16:09, 13.66s/it] 80%|██████████████████████████████████████████████████████████████████████████████████████▎                     | 278/348 [1:09:40<15:53, 13.62s/it]                                                                                                                                                     {'loss': 0.0163, 'grad_norm': 0.1796875, 'learning_rate': 1.0497946822077504e-06, 'memory/max_active (GiB)': 90.74, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 93.36, 'tokens_per_second_per_gpu': 2702.2, 'epoch': 1.59}
 80%|██████████████████████████████████████████████████████████████████████████████████████▎                     | 278/348 [1:09:40<15:53, 13.62s/it] 80%|██████████████████████████████████████████████████████████████████████████████████████▌                     | 279/348 [1:09:54<15:38, 13.60s/it]                                                                                                                                                     {'loss': 0.0131, 'grad_norm': 0.2001953125, 'learning_rate': 1.0214751043651582e-06, 'memory/max_active (GiB)': 90.74, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 93.36, 'tokens_per_second_per_gpu': 2895.27, 'epoch': 1.6}
 80%|██████████████████████████████████████████████████████████████████████████████████████▌                     | 279/348 [1:09:54<15:38, 13.60s/it] 80%|██████████████████████████████████████████████████████████████████████████████████████▉                     | 280/348 [1:10:07<15:20, 13.53s/it]                                                                                                                                                     {'loss': 0.0299, 'grad_norm': 0.333984375, 'learning_rate': 9.934992310858944e-07, 'memory/max_active (GiB)': 90.74, 'memory/max_allocated (GiB)': 89.15, 'memory/device_reserved (GiB)': 93.36, 'tokens_per_second_per_gpu': 2738.35, 'epoch': 1.61}
 80%|██████████████████████████████████████████████████████████████████████████████████████▉                     | 280/348 [1:10:07<15:20, 13.53s/it] 81%|███████████████████████████████████████████████████████████████████████████████████████▏                    | 281/348 [1:10:21<15:13, 13.63s/it]                                                                                                                                                     {'loss': 0.0136, 'grad_norm': 0.169921875, 'learning_rate': 9.658694792042284e-07, 'memory/max_active (GiB)': 90.74, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 93.36, 'tokens_per_second_per_gpu': 2437.71, 'epoch': 1.61}
 81%|███████████████████████████████████████████████████████████████████████████████████████▏                    | 281/348 [1:10:21<15:13, 13.63s/it] 81%|███████████████████████████████████████████████████████████████████████████████████████▌                    | 282/348 [1:10:34<14:56, 13.59s/it]                                                                                                                                                     {'loss': 0.0169, 'grad_norm': 0.2197265625, 'learning_rate': 9.385882356530179e-07, 'memory/max_active (GiB)': 90.74, 'memory/max_allocated (GiB)': 89.15, 'memory/device_reserved (GiB)': 93.36, 'tokens_per_second_per_gpu': 2754.67, 'epoch': 1.62}
 81%|███████████████████████████████████████████████████████████████████████████████████████▌                    | 282/348 [1:10:34<14:56, 13.59s/it] 81%|███████████████████████████████████████████████████████████████████████████████████████▊                    | 283/348 [1:10:48<14:48, 13.67s/it]                                                                                                                                                     {'loss': 0.0188, 'grad_norm': 0.2236328125, 'learning_rate': 9.116578572575091e-07, 'memory/max_active (GiB)': 90.74, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 93.36, 'tokens_per_second_per_gpu': 2472.74, 'epoch': 1.62}
 81%|███████████████████████████████████████████████████████████████████████████████████████▊                    | 283/348 [1:10:48<14:48, 13.67s/it] 82%|████████████████████████████████████████████████████████████████████████████████████████▏                   | 284/348 [1:11:02<14:35, 13.68s/it]                                                                                                                                                     {'loss': 0.0875, 'grad_norm': 0.734375, 'learning_rate': 8.850806705317183e-07, 'memory/max_active (GiB)': 90.74, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 93.36, 'tokens_per_second_per_gpu': 2449.08, 'epoch': 1.63}
 82%|████████████████████████████████████████████████████████████████████████████████████████▏                   | 284/348 [1:11:02<14:35, 13.68s/it] 82%|████████████████████████████████████████████████████████████████████████████████████████▍                   | 285/348 [1:11:15<14:16, 13.59s/it]                                                                                                                                                     {'loss': 0.0504, 'grad_norm': 0.416015625, 'learning_rate': 8.58858971477457e-07, 'memory/max_active (GiB)': 90.74, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 93.36, 'tokens_per_second_per_gpu': 2624.56, 'epoch': 1.63}
 82%|████████████████████████████████████████████████████████████████████████████████████████▍                   | 285/348 [1:11:15<14:16, 13.59s/it] 82%|████████████████████████████████████████████████████████████████████████████████████████▊                   | 286/348 [1:11:29<14:14, 13.78s/it]                                                                                                                                                     {'loss': 0.045, 'grad_norm': 0.453125, 'learning_rate': 8.329950253859703e-07, 'memory/max_active (GiB)': 90.74, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 93.36, 'tokens_per_second_per_gpu': 2573.72, 'epoch': 1.64}
 82%|████████████████████████████████████████████████████████████████████████████████████████▊                   | 286/348 [1:11:29<14:14, 13.78s/it] 82%|█████████████████████████████████████████████████████████████████████████████████████████                   | 287/348 [1:11:43<14:01, 13.79s/it]                                                                                                                                                     {'loss': 0.0108, 'grad_norm': 0.1748046875, 'learning_rate': 8.074910666422475e-07, 'memory/max_active (GiB)': 90.74, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 93.36, 'tokens_per_second_per_gpu': 2376.3, 'epoch': 1.65}
 82%|█████████████████████████████████████████████████████████████████████████████████████████                   | 287/348 [1:11:43<14:01, 13.79s/it] 83%|█████████████████████████████████████████████████████████████████████████████████████████▍                  | 288/348 [1:11:57<13:48, 13.80s/it]                                                                                                                                                     {'loss': 0.0207, 'grad_norm': 0.2470703125, 'learning_rate': 7.823492985319858e-07, 'memory/max_active (GiB)': 90.74, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 93.36, 'tokens_per_second_per_gpu': 2385.59, 'epoch': 1.65}
 83%|█████████████████████████████████████████████████████████████████████████████████████████▍                  | 288/348 [1:11:57<13:48, 13.80s/it] 83%|█████████████████████████████████████████████████████████████████████████████████████████▋                  | 289/348 [1:12:10<13:27, 13.68s/it]                                                                                                                                                     {'loss': 0.0199, 'grad_norm': 0.2314453125, 'learning_rate': 7.575718930512516e-07, 'memory/max_active (GiB)': 90.74, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 93.36, 'tokens_per_second_per_gpu': 2984.19, 'epoch': 1.66}
 83%|█████████████████████████████████████████████████████████████████████████████████████████▋                  | 289/348 [1:12:11<13:27, 13.68s/it] 83%|██████████████████████████████████████████████████████████████████████████████████████████                  | 290/348 [1:12:24<13:08, 13.60s/it]                                                                                                                                                     {'loss': 0.0344, 'grad_norm': 0.337890625, 'learning_rate': 7.33160990718847e-07, 'memory/max_active (GiB)': 90.74, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 93.36, 'tokens_per_second_per_gpu': 2838.05, 'epoch': 1.66}
 83%|██████████████████████████████████████████████████████████████████████████████████████████                  | 290/348 [1:12:24<13:08, 13.60s/it] 84%|██████████████████████████████████████████████████████████████████████████████████████████▎                 | 291/348 [1:12:38<13:04, 13.76s/it]                                                                                                                                                     {'loss': 0.0445, 'grad_norm': 0.34375, 'learning_rate': 7.091187003913802e-07, 'memory/max_active (GiB)': 90.74, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 93.36, 'tokens_per_second_per_gpu': 2803.35, 'epoch': 1.67}
 84%|██████████████████████████████████████████████████████████████████████████████████████████▎                 | 291/348 [1:12:38<13:04, 13.76s/it] 84%|██████████████████████████████████████████████████████████████████████████████████████████▌                 | 292/348 [1:12:52<12:50, 13.76s/it]                                                                                                                                                     {'loss': 0.0968, 'grad_norm': 0.640625, 'learning_rate': 6.854470990810907e-07, 'memory/max_active (GiB)': 90.74, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 93.36, 'tokens_per_second_per_gpu': 2621.66, 'epoch': 1.67}
 84%|██████████████████████████████████████████████████████████████████████████████████████████▌                 | 292/348 [1:12:52<12:50, 13.76s/it] 84%|██████████████████████████████████████████████████████████████████████████████████████████▉                 | 293/348 [1:13:05<12:34, 13.73s/it]                                                                                                                                                     {'loss': 0.0454, 'grad_norm': 0.455078125, 'learning_rate': 6.621482317764105e-07, 'memory/max_active (GiB)': 90.74, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 93.36, 'tokens_per_second_per_gpu': 2659.68, 'epoch': 1.68}
 84%|██████████████████████████████████████████████████████████████████████████████████████████▉                 | 293/348 [1:13:05<12:34, 13.73s/it] 84%|███████████████████████████████████████████████████████████████████████████████████████████▏                | 294/348 [1:13:19<12:24, 13.78s/it]                                                                                                                                                     {'loss': 0.0533, 'grad_norm': 0.4296875, 'learning_rate': 6.392241112653031e-07, 'memory/max_active (GiB)': 90.74, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 93.36, 'tokens_per_second_per_gpu': 2555.56, 'epoch': 1.69}
 84%|███████████████████████████████████████████████████████████████████████████████████████████▏                | 294/348 [1:13:19<12:24, 13.78s/it] 85%|███████████████████████████████████████████████████████████████████████████████████████████▌                | 295/348 [1:13:33<12:09, 13.76s/it]                                                                                                                                                     {'loss': 0.0233, 'grad_norm': 0.2255859375, 'learning_rate': 6.166767179613691e-07, 'memory/max_active (GiB)': 90.74, 'memory/max_allocated (GiB)': 89.15, 'memory/device_reserved (GiB)': 93.36, 'tokens_per_second_per_gpu': 2628.11, 'epoch': 1.69}
 85%|███████████████████████████████████████████████████████████████████████████████████████████▌                | 295/348 [1:13:33<12:09, 13.76s/it] 85%|███████████████████████████████████████████████████████████████████████████████████████████▊                | 296/348 [1:13:47<11:53, 13.72s/it]                                                                                                                                                     {'loss': 0.0498, 'grad_norm': 0.55859375, 'learning_rate': 5.945079997327713e-07, 'memory/max_active (GiB)': 90.74, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 93.36, 'tokens_per_second_per_gpu': 2416.14, 'epoch': 1.7}
 85%|███████████████████████████████████████████████████████████████████████████████████████████▊                | 296/348 [1:13:47<11:53, 13.72s/it] 85%|████████████████████████████████████████████████████████████████████████████████████████████▏               | 297/348 [1:14:00<11:37, 13.67s/it]                                                                                                                                                     {'loss': 0.0327, 'grad_norm': 0.298828125, 'learning_rate': 5.727198717339511e-07, 'memory/max_active (GiB)': 90.74, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 93.36, 'tokens_per_second_per_gpu': 2438.16, 'epoch': 1.7}
 85%|████████████████████████████████████████████████████████████████████████████████████████████▏               | 297/348 [1:14:00<11:37, 13.67s/it] 86%|████████████████████████████████████████████████████████████████████████████████████████████▍               | 298/348 [1:14:14<11:20, 13.60s/it]                                                                                                                                                     {'loss': 0.0205, 'grad_norm': 0.2138671875, 'learning_rate': 5.513142162401746e-07, 'memory/max_active (GiB)': 90.74, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 93.36, 'tokens_per_second_per_gpu': 3173.03, 'epoch': 1.71}
 86%|████████████████████████████████████████████████████████████████████████████████████████████▍               | 298/348 [1:14:14<11:20, 13.60s/it] 86%|████████████████████████████████████████████████████████████████████████████████████████████▊               | 299/348 [1:14:27<11:06, 13.61s/it]                                                                                                                                                     {'loss': 0.0205, 'grad_norm': 0.27734375, 'learning_rate': 5.302928824849335e-07, 'memory/max_active (GiB)': 90.74, 'memory/max_allocated (GiB)': 89.15, 'memory/device_reserved (GiB)': 93.36, 'tokens_per_second_per_gpu': 2666.54, 'epoch': 1.71}
 86%|████████████████████████████████████████████████████████████████████████████████████████████▊               | 299/348 [1:14:27<11:06, 13.61s/it] 86%|█████████████████████████████████████████████████████████████████████████████████████████████               | 300/348 [1:14:41<10:56, 13.67s/it]                                                                                                                                                     {'loss': 0.0253, 'grad_norm': 0.2421875, 'learning_rate': 5.096576865001802e-07, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 93.36, 'tokens_per_second_per_gpu': 2538.25, 'epoch': 1.72}
 86%|█████████████████████████████████████████████████████████████████████████████████████████████               | 300/348 [1:14:41<10:56, 13.67s/it] 86%|█████████████████████████████████████████████████████████████████████████████████████████████▍              | 301/348 [1:14:55<10:43, 13.69s/it]                                                                                                                                                     {'loss': 0.0092, 'grad_norm': 0.1669921875, 'learning_rate': 4.894104109594466e-07, 'memory/max_active (GiB)': 90.74, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 93.36, 'tokens_per_second_per_gpu': 2791.66, 'epoch': 1.73}
 86%|█████████████████████████████████████████████████████████████████████████████████████████████▍              | 301/348 [1:14:55<10:43, 13.69s/it] 87%|█████████████████████████████████████████████████████████████████████████████████████████████▋              | 302/348 [1:15:09<10:32, 13.75s/it]                                                                                                                                                     {'loss': 0.0226, 'grad_norm': 0.271484375, 'learning_rate': 4.695528050238368e-07, 'memory/max_active (GiB)': 90.74, 'memory/max_allocated (GiB)': 89.15, 'memory/device_reserved (GiB)': 93.36, 'tokens_per_second_per_gpu': 2959.99, 'epoch': 1.73}
 87%|█████████████████████████████████████████████████████████████████████████████████████████████▋              | 302/348 [1:15:09<10:32, 13.75s/it] 87%|██████████████████████████████████████████████████████████████████████████████████████████████              | 303/348 [1:15:22<10:17, 13.72s/it]                                                                                                                                                     {'loss': 0.0161, 'grad_norm': 0.2294921875, 'learning_rate': 4.500865841909169e-07, 'memory/max_active (GiB)': 90.74, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 93.36, 'tokens_per_second_per_gpu': 2876.65, 'epoch': 1.74}
 87%|██████████████████████████████████████████████████████████████████████████████████████████████              | 303/348 [1:15:22<10:17, 13.72s/it] 87%|██████████████████████████████████████████████████████████████████████████████████████████████▎             | 304/348 [1:15:36<10:05, 13.76s/it]                                                                                                                                                     {'loss': 0.0625, 'grad_norm': 0.546875, 'learning_rate': 4.3101343014651356e-07, 'memory/max_active (GiB)': 90.74, 'memory/max_allocated (GiB)': 89.15, 'memory/device_reserved (GiB)': 93.36, 'tokens_per_second_per_gpu': 2758.96, 'epoch': 1.74}
 87%|██████████████████████████████████████████████████████████████████████████████████████████████▎             | 304/348 [1:15:36<10:05, 13.76s/it] 88%|██████████████████████████████████████████████████████████████████████████████████████████████▋             | 305/348 [1:15:50<09:48, 13.69s/it]                                                                                                                                                     {'loss': 0.0387, 'grad_norm': 0.423828125, 'learning_rate': 4.123349906194357e-07, 'memory/max_active (GiB)': 90.74, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 93.36, 'tokens_per_second_per_gpu': 2928.83, 'epoch': 1.75}
 88%|██████████████████████████████████████████████████████████████████████████████████████████████▋             | 305/348 [1:15:50<09:48, 13.69s/it] 88%|██████████████████████████████████████████████████████████████████████████████████████████████▉             | 306/348 [1:16:03<09:33, 13.65s/it]                                                                                                                                                     {'loss': 0.0523, 'grad_norm': 0.458984375, 'learning_rate': 3.940528792391224e-07, 'memory/max_active (GiB)': 90.74, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 93.36, 'tokens_per_second_per_gpu': 2752.72, 'epoch': 1.75}
 88%|██████████████████████████████████████████████████████████████████████████████████████████████▉             | 306/348 [1:16:03<09:33, 13.65s/it] 88%|███████████████████████████████████████████████████████████████████████████████████████████████▎            | 307/348 [1:16:17<09:16, 13.56s/it]                                                                                                                                                     {'loss': 0.0892, 'grad_norm': 0.6484375, 'learning_rate': 3.7616867539624733e-07, 'memory/max_active (GiB)': 90.75, 'memory/max_allocated (GiB)': 89.15, 'memory/device_reserved (GiB)': 93.36, 'tokens_per_second_per_gpu': 2567.24, 'epoch': 1.76}
 88%|███████████████████████████████████████████████████████████████████████████████████████████████▎            | 307/348 [1:16:17<09:16, 13.56s/it] 89%|███████████████████████████████████████████████████████████████████████████████████████████████▌            | 308/348 [1:16:30<09:03, 13.60s/it]                                                                                                                                                     {'loss': 0.0704, 'grad_norm': 0.515625, 'learning_rate': 3.586839241062695e-07, 'memory/max_active (GiB)': 90.74, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 93.36, 'tokens_per_second_per_gpu': 2485.14, 'epoch': 1.77}
 89%|███████████████████████████████████████████████████████████████████████████████████████████████▌            | 308/348 [1:16:30<09:03, 13.60s/it] 89%|███████████████████████████████████████████████████████████████████████████████████████████████▉            | 309/348 [1:16:44<08:48, 13.55s/it]                                                                                                                                                     {'loss': 0.0528, 'grad_norm': 0.435546875, 'learning_rate': 3.416001358759635e-07, 'memory/max_active (GiB)': 90.74, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 93.36, 'tokens_per_second_per_gpu': 2763.44, 'epoch': 1.77}
 89%|███████████████████████████████████████████████████████████████████████████████████████████████▉            | 309/348 [1:16:44<08:48, 13.55s/it] 89%|████████████████████████████████████████████████████████████████████████████████████████████████▏           | 310/348 [1:16:57<08:33, 13.51s/it]                                                                                                                                                     {'loss': 0.0137, 'grad_norm': 0.26953125, 'learning_rate': 3.2491878657292643e-07, 'memory/max_active (GiB)': 90.74, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 93.36, 'tokens_per_second_per_gpu': 3187.26, 'epoch': 1.78}
 89%|████████████████████████████████████████████████████████████████████████████████████████████████▏           | 310/348 [1:16:57<08:33, 13.51s/it] 89%|████████████████████████████████████████████████████████████████████████████████████████████████▌           | 311/348 [1:17:11<08:22, 13.57s/it]                                                                                                                                                     {'loss': 0.0117, 'grad_norm': 0.2080078125, 'learning_rate': 3.08641317298074e-07, 'memory/max_active (GiB)': 90.74, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 93.36, 'tokens_per_second_per_gpu': 2925.22, 'epoch': 1.78}
 89%|████████████████████████████████████████████████████████████████████████████████████████████████▌           | 311/348 [1:17:11<08:22, 13.57s/it] 90%|████████████████████████████████████████████████████████████████████████████████████████████████▊           | 312/348 [1:17:24<08:07, 13.53s/it]                                                                                                                                                     {'loss': 0.0457, 'grad_norm': 0.421875, 'learning_rate': 2.927691342611505e-07, 'memory/max_active (GiB)': 90.74, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 93.36, 'tokens_per_second_per_gpu': 2879.88, 'epoch': 1.79}
 90%|████████████████████████████████████████████████████████████████████████████████████████████████▊           | 312/348 [1:17:24<08:07, 13.53s/it] 90%|█████████████████████████████████████████████████████████████████████████████████████████████████▏          | 313/348 [1:17:38<07:57, 13.64s/it]                                                                                                                                                     {'loss': 0.0382, 'grad_norm': 0.447265625, 'learning_rate': 2.7730360865923954e-07, 'memory/max_active (GiB)': 90.74, 'memory/max_allocated (GiB)': 89.15, 'memory/device_reserved (GiB)': 93.36, 'tokens_per_second_per_gpu': 2496.08, 'epoch': 1.79}
 90%|█████████████████████████████████████████████████████████████████████████████████████████████████▏          | 313/348 [1:17:38<07:57, 13.64s/it] 90%|█████████████████████████████████████████████████████████████████████████████████████████████████▍          | 314/348 [1:17:52<07:48, 13.77s/it]                                                                                                                                                     {'loss': 0.0362, 'grad_norm': 0.314453125, 'learning_rate': 2.6224607655831236e-07, 'memory/max_active (GiB)': 90.74, 'memory/max_allocated (GiB)': 89.15, 'memory/device_reserved (GiB)': 93.36, 'tokens_per_second_per_gpu': 2465.88, 'epoch': 1.8}
 90%|█████████████████████████████████████████████████████████████████████████████████████████████████▍          | 314/348 [1:17:52<07:48, 13.77s/it] 91%|█████████████████████████████████████████████████████████████████████████████████████████████████▊          | 315/348 [1:18:06<07:34, 13.76s/it]                                                                                                                                                     {'loss': 0.0225, 'grad_norm': 0.232421875, 'learning_rate': 2.475978387778e-07, 'memory/max_active (GiB)': 90.74, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 93.36, 'tokens_per_second_per_gpu': 2901.64, 'epoch': 1.81}
 91%|█████████████████████████████████████████████████████████████████████████████████████████████████▊          | 315/348 [1:18:06<07:34, 13.76s/it] 91%|██████████████████████████████████████████████████████████████████████████████████████████████████          | 316/348 [1:18:20<07:21, 13.81s/it]                                                                                                                                                     {'loss': 0.0219, 'grad_norm': 0.2490234375, 'learning_rate': 2.3336016077822154e-07, 'memory/max_active (GiB)': 90.74, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 93.36, 'tokens_per_second_per_gpu': 2306.59, 'epoch': 1.81}
 91%|██████████████████████████████████████████████████████████████████████████████████████████████████          | 316/348 [1:18:20<07:21, 13.81s/it] 91%|██████████████████████████████████████████████████████████████████████████████████████████████████▍         | 317/348 [1:18:34<07:05, 13.71s/it]                                                                                                                                                     {'loss': 0.0491, 'grad_norm': 0.4140625, 'learning_rate': 2.1953427255185122e-07, 'memory/max_active (GiB)': 90.74, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 93.36, 'tokens_per_second_per_gpu': 2608.9, 'epoch': 1.82}
 91%|██████████████████████████████████████████████████████████████████████████████████████████████████▍         | 317/348 [1:18:34<07:05, 13.71s/it] 91%|██████████████████████████████████████████████████████████████████████████████████████████████████▋         | 318/348 [1:18:47<06:47, 13.58s/it]                                                                                                                                                     {'loss': 0.0204, 'grad_norm': 0.2578125, 'learning_rate': 2.0612136851647258e-07, 'memory/max_active (GiB)': 90.74, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 93.36, 'tokens_per_second_per_gpu': 2647.21, 'epoch': 1.82}
 91%|██████████████████████████████████████████████████████████████████████████████████████████████████▋         | 318/348 [1:18:47<06:47, 13.58s/it] 92%|███████████████████████████████████████████████████████████████████████████████████████████████████         | 319/348 [1:19:01<06:35, 13.65s/it]                                                                                                                                                     {'loss': 0.046, 'grad_norm': 0.390625, 'learning_rate': 1.9312260741218114e-07, 'memory/max_active (GiB)': 90.74, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 93.36, 'tokens_per_second_per_gpu': 2597.89, 'epoch': 1.83}
 92%|███████████████████████████████████████████████████████████████████████████████████████████████████         | 319/348 [1:19:01<06:35, 13.65s/it] 92%|███████████████████████████████████████████████████████████████████████████████████████████████████▎        | 320/348 [1:19:14<06:20, 13.60s/it]                                                                                                                                                     {'loss': 0.0271, 'grad_norm': 0.23828125, 'learning_rate': 1.805391122012884e-07, 'memory/max_active (GiB)': 90.74, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 93.36, 'tokens_per_second_per_gpu': 2596.1, 'epoch': 1.83}
 92%|███████████████████████████████████████████████████████████████████████████████████████████████████▎        | 320/348 [1:19:14<06:20, 13.60s/it] 92%|███████████████████████████████████████████████████████████████████████████████████████████████████▌        | 321/348 [1:19:28<06:11, 13.74s/it]                                                                                                                                                     {'loss': 0.0237, 'grad_norm': 0.255859375, 'learning_rate': 1.6837196997130434e-07, 'memory/max_active (GiB)': 90.74, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 93.36, 'tokens_per_second_per_gpu': 2516.83, 'epoch': 1.84}
 92%|███████████████████████████████████████████████████████████████████████████████████████████████████▌        | 321/348 [1:19:28<06:11, 13.74s/it] 93%|███████████████████████████████████████████████████████████████████████████████████████████████████▉        | 322/348 [1:19:42<05:56, 13.72s/it]                                                                                                                                                     {'loss': 0.0179, 'grad_norm': 0.2236328125, 'learning_rate': 1.5662223184102876e-07, 'memory/max_active (GiB)': 90.74, 'memory/max_allocated (GiB)': 89.15, 'memory/device_reserved (GiB)': 93.36, 'tokens_per_second_per_gpu': 2540.53, 'epoch': 1.85}
 93%|███████████████████████████████████████████████████████████████████████████████████████████████████▉        | 322/348 [1:19:42<05:56, 13.72s/it] 93%|████████████████████████████████████████████████████████████████████████████████████████████████████▏       | 323/348 [1:19:56<05:43, 13.73s/it]                                                                                                                                                     {'loss': 0.0223, 'grad_norm': 0.2119140625, 'learning_rate': 1.4529091286973994e-07, 'memory/max_active (GiB)': 90.74, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 93.36, 'tokens_per_second_per_gpu': 2636.6, 'epoch': 1.85}
 93%|████████████████████████████████████████████████████████████████████████████████████████████████████▏       | 323/348 [1:19:56<05:43, 13.73s/it] 93%|████████████████████████████████████████████████████████████████████████████████████████████████████▌       | 324/348 [1:20:10<05:32, 13.86s/it]                                                                                                                                                     {'loss': 0.0325, 'grad_norm': 0.314453125, 'learning_rate': 1.3437899196950765e-07, 'memory/max_active (GiB)': 90.74, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 93.36, 'tokens_per_second_per_gpu': 2506.87, 'epoch': 1.86}
 93%|████████████████████████████████████████████████████████████████████████████████████████████████████▌       | 324/348 [1:20:10<05:32, 13.86s/it] 93%|████████████████████████████████████████████████████████████████████████████████████████████████████▊       | 325/348 [1:20:24<05:19, 13.88s/it]                                                                                                                                                     {'loss': 0.0705, 'grad_norm': 0.5390625, 'learning_rate': 1.2388741182062348e-07, 'memory/max_active (GiB)': 90.74, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 93.36, 'tokens_per_second_per_gpu': 2390.64, 'epoch': 1.86}
 93%|████████████████████████████████████████████████████████████████████████████████████████████████████▊       | 325/348 [1:20:24<05:19, 13.88s/it] 94%|█████████████████████████████████████████████████████████████████████████████████████████████████████▏      | 326/348 [1:20:37<05:02, 13.77s/it]                                                                                                                                                     {'loss': 0.0189, 'grad_norm': 0.267578125, 'learning_rate': 1.1381707879016158e-07, 'memory/max_active (GiB)': 90.74, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 93.36, 'tokens_per_second_per_gpu': 3125.16, 'epoch': 1.87}
 94%|█████████████████████████████████████████████████████████████████████████████████████████████████████▏      | 326/348 [1:20:37<05:02, 13.77s/it] 94%|█████████████████████████████████████████████████████████████████████████████████████████████████████▍      | 327/348 [1:20:51<04:49, 13.78s/it]                                                                                                                                                     {'loss': 0.04, 'grad_norm': 0.373046875, 'learning_rate': 1.0416886285368188e-07, 'memory/max_active (GiB)': 90.74, 'memory/max_allocated (GiB)': 89.15, 'memory/device_reserved (GiB)': 93.36, 'tokens_per_second_per_gpu': 2547.03, 'epoch': 1.87}
 94%|█████████████████████████████████████████████████████████████████████████████████████████████████████▍      | 327/348 [1:20:51<04:49, 13.78s/it] 94%|█████████████████████████████████████████████████████████████████████████████████████████████████████▊      | 328/348 [1:21:04<04:33, 13.68s/it]                                                                                                                                                     {'loss': 0.0637, 'grad_norm': 0.671875, 'learning_rate': 9.494359752006687e-08, 'memory/max_active (GiB)': 90.74, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 93.36, 'tokens_per_second_per_gpu': 2941.31, 'epoch': 1.88}
 94%|█████████████████████████████████████████████████████████████████████████████████████████████████████▊      | 328/348 [1:21:04<04:33, 13.68s/it] 95%|██████████████████████████████████████████████████████████████████████████████████████████████████████      | 329/348 [1:21:18<04:18, 13.61s/it]                                                                                                                                                     {'loss': 0.0155, 'grad_norm': 0.21875, 'learning_rate': 8.614207975952083e-08, 'memory/max_active (GiB)': 90.74, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 93.36, 'tokens_per_second_per_gpu': 3064.75, 'epoch': 1.89}
 95%|██████████████████████████████████████████████████████████████████████████████████████████████████████      | 329/348 [1:21:18<04:18, 13.61s/it] 95%|██████████████████████████████████████████████████████████████████████████████████████████████████████▍     | 330/348 [1:21:31<04:05, 13.61s/it]                                                                                                                                                     {'loss': 0.0116, 'grad_norm': 0.189453125, 'learning_rate': 7.776506993471323e-08, 'memory/max_active (GiB)': 90.74, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 93.36, 'tokens_per_second_per_gpu': 2854.92, 'epoch': 1.89}
 95%|██████████████████████████████████████████████████████████████████████████████████████████████████████▍     | 330/348 [1:21:32<04:05, 13.61s/it] 95%|██████████████████████████████████████████████████████████████████████████████████████████████████████▋     | 331/348 [1:21:45<03:51, 13.63s/it]                                                                                                                                                     {'loss': 0.017, 'grad_norm': 0.23046875, 'learning_rate': 6.981329173509909e-08, 'memory/max_active (GiB)': 90.74, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 93.36, 'tokens_per_second_per_gpu': 2744.8, 'epoch': 1.9}
 95%|██████████████████████████████████████████████████████████████████████████████████████████████████████▋     | 331/348 [1:21:45<03:51, 13.63s/it] 95%|███████████████████████████████████████████████████████████████████████████████████████████████████████     | 332/348 [1:21:59<03:39, 13.70s/it]                                                                                                                                                     {'loss': 0.0347, 'grad_norm': 0.3359375, 'learning_rate': 6.22874321143907e-08, 'memory/max_active (GiB)': 90.74, 'memory/max_allocated (GiB)': 89.15, 'memory/device_reserved (GiB)': 93.36, 'tokens_per_second_per_gpu': 2566.27, 'epoch': 1.9}
 95%|███████████████████████████████████████████████████████████████████████████████████████████████████████     | 332/348 [1:21:59<03:39, 13.70s/it] 96%|███████████████████████████████████████████████████████████████████████████████████████████████████████▎    | 333/348 [1:22:13<03:24, 13.66s/it]                                                                                                                                                     {'loss': 0.0267, 'grad_norm': 0.232421875, 'learning_rate': 5.518814123121885e-08, 'memory/max_active (GiB)': 90.74, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 93.36, 'tokens_per_second_per_gpu': 2629.21, 'epoch': 1.91}
 96%|███████████████████████████████████████████████████████████████████████████████████████████████████████▎    | 333/348 [1:22:13<03:24, 13.66s/it] 96%|███████████████████████████████████████████████████████████████████████████████████████████████████████▋    | 334/348 [1:22:26<03:11, 13.69s/it]                                                                                                                                                     {'loss': 0.025, 'grad_norm': 0.234375, 'learning_rate': 4.851603239296065e-08, 'memory/max_active (GiB)': 90.74, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 93.36, 'tokens_per_second_per_gpu': 2616.08, 'epoch': 1.91}
 96%|███████████████████████████████████████████████████████████████████████████████████████████████████████▋    | 334/348 [1:22:26<03:11, 13.69s/it] 96%|███████████████████████████████████████████████████████████████████████████████████████████████████████▉    | 335/348 [1:22:40<02:57, 13.65s/it]                                                                                                                                                     {'loss': 0.0344, 'grad_norm': 0.271484375, 'learning_rate': 4.227168200276077e-08, 'memory/max_active (GiB)': 90.74, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 93.36, 'tokens_per_second_per_gpu': 2754.3, 'epoch': 1.92}
 96%|███████████████████████████████████████████████████████████████████████████████████████████████████████▉    | 335/348 [1:22:40<02:57, 13.65s/it] 97%|████████████████████████████████████████████████████████████████████████████████████████████████████████▎   | 336/348 [1:22:54<02:44, 13.67s/it]                                                                                                                                                     {'loss': 0.0182, 'grad_norm': 0.2734375, 'learning_rate': 3.645562950973014e-08, 'memory/max_active (GiB)': 90.74, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 93.36, 'tokens_per_second_per_gpu': 2942.43, 'epoch': 1.93}
 97%|████████████████████████████████████████████████████████████████████████████████████████████████████████▎   | 336/348 [1:22:54<02:44, 13.67s/it] 97%|████████████████████████████████████████████████████████████████████████████████████████████████████████▌   | 337/348 [1:23:07<02:29, 13.59s/it]                                                                                                                                                     {'loss': 0.1145, 'grad_norm': 0.8828125, 'learning_rate': 3.10683773623488e-08, 'memory/max_active (GiB)': 90.74, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 93.36, 'tokens_per_second_per_gpu': 2586.12, 'epoch': 1.93}
 97%|████████████████████████████████████████████████████████████████████████████████████████████████████████▌   | 337/348 [1:23:07<02:29, 13.59s/it] 97%|████████████████████████████████████████████████████████████████████████████████████████████████████████▉   | 338/348 [1:23:21<02:16, 13.60s/it]                                                                                                                                                     {'loss': 0.0125, 'grad_norm': 0.19140625, 'learning_rate': 2.6110390965055632e-08, 'memory/max_active (GiB)': 90.74, 'memory/max_allocated (GiB)': 89.15, 'memory/device_reserved (GiB)': 93.36, 'tokens_per_second_per_gpu': 2797.12, 'epoch': 1.94}
 97%|████████████████████████████████████████████████████████████████████████████████████████████████████████▉   | 338/348 [1:23:21<02:16, 13.60s/it] 97%|█████████████████████████████████████████████████████████████████████████████████████████████████████████▏  | 339/348 [1:23:34<02:02, 13.63s/it]                                                                                                                                                     {'loss': 0.0176, 'grad_norm': 0.3203125, 'learning_rate': 2.158209863804217e-08, 'memory/max_active (GiB)': 90.74, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 93.36, 'tokens_per_second_per_gpu': 2653.07, 'epoch': 1.94}
 97%|█████████████████████████████████████████████████████████████████████████████████████████████████████████▏  | 339/348 [1:23:34<02:02, 13.63s/it] 98%|█████████████████████████████████████████████████████████████████████████████████████████████████████████▌  | 340/348 [1:23:48<01:49, 13.73s/it]                                                                                                                                                     {'loss': 0.0125, 'grad_norm': 0.2099609375, 'learning_rate': 1.7483891580253877e-08, 'memory/max_active (GiB)': 90.74, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 93.36, 'tokens_per_second_per_gpu': 2343.01, 'epoch': 1.95}
 98%|█████████████████████████████████████████████████████████████████████████████████████████████████████████▌  | 340/348 [1:23:48<01:49, 13.73s/it] 98%|█████████████████████████████████████████████████████████████████████████████████████████████████████████▊  | 341/348 [1:24:02<01:36, 13.72s/it]                                                                                                                                                     {'loss': 0.0172, 'grad_norm': 0.2236328125, 'learning_rate': 1.3816123835588835e-08, 'memory/max_active (GiB)': 90.74, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 93.36, 'tokens_per_second_per_gpu': 2868.71, 'epoch': 1.95}
 98%|█████████████████████████████████████████████████████████████████████████████████████████████████████████▊  | 341/348 [1:24:02<01:36, 13.72s/it] 98%|██████████████████████████████████████████████████████████████████████████████████████████████████████████▏ | 342/348 [1:24:16<01:22, 13.68s/it]                                                                                                                                                     {'loss': 0.0125, 'grad_norm': 0.1953125, 'learning_rate': 1.0579112262316116e-08, 'memory/max_active (GiB)': 90.74, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 93.36, 'tokens_per_second_per_gpu': 2859.66, 'epoch': 1.96}
 98%|██████████████████████████████████████████████████████████████████████████████████████████████████████████▏ | 342/348 [1:24:16<01:22, 13.68s/it] 99%|██████████████████████████████████████████████████████████████████████████████████████████████████████████▍ | 343/348 [1:24:29<01:08, 13.66s/it]                                                                                                                                                     {'loss': 0.0312, 'grad_norm': 0.271484375, 'learning_rate': 7.773136505700995e-09, 'memory/max_active (GiB)': 90.74, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 93.36, 'tokens_per_second_per_gpu': 2585.62, 'epoch': 1.97}
 99%|██████████████████████████████████████████████████████████████████████████████████████████████████████████▍ | 343/348 [1:24:29<01:08, 13.66s/it] 99%|██████████████████████████████████████████████████████████████████████████████████████████████████████████▊ | 344/348 [1:24:43<00:54, 13.58s/it]                                                                                                                                                     {'loss': 0.0467, 'grad_norm': 0.5234375, 'learning_rate': 5.398438973845954e-09, 'memory/max_active (GiB)': 90.74, 'memory/max_allocated (GiB)': 89.15, 'memory/device_reserved (GiB)': 93.36, 'tokens_per_second_per_gpu': 3047.98, 'epoch': 1.97}
 99%|██████████████████████████████████████████████████████████████████████████████████████████████████████████▊ | 344/348 [1:24:43<00:54, 13.58s/it] 99%|███████████████████████████████████████████████████████████████████████████████████████████████████████████ | 345/348 [1:24:57<00:41, 13.70s/it]                                                                                                                                                     {'loss': 0.0341, 'grad_norm': 0.33984375, 'learning_rate': 3.4552248167507576e-09, 'memory/max_active (GiB)': 90.74, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 93.36, 'tokens_per_second_per_gpu': 2928.57, 'epoch': 1.98}
 99%|███████████████████████████████████████████████████████████████████████████████████████████████████████████ | 345/348 [1:24:57<00:41, 13.70s/it] 99%|███████████████████████████████████████████████████████████████████████████████████████████████████████████▍| 346/348 [1:25:10<00:27, 13.67s/it]                                                                                                                                                     {'loss': 0.0179, 'grad_norm': 0.2275390625, 'learning_rate': 1.943661908586636e-09, 'memory/max_active (GiB)': 90.74, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 93.36, 'tokens_per_second_per_gpu': 2693.28, 'epoch': 1.98}
 99%|███████████████████████████████████████████████████████████████████████████████████████████████████████████▍| 346/348 [1:25:10<00:27, 13.67s/it]100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████▋| 347/348 [1:25:24<00:13, 13.58s/it]                                                                                                                                                     {'loss': 0.0238, 'grad_norm': 0.21875, 'learning_rate': 8.638808331973281e-10, 'memory/max_active (GiB)': 90.74, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 93.36, 'tokens_per_second_per_gpu': 2583.71, 'epoch': 1.99}
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████▋| 347/348 [1:25:24<00:13, 13.58s/it]100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████| 348/348 [1:25:38<00:00, 13.69s/it]                                                                                                                                                     {'loss': 0.0296, 'grad_norm': 0.271484375, 'learning_rate': 2.1597487281366236e-10, 'memory/max_active (GiB)': 90.74, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 93.36, 'tokens_per_second_per_gpu': 2041.17, 'epoch': 1.99}
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████| 348/348 [1:25:38<00:00, 13.69s/it][2026-01-06 07:58:31,538] [INFO] [axolotl.core.trainers.base.evaluate:376] [PID:5347] Running evaluation step...
[2026-01-06 07:58:34,105] [DEBUG] [axolotl.utils.samplers.multipack.__len__:458] [PID:5347] generate_batches time: 0.7597558498382568
[2026-01-06 07:58:34,875] [DEBUG] [axolotl.utils.samplers.multipack.__len__:458] [PID:5347] generate_batches time: 0.7698726654052734
[2026-01-06 07:58:35,665] [DEBUG] [axolotl.utils.samplers.multipack.__len__:458] [PID:5347] generate_batches time: 0.789264440536499
[2026-01-06 07:58:36,418] [DEBUG] [axolotl.utils.samplers.multipack.__len__:458] [PID:5347] generate_batches time: 0.7527258396148682
[2026-01-06 07:58:36,548] [INFO] [axolotl.utils.samplers.multipack.calc_min_len:434] [PID:5347] gather_len_batches: [78, 78]
[2026-01-06 07:58:36,549] [WARNING] [py.warnings._showwarnmsg:110] [PID:5347] /root/miniconda3/envs/py3.11/lib/python3.11/site-packages/torch/distributed/distributed_c10d.py:4631: UserWarning: No device id is provided via `init_process_group` or `barrier `. Using the current device set by the user. 
  warnings.warn(  # warn only once


  0%|                                                                                                                         | 0/78 [00:00<?, ?it/s][A
  3%|██▉                                                                                                              | 2/78 [00:01<01:06,  1.15it/s][A
  4%|████▎                                                                                                            | 3/78 [00:03<01:27,  1.17s/it][A
  5%|█████▊                                                                                                           | 4/78 [00:04<01:37,  1.32s/it][A
  6%|███████▏                                                                                                         | 5/78 [00:06<01:46,  1.47s/it][A
  8%|████████▋                                                                                                        | 6/78 [00:08<01:51,  1.55s/it][A
  9%|██████████▏                                                                                                      | 7/78 [00:09<01:51,  1.56s/it][A
 10%|███████████▌                                                                                                     | 8/78 [00:11<01:52,  1.61s/it][A
 12%|█████████████                                                                                                    | 9/78 [00:13<01:51,  1.61s/it][A
 13%|██████████████▎                                                                                                 | 10/78 [00:14<01:50,  1.63s/it][A
 14%|███████████████▊                                                                                                | 11/78 [00:16<01:49,  1.63s/it][A
 15%|█████████████████▏                                                                                              | 12/78 [00:18<01:47,  1.63s/it][A
 17%|██████████████████▋                                                                                             | 13/78 [00:19<01:46,  1.64s/it][A
 18%|████████████████████                                                                                            | 14/78 [00:21<01:43,  1.61s/it][A
 19%|█████████████████████▌                                                                                          | 15/78 [00:23<01:41,  1.61s/it][A
 21%|██████████████████████▉                                                                                         | 16/78 [00:24<01:39,  1.61s/it][A
 22%|████████████████████████▍                                                                                       | 17/78 [00:26<01:40,  1.65s/it][A
 23%|█████████████████████████▊                                                                                      | 18/78 [00:27<01:37,  1.62s/it][A
 24%|███████████████████████████▎                                                                                    | 19/78 [00:29<01:36,  1.63s/it][A
 26%|████████████████████████████▋                                                                                   | 20/78 [00:31<01:34,  1.62s/it][A
 27%|██████████████████████████████▏                                                                                 | 21/78 [00:32<01:33,  1.64s/it][A
 28%|███████████████████████████████▌                                                                                | 22/78 [00:34<01:30,  1.62s/it][A
 29%|█████████████████████████████████                                                                               | 23/78 [00:35<01:27,  1.59s/it][A
 31%|██████████████████████████████████▍                                                                             | 24/78 [00:37<01:25,  1.58s/it][A
 32%|███████████████████████████████████▉                                                                            | 25/78 [00:39<01:23,  1.58s/it][A
 33%|█████████████████████████████████████▎                                                                          | 26/78 [00:40<01:22,  1.58s/it][A
 35%|██████████████████████████████████████▊                                                                         | 27/78 [00:42<01:20,  1.58s/it][A
 36%|████████████████████████████████████████▏                                                                       | 28/78 [00:43<01:19,  1.59s/it][A
 37%|█████████████████████████████████████████▋                                                                      | 29/78 [00:45<01:19,  1.62s/it][A
 38%|███████████████████████████████████████████                                                                     | 30/78 [00:47<01:17,  1.61s/it][A
 40%|████████████████████████████████████████████▌                                                                   | 31/78 [00:48<01:14,  1.58s/it][A
 41%|█████████████████████████████████████████████▉                                                                  | 32/78 [00:50<01:13,  1.59s/it][A
 42%|███████████████████████████████████████████████▍                                                                | 33/78 [00:51<01:12,  1.61s/it][A
 44%|████████████████████████████████████████████████▊                                                               | 34/78 [00:53<01:11,  1.64s/it][A
 45%|██████████████████████████████████████████████████▎                                                             | 35/78 [00:55<01:10,  1.64s/it][A
 46%|███████████████████████████████████████████████████▋                                                            | 36/78 [00:56<01:08,  1.62s/it][A
 47%|█████████████████████████████████████████████████████▏                                                          | 37/78 [00:58<01:06,  1.63s/it][A
 49%|██████████████████████████████████████████████████████▌                                                         | 38/78 [01:00<01:04,  1.62s/it][A
 50%|████████████████████████████████████████████████████████                                                        | 39/78 [01:01<01:02,  1.61s/it][A
 51%|█████████████████████████████████████████████████████████▍                                                      | 40/78 [01:03<01:01,  1.61s/it][A
 53%|██████████████████████████████████████████████████████████▊                                                     | 41/78 [01:05<01:01,  1.65s/it][A
 54%|████████████████████████████████████████████████████████████▎                                                   | 42/78 [01:06<00:59,  1.64s/it][A
 55%|█████████████████████████████████████████████████████████████▋                                                  | 43/78 [01:08<00:58,  1.66s/it][A
 56%|███████████████████████████████████████████████████████████████▏                                                | 44/78 [01:10<00:57,  1.69s/it][A
 58%|████████████████████████████████████████████████████████████████▌                                               | 45/78 [01:11<00:56,  1.71s/it][A
 59%|██████████████████████████████████████████████████████████████████                                              | 46/78 [01:13<00:54,  1.71s/it][A
 60%|███████████████████████████████████████████████████████████████████▍                                            | 47/78 [01:15<00:53,  1.71s/it][A
 62%|████████████████████████████████████████████████████████████████████▉                                           | 48/78 [01:16<00:50,  1.70s/it][A
 63%|██████████████████████████████████████████████████████████████████████▎                                         | 49/78 [01:18<00:48,  1.67s/it][A
 64%|███████████████████████████████████████████████████████████████████████▊                                        | 50/78 [01:20<00:46,  1.67s/it][A
 65%|█████████████████████████████████████████████████████████████████████████▏                                      | 51/78 [01:21<00:44,  1.67s/it][A
 67%|██████████████████████████████████████████████████████████████████████████▋                                     | 52/78 [01:23<00:43,  1.66s/it][A
 68%|████████████████████████████████████████████████████████████████████████████                                    | 53/78 [01:25<00:42,  1.68s/it][A
 69%|█████████████████████████████████████████████████████████████████████████████▌                                  | 54/78 [01:26<00:39,  1.65s/it][A
 71%|██████████████████████████████████████████████████████████████████████████████▉                                 | 55/78 [01:28<00:37,  1.64s/it][A
 72%|████████████████████████████████████████████████████████████████████████████████▍                               | 56/78 [01:30<00:35,  1.61s/it][A
 73%|█████████████████████████████████████████████████████████████████████████████████▊                              | 57/78 [01:31<00:33,  1.59s/it][A
 74%|███████████████████████████████████████████████████████████████████████████████████▎                            | 58/78 [01:33<00:31,  1.59s/it][A
 76%|████████████████████████████████████████████████████████████████████████████████████▋                           | 59/78 [01:34<00:29,  1.57s/it][A
 77%|██████████████████████████████████████████████████████████████████████████████████████▏                         | 60/78 [01:36<00:28,  1.58s/it][A
 78%|███████████████████████████████████████████████████████████████████████████████████████▌                        | 61/78 [01:37<00:26,  1.56s/it][A
 79%|█████████████████████████████████████████████████████████████████████████████████████████                       | 62/78 [01:39<00:25,  1.57s/it][A
 81%|██████████████████████████████████████████████████████████████████████████████████████████▍                     | 63/78 [01:41<00:24,  1.61s/it][A
 82%|███████████████████████████████████████████████████████████████████████████████████████████▉                    | 64/78 [01:42<00:22,  1.60s/it][A
 83%|█████████████████████████████████████████████████████████████████████████████████████████████▎                  | 65/78 [01:44<00:21,  1.62s/it][A
 85%|██████████████████████████████████████████████████████████████████████████████████████████████▊                 | 66/78 [01:45<00:19,  1.59s/it][A
 86%|████████████████████████████████████████████████████████████████████████████████████████████████▏               | 67/78 [01:47<00:17,  1.62s/it][A
 87%|█████████████████████████████████████████████████████████████████████████████████████████████████▋              | 68/78 [01:49<00:16,  1.63s/it][A
 88%|███████████████████████████████████████████████████████████████████████████████████████████████████             | 69/78 [01:50<00:14,  1.64s/it][A
 90%|████████████████████████████████████████████████████████████████████████████████████████████████████▌           | 70/78 [01:52<00:13,  1.66s/it][A
 91%|█████████████████████████████████████████████████████████████████████████████████████████████████████▉          | 71/78 [01:54<00:11,  1.67s/it][A
 92%|███████████████████████████████████████████████████████████████████████████████████████████████████████▍        | 72/78 [01:55<00:09,  1.66s/it][A
 94%|████████████████████████████████████████████████████████████████████████████████████████████████████████▊       | 73/78 [01:57<00:08,  1.65s/it][A
 95%|██████████████████████████████████████████████████████████████████████████████████████████████████████████▎     | 74/78 [01:59<00:06,  1.63s/it][A
 96%|███████████████████████████████████████████████████████████████████████████████████████████████████████████▋    | 75/78 [02:00<00:04,  1.63s/it][A
 97%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████▏  | 76/78 [02:02<00:03,  1.64s/it][A
 99%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████▌ | 77/78 [02:04<00:01,  1.63s/it][A
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 78/78 [02:05<00:00,  1.63s/it][A                                                                                                                                                     
                                                                                                                                                     [A{'eval_loss': 0.0018502993043512106, 'eval_runtime': 128.5707, 'eval_samples_per_second': 1.447, 'eval_steps_per_second': 0.723, 'memory/max_active (GiB)': 85.95, 'memory/max_allocated (GiB)': 82.72, 'memory/device_reserved (GiB)': 93.36, 'epoch': 1.99}
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████| 348/348 [1:27:51<00:00, 13.69s/it]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 78/78 [02:06<00:00,  1.63s/it][A
                                                                                                                                                     [A[2026-01-06 08:00:45,125] [INFO] [axolotl.core.trainers.base._save:671] [PID:5347] Saving model checkpoint to /workspace/data/model-output-base/checkpoint-348
                                                                                                                                                     {'train_runtime': 5371.8241, 'train_samples_per_second': 0.518, 'train_steps_per_second': 0.065, 'train_loss': 0.22673210777053288, 'memory/max_active (GiB)': 57.54, 'memory/max_allocated (GiB)': 54.45, 'memory/device_reserved (GiB)': 93.35, 'epoch': 1.99}
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████| 348/348 [1:29:31<00:00, 13.69s/it]100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████| 348/348 [1:29:31<00:00, 15.44s/it]
[2026-01-06 08:02:56,826] [INFO] [axolotl.train.save_trained_model:218] [PID:5347] Training completed! Saving trained model to /workspace/data/model-output-base.
[2026-01-06 08:03:25,286] [INFO] [axolotl.train.save_trained_model:336] [PID:5347] Model successfully saved to /workspace/data/model-output-base
[2026-01-06 08:03:25,397] [INFO] [axolotl.core.trainers.base._save:671] [PID:5347] Saving model checkpoint to /workspace/data/model-output-base
Processing Files (0 / 0)                : |                                                                             |  0.00B /  0.00B            
New Data Upload                         : |                                                                             |  0.00B /  0.00B            [A

  ...model-output-base/training_args.bin: 100%|█████████████████████████████████████████████████████████████████████████| 8.98kB / 8.98kB            [A[A


  ...se/model-00002-of-00003.safetensors:   1%|▍                                                                        | 33.5MB / 4.98GB            [A[A[A


  ...se/model-00003-of-00003.safetensors:   1%|▌                                                                        | 33.5MB / 4.65GB            [A[A[A[A


  ...se/model-00001-of-00003.safetensors:   1%|▎                                                                        | 25.0MB / 4.97GB            [A[A[A[A[A

  ...model-output-base/training_args.bin: 100%|█████████████████████████████████████████████████████████████████████████| 8.98kB / 8.98kB            [A[A


  ...se/model-00002-of-00003.safetensors:   1%|▍                                                                        | 33.5MB / 4.98GB            [A[A[A


  ...se/model-00003-of-00003.safetensors:   1%|▌                                                                        | 33.5MB / 4.65GB            [A[A[A[A


  ...se/model-00001-of-00003.safetensors:   1%|▎                                                                        | 25.0MB / 4.97GB            [A[A[A[A[AProcessing Files (1 / 4)                :   1%|▍                                                                        | 92.0MB / 14.6GB,   ???B/s  

  ...model-output-base/training_args.bin: 100%|█████████████████████████████████████████████████████████████████████████| 8.98kB / 8.98kB            [A[A


  ...se/model-00002-of-00003.safetensors:   3%|██▏                                                                      |  151MB / 4.98GB            [A[A[A


  ...se/model-00003-of-00003.safetensors:   3%|██▎                                                                      |  151MB / 4.65GB            [A[A[A[A


  ...se/model-00001-of-00003.safetensors:   3%|██▏                                                                      |  147MB / 4.97GB            [A[A[A[A[AProcessing Files (1 / 4)                :   3%|██▏                                                                      |  449MB / 14.6GB, 1.78GB/s  

  ...model-output-base/training_args.bin: 100%|█████████████████████████████████████████████████████████████████████████| 8.98kB / 8.98kB            [A[A


  ...se/model-00002-of-00003.safetensors:   5%|███▉                                                                     |  268MB / 4.98GB            [A[A[A


  ...se/model-00003-of-00003.safetensors:   6%|████▏                                                                    |  268MB / 4.65GB            [A[A[A[A


  ...se/model-00001-of-00003.safetensors:   5%|███▊                                                                     |  256MB / 4.97GB            [A[A[A[A[AProcessing Files (1 / 4)                :   5%|███▉                                                                     |  793MB / 14.6GB, 1.75GB/s  

  ...model-output-base/training_args.bin: 100%|█████████████████████████████████████████████████████████████████████████| 8.98kB / 8.98kB            [A[A


  ...se/model-00002-of-00003.safetensors:   8%|█████▋                                                                   |  386MB / 4.98GB            [A[A[A


  ...se/model-00003-of-00003.safetensors:   8%|██████                                                                   |  386MB / 4.65GB            [A[A[A[A


  ...se/model-00001-of-00003.safetensors:   8%|█████▌                                                                   |  382MB / 4.97GB            [A[A[A[A[AProcessing Files (1 / 4)                :   8%|█████▊                                                                   | 1.15GB / 14.6GB, 1.77GB/s  

  ...model-output-base/training_args.bin: 100%|█████████████████████████████████████████████████████████████████████████| 8.98kB / 8.98kB            [A[A


  ...se/model-00002-of-00003.safetensors:  10%|███████▍                                                                 |  503MB / 4.98GB            [A[A[A


  ...se/model-00003-of-00003.safetensors:  11%|███████▉                                                                 |  503MB / 4.65GB            [A[A[A[A


  ...se/model-00001-of-00003.safetensors:  10%|███████▎                                                                 |  500MB / 4.97GB            [A[A[A[A[AProcessing Files (1 / 4)                :  10%|███████▌                                                                 | 1.51GB / 14.6GB, 1.77GB/s  

  ...model-output-base/training_args.bin: 100%|█████████████████████████████████████████████████████████████████████████| 8.98kB / 8.98kB            [A[A


  ...se/model-00002-of-00003.safetensors:  12%|█████████                                                                |  621MB / 4.98GB            [A[A[A


  ...se/model-00003-of-00003.safetensors:  13%|█████████▊                                                               |  621MB / 4.65GB            [A[A[A[A


  ...se/model-00001-of-00003.safetensors:  12%|█████████                                                                |  617MB / 4.97GB            [A[A[A[A[AProcessing Files (1 / 4)                :  13%|█████████▎                                                               | 1.86GB / 14.6GB, 1.77GB/s  

  ...model-output-base/training_args.bin: 100%|█████████████████████████████████████████████████████████████████████████| 8.98kB / 8.98kB            [A[A


  ...se/model-00002-of-00003.safetensors:  13%|█████████▊                                                               |  671MB / 4.98GB            [A[A[A


  ...se/model-00003-of-00003.safetensors:  16%|███████████▌                                                             |  738MB / 4.65GB            [A[A[A[A


  ...se/model-00001-of-00003.safetensors:  15%|██████████▊                                                              |  735MB / 4.97GB            [A[A[A[A[AProcessing Files (1 / 4)                :  15%|██████████▋                                                              | 2.14GB / 14.6GB, 1.71GB/s  

  ...model-output-base/training_args.bin: 100%|█████████████████████████████████████████████████████████████████████████| 8.98kB / 8.98kB            [A[A


  ...se/model-00002-of-00003.safetensors:  13%|█████████▊                                                               |  671MB / 4.98GB            [A[A[A


  ...se/model-00003-of-00003.safetensors:  18%|█████████████▍                                                           |  856MB / 4.65GB            [A[A[A[A


  ...se/model-00001-of-00003.safetensors:  17%|████████████▌                                                            |  852MB / 4.97GB            [A[A[A[A[AProcessing Files (1 / 4)                :  16%|███████████▉                                                             | 2.38GB / 14.6GB, 1.63GB/s  

  ...model-output-base/training_args.bin: 100%|█████████████████████████████████████████████████████████████████████████| 8.98kB / 8.98kB            [A[A


  ...se/model-00002-of-00003.safetensors:  15%|██████████▊                                                              |  737MB / 4.98GB            [A[A[A


  ...se/model-00003-of-00003.safetensors:  21%|███████████████▎                                                         |  973MB / 4.65GB            [A[A[A[A


  ...se/model-00001-of-00003.safetensors:  20%|██████████████▏                                                          |  969MB / 4.97GB            [A[A[A[A[AProcessing Files (1 / 4)                :  18%|█████████████▍                                                           | 2.68GB / 14.6GB, 1.62GB/s  
New Data Upload                         :  50%|████████████████████████████████████▎                                    | 66.6MB /  134MB, 41.6MB/s  [A

  ...model-output-base/training_args.bin: 100%|█████████████████████████████████████████████████████████████████████████| 8.98kB / 8.98kB            [A[A


  ...se/model-00002-of-00003.safetensors:  15%|███████████▎                                                             |  771MB / 4.98GB            [A[A[A


  ...se/model-00003-of-00003.safetensors:  23%|█████████████████▏                                                       | 1.09GB / 4.65GB            [A[A[A[A


  ...se/model-00001-of-00003.safetensors:  22%|███████████████▉                                                         | 1.09GB / 4.97GB            [A[A[A[A[AProcessing Files (1 / 4)                :  20%|██████████████▋                                                          | 2.95GB / 14.6GB, 1.59GB/s  
New Data Upload                         :  50%|████████████████████████████████████▎                                    | 99.9MB /  201MB, 55.5MB/s  [A

  ...model-output-base/training_args.bin: 100%|█████████████████████████████████████████████████████████████████████████| 8.98kB / 8.98kB            [A[A


  ...se/model-00002-of-00003.safetensors:  16%|███████████▊                                                             |  804MB / 4.98GB            [A[A[A


  ...se/model-00003-of-00003.safetensors:  26%|██████████████████▉                                                      | 1.21GB / 4.65GB            [A[A[A[A


  ...se/model-00001-of-00003.safetensors:  24%|█████████████████▋                                                       | 1.20GB / 4.97GB            [A[A[A[A[AProcessing Files (1 / 4)                :  22%|████████████████                                                         | 3.22GB / 14.6GB, 1.56GB/s  
New Data Upload                         :  66%|████████████████████████████████████████████████▎                        |  133MB /  201MB, 66.6MB/s  [A

  ...model-output-base/training_args.bin: 100%|█████████████████████████████████████████████████████████████████████████| 8.98kB / 8.98kB            [A[A


  ...se/model-00002-of-00003.safetensors:  17%|████████████▊                                                            |  871MB / 4.98GB            [A[A[A


  ...se/model-00003-of-00003.safetensors:  29%|████████████████████▊                                                    | 1.33GB / 4.65GB            [A[A[A[A


  ...se/model-00001-of-00003.safetensors:  27%|███████████████████▍                                                     | 1.32GB / 4.97GB            [A[A[A[A[AProcessing Files (1 / 4)                :  24%|█████████████████▌                                                       | 3.52GB / 14.6GB, 1.56GB/s  
New Data Upload                         :  75%|██████████████████████████████████████████████████████▍                  |  200MB /  268MB, 91.0MB/s  [A

  ...model-output-base/training_args.bin: 100%|█████████████████████████████████████████████████████████████████████████| 8.98kB / 8.98kB            [A[A


  ...se/model-00002-of-00003.safetensors:  19%|█████████████▊                                                           |  938MB / 4.98GB            [A[A[A


  ...se/model-00003-of-00003.safetensors:  31%|██████████████████████▋                                                  | 1.44GB / 4.65GB            [A[A[A[A


  ...se/model-00001-of-00003.safetensors:  29%|█████████████████████▏                                                   | 1.44GB / 4.97GB            [A[A[A[A[AProcessing Files (1 / 4)                :  26%|███████████████████                                                      | 3.82GB / 14.6GB, 1.55GB/s  
New Data Upload                         :  80%|██████████████████████████████████████████████████████████▎              |  268MB /  335MB,  112MB/s  [A

  ...model-output-base/training_args.bin: 100%|█████████████████████████████████████████████████████████████████████████| 8.98kB / 8.98kB            [A[A


  ...se/model-00002-of-00003.safetensors:  20%|██████████████▋                                                          | 1.01GB / 4.98GB            [A[A[A


  ...se/model-00003-of-00003.safetensors:  34%|████████████████████████▌                                                | 1.56GB / 4.65GB            [A[A[A[A


  ...se/model-00001-of-00003.safetensors:  31%|██████████████████████▊                                                  | 1.56GB / 4.97GB            [A[A[A[A[AProcessing Files (1 / 4)                :  28%|████████████████████▌                                                    | 4.12GB / 14.6GB, 1.55GB/s  
New Data Upload                         : 100%|████████████████████████████████████████████████████████████████████████▊|  335MB /  335MB,  129MB/s  [A

  ...model-output-base/training_args.bin: 100%|█████████████████████████████████████████████████████████████████████████| 8.98kB / 8.98kB            [A[A


  ...se/model-00002-of-00003.safetensors:  21%|███████████████                                                          | 1.03GB / 4.98GB            [A[A[A


  ...se/model-00003-of-00003.safetensors:  36%|██████████████████████████▎                                              | 1.68GB / 4.65GB            [A[A[A[A


  ...se/model-00001-of-00003.safetensors:  34%|████████████████████████▌                                                | 1.67GB / 4.97GB            [A[A[A[A[AProcessing Files (1 / 4)                :  30%|█████████████████████▉                                                   | 4.38GB / 14.6GB, 1.53GB/s  
New Data Upload                         :  88%|████████████████████████████████████████████████████████████████▎        |  355MB /  402MB,  127MB/s  [A

  ...model-output-base/training_args.bin: 100%|█████████████████████████████████████████████████████████████████████████| 8.98kB / 8.98kB            [A[A


  ...se/model-00002-of-00003.safetensors:  22%|███████████████▋                                                         | 1.07GB / 4.98GB            [A[A[A


  ...se/model-00003-of-00003.safetensors:  39%|████████████████████████████▏                                            | 1.80GB / 4.65GB            [A[A[A[A


  ...se/model-00001-of-00003.safetensors:  36%|██████████████████████████▎                                              | 1.79GB / 4.97GB            [A[A[A[A[AProcessing Files (1 / 4)                :  32%|███████████████████████▎                                                 | 4.66GB / 14.6GB, 1.52GB/s  
New Data Upload                         :  86%|██████████████████████████████████████████████████████████████▍          |  402MB /  469MB,  134MB/s  [A

  ...model-output-base/training_args.bin: 100%|█████████████████████████████████████████████████████████████████████████| 8.98kB / 8.98kB            [A[A


  ...se/model-00002-of-00003.safetensors:  23%|████████████████▋                                                        | 1.14GB / 4.98GB            [A[A[A


  ...se/model-00003-of-00003.safetensors:  41%|██████████████████████████████                                           | 1.91GB / 4.65GB            [A[A[A[A


  ...se/model-00001-of-00003.safetensors:  38%|████████████████████████████                                             | 1.91GB / 4.97GB            [A[A[A[A[AProcessing Files (1 / 4)                :  34%|████████████████████████▊                                                | 4.96GB / 14.6GB, 1.52GB/s  
New Data Upload                         :  87%|███████████████████████████████████████████████████████████████▋         |  468MB /  537MB,  146MB/s  [A

  ...model-output-base/training_args.bin: 100%|█████████████████████████████████████████████████████████████████████████| 8.98kB / 8.98kB            [A[A


  ...se/model-00002-of-00003.safetensors:  23%|████████████████▋                                                        | 1.14GB / 4.98GB            [A[A[A


  ...se/model-00003-of-00003.safetensors:  42%|██████████████████████████████▌                                          | 1.94GB / 4.65GB            [A[A[A[A


  ...se/model-00001-of-00003.safetensors:  41%|█████████████████████████████▊                                           | 2.03GB / 4.97GB            [A[A[A[A[AProcessing Files (1 / 4)                :  35%|█████████████████████████▌                                               | 5.11GB / 14.6GB, 1.48GB/s  
New Data Upload                         :  78%|████████████████████████████████████████████████████████▋                |  469MB /  604MB,  138MB/s  [A

  ...model-output-base/training_args.bin: 100%|█████████████████████████████████████████████████████████████████████████| 8.98kB / 8.98kB            [A[A


  ...se/model-00002-of-00003.safetensors:  24%|█████████████████▋                                                       | 1.21GB / 4.98GB            [A[A[A


  ...se/model-00003-of-00003.safetensors:  43%|███████████████████████████████▌                                         | 2.01GB / 4.65GB            [A[A[A[A


  ...se/model-00001-of-00003.safetensors:  43%|███████████████████████████████▍                                         | 2.14GB / 4.97GB            [A[A[A[A[AProcessing Files (1 / 4)                :  37%|██████████████████████████▊                                              | 5.36GB / 14.6GB, 1.46GB/s  
New Data Upload                         :  90%|█████████████████████████████████████████████████████████████████▍       |  601MB /  671MB,  167MB/s  [A

  ...model-output-base/training_args.bin: 100%|█████████████████████████████████████████████████████████████████████████| 8.98kB / 8.98kB            [A[A


  ...se/model-00002-of-00003.safetensors:  26%|██████████████████▋                                                      | 1.27GB / 4.98GB            [A[A[A


  ...se/model-00003-of-00003.safetensors:  43%|███████████████████████████████▌                                         | 2.01GB / 4.65GB            [A[A[A[A


  ...se/model-00001-of-00003.safetensors:  45%|█████████████████████████████████▏                                       | 2.26GB / 4.97GB            [A[A[A[A[AProcessing Files (1 / 4)                :  38%|███████████████████████████▋                                             | 5.55GB / 14.6GB, 1.43GB/s  
New Data Upload                         :  83%|████████████████████████████████████████████████████████████▋            |  669MB /  805MB,  176MB/s  [A

  ...model-output-base/training_args.bin: 100%|█████████████████████████████████████████████████████████████████████████| 8.98kB / 8.98kB            [A[A


  ...se/model-00002-of-00003.safetensors:  27%|███████████████████▋                                                     | 1.34GB / 4.98GB            [A[A[A


  ...se/model-00003-of-00003.safetensors:  45%|████████████████████████████████▋                                        | 2.08GB / 4.65GB            [A[A[A[A


  ...se/model-00001-of-00003.safetensors:  48%|██████████████████████████████████▉                                      | 2.38GB / 4.97GB            [A[A[A[A[AProcessing Files (1 / 4)                :  40%|████████████████████████████▉                                            | 5.80GB / 14.6GB, 1.43GB/s  
New Data Upload                         :  85%|██████████████████████████████████████████████████████████████▎          |  802MB /  939MB,  200MB/s  [A

  ...model-output-base/training_args.bin: 100%|█████████████████████████████████████████████████████████████████████████| 8.98kB / 8.98kB            [A[A


  ...se/model-00002-of-00003.safetensors:  27%|███████████████████▋                                                     | 1.34GB / 4.98GB            [A[A[A


  ...se/model-00003-of-00003.safetensors:  46%|█████████████████████████████████▋                                       | 2.14GB / 4.65GB            [A[A[A[A


  ...se/model-00001-of-00003.safetensors:  49%|███████████████████████████████████▍                                     | 2.41GB / 4.97GB            [A[A[A[A[AProcessing Files (1 / 4)                :  40%|█████████████████████████████▌                                           | 5.90GB / 14.6GB, 1.38GB/s  
New Data Upload                         :  81%|███████████████████████████████████████████████████████████▍             |  873MB / 1.07GB,  208MB/s  [A

  ...model-output-base/training_args.bin: 100%|█████████████████████████████████████████████████████████████████████████| 8.98kB / 8.98kB            [A[A


  ...se/model-00002-of-00003.safetensors:  28%|████████████████████▌                                                    | 1.41GB / 4.98GB            [A[A[A


  ...se/model-00003-of-00003.safetensors:  46%|█████████████████████████████████▋                                       | 2.14GB / 4.65GB            [A[A[A[A


  ...se/model-00001-of-00003.safetensors:  50%|████████████████████████████████████▍                                    | 2.48GB / 4.97GB            [A[A[A[A[AProcessing Files (1 / 4)                :  41%|██████████████████████████████▏                                          | 6.03GB / 14.6GB, 1.35GB/s  
New Data Upload                         :  88%|████████████████████████████████████████████████████████████████▏        | 1.00GB / 1.14GB,  228MB/s  [A

  ...model-output-base/training_args.bin: 100%|█████████████████████████████████████████████████████████████████████████| 8.98kB / 8.98kB            [A[A


  ...se/model-00002-of-00003.safetensors:  30%|█████████████████████▌                                                   | 1.47GB / 4.98GB            [A[A[A


  ...se/model-00003-of-00003.safetensors:  48%|██████████████████████████████████▊                                      | 2.21GB / 4.65GB            [A[A[A[A


  ...se/model-00001-of-00003.safetensors:  50%|████████████████████████████████████▍                                    | 2.48GB / 4.97GB            [A[A[A[A[AProcessing Files (1 / 4)                :  42%|██████████████████████████████▊                                          | 6.16GB / 14.6GB, 1.32GB/s  
New Data Upload                         :  85%|█████████████████████████████████████████████████████████████▉           | 1.14GB / 1.34GB,  247MB/s  [A

  ...model-output-base/training_args.bin: 100%|█████████████████████████████████████████████████████████████████████████| 8.98kB / 8.98kB            [A[A


  ...se/model-00002-of-00003.safetensors:  30%|█████████████████████▌                                                   | 1.47GB / 4.98GB            [A[A[A


  ...se/model-00003-of-00003.safetensors:  49%|███████████████████████████████████▊                                     | 2.28GB / 4.65GB            [A[A[A[A


  ...se/model-00001-of-00003.safetensors:  51%|█████████████████████████████████████▍                                   | 2.55GB / 4.97GB            [A[A[A[A[AProcessing Files (1 / 4)                :  43%|███████████████████████████████▍                                         | 6.30GB / 14.6GB, 1.29GB/s  
New Data Upload                         :  90%|█████████████████████████████████████████████████████████████████▉       | 1.27GB / 1.41GB,  265MB/s  [A

  ...model-output-base/training_args.bin: 100%|█████████████████████████████████████████████████████████████████████████| 8.98kB / 8.98kB            [A[A


  ...se/model-00002-of-00003.safetensors:  31%|██████████████████████▌                                                  | 1.54GB / 4.98GB            [A[A[A


  ...se/model-00003-of-00003.safetensors:  49%|███████████████████████████████████▊                                     | 2.28GB / 4.65GB            [A[A[A[A


  ...se/model-00001-of-00003.safetensors:  53%|██████████████████████████████████████▎                                  | 2.61GB / 4.97GB            [A[A[A[A[AProcessing Files (1 / 4)                :  44%|████████████████████████████████▏                                        | 6.43GB / 14.6GB, 1.27GB/s  
New Data Upload                         :  91%|██████████████████████████████████████████████████████████████████▌      | 1.41GB / 1.54GB,  281MB/s  [A

  ...model-output-base/training_args.bin: 100%|█████████████████████████████████████████████████████████████████████████| 8.98kB / 8.98kB            [A[A


  ...se/model-00002-of-00003.safetensors:  31%|██████████████████████▌                                                  | 1.54GB / 4.98GB            [A[A[A


  ...se/model-00003-of-00003.safetensors:  51%|████████████████████████████████████▊                                    | 2.35GB / 4.65GB            [A[A[A[A


  ...se/model-00001-of-00003.safetensors:  53%|██████████████████████████████████████▎                                  | 2.61GB / 4.97GB            [A[A[A[A[AProcessing Files (1 / 4)                :  45%|████████████████████████████████▌                                        | 6.50GB / 14.6GB, 1.23GB/s  
New Data Upload                         :  88%|████████████████████████████████████████████████████████████████▏        | 1.47GB / 1.68GB,  283MB/s  [A

  ...model-output-base/training_args.bin: 100%|█████████████████████████████████████████████████████████████████████████| 8.98kB / 8.98kB            [A[A


  ...se/model-00002-of-00003.safetensors:  32%|███████████████████████▌                                                 | 1.61GB / 4.98GB            [A[A[A


  ...se/model-00003-of-00003.safetensors:  51%|████████████████████████████████████▊                                    | 2.35GB / 4.65GB            [A[A[A[A


  ...se/model-00001-of-00003.safetensors:  54%|███████████████████████████████████████▎                                 | 2.68GB / 4.97GB            [A[A[A[A[AProcessing Files (1 / 4)                :  45%|█████████████████████████████████▏                                       | 6.63GB / 14.6GB, 1.21GB/s  
New Data Upload                         :  89%|████████████████████████████████████████████████████████████████▊        | 1.61GB / 1.81GB,  298MB/s  [A

  ...model-output-base/training_args.bin: 100%|█████████████████████████████████████████████████████████████████████████| 8.98kB / 8.98kB            [A[A


  ...se/model-00002-of-00003.safetensors:  33%|████████████████████████▎                                                | 1.66GB / 4.98GB            [A[A[A


  ...se/model-00003-of-00003.safetensors:  52%|█████████████████████████████████████▉                                   | 2.41GB / 4.65GB            [A[A[A[A


  ...se/model-00001-of-00003.safetensors:  54%|███████████████████████████████████████▎                                 | 2.68GB / 4.97GB            [A[A[A[A[AProcessing Files (1 / 4)                :  46%|█████████████████████████████████▊                                       | 6.75GB / 14.6GB, 1.19GB/s  
New Data Upload                         :  92%|███████████████████████████████████████████████████████████████████      | 1.73GB / 1.88GB,  308MB/s  [A

  ...model-output-base/training_args.bin: 100%|█████████████████████████████████████████████████████████████████████████| 8.98kB / 8.98kB            [A[A


  ...se/model-00002-of-00003.safetensors:  34%|████████████████████████▌                                                | 1.68GB / 4.98GB            [A[A[A


  ...se/model-00003-of-00003.safetensors:  53%|██████████████████████████████████████▉                                  | 2.48GB / 4.65GB            [A[A[A[A


  ...se/model-00001-of-00003.safetensors:  55%|████████████████████████████████████████▎                                | 2.75GB / 4.97GB            [A[A[A[A[AProcessing Files (1 / 4)                :  47%|██████████████████████████████████▌                                      | 6.90GB / 14.6GB, 1.17GB/s  
New Data Upload                         :  93%|████████████████████████████████████████████████████████████████████     | 1.88GB / 2.01GB,  323MB/s  [A

  ...model-output-base/training_args.bin: 100%|█████████████████████████████████████████████████████████████████████████| 8.98kB / 8.98kB            [A[A


  ...se/model-00002-of-00003.safetensors:  35%|█████████████████████████▌                                               | 1.74GB / 4.98GB            [A[A[A


  ...se/model-00003-of-00003.safetensors:  53%|██████████████████████████████████████▉                                  | 2.48GB / 4.65GB            [A[A[A[A


  ...se/model-00001-of-00003.safetensors:  57%|█████████████████████████████████████████▎                               | 2.81GB / 4.97GB            [A[A[A[A[AProcessing Files (1 / 4)                :  48%|███████████████████████████████████▏                                     | 7.04GB / 14.6GB, 1.16GB/s  
New Data Upload                         :  94%|████████████████████████████████████████████████████████████████████▎    | 2.01GB / 2.15GB,  335MB/s  [A

  ...model-output-base/training_args.bin: 100%|█████████████████████████████████████████████████████████████████████████| 8.98kB / 8.98kB            [A[A


  ...se/model-00002-of-00003.safetensors:  36%|██████████████████████████▌                                              | 1.81GB / 4.98GB            [A[A[A


  ...se/model-00003-of-00003.safetensors:  55%|████████████████████████████████████████                                 | 2.55GB / 4.65GB            [A[A[A[A


  ...se/model-00001-of-00003.safetensors:  57%|█████████████████████████████████████████▎                               | 2.81GB / 4.97GB            [A[A[A[A[AProcessing Files (1 / 4)                :  49%|███████████████████████████████████▊                                     | 7.17GB / 14.6GB, 1.14GB/s  
New Data Upload                         :  94%|████████████████████████████████████████████████████████████████████▋    | 2.14GB / 2.28GB,  346MB/s  [A

  ...model-output-base/training_args.bin: 100%|█████████████████████████████████████████████████████████████████████████| 8.98kB / 8.98kB            [A[A


  ...se/model-00002-of-00003.safetensors:  36%|██████████████████████████▌                                              | 1.81GB / 4.98GB            [A[A[A


  ...se/model-00003-of-00003.safetensors:  55%|████████████████████████████████████████                                 | 2.55GB / 4.65GB            [A[A[A[A


  ...se/model-00001-of-00003.safetensors:  58%|██████████████████████████████████████████▎                              | 2.88GB / 4.97GB            [A[A[A[A[AProcessing Files (1 / 4)                :  50%|████████████████████████████████████▏                                    | 7.24GB / 14.6GB, 1.12GB/s  
New Data Upload                         :  94%|████████████████████████████████████████████████████████████████████▊    | 2.21GB / 2.35GB,  346MB/s  [A

  ...model-output-base/training_args.bin: 100%|█████████████████████████████████████████████████████████████████████████| 8.98kB / 8.98kB            [A[A


  ...se/model-00002-of-00003.safetensors:  36%|██████████████████████████▌                                              | 1.81GB / 4.98GB            [A[A[A


  ...se/model-00003-of-00003.safetensors:  56%|█████████████████████████████████████████                                | 2.62GB / 4.65GB            [A[A[A[A


  ...se/model-00001-of-00003.safetensors:  58%|██████████████████████████████████████████▎                              | 2.88GB / 4.97GB            [A[A[A[A[AProcessing Files (1 / 4)                :  50%|████████████████████████████████████▌                                    | 7.31GB / 14.6GB, 1.09GB/s  
New Data Upload                         :  92%|███████████████████████████████████████████████████████████████████      | 2.28GB / 2.48GB,  345MB/s  [A

  ...model-output-base/training_args.bin: 100%|█████████████████████████████████████████████████████████████████████████| 8.98kB / 8.98kB            [A[A


  ...se/model-00002-of-00003.safetensors:  38%|███████████████████████████▌                                             | 1.88GB / 4.98GB            [A[A[A


  ...se/model-00003-of-00003.safetensors:  58%|██████████████████████████████████████████▏                              | 2.68GB / 4.65GB            [A[A[A[A


  ...se/model-00001-of-00003.safetensors:  59%|███████████████████████████████████████████▎                             | 2.95GB / 4.97GB            [A[A[A[A[AProcessing Files (1 / 4)                :  51%|█████████████████████████████████████▌                                   | 7.51GB / 14.6GB, 1.09GB/s  
New Data Upload                         :  97%|███████████████████████████████████████████████████████████████████████  | 2.48GB / 2.55GB,  365MB/s  [A

  ...model-output-base/training_args.bin: 100%|█████████████████████████████████████████████████████████████████████████| 8.98kB / 8.98kB            [A[A


  ...se/model-00002-of-00003.safetensors:  39%|████████████████████████████▍                                            | 1.94GB / 4.98GB            [A[A[A


  ...se/model-00003-of-00003.safetensors:  58%|██████████████████████████████████████████▏                              | 2.68GB / 4.65GB            [A[A[A[A


  ...se/model-00001-of-00003.safetensors:  59%|███████████████████████████████████████████▎                             | 2.95GB / 4.97GB            [A[A[A[A[AProcessing Files (1 / 4)                :  52%|█████████████████████████████████████▉                                   | 7.57GB / 14.6GB, 1.07GB/s  
New Data Upload                         :  95%|█████████████████████████████████████████████████████████████████████▎   | 2.55GB / 2.68GB,  364MB/s  [A

  ...model-output-base/training_args.bin: 100%|█████████████████████████████████████████████████████████████████████████| 8.98kB / 8.98kB            [A[A


  ...se/model-00002-of-00003.safetensors:  39%|████████████████████████████▍                                            | 1.94GB / 4.98GB            [A[A[A


  ...se/model-00003-of-00003.safetensors:  59%|███████████████████████████████████████████▏                             | 2.75GB / 4.65GB            [A[A[A[A


  ...se/model-00001-of-00003.safetensors:  61%|████████████████████████████████████████████▎                            | 3.01GB / 4.97GB            [A[A[A[A[AProcessing Files (1 / 4)                :  53%|██████████████████████████████████████▌                                  | 7.71GB / 14.6GB, 1.06GB/s  
New Data Upload                         :  93%|███████████████████████████████████████████████████████████████████▊     | 2.68GB / 2.88GB,  372MB/s  [A

  ...model-output-base/training_args.bin: 100%|█████████████████████████████████████████████████████████████████████████| 8.98kB / 8.98kB            [A[A


  ...se/model-00002-of-00003.safetensors:  40%|█████████████████████████████▍                                           | 2.01GB / 4.98GB            [A[A[A


  ...se/model-00003-of-00003.safetensors:  61%|████████████████████████████████████████████▏                            | 2.82GB / 4.65GB            [A[A[A[A


  ...se/model-00001-of-00003.safetensors:  61%|████████████████████████████████████████████▎                            | 3.01GB / 4.97GB            [A[A[A[A[AProcessing Files (1 / 4)                :  54%|███████████████████████████████████████▏                                 | 7.84GB / 14.6GB, 1.05GB/s  
New Data Upload                         :  95%|█████████████████████████████████████████████████████████████████████▌   | 2.81GB / 2.95GB,  380MB/s  [A

  ...model-output-base/training_args.bin: 100%|█████████████████████████████████████████████████████████████████████████| 8.98kB / 8.98kB            [A[A


  ...se/model-00002-of-00003.safetensors:  40%|█████████████████████████████▍                                           | 2.01GB / 4.98GB            [A[A[A


  ...se/model-00003-of-00003.safetensors:  61%|████████████████████████████████████████████▏                            | 2.82GB / 4.65GB            [A[A[A[A


  ...se/model-00001-of-00003.safetensors:  62%|█████████████████████████████████████████████▎                           | 3.08GB / 4.97GB            [A[A[A[A[AProcessing Files (1 / 4)                :  54%|███████████████████████████████████████▌                                 | 7.91GB / 14.6GB, 1.03GB/s  
New Data Upload                         :  95%|█████████████████████████████████████████████████████████████████████▋   | 2.88GB / 3.02GB,  379MB/s  [A

  ...model-output-base/training_args.bin: 100%|█████████████████████████████████████████████████████████████████████████| 8.98kB / 8.98kB            [A[A


  ...se/model-00002-of-00003.safetensors:  42%|██████████████████████████████▍                                          | 2.08GB / 4.98GB            [A[A[A


  ...se/model-00003-of-00003.safetensors:  62%|█████████████████████████████████████████████▎                           | 2.88GB / 4.65GB            [A[A[A[A


  ...se/model-00001-of-00003.safetensors:  62%|█████████████████████████████████████████████▎                           | 3.08GB / 4.97GB            [A[A[A[A[AProcessing Files (1 / 4)                :  55%|████████████████████████████████████████▏                                | 8.04GB / 14.6GB, 1.02GB/s  
New Data Upload                         :  96%|█████████████████████████████████████████████████████████████████████▊   | 3.02GB / 3.15GB,  387MB/s  [A

  ...model-output-base/training_args.bin: 100%|█████████████████████████████████████████████████████████████████████████| 8.98kB / 8.98kB            [A[A


  ...se/model-00002-of-00003.safetensors:  43%|███████████████████████████████▍                                         | 2.15GB / 4.98GB            [A[A[A


  ...se/model-00003-of-00003.safetensors:  62%|█████████████████████████████████████████████▎                           | 2.88GB / 4.65GB            [A[A[A[A


  ...se/model-00001-of-00003.safetensors:  63%|██████████████████████████████████████████████▎                          | 3.15GB / 4.97GB            [A[A[A[A[AProcessing Files (1 / 4)                :  56%|████████████████████████████████████████▉                                | 8.18GB / 14.6GB, 1.01GB/s  
New Data Upload                         :  96%|█████████████████████████████████████████████████████████████████████▉   | 3.15GB / 3.29GB,  394MB/s  [A

  ...model-output-base/training_args.bin: 100%|█████████████████████████████████████████████████████████████████████████| 8.98kB / 8.98kB            [A[A


  ...se/model-00002-of-00003.safetensors:  43%|███████████████████████████████▍                                         | 2.15GB / 4.98GB            [A[A[A


  ...se/model-00003-of-00003.safetensors:  64%|██████████████████████████████████████████████▎                          | 2.95GB / 4.65GB            [A[A[A[A


  ...se/model-00001-of-00003.safetensors:  63%|██████████████████████████████████████████████▎                          | 3.15GB / 4.97GB            [A[A[A[A[AProcessing Files (1 / 4)                :  56%|█████████████████████████████████████████▏                               | 8.24GB / 14.6GB,  994MB/s  
New Data Upload                         :  96%|██████████████████████████████████████████████████████████████████████   | 3.22GB / 3.35GB,  392MB/s  [A

  ...model-output-base/training_args.bin: 100%|█████████████████████████████████████████████████████████████████████████| 8.98kB / 8.98kB            [A[A


  ...se/model-00002-of-00003.safetensors:  44%|████████████████████████████████▍                                        | 2.21GB / 4.98GB            [A[A[A


  ...se/model-00003-of-00003.safetensors:  64%|██████████████████████████████████████████████▎                          | 2.95GB / 4.65GB            [A[A[A[A


  ...se/model-00001-of-00003.safetensors:  65%|███████████████████████████████████████████████▏                         | 3.22GB / 4.97GB            [A[A[A[A[AProcessing Files (1 / 4)                :  57%|█████████████████████████████████████████▉                               | 8.38GB / 14.6GB,  987MB/s  
New Data Upload                         :  94%|████████████████████████████████████████████████████████████████████▊    | 3.35GB / 3.55GB,  399MB/s  [A

  ...model-output-base/training_args.bin: 100%|█████████████████████████████████████████████████████████████████████████| 8.98kB / 8.98kB            [A[A


  ...se/model-00002-of-00003.safetensors:  44%|████████████████████████████████▍                                        | 2.21GB / 4.98GB            [A[A[A


  ...se/model-00003-of-00003.safetensors:  65%|███████████████████████████████████████████████▍                         | 3.02GB / 4.65GB            [A[A[A[A


  ...se/model-00001-of-00003.safetensors:  66%|████████████████████████████████████████████████▏                        | 3.28GB / 4.97GB            [A[A[A[A[AProcessing Files (1 / 4)                :  58%|██████████████████████████████████████████▌                              | 8.51GB / 14.6GB,  979MB/s  
New Data Upload                         :  94%|████████████████████████████████████████████████████████████████████▉    | 3.48GB / 3.69GB,  405MB/s  [A

  ...model-output-base/training_args.bin: 100%|█████████████████████████████████████████████████████████████████████████| 8.98kB / 8.98kB            [A[A


  ...se/model-00002-of-00003.safetensors:  46%|█████████████████████████████████▍                                       | 2.28GB / 4.98GB            [A[A[A


  ...se/model-00003-of-00003.safetensors:  66%|████████████████████████████████████████████████▍                        | 3.08GB / 4.65GB            [A[A[A[A


  ...se/model-00001-of-00003.safetensors:  66%|████████████████████████████████████████████████▏                        | 3.28GB / 4.97GB            [A[A[A[A[AProcessing Files (1 / 4)                :  59%|███████████████████████████████████████████▏                             | 8.65GB / 14.6GB,  972MB/s  
New Data Upload                         :  98%|███████████████████████████████████████████████████████████████████████▌ | 3.62GB / 3.69GB,  411MB/s  [A

  ...model-output-base/training_args.bin: 100%|█████████████████████████████████████████████████████████████████████████| 8.98kB / 8.98kB            [A[A


  ...se/model-00002-of-00003.safetensors:  46%|█████████████████████████████████▍                                       | 2.28GB / 4.98GB            [A[A[A


  ...se/model-00003-of-00003.safetensors:  66%|████████████████████████████████████████████████▍                        | 3.08GB / 4.65GB            [A[A[A[A


  ...se/model-00001-of-00003.safetensors:  66%|████████████████████████████████████████████████▏                        | 3.28GB / 4.97GB            [A[A[A[A[A

  ...model-output-base/training_args.bin: 100%|█████████████████████████████████████████████████████████████████████████| 8.98kB / 8.98kB            [A[A


  ...se/model-00002-of-00003.safetensors:  47%|██████████████████████████████████▍                                      | 2.35GB / 4.98GB            [A[A[A


  ...se/model-00003-of-00003.safetensors:  68%|█████████████████████████████████████████████████▌                       | 3.15GB / 4.65GB            [A[A[A[A


  ...se/model-00001-of-00003.safetensors:  67%|█████████████████████████████████████████████████▏                       | 3.35GB / 4.97GB            [A[A[A[A[AProcessing Files (1 / 4)                :  61%|████████████████████████████████████████████▏                            | 8.85GB / 14.6GB,  952MB/s  
New Data Upload                         :  97%|██████████████████████████████████████████████████████████████████████▍  | 3.82GB / 3.96GB,  415MB/s  [A

  ...model-output-base/training_args.bin: 100%|█████████████████████████████████████████████████████████████████████████| 8.98kB / 8.98kB            [A[A


  ...se/model-00002-of-00003.safetensors:  48%|███████████████████████████████████▎                                     | 2.41GB / 4.98GB            [A[A[A


  ...se/model-00003-of-00003.safetensors:  69%|██████████████████████████████████████████████████                       | 3.19GB / 4.65GB            [A[A[A[A


  ...se/model-00001-of-00003.safetensors:  69%|██████████████████████████████████████████████████▏                      | 3.42GB / 4.97GB            [A[A[A[A[AProcessing Files (1 / 4)                :  62%|█████████████████████████████████████████████                            | 9.02GB / 14.6GB,  950MB/s  
New Data Upload                         :  98%|███████████████████████████████████████████████████████████████████████▏ | 3.99GB / 4.09GB,  425MB/s  [A

  ...model-output-base/training_args.bin: 100%|█████████████████████████████████████████████████████████████████████████| 8.98kB / 8.98kB            [A[A


  ...se/model-00002-of-00003.safetensors:  48%|███████████████████████████████████▎                                     | 2.41GB / 4.98GB            [A[A[A


  ...se/model-00003-of-00003.safetensors:  69%|██████████████████████████████████████████████████▌                      | 3.22GB / 4.65GB            [A[A[A[A


  ...se/model-00001-of-00003.safetensors:  69%|██████████████████████████████████████████████████▏                      | 3.42GB / 4.97GB            [A[A[A[A[AProcessing Files (1 / 4)                :  62%|█████████████████████████████████████████████▏                           | 9.05GB / 14.6GB,  933MB/s  
New Data Upload                         :  95%|█████████████████████████████████████████████████████████████████████▍   | 4.02GB / 4.23GB,  419MB/s  [A

  ...model-output-base/training_args.bin: 100%|█████████████████████████████████████████████████████████████████████████| 8.98kB / 8.98kB            [A[A


  ...se/model-00002-of-00003.safetensors:  50%|████████████████████████████████████▎                                    | 2.48GB / 4.98GB            [A[A[A


  ...se/model-00003-of-00003.safetensors:  69%|██████████████████████████████████████████████████▌                      | 3.22GB / 4.65GB            [A[A[A[A


  ...se/model-00001-of-00003.safetensors:  70%|███████████████████████████████████████████████████▏                     | 3.48GB / 4.97GB            [A[A[A[A[AProcessing Files (1 / 4)                :  63%|█████████████████████████████████████████████▉                           | 9.18GB / 14.6GB,  927MB/s  
New Data Upload                         :  95%|█████████████████████████████████████████████████████████████████████▌   | 4.15GB / 4.36GB,  424MB/s  [A

  ...model-output-base/training_args.bin: 100%|█████████████████████████████████████████████████████████████████████████| 8.98kB / 8.98kB            [A[A


  ...se/model-00002-of-00003.safetensors:  50%|████████████████████████████████████▎                                    | 2.48GB / 4.98GB            [A[A[A


  ...se/model-00003-of-00003.safetensors:  71%|███████████████████████████████████████████████████▌                     | 3.28GB / 4.65GB            [A[A[A[A


  ...se/model-00001-of-00003.safetensors:  71%|███████████████████████████████████████████████████▉                     | 3.53GB / 4.97GB            [A[A[A[A[AProcessing Files (1 / 4)                :  64%|██████████████████████████████████████████████▍                          | 9.30GB / 14.6GB,  921MB/s  
New Data Upload                         :  96%|██████████████████████████████████████████████████████████████████████▍  | 4.27GB / 4.43GB,  427MB/s  [A

  ...model-output-base/training_args.bin: 100%|█████████████████████████████████████████████████████████████████████████| 8.98kB / 8.98kB            [A[A


  ...se/model-00002-of-00003.safetensors:  51%|█████████████████████████████████████▎                                   | 2.55GB / 4.98GB            [A[A[A


  ...se/model-00003-of-00003.safetensors:  72%|████████████████████████████████████████████████████▋                    | 3.35GB / 4.65GB            [A[A[A[A


  ...se/model-00001-of-00003.safetensors:  71%|████████████████████████████████████████████████████▏                    | 3.55GB / 4.97GB            [A[A[A[A[AProcessing Files (1 / 4)                :  65%|███████████████████████████████████████████████▎                         | 9.45GB / 14.6GB,  917MB/s  
New Data Upload                         :  97%|██████████████████████████████████████████████████████████████████████▊  | 4.42GB / 4.56GB,  434MB/s  [A

  ...model-output-base/training_args.bin: 100%|█████████████████████████████████████████████████████████████████████████| 8.98kB / 8.98kB            [A[A


  ...se/model-00002-of-00003.safetensors:  52%|██████████████████████████████████████▎                                  | 2.61GB / 4.98GB            [A[A[A


  ...se/model-00003-of-00003.safetensors:  72%|████████████████████████████████████████████████████▋                    | 3.35GB / 4.65GB            [A[A[A[A


  ...se/model-00001-of-00003.safetensors:  73%|█████████████████████████████████████████████████████▏                   | 3.62GB / 4.97GB            [A[A[A[A[AProcessing Files (1 / 4)                :  66%|███████████████████████████████████████████████▉                         | 9.58GB / 14.6GB,  896MB/s  
New Data Upload                         :  97%|██████████████████████████████████████████████████████████████████████▊  | 4.56GB / 4.69GB,  447MB/s  [A

  ...model-output-base/training_args.bin: 100%|█████████████████████████████████████████████████████████████████████████| 8.98kB / 8.98kB            [A[A


  ...se/model-00002-of-00003.safetensors:  52%|██████████████████████████████████████▎                                  | 2.61GB / 4.98GB            [A[A[A


  ...se/model-00003-of-00003.safetensors:  72%|████████████████████████████████████████████████████▋                    | 3.35GB / 4.65GB            [A[A[A[A


  ...se/model-00001-of-00003.safetensors:  73%|█████████████████████████████████████████████████████▏                   | 3.62GB / 4.97GB            [A[A[A[A[A

  ...model-output-base/training_args.bin: 100%|█████████████████████████████████████████████████████████████████████████| 8.98kB / 8.98kB            [A[A


  ...se/model-00002-of-00003.safetensors:  52%|██████████████████████████████████████▎                                  | 2.61GB / 4.98GB            [A[A[A


  ...se/model-00003-of-00003.safetensors:  74%|█████████████████████████████████████████████████████▋                   | 3.42GB / 4.65GB            [A[A[A[A


  ...se/model-00001-of-00003.safetensors:  73%|█████████████████████████████████████████████████████▏                   | 3.62GB / 4.97GB            [A[A[A[A[AProcessing Files (1 / 4)                :  66%|████████████████████████████████████████████████▎                        | 9.65GB / 14.6GB,  833MB/s  
New Data Upload                         :  96%|█████████████████████████████████████████████████████████████████████▉   | 4.63GB / 4.83GB,  453MB/s  [A

  ...model-output-base/training_args.bin: 100%|█████████████████████████████████████████████████████████████████████████| 8.98kB / 8.98kB            [A[A


  ...se/model-00002-of-00003.safetensors:  54%|███████████████████████████████████████▎                                 | 2.68GB / 4.98GB            [A[A[A


  ...se/model-00003-of-00003.safetensors:  74%|█████████████████████████████████████████████████████▋                   | 3.42GB / 4.65GB            [A[A[A[A


  ...se/model-00001-of-00003.safetensors:  74%|██████████████████████████████████████████████████████                   | 3.69GB / 4.97GB            [A[A[A[A[AProcessing Files (1 / 4)                :  67%|████████████████████████████████████████████████▉                        | 9.79GB / 14.6GB,  812MB/s  
New Data Upload                         :  96%|██████████████████████████████████████████████████████████████████████   | 4.76GB / 4.96GB,  467MB/s  [A

  ...model-output-base/training_args.bin: 100%|█████████████████████████████████████████████████████████████████████████| 8.98kB / 8.98kB            [A[A


  ...se/model-00002-of-00003.safetensors:  54%|███████████████████████████████████████▎                                 | 2.68GB / 4.98GB            [A[A[A


  ...se/model-00003-of-00003.safetensors:  75%|██████████████████████████████████████████████████████▊                  | 3.49GB / 4.65GB            [A[A[A[A


  ...se/model-00001-of-00003.safetensors:  75%|███████████████████████████████████████████████████████                  | 3.75GB / 4.97GB            [A[A[A[A[AProcessing Files (1 / 4)                :  68%|█████████████████████████████████████████████████▌                       | 9.92GB / 14.6GB,  790MB/s  
New Data Upload                         :  97%|███████████████████████████████████████████████████████████████████████  | 4.89GB / 5.03GB,  480MB/s  [A

  ...model-output-base/training_args.bin: 100%|█████████████████████████████████████████████████████████████████████████| 8.98kB / 8.98kB            [A[A


  ...se/model-00002-of-00003.safetensors:  55%|████████████████████████████████████████▎                                | 2.75GB / 4.98GB            [A[A[A


  ...se/model-00003-of-00003.safetensors:  76%|███████████████████████████████████████████████████████▊                 | 3.55GB / 4.65GB            [A[A[A[A


  ...se/model-00001-of-00003.safetensors:  75%|███████████████████████████████████████████████████████                  | 3.75GB / 4.97GB            [A[A[A[A[AProcessing Files (1 / 4)                :  69%|██████████████████████████████████████████████████▎                      | 10.1GB / 14.6GB,  776MB/s  
New Data Upload                         :  97%|███████████████████████████████████████████████████████████████████████  | 5.03GB / 5.16GB,  493MB/s  [A

  ...model-output-base/training_args.bin: 100%|█████████████████████████████████████████████████████████████████████████| 8.98kB / 8.98kB            [A[A


  ...se/model-00002-of-00003.safetensors:  55%|████████████████████████████████████████▎                                | 2.75GB / 4.98GB            [A[A[A


  ...se/model-00003-of-00003.safetensors:  76%|███████████████████████████████████████████████████████▊                 | 3.55GB / 4.65GB            [A[A[A[A


  ...se/model-00001-of-00003.safetensors:  77%|████████████████████████████████████████████████████████                 | 3.82GB / 4.97GB            [A[A[A[A[AProcessing Files (1 / 4)                :  69%|██████████████████████████████████████████████████▌                      | 10.1GB / 14.6GB,  759MB/s  
New Data Upload                         :  97%|███████████████████████████████████████████████████████████████████████  | 5.09GB / 5.23GB,  499MB/s  [A

  ...model-output-base/training_args.bin: 100%|█████████████████████████████████████████████████████████████████████████| 8.98kB / 8.98kB            [A[A


  ...se/model-00002-of-00003.safetensors:  57%|█████████████████████████████████████████▎                               | 2.82GB / 4.98GB            [A[A[A


  ...se/model-00003-of-00003.safetensors:  78%|████████████████████████████████████████████████████████▊                | 3.62GB / 4.65GB            [A[A[A[A


  ...se/model-00001-of-00003.safetensors:  77%|████████████████████████████████████████████████████████                 | 3.82GB / 4.97GB            [A[A[A[A[AProcessing Files (1 / 4)                :  70%|███████████████████████████████████████████████████▎                     | 10.2GB / 14.6GB,  742MB/s  
New Data Upload                         :  97%|███████████████████████████████████████████████████████████████████████  | 5.22GB / 5.37GB,  506MB/s  [A

  ...model-output-base/training_args.bin: 100%|█████████████████████████████████████████████████████████████████████████| 8.98kB / 8.98kB            [A[A


  ...se/model-00002-of-00003.safetensors:  58%|██████████████████████████████████████████▏                              | 2.88GB / 4.98GB            [A[A[A


  ...se/model-00003-of-00003.safetensors:  78%|████████████████████████████████████████████████████████▉                | 3.62GB / 4.65GB            [A[A[A[A


  ...se/model-00001-of-00003.safetensors:  78%|█████████████████████████████████████████████████████████                | 3.89GB / 4.97GB            [A[A[A[A[AProcessing Files (1 / 4)                :  71%|███████████████████████████████████████████████████▉                     | 10.4GB / 14.6GB,  730MB/s  
New Data Upload                         :  99%|████████████████████████████████████████████████████████████████████████ | 5.36GB / 5.43GB,  516MB/s  [A

  ...model-output-base/training_args.bin: 100%|█████████████████████████████████████████████████████████████████████████| 8.98kB / 8.98kB            [A[A


  ...se/model-00002-of-00003.safetensors:  58%|██████████████████████████████████████████▏                              | 2.88GB / 4.98GB            [A[A[A


  ...se/model-00003-of-00003.safetensors:  79%|█████████████████████████████████████████████████████████▉               | 3.69GB / 4.65GB            [A[A[A[A


  ...se/model-00001-of-00003.safetensors:  78%|█████████████████████████████████████████████████████████                | 3.89GB / 4.97GB            [A[A[A[A[AProcessing Files (1 / 4)                :  72%|████████████████████████████████████████████████████▎                    | 10.5GB / 14.6GB,  710MB/s  
New Data Upload                         :  98%|███████████████████████████████████████████████████████████████████████▏ | 5.43GB / 5.57GB,  519MB/s  [A

  ...model-output-base/training_args.bin: 100%|█████████████████████████████████████████████████████████████████████████| 8.98kB / 8.98kB            [A[A


  ...se/model-00002-of-00003.safetensors:  58%|██████████████████████████████████████████▏                              | 2.88GB / 4.98GB            [A[A[A


  ...se/model-00003-of-00003.safetensors:  79%|█████████████████████████████████████████████████████████▉               | 3.69GB / 4.65GB            [A[A[A[A


  ...se/model-00001-of-00003.safetensors:  80%|██████████████████████████████████████████████████████████               | 3.95GB / 4.97GB            [A[A[A[A[AProcessing Files (1 / 4)                :  72%|████████████████████████████████████████████████████▋                    | 10.5GB / 14.6GB,  687MB/s  
New Data Upload                         :  96%|██████████████████████████████████████████████████████████████████████▍  | 5.50GB / 5.70GB,  519MB/s  [A

  ...model-output-base/training_args.bin: 100%|█████████████████████████████████████████████████████████████████████████| 8.98kB / 8.98kB            [A[A


  ...se/model-00002-of-00003.safetensors:  59%|███████████████████████████████████████████▏                             | 2.95GB / 4.98GB            [A[A[A


  ...se/model-00003-of-00003.safetensors:  79%|█████████████████████████████████████████████████████████▉               | 3.69GB / 4.65GB            [A[A[A[A


  ...se/model-00001-of-00003.safetensors:  80%|██████████████████████████████████████████████████████████               | 3.95GB / 4.97GB            [A[A[A[A[AProcessing Files (1 / 4)                :  73%|████████████████████████████████████████████████████▉                    | 10.6GB / 14.6GB,  664MB/s  
New Data Upload                         :  96%|██████████████████████████████████████████████████████████████████████▍  | 5.56GB / 5.77GB,  519MB/s  [A

  ...model-output-base/training_args.bin: 100%|█████████████████████████████████████████████████████████████████████████| 8.98kB / 8.98kB            [A[A


  ...se/model-00002-of-00003.safetensors:  59%|███████████████████████████████████████████▏                             | 2.95GB / 4.98GB            [A[A[A


  ...se/model-00003-of-00003.safetensors:  81%|███████████████████████████████████████████████████████████              | 3.75GB / 4.65GB            [A[A[A[A


  ...se/model-00001-of-00003.safetensors:  81%|██████████████████████████████████████████████████████████▉              | 4.01GB / 4.97GB            [A[A[A[A[AProcessing Files (1 / 4)                :  73%|█████████████████████████████████████████████████████▌                   | 10.7GB / 14.6GB,  647MB/s  
New Data Upload                         :  98%|███████████████████████████████████████████████████████████████████████▏ | 5.69GB / 5.83GB,  525MB/s  [A

  ...model-output-base/training_args.bin: 100%|█████████████████████████████████████████████████████████████████████████| 8.98kB / 8.98kB            [A[A


  ...se/model-00002-of-00003.safetensors:  61%|████████████████████████████████████████████▏                            | 3.02GB / 4.98GB            [A[A[A


  ...se/model-00003-of-00003.safetensors:  82%|████████████████████████████████████████████████████████████             | 3.82GB / 4.65GB            [A[A[A[A


  ...se/model-00001-of-00003.safetensors:  81%|███████████████████████████████████████████████████████████              | 4.02GB / 4.97GB            [A[A[A[A[AProcessing Files (1 / 4)                :  74%|██████████████████████████████████████████████████████▎                  | 10.9GB / 14.6GB,  636MB/s  
New Data Upload                         :  98%|███████████████████████████████████████████████████████████████████████▎ | 5.83GB / 5.97GB,  537MB/s  [A

  ...model-output-base/training_args.bin: 100%|█████████████████████████████████████████████████████████████████████████| 8.98kB / 8.98kB            [A[A


  ...se/model-00002-of-00003.safetensors:  61%|████████████████████████████████████████████▏                            | 3.02GB / 4.98GB            [A[A[A


  ...se/model-00003-of-00003.safetensors:  82%|████████████████████████████████████████████████████████████             | 3.82GB / 4.65GB            [A[A[A[A


  ...se/model-00001-of-00003.safetensors:  82%|████████████████████████████████████████████████████████████             | 4.09GB / 4.97GB            [A[A[A[A[AProcessing Files (1 / 4)                :  75%|██████████████████████████████████████████████████████▋                  | 10.9GB / 14.6GB,  614MB/s  
New Data Upload                         :  97%|██████████████████████████████████████████████████████████████████████▌  | 5.90GB / 6.10GB,  539MB/s  [A

  ...model-output-base/training_args.bin: 100%|█████████████████████████████████████████████████████████████████████████| 8.98kB / 8.98kB            [A[A


  ...se/model-00002-of-00003.safetensors:  61%|████████████████████████████████████████████▋                            | 3.05GB / 4.98GB            [A[A[A


  ...se/model-00003-of-00003.safetensors:  82%|████████████████████████████████████████████████████████████             | 3.82GB / 4.65GB            [A[A[A[A


  ...se/model-00001-of-00003.safetensors:  82%|████████████████████████████████████████████████████████████             | 4.09GB / 4.97GB            [A[A[A[A[AProcessing Files (1 / 4)                :  75%|██████████████████████████████████████████████████████▊                  | 11.0GB / 14.6GB,  588MB/s  
New Data Upload                         :  97%|██████████████████████████████████████████████████████████████████████▉  | 5.93GB / 6.10GB,  535MB/s  [A

  ...model-output-base/training_args.bin: 100%|█████████████████████████████████████████████████████████████████████████| 8.98kB / 8.98kB            [A[A


  ...se/model-00002-of-00003.safetensors:  62%|█████████████████████████████████████████████▏                           | 3.08GB / 4.98GB            [A[A[A


  ...se/model-00003-of-00003.safetensors:  84%|█████████████████████████████████████████████████████████████            | 3.89GB / 4.65GB            [A[A[A[A


  ...se/model-00001-of-00003.safetensors:  83%|████████████████████████████████████████████████████████████▌            | 4.12GB / 4.97GB            [A[A[A[A[AProcessing Files (1 / 4)                :  76%|███████████████████████████████████████████████████████▍                 | 11.1GB / 14.6GB,  587MB/s  
New Data Upload                         :  98%|███████████████████████████████████████████████████████████████████████▊ | 6.07GB / 6.17GB,  549MB/s  [A

  ...model-output-base/training_args.bin: 100%|█████████████████████████████████████████████████████████████████████████| 8.98kB / 8.98kB            [A[A


  ...se/model-00002-of-00003.safetensors:  63%|██████████████████████████████████████████████▏                          | 3.15GB / 4.98GB            [A[A[A


  ...se/model-00003-of-00003.safetensors:  84%|█████████████████████████████████████████████████████████████▍           | 3.91GB / 4.65GB            [A[A[A[A


  ...se/model-00001-of-00003.safetensors:  84%|█████████████████████████████████████████████████████████████            | 4.15GB / 4.97GB            [A[A[A[A[AProcessing Files (1 / 4)                :  77%|████████████████████████████████████████████████████████                 | 11.2GB / 14.6GB,  574MB/s  
New Data Upload                         :  97%|██████████████████████████████████████████████████████████████████████▊  | 6.18GB / 6.37GB,  547MB/s  [A

  ...model-output-base/training_args.bin: 100%|█████████████████████████████████████████████████████████████████████████| 8.98kB / 8.98kB            [A[A


  ...se/model-00002-of-00003.safetensors:  65%|███████████████████████████████████████████████▏                         | 3.22GB / 4.98GB            [A[A[A


  ...se/model-00003-of-00003.safetensors:  85%|██████████████████████████████████████████████████████████████▏          | 3.96GB / 4.65GB            [A[A[A[A


  ...se/model-00001-of-00003.safetensors:  85%|█████████████████████████████████████████████████████████████▉           | 4.22GB / 4.97GB            [A[A[A[A[AProcessing Files (1 / 4)                :  78%|████████████████████████████████████████████████████████▉                | 11.4GB / 14.6GB,  573MB/s  
New Data Upload                         :  99%|████████████████████████████████████████████████████████████████████████▏| 6.37GB / 6.44GB,  559MB/s  [A

  ...model-output-base/training_args.bin: 100%|█████████████████████████████████████████████████████████████████████████| 8.98kB / 8.98kB            [A[A


  ...se/model-00002-of-00003.safetensors:  65%|███████████████████████████████████████████████▏                         | 3.22GB / 4.98GB            [A[A[A


  ...se/model-00003-of-00003.safetensors:  85%|██████████████████████████████████████████████████████████████▏          | 3.96GB / 4.65GB            [A[A[A[A


  ...se/model-00001-of-00003.safetensors:  85%|█████████████████████████████████████████████████████████████▉           | 4.22GB / 4.97GB            [A[A[A[A[A

  ...model-output-base/training_args.bin: 100%|█████████████████████████████████████████████████████████████████████████| 8.98kB / 8.98kB            [A[A


  ...se/model-00002-of-00003.safetensors:  65%|███████████████████████████████████████████████▏                         | 3.22GB / 4.98GB            [A[A[A


  ...se/model-00003-of-00003.safetensors:  85%|██████████████████████████████████████████████████████████████▏          | 3.96GB / 4.65GB            [A[A[A[A


  ...se/model-00001-of-00003.safetensors:  85%|█████████████████████████████████████████████████████████████▉           | 4.22GB / 4.97GB            [A[A[A[A[A

  ...model-output-base/training_args.bin: 100%|█████████████████████████████████████████████████████████████████████████| 8.98kB / 8.98kB            [A[A


  ...se/model-00002-of-00003.safetensors:  66%|████████████████████████████████████████████████▏                        | 3.28GB / 4.98GB            [A[A[A


  ...se/model-00003-of-00003.safetensors:  87%|███████████████████████████████████████████████████████████████▏         | 4.02GB / 4.65GB            [A[A[A[A


  ...se/model-00001-of-00003.safetensors:  86%|██████████████████████████████████████████████████████████████▉          | 4.29GB / 4.97GB            [A[A[A[A[AProcessing Files (1 / 4)                :  79%|█████████████████████████████████████████████████████████▉               | 11.6GB / 14.6GB,  546MB/s  
New Data Upload                         :  98%|███████████████████████████████████████████████████████████████████████▍ | 6.57GB / 6.71GB,  546MB/s  [A

  ...model-output-base/training_args.bin: 100%|█████████████████████████████████████████████████████████████████████████| 8.98kB / 8.98kB            [A[A


  ...se/model-00002-of-00003.safetensors:  66%|████████████████████████████████████████████████▏                        | 3.28GB / 4.98GB            [A[A[A


  ...se/model-00003-of-00003.safetensors:  88%|████████████████████████████████████████████████████████████████▎        | 4.09GB / 4.65GB            [A[A[A[A


  ...se/model-00001-of-00003.safetensors:  88%|███████████████████████████████████████████████████████████████▉         | 4.36GB / 4.97GB            [A[A[A[A[AProcessing Files (1 / 4)                :  80%|██████████████████████████████████████████████████████████▋              | 11.7GB / 14.6GB,  546MB/s  
New Data Upload                         :  99%|████████████████████████████████████████████████████████████████████████▏| 6.70GB / 6.77GB,  546MB/s  [A

  ...model-output-base/training_args.bin: 100%|█████████████████████████████████████████████████████████████████████████| 8.98kB / 8.98kB            [A[A


  ...se/model-00002-of-00003.safetensors:  66%|████████████████████████████████████████████████▏                        | 3.28GB / 4.98GB            [A[A[A


  ...se/model-00003-of-00003.safetensors:  88%|████████████████████████████████████████████████████████████████▎        | 4.09GB / 4.65GB            [A[A[A[A


  ...se/model-00001-of-00003.safetensors:  88%|███████████████████████████████████████████████████████████████▉         | 4.36GB / 4.97GB            [A[A[A[A[A

  ...model-output-base/training_args.bin: 100%|█████████████████████████████████████████████████████████████████████████| 8.98kB / 8.98kB            [A[A


  ...se/model-00002-of-00003.safetensors:  67%|█████████████████████████████████████████████████                        | 3.35GB / 4.98GB            [A[A[A


  ...se/model-00003-of-00003.safetensors:  89%|█████████████████████████████████████████████████████████████████▏       | 4.15GB / 4.65GB            [A[A[A[A


  ...se/model-00001-of-00003.safetensors:  88%|███████████████████████████████████████████████████████████████▉         | 4.36GB / 4.97GB            [A[A[A[A[AProcessing Files (1 / 4)                :  81%|███████████████████████████████████████████████████████████▎             | 11.9GB / 14.6GB,  531MB/s  
New Data Upload                         :  98%|███████████████████████████████████████████████████████████████████████▍ | 6.83GB / 6.97GB,  531MB/s  [A

  ...model-output-base/training_args.bin: 100%|█████████████████████████████████████████████████████████████████████████| 8.98kB / 8.98kB            [A[A


  ...se/model-00002-of-00003.safetensors:  67%|█████████████████████████████████████████████████                        | 3.35GB / 4.98GB            [A[A[A


  ...se/model-00003-of-00003.safetensors:  89%|█████████████████████████████████████████████████████████████████▎       | 4.16GB / 4.65GB            [A[A[A[A


  ...se/model-00001-of-00003.safetensors:  88%|███████████████████████████████████████████████████████████████▉         | 4.36GB / 4.97GB            [A[A[A[A[AProcessing Files (1 / 4)                :  81%|███████████████████████████████████████████████████████████▎             | 11.9GB / 14.6GB,  526MB/s  
New Data Upload                         :  97%|██████████████████████████████████████████████████████████████████████▉  | 6.84GB / 7.04GB,  526MB/s  [A

  ...model-output-base/training_args.bin: 100%|█████████████████████████████████████████████████████████████████████████| 8.98kB / 8.98kB            [A[A


  ...se/model-00002-of-00003.safetensors:  69%|██████████████████████████████████████████████████                       | 3.42GB / 4.98GB            [A[A[A


  ...se/model-00003-of-00003.safetensors:  91%|██████████████████████████████████████████████████████████████████▎      | 4.22GB / 4.65GB            [A[A[A[A


  ...se/model-00001-of-00003.safetensors:  89%|████████████████████████████████████████████████████████████████▉        | 4.42GB / 4.97GB            [A[A[A[A[AProcessing Files (1 / 4)                :  83%|████████████████████████████████████████████████████████████▎            | 12.1GB / 14.6GB,  532MB/s  
New Data Upload                         :  99%|████████████████████████████████████████████████████████████████████████▏| 7.03GB / 7.11GB,  532MB/s  [A

  ...model-output-base/training_args.bin: 100%|█████████████████████████████████████████████████████████████████████████| 8.98kB / 8.98kB            [A[A


  ...se/model-00002-of-00003.safetensors:  69%|██████████████████████████████████████████████████                       | 3.42GB / 4.98GB            [A[A[A


  ...se/model-00003-of-00003.safetensors:  91%|██████████████████████████████████████████████████████████████████▍      | 4.22GB / 4.65GB            [A[A[A[A


  ...se/model-00001-of-00003.safetensors:  90%|█████████████████████████████████████████████████████████████████▉       | 4.49GB / 4.97GB            [A[A[A[A[AProcessing Files (1 / 4)                :  83%|████████████████████████████████████████████████████████████▋            | 12.1GB / 14.6GB,  527MB/s  
New Data Upload                         :  98%|███████████████████████████████████████████████████████████████████████▌ | 7.11GB / 7.24GB,  527MB/s  [A

  ...model-output-base/training_args.bin: 100%|█████████████████████████████████████████████████████████████████████████| 8.98kB / 8.98kB            [A[A


  ...se/model-00002-of-00003.safetensors:  70%|███████████████████████████████████████████████████                      | 3.49GB / 4.98GB            [A[A[A


  ...se/model-00003-of-00003.safetensors:  92%|███████████████████████████████████████████████████████████████████▍     | 4.29GB / 4.65GB            [A[A[A[A


  ...se/model-00001-of-00003.safetensors:  90%|█████████████████████████████████████████████████████████████████▉       | 4.49GB / 4.97GB            [A[A[A[A[AProcessing Files (1 / 4)                :  84%|█████████████████████████████████████████████████████████████▎           | 12.3GB / 14.6GB,  526MB/s  
New Data Upload                         :  99%|████████████████████████████████████████████████████████████████████████▎| 7.24GB / 7.31GB,  526MB/s  [A

  ...model-output-base/training_args.bin: 100%|█████████████████████████████████████████████████████████████████████████| 8.98kB / 8.98kB            [A[A


  ...se/model-00002-of-00003.safetensors:  70%|███████████████████████████████████████████████████                      | 3.49GB / 4.98GB            [A[A[A


  ...se/model-00003-of-00003.safetensors:  92%|███████████████████████████████████████████████████████████████████▍     | 4.29GB / 4.65GB            [A[A[A[A


  ...se/model-00001-of-00003.safetensors:  90%|█████████████████████████████████████████████████████████████████▉       | 4.49GB / 4.97GB            [A[A[A[A[A

  ...model-output-base/training_args.bin: 100%|█████████████████████████████████████████████████████████████████████████| 8.98kB / 8.98kB            [A[A


  ...se/model-00002-of-00003.safetensors:  70%|███████████████████████████████████████████████████                      | 3.49GB / 4.98GB            [A[A[A


  ...se/model-00003-of-00003.safetensors:  92%|███████████████████████████████████████████████████████████████████▍     | 4.29GB / 4.65GB            [A[A[A[A


  ...se/model-00001-of-00003.safetensors:  90%|█████████████████████████████████████████████████████████████████▉       | 4.49GB / 4.97GB            [A[A[A[A[A

  ...model-output-base/training_args.bin: 100%|█████████████████████████████████████████████████████████████████████████| 8.98kB / 8.98kB            [A[A


  ...se/model-00002-of-00003.safetensors:  71%|████████████████████████████████████████████████████                     | 3.55GB / 4.98GB            [A[A[A


  ...se/model-00003-of-00003.safetensors:  94%|████████████████████████████████████████████████████████████████████▍    | 4.36GB / 4.65GB            [A[A[A[A


  ...se/model-00001-of-00003.safetensors:  92%|██████████████████████████████████████████████████████████████████▉      | 4.56GB / 4.97GB            [A[A[A[A[AProcessing Files (1 / 4)                :  85%|██████████████████████████████████████████████████████████████▎          | 12.5GB / 14.6GB,  513MB/s  
New Data Upload                         :  97%|███████████████████████████████████████████████████████████████████████  | 7.44GB / 7.65GB,  513MB/s  [A

  ...model-output-base/training_args.bin: 100%|█████████████████████████████████████████████████████████████████████████| 8.98kB / 8.98kB            [A[A


  ...se/model-00002-of-00003.safetensors:  73%|█████████████████████████████████████████████████████                    | 3.62GB / 4.98GB            [A[A[A


  ...se/model-00003-of-00003.safetensors:  94%|████████████████████████████████████████████████████████████████████▍    | 4.36GB / 4.65GB            [A[A[A[A


  ...se/model-00001-of-00003.safetensors:  93%|███████████████████████████████████████████████████████████████████▉     | 4.62GB / 4.97GB            [A[A[A[A[AProcessing Files (1 / 4)                :  86%|███████████████████████████████████████████████████████████████          | 12.6GB / 14.6GB,  519MB/s  
New Data Upload                         :  99%|████████████████████████████████████████████████████████████████████████▎| 7.58GB / 7.65GB,  519MB/s  [A

  ...model-output-base/training_args.bin: 100%|█████████████████████████████████████████████████████████████████████████| 8.98kB / 8.98kB            [A[A


  ...se/model-00002-of-00003.safetensors:  73%|█████████████████████████████████████████████████████                    | 3.62GB / 4.98GB            [A[A[A


  ...se/model-00003-of-00003.safetensors:  95%|█████████████████████████████████████████████████████████████████████▌   | 4.43GB / 4.65GB            [A[A[A[A


  ...se/model-00001-of-00003.safetensors:  93%|███████████████████████████████████████████████████████████████████▉     | 4.62GB / 4.97GB            [A[A[A[A[AProcessing Files (1 / 4)                :  87%|███████████████████████████████████████████████████████████████▎         | 12.7GB / 14.6GB,  506MB/s  
New Data Upload                         :  97%|███████████████████████████████████████████████████████████████████████  | 7.64GB / 7.85GB,  506MB/s  [A

  ...model-output-base/training_args.bin: 100%|█████████████████████████████████████████████████████████████████████████| 8.98kB / 8.98kB            [A[A


  ...se/model-00002-of-00003.safetensors:  74%|██████████████████████████████████████████████████████                   | 3.69GB / 4.98GB            [A[A[A


  ...se/model-00003-of-00003.safetensors:  95%|█████████████████████████████████████████████████████████████████████▌   | 4.43GB / 4.65GB            [A[A[A[A


  ...se/model-00001-of-00003.safetensors:  93%|████████████████████████████████████████████████████████████████████▏    | 4.65GB / 4.97GB            [A[A[A[A[AProcessing Files (1 / 4)                :  87%|███████████████████████████████████████████████████████████████▊         | 12.8GB / 14.6GB,  508MB/s  
New Data Upload                         :  98%|███████████████████████████████████████████████████████████████████████▎ | 7.73GB / 7.91GB,  508MB/s  [A

  ...model-output-base/training_args.bin: 100%|█████████████████████████████████████████████████████████████████████████| 8.98kB / 8.98kB            [A[A


  ...se/model-00002-of-00003.safetensors:  75%|██████████████████████████████████████████████████████▋                  | 3.73GB / 4.98GB            [A[A[A


  ...se/model-00003-of-00003.safetensors:  97%|██████████████████████████████████████████████████████████████████████▌  | 4.49GB / 4.65GB            [A[A[A[A


  ...se/model-00001-of-00003.safetensors:  94%|████████████████████████████████████████████████████████████████████▉    | 4.69GB / 4.97GB            [A[A[A[A[AProcessing Files (1 / 4)                :  88%|████████████████████████████████████████████████████████████████▌        | 12.9GB / 14.6GB,  511MB/s  
New Data Upload                         :  99%|████████████████████████████████████████████████████████████████████████▏| 7.89GB / 7.98GB,  511MB/s  [A

  ...model-output-base/training_args.bin: 100%|█████████████████████████████████████████████████████████████████████████| 8.98kB / 8.98kB            [A[A


  ...se/model-00002-of-00003.safetensors:  75%|███████████████████████████████████████████████████████                  | 3.75GB / 4.98GB            [A[A[A


  ...se/model-00003-of-00003.safetensors:  97%|██████████████████████████████████████████████████████████████████████▌  | 4.49GB / 4.65GB            [A[A[A[A


  ...se/model-00001-of-00003.safetensors:  96%|█████████████████████████████████████████████████████████████████████▊   | 4.76GB / 4.97GB            [A[A[A[A[AProcessing Files (1 / 4)                :  89%|█████████████████████████████████████████████████████████████████        | 13.0GB / 14.6GB,  506MB/s  
New Data Upload                         :  98%|███████████████████████████████████████████████████████████████████████▊ | 7.98GB / 8.11GB,  506MB/s  [A

  ...model-output-base/training_args.bin: 100%|█████████████████████████████████████████████████████████████████████████| 8.98kB / 8.98kB            [A[A


  ...se/model-00002-of-00003.safetensors:  77%|███████████████████████████████████████████████████████▉                 | 3.82GB / 4.98GB            [A[A[A


  ...se/model-00003-of-00003.safetensors:  98%|███████████████████████████████████████████████████████████████████████▋ | 4.56GB / 4.65GB            [A[A[A[A


  ...se/model-00001-of-00003.safetensors:  96%|█████████████████████████████████████████████████████████████████████▉   | 4.76GB / 4.97GB            [A[A[A[A[AProcessing Files (1 / 4)                :  90%|█████████████████████████████████████████████████████████████████▋       | 13.1GB / 14.6GB,  513MB/s  
New Data Upload                         :  98%|███████████████████████████████████████████████████████████████████████▏ | 8.11GB / 8.32GB,  513MB/s  [A

  ...model-output-base/training_args.bin: 100%|█████████████████████████████████████████████████████████████████████████| 8.98kB / 8.98kB            [A[A


  ...se/model-00002-of-00003.safetensors:  77%|████████████████████████████████████████████████████████                 | 3.82GB / 4.98GB            [A[A[A


  ...se/model-00003-of-00003.safetensors: 100%|████████████████████████████████████████████████████████████████████████▋| 4.63GB / 4.65GB            [A[A[A[A


  ...se/model-00001-of-00003.safetensors:  97%|██████████████████████████████████████████████████████████████████████▊  | 4.83GB / 4.97GB            [A[A[A[A[AProcessing Files (1 / 4)                :  91%|██████████████████████████████████████████████████████████████████▍      | 13.3GB / 14.6GB,  513MB/s  
New Data Upload                         :  98%|███████████████████████████████████████████████████████████████████████▊ | 8.25GB / 8.38GB,  513MB/s  [A

  ...model-output-base/training_args.bin: 100%|█████████████████████████████████████████████████████████████████████████| 8.98kB / 8.98kB            [A[A


  ...se/model-00002-of-00003.safetensors:  78%|████████████████████████████████████████████████████████▉                | 3.89GB / 4.98GB            [A[A[A


  ...se/model-00003-of-00003.safetensors: 100%|████████████████████████████████████████████████████████████████████████▋| 4.63GB / 4.65GB            [A[A[A[A


  ...se/model-00001-of-00003.safetensors:  97%|██████████████████████████████████████████████████████████████████████▉  | 4.83GB / 4.97GB            [A[A[A[A[AProcessing Files (1 / 4)                :  91%|██████████████████████████████████████████████████████████████████▋      | 13.3GB / 14.6GB,  506MB/s  
New Data Upload                         :  99%|████████████████████████████████████████████████████████████████████████▍| 8.31GB / 8.38GB,  506MB/s  [A

  ...model-output-base/training_args.bin: 100%|█████████████████████████████████████████████████████████████████████████| 8.98kB / 8.98kB            [A[A


  ...se/model-00002-of-00003.safetensors:  78%|████████████████████████████████████████████████████████▉                | 3.89GB / 4.98GB            [A[A[A


  ...se/model-00003-of-00003.safetensors: 100%|████████████████████████████████████████████████████████████████████████▋| 4.63GB / 4.65GB            [A[A[A[A


  ...se/model-00001-of-00003.safetensors:  98%|███████████████████████████████████████████████████████████████████████▊ | 4.89GB / 4.97GB            [A[A[A[A[AProcessing Files (1 / 4)                :  92%|███████████████████████████████████████████████████████████████████      | 13.4GB / 14.6GB,  506MB/s  
New Data Upload                         :  98%|███████████████████████████████████████████████████████████████████████▊ | 8.38GB / 8.52GB,  506MB/s  [A

  ...model-output-base/training_args.bin: 100%|█████████████████████████████████████████████████████████████████████████| 8.98kB / 8.98kB            [A[A


  ...se/model-00002-of-00003.safetensors:  79%|█████████████████████████████████████████████████████████▉               | 3.96GB / 4.98GB            [A[A[A


  ...se/model-00003-of-00003.safetensors: 100%|████████████████████████████████████████████████████████████████████████▋| 4.63GB / 4.65GB            [A[A[A[A


  ...se/model-00001-of-00003.safetensors: 100%|████████████████████████████████████████████████████████████████████████▊| 4.96GB / 4.97GB            [A[A[A[A[AProcessing Files (1 / 4)                :  93%|███████████████████████████████████████████████████████████████████▋     | 13.5GB / 14.6GB,  506MB/s  
New Data Upload                         :  99%|████████████████████████████████████████████████████████████████████████▍| 8.52GB / 8.58GB,  506MB/s  [A

  ...model-output-base/training_args.bin: 100%|█████████████████████████████████████████████████████████████████████████| 8.98kB / 8.98kB            [A[A


  ...se/model-00002-of-00003.safetensors:  81%|██████████████████████████████████████████████████████████▉              | 4.02GB / 4.98GB            [A[A[A


  ...se/model-00003-of-00003.safetensors: 100%|████████████████████████████████████████████████████████████████████████▋| 4.63GB / 4.65GB            [A[A[A[A


  ...se/model-00001-of-00003.safetensors: 100%|████████████████████████████████████████████████████████████████████████▊| 4.96GB / 4.97GB            [A[A[A[A[AProcessing Files (1 / 4)                :  93%|████████████████████████████████████████████████████████████████████     | 13.6GB / 14.6GB,  500MB/s  
New Data Upload                         :  99%|████████████████████████████████████████████████████████████████████████▍| 8.58GB / 8.65GB,  500MB/s  [A

  ...model-output-base/training_args.bin: 100%|█████████████████████████████████████████████████████████████████████████| 8.98kB / 8.98kB            [A[A


  ...se/model-00002-of-00003.safetensors:  82%|███████████████████████████████████████████████████████████▉             | 4.09GB / 4.98GB            [A[A[A


  ...se/model-00003-of-00003.safetensors: 100%|████████████████████████████████████████████████████████████████████████▋| 4.63GB / 4.65GB            [A[A[A[A


  ...se/model-00001-of-00003.safetensors: 100%|████████████████████████████████████████████████████████████████████████▊| 4.96GB / 4.97GB            [A[A[A[A[AProcessing Files (1 / 4)                :  94%|████████████████████████████████████████████████████████████████████▍    | 13.7GB / 14.6GB,  493MB/s  
New Data Upload                         : 100%|████████████████████████████████████████████████████████████████████████▉| 8.65GB / 8.65GB,  493MB/s  [A

  ...model-output-base/training_args.bin: 100%|█████████████████████████████████████████████████████████████████████████| 8.98kB / 8.98kB            [A[A


  ...se/model-00002-of-00003.safetensors:  82%|███████████████████████████████████████████████████████████▉             | 4.09GB / 4.98GB            [A[A[A


  ...se/model-00003-of-00003.safetensors: 100%|████████████████████████████████████████████████████████████████████████▋| 4.63GB / 4.65GB            [A[A[A[A


  ...se/model-00001-of-00003.safetensors: 100%|████████████████████████████████████████████████████████████████████████▊| 4.96GB / 4.97GB            [A[A[A[A[AProcessing Files (1 / 4)                :  94%|████████████████████████████████████████████████████████████████████▍    | 13.7GB / 14.6GB,  493MB/s  
New Data Upload                         :  99%|████████████████████████████████████████████████████████████████████████▍| 8.65GB / 8.72GB,  493MB/s  [A

  ...model-output-base/training_args.bin: 100%|█████████████████████████████████████████████████████████████████████████| 8.98kB / 8.98kB            [A[A


  ...se/model-00002-of-00003.safetensors:  83%|████████████████████████████████████████████████████████████▉            | 4.16GB / 4.98GB            [A[A[A


  ...se/model-00003-of-00003.safetensors: 100%|████████████████████████████████████████████████████████████████████████▋| 4.63GB / 4.65GB            [A[A[A[A


  ...se/model-00001-of-00003.safetensors: 100%|████████████████████████████████████████████████████████████████████████▊| 4.96GB / 4.97GB            [A[A[A[A[AProcessing Files (1 / 4)                :  94%|████████████████████████████████████████████████████████████████████▋    | 13.7GB / 14.6GB,  480MB/s  
New Data Upload                         :  99%|████████████████████████████████████████████████████████████████████████▍| 8.72GB / 8.79GB,  480MB/s  [A

  ...model-output-base/training_args.bin: 100%|█████████████████████████████████████████████████████████████████████████| 8.98kB / 8.98kB            [A[A


  ...se/model-00002-of-00003.safetensors:  85%|█████████████████████████████████████████████████████████████▉           | 4.22GB / 4.98GB            [A[A[A


  ...se/model-00003-of-00003.safetensors: 100%|████████████████████████████████████████████████████████████████████████▋| 4.63GB / 4.65GB            [A[A[A[A


  ...se/model-00001-of-00003.safetensors: 100%|████████████████████████████████████████████████████████████████████████▊| 4.96GB / 4.97GB            [A[A[A[A[AProcessing Files (1 / 4)                :  95%|█████████████████████████████████████████████████████████████████████    | 13.8GB / 14.6GB,  470MB/s  
New Data Upload                         : 100%|████████████████████████████████████████████████████████████████████████▉| 8.78GB / 8.79GB,  470MB/s  [A

  ...model-output-base/training_args.bin: 100%|█████████████████████████████████████████████████████████████████████████| 8.98kB / 8.98kB            [A[A


  ...se/model-00002-of-00003.safetensors:  85%|██████████████████████████████████████████████████████████████           | 4.24GB / 4.98GB            [A[A[A


  ...se/model-00003-of-00003.safetensors: 100%|████████████████████████████████████████████████████████████████████████▋| 4.63GB / 4.65GB            [A[A[A[A


  ...se/model-00001-of-00003.safetensors: 100%|████████████████████████████████████████████████████████████████████████▊| 4.96GB / 4.97GB            [A[A[A[A[AProcessing Files (1 / 4)                :  95%|█████████████████████████████████████████████████████████████████████▏   | 13.8GB / 14.6GB,  468MB/s  
New Data Upload                         :  99%|████████████████████████████████████████████████████████████████████████▌| 8.80GB / 8.85GB,  468MB/s  [A

  ...model-output-base/training_args.bin: 100%|█████████████████████████████████████████████████████████████████████████| 8.98kB / 8.98kB            [A[A


  ...se/model-00002-of-00003.safetensors:  86%|██████████████████████████████████████████████████████████████▉          | 4.29GB / 4.98GB            [A[A[A


  ...se/model-00003-of-00003.safetensors: 100%|████████████████████████████████████████████████████████████████████████▋| 4.63GB / 4.65GB            [A[A[A[A


  ...se/model-00001-of-00003.safetensors: 100%|████████████████████████████████████████████████████████████████████████▊| 4.96GB / 4.97GB            [A[A[A[A[AProcessing Files (1 / 4)                :  95%|█████████████████████████████████████████████████████████████████████▍   | 13.9GB / 14.6GB,  460MB/s  
New Data Upload                         :  99%|████████████████████████████████████████████████████████████████████████▍| 8.85GB / 8.92GB,  460MB/s  [A

  ...model-output-base/training_args.bin: 100%|█████████████████████████████████████████████████████████████████████████| 8.98kB / 8.98kB            [A[A


  ...se/model-00002-of-00003.safetensors:  87%|███████████████████████████████████████████████████████████████▊         | 4.36GB / 4.98GB            [A[A[A


  ...se/model-00003-of-00003.safetensors: 100%|████████████████████████████████████████████████████████████████████████▋| 4.63GB / 4.65GB            [A[A[A[A


  ...se/model-00001-of-00003.safetensors: 100%|████████████████████████████████████████████████████████████████████████▊| 4.96GB / 4.97GB            [A[A[A[A[AProcessing Files (1 / 4)                :  96%|█████████████████████████████████████████████████████████████████████▋   | 13.9GB / 14.6GB,  456MB/s  
New Data Upload                         :  99%|████████████████████████████████████████████████████████████████████████▍| 8.92GB / 8.99GB,  456MB/s  [A

  ...model-output-base/training_args.bin: 100%|█████████████████████████████████████████████████████████████████████████| 8.98kB / 8.98kB            [A[A


  ...se/model-00002-of-00003.safetensors:  89%|████████████████████████████████████████████████████████████████▊        | 4.42GB / 4.98GB            [A[A[A


  ...se/model-00003-of-00003.safetensors: 100%|████████████████████████████████████████████████████████████████████████▋| 4.63GB / 4.65GB            [A[A[A[A


  ...se/model-00001-of-00003.safetensors: 100%|████████████████████████████████████████████████████████████████████████▊| 4.96GB / 4.97GB            [A[A[A[A[AProcessing Files (1 / 4)                :  96%|██████████████████████████████████████████████████████████████████████   | 14.0GB / 14.6GB,  447MB/s  
New Data Upload                         : 100%|████████████████████████████████████████████████████████████████████████▉| 8.99GB / 8.99GB,  447MB/s  [A

  ...model-output-base/training_args.bin: 100%|█████████████████████████████████████████████████████████████████████████| 8.98kB / 8.98kB            [A[A


  ...se/model-00002-of-00003.safetensors:  89%|████████████████████████████████████████████████████████████████▊        | 4.43GB / 4.98GB            [A[A[A


  ...se/model-00003-of-00003.safetensors: 100%|████████████████████████████████████████████████████████████████████████▋| 4.63GB / 4.65GB            [A[A[A[A


  ...se/model-00001-of-00003.safetensors: 100%|████████████████████████████████████████████████████████████████████████▊| 4.96GB / 4.97GB            [A[A[A[A[AProcessing Files (1 / 4)                :  96%|██████████████████████████████████████████████████████████████████████   | 14.0GB / 14.6GB,  434MB/s  
New Data Upload                         :  99%|████████████████████████████████████████████████████████████████████████▍| 8.99GB / 9.05GB,  434MB/s  [A

  ...model-output-base/training_args.bin: 100%|█████████████████████████████████████████████████████████████████████████| 8.98kB / 8.98kB            [A[A


  ...se/model-00002-of-00003.safetensors:  90%|█████████████████████████████████████████████████████████████████▊       | 4.49GB / 4.98GB            [A[A[A


  ...se/model-00003-of-00003.safetensors: 100%|████████████████████████████████████████████████████████████████████████▋| 4.63GB / 4.65GB            [A[A[A[A


  ...se/model-00001-of-00003.safetensors: 100%|████████████████████████████████████████████████████████████████████████▊| 4.96GB / 4.97GB            [A[A[A[A[AProcessing Files (1 / 4)                :  96%|██████████████████████████████████████████████████████████████████████▍  | 14.1GB / 14.6GB,  441MB/s  
New Data Upload                         :  99%|████████████████████████████████████████████████████████████████████████▍| 9.05GB / 9.12GB,  441MB/s  [A

  ...model-output-base/training_args.bin: 100%|█████████████████████████████████████████████████████████████████████████| 8.98kB / 8.98kB            [A[A


  ...se/model-00002-of-00003.safetensors:  92%|██████████████████████████████████████████████████████████████████▊      | 4.56GB / 4.98GB            [A[A[A


  ...se/model-00003-of-00003.safetensors: 100%|████████████████████████████████████████████████████████████████████████▋| 4.63GB / 4.65GB            [A[A[A[A


  ...se/model-00001-of-00003.safetensors: 100%|████████████████████████████████████████████████████████████████████████▊| 4.96GB / 4.97GB            [A[A[A[A[AProcessing Files (1 / 4)                :  97%|██████████████████████████████████████████████████████████████████████▋  | 14.1GB / 14.6GB,  441MB/s  
New Data Upload                         :  99%|████████████████████████████████████████████████████████████████████████▍| 9.12GB / 9.19GB,  441MB/s  [A

  ...model-output-base/training_args.bin: 100%|█████████████████████████████████████████████████████████████████████████| 8.98kB / 8.98kB            [A[A


  ...se/model-00002-of-00003.safetensors:  93%|███████████████████████████████████████████████████████████████████▊     | 4.63GB / 4.98GB            [A[A[A


  ...se/model-00003-of-00003.safetensors: 100%|████████████████████████████████████████████████████████████████████████▋| 4.63GB / 4.65GB            [A[A[A[A


  ...se/model-00001-of-00003.safetensors: 100%|████████████████████████████████████████████████████████████████████████▊| 4.96GB / 4.97GB            [A[A[A[A[AProcessing Files (1 / 4)                :  97%|███████████████████████████████████████████████████████████████████████  | 14.2GB / 14.6GB,  434MB/s  
New Data Upload                         : 100%|████████████████████████████████████████████████████████████████████████▉| 9.19GB / 9.19GB,  434MB/s  [A

  ...model-output-base/training_args.bin: 100%|█████████████████████████████████████████████████████████████████████████| 8.98kB / 8.98kB            [A[A


  ...se/model-00002-of-00003.safetensors:  93%|███████████████████████████████████████████████████████████████████▊     | 4.63GB / 4.98GB            [A[A[A


  ...se/model-00003-of-00003.safetensors: 100%|████████████████████████████████████████████████████████████████████████▋| 4.63GB / 4.65GB            [A[A[A[A


  ...se/model-00001-of-00003.safetensors: 100%|████████████████████████████████████████████████████████████████████████▊| 4.96GB / 4.97GB            [A[A[A[A[A

  ...model-output-base/training_args.bin: 100%|█████████████████████████████████████████████████████████████████████████| 8.98kB / 8.98kB            [A[A


  ...se/model-00002-of-00003.safetensors:  94%|████████████████████████████████████████████████████████████████████▊    | 4.69GB / 4.98GB            [A[A[A


  ...se/model-00003-of-00003.safetensors: 100%|████████████████████████████████████████████████████████████████████████▋| 4.63GB / 4.65GB            [A[A[A[A


  ...se/model-00001-of-00003.safetensors: 100%|████████████████████████████████████████████████████████████████████████▊| 4.96GB / 4.97GB            [A[A[A[A[AProcessing Files (1 / 4)                :  98%|███████████████████████████████████████████████████████████████████████▍ | 14.3GB / 14.6GB,  414MB/s  
New Data Upload                         :  99%|████████████████████████████████████████████████████████████████████████▍| 9.25GB / 9.32GB,  414MB/s  [A

  ...model-output-base/training_args.bin: 100%|█████████████████████████████████████████████████████████████████████████| 8.98kB / 8.98kB            [A[A


  ...se/model-00002-of-00003.safetensors:  96%|█████████████████████████████████████████████████████████████████████▋   | 4.76GB / 4.98GB            [A[A[A


  ...se/model-00003-of-00003.safetensors: 100%|████████████████████████████████████████████████████████████████████████▋| 4.63GB / 4.65GB            [A[A[A[A


  ...se/model-00001-of-00003.safetensors: 100%|████████████████████████████████████████████████████████████████████████▊| 4.96GB / 4.97GB            [A[A[A[A[AProcessing Files (1 / 4)                :  98%|███████████████████████████████████████████████████████████████████████▋ | 14.3GB / 14.6GB,  414MB/s  
New Data Upload                         :  99%|████████████████████████████████████████████████████████████████████████▍| 9.32GB / 9.39GB,  414MB/s  [A

  ...model-output-base/training_args.bin: 100%|█████████████████████████████████████████████████████████████████████████| 8.98kB / 8.98kB            [A[A


  ...se/model-00002-of-00003.safetensors:  97%|██████████████████████████████████████████████████████████████████████▋  | 4.83GB / 4.98GB            [A[A[A


  ...se/model-00003-of-00003.safetensors: 100%|████████████████████████████████████████████████████████████████████████▋| 4.63GB / 4.65GB            [A[A[A[A


  ...se/model-00001-of-00003.safetensors: 100%|████████████████████████████████████████████████████████████████████████▊| 4.96GB / 4.97GB            [A[A[A[A[AProcessing Files (1 / 4)                :  99%|████████████████████████████████████████████████████████████████████████ | 14.4GB / 14.6GB,  408MB/s  
New Data Upload                         : 100%|████████████████████████████████████████████████████████████████████████▉| 9.39GB / 9.39GB,  408MB/s  [A

  ...model-output-base/training_args.bin: 100%|█████████████████████████████████████████████████████████████████████████| 8.98kB / 8.98kB            [A[A


  ...se/model-00002-of-00003.safetensors:  97%|██████████████████████████████████████████████████████████████████████▋  | 4.83GB / 4.98GB            [A[A[A


  ...se/model-00003-of-00003.safetensors: 100%|████████████████████████████████████████████████████████████████████████▋| 4.63GB / 4.65GB            [A[A[A[A


  ...se/model-00001-of-00003.safetensors: 100%|████████████████████████████████████████████████████████████████████████▊| 4.96GB / 4.97GB            [A[A[A[A[AProcessing Files (1 / 4)                :  99%|████████████████████████████████████████████████████████████████████████ | 14.4GB / 14.6GB,  395MB/s  
New Data Upload                         :  99%|████████████████████████████████████████████████████████████████████████▍| 9.39GB / 9.46GB,  395MB/s  [A

  ...model-output-base/training_args.bin: 100%|█████████████████████████████████████████████████████████████████████████| 8.98kB / 8.98kB            [A[A


  ...se/model-00002-of-00003.safetensors:  98%|███████████████████████████████████████████████████████████████████████▋ | 4.89GB / 4.98GB            [A[A[A


  ...se/model-00003-of-00003.safetensors: 100%|████████████████████████████████████████████████████████████████████████▋| 4.63GB / 4.65GB            [A[A[A[A


  ...se/model-00001-of-00003.safetensors: 100%|████████████████████████████████████████████████████████████████████████▊| 4.96GB / 4.97GB            [A[A[A[A[AProcessing Files (1 / 4)                :  99%|████████████████████████████████████████████████████████████████████████▍| 14.5GB / 14.6GB,  395MB/s  
New Data Upload                         :  99%|████████████████████████████████████████████████████████████████████████▍| 9.45GB / 9.52GB,  395MB/s  [A

  ...model-output-base/training_args.bin: 100%|█████████████████████████████████████████████████████████████████████████| 8.98kB / 8.98kB            [A[A


  ...se/model-00002-of-00003.safetensors: 100%|████████████████████████████████████████████████████████████████████████▋| 4.96GB / 4.98GB            [A[A[A


  ...se/model-00003-of-00003.safetensors: 100%|████████████████████████████████████████████████████████████████████████▋| 4.63GB / 4.65GB            [A[A[A[A


  ...se/model-00001-of-00003.safetensors: 100%|████████████████████████████████████████████████████████████████████████▊| 4.96GB / 4.97GB            [A[A[A[A[AProcessing Files (1 / 4)                : 100%|████████████████████████████████████████████████████████████████████████▊| 14.5GB / 14.6GB,  395MB/s  
New Data Upload                         :  99%|████████████████████████████████████████████████████████████████████████▋| 9.52GB / 9.57GB,  395MB/s  [A

  ...model-output-base/training_args.bin: 100%|█████████████████████████████████████████████████████████████████████████| 8.98kB / 8.98kB            [A[A


  ...se/model-00002-of-00003.safetensors: 100%|████████████████████████████████████████████████████████████████████████▉| 4.98GB / 4.98GB            [A[A[A


  ...se/model-00003-of-00003.safetensors: 100%|████████████████████████████████████████████████████████████████████████▉| 4.65GB / 4.65GB            [A[A[A[A


  ...se/model-00001-of-00003.safetensors: 100%|████████████████████████████████████████████████████████████████████████▉| 4.97GB / 4.97GB            [A[A[A[A[AProcessing Files (1 / 4)                : 100%|████████████████████████████████████████████████████████████████████████▉| 14.6GB / 14.6GB,  393MB/s  
New Data Upload                         : 100%|████████████████████████████████████████████████████████████████████████▉| 9.57GB / 9.57GB,  393MB/s  [A

  ...model-output-base/training_args.bin: 100%|█████████████████████████████████████████████████████████████████████████| 8.98kB / 8.98kB            [A[A


  ...se/model-00002-of-00003.safetensors: 100%|████████████████████████████████████████████████████████████████████████▉| 4.98GB / 4.98GB            [A[A[A


  ...se/model-00003-of-00003.safetensors: 100%|████████████████████████████████████████████████████████████████████████▉| 4.65GB / 4.65GB            [A[A[A[A


  ...se/model-00001-of-00003.safetensors: 100%|████████████████████████████████████████████████████████████████████████▉| 4.97GB / 4.97GB            [A[A[A[A[A

  ...model-output-base/training_args.bin: 100%|█████████████████████████████████████████████████████████████████████████| 8.98kB / 8.98kB            [A[A


  ...se/model-00002-of-00003.safetensors: 100%|████████████████████████████████████████████████████████████████████████▉| 4.98GB / 4.98GB            [A[A[A


  ...se/model-00003-of-00003.safetensors: 100%|████████████████████████████████████████████████████████████████████████▉| 4.65GB / 4.65GB            [A[A[A[A


  ...se/model-00001-of-00003.safetensors: 100%|████████████████████████████████████████████████████████████████████████▉| 4.97GB / 4.97GB            [A[A[A[A[AProcessing Files (1 / 4)                : 100%|████████████████████████████████████████████████████████████████████████▉| 14.6GB / 14.6GB,  366MB/s  
New Data Upload                         : 100%|████████████████████████████████████████████████████████████████████████▉| 9.57GB / 9.57GB,  366MB/s  [A

  ...model-output-base/training_args.bin: 100%|█████████████████████████████████████████████████████████████████████████| 8.98kB / 8.98kB            [A[A


  ...se/model-00002-of-00003.safetensors: 100%|█████████████████████████████████████████████████████████████████████████| 4.98GB / 4.98GB            [A[A[A


  ...se/model-00003-of-00003.safetensors: 100%|█████████████████████████████████████████████████████████████████████████| 4.65GB / 4.65GB            [A[A[A[A


  ...se/model-00001-of-00003.safetensors: 100%|█████████████████████████████████████████████████████████████████████████| 4.97GB / 4.97GB            [A[A[A[A[AProcessing Files (4 / 4)                : 100%|█████████████████████████████████████████████████████████████████████████| 14.6GB / 14.6GB,  360MB/s  
New Data Upload                         : 100%|█████████████████████████████████████████████████████████████████████████| 9.57GB / 9.57GB,  360MB/s  [A

  ...model-output-base/training_args.bin: 100%|█████████████████████████████████████████████████████████████████████████| 8.98kB / 8.98kB            [A[A


  ...se/model-00002-of-00003.safetensors: 100%|█████████████████████████████████████████████████████████████████████████| 4.98GB / 4.98GB            [A[A[A


  ...se/model-00003-of-00003.safetensors: 100%|█████████████████████████████████████████████████████████████████████████| 4.65GB / 4.65GB            [A[A[A[A


  ...se/model-00001-of-00003.safetensors: 100%|█████████████████████████████████████████████████████████████████████████| 4.97GB / 4.97GB            [A[A[A[A[A

  ...model-output-base/training_args.bin: 100%|█████████████████████████████████████████████████████████████████████████| 8.98kB / 8.98kB            [A[A


  ...se/model-00002-of-00003.safetensors: 100%|█████████████████████████████████████████████████████████████████████████| 4.98GB / 4.98GB            [A[A[A


  ...se/model-00003-of-00003.safetensors: 100%|█████████████████████████████████████████████████████████████████████████| 4.65GB / 4.65GB            [A[A[A[A


  ...se/model-00001-of-00003.safetensors: 100%|█████████████████████████████████████████████████████████████████████████| 4.97GB / 4.97GB            [A[A[A[A[A

  ...model-output-base/training_args.bin: 100%|█████████████████████████████████████████████████████████████████████████| 8.98kB / 8.98kB            [A[A


  ...se/model-00002-of-00003.safetensors: 100%|█████████████████████████████████████████████████████████████████████████| 4.98GB / 4.98GB            [A[A[A


  ...se/model-00003-of-00003.safetensors: 100%|█████████████████████████████████████████████████████████████████████████| 4.65GB / 4.65GB            [A[A[A[A


  ...se/model-00001-of-00003.safetensors: 100%|█████████████████████████████████████████████████████████████████████████| 4.97GB / 4.97GB            [A[A[A[A[A

  ...model-output-base/training_args.bin: 100%|█████████████████████████████████████████████████████████████████████████| 8.98kB / 8.98kB            [A[A


  ...se/model-00002-of-00003.safetensors: 100%|█████████████████████████████████████████████████████████████████████████| 4.98GB / 4.98GB            [A[A[A


  ...se/model-00003-of-00003.safetensors: 100%|█████████████████████████████████████████████████████████████████████████| 4.65GB / 4.65GB            [A[A[A[A


  ...se/model-00001-of-00003.safetensors: 100%|█████████████████████████████████████████████████████████████████████████| 4.97GB / 4.97GB            [A[A[A[A[A

  ...model-output-base/training_args.bin: 100%|█████████████████████████████████████████████████████████████████████████| 8.98kB / 8.98kB            [A[A


  ...se/model-00002-of-00003.safetensors: 100%|█████████████████████████████████████████████████████████████████████████| 4.98GB / 4.98GB            [A[A[A


  ...se/model-00003-of-00003.safetensors: 100%|█████████████████████████████████████████████████████████████████████████| 4.65GB / 4.65GB            [A[A[A[A


  ...se/model-00001-of-00003.safetensors: 100%|█████████████████████████████████████████████████████████████████████████| 4.97GB / 4.97GB            [A[A[A[A[A

  ...model-output-base/training_args.bin: 100%|█████████████████████████████████████████████████████████████████████████| 8.98kB / 8.98kB            [A[A


  ...se/model-00002-of-00003.safetensors: 100%|█████████████████████████████████████████████████████████████████████████| 4.98GB / 4.98GB            [A[A[A


  ...se/model-00003-of-00003.safetensors: 100%|█████████████████████████████████████████████████████████████████████████| 4.65GB / 4.65GB            [A[A[A[A


  ...se/model-00001-of-00003.safetensors: 100%|█████████████████████████████████████████████████████████████████████████| 4.97GB / 4.97GB            [A[A[A[A[A

  ...model-output-base/training_args.bin: 100%|█████████████████████████████████████████████████████████████████████████| 8.98kB / 8.98kB            [A[A


  ...se/model-00002-of-00003.safetensors: 100%|█████████████████████████████████████████████████████████████████████████| 4.98GB / 4.98GB            [A[A[A


  ...se/model-00003-of-00003.safetensors: 100%|█████████████████████████████████████████████████████████████████████████| 4.65GB / 4.65GB            [A[A[A[A


  ...se/model-00001-of-00003.safetensors: 100%|█████████████████████████████████████████████████████████████████████████| 4.97GB / 4.97GB            [A[A[A[A[A

  ...model-output-base/training_args.bin: 100%|█████████████████████████████████████████████████████████████████████████| 8.98kB / 8.98kB            [A[A


  ...se/model-00002-of-00003.safetensors: 100%|█████████████████████████████████████████████████████████████████████████| 4.98GB / 4.98GB            [A[A[A


  ...se/model-00003-of-00003.safetensors: 100%|█████████████████████████████████████████████████████████████████████████| 4.65GB / 4.65GB            [A[A[A[A


  ...se/model-00001-of-00003.safetensors: 100%|█████████████████████████████████████████████████████████████████████████| 4.97GB / 4.97GB            [A[A[A[A[A

  ...model-output-base/training_args.bin: 100%|█████████████████████████████████████████████████████████████████████████| 8.98kB / 8.98kB            [A[A


  ...se/model-00002-of-00003.safetensors: 100%|█████████████████████████████████████████████████████████████████████████| 4.98GB / 4.98GB            [A[A[A


  ...se/model-00003-of-00003.safetensors: 100%|█████████████████████████████████████████████████████████████████████████| 4.65GB / 4.65GB            [A[A[A[A


  ...se/model-00001-of-00003.safetensors: 100%|█████████████████████████████████████████████████████████████████████████| 4.97GB / 4.97GB            [A[A[A[A[AProcessing Files (4 / 4)                : 100%|█████████████████████████████████████████████████████████████████████████| 14.6GB / 14.6GB,  287MB/s  
New Data Upload                         : 100%|█████████████████████████████████████████████████████████████████████████| 9.57GB / 9.57GB,  287MB/s  
  ...model-output-base/training_args.bin: 100%|█████████████████████████████████████████████████████████████████████████| 8.98kB / 8.98kB            
  ...se/model-00002-of-00003.safetensors: 100%|█████████████████████████████████████████████████████████████████████████| 4.98GB / 4.98GB            
  ...se/model-00003-of-00003.safetensors: 100%|█████████████████████████████████████████████████████████████████████████| 4.65GB / 4.65GB            
  ...se/model-00001-of-00003.safetensors: 100%|█████████████████████████████████████████████████████████████████████████| 4.97GB / 4.97GB