[2026-04-23 22:52:29,657] [DEBUG] [axolotl.utils.config.log_gpu_memory_usage:127] [PID:2655295] baseline 0.000GB ()
[2026-04-23 22:52:29,657] [INFO] [axolotl.cli.config.load_cfg:259] [PID:2655295] config:
{
  "activation_offloading": false,
  "adapter": "lora",
  "axolotl_config_path": "./axolotl_configs/Qwen2.5-coder-7b-instruct/stage_1_2/lora-stage2-secure-token-diff-ctx0.yaml",
  "base_model": "Qwen/Qwen2.5-Coder-7B-Instruct",
  "base_model_config": "Qwen/Qwen2.5-Coder-7B-Instruct",
  "batch_size": 64,
  "bf16": true,
  "capabilities": {
    "bf16": true,
    "compute_capability": "sm_90",
    "fp8": true,
    "n_gpu": 4,
    "n_node": 1
  },
  "context_parallel_size": 1,
  "dataloader_num_workers": 4,
  "dataloader_pin_memory": true,
  "dataloader_prefetch_factor": 256,
  "dataset_num_proc": 96,
  "dataset_prepared_path": "/home/tkwang/links/scratch/SecSteer-v2/axolotl-datasets/lora/Qwen2.5-Coder-7B/stage_2_secure_token_diff_mask_skip_indent_ctx0_chat",
  "datasets": [
    {
      "message_property_mappings": {
        "content": "content",
        "role": "role"
      },
      "path": "felixwangg/stage_2_secure_token_diff_mask_skip_indent_ctx0_chat",
      "split": "train",
      "trust_remote_code": false,
      "type": "pretokenized"
    }
  ],
  "ddp": true,
  "device": "cuda:0",
  "device_map": {
    "": 0
  },
  "dion_rank_fraction": 1.0,
  "dion_rank_multiple_of": 1,
  "early_stopping_patience": 1000,
  "env_capabilities": {
    "torch_version": "2.10.0"
  },
  "eval_batch_size": 4,
  "eval_causal_lm_metrics": [
    "sacrebleu",
    "comet",
    "ter",
    "chrf"
  ],
  "eval_max_new_tokens": 128,
  "eval_sample_packing": false,
  "eval_steps": 15,
  "eval_table_size": 0,
  "experimental_skip_move_to_device": true,
  "flash_attention": true,
  "fp16": false,
  "gradient_accumulation_steps": 4,
  "gradient_checkpointing": true,
  "gradient_checkpointing_kwargs": {
    "use_reentrant": true
  },
  "include_tkps": true,
  "is_falcon_derived_model": false,
  "is_llama_derived_model": false,
  "is_mistral_derived_model": false,
  "learning_rate": 4e-05,
  "lisa_layers_attribute": "model.layers",
  "load_best_model_at_end": true,
  "load_in_4bit": false,
  "load_in_8bit": false,
  "local_rank": 0,
  "logging_steps": 1,
  "lora_alpha": 16,
  "lora_dropout": 0.05,
  "lora_model_dir": "/home/tkwang/links/scratch/SecSteer-v2/axolotl-outputs/lora/Qwen2.5-Coder-7B-stage1-combined/checkpoint-6",
  "lora_r": 16,
  "lora_target_linear": true,
  "loraplus_lr_embedding": 1e-06,
  "lr_scheduler": "cosine",
  "mean_resizing_embeddings": false,
  "merge_lora": false,
  "micro_batch_size": 4,
  "model_config_type": "qwen2",
  "num_epochs": 2.0,
  "optimizer": "adamw_torch",
  "otel_metrics_host": "localhost",
  "otel_metrics_port": 8000,
  "output_dir": "/home/tkwang/links/scratch/SecSteer-v2/axolotl-outputs/lora/Qwen2.5-Coder-7B-stage2-secure-token-diff-ctx0",
  "pad_to_sequence_len": true,
  "plugins": [
    "diff_mask_trainer.plugin.DiffMaskPlugin"
  ],
  "pretrain_multipack_attn": true,
  "profiler_steps_start": 0,
  "qlora_sharded_model_loading": false,
  "ray_num_workers": 1,
  "resources_per_worker": {
    "GPU": 1
  },
  "sample_packing": false,
  "sample_packing_bin_size": 200,
  "sample_packing_group_size": 100000,
  "save_only_model": false,
  "save_safetensors": true,
  "save_steps": 15,
  "save_total_limit": 1000,
  "sequence_len": 4096,
  "shuffle_before_merging_datasets": false,
  "shuffle_merged_datasets": true,
  "skip_prepare_dataset": false,
  "streaming_multipack_buffer_size": 10000,
  "strict": false,
  "tensor_parallel_size": 1,
  "test_datasets": [
    {
      "message_property_mappings": {
        "content": "content",
        "role": "role"
      },
      "path": "felixwangg/stage_2_secure_token_diff_mask_skip_indent_ctx0_chat",
      "split": "validation",
      "trust_remote_code": false,
      "type": "pretokenized"
    }
  ],
  "tf32": false,
  "tiled_mlp_use_original_mlp": true,
  "tokenizer_config": "Qwen/Qwen2.5-Coder-7B-Instruct",
  "tokenizer_save_jinja_files": true,
  "tokenizer_type": "AutoTokenizer",
  "torch_dtype": "torch.bfloat16",
  "train_on_inputs": false,
  "trl": {
    "log_completions": false,
    "mask_truncated_completions": false,
    "ref_model_mixup_alpha": 0.9,
    "ref_model_sync_steps": 64,
    "scale_rewards": true,
    "sync_ref_model": false,
    "use_vllm": false,
    "vllm_server_host": "0.0.0.0",
    "vllm_server_port": 8000
  },
  "type_of_model": "Qwen2ForCausalLM",
  "use_otel_metrics": false,
  "use_ray": false,
  "use_wandb": true,
  "val_set_size": 0.0,
  "vllm": {
    "device": "auto",
    "dtype": "auto",
    "gpu_memory_utilization": 0.9,
    "host": "0.0.0.0",
    "port": 8000
  },
  "wandb_entity": "wtkuan",
  "wandb_log_model": "false",
  "wandb_name": "Qwen2.5-Coder-7B-stage2-secure-token-diff-ctx0",
  "wandb_project": "diff-mask-stage1-2-ctx-0",
  "wandb_watch": "false",
  "warmup_ratio": 0.1,
  "weight_decay": 0.02,
  "world_size": 4
}
[2026-04-23 22:52:30,639] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:285] [PID:2655295] EOS: 151645 / <|im_end|>
[2026-04-23 22:52:30,640] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:286] [PID:2655295] BOS: None / None
[2026-04-23 22:52:30,640] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:287] [PID:2655295] PAD: 151643 / <|endoftext|>
[2026-04-23 22:52:30,640] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:288] [PID:2655295] UNK: None / None
[2026-04-23 22:52:30,647] [INFO] [axolotl.utils.data.shared.load_preprocessed_dataset:475] [PID:2655295] Loading prepared dataset from disk at /home/tkwang/links/scratch/SecSteer-v2/axolotl-datasets/lora/Qwen2.5-Coder-7B/stage_2_secure_token_diff_mask_skip_indent_ctx0_chat/21d42ece2ab8cc786c50d82a4c015221...
[2026-04-23 22:52:30,660] [INFO] [axolotl.utils.data.shared.load_preprocessed_dataset:475] [PID:2655295] Loading prepared dataset from disk at /home/tkwang/links/scratch/SecSteer-v2/axolotl-datasets/lora/Qwen2.5-Coder-7B/stage_2_secure_token_diff_mask_skip_indent_ctx0_chat/8ede901bf97cb6e2000275c274728568...
[2026-04-23 22:52:30,676] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:417] [PID:2655295] total_num_tokens: 4_377_675
[2026-04-23 22:52:30,761] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:435] [PID:2655295] `total_supervised_tokens: 3_580_214`
[2026-04-23 22:52:30,761] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:533] [PID:2655295] total_num_steps: 115
[2026-04-23 22:52:30,761] [INFO] [axolotl.utils.data.sft._prepare_standard_dataset:121] [PID:2655295] Maximum number of steps set at 115
[2026-04-23 22:52:30,781] [DEBUG] [axolotl.train.setup_model_and_tokenizer:70] [PID:2655295] loading tokenizer... Qwen/Qwen2.5-Coder-7B-Instruct
[2026-04-23 22:52:31,189] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:285] [PID:2655295] EOS: 151645 / <|im_end|>
[2026-04-23 22:52:31,189] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:286] [PID:2655295] BOS: None / None
[2026-04-23 22:52:31,189] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:287] [PID:2655295] PAD: 151643 / <|endoftext|>
[2026-04-23 22:52:31,189] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:288] [PID:2655295] UNK: None / None
[2026-04-23 22:52:31,189] [DEBUG] [axolotl.train.setup_model_and_tokenizer:82] [PID:2655295] Loading model
[2026-04-23 22:52:31,245] [DEBUG] [axolotl.monkeypatch.transformers.trainer_loss_calc.patch_evaluation_loop:87] [PID:2655295] Patched Trainer.evaluation_loop with nanmean loss calculation
[2026-04-23 22:52:31,246] [DEBUG] [axolotl.monkeypatch.transformers.trainer_loss_calc.patch_maybe_log_save_evaluate:138] [PID:2655295] Patched Trainer._maybe_log_save_evaluate with nanmean loss calculation
Loading checkpoint shards:   0%|                                                                                                                                      | 0/4 [00:00<?, ?it/s]Loading checkpoint shards:  25%|███████████████████████████████▌                                                                                              | 1/4 [00:38<01:54, 38.20s/it]Loading checkpoint shards:  50%|███████████████████████████████████████████████████████████████                                                               | 2/4 [01:15<01:15, 37.74s/it]Loading checkpoint shards:  75%|██████████████████████████████████████████████████████████████████████████████████████████████▌                               | 3/4 [01:44<00:33, 33.87s/it]Loading checkpoint shards: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4/4 [01:52<00:00, 23.49s/it]Loading checkpoint shards: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4/4 [01:52<00:00, 28.11s/it]
[2026-04-23 22:54:24,987] [INFO] [axolotl.loaders.model._configure_embedding_dtypes:347] [PID:2655295] Converting modules to torch.bfloat16
[2026-04-23 22:54:24,990] [DEBUG] [axolotl.loaders.model.log_gpu_memory_usage:127] [PID:2655295] Memory usage after model load 17.233GB (+17.233GB allocated, +18.252GB reserved)
[2026-04-23 22:54:24,990] [INFO] [axolotl.loaders.adapter.load_lora:81] [PID:2655295] found linear modules: ['down_proj', 'gate_proj', 'k_proj', 'o_proj', 'q_proj', 'up_proj', 'v_proj']
[2026-04-23 22:54:24,991] [DEBUG] [axolotl.loaders.adapter.load_lora:150] [PID:2655295] Loading pretrained PEFT - LoRA
trainable params: 40,370,176 || all params: 7,655,986,688 || trainable%: 0.5273
[2026-04-23 22:54:25,949] [DEBUG] [axolotl.loaders.model.log_gpu_memory_usage:127] [PID:2655295] after adapters 14.487GB (+14.487GB allocated, +18.365GB reserved)
[2026-04-23 22:54:29,193] [WARNING] [py.warnings._showwarnmsg:112] [PID:2655295] /scratch/tkwang/SecSteer-v2/.venv/lib/python3.12/site-packages/trl/extras/vllm_client.py:37: UserWarning: TRL currently supports vLLM versions: 0.10.2, 0.11.0, 0.11.1, 0.11.2, 0.12.0. You have version 0.19.0 installed. We recommend installing a supported version to avoid compatibility issues.
  if is_vllm_available():

[2026-04-23 22:54:29,633] [WARNING] [py.warnings._showwarnmsg:112] [PID:2655295] /scratch/tkwang/SecSteer-v2/.venv/lib/python3.12/site-packages/trl/trainer/grpo_trainer.py:105: UserWarning: TRL currently supports vLLM versions: 0.10.2, 0.11.0, 0.11.1, 0.11.2, 0.12.0. You have version 0.19.0 installed. We recommend installing a supported version to avoid compatibility issues.
  if is_vllm_available():

DiffMaskPlugin: patching trainer with alpha=0.5
DiffMaskPlugin: compute_loss and prediction_step patched
[2026-04-23 22:54:39,171] [INFO] [axolotl.train.save_initial_configs:413] [PID:2655295] Pre-saving adapter config to /home/tkwang/links/scratch/SecSteer-v2/axolotl-outputs/lora/Qwen2.5-Coder-7B-stage2-secure-token-diff-ctx0...
[2026-04-23 22:54:39,177] [INFO] [axolotl.train.save_initial_configs:417] [PID:2655295] Pre-saving tokenizer to /home/tkwang/links/scratch/SecSteer-v2/axolotl-outputs/lora/Qwen2.5-Coder-7B-stage2-secure-token-diff-ctx0...
[2026-04-23 22:54:39,316] [INFO] [axolotl.train.save_initial_configs:422] [PID:2655295] Pre-saving model config to /home/tkwang/links/scratch/SecSteer-v2/axolotl-outputs/lora/Qwen2.5-Coder-7B-stage2-secure-token-diff-ctx0...
[2026-04-23 22:54:39,325] [INFO] [axolotl.train.execute_training:212] [PID:2655295] Starting trainer...
[34m[1mwandb[0m: [wandb.login()] Loaded credentials for https://api.wandb.ai from /home/tkwang/.netrc.
[34m[1mwandb[0m: Currently logged in as: [33mwtkuan[0m to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: [38;5;178m⢿[0m Waiting for wandb.init()...
[Am[2K[34m[1mwandb[0m: [38;5;178m⣻[0m Waiting for wandb.init()...
[Am[2K[34m[1mwandb[0m: [38;5;178m⣽[0m Waiting for wandb.init()...
[Am[2K[34m[1mwandb[0m: [38;5;178m⣾[0m setting up run qu0bhhwl (0.4s)
[Am[2K[34m[1mwandb[0m: [38;5;178m⣷[0m setting up run qu0bhhwl (0.4s)
[Am[2K[34m[1mwandb[0m: Tracking run with wandb version 0.24.0
[34m[1mwandb[0m: Run data is saved locally in [35m[1m/scratch/tkwang/SecSteer-v2/wandb/run-20260423_225443-qu0bhhwl[0m
[34m[1mwandb[0m: Run [1m`wandb offline`[0m to turn off syncing.
[34m[1mwandb[0m: Syncing run [33mQwen2.5-Coder-7B-stage2-secure-token-diff-ctx0[0m
[34m[1mwandb[0m: ⭐️ View project at [34m[4mhttps://wandb.ai/wtkuan/diff-mask-stage1-2-ctx-0[0m
[34m[1mwandb[0m: 🚀 View run at [34m[4mhttps://wandb.ai/wtkuan/diff-mask-stage1-2-ctx-0/runs/qu0bhhwl[0m
[34m[1mwandb[0m: Detected [huggingface_hub.inference, mcp, openai] in use.
[34m[1mwandb[0m: Use W&B Weave for improved LLM call tracing. Install Weave with `pip install weave` then add `import weave` to the top of your script.
[34m[1mwandb[0m: For more information, check out the docs at: https://weave-docs.wandb.ai/
[34m[1mwandb[0m: [33mWARNING[0m Saving files without folders. If you want to preserve subdirectories pass base_path to wandb.save, i.e. wandb.save("/mnt/folder/file.h5", base_path="/mnt")
[34m[1mwandb[0m: [33mWARNING[0m Symlinked 1 file into the W&B run directory; call wandb.save again to sync new files.
[2026-04-23 22:54:46,093] [INFO] [axolotl.utils.callbacks.on_train_begin:757] [PID:2655295] The Axolotl config has been saved to the WandB run under files.
  0%|                                                                                                                                                               | 0/115 [00:00<?, ?it/s][2026-04-23 22:54:46,096] [INFO] [axolotl.core.trainers.base.evaluate:400] [PID:2655295] Running evaluation step...

  0%|                                                                                                                                                                | 0/26 [00:00<?, ?it/s][A
  8%|███████████▋                                                                                                                                            | 2/26 [00:00<00:07,  3.36it/s][A
 12%|█████████████████▌                                                                                                                                      | 3/26 [00:01<00:10,  2.26it/s][A
 15%|███████████████████████▍                                                                                                                                | 4/26 [00:01<00:11,  1.94it/s][A
 19%|█████████████████████████████▏                                                                                                                          | 5/26 [00:02<00:12,  1.63it/s][A
 23%|███████████████████████████████████                                                                                                                     | 6/26 [00:03<00:12,  1.62it/s][A
 27%|████████████████████████████████████████▉                                                                                                               | 7/26 [00:03<00:12,  1.58it/s][A
 31%|██████████████████████████████████████████████▊                                                                                                         | 8/26 [00:04<00:11,  1.55it/s][A
 35%|████████████████████████████████████████████████████▌                                                                                                   | 9/26 [00:05<00:11,  1.49it/s][A
 38%|██████████████████████████████████████████████████████████                                                                                             | 10/26 [00:06<00:10,  1.51it/s][A
 42%|███████████████████████████████████████████████████████████████▉                                                                                       | 11/26 [00:06<00:09,  1.52it/s][A
 46%|█████████████████████████████████████████████████████████████████████▋                                                                                 | 12/26 [00:07<00:09,  1.52it/s][A
 50%|███████████████████████████████████████████████████████████████████████████▌                                                                           | 13/26 [00:08<00:08,  1.46it/s][A
 54%|█████████████████████████████████████████████████████████████████████████████████▎                                                                     | 14/26 [00:08<00:08,  1.49it/s][A
 58%|███████████████████████████████████████████████████████████████████████████████████████                                                                | 15/26 [00:09<00:07,  1.51it/s][A
 62%|████████████████████████████████████████████████████████████████████████████████████████████▉                                                          | 16/26 [00:10<00:06,  1.51it/s][A
 65%|██████████████████████████████████████████████████████████████████████████████████████████████████▋                                                    | 17/26 [00:10<00:06,  1.45it/s][A
 69%|████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                              | 18/26 [00:11<00:05,  1.48it/s][A
 73%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                        | 19/26 [00:12<00:04,  1.50it/s][A
 77%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                  | 20/26 [00:12<00:03,  1.50it/s][A
 81%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                             | 21/26 [00:13<00:03,  1.45it/s][A
 85%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                       | 22/26 [00:14<00:02,  1.47it/s][A
 88%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                 | 23/26 [00:14<00:02,  1.49it/s][A
 92%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍           | 24/26 [00:15<00:01,  1.48it/s][A
 96%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏     | 25/26 [00:16<00:00,  1.45it/s][A
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 26/26 [00:16<00:00,  1.46it/s][A                                                                                                                                                                                            
                                                                                                                                                                                            [A{'eval_loss': 0.8545306921005249, 'eval_runtime': 18.1333, 'eval_samples_per_second': 22.555, 'eval_steps_per_second': 1.434, 'eval_ppl': 2.35027, 'memory/max_active (GiB)': 42.36, 'memory/max_allocated (GiB)': 42.36, 'memory/device_reserved (GiB)': 52.88, 'epoch': 0}
  0%|                                                                                                                                                               | 0/115 [00:18<?, ?it/s]
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 26/26 [00:16<00:00,  1.46it/s][A
                                                                                                                                                                                            [A  1%|█▎                                                                                                                                                     | 1/115 [00:28<54:33, 28.71s/it]                                                                                                                                                                                            {'loss': 3.0615, 'grad_norm': 0.2578267753124237, 'learning_rate': 0.0, 'ppl': 21.35957, 'memory/max_active (GiB)': 45.83, 'memory/max_allocated (GiB)': 45.83, 'memory/device_reserved (GiB)': 60.58, 'tokens/train_per_sec_per_gpu': 0.0, 'tokens/total': 0, 'tokens/trainable': 0, 'epoch': 0.02}
  1%|█▎                                                                                                                                                     | 1/115 [00:28<54:33, 28.71s/it]  2%|██▋                                                                                                                                                    | 2/115 [00:37<32:04, 17.03s/it]                                                                                                                                                                                            {'loss': 3.2411, 'grad_norm': 0.27749794721603394, 'learning_rate': 3.6363636363636366e-06, 'ppl': 25.56182, 'memory/max_active (GiB)': 46.13, 'memory/max_allocated (GiB)': 46.13, 'memory/device_reserved (GiB)': 60.61, 'tokens/train_per_sec_per_gpu': 0.0, 'tokens/total': 0, 'tokens/trainable': 0, 'epoch': 0.03}
  2%|██▋                                                                                                                                                    | 2/115 [00:37<32:04, 17.03s/it]  3%|███▉                                                                                                                                                   | 3/115 [00:46<24:47, 13.28s/it]                                                                                                                                                                                            {'loss': 3.6254, 'grad_norm': 0.25002938508987427, 'learning_rate': 7.272727272727273e-06, 'ppl': 37.53974, 'memory/max_active (GiB)': 46.13, 'memory/max_allocated (GiB)': 46.13, 'memory/device_reserved (GiB)': 60.61, 'tokens/train_per_sec_per_gpu': 0.0, 'tokens/total': 0, 'tokens/trainable': 0, 'epoch': 0.05}
  3%|███▉                                                                                                                                                   | 3/115 [00:46<24:47, 13.28s/it]  3%|█████▎                                                                                                                                                 | 4/115 [00:55<21:19, 11.53s/it]                                                                                                                                                                                            {'loss': 3.4856, 'grad_norm': 0.27999335527420044, 'learning_rate': 1.0909090909090909e-05, 'ppl': 32.64201, 'memory/max_active (GiB)': 46.14, 'memory/max_allocated (GiB)': 46.14, 'memory/device_reserved (GiB)': 60.61, 'tokens/train_per_sec_per_gpu': 0.0, 'tokens/total': 0, 'tokens/trainable': 0, 'epoch': 0.07}
  3%|█████▎                                                                                                                                                 | 4/115 [00:55<21:19, 11.53s/it]  4%|██████▌                                                                                                                                                | 5/115 [01:04<19:23, 10.58s/it]                                                                                                                                                                                            {'loss': 3.0244, 'grad_norm': 0.22821375727653503, 'learning_rate': 1.4545454545454546e-05, 'ppl': 20.58165, 'memory/max_active (GiB)': 46.14, 'memory/max_allocated (GiB)': 46.14, 'memory/device_reserved (GiB)': 60.61, 'tokens/train_per_sec_per_gpu': 0.0, 'tokens/total': 0, 'tokens/trainable': 0, 'epoch': 0.09}
  4%|██████▌                                                                                                                                                | 5/115 [01:04<19:23, 10.58s/it]  5%|███████▉                                                                                                                                               | 6/115 [01:13<18:14, 10.05s/it]                                                                                                                                                                                            {'loss': 2.9809, 'grad_norm': 0.22468331456184387, 'learning_rate': 1.8181818181818182e-05, 'ppl': 19.70554, 'memory/max_active (GiB)': 46.13, 'memory/max_allocated (GiB)': 46.13, 'memory/device_reserved (GiB)': 60.61, 'tokens/train_per_sec_per_gpu': 0.0, 'tokens/total': 0, 'tokens/trainable': 0, 'epoch': 0.1}
  5%|███████▉                                                                                                                                               | 6/115 [01:13<18:14, 10.05s/it]  6%|█████████▏                                                                                                                                             | 7/115 [01:21<17:19,  9.63s/it]                                                                                                                                                                                            {'loss': 3.5497, 'grad_norm': 0.3416990339756012, 'learning_rate': 2.1818181818181818e-05, 'ppl': 34.80288, 'memory/max_active (GiB)': 46.14, 'memory/max_allocated (GiB)': 46.14, 'memory/device_reserved (GiB)': 60.61, 'tokens/train_per_sec_per_gpu': 0.0, 'tokens/total': 0, 'tokens/trainable': 0, 'epoch': 0.12}
  6%|█████████▏                                                                                                                                             | 7/115 [01:21<17:19,  9.63s/it]  7%|██████████▌                                                                                                                                            | 8/115 [01:30<16:46,  9.40s/it]                                                                                                                                                                                            {'loss': 3.2556, 'grad_norm': 0.26129773259162903, 'learning_rate': 2.5454545454545457e-05, 'ppl': 25.93517, 'memory/max_active (GiB)': 46.14, 'memory/max_allocated (GiB)': 46.14, 'memory/device_reserved (GiB)': 60.61, 'tokens/train_per_sec_per_gpu': 0.0, 'tokens/total': 0, 'tokens/trainable': 0, 'epoch': 0.14}
  7%|██████████▌                                                                                                                                            | 8/115 [01:30<16:46,  9.40s/it]  8%|███████████▊                                                                                                                                           | 9/115 [01:39<16:21,  9.26s/it]                                                                                                                                                                                            {'loss': 3.4488, 'grad_norm': 0.3081296384334564, 'learning_rate': 2.9090909090909093e-05, 'ppl': 31.46261, 'memory/max_active (GiB)': 46.14, 'memory/max_allocated (GiB)': 46.14, 'memory/device_reserved (GiB)': 60.61, 'tokens/train_per_sec_per_gpu': 0.0, 'tokens/total': 0, 'tokens/trainable': 0, 'epoch': 0.16}
  8%|███████████▊                                                                                                                                           | 9/115 [01:39<16:21,  9.26s/it]  9%|█████████████                                                                                                                                         | 10/115 [01:48<16:02,  9.17s/it]                                                                                                                                                                                            {'loss': 3.2793, 'grad_norm': 0.2935417890548706, 'learning_rate': 3.272727272727273e-05, 'ppl': 26.55718, 'memory/max_active (GiB)': 46.14, 'memory/max_allocated (GiB)': 46.14, 'memory/device_reserved (GiB)': 60.61, 'tokens/train_per_sec_per_gpu': 0.0, 'tokens/total': 0, 'tokens/trainable': 0, 'epoch': 0.17}
  9%|█████████████                                                                                                                                         | 10/115 [01:48<16:02,  9.17s/it] 10%|██████████████▎                                                                                                                                       | 11/115 [01:57<15:43,  9.07s/it]                                                                                                                                                                                            {'loss': 3.5966, 'grad_norm': 0.31488341093063354, 'learning_rate': 3.6363636363636364e-05, 'ppl': 36.47401, 'memory/max_active (GiB)': 46.13, 'memory/max_allocated (GiB)': 46.13, 'memory/device_reserved (GiB)': 60.61, 'tokens/train_per_sec_per_gpu': 0.0, 'tokens/total': 0, 'tokens/trainable': 0, 'epoch': 0.19}
 10%|██████████████▎                                                                                                                                       | 11/115 [01:57<15:43,  9.07s/it] 10%|███████████████▋                                                                                                                                      | 12/115 [02:06<15:30,  9.03s/it]                                                                                                                                                                                            {'loss': 3.1424, 'grad_norm': 0.29208967089653015, 'learning_rate': 4e-05, 'ppl': 23.15938, 'memory/max_active (GiB)': 46.14, 'memory/max_allocated (GiB)': 46.14, 'memory/device_reserved (GiB)': 60.61, 'tokens/train_per_sec_per_gpu': 0.0, 'tokens/total': 0, 'tokens/trainable': 0, 'epoch': 0.21}
 10%|███████████████▋                                                                                                                                      | 12/115 [02:06<15:30,  9.03s/it] 11%|████████████████▉                                                                                                                                     | 13/115 [02:15<15:17,  8.99s/it]                                                                                                                                                                                            {'loss': 3.3177, 'grad_norm': 0.31290557980537415, 'learning_rate': 3.9990875689790674e-05, 'ppl': 27.5968, 'memory/max_active (GiB)': 46.13, 'memory/max_allocated (GiB)': 46.13, 'memory/device_reserved (GiB)': 60.61, 'tokens/train_per_sec_per_gpu': 0.0, 'tokens/total': 0, 'tokens/trainable': 0, 'epoch': 0.23}
 11%|████████████████▉                                                                                                                                     | 13/115 [02:15<15:17,  8.99s/it] 12%|██████████████████▎                                                                                                                                   | 14/115 [02:24<15:03,  8.94s/it]                                                                                                                                                                                            {'loss': 3.4898, 'grad_norm': 0.35686081647872925, 'learning_rate': 3.996351108446635e-05, 'ppl': 32.77939, 'memory/max_active (GiB)': 46.14, 'memory/max_allocated (GiB)': 46.14, 'memory/device_reserved (GiB)': 60.61, 'tokens/train_per_sec_per_gpu': 0.0, 'tokens/total': 0, 'tokens/trainable': 0, 'epoch': 0.24}
 12%|██████████████████▎                                                                                                                                   | 14/115 [02:24<15:03,  8.94s/it] 13%|███████████████████▌                                                                                                                                  | 15/115 [02:33<14:53,  8.94s/it]                                                                                                                                                                                            {'loss': 3.4189, 'grad_norm': 0.33751943707466125, 'learning_rate': 3.991793115234182e-05, 'ppl': 30.53581, 'memory/max_active (GiB)': 46.14, 'memory/max_allocated (GiB)': 46.14, 'memory/device_reserved (GiB)': 60.61, 'tokens/train_per_sec_per_gpu': 0.0, 'tokens/total': 0, 'tokens/trainable': 0, 'epoch': 0.26}
 13%|███████████████████▌                                                                                                                                  | 15/115 [02:33<14:53,  8.94s/it][2026-04-23 22:57:19,275] [INFO] [axolotl.core.trainers.base.evaluate:400] [PID:2655295] Running evaluation step...

  0%|                                                                                                                                                                | 0/26 [00:00<?, ?it/s][A
  8%|███████████▋                                                                                                                                            | 2/26 [00:00<00:08,  2.96it/s][A
 12%|█████████████████▌                                                                                                                                      | 3/26 [00:01<00:10,  2.15it/s][A
 15%|███████████████████████▍                                                                                                                                | 4/26 [00:01<00:11,  1.88it/s][A
 19%|█████████████████████████████▏                                                                                                                          | 5/26 [00:02<00:13,  1.60it/s][A
 23%|███████████████████████████████████                                                                                                                     | 6/26 [00:03<00:12,  1.60it/s][A
 27%|████████████████████████████████████████▉                                                                                                               | 7/26 [00:04<00:12,  1.56it/s][A
 31%|██████████████████████████████████████████████▊                                                                                                         | 8/26 [00:04<00:11,  1.54it/s][A
 35%|████████████████████████████████████████████████████▌                                                                                                   | 9/26 [00:05<00:11,  1.48it/s][A
 38%|██████████████████████████████████████████████████████████                                                                                             | 10/26 [00:06<00:10,  1.51it/s][A
 42%|███████████████████████████████████████████████████████████████▉                                                                                       | 11/26 [00:06<00:09,  1.52it/s][A
 46%|█████████████████████████████████████████████████████████████████████▋                                                                                 | 12/26 [00:07<00:09,  1.53it/s][A
 50%|███████████████████████████████████████████████████████████████████████████▌                                                                           | 13/26 [00:08<00:08,  1.47it/s][A
 54%|█████████████████████████████████████████████████████████████████████████████████▎                                                                     | 14/26 [00:08<00:08,  1.50it/s][A
 58%|███████████████████████████████████████████████████████████████████████████████████████                                                                | 15/26 [00:09<00:07,  1.51it/s][A
 62%|████████████████████████████████████████████████████████████████████████████████████████████▉                                                          | 16/26 [00:10<00:06,  1.51it/s][A
 65%|██████████████████████████████████████████████████████████████████████████████████████████████████▋                                                    | 17/26 [00:10<00:06,  1.45it/s][A
 69%|████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                              | 18/26 [00:11<00:05,  1.49it/s][A
 73%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                        | 19/26 [00:12<00:04,  1.50it/s][A
 77%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                  | 20/26 [00:12<00:03,  1.50it/s][A
 81%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                             | 21/26 [00:13<00:03,  1.46it/s][A
 85%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                       | 22/26 [00:14<00:02,  1.48it/s][A
 88%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                 | 23/26 [00:14<00:02,  1.49it/s][A
 92%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍           | 24/26 [00:15<00:01,  1.49it/s][A
 96%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏     | 25/26 [00:16<00:00,  1.45it/s][A
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 26/26 [00:16<00:00,  1.45it/s][A                                                                                                                                                                                            
                                                                                                                                                                                            [A{'eval_loss': 0.8127650618553162, 'eval_runtime': 17.8171, 'eval_samples_per_second': 22.956, 'eval_steps_per_second': 1.459, 'eval_ppl': 2.25413, 'memory/max_active (GiB)': 42.7, 'memory/max_allocated (GiB)': 42.7, 'memory/device_reserved (GiB)': 60.61, 'epoch': 0.26, 'tokens/train_per_sec_per_gpu': 0.0}
 13%|███████████████████▌                                                                                                                                  | 15/115 [02:51<14:53,  8.94s/it]
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 26/26 [00:17<00:00,  1.45it/s][A
                                                                                                                                                                                            [A[2026-04-23 22:57:37,108] [INFO] [axolotl.core.trainers.base._save:721] [PID:2655295] Saving model checkpoint to /home/tkwang/links/scratch/SecSteer-v2/axolotl-outputs/lora/Qwen2.5-Coder-7B-stage2-secure-token-diff-ctx0/checkpoint-15
[2026-04-23 22:57:37,751] [WARNING] [py.warnings._showwarnmsg:112] [PID:2655295] /scratch/tkwang/SecSteer-v2/.venv/lib/python3.12/site-packages/torch/distributed/c10d_logger.py:83: UserWarning: barrier(): using the device under current context. You can specify `device_id` in `init_process_group` to mute this warning.
  return func(*args, **kwargs)

 14%|████████████████████▊                                                                                                                                 | 16/115 [03:00<24:03, 14.58s/it]                                                                                                                                                                                            {'loss': 2.9616, 'grad_norm': 0.3194446265697479, 'learning_rate': 3.985417748196108e-05, 'ppl': 19.32887, 'memory/max_active (GiB)': 46.14, 'memory/max_allocated (GiB)': 46.14, 'memory/device_reserved (GiB)': 62.92, 'tokens/train_per_sec_per_gpu': 0.0, 'tokens/total': 0, 'tokens/trainable': 0, 'epoch': 0.28}
 14%|████████████████████▊                                                                                                                                 | 16/115 [03:00<24:03, 14.58s/it] 15%|██████████████████████▏                                                                                                                               | 17/115 [03:09<21:03, 12.90s/it]                                                                                                                                                                                            {'loss': 3.1854, 'grad_norm': 0.27943116426467896, 'learning_rate': 3.977230824415069e-05, 'ppl': 24.17696, 'memory/max_active (GiB)': 46.14, 'memory/max_allocated (GiB)': 46.14, 'memory/device_reserved (GiB)': 62.92, 'tokens/train_per_sec_per_gpu': 0.0, 'tokens/total': 0, 'tokens/trainable': 0, 'epoch': 0.3}
 15%|██████████████████████▏                                                                                                                               | 17/115 [03:09<21:03, 12.90s/it] 16%|███████████████████████▍                                                                                                                              | 18/115 [03:18<18:54, 11.69s/it]                                                                                                                                                                                            {'loss': 3.2153, 'grad_norm': 0.2595396935939789, 'learning_rate': 3.967239813894288e-05, 'ppl': 24.91076, 'memory/max_active (GiB)': 46.14, 'memory/max_allocated (GiB)': 46.14, 'memory/device_reserved (GiB)': 62.92, 'tokens/train_per_sec_per_gpu': 0.0, 'tokens/total': 0, 'tokens/trainable': 0, 'epoch': 0.31}
 16%|███████████████████████▍                                                                                                                              | 18/115 [03:18<18:54, 11.69s/it] 17%|████████████████████████▊                                                                                                                             | 19/115 [03:27<17:20, 10.84s/it]                                                                                                                                                                                            {'loss': 3.3825, 'grad_norm': 0.286888986825943, 'learning_rate': 3.955453832741694e-05, 'ppl': 29.44429, 'memory/max_active (GiB)': 46.13, 'memory/max_allocated (GiB)': 46.13, 'memory/device_reserved (GiB)': 62.92, 'tokens/train_per_sec_per_gpu': 0.0, 'tokens/total': 0, 'tokens/trainable': 0, 'epoch': 0.33}
 17%|████████████████████████▊                                                                                                                             | 19/115 [03:27<17:20, 10.84s/it] 17%|██████████████████████████                                                                                                                            | 20/115 [03:36<16:13, 10.25s/it]                                                                                                                                                                                            {'loss': 3.1478, 'grad_norm': 0.22283361852169037, 'learning_rate': 3.9418836348521045e-05, 'ppl': 23.28478, 'memory/max_active (GiB)': 46.13, 'memory/max_allocated (GiB)': 46.13, 'memory/device_reserved (GiB)': 62.92, 'tokens/train_per_sec_per_gpu': 0.0, 'tokens/total': 0, 'tokens/trainable': 0, 'epoch': 0.35}
 17%|██████████████████████████                                                                                                                            | 20/115 [03:36<16:13, 10.25s/it] 18%|███████████████████████████▍                                                                                                                          | 21/115 [03:45<15:24,  9.84s/it]                                                                                                                                                                                            {'loss': 3.1876, 'grad_norm': 0.24746249616146088, 'learning_rate': 3.926541602095033e-05, 'ppl': 24.23021, 'memory/max_active (GiB)': 46.13, 'memory/max_allocated (GiB)': 46.13, 'memory/device_reserved (GiB)': 62.92, 'tokens/train_per_sec_per_gpu': 0.0, 'tokens/total': 0, 'tokens/trainable': 0, 'epoch': 0.37}
 18%|███████████████████████████▍                                                                                                                          | 21/115 [03:45<15:24,  9.84s/it] 19%|████████████████████████████▋                                                                                                                         | 22/115 [03:54<14:47,  9.54s/it]                                                                                                                                                                                            {'loss': 3.0932, 'grad_norm': 0.2526022791862488, 'learning_rate': 3.909441733017092e-05, 'ppl': 22.04752, 'memory/max_active (GiB)': 46.13, 'memory/max_allocated (GiB)': 46.13, 'memory/device_reserved (GiB)': 62.92, 'tokens/train_per_sec_per_gpu': 0.0, 'tokens/total': 0, 'tokens/trainable': 0, 'epoch': 0.38}
 19%|████████████████████████████▋                                                                                                                         | 22/115 [03:54<14:47,  9.54s/it] 20%|██████████████████████████████                                                                                                                        | 23/115 [04:03<14:21,  9.36s/it]                                                                                                                                                                                            {'loss': 3.1009, 'grad_norm': 0.18958109617233276, 'learning_rate': 3.8905996300692806e-05, 'ppl': 22.21794, 'memory/max_active (GiB)': 46.13, 'memory/max_allocated (GiB)': 46.13, 'memory/device_reserved (GiB)': 62.92, 'tokens/train_per_sec_per_gpu': 0.0, 'tokens/total': 0, 'tokens/trainable': 0, 'epoch': 0.4}
 20%|██████████████████████████████                                                                                                                        | 23/115 [04:03<14:21,  9.36s/it] 21%|███████████████████████████████▎                                                                                                                      | 24/115 [04:11<13:57,  9.20s/it]                                                                                                                                                                                            {'loss': 2.6369, 'grad_norm': 0.21026931703090668, 'learning_rate': 3.8700324853708304e-05, 'ppl': 13.96983, 'memory/max_active (GiB)': 46.14, 'memory/max_allocated (GiB)': 46.14, 'memory/device_reserved (GiB)': 62.92, 'tokens/train_per_sec_per_gpu': 0.0, 'tokens/total': 0, 'tokens/trainable': 0, 'epoch': 0.42}
 21%|███████████████████████████████▎                                                                                                                      | 24/115 [04:11<13:57,  9.20s/it] 22%|████████████████████████████████▌                                                                                                                     | 25/115 [04:20<13:43,  9.15s/it]                                                                                                                                                                                            {'loss': 3.2383, 'grad_norm': 0.22608548402786255, 'learning_rate': 3.8477590650225735e-05, 'ppl': 25.49035, 'memory/max_active (GiB)': 46.13, 'memory/max_allocated (GiB)': 46.13, 'memory/device_reserved (GiB)': 62.92, 'tokens/train_per_sec_per_gpu': 0.0, 'tokens/total': 0, 'tokens/trainable': 0, 'epoch': 0.43}
 22%|████████████████████████████████▌                                                                                                                     | 25/115 [04:20<13:43,  9.15s/it] 23%|█████████████████████████████████▉                                                                                                                    | 26/115 [04:29<13:27,  9.08s/it]                                                                                                                                                                                            {'loss': 3.1365, 'grad_norm': 0.22349852323532104, 'learning_rate': 3.82379969198418e-05, 'ppl': 23.02314, 'memory/max_active (GiB)': 46.13, 'memory/max_allocated (GiB)': 46.13, 'memory/device_reserved (GiB)': 62.92, 'tokens/train_per_sec_per_gpu': 0.0, 'tokens/total': 0, 'tokens/trainable': 0, 'epoch': 0.45}
 23%|█████████████████████████████████▉                                                                                                                    | 26/115 [04:29<13:27,  9.08s/it] 23%|███████████████████████████████████▏                                                                                                                  | 27/115 [04:38<13:15,  9.04s/it]                                                                                                                                                                                            {'loss': 2.9747, 'grad_norm': 0.18596747517585754, 'learning_rate': 3.798176227530852e-05, 'ppl': 19.58375, 'memory/max_active (GiB)': 46.14, 'memory/max_allocated (GiB)': 46.14, 'memory/device_reserved (GiB)': 62.92, 'tokens/train_per_sec_per_gpu': 0.0, 'tokens/total': 0, 'tokens/trainable': 0, 'epoch': 0.47}
 23%|███████████████████████████████████▏                                                                                                                  | 27/115 [04:38<13:15,  9.04s/it] 24%|████████████████████████████████████▌                                                                                                                 | 28/115 [04:47<13:03,  9.01s/it]                                                                                                                                                                                            {'loss': 2.8119, 'grad_norm': 0.18497376143932343, 'learning_rate': 3.7709120513064196e-05, 'ppl': 16.64151, 'memory/max_active (GiB)': 46.13, 'memory/max_allocated (GiB)': 46.13, 'memory/device_reserved (GiB)': 62.92, 'tokens/train_per_sec_per_gpu': 0.0, 'tokens/total': 0, 'tokens/trainable': 0, 'epoch': 0.49}
 24%|████████████████████████████████████▌                                                                                                                 | 28/115 [04:47<13:03,  9.01s/it] 25%|█████████████████████████████████████▊                                                                                                                | 29/115 [04:56<12:51,  8.97s/it]                                                                                                                                                                                            {'loss': 2.9933, 'grad_norm': 0.23579834401607513, 'learning_rate': 3.7420320399910315e-05, 'ppl': 19.95141, 'memory/max_active (GiB)': 46.14, 'memory/max_allocated (GiB)': 46.14, 'memory/device_reserved (GiB)': 62.92, 'tokens/train_per_sec_per_gpu': 0.0, 'tokens/total': 0, 'tokens/trainable': 0, 'epoch': 0.5}
 25%|█████████████████████████████████████▊                                                                                                                | 29/115 [04:56<12:51,  8.97s/it] 26%|███████████████████████████████████████▏                                                                                                              | 30/115 [05:05<12:40,  8.95s/it]                                                                                                                                                                                            {'loss': 3.1757, 'grad_norm': 0.20321913063526154, 'learning_rate': 3.711562544602895e-05, 'ppl': 23.94357, 'memory/max_active (GiB)': 46.14, 'memory/max_allocated (GiB)': 46.14, 'memory/device_reserved (GiB)': 62.92, 'tokens/train_per_sec_per_gpu': 0.0, 'tokens/total': 0, 'tokens/trainable': 0, 'epoch': 0.52}
 26%|███████████████████████████████████████▏                                                                                                              | 30/115 [05:05<12:40,  8.95s/it][2026-04-23 22:59:51,681] [INFO] [axolotl.core.trainers.base.evaluate:400] [PID:2655295] Running evaluation step...

  0%|                                                                                                                                                                | 0/26 [00:00<?, ?it/s][A
  8%|███████████▋                                                                                                                                            | 2/26 [00:00<00:08,  3.00it/s][A
 12%|█████████████████▌                                                                                                                                      | 3/26 [00:01<00:10,  2.16it/s][A
 15%|███████████████████████▍                                                                                                                                | 4/26 [00:01<00:11,  1.89it/s][A
 19%|█████████████████████████████▏                                                                                                                          | 5/26 [00:02<00:13,  1.58it/s][A
 23%|███████████████████████████████████                                                                                                                     | 6/26 [00:03<00:12,  1.59it/s][A
 27%|████████████████████████████████████████▉                                                                                                               | 7/26 [00:04<00:12,  1.56it/s][A
 31%|██████████████████████████████████████████████▊                                                                                                         | 8/26 [00:04<00:11,  1.54it/s][A
 35%|████████████████████████████████████████████████████▌                                                                                                   | 9/26 [00:05<00:11,  1.47it/s][A
 38%|██████████████████████████████████████████████████████████                                                                                             | 10/26 [00:06<00:10,  1.51it/s][A
 42%|███████████████████████████████████████████████████████████████▉                                                                                       | 11/26 [00:06<00:09,  1.52it/s][A
 46%|█████████████████████████████████████████████████████████████████████▋                                                                                 | 12/26 [00:07<00:09,  1.53it/s][A
 50%|███████████████████████████████████████████████████████████████████████████▌                                                                           | 13/26 [00:08<00:08,  1.46it/s][A
 54%|█████████████████████████████████████████████████████████████████████████████████▎                                                                     | 14/26 [00:08<00:08,  1.50it/s][A
 58%|███████████████████████████████████████████████████████████████████████████████████████                                                                | 15/26 [00:09<00:07,  1.51it/s][A
 62%|████████████████████████████████████████████████████████████████████████████████████████████▉                                                          | 16/26 [00:10<00:06,  1.51it/s][A
 65%|██████████████████████████████████████████████████████████████████████████████████████████████████▋                                                    | 17/26 [00:10<00:06,  1.45it/s][A
 69%|████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                              | 18/26 [00:11<00:05,  1.49it/s][A
 73%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                        | 19/26 [00:12<00:04,  1.50it/s][A
 77%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                  | 20/26 [00:12<00:03,  1.50it/s][A
 81%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                             | 21/26 [00:13<00:03,  1.45it/s][A
 85%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                       | 22/26 [00:14<00:02,  1.47it/s][A
 88%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                 | 23/26 [00:14<00:02,  1.49it/s][A
 92%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍           | 24/26 [00:15<00:01,  1.49it/s][A
 96%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏     | 25/26 [00:16<00:00,  1.45it/s][A
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 26/26 [00:16<00:00,  1.45it/s][A                                                                                                                                                                                            
                                                                                                                                                                                            [A{'eval_loss': 0.766757071018219, 'eval_runtime': 17.8285, 'eval_samples_per_second': 22.941, 'eval_steps_per_second': 1.458, 'eval_ppl': 2.15277, 'memory/max_active (GiB)': 42.7, 'memory/max_allocated (GiB)': 42.7, 'memory/device_reserved (GiB)': 62.92, 'epoch': 0.52, 'tokens/train_per_sec_per_gpu': 0.0}
 26%|███████████████████████████████████████▏                                                                                                              | 30/115 [05:23<12:40,  8.95s/it]
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 26/26 [00:17<00:00,  1.45it/s][A
                                                                                                                                                                                            [A[2026-04-23 23:00:09,523] [INFO] [axolotl.core.trainers.base._save:721] [PID:2655295] Saving model checkpoint to /home/tkwang/links/scratch/SecSteer-v2/axolotl-outputs/lora/Qwen2.5-Coder-7B-stage2-secure-token-diff-ctx0/checkpoint-30
[2026-04-23 23:00:10,039] [WARNING] [py.warnings._showwarnmsg:112] [PID:2655295] /scratch/tkwang/SecSteer-v2/.venv/lib/python3.12/site-packages/torch/distributed/c10d_logger.py:83: UserWarning: barrier(): using the device under current context. You can specify `device_id` in `init_process_group` to mute this warning.
  return func(*args, **kwargs)

 27%|████████████████████████████████████████▍                                                                                                             | 31/115 [05:33<20:27, 14.61s/it]                                                                                                                                                                                            {'loss': 3.4182, 'grad_norm': 0.16806775331497192, 'learning_rate': 3.6795313664547965e-05, 'ppl': 30.51444, 'memory/max_active (GiB)': 46.14, 'memory/max_allocated (GiB)': 46.14, 'memory/device_reserved (GiB)': 62.92, 'tokens/train_per_sec_per_gpu': 0.0, 'tokens/total': 0, 'tokens/trainable': 0, 'epoch': 0.54}
 27%|████████████████████████████████████████▍                                                                                                             | 31/115 [05:33<20:27, 14.61s/it] 28%|█████████████████████████████████████████▋                                                                                                            | 32/115 [05:42<17:51, 12.91s/it]                                                                                                                                                                                            {'loss': 3.0792, 'grad_norm': 0.16096840798854828, 'learning_rate': 3.645967731787313e-05, 'ppl': 21.741, 'memory/max_active (GiB)': 46.14, 'memory/max_allocated (GiB)': 46.14, 'memory/device_reserved (GiB)': 62.92, 'tokens/train_per_sec_per_gpu': 0.0, 'tokens/total': 0, 'tokens/trainable': 0, 'epoch': 0.56}
 28%|█████████████████████████████████████████▋                                                                                                            | 32/115 [05:42<17:51, 12.91s/it] 29%|███████████████████████████████████████████                                                                                                           | 33/115 [05:51<16:03, 11.75s/it]                                                                                                                                                                                            {'loss': 2.9631, 'grad_norm': 0.17233510315418243, 'learning_rate': 3.610902265101892e-05, 'ppl': 19.35789, 'memory/max_active (GiB)': 46.13, 'memory/max_allocated (GiB)': 46.13, 'memory/device_reserved (GiB)': 62.92, 'tokens/train_per_sec_per_gpu': 0.0, 'tokens/total': 0, 'tokens/trainable': 0, 'epoch': 0.57}
 29%|███████████████████████████████████████████                                                                                                           | 33/115 [05:51<16:03, 11.75s/it] 30%|████████████████████████████████████████████▎                                                                                                         | 34/115 [06:00<14:42, 10.89s/it]                                                                                                                                                                                            {'loss': 3.0288, 'grad_norm': 0.16947439312934875, 'learning_rate': 3.5743669612181004e-05, 'ppl': 20.67241, 'memory/max_active (GiB)': 46.13, 'memory/max_allocated (GiB)': 46.13, 'memory/device_reserved (GiB)': 62.92, 'tokens/train_per_sec_per_gpu': 0.0, 'tokens/total': 0, 'tokens/trainable': 0, 'epoch': 0.59}
 30%|████████████████████████████████████████████▎                                                                                                         | 34/115 [06:00<14:42, 10.89s/it] 30%|█████████████████████████████████████████████▋                                                                                                        | 35/115 [06:09<13:43, 10.30s/it]                                                                                                                                                                                            {'loss': 3.4427, 'grad_norm': 0.18328624963760376, 'learning_rate': 3.5363951560805615e-05, 'ppl': 31.27128, 'memory/max_active (GiB)': 46.13, 'memory/max_allocated (GiB)': 46.13, 'memory/device_reserved (GiB)': 62.92, 'tokens/train_per_sec_per_gpu': 0.0, 'tokens/total': 0, 'tokens/trainable': 0, 'epoch': 0.61}
 30%|█████████████████████████████████████████████▋                                                                                                        | 35/115 [06:09<13:43, 10.30s/it] 31%|██████████████████████████████████████████████▉                                                                                                       | 36/115 [06:18<13:00,  9.88s/it]                                                                                                                                                                                            {'loss': 3.0928, 'grad_norm': 0.17147624492645264, 'learning_rate': 3.497021496342203e-05, 'ppl': 22.0387, 'memory/max_active (GiB)': 46.13, 'memory/max_allocated (GiB)': 46.13, 'memory/device_reserved (GiB)': 62.92, 'tokens/train_per_sec_per_gpu': 0.0, 'tokens/total': 0, 'tokens/trainable': 0, 'epoch': 0.63}
 31%|██████████████████████████████████████████████▉                                                                                                       | 36/115 [06:18<13:00,  9.88s/it] 32%|████████████████████████████████████████████████▎                                                                                                     | 37/115 [06:26<12:26,  9.57s/it]                                                                                                                                                                                            {'loss': 2.9267, 'grad_norm': 0.2726716995239258, 'learning_rate': 3.456281907751577e-05, 'ppl': 18.66593, 'memory/max_active (GiB)': 46.14, 'memory/max_allocated (GiB)': 46.14, 'memory/device_reserved (GiB)': 62.92, 'tokens/train_per_sec_per_gpu': 0.0, 'tokens/total': 0, 'tokens/trainable': 0, 'epoch': 0.64}
 32%|████████████████████████████████████████████████▎                                                                                                     | 37/115 [06:26<12:26,  9.57s/it] 33%|█████████████████████████████████████████████████▌                                                                                                    | 38/115 [06:35<11:59,  9.34s/it]                                                                                                                                                                                            {'loss': 3.0869, 'grad_norm': 0.2066512107849121, 'learning_rate': 3.4142135623730954e-05, 'ppl': 21.90905, 'memory/max_active (GiB)': 46.13, 'memory/max_allocated (GiB)': 46.13, 'memory/device_reserved (GiB)': 62.92, 'tokens/train_per_sec_per_gpu': 0.0, 'tokens/total': 0, 'tokens/trainable': 0, 'epoch': 0.66}
 33%|█████████████████████████████████████████████████▌                                                                                                    | 38/115 [06:35<11:59,  9.34s/it] 34%|██████████████████████████████████████████████████▊                                                                                                   | 39/115 [06:44<11:40,  9.22s/it]                                                                                                                                                                                            {'loss': 2.8763, 'grad_norm': 0.1685408502817154, 'learning_rate': 3.37085484467008e-05, 'ppl': 17.74848, 'memory/max_active (GiB)': 46.14, 'memory/max_allocated (GiB)': 46.14, 'memory/device_reserved (GiB)': 62.92, 'tokens/train_per_sec_per_gpu': 0.0, 'tokens/total': 0, 'tokens/trainable': 0, 'epoch': 0.68}
 34%|██████████████████████████████████████████████████▊                                                                                                   | 39/115 [06:44<11:40,  9.22s/it] 35%|████████████████████████████████████████████████████▏                                                                                                 | 40/115 [06:53<11:25,  9.14s/it]                                                                                                                                                                                            {'loss': 3.2184, 'grad_norm': 0.2757796347141266, 'learning_rate': 3.326245316481591e-05, 'ppl': 24.98811, 'memory/max_active (GiB)': 46.14, 'memory/max_allocated (GiB)': 46.14, 'memory/device_reserved (GiB)': 62.92, 'tokens/train_per_sec_per_gpu': 0.0, 'tokens/total': 0, 'tokens/trainable': 0, 'epoch': 0.7}
 35%|████████████████████████████████████████████████████▏                                                                                                 | 40/115 [06:53<11:25,  9.14s/it] 36%|█████████████████████████████████████████████████████▍                                                                                                | 41/115 [07:02<11:09,  9.05s/it]                                                                                                                                                                                            {'loss': 3.4453, 'grad_norm': 0.22466956079006195, 'learning_rate': 3.280425680924976e-05, 'ppl': 31.35269, 'memory/max_active (GiB)': 46.13, 'memory/max_allocated (GiB)': 46.13, 'memory/device_reserved (GiB)': 62.92, 'tokens/train_per_sec_per_gpu': 0.0, 'tokens/total': 0, 'tokens/trainable': 0, 'epoch': 0.71}
 36%|█████████████████████████████████████████████████████▍                                                                                                | 41/115 [07:02<11:09,  9.05s/it] 37%|██████████████████████████████████████████████████████▊                                                                                               | 42/115 [07:11<10:54,  8.97s/it]                                                                                                                                                                                            {'loss': 3.0665, 'grad_norm': 0.2076496183872223, 'learning_rate': 3.2334377452570866e-05, 'ppl': 21.46664, 'memory/max_active (GiB)': 46.13, 'memory/max_allocated (GiB)': 46.13, 'memory/device_reserved (GiB)': 62.92, 'tokens/train_per_sec_per_gpu': 0.0, 'tokens/total': 0, 'tokens/trainable': 0, 'epoch': 0.73}
 37%|██████████████████████████████████████████████████████▊                                                                                               | 42/115 [07:11<10:54,  8.97s/it] 37%|████████████████████████████████████████████████████████                                                                                              | 43/115 [07:20<10:43,  8.94s/it]                                                                                                                                                                                            {'loss': 3.0176, 'grad_norm': 0.18503433465957642, 'learning_rate': 3.185324382728034e-05, 'ppl': 20.44217, 'memory/max_active (GiB)': 46.14, 'memory/max_allocated (GiB)': 46.14, 'memory/device_reserved (GiB)': 62.92, 'tokens/train_per_sec_per_gpu': 0.0, 'tokens/total': 0, 'tokens/trainable': 0, 'epoch': 0.75}
 37%|████████████████████████████████████████████████████████                                                                                              | 43/115 [07:20<10:43,  8.94s/it] 38%|█████████████████████████████████████████████████████████▍                                                                                            | 44/115 [07:28<10:30,  8.88s/it]                                                                                                                                                                                            {'loss': 3.1918, 'grad_norm': 0.20189054310321808, 'learning_rate': 3.136129493462312e-05, 'ppl': 24.33219, 'memory/max_active (GiB)': 46.13, 'memory/max_allocated (GiB)': 46.13, 'memory/device_reserved (GiB)': 62.92, 'tokens/train_per_sec_per_gpu': 0.0, 'tokens/total': 0, 'tokens/trainable': 0, 'epoch': 0.77}
 38%|█████████████████████████████████████████████████████████▍                                                                                            | 44/115 [07:28<10:30,  8.88s/it] 39%|██████████████████████████████████████████████████████████▋                                                                                           | 45/115 [07:37<10:20,  8.87s/it]                                                                                                                                                                                            {'loss': 3.0517, 'grad_norm': 0.18101483583450317, 'learning_rate': 3.085897964402958e-05, 'ppl': 21.15127, 'memory/max_active (GiB)': 46.14, 'memory/max_allocated (GiB)': 46.14, 'memory/device_reserved (GiB)': 62.92, 'tokens/train_per_sec_per_gpu': 0.0, 'tokens/total': 0, 'tokens/trainable': 0, 'epoch': 0.78}
 39%|██████████████████████████████████████████████████████████▋                                                                                           | 45/115 [07:37<10:20,  8.87s/it][2026-04-23 23:02:23,812] [INFO] [axolotl.core.trainers.base.evaluate:400] [PID:2655295] Running evaluation step...

  0%|                                                                                                                                                                | 0/26 [00:00<?, ?it/s][A
  8%|███████████▋                                                                                                                                            | 2/26 [00:00<00:07,  3.01it/s][A
 12%|█████████████████▌                                                                                                                                      | 3/26 [00:01<00:10,  2.16it/s][A
 15%|███████████████████████▍                                                                                                                                | 4/26 [00:01<00:11,  1.89it/s][A
 19%|█████████████████████████████▏                                                                                                                          | 5/26 [00:02<00:13,  1.59it/s][A
 23%|███████████████████████████████████                                                                                                                     | 6/26 [00:03<00:12,  1.59it/s][A
 27%|████████████████████████████████████████▉                                                                                                               | 7/26 [00:04<00:12,  1.56it/s][A
 31%|██████████████████████████████████████████████▊                                                                                                         | 8/26 [00:04<00:11,  1.54it/s][A
 35%|████████████████████████████████████████████████████▌                                                                                                   | 9/26 [00:05<00:11,  1.47it/s][A
 38%|██████████████████████████████████████████████████████████                                                                                             | 10/26 [00:06<00:10,  1.51it/s][A
 42%|███████████████████████████████████████████████████████████████▉                                                                                       | 11/26 [00:06<00:09,  1.52it/s][A
 46%|█████████████████████████████████████████████████████████████████████▋                                                                                 | 12/26 [00:07<00:09,  1.52it/s][A
 50%|███████████████████████████████████████████████████████████████████████████▌                                                                           | 13/26 [00:08<00:08,  1.46it/s][A
 54%|█████████████████████████████████████████████████████████████████████████████████▎                                                                     | 14/26 [00:08<00:08,  1.50it/s][A
 58%|███████████████████████████████████████████████████████████████████████████████████████                                                                | 15/26 [00:09<00:07,  1.51it/s][A
 62%|████████████████████████████████████████████████████████████████████████████████████████████▉                                                          | 16/26 [00:10<00:06,  1.51it/s][A
 65%|██████████████████████████████████████████████████████████████████████████████████████████████████▋                                                    | 17/26 [00:10<00:06,  1.45it/s][A
 69%|████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                              | 18/26 [00:11<00:05,  1.49it/s][A
 73%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                        | 19/26 [00:12<00:04,  1.50it/s][A
 77%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                  | 20/26 [00:12<00:03,  1.50it/s][A
 81%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                             | 21/26 [00:13<00:03,  1.45it/s][A
 85%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                       | 22/26 [00:14<00:02,  1.48it/s][A
 88%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                 | 23/26 [00:14<00:02,  1.49it/s][A
 92%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍           | 24/26 [00:15<00:01,  1.49it/s][A
 96%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏     | 25/26 [00:16<00:00,  1.44it/s][A
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 26/26 [00:16<00:00,  1.45it/s][A                                                                                                                                                                                            
                                                                                                                                                                                            [A{'eval_loss': 0.7548192143440247, 'eval_runtime': 17.8612, 'eval_samples_per_second': 22.899, 'eval_steps_per_second': 1.456, 'eval_ppl': 2.12723, 'memory/max_active (GiB)': 42.7, 'memory/max_allocated (GiB)': 42.7, 'memory/device_reserved (GiB)': 62.92, 'epoch': 0.78, 'tokens/train_per_sec_per_gpu': 0.0}
 39%|██████████████████████████████████████████████████████████▋                                                                                           | 45/115 [07:55<10:20,  8.87s/it]
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 26/26 [00:17<00:00,  1.45it/s][A
                                                                                                                                                                                            [A[2026-04-23 23:02:41,689] [INFO] [axolotl.core.trainers.base._save:721] [PID:2655295] Saving model checkpoint to /home/tkwang/links/scratch/SecSteer-v2/axolotl-outputs/lora/Qwen2.5-Coder-7B-stage2-secure-token-diff-ctx0/checkpoint-45
[2026-04-23 23:02:42,173] [WARNING] [py.warnings._showwarnmsg:112] [PID:2655295] /scratch/tkwang/SecSteer-v2/.venv/lib/python3.12/site-packages/torch/distributed/c10d_logger.py:83: UserWarning: barrier(): using the device under current context. You can specify `device_id` in `init_process_group` to mute this warning.
  return func(*args, **kwargs)

 40%|████████████████████████████████████████████████████████████                                                                                          | 46/115 [08:05<16:40, 14.50s/it]                                                                                                                                                                                            {'loss': 2.8716, 'grad_norm': 0.18562503159046173, 'learning_rate': 3.0346756283553138e-05, 'ppl': 17.66526, 'memory/max_active (GiB)': 46.14, 'memory/max_allocated (GiB)': 46.14, 'memory/device_reserved (GiB)': 62.92, 'tokens/train_per_sec_per_gpu': 0.0, 'tokens/total': 0, 'tokens/trainable': 0, 'epoch': 0.8}
 40%|████████████████████████████████████████████████████████████                                                                                          | 46/115 [08:05<16:40, 14.50s/it] 41%|█████████████████████████████████████████████████████████████▎                                                                                        | 47/115 [08:14<14:30, 12.80s/it]                                                                                                                                                                                            {'loss': 2.9933, 'grad_norm': 0.17391358315944672, 'learning_rate': 2.982509222167755e-05, 'ppl': 19.95141, 'memory/max_active (GiB)': 46.14, 'memory/max_allocated (GiB)': 46.14, 'memory/device_reserved (GiB)': 62.92, 'tokens/train_per_sec_per_gpu': 0.0, 'tokens/total': 0, 'tokens/trainable': 0, 'epoch': 0.82}
 41%|█████████████████████████████████████████████████████████████▎                                                                                        | 47/115 [08:14<14:30, 12.80s/it] 42%|██████████████████████████████████████████████████████████████▌                                                                                       | 48/115 [08:23<12:58, 11.62s/it]                                                                                                                                                                                            {'loss': 3.1122, 'grad_norm': 0.18484313786029816, 'learning_rate': 2.9294463440875375e-05, 'ppl': 22.47042, 'memory/max_active (GiB)': 46.13, 'memory/max_allocated (GiB)': 46.13, 'memory/device_reserved (GiB)': 62.92, 'tokens/train_per_sec_per_gpu': 0.0, 'tokens/total': 0, 'tokens/trainable': 0, 'epoch': 0.83}
 42%|██████████████████████████████████████████████████████████████▌                                                                                       | 48/115 [08:23<12:58, 11.62s/it] 43%|███████████████████████████████████████████████████████████████▉                                                                                      | 49/115 [08:31<11:52, 10.80s/it]                                                                                                                                                                                            {'loss': 2.6907, 'grad_norm': 0.19169527292251587, 'learning_rate': 2.8755354103306808e-05, 'ppl': 14.74199, 'memory/max_active (GiB)': 46.13, 'memory/max_allocated (GiB)': 46.13, 'memory/device_reserved (GiB)': 62.92, 'tokens/train_per_sec_per_gpu': 0.0, 'tokens/total': 0, 'tokens/trainable': 0, 'epoch': 0.85}
 43%|███████████████████████████████████████████████████████████████▉                                                                                      | 49/115 [08:31<11:52, 10.80s/it] 43%|█████████████████████████████████████████████████████████████████▏                                                                                    | 50/115 [08:40<11:03, 10.21s/it]                                                                                                                                                                                            {'loss': 3.0123, 'grad_norm': 0.17211617529392242, 'learning_rate': 2.820825610905514e-05, 'ppl': 20.33411, 'memory/max_active (GiB)': 46.14, 'memory/max_allocated (GiB)': 46.14, 'memory/device_reserved (GiB)': 62.92, 'tokens/train_per_sec_per_gpu': 0.0, 'tokens/total': 0, 'tokens/trainable': 0, 'epoch': 0.87}
 43%|█████████████████████████████████████████████████████████████████▏                                                                                    | 50/115 [08:40<11:03, 10.21s/it] 44%|██████████████████████████████████████████████████████████████████▌                                                                                   | 51/115 [08:49<10:30,  9.85s/it]                                                                                                                                                                                            {'loss': 2.6819, 'grad_norm': 0.1777673065662384, 'learning_rate': 2.7653668647301797e-05, 'ppl': 14.61283, 'memory/max_active (GiB)': 46.13, 'memory/max_allocated (GiB)': 46.13, 'memory/device_reserved (GiB)': 62.92, 'tokens/train_per_sec_per_gpu': 0.0, 'tokens/total': 0, 'tokens/trainable': 0, 'epoch': 0.89}
 44%|██████████████████████████████████████████████████████████████████▌                                                                                   | 51/115 [08:49<10:30,  9.85s/it] 45%|███████████████████████████████████████████████████████████████████▊                                                                                  | 52/115 [08:58<10:01,  9.55s/it]                                                                                                                                                                                            {'loss': 2.9127, 'grad_norm': 0.16979217529296875, 'learning_rate': 2.7092097740850712e-05, 'ppl': 18.40643, 'memory/max_active (GiB)': 46.14, 'memory/max_allocated (GiB)': 46.14, 'memory/device_reserved (GiB)': 62.92, 'tokens/train_per_sec_per_gpu': 0.0, 'tokens/total': 0, 'tokens/trainable': 0, 'epoch': 0.9}
 45%|███████████████████████████████████████████████████████████████████▊                                                                                  | 52/115 [08:58<10:01,  9.55s/it] 46%|█████████████████████████████████████████████████████████████████████▏                                                                                | 53/115 [09:07<09:39,  9.35s/it]                                                                                                                                                                                            {'loss': 2.9124, 'grad_norm': 0.19475318491458893, 'learning_rate': 2.652405578441739e-05, 'ppl': 18.40091, 'memory/max_active (GiB)': 46.14, 'memory/max_allocated (GiB)': 46.14, 'memory/device_reserved (GiB)': 62.92, 'tokens/train_per_sec_per_gpu': 0.0, 'tokens/total': 0, 'tokens/trainable': 0, 'epoch': 0.92}
 46%|█████████████████████████████████████████████████████████████████████▏                                                                                | 53/115 [09:07<09:39,  9.35s/it] 47%|██████████████████████████████████████████████████████████████████████▍                                                                               | 54/115 [09:16<09:23,  9.24s/it]                                                                                                                                                                                            {'loss': 2.7561, 'grad_norm': 0.17494019865989685, 'learning_rate': 2.595006107710406e-05, 'ppl': 15.73834, 'memory/max_active (GiB)': 46.13, 'memory/max_allocated (GiB)': 46.13, 'memory/device_reserved (GiB)': 62.92, 'tokens/train_per_sec_per_gpu': 0.0, 'tokens/total': 0, 'tokens/trainable': 0, 'epoch': 0.94}
 47%|██████████████████████████████████████████████████████████████████████▍                                                                               | 54/115 [09:16<09:23,  9.24s/it] 48%|███████████████████████████████████████████████████████████████████████▋                                                                              | 55/115 [09:25<09:08,  9.14s/it]                                                                                                                                                                                            {'loss': 2.6787, 'grad_norm': 0.16695146262645721, 'learning_rate': 2.5370637349487537e-05, 'ppl': 14.56614, 'memory/max_active (GiB)': 46.14, 'memory/max_allocated (GiB)': 46.14, 'memory/device_reserved (GiB)': 62.92, 'tokens/train_per_sec_per_gpu': 0.0, 'tokens/total': 0, 'tokens/trainable': 0, 'epoch': 0.96}
 48%|███████████████████████████████████████████████████████████████████████▋                                                                              | 55/115 [09:25<09:08,  9.14s/it] 49%|█████████████████████████████████████████████████████████████████████████                                                                             | 56/115 [09:34<08:54,  9.06s/it]                                                                                                                                                                                            {'loss': 2.773, 'grad_norm': 0.15602070093154907, 'learning_rate': 2.4786313285751158e-05, 'ppl': 16.00658, 'memory/max_active (GiB)': 46.13, 'memory/max_allocated (GiB)': 46.13, 'memory/device_reserved (GiB)': 62.92, 'tokens/train_per_sec_per_gpu': 0.0, 'tokens/total': 0, 'tokens/trainable': 0, 'epoch': 0.97}
 49%|█████████████████████████████████████████████████████████████████████████                                                                             | 56/115 [09:34<08:54,  9.06s/it] 50%|██████████████████████████████████████████████████████████████████████████▎                                                                           | 57/115 [09:43<08:41,  8.99s/it]                                                                                                                                                                                            {'loss': 3.0458, 'grad_norm': 0.17278160154819489, 'learning_rate': 2.419762204129695e-05, 'ppl': 21.02685, 'memory/max_active (GiB)': 46.14, 'memory/max_allocated (GiB)': 46.14, 'memory/device_reserved (GiB)': 62.92, 'tokens/train_per_sec_per_gpu': 0.0, 'tokens/total': 0, 'tokens/trainable': 0, 'epoch': 0.99}
 50%|██████████████████████████████████████████████████████████████████████████▎                                                                           | 57/115 [09:43<08:41,  8.99s/it] 50%|███████████████████████████████████████████████████████████████████████████▋                                                                          | 58/115 [09:47<07:16,  7.66s/it]                                                                                                                                                                                            {'loss': 1.4202, 'grad_norm': 0.12146215885877609, 'learning_rate': 2.360510075627812e-05, 'ppl': 4.13795, 'memory/max_active (GiB)': 46.14, 'memory/max_allocated (GiB)': 46.14, 'memory/device_reserved (GiB)': 62.92, 'tokens/train_per_sec_per_gpu': 0.0, 'tokens/total': 0, 'tokens/trainable': 0, 'epoch': 1.0}
 50%|███████████████████████████████████████████████████████████████████████████▋                                                                          | 58/115 [09:47<07:16,  7.66s/it] 51%|████████████████████████████████████████████████████████████████████████████▉                                                                         | 59/115 [09:57<07:41,  8.24s/it]                                                                                                                                                                                            {'loss': 2.9106, 'grad_norm': 0.19605961441993713, 'learning_rate': 2.3009290065495663e-05, 'ppl': 18.36782, 'memory/max_active (GiB)': 46.14, 'memory/max_allocated (GiB)': 46.14, 'memory/device_reserved (GiB)': 62.92, 'tokens/train_per_sec_per_gpu': 0.0, 'tokens/total': 0, 'tokens/trainable': 0, 'epoch': 1.02}
 51%|████████████████████████████████████████████████████████████████████████████▉                                                                         | 59/115 [09:57<07:41,  8.24s/it] 52%|██████████████████████████████████████████████████████████████████████████████▎                                                                       | 60/115 [10:06<07:45,  8.46s/it]                                                                                                                                                                                            {'loss': 3.203, 'grad_norm': 0.16643154621124268, 'learning_rate': 2.2410733605106462e-05, 'ppl': 24.60624, 'memory/max_active (GiB)': 46.13, 'memory/max_allocated (GiB)': 46.13, 'memory/device_reserved (GiB)': 62.92, 'tokens/train_per_sec_per_gpu': 0.0, 'tokens/total': 0, 'tokens/trainable': 0, 'epoch': 1.03}
 52%|██████████████████████████████████████████████████████████████████████████████▎                                                                       | 60/115 [10:06<07:45,  8.46s/it][2026-04-23 23:04:52,299] [INFO] [axolotl.core.trainers.base.evaluate:400] [PID:2655295] Running evaluation step...

  0%|                                                                                                                                                                | 0/26 [00:00<?, ?it/s][A
  8%|███████████▋                                                                                                                                            | 2/26 [00:00<00:07,  3.03it/s][A
 12%|█████████████████▌                                                                                                                                      | 3/26 [00:01<00:10,  2.17it/s][A
 15%|███████████████████████▍                                                                                                                                | 4/26 [00:01<00:11,  1.90it/s][A
 19%|█████████████████████████████▏                                                                                                                          | 5/26 [00:02<00:13,  1.59it/s][A
 23%|███████████████████████████████████                                                                                                                     | 6/26 [00:03<00:12,  1.59it/s][A
 27%|████████████████████████████████████████▉                                                                                                               | 7/26 [00:04<00:12,  1.56it/s][A
 31%|██████████████████████████████████████████████▊                                                                                                         | 8/26 [00:04<00:11,  1.54it/s][A
 35%|████████████████████████████████████████████████████▌                                                                                                   | 9/26 [00:05<00:11,  1.47it/s][A
 38%|██████████████████████████████████████████████████████████                                                                                             | 10/26 [00:06<00:10,  1.51it/s][A
 42%|███████████████████████████████████████████████████████████████▉                                                                                       | 11/26 [00:06<00:09,  1.52it/s][A
 46%|█████████████████████████████████████████████████████████████████████▋                                                                                 | 12/26 [00:07<00:09,  1.52it/s][A
 50%|███████████████████████████████████████████████████████████████████████████▌                                                                           | 13/26 [00:08<00:08,  1.46it/s][A
 54%|█████████████████████████████████████████████████████████████████████████████████▎                                                                     | 14/26 [00:08<00:08,  1.49it/s][A
 58%|███████████████████████████████████████████████████████████████████████████████████████                                                                | 15/26 [00:09<00:07,  1.51it/s][A
 62%|████████████████████████████████████████████████████████████████████████████████████████████▉                                                          | 16/26 [00:10<00:06,  1.51it/s][A
 65%|██████████████████████████████████████████████████████████████████████████████████████████████████▋                                                    | 17/26 [00:10<00:06,  1.46it/s][A
 69%|████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                              | 18/26 [00:11<00:05,  1.48it/s][A
 73%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                        | 19/26 [00:12<00:04,  1.49it/s][A
 77%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                  | 20/26 [00:12<00:04,  1.50it/s][A
 81%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                             | 21/26 [00:13<00:03,  1.45it/s][A
 85%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                       | 22/26 [00:14<00:02,  1.47it/s][A
 88%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                 | 23/26 [00:14<00:02,  1.49it/s][A
 92%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍           | 24/26 [00:15<00:01,  1.48it/s][A
 96%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏     | 25/26 [00:16<00:00,  1.44it/s][A
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 26/26 [00:16<00:00,  1.45it/s][A                                                                                                                                                                                            
                                                                                                                                                                                            [A{'eval_loss': 0.7495754361152649, 'eval_runtime': 17.8503, 'eval_samples_per_second': 22.913, 'eval_steps_per_second': 1.457, 'eval_ppl': 2.1161, 'memory/max_active (GiB)': 42.7, 'memory/max_allocated (GiB)': 42.7, 'memory/device_reserved (GiB)': 62.92, 'epoch': 1.03, 'tokens/train_per_sec_per_gpu': 0.0}
 52%|██████████████████████████████████████████████████████████████████████████████▎                                                                       | 60/115 [10:24<07:45,  8.46s/it]
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 26/26 [00:17<00:00,  1.45it/s][A
                                                                                                                                                                                            [A[2026-04-23 23:05:10,167] [INFO] [axolotl.core.trainers.base._save:721] [PID:2655295] Saving model checkpoint to /home/tkwang/links/scratch/SecSteer-v2/axolotl-outputs/lora/Qwen2.5-Coder-7B-stage2-secure-token-diff-ctx0/checkpoint-60
[2026-04-23 23:05:10,729] [WARNING] [py.warnings._showwarnmsg:112] [PID:2655295] /scratch/tkwang/SecSteer-v2/.venv/lib/python3.12/site-packages/torch/distributed/c10d_logger.py:83: UserWarning: barrier(): using the device under current context. You can specify `device_id` in `init_process_group` to mute this warning.
  return func(*args, **kwargs)

 53%|███████████████████████████████████████████████████████████████████████████████▌                                                                      | 61/115 [10:34<12:50, 14.26s/it]                                                                                                                                                                                            {'loss': 2.8548, 'grad_norm': 0.1900254637002945, 'learning_rate': 2.180997751659276e-05, 'ppl': 17.37096, 'memory/max_active (GiB)': 46.14, 'memory/max_allocated (GiB)': 46.14, 'memory/device_reserved (GiB)': 62.92, 'tokens/train_per_sec_per_gpu': 0.0, 'tokens/total': 0, 'tokens/trainable': 0, 'epoch': 1.05}
 53%|███████████████████████████████████████████████████████████████████████████████▌                                                                      | 61/115 [10:34<12:50, 14.26s/it] 54%|████████████████████████████████████████████████████████████████████████████████▊                                                                     | 62/115 [10:42<11:11, 12.66s/it]                                                                                                                                                                                            {'loss': 2.6655, 'grad_norm': 0.200274258852005, 'learning_rate': 2.1207569948445724e-05, 'ppl': 14.37514, 'memory/max_active (GiB)': 46.13, 'memory/max_allocated (GiB)': 46.13, 'memory/device_reserved (GiB)': 62.92, 'tokens/train_per_sec_per_gpu': 0.0, 'tokens/total': 0, 'tokens/trainable': 0, 'epoch': 1.07}
 54%|████████████████████████████████████████████████████████████████████████████████▊                                                                     | 62/115 [10:42<11:11, 12.66s/it] 55%|██████████████████████████████████████████████████████████████████████████████████▏                                                                   | 63/115 [10:51<10:00, 11.55s/it]                                                                                                                                                                                            {'loss': 2.8063, 'grad_norm': 0.17702236771583557, 'learning_rate': 2.060406055601778e-05, 'ppl': 16.54858, 'memory/max_active (GiB)': 46.13, 'memory/max_allocated (GiB)': 46.13, 'memory/device_reserved (GiB)': 62.92, 'tokens/train_per_sec_per_gpu': 0.0, 'tokens/total': 0, 'tokens/trainable': 0, 'epoch': 1.09}
 55%|██████████████████████████████████████████████████████████████████████████████████▏                                                                   | 63/115 [10:51<10:00, 11.55s/it] 56%|███████████████████████████████████████████████████████████████████████████████████▍                                                                  | 64/115 [11:00<09:08, 10.75s/it]                                                                                                                                                                                            {'loss': 2.9217, 'grad_norm': 0.19703693687915802, 'learning_rate': 2e-05, 'ppl': 18.57283, 'memory/max_active (GiB)': 46.14, 'memory/max_allocated (GiB)': 46.14, 'memory/device_reserved (GiB)': 62.92, 'tokens/train_per_sec_per_gpu': 0.0, 'tokens/total': 0, 'tokens/trainable': 0, 'epoch': 1.1}
 56%|███████████████████████████████████████████████████████████████████████████████████▍                                                                  | 64/115 [11:00<09:08, 10.75s/it] 57%|████████████████████████████████████████████████████████████████████████████████████▊                                                                 | 65/115 [11:09<08:30, 10.20s/it]                                                                                                                                                                                            {'loss': 3.1359, 'grad_norm': 0.17079755663871765, 'learning_rate': 1.9395939443982228e-05, 'ppl': 23.00933, 'memory/max_active (GiB)': 46.13, 'memory/max_allocated (GiB)': 46.13, 'memory/device_reserved (GiB)': 62.92, 'tokens/train_per_sec_per_gpu': 0.0, 'tokens/total': 0, 'tokens/trainable': 0, 'epoch': 1.12}
 57%|████████████████████████████████████████████████████████████████████████████████████▊                                                                 | 65/115 [11:09<08:30, 10.20s/it] 57%|██████████████████████████████████████████████████████████████████████████████████████                                                                | 66/115 [11:18<08:00,  9.81s/it]                                                                                                                                                                                            {'loss': 3.0341, 'grad_norm': 0.18701142072677612, 'learning_rate': 1.879243005155428e-05, 'ppl': 20.78227, 'memory/max_active (GiB)': 46.13, 'memory/max_allocated (GiB)': 46.13, 'memory/device_reserved (GiB)': 62.92, 'tokens/train_per_sec_per_gpu': 0.0, 'tokens/total': 0, 'tokens/trainable': 0, 'epoch': 1.14}
 57%|██████████████████████████████████████████████████████████████████████████████████████                                                                | 66/115 [11:18<08:00,  9.81s/it] 58%|███████████████████████████████████████████████████████████████████████████████████████▍                                                              | 67/115 [11:27<07:39,  9.56s/it]                                                                                                                                                                                            {'loss': 3.0971, 'grad_norm': 0.17559665441513062, 'learning_rate': 1.8190022483407246e-05, 'ppl': 22.13367, 'memory/max_active (GiB)': 46.14, 'memory/max_allocated (GiB)': 46.14, 'memory/device_reserved (GiB)': 62.92, 'tokens/train_per_sec_per_gpu': 0.0, 'tokens/total': 0, 'tokens/trainable': 0, 'epoch': 1.16}
 58%|███████████████████████████████████████████████████████████████████████████████████████▍                                                              | 67/115 [11:27<07:39,  9.56s/it] 59%|████████████████████████████████████████████████████████████████████████████████████████▋                                                             | 68/115 [11:36<07:20,  9.36s/it]                                                                                                                                                                                            {'loss': 2.9618, 'grad_norm': 0.1853734403848648, 'learning_rate': 1.758926639489354e-05, 'ppl': 19.33274, 'memory/max_active (GiB)': 46.13, 'memory/max_allocated (GiB)': 46.13, 'memory/device_reserved (GiB)': 62.92, 'tokens/train_per_sec_per_gpu': 0.0, 'tokens/total': 0, 'tokens/trainable': 0, 'epoch': 1.17}
 59%|████████████████████████████████████████████████████████████████████████████████████████▋                                                             | 68/115 [11:36<07:20,  9.36s/it] 60%|██████████████████████████████████████████████████████████████████████████████████████████                                                            | 69/115 [11:45<07:04,  9.22s/it]                                                                                                                                                                                            {'loss': 2.958, 'grad_norm': 0.20163635909557343, 'learning_rate': 1.699070993450434e-05, 'ppl': 19.25941, 'memory/max_active (GiB)': 46.13, 'memory/max_allocated (GiB)': 46.13, 'memory/device_reserved (GiB)': 62.92, 'tokens/train_per_sec_per_gpu': 0.0, 'tokens/total': 0, 'tokens/trainable': 0, 'epoch': 1.19}
 60%|██████████████████████████████████████████████████████████████████████████████████████████                                                            | 69/115 [11:45<07:04,  9.22s/it] 61%|███████████████████████████████████████████████████████████████████████████████████████████▎                                                          | 70/115 [11:54<06:52,  9.17s/it]                                                                                                                                                                                            {'loss': 2.5718, 'grad_norm': 0.15739287436008453, 'learning_rate': 1.6394899243721887e-05, 'ppl': 13.08936, 'memory/max_active (GiB)': 46.14, 'memory/max_allocated (GiB)': 46.14, 'memory/device_reserved (GiB)': 62.92, 'tokens/train_per_sec_per_gpu': 0.0, 'tokens/total': 0, 'tokens/trainable': 0, 'epoch': 1.21}
 61%|███████████████████████████████████████████████████████████████████████████████████████████▎                                                          | 70/115 [11:54<06:52,  9.17s/it] 62%|████████████████████████████████████████████████████████████████████████████████████████████▌                                                         | 71/115 [12:03<06:39,  9.08s/it]                                                                                                                                                                                            {'loss': 3.2599, 'grad_norm': 0.20242543518543243, 'learning_rate': 1.5802377958703054e-05, 'ppl': 26.04693, 'memory/max_active (GiB)': 46.14, 'memory/max_allocated (GiB)': 46.14, 'memory/device_reserved (GiB)': 62.92, 'tokens/train_per_sec_per_gpu': 0.0, 'tokens/total': 0, 'tokens/trainable': 0, 'epoch': 1.23}
 62%|████████████████████████████████████████████████████████████████████████████████████████████▌                                                         | 71/115 [12:03<06:39,  9.08s/it] 63%|█████████████████████████████████████████████████████████████████████████████████████████████▉                                                        | 72/115 [12:12<06:29,  9.05s/it]                                                                                                                                                                                            {'loss': 2.8747, 'grad_norm': 0.18309704959392548, 'learning_rate': 1.5213686714248852e-05, 'ppl': 17.72011, 'memory/max_active (GiB)': 46.13, 'memory/max_allocated (GiB)': 46.13, 'memory/device_reserved (GiB)': 62.92, 'tokens/train_per_sec_per_gpu': 0.0, 'tokens/total': 0, 'tokens/trainable': 0, 'epoch': 1.24}
 63%|█████████████████████████████████████████████████████████████████████████████████████████████▉                                                        | 72/115 [12:12<06:29,  9.05s/it] 63%|███████████████████████████████████████████████████████████████████████████████████████████████▏                                                      | 73/115 [12:21<06:18,  9.01s/it]                                                                                                                                                                                            {'loss': 3.2512, 'grad_norm': 0.174557626247406, 'learning_rate': 1.4629362650512464e-05, 'ppl': 25.82131, 'memory/max_active (GiB)': 46.14, 'memory/max_allocated (GiB)': 46.14, 'memory/device_reserved (GiB)': 62.92, 'tokens/train_per_sec_per_gpu': 0.0, 'tokens/total': 0, 'tokens/trainable': 0, 'epoch': 1.26}
 63%|███████████████████████████████████████████████████████████████████████████████████████████████▏                                                      | 73/115 [12:21<06:18,  9.01s/it] 64%|████████████████████████████████████████████████████████████████████████████████████████████████▌                                                     | 74/115 [12:30<06:07,  8.98s/it]                                                                                                                                                                                            {'loss': 3.0235, 'grad_norm': 0.20229797065258026, 'learning_rate': 1.4049938922895945e-05, 'ppl': 20.56314, 'memory/max_active (GiB)': 46.14, 'memory/max_allocated (GiB)': 46.14, 'memory/device_reserved (GiB)': 62.92, 'tokens/train_per_sec_per_gpu': 0.0, 'tokens/total': 0, 'tokens/trainable': 0, 'epoch': 1.28}
 64%|████████████████████████████████████████████████████████████████████████████████████████████████▌                                                     | 74/115 [12:30<06:07,  8.98s/it] 65%|█████████████████████████████████████████████████████████████████████████████████████████████████▊                                                    | 75/115 [12:39<05:58,  8.97s/it]                                                                                                                                                                                            {'loss': 2.9977, 'grad_norm': 0.3296393156051636, 'learning_rate': 1.3475944215582619e-05, 'ppl': 20.03939, 'memory/max_active (GiB)': 46.13, 'memory/max_allocated (GiB)': 46.13, 'memory/device_reserved (GiB)': 62.92, 'tokens/train_per_sec_per_gpu': 0.0, 'tokens/total': 0, 'tokens/trainable': 0, 'epoch': 1.3}
 65%|█████████████████████████████████████████████████████████████████████████████████████████████████▊                                                    | 75/115 [12:39<05:58,  8.97s/it][2026-04-23 23:07:25,120] [INFO] [axolotl.core.trainers.base.evaluate:400] [PID:2655295] Running evaluation step...

  0%|                                                                                                                                                                | 0/26 [00:00<?, ?it/s][A
  8%|███████████▋                                                                                                                                            | 2/26 [00:00<00:08,  2.95it/s][A
 12%|█████████████████▌                                                                                                                                      | 3/26 [00:01<00:10,  2.15it/s][A
 15%|███████████████████████▍                                                                                                                                | 4/26 [00:01<00:11,  1.88it/s][A
 19%|█████████████████████████████▏                                                                                                                          | 5/26 [00:02<00:13,  1.59it/s][A
 23%|███████████████████████████████████                                                                                                                     | 6/26 [00:03<00:12,  1.59it/s][A
 27%|████████████████████████████████████████▉                                                                                                               | 7/26 [00:04<00:12,  1.56it/s][A
 31%|██████████████████████████████████████████████▊                                                                                                         | 8/26 [00:04<00:11,  1.54it/s][A
 35%|████████████████████████████████████████████████████▌                                                                                                   | 9/26 [00:05<00:11,  1.47it/s][A
 38%|██████████████████████████████████████████████████████████                                                                                             | 10/26 [00:06<00:10,  1.51it/s][A
 42%|███████████████████████████████████████████████████████████████▉                                                                                       | 11/26 [00:06<00:09,  1.52it/s][A
 46%|█████████████████████████████████████████████████████████████████████▋                                                                                 | 12/26 [00:07<00:09,  1.52it/s][A
 50%|███████████████████████████████████████████████████████████████████████████▌                                                                           | 13/26 [00:08<00:08,  1.46it/s][A
 54%|█████████████████████████████████████████████████████████████████████████████████▎                                                                     | 14/26 [00:08<00:08,  1.49it/s][A
 58%|███████████████████████████████████████████████████████████████████████████████████████                                                                | 15/26 [00:09<00:07,  1.50it/s][A
 62%|████████████████████████████████████████████████████████████████████████████████████████████▉                                                          | 16/26 [00:10<00:06,  1.51it/s][A
 65%|██████████████████████████████████████████████████████████████████████████████████████████████████▋                                                    | 17/26 [00:10<00:06,  1.45it/s][A
 69%|████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                              | 18/26 [00:11<00:05,  1.48it/s][A
 73%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                        | 19/26 [00:12<00:04,  1.50it/s][A
 77%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                  | 20/26 [00:12<00:03,  1.50it/s][A
 81%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                             | 21/26 [00:13<00:03,  1.45it/s][A
 85%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                       | 22/26 [00:14<00:02,  1.48it/s][A
 88%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                 | 23/26 [00:14<00:02,  1.49it/s][A
 92%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍           | 24/26 [00:15<00:01,  1.49it/s][A
 96%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏     | 25/26 [00:16<00:00,  1.45it/s][A
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 26/26 [00:16<00:00,  1.45it/s][A                                                                                                                                                                                            
                                                                                                                                                                                            [A{'eval_loss': 0.7471908926963806, 'eval_runtime': 17.8448, 'eval_samples_per_second': 22.92, 'eval_steps_per_second': 1.457, 'eval_ppl': 2.11106, 'memory/max_active (GiB)': 42.7, 'memory/max_allocated (GiB)': 42.7, 'memory/device_reserved (GiB)': 62.92, 'epoch': 1.3, 'tokens/train_per_sec_per_gpu': 0.0}
 65%|█████████████████████████████████████████████████████████████████████████████████████████████████▊                                                    | 75/115 [12:56<05:58,  8.97s/it]
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 26/26 [00:17<00:00,  1.45it/s][A
                                                                                                                                                                                            [A[2026-04-23 23:07:42,980] [INFO] [axolotl.core.trainers.base._save:721] [PID:2655295] Saving model checkpoint to /home/tkwang/links/scratch/SecSteer-v2/axolotl-outputs/lora/Qwen2.5-Coder-7B-stage2-secure-token-diff-ctx0/checkpoint-75
[2026-04-23 23:07:43,476] [WARNING] [py.warnings._showwarnmsg:112] [PID:2655295] /scratch/tkwang/SecSteer-v2/.venv/lib/python3.12/site-packages/torch/distributed/c10d_logger.py:83: UserWarning: barrier(): using the device under current context. You can specify `device_id` in `init_process_group` to mute this warning.
  return func(*args, **kwargs)

 66%|███████████████████████████████████████████████████████████████████████████████████████████████████▏                                                  | 76/115 [13:06<09:29, 14.61s/it]                                                                                                                                                                                            {'loss': 2.6911, 'grad_norm': 0.16873489320278168, 'learning_rate': 1.2907902259149287e-05, 'ppl': 14.74789, 'memory/max_active (GiB)': 46.14, 'memory/max_allocated (GiB)': 46.14, 'memory/device_reserved (GiB)': 62.92, 'tokens/train_per_sec_per_gpu': 0.0, 'tokens/total': 0, 'tokens/trainable': 0, 'epoch': 1.31}
 66%|███████████████████████████████████████████████████████████████████████████████████████████████████▏                                                  | 76/115 [13:06<09:29, 14.61s/it] 67%|████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                 | 77/115 [13:15<08:09, 12.88s/it]                                                                                                                                                                                            {'loss': 2.4224, 'grad_norm': 0.199187770485878, 'learning_rate': 1.2346331352698206e-05, 'ppl': 11.27288, 'memory/max_active (GiB)': 46.14, 'memory/max_allocated (GiB)': 46.14, 'memory/device_reserved (GiB)': 62.92, 'tokens/train_per_sec_per_gpu': 0.0, 'tokens/total': 0, 'tokens/trainable': 0, 'epoch': 1.33}
 67%|████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                 | 77/115 [13:15<08:09, 12.88s/it] 68%|█████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                | 78/115 [13:24<07:12, 11.70s/it]                                                                                                                                                                                            {'loss': 2.9109, 'grad_norm': 0.15992921590805054, 'learning_rate': 1.1791743890944869e-05, 'ppl': 18.37333, 'memory/max_active (GiB)': 46.13, 'memory/max_allocated (GiB)': 46.13, 'memory/device_reserved (GiB)': 62.92, 'tokens/train_per_sec_per_gpu': 0.0, 'tokens/total': 0, 'tokens/trainable': 0, 'epoch': 1.35}
 68%|█████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                | 78/115 [13:24<07:12, 11.70s/it] 69%|███████████████████████████████████████████████████████████████████████████████████████████████████████                                               | 79/115 [13:33<06:31, 10.88s/it]                                                                                                                                                                                            {'loss': 2.799, 'grad_norm': 0.20177482068538666, 'learning_rate': 1.124464589669319e-05, 'ppl': 16.42821, 'memory/max_active (GiB)': 46.14, 'memory/max_allocated (GiB)': 46.14, 'memory/device_reserved (GiB)': 62.92, 'tokens/train_per_sec_per_gpu': 0.0, 'tokens/total': 0, 'tokens/trainable': 0, 'epoch': 1.37}
 69%|███████████████████████████████████████████████████████████████████████████████████████████████████████                                               | 79/115 [13:33<06:31, 10.88s/it] 70%|████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                             | 80/115 [13:42<05:59, 10.28s/it]                                                                                                                                                                                            {'loss': 2.9347, 'grad_norm': 0.20969270169734955, 'learning_rate': 1.070553655912463e-05, 'ppl': 18.81586, 'memory/max_active (GiB)': 46.13, 'memory/max_allocated (GiB)': 46.13, 'memory/device_reserved (GiB)': 62.92, 'tokens/train_per_sec_per_gpu': 0.0, 'tokens/total': 0, 'tokens/trainable': 0, 'epoch': 1.38}
 70%|████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                             | 80/115 [13:42<05:59, 10.28s/it] 70%|█████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                            | 81/115 [13:51<05:34,  9.84s/it]                                                                                                                                                                                            {'loss': 3.2556, 'grad_norm': 0.1974029541015625, 'learning_rate': 1.0174907778322458e-05, 'ppl': 25.93517, 'memory/max_active (GiB)': 46.13, 'memory/max_allocated (GiB)': 46.13, 'memory/device_reserved (GiB)': 62.92, 'tokens/train_per_sec_per_gpu': 0.0, 'tokens/total': 0, 'tokens/trainable': 0, 'epoch': 1.4}
 70%|█████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                            | 81/115 [13:51<05:34,  9.84s/it] 71%|██████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                           | 82/115 [14:00<05:14,  9.54s/it]                                                                                                                                                                                            {'loss': 2.6758, 'grad_norm': 0.19425898790359497, 'learning_rate': 9.653243716446862e-06, 'ppl': 14.52396, 'memory/max_active (GiB)': 46.14, 'memory/max_allocated (GiB)': 46.14, 'memory/device_reserved (GiB)': 62.92, 'tokens/train_per_sec_per_gpu': 0.0, 'tokens/total': 0, 'tokens/trainable': 0, 'epoch': 1.42}
 71%|██████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                           | 82/115 [14:00<05:14,  9.54s/it] 72%|████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                         | 83/115 [14:09<04:59,  9.36s/it]                                                                                                                                                                                            {'loss': 3.0952, 'grad_norm': 0.19029958546161652, 'learning_rate': 9.141020355970427e-06, 'ppl': 22.09166, 'memory/max_active (GiB)': 46.13, 'memory/max_allocated (GiB)': 46.13, 'memory/device_reserved (GiB)': 62.92, 'tokens/train_per_sec_per_gpu': 0.0, 'tokens/total': 0, 'tokens/trainable': 0, 'epoch': 1.43}
 72%|████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                         | 83/115 [14:09<04:59,  9.36s/it] 73%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                        | 84/115 [14:17<04:45,  9.20s/it]                                                                                                                                                                                            {'loss': 3.1738, 'grad_norm': 0.2984507977962494, 'learning_rate': 8.638705065376887e-06, 'ppl': 23.89812, 'memory/max_active (GiB)': 46.14, 'memory/max_allocated (GiB)': 46.14, 'memory/device_reserved (GiB)': 62.92, 'tokens/train_per_sec_per_gpu': 0.0, 'tokens/total': 0, 'tokens/trainable': 0, 'epoch': 1.45}
 73%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                        | 84/115 [14:17<04:45,  9.20s/it] 74%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                       | 85/115 [14:26<04:33,  9.13s/it]                                                                                                                                                                                            {'loss': 2.8065, 'grad_norm': 0.17096593976020813, 'learning_rate': 8.146756172719668e-06, 'ppl': 16.55189, 'memory/max_active (GiB)': 46.14, 'memory/max_allocated (GiB)': 46.14, 'memory/device_reserved (GiB)': 62.92, 'tokens/train_per_sec_per_gpu': 0.0, 'tokens/total': 0, 'tokens/trainable': 0, 'epoch': 1.47}
 74%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                       | 85/115 [14:26<04:33,  9.13s/it] 75%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                     | 86/115 [14:35<04:22,  9.05s/it]                                                                                                                                                                                            {'loss': 2.8746, 'grad_norm': 0.19590231776237488, 'learning_rate': 7.665622547429139e-06, 'ppl': 17.71834, 'memory/max_active (GiB)': 46.14, 'memory/max_allocated (GiB)': 46.14, 'memory/device_reserved (GiB)': 62.92, 'tokens/train_per_sec_per_gpu': 0.0, 'tokens/total': 0, 'tokens/trainable': 0, 'epoch': 1.49}
 75%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                     | 86/115 [14:35<04:22,  9.05s/it] 76%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                    | 87/115 [14:44<04:13,  9.04s/it]                                                                                                                                                                                            {'loss': 2.9422, 'grad_norm': 0.18060949444770813, 'learning_rate': 7.195743190750241e-06, 'ppl': 18.95751, 'memory/max_active (GiB)': 46.13, 'memory/max_allocated (GiB)': 46.13, 'memory/device_reserved (GiB)': 62.92, 'tokens/train_per_sec_per_gpu': 0.0, 'tokens/total': 0, 'tokens/trainable': 0, 'epoch': 1.5}
 76%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                    | 87/115 [14:44<04:13,  9.04s/it] 77%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                   | 88/115 [14:53<04:03,  9.01s/it]                                                                                                                                                                                            {'loss': 3.0125, 'grad_norm': 0.3589091897010803, 'learning_rate': 6.737546835184101e-06, 'ppl': 20.33818, 'memory/max_active (GiB)': 46.14, 'memory/max_allocated (GiB)': 46.14, 'memory/device_reserved (GiB)': 62.92, 'tokens/train_per_sec_per_gpu': 0.0, 'tokens/total': 0, 'tokens/trainable': 0, 'epoch': 1.52}
 77%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                   | 88/115 [14:53<04:03,  9.01s/it] 77%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                  | 89/115 [15:02<03:53,  8.99s/it]                                                                                                                                                                                            {'loss': 3.0537, 'grad_norm': 0.1863524615764618, 'learning_rate': 6.291451553299204e-06, 'ppl': 21.19362, 'memory/max_active (GiB)': 46.13, 'memory/max_allocated (GiB)': 46.13, 'memory/device_reserved (GiB)': 62.92, 'tokens/train_per_sec_per_gpu': 0.0, 'tokens/total': 0, 'tokens/trainable': 0, 'epoch': 1.54}
 77%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                  | 89/115 [15:02<03:53,  8.99s/it] 78%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                | 90/115 [15:11<03:43,  8.96s/it]                                                                                                                                                                                            {'loss': 2.9272, 'grad_norm': 0.18664468824863434, 'learning_rate': 5.857864376269051e-06, 'ppl': 18.67527, 'memory/max_active (GiB)': 46.14, 'memory/max_allocated (GiB)': 46.14, 'memory/device_reserved (GiB)': 62.92, 'tokens/train_per_sec_per_gpu': 0.0, 'tokens/total': 0, 'tokens/trainable': 0, 'epoch': 1.56}
 78%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                | 90/115 [15:11<03:43,  8.96s/it][2026-04-23 23:09:57,570] [INFO] [axolotl.core.trainers.base.evaluate:400] [PID:2655295] Running evaluation step...

  0%|                                                                                                                                                                | 0/26 [00:00<?, ?it/s][A
  8%|███████████▋                                                                                                                                            | 2/26 [00:00<00:07,  3.01it/s][A
 12%|█████████████████▌                                                                                                                                      | 3/26 [00:01<00:10,  2.16it/s][A
 15%|███████████████████████▍                                                                                                                                | 4/26 [00:01<00:11,  1.89it/s][A
 19%|█████████████████████████████▏                                                                                                                          | 5/26 [00:02<00:13,  1.59it/s][A
 23%|███████████████████████████████████                                                                                                                     | 6/26 [00:03<00:12,  1.59it/s][A
 27%|████████████████████████████████████████▉                                                                                                               | 7/26 [00:04<00:12,  1.56it/s][A
 31%|██████████████████████████████████████████████▊                                                                                                         | 8/26 [00:04<00:11,  1.53it/s][A
 35%|████████████████████████████████████████████████████▌                                                                                                   | 9/26 [00:05<00:11,  1.48it/s][A
 38%|██████████████████████████████████████████████████████████                                                                                             | 10/26 [00:06<00:10,  1.51it/s][A
 42%|███████████████████████████████████████████████████████████████▉                                                                                       | 11/26 [00:06<00:09,  1.52it/s][A
 46%|█████████████████████████████████████████████████████████████████████▋                                                                                 | 12/26 [00:07<00:09,  1.53it/s][A
 50%|███████████████████████████████████████████████████████████████████████████▌                                                                           | 13/26 [00:08<00:08,  1.47it/s][A
 54%|█████████████████████████████████████████████████████████████████████████████████▎                                                                     | 14/26 [00:08<00:08,  1.50it/s][A
 58%|███████████████████████████████████████████████████████████████████████████████████████                                                                | 15/26 [00:09<00:07,  1.51it/s][A
 62%|████████████████████████████████████████████████████████████████████████████████████████████▉                                                          | 16/26 [00:10<00:06,  1.51it/s][A
 65%|██████████████████████████████████████████████████████████████████████████████████████████████████▋                                                    | 17/26 [00:10<00:06,  1.45it/s][A
 69%|████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                              | 18/26 [00:11<00:05,  1.49it/s][A
 73%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                        | 19/26 [00:12<00:04,  1.50it/s][A
 77%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                  | 20/26 [00:12<00:03,  1.50it/s][A
 81%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                             | 21/26 [00:13<00:03,  1.46it/s][A
 85%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                       | 22/26 [00:14<00:02,  1.48it/s][A
 88%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                 | 23/26 [00:14<00:02,  1.49it/s][A
 92%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍           | 24/26 [00:15<00:01,  1.49it/s][A
 96%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏     | 25/26 [00:16<00:00,  1.44it/s][A
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 26/26 [00:16<00:00,  1.44it/s][A                                                                                                                                                                                            
                                                                                                                                                                                            [A{'eval_loss': 0.7461345791816711, 'eval_runtime': 17.8588, 'eval_samples_per_second': 22.902, 'eval_steps_per_second': 1.456, 'eval_ppl': 2.10883, 'memory/max_active (GiB)': 42.7, 'memory/max_allocated (GiB)': 42.7, 'memory/device_reserved (GiB)': 62.92, 'epoch': 1.56, 'tokens/train_per_sec_per_gpu': 0.0}
 78%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                | 90/115 [15:29<03:43,  8.96s/it]
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 26/26 [00:17<00:00,  1.44it/s][A
                                                                                                                                                                                            [A[2026-04-23 23:10:15,443] [INFO] [axolotl.core.trainers.base._save:721] [PID:2655295] Saving model checkpoint to /home/tkwang/links/scratch/SecSteer-v2/axolotl-outputs/lora/Qwen2.5-Coder-7B-stage2-secure-token-diff-ctx0/checkpoint-90
[2026-04-23 23:10:15,931] [WARNING] [py.warnings._showwarnmsg:112] [PID:2655295] /scratch/tkwang/SecSteer-v2/.venv/lib/python3.12/site-packages/torch/distributed/c10d_logger.py:83: UserWarning: barrier(): using the device under current context. You can specify `device_id` in `init_process_group` to mute this warning.
  return func(*args, **kwargs)

 79%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                               | 91/115 [15:39<05:49, 14.55s/it]                                                                                                                                                                                            {'loss': 2.8287, 'grad_norm': 0.17649923264980316, 'learning_rate': 5.4371809224842354e-06, 'ppl': 16.92345, 'memory/max_active (GiB)': 46.13, 'memory/max_allocated (GiB)': 46.13, 'memory/device_reserved (GiB)': 62.92, 'tokens/train_per_sec_per_gpu': 0.0, 'tokens/total': 0, 'tokens/trainable': 0, 'epoch': 1.57}
 79%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                               | 91/115 [15:39<05:49, 14.55s/it] 80%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                              | 92/115 [15:47<04:54, 12.82s/it]                                                                                                                                                                                            {'loss': 2.9984, 'grad_norm': 0.21398091316223145, 'learning_rate': 5.029785036577976e-06, 'ppl': 20.05343, 'memory/max_active (GiB)': 46.13, 'memory/max_allocated (GiB)': 46.13, 'memory/device_reserved (GiB)': 62.92, 'tokens/train_per_sec_per_gpu': 0.0, 'tokens/total': 0, 'tokens/trainable': 0, 'epoch': 1.59}
 80%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                              | 92/115 [15:47<04:54, 12.82s/it] 81%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                            | 93/115 [15:56<04:15, 11.63s/it]                                                                                                                                                                                            {'loss': 3.1782, 'grad_norm': 0.20467324554920197, 'learning_rate': 4.636048439194392e-06, 'ppl': 24.00351, 'memory/max_active (GiB)': 46.14, 'memory/max_allocated (GiB)': 46.14, 'memory/device_reserved (GiB)': 62.92, 'tokens/train_per_sec_per_gpu': 0.0, 'tokens/total': 0, 'tokens/trainable': 0, 'epoch': 1.61}
 81%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                            | 93/115 [15:56<04:15, 11.63s/it] 82%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                           | 94/115 [16:05<03:46, 10.79s/it]                                                                                                                                                                                            {'loss': 2.9247, 'grad_norm': 0.20503148436546326, 'learning_rate': 4.256330387818999e-06, 'ppl': 18.62864, 'memory/max_active (GiB)': 46.13, 'memory/max_allocated (GiB)': 46.13, 'memory/device_reserved (GiB)': 62.92, 'tokens/train_per_sec_per_gpu': 0.0, 'tokens/total': 0, 'tokens/trainable': 0, 'epoch': 1.63}
 82%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                           | 94/115 [16:05<03:46, 10.79s/it] 83%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                          | 95/115 [16:14<03:24, 10.21s/it]                                                                                                                                                                                            {'loss': 2.9317, 'grad_norm': 0.21293747425079346, 'learning_rate': 3.89097734898108e-06, 'ppl': 18.75949, 'memory/max_active (GiB)': 46.13, 'memory/max_allocated (GiB)': 46.13, 'memory/device_reserved (GiB)': 62.92, 'tokens/train_per_sec_per_gpu': 0.0, 'tokens/total': 0, 'tokens/trainable': 0, 'epoch': 1.64}
 83%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                          | 95/115 [16:14<03:24, 10.21s/it] 83%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                        | 96/115 [16:23<03:06,  9.82s/it]                                                                                                                                                                                            {'loss': 2.7959, 'grad_norm': 0.18455636501312256, 'learning_rate': 3.5403226821268734e-06, 'ppl': 16.37736, 'memory/max_active (GiB)': 46.14, 'memory/max_allocated (GiB)': 46.14, 'memory/device_reserved (GiB)': 62.92, 'tokens/train_per_sec_per_gpu': 0.0, 'tokens/total': 0, 'tokens/trainable': 0, 'epoch': 1.66}
 83%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                        | 96/115 [16:23<03:06,  9.82s/it] 84%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                       | 97/115 [16:32<02:52,  9.57s/it]                                                                                                                                                                                            {'loss': 2.9579, 'grad_norm': 0.18545471131801605, 'learning_rate': 3.204686335452043e-06, 'ppl': 19.25749, 'memory/max_active (GiB)': 46.14, 'memory/max_allocated (GiB)': 46.14, 'memory/device_reserved (GiB)': 62.92, 'tokens/train_per_sec_per_gpu': 0.0, 'tokens/total': 0, 'tokens/trainable': 0, 'epoch': 1.68}
 84%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                       | 97/115 [16:32<02:52,  9.57s/it] 85%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                      | 98/115 [16:41<02:39,  9.35s/it]                                                                                                                                                                                            {'loss': 2.6455, 'grad_norm': 0.21932028234004974, 'learning_rate': 2.8843745539710523e-06, 'ppl': 14.09049, 'memory/max_active (GiB)': 46.14, 'memory/max_allocated (GiB)': 46.14, 'memory/device_reserved (GiB)': 62.92, 'tokens/train_per_sec_per_gpu': 0.0, 'tokens/total': 0, 'tokens/trainable': 0, 'epoch': 1.7}
 85%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                      | 98/115 [16:41<02:39,  9.35s/it] 86%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                    | 99/115 [16:50<02:27,  9.23s/it]                                                                                                                                                                                            {'loss': 2.737, 'grad_norm': 0.17154179513454437, 'learning_rate': 2.5796796000896882e-06, 'ppl': 15.44059, 'memory/max_active (GiB)': 46.13, 'memory/max_allocated (GiB)': 46.13, 'memory/device_reserved (GiB)': 62.92, 'tokens/train_per_sec_per_gpu': 0.0, 'tokens/total': 0, 'tokens/trainable': 0, 'epoch': 1.71}
 86%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                    | 99/115 [16:50<02:27,  9.23s/it] 87%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                   | 100/115 [16:59<02:17,  9.14s/it]                                                                                                                                                                                            {'loss': 2.7122, 'grad_norm': 0.17894573509693146, 'learning_rate': 2.2908794869358044e-06, 'ppl': 15.06238, 'memory/max_active (GiB)': 46.14, 'memory/max_allocated (GiB)': 46.14, 'memory/device_reserved (GiB)': 62.92, 'tokens/train_per_sec_per_gpu': 0.0, 'tokens/total': 0, 'tokens/trainable': 0, 'epoch': 1.73}
 87%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                   | 100/115 [16:59<02:17,  9.14s/it] 88%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                  | 101/115 [17:07<02:06,  9.06s/it]                                                                                                                                                                                            {'loss': 3.1339, 'grad_norm': 0.21843650937080383, 'learning_rate': 2.018237724691483e-06, 'ppl': 22.96336, 'memory/max_active (GiB)': 46.14, 'memory/max_allocated (GiB)': 46.14, 'memory/device_reserved (GiB)': 62.92, 'tokens/train_per_sec_per_gpu': 0.0, 'tokens/total': 0, 'tokens/trainable': 0, 'epoch': 1.75}
 88%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                  | 101/115 [17:07<02:06,  9.06s/it] 89%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                | 102/115 [17:16<01:57,  9.01s/it]                                                                                                                                                                                            {'loss': 3.1616, 'grad_norm': 0.19758166372776031, 'learning_rate': 1.7620030801581988e-06, 'ppl': 23.60834, 'memory/max_active (GiB)': 46.13, 'memory/max_allocated (GiB)': 46.13, 'memory/device_reserved (GiB)': 62.92, 'tokens/train_per_sec_per_gpu': 0.0, 'tokens/total': 0, 'tokens/trainable': 0, 'epoch': 1.77}
 89%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                | 102/115 [17:16<01:57,  9.01s/it] 90%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍               | 103/115 [17:25<01:47,  8.98s/it]                                                                                                                                                                                            {'loss': 3.016, 'grad_norm': 0.20229025185108185, 'learning_rate': 1.5224093497742654e-06, 'ppl': 20.40949, 'memory/max_active (GiB)': 46.14, 'memory/max_allocated (GiB)': 46.14, 'memory/device_reserved (GiB)': 62.92, 'tokens/train_per_sec_per_gpu': 0.0, 'tokens/total': 0, 'tokens/trainable': 0, 'epoch': 1.78}
 90%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍               | 103/115 [17:25<01:47,  8.98s/it] 90%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋              | 104/115 [17:34<01:38,  8.94s/it]                                                                                                                                                                                            {'loss': 3.1642, 'grad_norm': 0.20342081785202026, 'learning_rate': 1.2996751462917057e-06, 'ppl': 23.6698, 'memory/max_active (GiB)': 46.14, 'memory/max_allocated (GiB)': 46.14, 'memory/device_reserved (GiB)': 62.92, 'tokens/train_per_sec_per_gpu': 0.0, 'tokens/total': 0, 'tokens/trainable': 0, 'epoch': 1.8}
 90%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋              | 104/115 [17:34<01:38,  8.94s/it] 91%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████             | 105/115 [17:43<01:29,  8.94s/it]                                                                                                                                                                                            {'loss': 2.8796, 'grad_norm': 0.21505790948867798, 'learning_rate': 1.0940036993071934e-06, 'ppl': 17.80715, 'memory/max_active (GiB)': 46.14, 'memory/max_allocated (GiB)': 46.14, 'memory/device_reserved (GiB)': 62.92, 'tokens/train_per_sec_per_gpu': 0.0, 'tokens/total': 0, 'tokens/trainable': 0, 'epoch': 1.82}
 91%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████             | 105/115 [17:43<01:29,  8.94s/it][2026-04-23 23:12:29,560] [INFO] [axolotl.core.trainers.base.evaluate:400] [PID:2655295] Running evaluation step...

  0%|                                                                                                                                                                | 0/26 [00:00<?, ?it/s][A
  8%|███████████▋                                                                                                                                            | 2/26 [00:00<00:08,  2.99it/s][A
 12%|█████████████████▌                                                                                                                                      | 3/26 [00:01<00:10,  2.16it/s][A
 15%|███████████████████████▍                                                                                                                                | 4/26 [00:01<00:11,  1.89it/s][A
 19%|█████████████████████████████▏                                                                                                                          | 5/26 [00:02<00:13,  1.59it/s][A
 23%|███████████████████████████████████                                                                                                                     | 6/26 [00:03<00:12,  1.59it/s][A
 27%|████████████████████████████████████████▉                                                                                                               | 7/26 [00:04<00:12,  1.56it/s][A
 31%|██████████████████████████████████████████████▊                                                                                                         | 8/26 [00:04<00:11,  1.54it/s][A
 35%|████████████████████████████████████████████████████▌                                                                                                   | 9/26 [00:05<00:11,  1.48it/s][A
 38%|██████████████████████████████████████████████████████████                                                                                             | 10/26 [00:06<00:10,  1.52it/s][A
 42%|███████████████████████████████████████████████████████████████▉                                                                                       | 11/26 [00:06<00:09,  1.52it/s][A
 46%|█████████████████████████████████████████████████████████████████████▋                                                                                 | 12/26 [00:07<00:09,  1.53it/s][A
 50%|███████████████████████████████████████████████████████████████████████████▌                                                                           | 13/26 [00:08<00:08,  1.47it/s][A
 54%|█████████████████████████████████████████████████████████████████████████████████▎                                                                     | 14/26 [00:08<00:08,  1.50it/s][A
 58%|███████████████████████████████████████████████████████████████████████████████████████                                                                | 15/26 [00:09<00:07,  1.51it/s][A
 62%|████████████████████████████████████████████████████████████████████████████████████████████▉                                                          | 16/26 [00:10<00:06,  1.51it/s][A
 65%|██████████████████████████████████████████████████████████████████████████████████████████████████▋                                                    | 17/26 [00:10<00:06,  1.45it/s][A
 69%|████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                              | 18/26 [00:11<00:05,  1.49it/s][A
 73%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                        | 19/26 [00:12<00:04,  1.50it/s][A
 77%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                  | 20/26 [00:12<00:03,  1.50it/s][A
 81%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                             | 21/26 [00:13<00:03,  1.46it/s][A
 85%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                       | 22/26 [00:14<00:02,  1.47it/s][A
 88%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                 | 23/26 [00:14<00:02,  1.49it/s][A
 92%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍           | 24/26 [00:15<00:01,  1.48it/s][A
 96%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏     | 25/26 [00:16<00:00,  1.45it/s][A
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 26/26 [00:16<00:00,  1.45it/s][A                                                                                                                                                                                            
                                                                                                                                                                                            [A{'eval_loss': 0.7458599209785461, 'eval_runtime': 17.8237, 'eval_samples_per_second': 22.947, 'eval_steps_per_second': 1.459, 'eval_ppl': 2.10825, 'memory/max_active (GiB)': 42.7, 'memory/max_allocated (GiB)': 42.7, 'memory/device_reserved (GiB)': 62.92, 'epoch': 1.82, 'tokens/train_per_sec_per_gpu': 0.0}
 91%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████             | 105/115 [18:01<01:29,  8.94s/it]
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 26/26 [00:17<00:00,  1.45it/s][A
                                                                                                                                                                                            [A[2026-04-23 23:12:47,398] [INFO] [axolotl.core.trainers.base._save:721] [PID:2655295] Saving model checkpoint to /home/tkwang/links/scratch/SecSteer-v2/axolotl-outputs/lora/Qwen2.5-Coder-7B-stage2-secure-token-diff-ctx0/checkpoint-105
[2026-04-23 23:12:47,879] [WARNING] [py.warnings._showwarnmsg:112] [PID:2655295] /scratch/tkwang/SecSteer-v2/.venv/lib/python3.12/site-packages/torch/distributed/c10d_logger.py:83: UserWarning: barrier(): using the device under current context. You can specify `device_id` in `init_process_group` to mute this warning.
  return func(*args, **kwargs)

 92%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎           | 106/115 [18:11<02:10, 14.55s/it]                                                                                                                                                                                            {'loss': 2.6083, 'grad_norm': 0.17414577305316925, 'learning_rate': 9.055826698290881e-07, 'ppl': 13.57595, 'memory/max_active (GiB)': 46.13, 'memory/max_allocated (GiB)': 46.13, 'memory/device_reserved (GiB)': 62.92, 'tokens/train_per_sec_per_gpu': 0.0, 'tokens/total': 0, 'tokens/trainable': 0, 'epoch': 1.83}
 92%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎           | 106/115 [18:11<02:10, 14.55s/it] 93%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋          | 107/115 [18:20<01:42, 12.87s/it]                                                                                                                                                                                            {'loss': 2.8681, 'grad_norm': 0.20923344790935516, 'learning_rate': 7.345839790496745e-07, 'ppl': 17.60354, 'memory/max_active (GiB)': 46.13, 'memory/max_allocated (GiB)': 46.13, 'memory/device_reserved (GiB)': 62.92, 'tokens/train_per_sec_per_gpu': 0.0, 'tokens/total': 0, 'tokens/trainable': 0, 'epoch': 1.85}
 93%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋          | 107/115 [18:20<01:42, 12.87s/it] 94%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉         | 108/115 [18:28<01:21, 11.69s/it]                                                                                                                                                                                            {'loss': 3.1563, 'grad_norm': 0.2226894348859787, 'learning_rate': 5.811636514789598e-07, 'ppl': 23.48355, 'memory/max_active (GiB)': 46.13, 'memory/max_allocated (GiB)': 46.13, 'memory/device_reserved (GiB)': 62.92, 'tokens/train_per_sec_per_gpu': 0.0, 'tokens/total': 0, 'tokens/trainable': 0, 'epoch': 1.87}
 94%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉         | 108/115 [18:28<01:21, 11.69s/it] 95%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏       | 109/115 [18:37<01:04, 10.83s/it]                                                                                                                                                                                            {'loss': 2.9326, 'grad_norm': 0.21895429491996765, 'learning_rate': 4.4546167258306296e-07, 'ppl': 18.77639, 'memory/max_active (GiB)': 46.14, 'memory/max_allocated (GiB)': 46.14, 'memory/device_reserved (GiB)': 62.92, 'tokens/train_per_sec_per_gpu': 0.0, 'tokens/total': 0, 'tokens/trainable': 0, 'epoch': 1.89}
 95%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏       | 109/115 [18:37<01:04, 10.83s/it] 96%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌      | 110/115 [18:46<00:51, 10.24s/it]                                                                                                                                                                                            {'loss': 3.2598, 'grad_norm': 0.205611452460289, 'learning_rate': 3.2760186105712964e-07, 'ppl': 26.04433, 'memory/max_active (GiB)': 46.13, 'memory/max_allocated (GiB)': 46.13, 'memory/device_reserved (GiB)': 62.92, 'tokens/train_per_sec_per_gpu': 0.0, 'tokens/total': 0, 'tokens/trainable': 0, 'epoch': 1.9}
 96%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌      | 110/115 [18:46<00:51, 10.24s/it] 97%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊     | 111/115 [18:55<00:39,  9.85s/it]                                                                                                                                                                                            {'loss': 2.6766, 'grad_norm': 0.1860496699810028, 'learning_rate': 2.2769175584931746e-07, 'ppl': 14.53559, 'memory/max_active (GiB)': 46.13, 'memory/max_allocated (GiB)': 46.13, 'memory/device_reserved (GiB)': 62.92, 'tokens/train_per_sec_per_gpu': 0.0, 'tokens/total': 0, 'tokens/trainable': 0, 'epoch': 1.92}
 97%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊     | 111/115 [18:55<00:39,  9.85s/it] 97%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████    | 112/115 [19:04<00:28,  9.55s/it]                                                                                                                                                                                            {'loss': 3.1156, 'grad_norm': 0.2042839080095291, 'learning_rate': 1.4582251803892055e-07, 'ppl': 22.54695, 'memory/max_active (GiB)': 46.13, 'memory/max_allocated (GiB)': 46.13, 'memory/device_reserved (GiB)': 62.92, 'tokens/train_per_sec_per_gpu': 0.0, 'tokens/total': 0, 'tokens/trainable': 0, 'epoch': 1.94}
 97%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████    | 112/115 [19:04<00:28,  9.55s/it] 98%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍  | 113/115 [19:13<00:18,  9.34s/it]                                                                                                                                                                                            {'loss': 2.8296, 'grad_norm': 0.22802592813968658, 'learning_rate': 8.206884765818102e-08, 'ppl': 16.93868, 'memory/max_active (GiB)': 46.14, 'memory/max_allocated (GiB)': 46.14, 'memory/device_reserved (GiB)': 62.92, 'tokens/train_per_sec_per_gpu': 0.0, 'tokens/total': 0, 'tokens/trainable': 0, 'epoch': 1.96}
 98%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍  | 113/115 [19:13<00:18,  9.34s/it] 99%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋ | 114/115 [19:22<00:09,  9.23s/it]                                                                                                                                                                                            {'loss': 2.8064, 'grad_norm': 0.18351244926452637, 'learning_rate': 3.648891553365008e-08, 'ppl': 16.55023, 'memory/max_active (GiB)': 46.14, 'memory/max_allocated (GiB)': 46.14, 'memory/device_reserved (GiB)': 62.92, 'tokens/train_per_sec_per_gpu': 0.0, 'tokens/total': 0, 'tokens/trainable': 0, 'epoch': 1.97}
 99%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋ | 114/115 [19:22<00:09,  9.23s/it]100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 115/115 [19:31<00:00,  9.13s/it]                                                                                                                                                                                            {'loss': 3.1663, 'grad_norm': 0.21672320365905762, 'learning_rate': 9.12431020933191e-09, 'ppl': 23.71956, 'memory/max_active (GiB)': 46.13, 'memory/max_allocated (GiB)': 46.13, 'memory/device_reserved (GiB)': 62.92, 'tokens/train_per_sec_per_gpu': 0.0, 'tokens/total': 0, 'tokens/trainable': 0, 'epoch': 1.99}
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 115/115 [19:31<00:00,  9.13s/it][2026-04-23 23:14:17,286] [INFO] [axolotl.core.trainers.base._save:721] [PID:2655295] Saving model checkpoint to /home/tkwang/links/scratch/SecSteer-v2/axolotl-outputs/lora/Qwen2.5-Coder-7B-stage2-secure-token-diff-ctx0/checkpoint-115
[2026-04-23 23:14:17,749] [WARNING] [py.warnings._showwarnmsg:112] [PID:2655295] /scratch/tkwang/SecSteer-v2/.venv/lib/python3.12/site-packages/torch/distributed/c10d_logger.py:83: UserWarning: barrier(): using the device under current context. You can specify `device_id` in `init_process_group` to mute this warning.
  return func(*args, **kwargs)

                                                                                                                                                                                            {'train_runtime': 1175.8224, 'train_samples_per_second': 6.259, 'train_steps_per_second': 0.098, 'train_loss': 3.0131628088329148, 'memory/max_active (GiB)': 15.01, 'memory/max_allocated (GiB)': 15.01, 'memory/device_reserved (GiB)': 62.92, 'epoch': 1.99, 'tokens/train_per_sec_per_gpu': 0.0}
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 115/115 [19:32<00:00,  9.13s/it]100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 115/115 [19:32<00:00, 10.19s/it]
[2026-04-23 23:14:18,345] [INFO] [axolotl.train.save_trained_model:233] [PID:2655295] Training completed! Saving trained model to /home/tkwang/links/scratch/SecSteer-v2/axolotl-outputs/lora/Qwen2.5-Coder-7B-stage2-secure-token-diff-ctx0.
[2026-04-23 23:14:18,675] [INFO] [axolotl.train.save_trained_model:351] [PID:2655295] Model successfully saved to /home/tkwang/links/scratch/SecSteer-v2/axolotl-outputs/lora/Qwen2.5-Coder-7B-stage2-secure-token-diff-ctx0