[2026-05-21 05:26:08,359] [DEBUG] [axolotl.utils.config.log_gpu_memory_usage:127] [PID:3208933] baseline 0.000GB ()
[2026-05-21 05:26:08,361] [INFO] [axolotl.cli.config.load_cfg:341] [PID:3208933] config:
{
  "activation_offloading": false,
  "axolotl_config_path": "configs/axolotl_qwen3-1.7b_nq-text-title.yml",
  "base_model": "Qwen/Qwen3-1.7B-Base",
  "base_model_config": "Qwen/Qwen3-1.7B-Base",
  "batch_size": 128,
  "bf16": true,
  "capabilities": {
    "bf16": true,
    "compute_capability": "sm_120",
    "fp8": true,
    "n_gpu": 1,
    "n_node": 1,
    "tf32": true
  },
  "context_parallel_size": 1,
  "dataloader_num_workers": 2,
  "dataset_num_proc": 128,
  "datasets": [
    {
      "chat_template": "tokenizer_default_fallback_chatml",
      "field_messages": "conversations",
      "message_property_mappings": {
        "content": "content",
        "role": "role"
      },
      "path": "nq_text_compressed_axolotl/train_with_pseudo_axolotl.jsonl",
      "roles": {
        "assistant": [
          "assistant",
          "gpt",
          "model"
        ],
        "system": [
          "system"
        ],
        "user": [
          "user",
          "human"
        ]
      },
      "trust_remote_code": false,
      "type": "chat_template"
    }
  ],
  "ddp": false,
  "device": "cuda:0",
  "dion_rank_fraction": 1.0,
  "dion_rank_multiple_of": 1,
  "eaft_alpha": 1.0,
  "eaft_k": 20,
  "env_capabilities": {
    "torch_version": "2.8.0"
  },
  "eval_batch_size": 4,
  "eval_causal_lm_metrics": [
    "sacrebleu",
    "comet",
    "ter",
    "chrf"
  ],
  "eval_max_new_tokens": 128,
  "eval_table_size": 0,
  "experimental_skip_move_to_device": true,
  "flash_attention": false,
  "flex_attention": false,
  "fp16": false,
  "generate_samples": false,
  "generation_do_sample": true,
  "generation_max_new_tokens": 50,
  "generation_prompt_ratio": 0.5,
  "generation_temperature": 0.7,
  "gradient_accumulation_steps": 32,
  "gradient_checkpointing": true,
  "gradient_checkpointing_kwargs": {
    "use_reentrant": true
  },
  "include_tkps": true,
  "layer_offloading": false,
  "learning_rate": 0.0001,
  "lisa_layers_attribute": "model.layers",
  "load_best_model_at_end": false,
  "load_in_4bit": false,
  "load_in_8bit": false,
  "local_rank": 0,
  "logging_steps": 50,
  "lora_dropout": 0.0,
  "loraplus_lr_embedding": 1e-06,
  "lr_scheduler": "cosine",
  "mean_resizing_embeddings": false,
  "merge_method": "memory_efficient",
  "micro_batch_size": 4,
  "model_config_type": "qwen3",
  "num_epochs": 10.0,
  "num_generation_samples": 3,
  "optimizer": "adamw_torch",
  "otel_metrics_host": "localhost",
  "otel_metrics_port": 8000,
  "output_dir": "./checkpoint/Qwen3-1.7B-nq_text_compressed-with_pseudo-lr1e-4-10epochs",
  "pad_to_sequence_len": false,
  "pretrain_multipack_attn": true,
  "profiler_steps_start": 0,
  "qlora_sharded_model_loading": false,
  "quantize_moe_experts": false,
  "ray_num_workers": 1,
  "resources_per_worker": {
    "GPU": 1
  },
  "sample_packing": false,
  "sample_packing_bin_size": 200,
  "sample_packing_group_size": 100000,
  "save_only_model": false,
  "save_safetensors": true,
  "save_strategy": "epoch",
  "save_total_limit": 3,
  "sdp_attention": true,
  "sequence_len": 512,
  "shuffle_before_merging_datasets": false,
  "shuffle_merged_datasets": true,
  "skip_prepare_dataset": false,
  "special_tokens": {
    "eos_token": "<|im_end|>"
  },
  "streaming_multipack_buffer_size": 10000,
  "strict": false,
  "tensor_parallel_size": 1,
  "tf32": false,
  "tiled_mlp_use_original_mlp": true,
  "tokenizer_config": "Qwen/Qwen3-1.7B-Base",
  "tokenizer_save_jinja_files": true,
  "torch_dtype": "torch.bfloat16",
  "train_on_inputs": false,
  "trl": {
    "async_prefetch": false,
    "log_completions": false,
    "mask_truncated_completions": false,
    "ref_model_mixup_alpha": 0.9,
    "ref_model_sync_steps": 64,
    "replay_buffer_size": 0,
    "replay_recompute_logps": true,
    "reroll_max_groups": 1,
    "reroll_start_fraction": 1.0,
    "reward_num_workers": 1,
    "scale_rewards": true,
    "skip_zero_advantage_batches": true,
    "sync_ref_model": false,
    "use_data_producer": false,
    "use_vllm": false,
    "vllm_lora_sync": false,
    "vllm_server_host": "0.0.0.0",
    "vllm_server_port": 8000
  },
  "use_otel_metrics": false,
  "use_ray": false,
  "use_wandb": true,
  "val_set_size": 0.0,
  "vllm": {
    "device": "auto",
    "dtype": "auto",
    "gpu_memory_utilization": 0.9,
    "host": "0.0.0.0",
    "port": 8000
  },
  "wandb_entity": "abnerden0803-national-taiwan-university",
  "wandb_name": "qwen3-1.7b-nq_text_compressed-pseudo-lr1e-4-10epochs",
  "wandb_project": "ICLGR-NQ",
  "warmup_ratio": 0.1,
  "weight_decay": 0.0,
  "world_size": 1,
  "xformers_attention": false
}
[2026-05-21 05:26:10,257] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:307] [PID:3208933] EOS: 151645 / <|im_end|>
[2026-05-21 05:26:10,257] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:308] [PID:3208933] BOS: None / None
[2026-05-21 05:26:10,257] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:309] [PID:3208933] PAD: 151643 / <|endoftext|>
[2026-05-21 05:26:10,257] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:310] [PID:3208933] UNK: None / None
[2026-05-21 05:26:10,259] [INFO] [axolotl.utils.data.shared.load_preprocessed_dataset:480] [PID:3208933] Unable to find prepared dataset in last_run_prepared/a8d61713fe28909dcab9370999e181f6
[2026-05-21 05:26:10,259] [INFO] [axolotl.utils.data.sft._load_raw_datasets:320] [PID:3208933] Loading raw datasets...
[2026-05-21 05:26:10,259] [WARNING] [axolotl.utils.data.sft._load_raw_datasets:322] [PID:3208933] Processing datasets during training can lead to VRAM instability. Please pre-process your dataset using `axolotl preprocess path/to/config.yml`.
[2026-05-21 05:26:10,953] [INFO] [axolotl.utils.data.wrappers.get_dataset_wrapper:87] [PID:3208933] Loading dataset: nq_text_compressed_axolotl/train_with_pseudo_axolotl.jsonl with base_type: chat_template and prompt_style: None
[2026-05-21 05:26:10,956] [INFO] [axolotl.prompt_strategies.chat_template.__call__:998] [PID:3208933] Using chat template:
---
{%- if tools %}
    {{- '<|im_start|>system\n' }}
    {%- if messages[0].role == 'system' %}
        {{- messages[0].content + '\n\n' }}
    {%- endif %}
    {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
    {%- for tool in tools %}
        {{- "\n" }}
        {{- tool | tojson }}
    {%- endfor %}
    {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
{%- else %}
    {%- if messages[0].role == 'system' %}
        {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }}
    {%- endif %}
{%- endif %}
{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}
{%- for message in messages[::-1] %}
    {%- set index = (messages|length - 1) - loop.index0 %}
    {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('<tool_response>') and message.content.endswith('</tool_response>')) %}
        {%- set ns.multi_step_tool = false %}
        {%- set ns.last_query_index = index %}
    {%- endif %}
{%- endfor %}
{%- for message in messages %}
    {%- if (message.role == "user") or (message.role == "system" and not loop.first) %}
        {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
    {%- elif message.role == "assistant" %}
        {%- set content = message.content %}
        {%- set reasoning_content = '' %}
        {%- if message.reasoning_content is defined and message.reasoning_content is not none %}
            {%- set reasoning_content = message.reasoning_content %}
        {%- else %}
            {%- if '</think>' in message.content %}
                {%- set content = message.content.split('</think>')[-1].lstrip('\n') %}
                {%- set reasoning_content = message.content.split('</think>')[0].rstrip('\n').split('<think>')[-1].lstrip('\n') %}
            {%- endif %}
        {%- endif %}
        {%- if loop.index0 > ns.last_query_index %}
            {%- if loop.last or (not loop.last and reasoning_content) %}
                {{- '<|im_start|>' + message.role + '\n<think>\n' + reasoning_content.strip('\n') + '\n</think>\n\n' + content.lstrip('\n') }}
            {%- else %}
                {{- '<|im_start|>' + message.role + '\n' + content }}
            {%- endif %}
        {%- else %}
            {{- '<|im_start|>' + message.role + '\n' + content }}
        {%- endif %}
        {%- if message.tool_calls %}
            {%- for tool_call in message.tool_calls %}
                {%- if (loop.first and content) or (not loop.first) %}
                    {{- '\n' }}
                {%- endif %}
                {%- if tool_call.function %}
                    {%- set tool_call = tool_call.function %}
                {%- endif %}
                {{- '<tool_call>\n{"name": "' }}
                {{- tool_call.name }}
                {{- '", "arguments": ' }}
                {%- if tool_call.arguments is string %}
                    {{- tool_call.arguments }}
                {%- else %}
                    {{- tool_call.arguments | tojson }}
                {%- endif %}
                {{- '}\n</tool_call>' }}
            {%- endfor %}
        {%- endif %}
        {{- '<|im_end|>\n' }}
    {%- elif message.role == "tool" %}
        {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}
            {{- '<|im_start|>user' }}
        {%- endif %}
        {{- '\n<tool_response>\n' }}
        {{- message.content }}
        {{- '\n</tool_response>' }}
        {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
            {{- '<|im_end|>\n' }}
        {%- endif %}
    {%- endif %}
{%- endfor %}
{%- if add_generation_prompt %}
    {{- '<|im_start|>assistant\n' }}
    {%- if enable_thinking is defined and enable_thinking is false %}
        {{- '<think>\n\n</think>\n\n' }}
    {%- endif %}
{%- endif %}
---
[2026-05-21 05:26:17,621] [INFO] [axolotl.utils.data.utils._log_dataset_stats:212] [PID:3208933] min_input_len: 16
[2026-05-21 05:26:17,621] [INFO] [axolotl.utils.data.utils._log_dataset_stats:213] [PID:3208933] max_input_len: 987
Dropping Invalid Sequences (<None or >512) (num_proc=128):   0%|                                                          | 0/748586 [00:00<?, ? examples/s]Dropping Invalid Sequences (<None or >512) (num_proc=128):   0%|                                             | 2000/748586 [00:01<08:11, 1519.14 examples/s]Dropping Invalid Sequences (<None or >512) (num_proc=128):   3%|█▍                                         | 25000/748586 [00:01<00:30, 23716.64 examples/s]Dropping Invalid Sequences (<None or >512) (num_proc=128):   7%|██▉                                        | 51849/748586 [00:01<00:13, 53251.21 examples/s]Dropping Invalid Sequences (<None or >512) (num_proc=128):  10%|████                                       | 71396/748586 [00:01<00:10, 67423.15 examples/s]Dropping Invalid Sequences (<None or >512) (num_proc=128):  12%|█████                                      | 88792/748586 [00:01<00:09, 69990.23 examples/s]Dropping Invalid Sequences (<None or >512) (num_proc=128):  15%|██████                                    | 108641/748586 [00:02<00:07, 85445.81 examples/s]Dropping Invalid Sequences (<None or >512) (num_proc=128):  16%|██████▉                                   | 123188/748586 [00:02<00:06, 94851.39 examples/s]Dropping Invalid Sequences (<None or >512) (num_proc=128):  19%|███████▌                                 | 138584/748586 [00:02<00:05, 103872.86 examples/s]Dropping Invalid Sequences (<None or >512) (num_proc=128):  58%|███████████████████████▊                 | 435162/748586 [00:02<00:00, 726531.03 examples/s]Dropping Invalid Sequences (<None or >512) (num_proc=128):  97%|██████████████████████████████████████▋ | 723258/748586 [00:02<00:00, 1225601.15 examples/s]Dropping Invalid Sequences (<None or >512) (num_proc=128): 100%|█████████████████████████████████████████| 748586/748586 [00:02<00:00, 249715.33 examples/s]
[2026-05-21 05:26:21,561] [INFO] [axolotl.utils.data.utils._drop_outside_range:306] [PID:3208933] Dropped 467 sequences outside valid range ([None, 512])
Saving the dataset (0/128 shards):   0%|                                                                                  | 0/748119 [00:00<?, ? examples/s]Saving the dataset (0/128 shards):   0%|▎                                                                     | 3000/748119 [00:04<18:33, 668.92 examples/s]Saving the dataset (1/128 shards):   2%|█                                                                    | 11845/748119 [00:04<18:20, 668.92 examples/s]Saving the dataset (2/128 shards):   3%|██▎                                                                  | 24690/748119 [00:04<18:01, 668.92 examples/s]Saving the dataset (3/128 shards):   4%|██▌                                                                  | 27535/748119 [00:04<17:57, 668.92 examples/s]Saving the dataset (4/128 shards):   4%|██▋                                                                  | 29380/748119 [00:04<17:54, 668.92 examples/s]Saving the dataset (5/128 shards):   6%|███▉                                                                 | 42070/748119 [00:04<17:35, 668.92 examples/s]Saving the dataset (6/128 shards):   6%|███▉                                                                 | 42070/748119 [00:04<17:35, 668.92 examples/s]Saving the dataset (7/128 shards):   7%|████▌                                                                | 48915/748119 [00:04<17:25, 668.92 examples/s]Saving the dataset (8/128 shards):   7%|████▌                                                                | 49760/748119 [00:04<17:24, 668.92 examples/s]Saving the dataset (9/128 shards):   8%|█████▏                                                               | 56605/748119 [00:04<17:13, 668.92 examples/s]Saving the dataset (10/128 shards):   8%|█████▎                                                              | 58450/748119 [00:04<17:11, 668.92 examples/s]Saving the dataset (11/128 shards):   9%|██████▍                                                             | 70295/748119 [00:04<16:53, 668.92 examples/s]Saving the dataset (12/128 shards):  10%|██████▋                                                             | 73140/748119 [00:04<16:49, 668.92 examples/s]Saving the dataset (13/128 shards):  10%|██████▉                                                             | 75985/748119 [00:04<16:44, 668.92 examples/s]Saving the dataset (14/128 shards):  11%|███████▋                                                            | 84830/748119 [00:04<16:31, 668.92 examples/s]Saving the dataset (15/128 shards):  12%|███████▉                                                            | 87675/748119 [00:04<16:27, 668.92 examples/s]Saving the dataset (16/128 shards):  13%|█████████                                                           | 99520/748119 [00:04<16:09, 668.92 examples/s]Saving the dataset (17/128 shards):  14%|█████████                                                          | 101365/748119 [00:04<16:06, 668.92 examples/s]Saving the dataset (18/128 shards):  14%|█████████▌                                                         | 107210/748119 [00:04<15:58, 668.92 examples/s]Saving the dataset (19/128 shards):  15%|██████████                                                         | 113055/748119 [00:04<15:49, 668.92 examples/s]Saving the dataset (20/128 shards):  16%|██████████▍                                                        | 116900/748119 [00:04<15:43, 668.92 examples/s]Saving the dataset (21/128 shards):  17%|███████████▎                                                       | 125745/748119 [00:04<15:30, 668.92 examples/s]Saving the dataset (22/128 shards):  17%|███████████▌                                                       | 128590/748119 [00:04<15:26, 668.92 examples/s]Saving the dataset (23/128 shards):  18%|████████████                                                       | 134435/748119 [00:04<15:17, 668.92 examples/s]Saving the dataset (24/128 shards):  19%|████████████▉                                                      | 144280/748119 [00:04<15:02, 668.92 examples/s]Saving the dataset (25/128 shards):  20%|█████████████▎                                                     | 148125/748119 [00:04<14:56, 668.92 examples/s]Saving the dataset (26/128 shards):  20%|█████████████▌                                                     | 151970/748119 [00:04<14:51, 668.92 examples/s]Saving the dataset (27/128 shards):  22%|██████████████▌                                                    | 162815/748119 [00:04<14:34, 668.92 examples/s]Saving the dataset (28/128 shards):  22%|██████████████▉                                                    | 166660/748119 [00:04<14:29, 668.92 examples/s]Saving the dataset (29/128 shards):  23%|███████████████▍                                                   | 172505/748119 [00:04<14:20, 668.92 examples/s]Saving the dataset (30/128 shards):  24%|████████████████▏                                                  | 181350/748119 [00:04<14:07, 668.92 examples/s]Saving the dataset (31/128 shards):  25%|████████████████▊                                                  | 187195/748119 [00:04<13:58, 668.92 examples/s]Saving the dataset (32/128 shards):  26%|█████████████████▎                                                 | 193040/748119 [00:04<13:49, 668.92 examples/s]Saving the dataset (33/128 shards):  26%|█████████████████▌                                                 | 195885/748119 [00:04<13:45, 668.92 examples/s]Saving the dataset (34/128 shards):  27%|█████████████████▊                                                 | 198730/748119 [00:04<13:41, 668.92 examples/s]Saving the dataset (35/128 shards):  28%|██████████████████▌                                                | 207575/748119 [00:04<13:28, 668.92 examples/s]Saving the dataset (36/128 shards):  29%|███████████████████▍                                               | 217420/748119 [00:04<13:13, 668.92 examples/s]Saving the dataset (37/128 shards):  29%|███████████████████▋                                               | 219265/748119 [00:04<13:10, 668.92 examples/s]Saving the dataset (38/128 shards):  30%|████████████████████▏                                              | 225110/748119 [00:04<13:01, 668.92 examples/s]Saving the dataset (39/128 shards):  31%|████████████████████▊                                              | 231955/748119 [00:04<12:51, 668.92 examples/s]Saving the dataset (40/128 shards):  32%|█████████████████████▎                                             | 237800/748119 [00:04<12:42, 668.92 examples/s]Saving the dataset (41/128 shards):  32%|█████████████████████▍                                             | 239645/748119 [00:04<12:40, 668.92 examples/s]Saving the dataset (42/128 shards):  33%|██████████████████████▎                                            | 248490/748119 [00:04<12:26, 668.92 examples/s]Saving the dataset (43/128 shards):  34%|██████████████████████▌                                            | 251335/748119 [00:04<12:22, 668.92 examples/s]Saving the dataset (44/128 shards):  35%|███████████████████████▎                                           | 260180/748119 [00:04<12:09, 668.92 examples/s]Saving the dataset (45/128 shards):  35%|███████████████████████▌                                           | 263025/748119 [00:04<12:05, 668.92 examples/s]Saving the dataset (46/128 shards):  36%|████████████████████████▎                                          | 271870/748119 [00:04<11:51, 668.92 examples/s]Saving the dataset (47/128 shards):  37%|████████████████████████▌                                          | 274715/748119 [00:04<11:47, 668.92 examples/s]Saving the dataset (48/128 shards):  39%|█████████████████████████▊                                         | 288560/748119 [00:04<11:27, 668.92 examples/s]Saving the dataset (49/128 shards):  39%|█████████████████████████▉                                         | 289405/748119 [00:04<11:25, 668.92 examples/s]Saving the dataset (50/128 shards):  39%|██████████████████████████▏                                        | 292250/748119 [00:04<11:21, 668.92 examples/s]Saving the dataset (51/128 shards):  40%|██████████████████████████▋                                        | 298095/748119 [00:04<11:12, 668.92 examples/s]Saving the dataset (52/128 shards):  42%|███████████████████████████▉                                       | 311940/748119 [00:04<10:52, 668.92 examples/s]Saving the dataset (53/128 shards):  42%|████████████████████████████                                       | 313785/748119 [00:04<10:49, 668.92 examples/s]Saving the dataset (54/128 shards):  43%|████████████████████████████▌                                      | 318630/748119 [00:04<10:42, 668.92 examples/s]Saving the dataset (55/128 shards):  44%|█████████████████████████████▍                                     | 328475/748119 [00:04<10:27, 668.92 examples/s]Saving the dataset (56/128 shards):  44%|█████████████████████████████▋                                     | 331320/748119 [00:04<10:23, 668.92 examples/s]Saving the dataset (57/128 shards):  45%|█████████████████████████████▊                                     | 333165/748119 [00:04<10:20, 668.92 examples/s]Saving the dataset (58/128 shards):  46%|██████████████████████████████▋                                    | 342010/748119 [00:04<10:07, 668.92 examples/s]Saving the dataset (59/128 shards):  46%|███████████████████████████████▏                                   | 347855/748119 [00:04<09:58, 668.92 examples/s]Saving the dataset (60/128 shards):  48%|████████████████████████████████                                   | 358700/748119 [00:04<09:42, 668.92 examples/s]Saving the dataset (61/128 shards):  48%|████████████████████████████████▏                                  | 359545/748119 [00:04<09:40, 668.92 examples/s]Saving the dataset (62/128 shards):  48%|████████████████████████████████▍                                  | 362390/748119 [00:04<09:36, 668.92 examples/s]Saving the dataset (63/128 shards):  50%|█████████████████████████████████▌                                 | 374235/748119 [00:04<09:18, 668.92 examples/s]Saving the dataset (64/128 shards):  51%|██████████████████████████████████▏                                | 381080/748119 [00:04<09:08, 668.92 examples/s]Saving the dataset (65/128 shards):  51%|██████████████████████████████████▎                                | 382925/748119 [00:04<09:05, 668.92 examples/s]Saving the dataset (66/128 shards):  52%|██████████████████████████████████▌                                | 385770/748119 [00:04<09:01, 668.92 examples/s]Saving the dataset (67/128 shards):  53%|███████████████████████████████████▎                               | 394615/748119 [00:04<08:48, 668.92 examples/s]Saving the dataset (68/128 shards):  53%|███████████████████████████████████▌                               | 397460/748119 [00:04<08:44, 668.92 examples/s]Saving the dataset (69/128 shards):  55%|████████████████████████████████████▊                              | 411305/748119 [00:04<08:23, 668.92 examples/s]Saving the dataset (70/128 shards):  55%|████████████████████████████████████▉                              | 412150/748119 [00:04<08:22, 668.92 examples/s]Saving the dataset (71/128 shards):  55%|█████████████████████████████████████▏                             | 414995/748119 [00:04<08:17, 668.92 examples/s]Saving the dataset (72/128 shards):  57%|█████████████████████████████████████▉                             | 423840/748119 [00:04<08:04, 668.92 examples/s]Saving the dataset (73/128 shards):  57%|██████████████████████████████████████▏                            | 426685/748119 [00:04<08:00, 668.92 examples/s]Saving the dataset (74/128 shards):  58%|███████████████████████████████████████                            | 435530/748119 [00:04<07:47, 668.92 examples/s]Saving the dataset (75/128 shards):  59%|███████████████████████████████████████▎                           | 438375/748119 [00:04<07:43, 668.92 examples/s]Saving the dataset (76/128 shards):  60%|████████████████████████████████████████                           | 447220/748119 [00:04<07:29, 668.92 examples/s]Saving the dataset (77/128 shards):  61%|████████████████████████████████████████▌                          | 453065/748119 [00:04<07:21, 668.92 examples/s]Saving the dataset (78/128 shards):  62%|█████████████████████████████████████████▎                         | 461910/748119 [00:04<07:07, 668.92 examples/s]Saving the dataset (79/128 shards):  63%|██████████████████████████████████████████                         | 469755/748119 [00:04<06:56, 668.92 examples/s]Saving the dataset (80/128 shards):  63%|██████████████████████████████████████████▏                        | 470600/748119 [00:04<06:54, 668.92 examples/s]Saving the dataset (81/128 shards):  63%|██████████████████████████████████████████▍                        | 473445/748119 [00:04<06:50, 668.92 examples/s]Saving the dataset (82/128 shards):  64%|███████████████████████████████████████████▏                       | 482290/748119 [00:04<06:37, 668.92 examples/s]Saving the dataset (83/128 shards):  66%|████████████████████████████████████████████                       | 492135/748119 [00:04<06:22, 668.92 examples/s]Saving the dataset (84/128 shards):  67%|████████████████████████████████████████████▌                      | 497980/748119 [00:04<06:13, 668.92 examples/s]Saving the dataset (85/128 shards):  67%|████████████████████████████████████████████▊                      | 500825/748119 [00:04<06:09, 668.92 examples/s]Saving the dataset (86/128 shards):  67%|█████████████████████████████████████████████                      | 502670/748119 [00:04<06:06, 668.92 examples/s]Saving the dataset (87/128 shards):  68%|█████████████████████████████████████████████▌                     | 508515/748119 [00:04<05:58, 668.92 examples/s]Saving the dataset (88/128 shards):  69%|██████████████████████████████████████████████▎                    | 517359/748119 [00:04<05:44, 668.92 examples/s]Saving the dataset (89/128 shards):  70%|██████████████████████████████████████████████▊                    | 523203/748119 [00:04<05:36, 668.92 examples/s]Saving the dataset (90/128 shards):  71%|███████████████████████████████████████████████▍                   | 529047/748119 [00:04<05:27, 668.92 examples/s]Saving the dataset (91/128 shards):  71%|███████████████████████████████████████████████▋                   | 531891/748119 [00:04<05:23, 668.92 examples/s]Saving the dataset (92/128 shards):  72%|████████████████████████████████████████████████▍                  | 540735/748119 [00:04<05:10, 668.92 examples/s]Saving the dataset (93/128 shards):  73%|█████████████████████████████████████████████████                  | 547579/748119 [00:04<04:59, 668.92 examples/s]Saving the dataset (94/128 shards):  73%|█████████████████████████████████████████████████▏                 | 549423/748119 [00:04<04:57, 668.92 examples/s]Saving the dataset (95/128 shards):  74%|█████████████████████████████████████████████████▋                 | 555267/748119 [00:04<04:48, 668.92 examples/s]Saving the dataset (96/128 shards):  75%|██████████████████████████████████████████████████▌                | 564111/748119 [00:04<04:35, 668.92 examples/s]Saving the dataset (97/128 shards):  76%|██████████████████████████████████████████████████▊                | 566955/748119 [00:04<04:30, 668.92 examples/s]Saving the dataset (98/128 shards):  77%|███████████████████████████████████████████████████▌               | 575799/748119 [00:04<04:17, 668.92 examples/s]Saving the dataset (99/128 shards):  78%|████████████████████████████████████████████████████               | 581643/748119 [00:04<04:08, 668.92 examples/s]Saving the dataset (100/128 shards):  79%|███████████████████████████████████████████████████▊              | 587487/748119 [00:04<04:00, 668.92 examples/s]Saving the dataset (101/128 shards):  79%|████████████████████████████████████████████████████▎             | 593331/748119 [00:04<03:51, 668.92 examples/s]Saving the dataset (102/128 shards):  80%|████████████████████████████████████████████████████▊             | 599175/748119 [00:04<03:42, 668.92 examples/s]Saving the dataset (103/128 shards):  81%|█████████████████████████████████████████████████████▍            | 605019/748119 [00:04<03:33, 668.92 examples/s]Saving the dataset (104/128 shards):  82%|█████████████████████████████████████████████████████▉            | 611863/748119 [00:04<03:23, 668.92 examples/s]Saving the dataset (105/128 shards):  82%|██████████████████████████████████████████████████████▏           | 613707/748119 [00:04<03:20, 668.92 examples/s]Saving the dataset (106/128 shards):  84%|███████████████████████████████████████████████████████▎          | 626551/748119 [00:04<03:01, 668.92 examples/s]Saving the dataset (107/128 shards):  84%|███████████████████████████████████████████████████████▍          | 628395/748119 [00:04<02:58, 668.92 examples/s]Saving the dataset (108/128 shards):  84%|███████████████████████████████████████████████████████▋          | 631239/748119 [00:04<02:54, 668.92 examples/s]Saving the dataset (109/128 shards):  86%|████████████████████████████████████████████████████████▍         | 640083/748119 [00:04<02:41, 668.92 examples/s]Saving the dataset (110/128 shards):  87%|█████████████████████████████████████████████████████████▎        | 649927/748119 [00:04<02:26, 668.92 examples/s]Saving the dataset (111/128 shards):  87%|█████████████████████████████████████████████████████████▌        | 651771/748119 [00:04<02:24, 668.92 examples/s]Saving the dataset (112/128 shards):  88%|██████████████████████████████████████████████████████████        | 657615/748119 [00:04<02:15, 668.92 examples/s]Saving the dataset (113/128 shards):  89%|██████████████████████████████████████████████████████████▊       | 666459/748119 [00:04<02:02, 668.92 examples/s]Saving the dataset (114/128 shards):  90%|███████████████████████████████████████████████████████████▎      | 672147/748119 [00:04<01:53, 668.92 examples/s]Saving the dataset (115/128 shards):  90%|███████████████████████████████████████████████████████████▎      | 672147/748119 [00:04<01:53, 668.92 examples/s]Saving the dataset (116/128 shards):  91%|████████████████████████████████████████████████████████████▎     | 682991/748119 [00:04<01:37, 668.92 examples/s]Saving the dataset (117/128 shards):  92%|████████████████████████████████████████████████████████████▊     | 689835/748119 [00:04<01:27, 668.92 examples/s]Saving the dataset (118/128 shards):  93%|█████████████████████████████████████████████████████████████▍    | 696679/748119 [00:04<01:16, 668.92 examples/s]Saving the dataset (119/128 shards):  93%|█████████████████████████████████████████████████████████████▌    | 698523/748119 [00:04<01:14, 668.92 examples/s]Saving the dataset (120/128 shards):  95%|██████████████████████████████████████████████████████████████▍   | 707367/748119 [00:04<01:00, 668.92 examples/s]Saving the dataset (121/128 shards):  95%|███████████████████████████████████████████████████████████████   | 714211/748119 [00:04<00:50, 668.92 examples/s]Saving the dataset (122/128 shards):  96%|███████████████████████████████████████████████████████████████▌  | 721055/748119 [00:04<00:40, 668.92 examples/s]Saving the dataset (123/128 shards):  97%|███████████████████████████████████████████████████████████████▊  | 723899/748119 [00:04<00:36, 668.92 examples/s]Saving the dataset (124/128 shards):  97%|███████████████████████████████████████████████████████████████▉  | 724743/748119 [00:04<00:34, 668.92 examples/s]Saving the dataset (125/128 shards):  98%|████████████████████████████████████████████████████████████████▋ | 733587/748119 [00:04<00:21, 668.92 examples/s]Saving the dataset (125/128 shards):  99%|██████████████████████████████████████████████████████████████▎| 740431/748119 [00:04<00:00, 227547.30 examples/s]Saving the dataset (126/128 shards):  99%|██████████████████████████████████████████████████████████████▎| 740431/748119 [00:04<00:00, 227547.30 examples/s]Saving the dataset (127/128 shards):  99%|██████████████████████████████████████████████████████████████▋| 744275/748119 [00:04<00:00, 227547.30 examples/s]Saving the dataset (128/128 shards): 100%|███████████████████████████████████████████████████████████████| 748119/748119 [00:04<00:00, 227547.30 examples/s]Saving the dataset (128/128 shards): 100%|███████████████████████████████████████████████████████████████| 748119/748119 [00:04<00:00, 158325.36 examples/s]
[2026-05-21 05:26:30,874] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:420] [PID:3208933] total_num_tokens: 41_357_188
[2026-05-21 05:26:33,832] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:438] [PID:3208933] `total_supervised_tokens: 4_704_021`
[2026-05-21 05:26:33,833] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:521] [PID:3208933] total_num_steps: 58447
[2026-05-21 05:26:33,833] [INFO] [axolotl.utils.data.sft._prepare_standard_dataset:121] [PID:3208933] Maximum number of steps set at 58447
[2026-05-21 05:26:33,887] [DEBUG] [axolotl.train.setup_model_and_tokenizer:70] [PID:3208933] loading tokenizer... Qwen/Qwen3-1.7B-Base
[2026-05-21 05:26:35,407] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:307] [PID:3208933] EOS: 151645 / <|im_end|>
[2026-05-21 05:26:35,407] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:308] [PID:3208933] BOS: None / None
[2026-05-21 05:26:35,407] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:309] [PID:3208933] PAD: 151643 / <|endoftext|>
[2026-05-21 05:26:35,407] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:310] [PID:3208933] UNK: None / None
[2026-05-21 05:26:35,408] [DEBUG] [axolotl.train.setup_model_and_tokenizer:81] [PID:3208933] Loading model
[2026-05-21 05:26:35,615] [DEBUG] [axolotl.monkeypatch.torchao_optim.patch_torchao_optim_state_8bit:75] [PID:3208933] Patched OptimState8bit for torch.compile compatibility
[2026-05-21 05:26:35,615] [DEBUG] [axolotl.monkeypatch.torchao_optim.patch_torchao_optim_state_8bit:122] [PID:3208933] Patched OptimState4bit for torch.compile compatibility
[2026-05-21 05:26:35,615] [DEBUG] [axolotl.monkeypatch.torchao_optim.patch_torchao_optim_state_8bit:154] [PID:3208933] Patched OptimStateFp8 for torch.compile compatibility
[2026-05-21 05:26:35,621] [DEBUG] [axolotl.monkeypatch.transformers.trainer_loss_calc.patch_evaluation_loop:94] [PID:3208933] Patched Trainer.evaluation_loop with nanmean loss calculation
[2026-05-21 05:26:35,622] [DEBUG] [axolotl.monkeypatch.transformers.trainer_loss_calc.patch_maybe_log_save_evaluate:148] [PID:3208933] Patched Trainer._maybe_log_save_evaluate with nanmean loss calculation
Loading weights:   0%|                                                                                                              | 0/310 [00:00<?, ?it/s]Loading weights: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████| 310/310 [00:00<00:00, 10198.32it/s]
[2026-05-21 05:26:36,736] [DEBUG] [axolotl.loaders.model.log_gpu_memory_usage:127] [PID:3208933] Memory usage after model load 0.000GB ()
[2026-05-21 05:26:41,122] [INFO] [axolotl.train.save_initial_configs:421] [PID:3208933] Pre-saving tokenizer to ./checkpoint/Qwen3-1.7B-nq_text_compressed-with_pseudo-lr1e-4-10epochs...
[2026-05-21 05:26:41,234] [INFO] [axolotl.train.save_initial_configs:426] [PID:3208933] Pre-saving model config to ./checkpoint/Qwen3-1.7B-nq_text_compressed-with_pseudo-lr1e-4-10epochs...
[2026-05-21 05:26:41,238] [INFO] [axolotl.train.execute_training:222] [PID:3208933] Starting trainer...
[34m[1mwandb[0m: [wandb.login()] Loaded credentials for https://api.wandb.ai from /mnt/raid0/home/abner/.netrc.
[34m[1mwandb[0m: Currently logged in as: [33mabnerden0803[0m ([33mabnerden0803-national-taiwan-university[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: [38;5;178m⢿[0m Waiting for wandb.init()...
[Am[2K[34m[1mwandb[0m: [38;5;178m⣻[0m setting up run 0zmanqq0 (0.2s)
[Am[2K[34m[1mwandb[0m: [38;5;178m⣽[0m setting up run 0zmanqq0 (0.2s)
[Am[2K[34m[1mwandb[0m: [38;5;178m⣾[0m setting up run 0zmanqq0 (0.2s)
[Am[2K[34m[1mwandb[0m: [38;5;178m⣷[0m setting up run 0zmanqq0 (0.2s)
[Am[2K[34m[1mwandb[0m: [38;5;178m⣯[0m setting up run 0zmanqq0 (0.2s)
[Am[2K[34m[1mwandb[0m: [38;5;178m⣟[0m setting up run 0zmanqq0 (0.7s)
[Am[2K[34m[1mwandb[0m: [38;5;178m⡿[0m setting up run 0zmanqq0 (0.7s)
[Am[2K[34m[1mwandb[0m: Tracking run with wandb version 0.26.0
[34m[1mwandb[0m: Run data is saved locally in [35m[1m/mnt/raid0/home/abner/git/ICLGR/wandb/run-20260521_052641-0zmanqq0[0m
[34m[1mwandb[0m: Run [1m`wandb offline`[0m to turn off syncing.
[34m[1mwandb[0m: Syncing run [33mqwen3-1.7b-nq_text_compressed-pseudo-lr1e-4-10epochs[0m
[34m[1mwandb[0m: ⭐️ View project at [34m[4mhttps://wandb.ai/abnerden0803-national-taiwan-university/ICLGR-NQ[0m
[34m[1mwandb[0m: 🚀 View run at [34m[4mhttps://wandb.ai/abnerden0803-national-taiwan-university/ICLGR-NQ/runs/0zmanqq0[0m
[34m[1mwandb[0m: [33mWARNING[0m Saving files without folders. If you want to preserve subdirectories pass base_path to wandb.save, i.e. wandb.save("/mnt/folder/file.h5", base_path="/mnt")
[34m[1mwandb[0m: [33mWARNING[0m Symlinked 1 file into the W&B run directory; call wandb.save again to sync new files.
[2026-05-21 05:26:44,641] [INFO] [axolotl.utils.callbacks.on_train_begin:757] [PID:3208933] The Axolotl config has been saved to the WandB run under files.
  0%|                                                                                                                             | 0/58447 [00:00<?, ?it/s]  0%|                                                                                                                  | 1/58447 [00:06<98:43:15,  6.08s/it]  0%|                                                                                                                  | 2/58447 [00:11<94:24:48,  5.82s/it]  0%|                                                                                                                  | 3/58447 [00:17<93:47:10,  5.78s/it]  0%|                                                                                                                  | 4/58447 [00:22<92:02:08,  5.67s/it]  0%|                                                                                                                  | 5/58447 [00:28<92:38:38,  5.71s/it]  0%|                                                                                                                  | 6/58447 [00:34<94:00:50,  5.79s/it]  0%|                                                                                                                  | 7/58447 [00:40<93:43:16,  5.77s/it]  0%|                                                                                                                  | 8/58447 [00:45<92:30:08,  5.70s/it]  0%|                                                                                                                  | 9/58447 [00:51<91:27:17,  5.63s/it]  0%|                                                                                                                 | 10/58447 [00:56<90:37:10,  5.58s/it]  0%|                                                                                                                 | 11/58447 [01:01<88:10:08,  5.43s/it]  0%|                                                                                                                 | 12/58447 [01:07<88:15:30,  5.44s/it]  0%|                                                                                                                 | 13/58447 [01:13<89:00:47,  5.48s/it]  0%|                                                                                                                 | 14/58447 [01:18<89:45:36,  5.53s/it]  0%|                                                                                                                 | 15/58447 [01:24<89:33:02,  5.52s/it]  0%|                                                                                                                 | 16/58447 [01:29<88:36:17,  5.46s/it]  0%|                                                                                                                 | 17/58447 [01:35<89:16:27,  5.50s/it]  0%|                                                                                                                 | 18/58447 [01:40<88:16:16,  5.44s/it]  0%|                                                                                                                 | 19/58447 [01:46<89:30:06,  5.51s/it]  0%|                                                                                                                 | 20/58447 [01:51<89:40:46,  5.53s/it]  0%|                                                                                                                 | 21/58447 [01:57<89:37:15,  5.52s/it]  0%|                                                                                                                 | 22/58447 [02:02<89:52:48,  5.54s/it]  0%|                                                                                                                 | 23/58447 [02:08<89:11:26,  5.50s/it]  0%|                                                                                                                 | 24/58447 [02:13<89:04:53,  5.49s/it]  0%|                                                                                                                 | 25/58447 [02:19<88:48:49,  5.47s/it]  0%|                                                                                                                 | 26/58447 [02:24<89:27:22,  5.51s/it]  0%|                                                                                                                 | 27/58447 [02:29<88:20:42,  5.44s/it]  0%|                                                                                                                 | 28/58447 [02:35<89:40:57,  5.53s/it]  0%|                                                                                                                 | 29/58447 [02:41<89:54:19,  5.54s/it]  0%|                                                                                                                 | 30/58447 [02:46<90:36:04,  5.58s/it]  0%|                                                                                                                 | 31/58447 [02:51<85:52:50,  5.29s/it]  0%|                                                                                                                 | 32/58447 [02:56<82:03:32,  5.06s/it]  0%|                                                                                                                 | 33/58447 [03:00<79:50:40,  4.92s/it]  0%|                                                                                                                 | 34/58447 [03:05<78:08:31,  4.82s/it]  0%|                                                                                                                 | 35/58447 [03:09<76:27:18,  4.71s/it]  0%|                                                                                                                 | 36/58447 [03:14<75:21:20,  4.64s/it]  0%|                                                                                                                 | 37/58447 [03:18<76:05:25,  4.69s/it]  0%|                                                                                                                 | 38/58447 [03:24<78:25:20,  4.83s/it]  0%|                                                                                                                 | 39/58447 [03:29<81:19:58,  5.01s/it]  0%|                                                                                                                 | 40/58447 [03:34<83:31:22,  5.15s/it]  0%|                                                                                                                 | 41/58447 [03:39<82:24:26,  5.08s/it]  0%|                                                                                                                 | 42/58447 [03:45<84:59:51,  5.24s/it]  0%|                                                                                                                 | 43/58447 [03:50<84:47:41,  5.23s/it]  0%|                                                                                                                 | 44/58447 [03:56<85:12:25,  5.25s/it]  0%|                                                                                                                 | 45/58447 [04:01<86:10:28,  5.31s/it]  0%|                                                                                                                 | 46/58447 [04:06<86:38:05,  5.34s/it]  0%|                                                                                                                 | 47/58447 [04:12<87:19:03,  5.38s/it]  0%|                                                                                                                 | 48/58447 [04:17<87:38:39,  5.40s/it]  0%|                                                                                                                 | 49/58447 [04:23<89:55:22,  5.54s/it]  0%|                                                                                                                 | 50/58447 [04:29<89:21:57,  5.51s/it]                                                                                                                                                            {'loss': '6.238', 'grad_norm': '2.27', 'learning_rate': '8.385e-07', 'ppl': '512', 'memory/max_active (GiB)': '19.7', 'memory/max_allocated (GiB)': '19.7', 'memory/device_reserved (GiB)': '20.59', 'tokens/train_per_sec_per_gpu': '4.694', 'tokens/total': 800316, 'tokens/trainable': 40263, 'epoch': '0.008555'}
  0%|                                                                                                                 | 50/58447 [04:29<89:21:57,  5.51s/it]  0%|                                                                                                                 | 51/58447 [04:34<89:04:29,  5.49s/it]  0%|                                                                                                                 | 52/58447 [04:40<88:48:15,  5.47s/it]  0%|                                                                                                                 | 53/58447 [04:45<90:09:01,  5.56s/it]  0%|                                                                                                                 | 54/58447 [04:51<90:12:42,  5.56s/it]  0%|                                                                                                                 | 55/58447 [04:56<87:51:41,  5.42s/it]Process Process-2:
Traceback (most recent call last):
  File "/usr/lib/python3.12/multiprocessing/process.py", line 314, in _bootstrap
    self.run()
  File "/usr/lib/python3.12/multiprocessing/process.py", line 108, in run
    self._target(*self._args, **self._kwargs)
  File "/mnt/raid0/home/abner/git/conv-gr-new-dataset/.venv/lib/python3.12/site-packages/torch/utils/data/_utils/worker.py", line 315, in _worker_loop
    r = index_queue.get(timeout=MP_STATUS_CHECK_INTERVAL)
        ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/lib/python3.12/multiprocessing/queues.py", line 113, in get
    if not self._poll(timeout):
           ^^^^^^^^^^^^^^^^^^^
  File "/usr/lib/python3.12/multiprocessing/connection.py", line 257, in poll
    return self._poll(timeout)
           ^^^^^^^^^^^^^^^^^^^
  File "/usr/lib/python3.12/multiprocessing/connection.py", line 440, in _poll
    r = wait([self], timeout)
        ^^^^^^^^^^^^^^^^^^^^^
  File "/usr/lib/python3.12/multiprocessing/connection.py", line 1136, in wait
    ready = selector.select(timeout)
            ^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/lib/python3.12/selectors.py", line 415, in select
    fd_event_list = self._selector.poll(timeout)
                    ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/mnt/raid0/home/abner/git/conv-gr-new-dataset/.venv/lib/python3.12/site-packages/axolotl/train.py", line 175, in <lambda>
    lambda signum, frame: terminate_handler(signum, frame, _model_weakref),
                          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/mnt/raid0/home/abner/git/conv-gr-new-dataset/.venv/lib/python3.12/site-packages/axolotl/train.py", line 167, in terminate_handler
    _model.save_pretrained(cfg.output_dir)
  File "/mnt/raid0/home/abner/git/conv-gr-new-dataset/.venv/lib/python3.12/site-packages/transformers/modeling_utils.py", line 3352, in save_pretrained
    state_dict = remove_tied_weights_from_state_dict(state_dict, model_to_save)
                 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/mnt/raid0/home/abner/git/conv-gr-new-dataset/.venv/lib/python3.12/site-packages/transformers/modeling_utils.py", line 438, in remove_tied_weights_from_state_dict
    shared_names, disjoint_names = _find_disjoint(shared_ptrs.values(), state_dict)
                                   ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/mnt/raid0/home/abner/git/conv-gr-new-dataset/.venv/lib/python3.12/site-packages/transformers/modeling_utils.py", line 352, in _find_disjoint
    areas.append((tensor.data_ptr(), _end_ptr(tensor), name))
                                     ^^^^^^^^^^^^^^^^
  File "/mnt/raid0/home/abner/git/conv-gr-new-dataset/.venv/lib/python3.12/site-packages/transformers/modeling_utils.py", line 328, in _end_ptr
    stop = tensor.view(-1)[-1].data_ptr() + tensor.element_size()
           ~~~~~~~~~~~~~~~^^^^
torch.AcceleratorError: CUDA error: initialization error
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.

Process Process-1:
Traceback (most recent call last):
  File "/usr/lib/python3.12/multiprocessing/process.py", line 314, in _bootstrap
    self.run()
  File "/usr/lib/python3.12/multiprocessing/process.py", line 108, in run
    self._target(*self._args, **self._kwargs)
  File "/mnt/raid0/home/abner/git/conv-gr-new-dataset/.venv/lib/python3.12/site-packages/torch/utils/data/_utils/worker.py", line 315, in _worker_loop
    r = index_queue.get(timeout=MP_STATUS_CHECK_INTERVAL)
        ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/lib/python3.12/multiprocessing/queues.py", line 113, in get
    if not self._poll(timeout):
           ^^^^^^^^^^^^^^^^^^^
  File "/usr/lib/python3.12/multiprocessing/connection.py", line 257, in poll
    return self._poll(timeout)
           ^^^^^^^^^^^^^^^^^^^
  File "/usr/lib/python3.12/multiprocessing/connection.py", line 440, in _poll
    r = wait([self], timeout)
        ^^^^^^^^^^^^^^^^^^^^^
  File "/usr/lib/python3.12/multiprocessing/connection.py", line 1136, in wait
    ready = selector.select(timeout)
            ^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/lib/python3.12/selectors.py", line 415, in select
    fd_event_list = self._selector.poll(timeout)
                    ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/mnt/raid0/home/abner/git/conv-gr-new-dataset/.venv/lib/python3.12/site-packages/axolotl/train.py", line 175, in <lambda>
    lambda signum, frame: terminate_handler(signum, frame, _model_weakref),
                          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/mnt/raid0/home/abner/git/conv-gr-new-dataset/.venv/lib/python3.12/site-packages/axolotl/train.py", line 167, in terminate_handler
    _model.save_pretrained(cfg.output_dir)
  File "/mnt/raid0/home/abner/git/conv-gr-new-dataset/.venv/lib/python3.12/site-packages/transformers/modeling_utils.py", line 3352, in save_pretrained
    state_dict = remove_tied_weights_from_state_dict(state_dict, model_to_save)
                 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/mnt/raid0/home/abner/git/conv-gr-new-dataset/.venv/lib/python3.12/site-packages/transformers/modeling_utils.py", line 438, in remove_tied_weights_from_state_dict
    shared_names, disjoint_names = _find_disjoint(shared_ptrs.values(), state_dict)
                                   ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/mnt/raid0/home/abner/git/conv-gr-new-dataset/.venv/lib/python3.12/site-packages/transformers/modeling_utils.py", line 352, in _find_disjoint
    areas.append((tensor.data_ptr(), _end_ptr(tensor), name))
                                     ^^^^^^^^^^^^^^^^
  File "/mnt/raid0/home/abner/git/conv-gr-new-dataset/.venv/lib/python3.12/site-packages/transformers/modeling_utils.py", line 328, in _end_ptr
    stop = tensor.view(-1)[-1].data_ptr() + tensor.element_size()
           ~~~~~~~~~~~~~~~^^^^
torch.AcceleratorError: CUDA error: initialization error
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


Writing model shards:   0%|                                                                                                           | 0/1 [00:00<?, ?it/s][A

Writing model shards:   0%|                                                                                                           | 0/1 [00:00<?, ?it/s][A[A

Writing model shards: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:13<00:00, 13.44s/it][A[AWriting model shards: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:13<00:00, 13.44s/it]
Exception ignored in: <generator object tqdm.__iter__ at 0x74400ae771c0>
Traceback (most recent call last):
  File "/mnt/raid0/home/abner/git/conv-gr-new-dataset/.venv/lib/python3.12/site-packages/tqdm/std.py", line 1196, in __iter__
    self.close()
  File "/mnt/raid0/home/abner/git/conv-gr-new-dataset/.venv/lib/python3.12/site-packages/tqdm/std.py", line 1265, in close
    def close(self):

  File "/mnt/raid0/home/abner/git/conv-gr-new-dataset/.venv/lib/python3.12/site-packages/torch/utils/data/_utils/signal_handling.py", line 73, in handler
    _error_if_any_worker_fails()
RuntimeError: DataLoader worker (pid 3210856) exited unexpectedly with exit code 1. Details are lost due to multiprocessing. Rerunning with num_workers=0 may give better error trace.
Writing model shards:   0%|                                                                                                           | 0/1 [00:14<?, ?it/s]
  0%|                                                                                                                 | 55/58447 [05:11<91:52:41,  5.66s/it]