[2026-05-21 05:26:08,359] [DEBUG] [axolotl.utils.config.log_gpu_memory_usage:127] [PID:3208933] baseline 0.000GB () [2026-05-21 05:26:08,361] [INFO] [axolotl.cli.config.load_cfg:341] [PID:3208933] config: { "activation_offloading": false, "axolotl_config_path": "configs/axolotl_qwen3-1.7b_nq-text-title.yml", "base_model": "Qwen/Qwen3-1.7B-Base", "base_model_config": "Qwen/Qwen3-1.7B-Base", "batch_size": 128, "bf16": true, "capabilities": { "bf16": true, "compute_capability": "sm_120", "fp8": true, "n_gpu": 1, "n_node": 1, "tf32": true }, "context_parallel_size": 1, "dataloader_num_workers": 2, "dataset_num_proc": 128, "datasets": [ { "chat_template": "tokenizer_default_fallback_chatml", "field_messages": "conversations", "message_property_mappings": { "content": "content", "role": "role" }, "path": "nq_text_compressed_axolotl/train_with_pseudo_axolotl.jsonl", "roles": { "assistant": [ "assistant", "gpt", "model" ], "system": [ "system" ], "user": [ "user", "human" ] }, "trust_remote_code": false, "type": "chat_template" } ], "ddp": false, "device": "cuda:0", "dion_rank_fraction": 1.0, "dion_rank_multiple_of": 1, "eaft_alpha": 1.0, "eaft_k": 20, "env_capabilities": { "torch_version": "2.8.0" }, "eval_batch_size": 4, "eval_causal_lm_metrics": [ "sacrebleu", "comet", "ter", "chrf" ], "eval_max_new_tokens": 128, "eval_table_size": 0, "experimental_skip_move_to_device": true, "flash_attention": false, "flex_attention": false, "fp16": false, "generate_samples": false, "generation_do_sample": true, "generation_max_new_tokens": 50, "generation_prompt_ratio": 0.5, "generation_temperature": 0.7, "gradient_accumulation_steps": 32, "gradient_checkpointing": true, "gradient_checkpointing_kwargs": { "use_reentrant": true }, "include_tkps": true, "layer_offloading": false, "learning_rate": 0.0001, "lisa_layers_attribute": "model.layers", "load_best_model_at_end": false, "load_in_4bit": false, "load_in_8bit": false, "local_rank": 0, "logging_steps": 50, "lora_dropout": 0.0, "loraplus_lr_embedding": 1e-06, "lr_scheduler": "cosine", "mean_resizing_embeddings": false, "merge_method": "memory_efficient", "micro_batch_size": 4, "model_config_type": "qwen3", "num_epochs": 10.0, "num_generation_samples": 3, "optimizer": "adamw_torch", "otel_metrics_host": "localhost", "otel_metrics_port": 8000, "output_dir": "./checkpoint/Qwen3-1.7B-nq_text_compressed-with_pseudo-lr1e-4-10epochs", "pad_to_sequence_len": false, "pretrain_multipack_attn": true, "profiler_steps_start": 0, "qlora_sharded_model_loading": false, "quantize_moe_experts": false, "ray_num_workers": 1, "resources_per_worker": { "GPU": 1 }, "sample_packing": false, "sample_packing_bin_size": 200, "sample_packing_group_size": 100000, "save_only_model": false, "save_safetensors": true, "save_strategy": "epoch", "save_total_limit": 3, "sdp_attention": true, "sequence_len": 512, "shuffle_before_merging_datasets": false, "shuffle_merged_datasets": true, "skip_prepare_dataset": false, "special_tokens": { "eos_token": "<|im_end|>" }, "streaming_multipack_buffer_size": 10000, "strict": false, "tensor_parallel_size": 1, "tf32": false, "tiled_mlp_use_original_mlp": true, "tokenizer_config": "Qwen/Qwen3-1.7B-Base", "tokenizer_save_jinja_files": true, "torch_dtype": "torch.bfloat16", "train_on_inputs": false, "trl": { "async_prefetch": false, "log_completions": false, "mask_truncated_completions": false, "ref_model_mixup_alpha": 0.9, "ref_model_sync_steps": 64, "replay_buffer_size": 0, "replay_recompute_logps": true, "reroll_max_groups": 1, "reroll_start_fraction": 1.0, "reward_num_workers": 1, "scale_rewards": true, "skip_zero_advantage_batches": true, "sync_ref_model": false, "use_data_producer": false, "use_vllm": false, "vllm_lora_sync": false, "vllm_server_host": "0.0.0.0", "vllm_server_port": 8000 }, "use_otel_metrics": false, "use_ray": false, "use_wandb": true, "val_set_size": 0.0, "vllm": { "device": "auto", "dtype": "auto", "gpu_memory_utilization": 0.9, "host": "0.0.0.0", "port": 8000 }, "wandb_entity": "abnerden0803-national-taiwan-university", "wandb_name": "qwen3-1.7b-nq_text_compressed-pseudo-lr1e-4-10epochs", "wandb_project": "ICLGR-NQ", "warmup_ratio": 0.1, "weight_decay": 0.0, "world_size": 1, "xformers_attention": false } [2026-05-21 05:26:10,257] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:307] [PID:3208933] EOS: 151645 / <|im_end|> [2026-05-21 05:26:10,257] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:308] [PID:3208933] BOS: None / None [2026-05-21 05:26:10,257] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:309] [PID:3208933] PAD: 151643 / <|endoftext|> [2026-05-21 05:26:10,257] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:310] [PID:3208933] UNK: None / None [2026-05-21 05:26:10,259] [INFO] [axolotl.utils.data.shared.load_preprocessed_dataset:480] [PID:3208933] Unable to find prepared dataset in last_run_prepared/a8d61713fe28909dcab9370999e181f6 [2026-05-21 05:26:10,259] [INFO] [axolotl.utils.data.sft._load_raw_datasets:320] [PID:3208933] Loading raw datasets... [2026-05-21 05:26:10,259] [WARNING] [axolotl.utils.data.sft._load_raw_datasets:322] [PID:3208933] Processing datasets during training can lead to VRAM instability. Please pre-process your dataset using `axolotl preprocess path/to/config.yml`. [2026-05-21 05:26:10,953] [INFO] [axolotl.utils.data.wrappers.get_dataset_wrapper:87] [PID:3208933] Loading dataset: nq_text_compressed_axolotl/train_with_pseudo_axolotl.jsonl with base_type: chat_template and prompt_style: None [2026-05-21 05:26:10,956] [INFO] [axolotl.prompt_strategies.chat_template.__call__:998] [PID:3208933] Using chat template: --- {%- if tools %} {{- '<|im_start|>system\n' }} {%- if messages[0].role == 'system' %} {{- messages[0].content + '\n\n' }} {%- endif %} {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within XML tags:\n" }} {%- for tool in tools %} {{- "\n" }} {{- tool | tojson }} {%- endfor %} {{- "\n\n\nFor each function call, return a json object with function name and arguments within XML tags:\n\n{\"name\": , \"arguments\": }\n<|im_end|>\n" }} {%- else %} {%- if messages[0].role == 'system' %} {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }} {%- endif %} {%- endif %} {%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %} {%- for message in messages[::-1] %} {%- set index = (messages|length - 1) - loop.index0 %} {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('') and message.content.endswith('')) %} {%- set ns.multi_step_tool = false %} {%- set ns.last_query_index = index %} {%- endif %} {%- endfor %} {%- for message in messages %} {%- if (message.role == "user") or (message.role == "system" and not loop.first) %} {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }} {%- elif message.role == "assistant" %} {%- set content = message.content %} {%- set reasoning_content = '' %} {%- if message.reasoning_content is defined and message.reasoning_content is not none %} {%- set reasoning_content = message.reasoning_content %} {%- else %} {%- if '' in message.content %} {%- set content = message.content.split('')[-1].lstrip('\n') %} {%- set reasoning_content = message.content.split('')[0].rstrip('\n').split('')[-1].lstrip('\n') %} {%- endif %} {%- endif %} {%- if loop.index0 > ns.last_query_index %} {%- if loop.last or (not loop.last and reasoning_content) %} {{- '<|im_start|>' + message.role + '\n\n' + reasoning_content.strip('\n') + '\n\n\n' + content.lstrip('\n') }} {%- else %} {{- '<|im_start|>' + message.role + '\n' + content }} {%- endif %} {%- else %} {{- '<|im_start|>' + message.role + '\n' + content }} {%- endif %} {%- if message.tool_calls %} {%- for tool_call in message.tool_calls %} {%- if (loop.first and content) or (not loop.first) %} {{- '\n' }} {%- endif %} {%- if tool_call.function %} {%- set tool_call = tool_call.function %} {%- endif %} {{- '\n{"name": "' }} {{- tool_call.name }} {{- '", "arguments": ' }} {%- if tool_call.arguments is string %} {{- tool_call.arguments }} {%- else %} {{- tool_call.arguments | tojson }} {%- endif %} {{- '}\n' }} {%- endfor %} {%- endif %} {{- '<|im_end|>\n' }} {%- elif message.role == "tool" %} {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %} {{- '<|im_start|>user' }} {%- endif %} {{- '\n\n' }} {{- message.content }} {{- '\n' }} {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %} {{- '<|im_end|>\n' }} {%- endif %} {%- endif %} {%- endfor %} {%- if add_generation_prompt %} {{- '<|im_start|>assistant\n' }} {%- if enable_thinking is defined and enable_thinking is false %} {{- '\n\n\n\n' }} {%- endif %} {%- endif %} --- [2026-05-21 05:26:17,621] [INFO] [axolotl.utils.data.utils._log_dataset_stats:212] [PID:3208933] min_input_len: 16 [2026-05-21 05:26:17,621] [INFO] [axolotl.utils.data.utils._log_dataset_stats:213] [PID:3208933] max_input_len: 987 Dropping Invalid Sequences (512) (num_proc=128): 0%| | 0/748586 [00:00512) (num_proc=128): 0%| | 2000/748586 [00:01<08:11, 1519.14 examples/s] Dropping Invalid Sequences (512) (num_proc=128): 3%|█▍ | 25000/748586 [00:01<00:30, 23716.64 examples/s] Dropping Invalid Sequences (512) (num_proc=128): 7%|██▉ | 51849/748586 [00:01<00:13, 53251.21 examples/s] Dropping Invalid Sequences (512) (num_proc=128): 10%|████ | 71396/748586 [00:01<00:10, 67423.15 examples/s] Dropping Invalid Sequences (512) (num_proc=128): 12%|█████ | 88792/748586 [00:01<00:09, 69990.23 examples/s] Dropping Invalid Sequences (512) (num_proc=128): 15%|██████ | 108641/748586 [00:02<00:07, 85445.81 examples/s] Dropping Invalid Sequences (512) (num_proc=128): 16%|██████▉ | 123188/748586 [00:02<00:06, 94851.39 examples/s] Dropping Invalid Sequences (512) (num_proc=128): 19%|███████▌ | 138584/748586 [00:02<00:05, 103872.86 examples/s] Dropping Invalid Sequences (512) (num_proc=128): 58%|███████████████████████▊ | 435162/748586 [00:02<00:00, 726531.03 examples/s] Dropping Invalid Sequences (512) (num_proc=128): 97%|██████████████████████████████████████▋ | 723258/748586 [00:02<00:00, 1225601.15 examples/s] Dropping Invalid Sequences (512) (num_proc=128): 100%|█████████████████████████████████████████| 748586/748586 [00:02<00:00, 249715.33 examples/s] [2026-05-21 05:26:21,561] [INFO] [axolotl.utils.data.utils._drop_outside_range:306] [PID:3208933] Dropped 467 sequences outside valid range ([None, 512]) Saving the dataset (0/128 shards): 0%| | 0/748119 [00:00 [2026-05-21 05:26:35,407] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:308] [PID:3208933] BOS: None / None [2026-05-21 05:26:35,407] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:309] [PID:3208933] PAD: 151643 / <|endoftext|> [2026-05-21 05:26:35,407] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:310] [PID:3208933] UNK: None / None [2026-05-21 05:26:35,408] [DEBUG] [axolotl.train.setup_model_and_tokenizer:81] [PID:3208933] Loading model [2026-05-21 05:26:35,615] [DEBUG] [axolotl.monkeypatch.torchao_optim.patch_torchao_optim_state_8bit:75] [PID:3208933] Patched OptimState8bit for torch.compile compatibility [2026-05-21 05:26:35,615] [DEBUG] [axolotl.monkeypatch.torchao_optim.patch_torchao_optim_state_8bit:122] [PID:3208933] Patched OptimState4bit for torch.compile compatibility [2026-05-21 05:26:35,615] [DEBUG] [axolotl.monkeypatch.torchao_optim.patch_torchao_optim_state_8bit:154] [PID:3208933] Patched OptimStateFp8 for torch.compile compatibility [2026-05-21 05:26:35,621] [DEBUG] [axolotl.monkeypatch.transformers.trainer_loss_calc.patch_evaluation_loop:94] [PID:3208933] Patched Trainer.evaluation_loop with nanmean loss calculation [2026-05-21 05:26:35,622] [DEBUG] [axolotl.monkeypatch.transformers.trainer_loss_calc.patch_maybe_log_save_evaluate:148] [PID:3208933] Patched Trainer._maybe_log_save_evaluate with nanmean loss calculation Loading weights: 0%| | 0/310 [00:00 lambda signum, frame: terminate_handler(signum, frame, _model_weakref), ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/mnt/raid0/home/abner/git/conv-gr-new-dataset/.venv/lib/python3.12/site-packages/axolotl/train.py", line 167, in terminate_handler _model.save_pretrained(cfg.output_dir) File "/mnt/raid0/home/abner/git/conv-gr-new-dataset/.venv/lib/python3.12/site-packages/transformers/modeling_utils.py", line 3352, in save_pretrained state_dict = remove_tied_weights_from_state_dict(state_dict, model_to_save) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/mnt/raid0/home/abner/git/conv-gr-new-dataset/.venv/lib/python3.12/site-packages/transformers/modeling_utils.py", line 438, in remove_tied_weights_from_state_dict shared_names, disjoint_names = _find_disjoint(shared_ptrs.values(), state_dict) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/mnt/raid0/home/abner/git/conv-gr-new-dataset/.venv/lib/python3.12/site-packages/transformers/modeling_utils.py", line 352, in _find_disjoint areas.append((tensor.data_ptr(), _end_ptr(tensor), name)) ^^^^^^^^^^^^^^^^ File "/mnt/raid0/home/abner/git/conv-gr-new-dataset/.venv/lib/python3.12/site-packages/transformers/modeling_utils.py", line 328, in _end_ptr stop = tensor.view(-1)[-1].data_ptr() + tensor.element_size() ~~~~~~~~~~~~~~~^^^^ torch.AcceleratorError: CUDA error: initialization error CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. For debugging consider passing CUDA_LAUNCH_BLOCKING=1 Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. Process Process-1: Traceback (most recent call last): File "/usr/lib/python3.12/multiprocessing/process.py", line 314, in _bootstrap self.run() File "/usr/lib/python3.12/multiprocessing/process.py", line 108, in run self._target(*self._args, **self._kwargs) File "/mnt/raid0/home/abner/git/conv-gr-new-dataset/.venv/lib/python3.12/site-packages/torch/utils/data/_utils/worker.py", line 315, in _worker_loop r = index_queue.get(timeout=MP_STATUS_CHECK_INTERVAL) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/usr/lib/python3.12/multiprocessing/queues.py", line 113, in get if not self._poll(timeout): ^^^^^^^^^^^^^^^^^^^ File "/usr/lib/python3.12/multiprocessing/connection.py", line 257, in poll return self._poll(timeout) ^^^^^^^^^^^^^^^^^^^ File "/usr/lib/python3.12/multiprocessing/connection.py", line 440, in _poll r = wait([self], timeout) ^^^^^^^^^^^^^^^^^^^^^ File "/usr/lib/python3.12/multiprocessing/connection.py", line 1136, in wait ready = selector.select(timeout) ^^^^^^^^^^^^^^^^^^^^^^^^ File "/usr/lib/python3.12/selectors.py", line 415, in select fd_event_list = self._selector.poll(timeout) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/mnt/raid0/home/abner/git/conv-gr-new-dataset/.venv/lib/python3.12/site-packages/axolotl/train.py", line 175, in lambda signum, frame: terminate_handler(signum, frame, _model_weakref), ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/mnt/raid0/home/abner/git/conv-gr-new-dataset/.venv/lib/python3.12/site-packages/axolotl/train.py", line 167, in terminate_handler _model.save_pretrained(cfg.output_dir) File "/mnt/raid0/home/abner/git/conv-gr-new-dataset/.venv/lib/python3.12/site-packages/transformers/modeling_utils.py", line 3352, in save_pretrained state_dict = remove_tied_weights_from_state_dict(state_dict, model_to_save) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/mnt/raid0/home/abner/git/conv-gr-new-dataset/.venv/lib/python3.12/site-packages/transformers/modeling_utils.py", line 438, in remove_tied_weights_from_state_dict shared_names, disjoint_names = _find_disjoint(shared_ptrs.values(), state_dict) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/mnt/raid0/home/abner/git/conv-gr-new-dataset/.venv/lib/python3.12/site-packages/transformers/modeling_utils.py", line 352, in _find_disjoint areas.append((tensor.data_ptr(), _end_ptr(tensor), name)) ^^^^^^^^^^^^^^^^ File "/mnt/raid0/home/abner/git/conv-gr-new-dataset/.venv/lib/python3.12/site-packages/transformers/modeling_utils.py", line 328, in _end_ptr stop = tensor.view(-1)[-1].data_ptr() + tensor.element_size() ~~~~~~~~~~~~~~~^^^^ torch.AcceleratorError: CUDA error: initialization error CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. For debugging consider passing CUDA_LAUNCH_BLOCKING=1 Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. Writing model shards: 0%| | 0/1 [00:00 Traceback (most recent call last): File "/mnt/raid0/home/abner/git/conv-gr-new-dataset/.venv/lib/python3.12/site-packages/tqdm/std.py", line 1196, in __iter__ self.close() File "/mnt/raid0/home/abner/git/conv-gr-new-dataset/.venv/lib/python3.12/site-packages/tqdm/std.py", line 1265, in close def close(self): File "/mnt/raid0/home/abner/git/conv-gr-new-dataset/.venv/lib/python3.12/site-packages/torch/utils/data/_utils/signal_handling.py", line 73, in handler _error_if_any_worker_fails() RuntimeError: DataLoader worker (pid 3210856) exited unexpectedly with exit code 1. Details are lost due to multiprocessing. Rerunning with num_workers=0 may give better error trace. Writing model shards: 0%| | 0/1 [00:14