diff --git "a/debug.log" "b/debug.log" --- "a/debug.log" +++ "b/debug.log" @@ -1,12 +1,12 @@ -[2025-12-22 16:46:29,510] [DEBUG] [axolotl.utils.config.log_gpu_memory_usage:127] [PID:37090] baseline 0.000GB () -[2025-12-22 16:46:29,513] [INFO] [axolotl.cli.config.load_cfg:248] [PID:37090] config: +[2025-12-23 14:21:13,443] [DEBUG] [axolotl.utils.config.log_gpu_memory_usage:127] [PID:4046] baseline 0.000GB () +[2025-12-23 14:21:13,444] [INFO] [axolotl.cli.config.load_cfg:248] [PID:4046] config: { "activation_offloading": false, "auto_resume_from_checkpoints": true, - "axolotl_config_path": "config_template_ded.yaml", + "axolotl_config_path": "config.yaml", "base_model": "Qwen/Qwen3-4B-Instruct-2507", "base_model_config": "Qwen/Qwen3-4B-Instruct-2507", - "batch_size": 64, + "batch_size": 32, "bf16": true, "capabilities": { "bf16": true, @@ -15,11 +15,12 @@ "n_gpu": 1, "n_node": 1 }, + "chat_template": "qwen3", "context_parallel_size": 1, "dataloader_num_workers": 1, "dataloader_pin_memory": true, "dataloader_prefetch_factor": 256, - "dataset_processes": 32, + "dataset_processes": 24, "datasets": [ { "chat_template": "tokenizer_default", @@ -51,7 +52,7 @@ "env_capabilities": { "torch_version": "2.7.1" }, - "eval_batch_size": 8, + "eval_batch_size": 2, "eval_causal_lm_metrics": [ "sacrebleu", "comet", @@ -59,12 +60,11 @@ "chrf" ], "eval_max_new_tokens": 128, - "eval_sample_packing": true, "eval_table_size": 0, "experimental_skip_move_to_device": true, "flash_attention": true, "fp16": false, - "gradient_accumulation_steps": 8, + "gradient_accumulation_steps": 16, "gradient_checkpointing": true, "gradient_checkpointing_kwargs": { "use_reentrant": true @@ -73,7 +73,7 @@ "is_falcon_derived_model": false, "is_llama_derived_model": false, "is_mistral_derived_model": false, - "learning_rate": 2e-05, + "learning_rate": 5e-06, "lisa_layers_attribute": "model.layers", "load_best_model_at_end": false, "load_in_4bit": false, @@ -83,12 +83,11 @@ "loraplus_lr_embedding": 1e-06, "lr_scheduler": "cosine", "mean_resizing_embeddings": false, - "micro_batch_size": 8, + "micro_batch_size": 2, "model_config_type": "qwen3", - "num_epochs": 3.0, + "num_epochs": 10.0, "optimizer": "adamw_torch_fused", "output_dir": "./outputs/qwen3-4b-instruct-ded-full-train", - "pad_to_sequence_len": true, "pretrain_multipack_attn": true, "profiler_steps_start": 0, "qlora_sharded_model_loading": false, @@ -96,12 +95,12 @@ "resources_per_worker": { "GPU": 1 }, - "sample_packing": true, + "sample_packing": false, "sample_packing_bin_size": 200, "sample_packing_group_size": 100000, "save_only_model": false, "save_safetensors": true, - "sequence_len": 2048, + "sequence_len": 4096, "shuffle_before_merging_datasets": false, "shuffle_merged_datasets": true, "skip_prepare_dataset": false, @@ -140,16 +139,273 @@ "weight_decay": 0.01, "world_size": 1 } -[2025-12-22 16:46:31,168] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:278] [PID:37090] EOS: 151645 / <|im_end|> -[2025-12-22 16:46:31,169] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:279] [PID:37090] BOS: None / None -[2025-12-22 16:46:31,170] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:280] [PID:37090] PAD: 151643 / <|endoftext|> -[2025-12-22 16:46:31,172] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:281] [PID:37090] UNK: None / None -[2025-12-22 16:46:31,177] [INFO] [axolotl.utils.data.shared.load_preprocessed_dataset:476] [PID:37090] Unable to find prepared dataset in last_run_prepared/6f0bc810b1537ff654f206db06f6f760 -[2025-12-22 16:46:31,179] [INFO] [axolotl.utils.data.sft._load_raw_datasets:320] [PID:37090] Loading raw datasets... -[2025-12-22 16:46:31,180] [WARNING] [axolotl.utils.data.sft._load_raw_datasets:322] [PID:37090] Processing datasets during training can lead to VRAM instability. Please pre-process your dataset using `axolotl preprocess path/to/config.yml`. - Generating train split: 0 examples [00:00, ? examples/s] Generating train split: 6593 examples [00:00, 56190.99 examples/s] Generating train split: 13162 examples [00:00, 45202.78 examples/s] Generating train split: 22995 examples [00:00, 46638.45 examples/s] Generating train split: 23109 examples [00:02, 8002.16 examples/s] -[2025-12-22 16:46:34,571] [INFO] [axolotl.utils.data.wrappers.get_dataset_wrapper:87] [PID:37090] Loading dataset: ./dataset/ded_full_train.jsonl with base_type: chat_template and prompt_style: None -[2025-12-22 16:46:34,591] [INFO] [axolotl.prompt_strategies.chat_template.__call__:969] [PID:37090] Using chat template: +[2025-12-23 14:21:13,859] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:278] [PID:4046] EOS: 151645 / <|im_end|> +[2025-12-23 14:21:13,859] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:279] [PID:4046] BOS: None / None +[2025-12-23 14:21:13,859] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:280] [PID:4046] PAD: 151643 / <|endoftext|> +[2025-12-23 14:21:13,860] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:281] [PID:4046] UNK: None / None +[2025-12-23 14:21:13,860] [INFO] [axolotl.utils.data.shared.load_preprocessed_dataset:476] [PID:4046] Unable to find prepared dataset in last_run_prepared/a600686cba93a5478a612e625b2dc31c +[2025-12-23 14:21:13,860] [INFO] [axolotl.utils.data.sft._load_raw_datasets:320] [PID:4046] Loading raw datasets... +[2025-12-23 14:21:13,861] [WARNING] [axolotl.utils.data.sft._load_raw_datasets:322] [PID:4046] Processing datasets during training can lead to VRAM instability. Please pre-process your dataset using `axolotl preprocess path/to/config.yml`. + Generating train split: 0 examples [00:00, ? examples/s][2025-12-23 14:21:14,245] [ERROR] [datasets.packaged_modules.json.json._generate_tables:162] [PID:4046] Failed to load JSON from file '/workspace/fine-tuning/dataset/ded_full_train.jsonl' with error : JSON parse error: Missing a closing quotation mark in string. in row 39 + Generating train split: 9862 examples [00:00, 30701.94 examples/s] Generating train split: 9862 examples [00:00, 30637.90 examples/s] +Traceback (most recent call last): + File "/root/miniconda3/envs/py3.11/lib/python3.11/site-packages/datasets/packaged_modules/json/json.py", line 160, in _generate_tables + df = pandas_read_json(f) + ^^^^^^^^^^^^^^^^^^^ + File "/root/miniconda3/envs/py3.11/lib/python3.11/site-packages/datasets/packaged_modules/json/json.py", line 38, in pandas_read_json + return pd.read_json(path_or_buf, **kwargs) + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + File "/root/miniconda3/envs/py3.11/lib/python3.11/site-packages/pandas/io/json/_json.py", line 815, in read_json + return json_reader.read() + ^^^^^^^^^^^^^^^^^^ + File "/root/miniconda3/envs/py3.11/lib/python3.11/site-packages/pandas/io/json/_json.py", line 1014, in read + obj = self._get_object_parser(self.data) + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + File "/root/miniconda3/envs/py3.11/lib/python3.11/site-packages/pandas/io/json/_json.py", line 1040, in _get_object_parser + obj = FrameParser(json, **kwargs).parse() + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + File "/root/miniconda3/envs/py3.11/lib/python3.11/site-packages/pandas/io/json/_json.py", line 1176, in parse + self._parse() + File "/root/miniconda3/envs/py3.11/lib/python3.11/site-packages/pandas/io/json/_json.py", line 1392, in _parse + ujson_loads(json, precise_float=self.precise_float), dtype=None + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +ValueError: Trailing data + +During handling of the above exception, another exception occurred: + +Traceback (most recent call last): + File "/root/miniconda3/envs/py3.11/lib/python3.11/site-packages/datasets/builder.py", line 1815, in _prepare_split_single + for _, table in generator: + File "/root/miniconda3/envs/py3.11/lib/python3.11/site-packages/datasets/packaged_modules/json/json.py", line 163, in _generate_tables + raise e + File "/root/miniconda3/envs/py3.11/lib/python3.11/site-packages/datasets/packaged_modules/json/json.py", line 137, in _generate_tables + pa_table = paj.read_json( + ^^^^^^^^^^^^^^ + File "pyarrow/_json.pyx", line 342, in pyarrow._json.read_json + File "pyarrow/error.pxi", line 155, in pyarrow.lib.pyarrow_internal_check_status + File "pyarrow/error.pxi", line 92, in pyarrow.lib.check_status +pyarrow.lib.ArrowInvalid: JSON parse error: Missing a closing quotation mark in string. in row 39 + +The above exception was the direct cause of the following exception: + +Traceback (most recent call last): + File "", line 198, in _run_module_as_main + File "", line 88, in _run_code + File "/workspace/axolotl/src/axolotl/cli/train.py", line 121, in + fire.Fire(do_cli) + File "/root/miniconda3/envs/py3.11/lib/python3.11/site-packages/fire/core.py", line 135, in Fire + component_trace = _Fire(component, args, parsed_flag_args, context, name) + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + File "/root/miniconda3/envs/py3.11/lib/python3.11/site-packages/fire/core.py", line 468, in _Fire + component, remaining_args = _CallAndUpdateTrace( + ^^^^^^^^^^^^^^^^^^^^ + File "/root/miniconda3/envs/py3.11/lib/python3.11/site-packages/fire/core.py", line 684, in _CallAndUpdateTrace + component = fn(*varargs, **kwargs) + ^^^^^^^^^^^^^^^^^^^^^^ + File "/workspace/axolotl/src/axolotl/cli/train.py", line 88, in do_cli + return do_train(parsed_cfg, parsed_cli_args) + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + File "/workspace/axolotl/src/axolotl/cli/train.py", line 43, in do_train + dataset_meta = load_datasets(cfg=cfg, cli_args=cli_args) + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + File "/workspace/axolotl/src/axolotl/common/datasets.py", line 59, in load_datasets + train_dataset, eval_dataset, total_num_steps, prompters = prepare_datasets( + ^^^^^^^^^^^^^^^^^ + File "/workspace/axolotl/src/axolotl/utils/data/utils.py", line 50, in wrapper + return func(*args, **kwargs) + ^^^^^^^^^^^^^^^^^^^^^ + File "/workspace/axolotl/src/axolotl/utils/data/sft.py", line 65, in prepare_datasets + return _prepare_standard_dataset(cfg, tokenizer, processor) + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + File "/workspace/axolotl/src/axolotl/utils/data/sft.py", line 98, in _prepare_standard_dataset + train_dataset, eval_dataset, prompters = loader.load(_load_datasets) + ^^^^^^^^^^^^^^^^^^^^^^^^^^^ + File "/workspace/axolotl/src/axolotl/utils/data/lock.py", line 38, in load + result = load_fn() + ^^^^^^^^^ + File "/workspace/axolotl/src/axolotl/utils/data/sft.py", line 77, in _load_datasets + train_dataset, eval_dataset, prompters = _load_and_prepare_datasets( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^ + File "/workspace/axolotl/src/axolotl/utils/data/sft.py", line 503, in _load_and_prepare_datasets + dataset, prompters = _load_tokenized_prepared_datasets( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + File "/workspace/axolotl/src/axolotl/utils/data/sft.py", line 299, in _load_tokenized_prepared_datasets + dataset, prompters = _load_raw_datasets( + ^^^^^^^^^^^^^^^^^^^ + File "/workspace/axolotl/src/axolotl/utils/data/sft.py", line 331, in _load_raw_datasets + dataset_wrapper, dataset_prompter = _load_and_process_single_dataset( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + File "/workspace/axolotl/src/axolotl/utils/data/sft.py", line 374, in _load_and_process_single_dataset + dataset = load_dataset_with_config( + ^^^^^^^^^^^^^^^^^^^^^^^^^ + File "/workspace/axolotl/src/axolotl/utils/data/shared.py", line 118, in load_dataset_with_config + return _load_from_local_path(dataset_config, load_dataset_kwargs) + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + File "/workspace/axolotl/src/axolotl/utils/data/shared.py", line 242, in _load_from_local_path + return load_dataset( + ^^^^^^^^^^^^^ + File "/root/miniconda3/envs/py3.11/lib/python3.11/site-packages/datasets/load.py", line 1412, in load_dataset + builder_instance.download_and_prepare( + File "/root/miniconda3/envs/py3.11/lib/python3.11/site-packages/datasets/builder.py", line 894, in download_and_prepare + self._download_and_prepare( + File "/root/miniconda3/envs/py3.11/lib/python3.11/site-packages/datasets/builder.py", line 970, in _download_and_prepare + self._prepare_split(split_generator, **prepare_split_kwargs) + File "/root/miniconda3/envs/py3.11/lib/python3.11/site-packages/datasets/builder.py", line 1702, in _prepare_split + for job_id, done, content in self._prepare_split_single( + File "/root/miniconda3/envs/py3.11/lib/python3.11/site-packages/datasets/builder.py", line 1858, in _prepare_split_single + raise DatasetGenerationError("An error occurred while generating the dataset") from e +datasets.exceptions.DatasetGenerationError: An error occurred while generating the dataset +[2025-12-23 14:25:44,642] [DEBUG] [axolotl.utils.config.log_gpu_memory_usage:127] [PID:5529] baseline 0.000GB () +[2025-12-23 14:25:44,643] [INFO] [axolotl.cli.config.load_cfg:248] [PID:5529] config: +{ + "activation_offloading": false, + "auto_resume_from_checkpoints": true, + "axolotl_config_path": "config.yaml", + "base_model": "Qwen/Qwen3-4B-Instruct-2507", + "base_model_config": "Qwen/Qwen3-4B-Instruct-2507", + "batch_size": 32, + "bf16": true, + "capabilities": { + "bf16": true, + "compute_capability": "sm_90", + "fp8": false, + "n_gpu": 1, + "n_node": 1 + }, + "chat_template": "qwen3", + "context_parallel_size": 1, + "dataloader_num_workers": 1, + "dataloader_pin_memory": true, + "dataloader_prefetch_factor": 256, + "dataset_processes": 24, + "datasets": [ + { + "chat_template": "tokenizer_default", + "field_messages": "conversations", + "message_property_mappings": { + "content": "value", + "role": "from" + }, + "path": "./dataset/ded_full_train.jsonl", + "roles": { + "assistant": [ + "gpt" + ], + "system": [ + "system" + ], + "user": [ + "human" + ] + }, + "trust_remote_code": false, + "type": "chat_template" + } + ], + "ddp": false, + "device": "cuda:0", + "dion_rank_fraction": 1.0, + "dion_rank_multiple_of": 1, + "env_capabilities": { + "torch_version": "2.7.1" + }, + "eval_batch_size": 2, + "eval_causal_lm_metrics": [ + "sacrebleu", + "comet", + "ter", + "chrf" + ], + "eval_max_new_tokens": 128, + "eval_table_size": 0, + "experimental_skip_move_to_device": true, + "flash_attention": true, + "fp16": false, + "gradient_accumulation_steps": 16, + "gradient_checkpointing": true, + "gradient_checkpointing_kwargs": { + "use_reentrant": true + }, + "include_tkps": true, + "is_falcon_derived_model": false, + "is_llama_derived_model": false, + "is_mistral_derived_model": false, + "learning_rate": 5e-06, + "lisa_layers_attribute": "model.layers", + "load_best_model_at_end": false, + "load_in_4bit": false, + "load_in_8bit": false, + "local_rank": 0, + "lora_dropout": 0.0, + "loraplus_lr_embedding": 1e-06, + "lr_scheduler": "cosine", + "mean_resizing_embeddings": false, + "micro_batch_size": 2, + "model_config_type": "qwen3", + "num_epochs": 10.0, + "optimizer": "adamw_torch_fused", + "output_dir": "./outputs/qwen3-4b-instruct-ded-full-train", + "pretrain_multipack_attn": true, + "profiler_steps_start": 0, + "qlora_sharded_model_loading": false, + "ray_num_workers": 1, + "resources_per_worker": { + "GPU": 1 + }, + "sample_packing": false, + "sample_packing_bin_size": 200, + "sample_packing_group_size": 100000, + "save_only_model": false, + "save_safetensors": true, + "sequence_len": 4096, + "shuffle_before_merging_datasets": false, + "shuffle_merged_datasets": true, + "skip_prepare_dataset": false, + "streaming_multipack_buffer_size": 10000, + "strict": false, + "tensor_parallel_size": 1, + "tf32": true, + "tiled_mlp_use_original_mlp": true, + "tokenizer_config": "Qwen/Qwen3-4B-Instruct-2507", + "tokenizer_save_jinja_files": true, + "tokenizer_type": "AutoTokenizer", + "torch_dtype": "torch.bfloat16", + "train_on_inputs": false, + "trl": { + "log_completions": false, + "mask_truncated_completions": false, + "ref_model_mixup_alpha": 0.9, + "ref_model_sync_steps": 64, + "scale_rewards": true, + "sync_ref_model": false, + "use_vllm": false, + "vllm_server_host": "0.0.0.0", + "vllm_server_port": 8000 + }, + "type_of_model": "AutoModelForCausalLM", + "use_ray": false, + "val_set_size": 0.0, + "vllm": { + "device": "auto", + "dtype": "auto", + "gpu_memory_utilization": 0.9, + "host": "0.0.0.0", + "port": 8000 + }, + "warmup_steps": 100, + "weight_decay": 0.01, + "world_size": 1 +} +[2025-12-23 14:25:45,085] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:278] [PID:5529] EOS: 151645 / <|im_end|> +[2025-12-23 14:25:45,085] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:279] [PID:5529] BOS: None / None +[2025-12-23 14:25:45,085] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:280] [PID:5529] PAD: 151643 / <|endoftext|> +[2025-12-23 14:25:45,086] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:281] [PID:5529] UNK: None / None +[2025-12-23 14:25:45,086] [INFO] [axolotl.utils.data.shared.load_preprocessed_dataset:476] [PID:5529] Unable to find prepared dataset in last_run_prepared/a600686cba93a5478a612e625b2dc31c +[2025-12-23 14:25:45,087] [INFO] [axolotl.utils.data.sft._load_raw_datasets:320] [PID:5529] Loading raw datasets... +[2025-12-23 14:25:45,087] [WARNING] [axolotl.utils.data.sft._load_raw_datasets:322] [PID:5529] Processing datasets during training can lead to VRAM instability. Please pre-process your dataset using `axolotl preprocess path/to/config.yml`. + Generating train split: 0 examples [00:00, ? examples/s] Generating train split: 13162 examples [00:00, 87476.32 examples/s] Generating train split: 15985 examples [00:00, 67991.23 examples/s] +[2025-12-23 14:25:45,433] [INFO] [axolotl.utils.data.wrappers.get_dataset_wrapper:87] [PID:5529] Loading dataset: ./dataset/ded_full_train.jsonl with base_type: chat_template and prompt_style: None +[2025-12-23 14:25:45,442] [INFO] [axolotl.prompt_strategies.chat_template.__call__:969] [PID:5529] Using chat template: --- {%- if tools %} {{- '<|im_start|>system\n' }} @@ -167,16 +423,37 @@ {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }} {%- endif %} {%- endif %} -{%- for message in messages %} - {%- if message.content is string %} - {%- set content = message.content %} - {%- else %} - {%- set content = '' %} +{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %} +{%- for message in messages[::-1] %} + {%- set index = (messages|length - 1) - loop.index0 %} + {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('') and message.content.endswith('')) %} + {%- set ns.multi_step_tool = false %} + {%- set ns.last_query_index = index %} {%- endif %} +{%- endfor %} +{%- for message in messages %} {%- if (message.role == "user") or (message.role == "system" and not loop.first) %} - {{- '<|im_start|>' + message.role + '\n' + content + '<|im_end|>' + '\n' }} + {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }} {%- elif message.role == "assistant" %} - {{- '<|im_start|>' + message.role + '\n' + content }} + {%- set content = message.content %} + {%- set reasoning_content = '' %} + {%- if message.reasoning_content is defined and message.reasoning_content is not none %} + {%- set reasoning_content = message.reasoning_content %} + {%- else %} + {%- if '' in message.content %} + {%- set content = message.content.split('')[-1].lstrip('\n') %} + {%- set reasoning_content = message.content.split('')[0].rstrip('\n').split('')[-1].lstrip('\n') %} + {%- endif %} + {%- endif %} + {%- if loop.index0 > ns.last_query_index %} + {%- if loop.last or (not loop.last and reasoning_content) %} + {{- '<|im_start|>' + message.role + '\n\n' + reasoning_content.strip('\n') + '\n\n\n' + content.lstrip('\n') }} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} {%- if message.tool_calls %} {%- for tool_call in message.tool_calls %} {%- if (loop.first and content) or (not loop.first) %} @@ -202,7 +479,7 @@ {{- '<|im_start|>user' }} {%- endif %} {{- '\n\n' }} - {{- content }} + {{- message.content }} {{- '\n' }} {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %} {{- '<|im_end|>\n' }} @@ -211,272 +488,553 @@ {%- endfor %} {%- if add_generation_prompt %} {{- '<|im_start|>assistant\n' }} + {%- if enable_thinking is defined and enable_thinking is false %} + {{- '\n\n\n\n' }} + {%- else %} + {{- '\n\n' }} + {%- endif %} {%- endif %} + --- - Tokenizing Prompts (num_proc=32): 0%| | 0/23109 [00:002048) (num_proc=32): 0%| | 0/23109 [00:002048) (num_proc=32): 3%|██▊ | 723/23109 [00:01<00:38, 574.20 examples/s] Dropping Long Sequences (>2048) (num_proc=32): 72%|████████████████████████████████████████████████████████████▍ | 16611/23109 [00:01<00:00, 16634.34 examples/s] Dropping Long Sequences (>2048) (num_proc=32): 100%|████████████████████████████████████████████████████████████████████████████████████| 23109/23109 [00:01<00:00, 14091.79 examples/s] -[2025-12-22 16:46:45,321] [WARNING] [axolotl.utils.data.utils.handle_long_seq_in_dataset:260] [PID:37090] Dropped 282 samples from dataset - Drop Samples with Zero Trainable Tokens (num_proc=32): 0%| | 0/22827 [00:00 -[2025-12-22 16:46:58,204] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:279] [PID:37090] BOS: None / None -[2025-12-22 16:46:58,209] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:280] [PID:37090] PAD: 151643 / <|endoftext|> -[2025-12-22 16:46:58,212] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:281] [PID:37090] UNK: None / None -[2025-12-22 16:46:58,215] [DEBUG] [axolotl.train.setup_model_and_tokenizer:74] [PID:37090] Loading model -[2025-12-22 16:46:58,478] [DEBUG] [axolotl.monkeypatch.transformers.trainer_loss_calc.patch_evaluation_loop:87] [PID:37090] Patched Trainer.evaluation_loop with nanmean loss calculation -[2025-12-22 16:46:58,484] [DEBUG] [axolotl.monkeypatch.transformers.trainer_loss_calc.patch_maybe_log_save_evaluate:138] [PID:37090] Patched Trainer._maybe_log_save_evaluate with nanmean loss calculation -[2025-12-22 16:46:58,486] [INFO] [axolotl.loaders.patch_manager._apply_multipack_patches:301] [PID:37090] Applying multipack dataloader patch for sample packing... - Loading checkpoint shards: 0%| | 0/3 [00:004096) (num_proc=24): 0%| | 0/15985 [00:004096) (num_proc=24): 4%|▊ | 667/15985 [00:00<00:10, 1501.98 examples/s] Dropping Long Sequences (>4096) (num_proc=24): 96%|████████████████▎| 15319/15985 [00:00<00:00, 32317.27 examples/s] Dropping Long Sequences (>4096) (num_proc=24): 100%|█████████████████| 15985/15985 [00:00<00:00, 22266.76 examples/s] +[2025-12-23 14:25:52,104] [WARNING] [axolotl.utils.data.utils.handle_long_seq_in_dataset:260] [PID:5529] Dropped 8 samples from dataset + Saving the dataset (0/24 shards): 0%| | 0/15977 [00:00 +[2025-12-23 14:25:53,219] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:279] [PID:5529] BOS: None / None +[2025-12-23 14:25:53,219] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:280] [PID:5529] PAD: 151643 / <|endoftext|> +[2025-12-23 14:25:53,220] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:281] [PID:5529] UNK: None / None +[2025-12-23 14:25:53,220] [DEBUG] [axolotl.train.setup_model_and_tokenizer:74] [PID:5529] Loading model +[2025-12-23 14:25:53,267] [DEBUG] [axolotl.monkeypatch.transformers.trainer_loss_calc.patch_evaluation_loop:87] [PID:5529] Patched Trainer.evaluation_loop with nanmean loss calculation +[2025-12-23 14:25:53,269] [DEBUG] [axolotl.monkeypatch.transformers.trainer_loss_calc.patch_maybe_log_save_evaluate:138] [PID:5529] Patched Trainer._maybe_log_save_evaluate with nanmean loss calculation + model.safetensors.index.json: 0.00B [00:00, ?B/s] model.safetensors.index.json: 32.8kB [00:00, 187MB/s] + model-00001-of-00003.safetensors: 0%| | 0.00/3.96G [00:00