diff --git "a/debug.log" "b/debug.log" --- "a/debug.log" +++ "b/debug.log" @@ -1,26 +1,27 @@ -[2025-10-18 19:02:01,879] [DEBUG] [axolotl.utils.config.log_gpu_memory_usage:127] [PID:42363] baseline 0.000GB () -[2025-10-18 19:02:01,880] [INFO] [axolotl.cli.config.load_cfg:248] [PID:42363] config: +[2026-03-30 13:38:13,335] [DEBUG] [axolotl.utils.config.log_gpu_memory_usage:127] [PID:37135] baseline 0.000GB () +[2026-03-30 13:38:13,336] [INFO] [axolotl.cli.config.load_cfg:341] [PID:37135] config: { "activation_offloading": false, "adapter": "qlora", - "axolotl_config_path": "mrCuddle-stream.yaml", + "axolotl_config_path": "rp-sft_Attention-Block-Only_Test.yml", "base_model": "google/gemma-2-2b-it", "base_model_config": "google/gemma-2-2b-it", "batch_size": 8, "bf16": true, "capabilities": { "bf16": true, - "compute_capability": "sm_86", - "fp8": false, + "compute_capability": "sm_100", + "fp8": true, "n_gpu": 1, - "n_node": 1 + "n_node": 1, + "tf32": true }, "context_parallel_size": 1, "dataloader_num_workers": 1, "dataloader_pin_memory": true, "dataloader_prefetch_factor": 256, - "dataset_num_proc": 12, - "dataset_prepared_path": "last_run_prepared", + "dataset_num_proc": 28, + "dataset_prepared_path": "/workspace/axolotl/last_run_prepared", "datasets": [ { "chat_template": "jinja", @@ -31,7 +32,7 @@ "content": "value", "role": "from" }, - "path": "AiAF/conversations", + "path": ".", "roles_to_train": [ "assistant" ], @@ -45,8 +46,10 @@ "device": "cuda:0", "dion_rank_fraction": 1.0, "dion_rank_multiple_of": 1, + "eaft_alpha": 1.0, + "eaft_k": 20, "env_capabilities": { - "torch_version": "2.7.1" + "torch_version": "2.9.1" }, "eot_tokens": [ "" @@ -63,10 +66,14 @@ "eval_steps": 50, "eval_strategy": "steps", "eval_table_size": 0, - "evaluation_strategy": "steps", "experimental_skip_move_to_device": true, "flash_attention": true, "fp16": false, + "generate_samples": false, + "generation_do_sample": true, + "generation_max_new_tokens": 50, + "generation_prompt_ratio": 0.5, + "generation_temperature": 0.7, "gradient_accumulation_steps": 4, "gradient_checkpointing": true, "gradient_checkpointing_kwargs": { @@ -78,6 +85,7 @@ "is_falcon_derived_model": false, "is_llama_derived_model": false, "is_mistral_derived_model": false, + "layer_offloading": false, "learning_rate": 0.0002, "lisa_layers_attribute": "model.layers", "load_best_model_at_end": false, @@ -87,26 +95,39 @@ "logging_steps": 1, "lora_alpha": 128, "lora_dropout": 0.05, + "lora_embedding_kernel": true, + "lora_mlp_kernel": true, + "lora_o_kernel": true, + "lora_qkv_kernel": true, "lora_r": 64, - "lora_target_linear": true, + "lora_target_linear": false, + "lora_target_modules": [ + "q_proj", + "k_proj", + "v_proj", + "o_proj" + ], "loraplus_lr_embedding": 1e-06, "lr_scheduler": "cosine", "max_steps": 1000, "mean_resizing_embeddings": false, + "merge_method": "memory_efficient", "micro_batch_size": 2, "model_config_type": "gemma2", "num_epochs": 1.0, + "num_generation_samples": 3, "optimizer": "adamw_bnb_8bit", - "output_dir": "./outputs/sft/gemma-2-2b-it-rp-sft-qlora", - "pad_to_sequence_len": true, + "otel_metrics_host": "localhost", + "otel_metrics_port": 8000, + "output_dir": "/workspace/data/axolotl-outputs/sft/gemma-2-2b-it-rp-sft-qlora", "pretrain_multipack_attn": true, "profiler_steps_start": 0, "qlora_sharded_model_loading": false, + "quantize_moe_experts": false, "ray_num_workers": 1, "resources_per_worker": { "GPU": 1 }, - "sample_packing": true, "sample_packing_bin_size": 200, "sample_packing_group_size": 100000, "save_only_model": false, @@ -123,7 +144,6 @@ "eos_token": "", "pad_token": "" }, - "streaming": true, "streaming_multipack_buffer_size": 10000, "strict": false, "tensor_parallel_size": 1, @@ -131,14 +151,12 @@ { "chat_template": "jinja", "chat_template_jinja": "{{ bos_token }}\n{% for m in messages %}\n {% set role = 'model' if m['role']=='assistant' else 'user' %}\n {{ '' + role + '\\n' + m['content'] | trim + '\\n' }}\n{% endfor %}\n{% if add_generation_prompt %}\n{{ 'model\\n' }}\n{% endif %}\n", - "data_files": "eval-datasets/shuf-1000_conversations_V2.jsonl", "field_messages": "conversations", "message_property_mappings": { "content": "value", "role": "from" }, - "name": "json", - "path": ".", + "path": "eval-datasets/shuf-1000_conversations_V3.jsonl", "roles_to_train": [ "assistant" ], @@ -155,17 +173,27 @@ "torch_dtype": "torch.bfloat16", "train_on_inputs": false, "trl": { + "async_prefetch": false, "log_completions": false, "mask_truncated_completions": false, "ref_model_mixup_alpha": 0.9, "ref_model_sync_steps": 64, + "replay_buffer_size": 0, + "replay_recompute_logps": true, + "reroll_max_groups": 1, + "reroll_start_fraction": 1.0, + "reward_num_workers": 1, "scale_rewards": true, + "skip_zero_advantage_batches": true, "sync_ref_model": false, + "use_data_producer": false, "use_vllm": false, + "vllm_lora_sync": false, "vllm_server_host": "0.0.0.0", "vllm_server_port": 8000 }, "type_of_model": "AutoModelForCausalLM", + "use_otel_metrics": false, "use_ray": false, "use_wandb": true, "val_set_size": 0.0, @@ -177,242 +205,22 @@ "port": 8000 }, "wandb_log_model": "false", - "wandb_name": "gemma-2-2b-it-rp-sft-qlora", + "wandb_name": "Attention-Block-Only_Test", "wandb_project": "rp-sft", - "wandb_run_id": "gemma-2-2b-it-rp-sft-qlora", + "wandb_run_id": "Attention-Block-Only_Test", + "warmup_ratio": 0.03, "weight_decay": 0.0, "world_size": 1 } -[2025-10-18 19:02:03,610] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:278] [PID:42363] EOS: 1 / -[2025-10-18 19:02:03,610] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:279] [PID:42363] BOS: 2 / -[2025-10-18 19:02:03,610] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:280] [PID:42363] PAD: 0 / -[2025-10-18 19:02:03,610] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:281] [PID:42363] UNK: 3 / -[2025-10-18 19:02:17,503] [INFO] [axolotl.utils.data.shared.load_preprocessed_dataset:470] [PID:42363] Loading prepared dataset from disk at last_run_prepared/323978649404d0f4da7e1f3e2dc7b3de... -[2025-10-18 19:02:17,508] [DEBUG] [axolotl.train.setup_model_and_tokenizer:65] [PID:42363] Loading tokenizer... google/gemma-2-2b-it -[2025-10-18 19:02:18,978] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:278] [PID:42363] EOS: 1 / -[2025-10-18 19:02:18,978] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:279] [PID:42363] BOS: 2 / -[2025-10-18 19:02:18,978] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:280] [PID:42363] PAD: 0 / -[2025-10-18 19:02:18,979] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:281] [PID:42363] UNK: 3 / -[2025-10-18 19:02:18,979] [DEBUG] [axolotl.train.setup_model_and_tokenizer:74] [PID:42363] Loading model -[2025-10-18 19:02:19,156] [DEBUG] [axolotl.monkeypatch.transformers.trainer_loss_calc.patch_evaluation_loop:87] [PID:42363] Patched Trainer.evaluation_loop with nanmean loss calculation -[2025-10-18 19:02:19,159] [DEBUG] [axolotl.monkeypatch.transformers.trainer_loss_calc.patch_maybe_log_save_evaluate:138] [PID:42363] Patched Trainer._maybe_log_save_evaluate with nanmean loss calculation -[2025-10-18 19:02:19,160] [INFO] [axolotl.loaders.patch_manager._apply_multipack_patches:301] [PID:42363] Applying multipack dataloader patch for sample packing... - Loading checkpoint shards: 0%| | 0/2 [00:00 +[2026-03-30 13:38:15,214] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:308] [PID:37135] BOS: 2 / +[2026-03-30 13:38:15,215] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:309] [PID:37135] PAD: 0 / +[2026-03-30 13:38:15,215] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:310] [PID:37135] UNK: 3 / +[2026-03-30 13:38:15,216] [INFO] [axolotl.utils.data.shared.load_preprocessed_dataset:480] [PID:37135] Unable to find prepared dataset in /workspace/axolotl/last_run_prepared/f493251e06461a149e3a38551d1b7982 +[2026-03-30 13:38:15,217] [INFO] [axolotl.utils.data.sft._load_raw_datasets:320] [PID:37135] Loading raw datasets... +[2026-03-30 13:38:15,217] [WARNING] [axolotl.utils.data.sft._load_raw_datasets:322] [PID:37135] Processing datasets during training can lead to VRAM instability. Please pre-process your dataset using `axolotl preprocess path/to/config.yml`. +[2026-03-30 13:38:15,533] [INFO] [axolotl.utils.data.wrappers.get_dataset_wrapper:87] [PID:37135] Loading dataset: . with base_type: chat_template and prompt_style: None +[2026-03-30 13:38:15,536] [INFO] [axolotl.prompt_strategies.chat_template.__call__:998] [PID:37135] Using chat template: --- {{ bos_token }} {% for m in messages %} @@ -424,1047 +232,17 @@ trainable params: 83,066,880 || all params: 2,697,408,768 || trainable%: 3.0795 {% endif %} --- - - Tokenizing Prompts (num_proc=12): 0%| | 0/10000 [00:00204 Dropping Invalid Sequences (204 Dropping Invalid Sequences (204 Dropping Invalid Sequences (204 Dropping Invalid Sequences (204 Dropping Invalid Sequences (204 Dropping Invalid Sequences (204 Dropping Invalid Sequences (204 Dropping Invalid Sequences (204 Dropping Invalid Sequences (204 Dropping Invalid Sequences (204 +[2026-03-30 14:34:12,077] [INFO] [axolotl.utils.data.utils._drop_outside_range:306] [PID:37135] Dropped 60374 sequences outside valid range ([None, 2048]) + Saving the dataset (0/28 shards): 0%| | Saving the dataset (0/28 shards): 4%| | Saving the dataset (1/28 shards): 4%| | Saving the dataset (2/28 shards): 7%| | Saving the dataset (3/28 shards): 11%| | Saving the dataset (4/28 shards): 18%|▏| Saving the dataset (5/28 shards): 21%|▏| Saving the dataset (6/28 shards): 21%|▏| Saving the dataset (7/28 shards): 29%|▎| Saving the dataset (8/28 shards): 29%|▎| Saving the dataset (9/28 shards): 39%|▍| Saving the dataset (10/28 shards): 39%|▍ Saving the dataset (11/28 shards): 43%|▍ Saving the dataset (12/28 shards): 46%|▍ Saving the dataset (13/28 shards): 46%|▍ Saving the dataset (14/28 shards): 50%|▍ Saving the dataset (15/28 shards): 54%|▌ Saving the dataset (16/28 shards): 61%|▌ Saving the dataset (17/28 shards): 61%|▌ Saving the dataset (18/28 shards): 68%|▋ Saving the dataset (19/28 shards): 68%|▋ Saving the dataset (20/28 shards): 71%|▋ Saving the dataset (21/28 shards): 75%|▋ Saving the dataset (22/28 shards): 86%|▊ Saving the dataset (23/28 shards): 89%|▉ Saving the dataset (24/28 shards): 89%|▉ Saving the dataset (25/28 shards): 89%|▉ Saving the dataset (26/28 shards): 93%|▉ Saving the dataset (27/28 shards): 96%|▉ Saving the dataset (28/28 shards): 100%|█ Saving the dataset (28/28 shards): 100%|█ +[2026-03-30 14:34:15,379] [INFO] [axolotl.utils.data.shared.load_preprocessed_dataset:480] [PID:37135] Unable to find prepared dataset in /workspace/axolotl/last_run_prepared/df80f313c04db5e542fa25408a23272d +[2026-03-30 14:34:15,379] [INFO] [axolotl.utils.data.sft._load_raw_datasets:320] [PID:37135] Loading raw datasets... +[2026-03-30 14:34:15,380] [WARNING] [axolotl.utils.data.sft._load_raw_datasets:322] [PID:37135] Processing datasets during training can lead to VRAM instability. Please pre-process your dataset using `axolotl preprocess path/to/config.yml`. +[2026-03-30 14:34:15,673] [INFO] [axolotl.utils.data.wrappers.get_dataset_wrapper:87] [PID:37135] Loading dataset: eval-datasets/shuf-1000_conversations_V3.jsonl with base_type: chat_template and prompt_style: None +[2026-03-30 14:34:15,674] [INFO] [axolotl.prompt_strategies.chat_template.__call__:998] [PID:37135] Using chat template: --- {{ bos_token }} {% for m in messages %} @@ -1476,4157 +254,215 @@ trainable params: 83,066,880 || all params: 2,697,408,768 || trainable%: 3.0795 {% endif %} --- - - Tokenizing Prompts (num_proc=12): 0%| | 0/10000 [00:00' + role + '\n' + m['content'] | trim + '\n' }} -{% endfor %} -{% if add_generation_prompt %} -{{ 'model\n' }} -{% endif %} - ---- - - Tokenizing Prompts (num_proc=12): 0%| | 0/10000 [00:00' + role + '\n' + m['content'] | trim + '\n' }} -{% endfor %} -{% if add_generation_prompt %} -{{ 'model\n' }} -{% endif %} - ---- - - Tokenizing Prompts (num_proc=12): 0%| | 0/10000 [00:00' + role + '\n' + m['content'] | trim + '\n' }} -{% endfor %} -{% if add_generation_prompt %} -{{ 'model\n' }} -{% endif %} - ---- - - Tokenizing Prompts (num_proc=12): 0%| | 0/10000 [00:00204 Dropping Invalid Sequences (204 Dropping Invalid Sequences (204 +[2026-03-30 14:34:28,940] [INFO] [axolotl.utils.data.utils._drop_outside_range:306] [PID:37135] Dropped 800 sequences outside valid range ([None, 2048]) + Drop Samples with Zero Trainable Tokens ( Drop Samples with Zero Trainable Tokens ( Drop Samples with Zero Trainable Tokens ( Drop Samples with Zero Trainable Tokens ( + Saving the dataset (0/1 shards): 0%| | Saving the dataset (0/1 shards): 100%|█| Saving the dataset (1/1 shards): 100%|█| Saving the dataset (1/1 shards): 100%|█| +[2026-03-30 14:34:30,705] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:420] [PID:37135] total_num_tokens: 24_920_885 +[2026-03-30 14:34:45,899] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:438] [PID:37135] `total_supervised_tokens: 246_408_026` +[2026-03-30 14:34:45,906] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:521] [PID:37135] total_num_steps: 2051 +[2026-03-30 14:34:45,907] [INFO] [axolotl.utils.data.sft._prepare_standard_dataset:121] [PID:37135] Maximum number of steps set at 1000 +[2026-03-30 14:34:46,033] [DEBUG] [axolotl.train.setup_model_and_tokenizer:70] [PID:37135] loading tokenizer... google/gemma-2-2b-it +[2026-03-30 14:34:48,013] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:307] [PID:37135] EOS: 1 / +[2026-03-30 14:34:48,014] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:308] [PID:37135] BOS: 2 / +[2026-03-30 14:34:48,014] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:309] [PID:37135] PAD: 0 / +[2026-03-30 14:34:48,014] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:310] [PID:37135] UNK: 3 / +[2026-03-30 14:34:48,015] [DEBUG] [axolotl.train.setup_model_and_tokenizer:81] [PID:37135] Loading model +[2026-03-30 14:34:48,126] [DEBUG] [axolotl.monkeypatch.transformers.trainer_loss_calc.patch_evaluation_loop:94] [PID:37135] Patched Trainer.evaluation_loop with nanmean loss calculation +[2026-03-30 14:34:48,127] [DEBUG] [axolotl.monkeypatch.transformers.trainer_loss_calc.patch_maybe_log_save_evaluate:148] [PID:37135] Patched Trainer._maybe_log_save_evaluate with nanmean loss calculation +[2026-03-30 14:34:48,180] [INFO] [axolotl.monkeypatch.attention.flash_attn_4.patch_flash_attn_4:52] [PID:37135] Flash Attention 4 is available for your GPU and offers faster training speeds. To enable: pip install flash-attn-4 +[2026-03-30 14:34:48,180] [WARNING] [axolotl.loaders.patch_manager._apply_self_attention_lora_patch:360] [PID:37135] Cannot patch self-attention - requires no dropout + Loading weights: 0%| | 0/288 [00:00