|
|
[2025-09-09 07:47:05,190] [INFO] [axolotl.cli.config.load_cfg:245] [PID:37] [RANK:0] config: |
|
|
{ |
|
|
"activation_offloading": false, |
|
|
"adapter": "lora", |
|
|
"attn_implementation": "eager", |
|
|
"axolotl_config_path": "/app/checkpoints/instr-fast-052b/ares56-test-text/train_instr-fast-052b.yml", |
|
|
"base_model": "TinyLlama/TinyLlama-1.1B-Chat-v1.0", |
|
|
"base_model_config": "TinyLlama/TinyLlama-1.1B-Chat-v1.0", |
|
|
"batch_size": 1, |
|
|
"bf16": false, |
|
|
"capabilities": { |
|
|
"bf16": false, |
|
|
"fp8": false, |
|
|
"n_gpu": 1, |
|
|
"n_node": 1 |
|
|
}, |
|
|
"context_parallel_size": 1, |
|
|
"dataloader_num_workers": 1, |
|
|
"dataloader_pin_memory": true, |
|
|
"dataloader_prefetch_factor": 256, |
|
|
"dataset_processes": 32, |
|
|
"datasets": [ |
|
|
{ |
|
|
"message_property_mappings": { |
|
|
"content": "content", |
|
|
"role": "role" |
|
|
}, |
|
|
"path": "/app/axolotl/data/mini_instruct_50.jsonl", |
|
|
"trust_remote_code": false, |
|
|
"type": "alpaca" |
|
|
} |
|
|
], |
|
|
"ddp": false, |
|
|
"device": "cpu", |
|
|
"device_map": "auto", |
|
|
"dion_rank_fraction": 1.0, |
|
|
"dion_rank_multiple_of": 1, |
|
|
"env_capabilities": { |
|
|
"torch_version": "2.6.0" |
|
|
}, |
|
|
"eval_batch_size": 1, |
|
|
"eval_causal_lm_metrics": [ |
|
|
"sacrebleu", |
|
|
"comet", |
|
|
"ter", |
|
|
"chrf" |
|
|
], |
|
|
"eval_max_new_tokens": 128, |
|
|
"eval_steps": 0, |
|
|
"eval_table_size": 0, |
|
|
"experimental_skip_move_to_device": true, |
|
|
"fp16": false, |
|
|
"gradient_accumulation_steps": 1, |
|
|
"gradient_checkpointing": false, |
|
|
"is_llama_derived_model": true, |
|
|
"learning_rate": 0.0002, |
|
|
"lisa_layers_attribute": "model.layers", |
|
|
"load_best_model_at_end": false, |
|
|
"load_in_4bit": false, |
|
|
"load_in_8bit": false, |
|
|
"local_rank": 0, |
|
|
"logging_steps": 1, |
|
|
"lora_alpha": 16, |
|
|
"lora_dropout": 0.05, |
|
|
"lora_r": 8, |
|
|
"lora_target_modules": [ |
|
|
"q_proj", |
|
|
"k_proj", |
|
|
"v_proj", |
|
|
"o_proj", |
|
|
"gate_proj", |
|
|
"up_proj", |
|
|
"down_proj" |
|
|
], |
|
|
"loraplus_lr_embedding": 1e-06, |
|
|
"lr_scheduler": "cosine", |
|
|
"max_prompt_len": 512, |
|
|
"max_steps": 10, |
|
|
"mean_resizing_embeddings": false, |
|
|
"micro_batch_size": 1, |
|
|
"model_config_type": "llama", |
|
|
"num_epochs": 1.0, |
|
|
"optimizer": "adamw_torch", |
|
|
"output_dir": "/app/checkpoints/instr-fast-052b/ares56-test-text", |
|
|
"pretrain_multipack_attn": true, |
|
|
"profiler_steps_start": 0, |
|
|
"qlora_sharded_model_loading": false, |
|
|
"ray_num_workers": 1, |
|
|
"resources_per_worker": { |
|
|
"GPU": 1 |
|
|
}, |
|
|
"sample_packing": false, |
|
|
"sample_packing_bin_size": 200, |
|
|
"sample_packing_group_size": 100000, |
|
|
"save_only_model": false, |
|
|
"save_safetensors": true, |
|
|
"save_steps": 10, |
|
|
"save_strategy": "steps", |
|
|
"save_total_limit": 1, |
|
|
"sequence_len": 256, |
|
|
"shuffle_before_merging_datasets": false, |
|
|
"shuffle_merged_datasets": true, |
|
|
"skip_prepare_dataset": false, |
|
|
"streaming_multipack_buffer_size": 10000, |
|
|
"strict": false, |
|
|
"tensor_parallel_size": 1, |
|
|
"tf32": false, |
|
|
"tiled_mlp_use_original_mlp": true, |
|
|
"tokenizer_config": "TinyLlama/TinyLlama-1.1B-Chat-v1.0", |
|
|
"tokenizer_save_jinja_files": true, |
|
|
"torch_dtype": "torch.float32", |
|
|
"train_on_inputs": false, |
|
|
"trl": { |
|
|
"log_completions": false, |
|
|
"mask_truncated_completions": false, |
|
|
"ref_model_mixup_alpha": 0.9, |
|
|
"ref_model_sync_steps": 64, |
|
|
"scale_rewards": true, |
|
|
"sync_ref_model": false, |
|
|
"use_vllm": false, |
|
|
"vllm_server_host": "0.0.0.0", |
|
|
"vllm_server_port": 8000 |
|
|
}, |
|
|
"use_ray": false, |
|
|
"val_set_size": 0.0, |
|
|
"vllm": { |
|
|
"device": "auto", |
|
|
"dtype": "auto", |
|
|
"gpu_memory_utilization": 0.9, |
|
|
"host": "0.0.0.0", |
|
|
"port": 8000 |
|
|
}, |
|
|
"warmup_steps": 0, |
|
|
"weight_decay": 0.0, |
|
|
"world_size": 1 |
|
|
}[39m |
|
|
[2025-09-09 07:47:05,871] [INFO] [axolotl.loaders.tokenizer.load_tokenizer:300] [PID:37] [RANK:0] No Chat template selected. Consider adding a chat template for easier inference.[39m |
|
|
[2025-09-09 07:47:05,871] [INFO] [axolotl.utils.data.shared.load_preprocessed_dataset:476] [PID:37] [RANK:0] Unable to find prepared dataset in last_run_prepared/103416ae75fe35cf3a7cdd59f8415c5e[39m |
|
|
[2025-09-09 07:47:05,871] [INFO] [axolotl.utils.data.sft._load_raw_datasets:320] [PID:37] [RANK:0] Loading raw datasets...[39m |
|
|
[33m[2025-09-09 07:47:05,871] [WARNING] [axolotl.utils.data.sft._load_raw_datasets:322] [PID:37] [RANK:0] Processing datasets during training can lead to VRAM instability. Please pre-process your dataset using `axolotl preprocess path/to/config.yml`.[39m |
|
|
Generating train split: 0 examples [00:00, ? examples/s]
Generating train split: 50 examples [00:00, 17666.18 examples/s] |
|
|
[2025-09-09 07:47:06,858] [INFO] [axolotl.utils.data.wrappers.get_dataset_wrapper:87] [PID:37] [RANK:0] Loading dataset: /app/axolotl/data/mini_instruct_50.jsonl with base_type: alpaca and prompt_style: None[39m |
|
|
Tokenizing Prompts (num_proc=32): 0%| | 0/50 [00:00<?, ? examples/s]
Tokenizing Prompts (num_proc=32): 4%|β | 2/50 [00:00<00:07, 6.64 examples/s]
Tokenizing Prompts (num_proc=32): 32%|ββββ | 16/50 [00:00<00:00, 49.07 examples/s]
Tokenizing Prompts (num_proc=32): 64%|βββββββ | 32/50 [00:00<00:00, 78.43 examples/s]
Tokenizing Prompts (num_proc=32): 86%|βββββββββ | 43/50 [00:00<00:00, 83.86 examples/s]
Tokenizing Prompts (num_proc=32): 100%|ββββββββββ| 50/50 [00:00<00:00, 59.60 examples/s] |
|
|
[2025-09-09 07:47:07,731] [INFO] [axolotl.utils.data.utils.handle_long_seq_in_dataset:218] [PID:37] [RANK:0] min_input_len: 69[39m |
|
|
[2025-09-09 07:47:07,731] [INFO] [axolotl.utils.data.utils.handle_long_seq_in_dataset:220] [PID:37] [RANK:0] max_input_len: 71[39m |
|
|
Dropping Long Sequences (>256) (num_proc=32): 0%| | 0/50 [00:00<?, ? examples/s]
Dropping Long Sequences (>256) (num_proc=32): 4%|β | 2/50 [00:00<00:05, 8.84 examples/s]
Dropping Long Sequences (>256) (num_proc=32): 100%|ββββββββββ| 50/50 [00:00<00:00, 132.29 examples/s] |
|
|
Saving the dataset (0/1 shards): 0%| | 0/50 [00:00<?, ? examples/s]
Saving the dataset (1/1 shards): 100%|ββββββββββ| 50/50 [00:00<00:00, 13005.59 examples/s]
Saving the dataset (1/1 shards): 100%|ββββββββββ| 50/50 [00:00<00:00, 12705.39 examples/s] |
|
|
[2025-09-09 07:47:08,152] [INFO] [axolotl.utils.data.sft._prepare_standard_dataset:121] [PID:37] [RANK:0] Maximum number of steps set at 10[39m |
|
|
[2025-09-09 07:47:08,722] [INFO] [axolotl.loaders.tokenizer.load_tokenizer:300] [PID:37] [RANK:0] No Chat template selected. Consider adding a chat template for easier inference.[39m |
|
|
[2025-09-09 07:47:08,917] [INFO] [axolotl.monkeypatch.transformers.trainer_loss_calc.patch_evaluation_loop:87] [PID:37] [RANK:0] Patched Trainer.evaluation_loop with nanmean loss calculation[39m |
|
|
[2025-09-09 07:47:08,918] [INFO] [axolotl.monkeypatch.transformers.trainer_loss_calc.patch_maybe_log_save_evaluate:138] [PID:37] [RANK:0] Patched Trainer._maybe_log_save_evaluate with nanmean loss calculation[39m |
|
|
`torch_dtype` is deprecated! Use `dtype` instead! |
|
|
[2025-09-09 07:47:09,681] [INFO] [axolotl.loaders.model._configure_embedding_dtypes:351] [PID:37] [RANK:0] Converting modules to torch.float32[39m |
|
|
trainable params: 6,307,840 || all params: 1,106,356,224 || trainable%: 0.5701 |
|
|
[2025-09-09 07:47:10,932] [INFO] [axolotl.train.save_initial_configs:414] [PID:37] [RANK:0] Pre-saving adapter config to /app/checkpoints/instr-fast-052b/ares56-test-text...[39m |
|
|
[2025-09-09 07:47:10,932] [INFO] [axolotl.train.save_initial_configs:418] [PID:37] [RANK:0] Pre-saving tokenizer to /app/checkpoints/instr-fast-052b/ares56-test-text...[39m |
|
|
[2025-09-09 07:47:10,946] [INFO] [axolotl.train.save_initial_configs:423] [PID:37] [RANK:0] Pre-saving model config to /app/checkpoints/instr-fast-052b/ares56-test-text...[39m |
|
|
[2025-09-09 07:47:10,947] [INFO] [axolotl.train.execute_training:203] [PID:37] [RANK:0] Starting trainer...[39m |
|
|
0%| | 0/10 [00:00<?, ?it/s]
10%|β | 1/10 [00:01<00:13, 1.45s/it]
{'loss': 4.5061, 'grad_norm': 5.485438823699951, 'learning_rate': 0.0002, 'memory/max_active (GiB)': 0.0, 'memory/max_allocated (GiB)': 0.0, 'memory/device_reserved (GiB)': 0.0, 'epoch': 0.02} |
|
|
10%|β | 1/10 [00:01<00:13, 1.45s/it]
20%|ββ | 2/10 [00:02<00:09, 1.22s/it]
{'loss': 3.7913, 'grad_norm': 4.593176364898682, 'learning_rate': 0.00019510565162951537, 'memory/max_active (GiB)': 0.0, 'memory/max_allocated (GiB)': 0.0, 'memory/device_reserved (GiB)': 0.0, 'epoch': 0.04} |
|
|
20%|ββ | 2/10 [00:02<00:09, 1.22s/it]
30%|βββ | 3/10 [00:03<00:08, 1.18s/it]
{'loss': 3.0368, 'grad_norm': 4.607494354248047, 'learning_rate': 0.00018090169943749476, 'memory/max_active (GiB)': 0.0, 'memory/max_allocated (GiB)': 0.0, 'memory/device_reserved (GiB)': 0.0, 'epoch': 0.06} |
|
|
30%|βββ | 3/10 [00:03<00:08, 1.18s/it]
40%|ββββ | 4/10 [00:04<00:06, 1.15s/it]
{'loss': 2.4057, 'grad_norm': 4.247849464416504, 'learning_rate': 0.00015877852522924732, 'memory/max_active (GiB)': 0.0, 'memory/max_allocated (GiB)': 0.0, 'memory/device_reserved (GiB)': 0.0, 'epoch': 0.08} |
|
|
40%|ββββ | 4/10 [00:04<00:06, 1.15s/it]
50%|βββββ | 5/10 [00:05<00:05, 1.15s/it]
{'loss': 1.9879, 'grad_norm': 3.5455574989318848, 'learning_rate': 0.00013090169943749476, 'memory/max_active (GiB)': 0.0, 'memory/max_allocated (GiB)': 0.0, 'memory/device_reserved (GiB)': 0.0, 'epoch': 0.1} |
|
|
50%|βββββ | 5/10 [00:05<00:05, 1.15s/it]
60%|ββββββ | 6/10 [00:06<00:04, 1.08s/it]
{'loss': 1.6576, 'grad_norm': 3.5534489154815674, 'learning_rate': 0.0001, 'memory/max_active (GiB)': 0.0, 'memory/max_allocated (GiB)': 0.0, 'memory/device_reserved (GiB)': 0.0, 'epoch': 0.12} |
|
|
60%|ββββββ | 6/10 [00:06<00:04, 1.08s/it]
70%|βββββββ | 7/10 [00:07<00:03, 1.03s/it]
{'loss': 1.4126, 'grad_norm': 3.670276403427124, 'learning_rate': 6.909830056250527e-05, 'memory/max_active (GiB)': 0.0, 'memory/max_allocated (GiB)': 0.0, 'memory/device_reserved (GiB)': 0.0, 'epoch': 0.14} |
|
|
70%|βββββββ | 7/10 [00:07<00:03, 1.03s/it]
80%|ββββββββ | 8/10 [00:08<00:02, 1.01s/it]
{'loss': 1.2206, 'grad_norm': 4.0369062423706055, 'learning_rate': 4.12214747707527e-05, 'memory/max_active (GiB)': 0.0, 'memory/max_allocated (GiB)': 0.0, 'memory/device_reserved (GiB)': 0.0, 'epoch': 0.16} |
|
|
80%|ββββββββ | 8/10 [00:08<00:02, 1.01s/it]
90%|βββββββββ | 9/10 [00:09<00:00, 1.02it/s]
{'loss': 1.0935, 'grad_norm': 4.194610595703125, 'learning_rate': 1.9098300562505266e-05, 'memory/max_active (GiB)': 0.0, 'memory/max_allocated (GiB)': 0.0, 'memory/device_reserved (GiB)': 0.0, 'epoch': 0.18} |
|
|
90%|βββββββββ | 9/10 [00:09<00:00, 1.02it/s]
100%|ββββββββββ| 10/10 [00:10<00:00, 1.05s/it]
{'loss': 1.0354, 'grad_norm': 4.174754619598389, 'learning_rate': 4.8943483704846475e-06, 'memory/max_active (GiB)': 0.0, 'memory/max_allocated (GiB)': 0.0, 'memory/device_reserved (GiB)': 0.0, 'epoch': 0.2} |
|
|
100%|ββββββββββ| 10/10 [00:10<00:00, 1.05s/it][2025-09-09 07:47:21,982] [INFO] [axolotl.core.trainers.base._save:632] [PID:37] [RANK:0] Saving model checkpoint to /app/checkpoints/instr-fast-052b/ares56-test-text/checkpoint-10[39m |
|
|
[2025-09-09 07:47:22,404] [INFO] [axolotl.core.trainers.base._save:681] [PID:37] [RANK:0] Saving Trainer.data_collator.tokenizer by default as Trainer.processing_class is `None`[39m |
|
|
{'train_runtime': 11.3209, 'train_samples_per_second': 0.883, 'train_steps_per_second': 0.883, 'train_loss': 2.214738917350769, 'memory/max_active (GiB)': 0.0, 'memory/max_allocated (GiB)': 0.0, 'memory/device_reserved (GiB)': 0.0, 'epoch': 0.2} |
|
|
100%|ββββββββββ| 10/10 [00:11<00:00, 1.05s/it]
100%|ββββββββββ| 10/10 [00:11<00:00, 1.13s/it] |
|
|
[2025-09-09 07:47:22,504] [INFO] [axolotl.train.save_trained_model:228] [PID:37] [RANK:0] Training completed! Saving trained model to /app/checkpoints/instr-fast-052b/ares56-test-text.[39m |
|
|
[2025-09-09 07:47:22,841] [INFO] [axolotl.train.save_trained_model:352] [PID:37] [RANK:0] Model successfully saved to /app/checkpoints/instr-fast-052b/ares56-test-text[39m |
|
|
|