ares56-test-text / train.log

upload ares56-test-text @ 2025-09-09T07:55:07.531105Z

2fac470 verified 4 months ago

12.7 kB

	[2025-09-09 07:47:05,190] [INFO] [axolotl.cli.config.load_cfg:245] [PID:37] [RANK:0] config:
	{
	"activation_offloading": false,
	"adapter": "lora",
	"attn_implementation": "eager",
	"axolotl_config_path": "/app/checkpoints/instr-fast-052b/ares56-test-text/train_instr-fast-052b.yml",
	"base_model": "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
	"base_model_config": "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
	"batch_size": 1,
	"bf16": false,
	"capabilities": {
	"bf16": false,
	"fp8": false,
	"n_gpu": 1,
	"n_node": 1
	},
	"context_parallel_size": 1,
	"dataloader_num_workers": 1,
	"dataloader_pin_memory": true,
	"dataloader_prefetch_factor": 256,
	"dataset_processes": 32,
	"datasets": [
	{
	"message_property_mappings": {
	"content": "content",
	"role": "role"
	},
	"path": "/app/axolotl/data/mini_instruct_50.jsonl",
	"trust_remote_code": false,
	"type": "alpaca"
	}
	],
	"ddp": false,
	"device": "cpu",
	"device_map": "auto",
	"dion_rank_fraction": 1.0,
	"dion_rank_multiple_of": 1,
	"env_capabilities": {
	"torch_version": "2.6.0"
	},
	"eval_batch_size": 1,
	"eval_causal_lm_metrics": [
	"sacrebleu",
	"comet",
	"ter",
	"chrf"
	],
	"eval_max_new_tokens": 128,
	"eval_steps": 0,
	"eval_table_size": 0,
	"experimental_skip_move_to_device": true,
	"fp16": false,
	"gradient_accumulation_steps": 1,
	"gradient_checkpointing": false,
	"is_llama_derived_model": true,
	"learning_rate": 0.0002,
	"lisa_layers_attribute": "model.layers",
	"load_best_model_at_end": false,
	"load_in_4bit": false,
	"load_in_8bit": false,
	"local_rank": 0,
	"logging_steps": 1,
	"lora_alpha": 16,
	"lora_dropout": 0.05,
	"lora_r": 8,
	"lora_target_modules": [
	"q_proj",
	"k_proj",
	"v_proj",
	"o_proj",
	"gate_proj",
	"up_proj",
	"down_proj"
	],
	"loraplus_lr_embedding": 1e-06,
	"lr_scheduler": "cosine",
	"max_prompt_len": 512,
	"max_steps": 10,
	"mean_resizing_embeddings": false,
	"micro_batch_size": 1,
	"model_config_type": "llama",
	"num_epochs": 1.0,
	"optimizer": "adamw_torch",
	"output_dir": "/app/checkpoints/instr-fast-052b/ares56-test-text",
	"pretrain_multipack_attn": true,
	"profiler_steps_start": 0,
	"qlora_sharded_model_loading": false,
	"ray_num_workers": 1,
	"resources_per_worker": {
	"GPU": 1
	},
	"sample_packing": false,
	"sample_packing_bin_size": 200,
	"sample_packing_group_size": 100000,
	"save_only_model": false,
	"save_safetensors": true,
	"save_steps": 10,
	"save_strategy": "steps",
	"save_total_limit": 1,
	"sequence_len": 256,
	"shuffle_before_merging_datasets": false,
	"shuffle_merged_datasets": true,
	"skip_prepare_dataset": false,
	"streaming_multipack_buffer_size": 10000,
	"strict": false,
	"tensor_parallel_size": 1,
	"tf32": false,
	"tiled_mlp_use_original_mlp": true,
	"tokenizer_config": "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
	"tokenizer_save_jinja_files": true,
	"torch_dtype": "torch.float32",
	"train_on_inputs": false,
	"trl": {
	"log_completions": false,
	"mask_truncated_completions": false,
	"ref_model_mixup_alpha": 0.9,
	"ref_model_sync_steps": 64,
	"scale_rewards": true,
	"sync_ref_model": false,
	"use_vllm": false,
	"vllm_server_host": "0.0.0.0",
	"vllm_server_port": 8000
	},
	"use_ray": false,
	"val_set_size": 0.0,
	"vllm": {
	"device": "auto",
	"dtype": "auto",
	"gpu_memory_utilization": 0.9,
	"host": "0.0.0.0",
	"port": 8000
	},
	"warmup_steps": 0,
	"weight_decay": 0.0,
	"world_size": 1
	}[39m
	[2025-09-09 07:47:05,871] [INFO] [axolotl.loaders.tokenizer.load_tokenizer:300] [PID:37] [RANK:0] No Chat template selected. Consider adding a chat template for easier inference.[39m
	[2025-09-09 07:47:05,871] [INFO] [axolotl.utils.data.shared.load_preprocessed_dataset:476] [PID:37] [RANK:0] Unable to find prepared dataset in last_run_prepared/103416ae75fe35cf3a7cdd59f8415c5e[39m
	[2025-09-09 07:47:05,871] [INFO] [axolotl.utils.data.sft._load_raw_datasets:320] [PID:37] [RANK:0] Loading raw datasets...[39m
	[33m[2025-09-09 07:47:05,871] [WARNING] [axolotl.utils.data.sft._load_raw_datasets:322] [PID:37] [RANK:0] Processing datasets during training can lead to VRAM instability. Please pre-process your dataset using `axolotl preprocess path/to/config.yml`.[39m
	Generating train split: 0 examples [00:00, ? examples/s] Generating train split: 50 examples [00:00, 17666.18 examples/s]
	[2025-09-09 07:47:06,858] [INFO] [axolotl.utils.data.wrappers.get_dataset_wrapper:87] [PID:37] [RANK:0] Loading dataset: /app/axolotl/data/mini_instruct_50.jsonl with base_type: alpaca and prompt_style: None[39m
	Tokenizing Prompts (num_proc=32): 0%\| \| 0/50 [00:00<?, ? examples/s] Tokenizing Prompts (num_proc=32): 4%\|▍ \| 2/50 [00:00<00:07, 6.64 examples/s] Tokenizing Prompts (num_proc=32): 32%\|███▏ \| 16/50 [00:00<00:00, 49.07 examples/s] Tokenizing Prompts (num_proc=32): 64%\|██████▍ \| 32/50 [00:00<00:00, 78.43 examples/s] Tokenizing Prompts (num_proc=32): 86%\|████████▌ \| 43/50 [00:00<00:00, 83.86 examples/s] Tokenizing Prompts (num_proc=32): 100%\|██████████\| 50/50 [00:00<00:00, 59.60 examples/s]
	[2025-09-09 07:47:07,731] [INFO] [axolotl.utils.data.utils.handle_long_seq_in_dataset:218] [PID:37] [RANK:0] min_input_len: 69[39m
	[2025-09-09 07:47:07,731] [INFO] [axolotl.utils.data.utils.handle_long_seq_in_dataset:220] [PID:37] [RANK:0] max_input_len: 71[39m
	Dropping Long Sequences (>256) (num_proc=32): 0%\| \| 0/50 [00:00<?, ? examples/s] Dropping Long Sequences (>256) (num_proc=32): 4%\|▍ \| 2/50 [00:00<00:05, 8.84 examples/s] Dropping Long Sequences (>256) (num_proc=32): 100%\|██████████\| 50/50 [00:00<00:00, 132.29 examples/s]
	Saving the dataset (0/1 shards): 0%\| \| 0/50 [00:00<?, ? examples/s] Saving the dataset (1/1 shards): 100%\|██████████\| 50/50 [00:00<00:00, 13005.59 examples/s] Saving the dataset (1/1 shards): 100%\|██████████\| 50/50 [00:00<00:00, 12705.39 examples/s]
	[2025-09-09 07:47:08,152] [INFO] [axolotl.utils.data.sft._prepare_standard_dataset:121] [PID:37] [RANK:0] Maximum number of steps set at 10[39m
	[2025-09-09 07:47:08,722] [INFO] [axolotl.loaders.tokenizer.load_tokenizer:300] [PID:37] [RANK:0] No Chat template selected. Consider adding a chat template for easier inference.[39m
	[2025-09-09 07:47:08,917] [INFO] [axolotl.monkeypatch.transformers.trainer_loss_calc.patch_evaluation_loop:87] [PID:37] [RANK:0] Patched Trainer.evaluation_loop with nanmean loss calculation[39m
	[2025-09-09 07:47:08,918] [INFO] [axolotl.monkeypatch.transformers.trainer_loss_calc.patch_maybe_log_save_evaluate:138] [PID:37] [RANK:0] Patched Trainer._maybe_log_save_evaluate with nanmean loss calculation[39m
	`torch_dtype` is deprecated! Use `dtype` instead!
	[2025-09-09 07:47:09,681] [INFO] [axolotl.loaders.model._configure_embedding_dtypes:351] [PID:37] [RANK:0] Converting modules to torch.float32[39m
	trainable params: 6,307,840 \|\| all params: 1,106,356,224 \|\| trainable%: 0.5701
	[2025-09-09 07:47:10,932] [INFO] [axolotl.train.save_initial_configs:414] [PID:37] [RANK:0] Pre-saving adapter config to /app/checkpoints/instr-fast-052b/ares56-test-text...[39m
	[2025-09-09 07:47:10,932] [INFO] [axolotl.train.save_initial_configs:418] [PID:37] [RANK:0] Pre-saving tokenizer to /app/checkpoints/instr-fast-052b/ares56-test-text...[39m
	[2025-09-09 07:47:10,946] [INFO] [axolotl.train.save_initial_configs:423] [PID:37] [RANK:0] Pre-saving model config to /app/checkpoints/instr-fast-052b/ares56-test-text...[39m
	[2025-09-09 07:47:10,947] [INFO] [axolotl.train.execute_training:203] [PID:37] [RANK:0] Starting trainer...[39m
	0%\| \| 0/10 [00:00<?, ?it/s] 10%\|█ \| 1/10 [00:01<00:13, 1.45s/it] {'loss': 4.5061, 'grad_norm': 5.485438823699951, 'learning_rate': 0.0002, 'memory/max_active (GiB)': 0.0, 'memory/max_allocated (GiB)': 0.0, 'memory/device_reserved (GiB)': 0.0, 'epoch': 0.02}
	10%\|█ \| 1/10 [00:01<00:13, 1.45s/it] 20%\|██ \| 2/10 [00:02<00:09, 1.22s/it] {'loss': 3.7913, 'grad_norm': 4.593176364898682, 'learning_rate': 0.00019510565162951537, 'memory/max_active (GiB)': 0.0, 'memory/max_allocated (GiB)': 0.0, 'memory/device_reserved (GiB)': 0.0, 'epoch': 0.04}
	20%\|██ \| 2/10 [00:02<00:09, 1.22s/it] 30%\|███ \| 3/10 [00:03<00:08, 1.18s/it] {'loss': 3.0368, 'grad_norm': 4.607494354248047, 'learning_rate': 0.00018090169943749476, 'memory/max_active (GiB)': 0.0, 'memory/max_allocated (GiB)': 0.0, 'memory/device_reserved (GiB)': 0.0, 'epoch': 0.06}
	30%\|███ \| 3/10 [00:03<00:08, 1.18s/it] 40%\|████ \| 4/10 [00:04<00:06, 1.15s/it] {'loss': 2.4057, 'grad_norm': 4.247849464416504, 'learning_rate': 0.00015877852522924732, 'memory/max_active (GiB)': 0.0, 'memory/max_allocated (GiB)': 0.0, 'memory/device_reserved (GiB)': 0.0, 'epoch': 0.08}
	40%\|████ \| 4/10 [00:04<00:06, 1.15s/it] 50%\|█████ \| 5/10 [00:05<00:05, 1.15s/it] {'loss': 1.9879, 'grad_norm': 3.5455574989318848, 'learning_rate': 0.00013090169943749476, 'memory/max_active (GiB)': 0.0, 'memory/max_allocated (GiB)': 0.0, 'memory/device_reserved (GiB)': 0.0, 'epoch': 0.1}
	50%\|█████ \| 5/10 [00:05<00:05, 1.15s/it] 60%\|██████ \| 6/10 [00:06<00:04, 1.08s/it] {'loss': 1.6576, 'grad_norm': 3.5534489154815674, 'learning_rate': 0.0001, 'memory/max_active (GiB)': 0.0, 'memory/max_allocated (GiB)': 0.0, 'memory/device_reserved (GiB)': 0.0, 'epoch': 0.12}
	60%\|██████ \| 6/10 [00:06<00:04, 1.08s/it] 70%\|███████ \| 7/10 [00:07<00:03, 1.03s/it] {'loss': 1.4126, 'grad_norm': 3.670276403427124, 'learning_rate': 6.909830056250527e-05, 'memory/max_active (GiB)': 0.0, 'memory/max_allocated (GiB)': 0.0, 'memory/device_reserved (GiB)': 0.0, 'epoch': 0.14}
	70%\|███████ \| 7/10 [00:07<00:03, 1.03s/it] 80%\|████████ \| 8/10 [00:08<00:02, 1.01s/it] {'loss': 1.2206, 'grad_norm': 4.0369062423706055, 'learning_rate': 4.12214747707527e-05, 'memory/max_active (GiB)': 0.0, 'memory/max_allocated (GiB)': 0.0, 'memory/device_reserved (GiB)': 0.0, 'epoch': 0.16}
	80%\|████████ \| 8/10 [00:08<00:02, 1.01s/it] 90%\|█████████ \| 9/10 [00:09<00:00, 1.02it/s] {'loss': 1.0935, 'grad_norm': 4.194610595703125, 'learning_rate': 1.9098300562505266e-05, 'memory/max_active (GiB)': 0.0, 'memory/max_allocated (GiB)': 0.0, 'memory/device_reserved (GiB)': 0.0, 'epoch': 0.18}
	90%\|█████████ \| 9/10 [00:09<00:00, 1.02it/s] 100%\|██████████\| 10/10 [00:10<00:00, 1.05s/it] {'loss': 1.0354, 'grad_norm': 4.174754619598389, 'learning_rate': 4.8943483704846475e-06, 'memory/max_active (GiB)': 0.0, 'memory/max_allocated (GiB)': 0.0, 'memory/device_reserved (GiB)': 0.0, 'epoch': 0.2}
	100%\|██████████\| 10/10 [00:10<00:00, 1.05s/it][2025-09-09 07:47:21,982] [INFO] [axolotl.core.trainers.base._save:632] [PID:37] [RANK:0] Saving model checkpoint to /app/checkpoints/instr-fast-052b/ares56-test-text/checkpoint-10[39m
	[2025-09-09 07:47:22,404] [INFO] [axolotl.core.trainers.base._save:681] [PID:37] [RANK:0] Saving Trainer.data_collator.tokenizer by default as Trainer.processing_class is `None`[39m
	{'train_runtime': 11.3209, 'train_samples_per_second': 0.883, 'train_steps_per_second': 0.883, 'train_loss': 2.214738917350769, 'memory/max_active (GiB)': 0.0, 'memory/max_allocated (GiB)': 0.0, 'memory/device_reserved (GiB)': 0.0, 'epoch': 0.2}
	100%\|██████████\| 10/10 [00:11<00:00, 1.05s/it] 100%\|██████████\| 10/10 [00:11<00:00, 1.13s/it]
	[2025-09-09 07:47:22,504] [INFO] [axolotl.train.save_trained_model:228] [PID:37] [RANK:0] Training completed! Saving trained model to /app/checkpoints/instr-fast-052b/ares56-test-text.[39m
	[2025-09-09 07:47:22,841] [INFO] [axolotl.train.save_trained_model:352] [PID:37] [RANK:0] Model successfully saved to /app/checkpoints/instr-fast-052b/ares56-test-text[39m