Upload folder using huggingface_hub

ca3c54a verified 9 days ago

30.3 kB

	[2026-01-25 09:34:01,866] [DEBUG] [axolotl.utils.config.log_gpu_memory_usage:127] [PID:10047] baseline 0.000GB ()
	[2026-01-25 09:34:01,867] [INFO] [axolotl.cli.config.load_cfg:259] [PID:10047] config:
	{
	"activation_offloading": false,
	"axolotl_config_path": "train.yml",
	"base_model": "google/gemma-3-4b-it",
	"base_model_config": "google/gemma-3-4b-it",
	"batch_size": 13,
	"bf16": true,
	"capabilities": {
	"bf16": true,
	"compute_capability": "sm_86",
	"fp8": false,
	"n_gpu": 1,
	"n_node": 1
	},
	"context_parallel_size": 1,
	"dataloader_num_workers": 1,
	"dataloader_pin_memory": true,
	"dataloader_prefetch_factor": 256,
	"dataset_num_proc": 9,
	"datasets": [
	{
	"message_property_mappings": {
	"content": "content",
	"role": "role"
	},
	"path": "AlexHung29629/MerlynIfeEldridge2",
	"trust_remote_code": false,
	"type": "input_output"
	}
	],
	"ddp": false,
	"device": "cuda:0",
	"dion_rank_fraction": 1.0,
	"dion_rank_multiple_of": 1,
	"env_capabilities": {
	"torch_version": "2.9.1"
	},
	"eval_batch_size": 13,
	"eval_causal_lm_metrics": [
	"sacrebleu",
	"comet",
	"ter",
	"chrf"
	],
	"eval_max_new_tokens": 128,
	"eval_table_size": 0,
	"experimental_skip_move_to_device": true,
	"fp16": false,
	"gradient_accumulation_steps": 1,
	"gradient_checkpointing": true,
	"gradient_checkpointing_kwargs": {
	"use_reentrant": false
	},
	"include_tkps": true,
	"is_multimodal": true,
	"learning_rate": 0.001,
	"liger_fused_linear_cross_entropy": true,
	"liger_glu_activation": true,
	"liger_layer_norm": true,
	"liger_rms_norm": true,
	"liger_rope": true,
	"liger_use_token_scaling": true,
	"lisa_layers_attribute": "model.layers",
	"load_best_model_at_end": false,
	"load_in_4bit": false,
	"load_in_8bit": false,
	"local_rank": 0,
	"lora_dropout": 0.0,
	"loraplus_lr_embedding": 1e-06,
	"lr_scheduler": "constant",
	"max_grad_norm": 1.0,
	"mean_resizing_embeddings": false,
	"micro_batch_size": 13,
	"model_config_type": "gemma3",
	"num_epochs": 16.0,
	"optimizer": "sgd",
	"otel_metrics_host": "localhost",
	"otel_metrics_port": 8000,
	"output_dir": "./model-out",
	"plugins": [
	"axolotl.integrations.liger.LigerPlugin"
	],
	"pretrain_multipack_attn": true,
	"processor_config": "google/gemma-3-4b-it",
	"profiler_steps_start": 0,
	"qlora_sharded_model_loading": false,
	"ray_num_workers": 1,
	"resources_per_worker": {
	"GPU": 1
	},
	"sample_packing": false,
	"sample_packing_bin_size": 200,
	"sample_packing_group_size": 100000,
	"save_only_model": false,
	"save_safetensors": true,
	"save_strategy": "no",
	"seed": 42,
	"sequence_len": 758,
	"shuffle_before_merging_datasets": false,
	"shuffle_merged_datasets": true,
	"skip_prepare_dataset": false,
	"streaming_multipack_buffer_size": 10000,
	"strict": false,
	"tensor_parallel_size": 1,
	"tf32": true,
	"tiled_mlp_use_original_mlp": true,
	"tokenizer_config": "google/gemma-3-4b-it",
	"tokenizer_save_jinja_files": true,
	"torch_dtype": "torch.bfloat16",
	"train_on_inputs": false,
	"trl": {
	"log_completions": false,
	"mask_truncated_completions": false,
	"ref_model_mixup_alpha": 0.9,
	"ref_model_sync_steps": 64,
	"scale_rewards": true,
	"sync_ref_model": false,
	"use_vllm": false,
	"vllm_server_host": "0.0.0.0",
	"vllm_server_port": 8000
	},
	"use_otel_metrics": false,
	"use_ray": false,
	"use_tensorboard": true,
	"use_wandb": false,
	"val_set_size": 0.0,
	"vllm": {
	"device": "auto",
	"dtype": "auto",
	"gpu_memory_utilization": 0.9,
	"host": "0.0.0.0",
	"port": 8000
	},
	"warmup_ratio": 0.0,
	"weight_decay": 0.0,
	"world_size": 1
	}
	[2026-01-25 09:34:01,983] [DEBUG] [axolotl.loaders.utils.check_model_config:88] [PID:10047] Loaded image size: 896 from model config
	[2026-01-25 09:34:03,852] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:285] [PID:10047] EOS: 1 / <eos>
	[2026-01-25 09:34:03,853] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:286] [PID:10047] BOS: 2 / <bos>
	[2026-01-25 09:34:03,853] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:287] [PID:10047] PAD: 0 / <pad>
	[2026-01-25 09:34:03,853] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:288] [PID:10047] UNK: 3 / <unk>
	[2026-01-25 09:34:03,853] [INFO] [axolotl.utils.data.shared.load_preprocessed_dataset:481] [PID:10047] Unable to find prepared dataset in last_run_prepared/79c123e6ef0babe72cf6db37825069f8
	[2026-01-25 09:34:03,854] [INFO] [axolotl.utils.data.sft._load_raw_datasets:320] [PID:10047] Loading raw datasets...
	[2026-01-25 09:34:03,854] [WARNING] [axolotl.utils.data.sft._load_raw_datasets:322] [PID:10047] Processing datasets during training can lead to VRAM instability. Please pre-process your dataset using `axolotl preprocess path/to/config.yml`.
	[2026-01-25 09:34:08,929] [INFO] [axolotl.utils.data.wrappers.get_dataset_wrapper:87] [PID:10047] Loading dataset: AlexHung29629/MerlynIfeEldridge2 with base_type: input_output and prompt_style: None
	[2026-01-25 09:34:09,525] [INFO] [axolotl.utils.data.utils.handle_long_seq_in_dataset:224] [PID:10047] min_input_len: 152
	[2026-01-25 09:34:09,526] [INFO] [axolotl.utils.data.utils.handle_long_seq_in_dataset:226] [PID:10047] max_input_len: 676
	Saving the dataset (0/1 shards): 0%\| \| 0/13 [00:00<?, ? examples/s] Saving the dataset (0/1 shards): 100%\|██████████████████████████████████████████████████████████████████████████████████████████████████\| 13/13 [00:00<00:00, 59.57 examples/s] Saving the dataset (1/1 shards): 100%\|██████████████████████████████████████████████████████████████████████████████████████████████████\| 13/13 [00:00<00:00, 59.57 examples/s] Saving the dataset (1/1 shards): 100%\|██████████████████████████████████████████████████████████████████████████████████████████████████\| 13/13 [00:00<00:00, 41.66 examples/s]
	[2026-01-25 09:34:10,159] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:417] [PID:10047] total_num_tokens: 4_827
	[2026-01-25 09:34:10,162] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:435] [PID:10047] `total_supervised_tokens: 43`
	[2026-01-25 09:34:10,162] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:533] [PID:10047] total_num_steps: 16
	[2026-01-25 09:34:10,163] [INFO] [axolotl.utils.data.sft._prepare_standard_dataset:121] [PID:10047] Maximum number of steps set at 16
	[2026-01-25 09:34:10,283] [DEBUG] [axolotl.train.setup_model_and_tokenizer:70] [PID:10047] loading tokenizer... google/gemma-3-4b-it
	[2026-01-25 09:34:12,371] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:285] [PID:10047] EOS: 1 / <eos>
	[2026-01-25 09:34:12,371] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:286] [PID:10047] BOS: 2 / <bos>
	[2026-01-25 09:34:12,371] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:287] [PID:10047] PAD: 0 / <pad>
	[2026-01-25 09:34:12,371] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:288] [PID:10047] UNK: 3 / <unk>
	[2026-01-25 09:34:23,539] [DEBUG] [axolotl.train.setup_model_and_tokenizer:82] [PID:10047] Loading model
	[2026-01-25 09:34:23,660] [DEBUG] [axolotl.monkeypatch.transformers.trainer_loss_calc.patch_evaluation_loop:87] [PID:10047] Patched Trainer.evaluation_loop with nanmean loss calculation
	[2026-01-25 09:34:23,662] [DEBUG] [axolotl.monkeypatch.transformers.trainer_loss_calc.patch_maybe_log_save_evaluate:138] [PID:10047] Patched Trainer._maybe_log_save_evaluate with nanmean loss calculation
	[2026-01-25 09:34:23,793] [INFO] [axolotl.integrations.liger.plugin.pre_model_load:98] [PID:10047] Applying LIGER to gemma3 with kwargs: {'rope': True, 'cross_entropy': None, 'fused_linear_cross_entropy': True, 'rms_norm': True, 'layer_norm': True, 'geglu': True}
	Loading checkpoint shards: 0%\| \| 0/2 [00:00<?, ?it/s] Loading checkpoint shards: 50%\|████████████████████████████████████████████████████████▌ \| 1/2 [00:01<00:01, 1.58s/it] Loading checkpoint shards: 100%\|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████\| 2/2 [00:02<00:00, 1.18s/it] Loading checkpoint shards: 100%\|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████\| 2/2 [00:02<00:00, 1.24s/it]
	[2026-01-25 09:34:32,161] [DEBUG] [axolotl.loaders.model.log_gpu_memory_usage:127] [PID:10047] Memory usage after model load 0.000GB ()
	[2026-01-25 09:35:21,180] [INFO] [axolotl.train.save_initial_configs:417] [PID:10047] Pre-saving tokenizer to ./model-out...
	[2026-01-25 09:35:21,802] [INFO] [axolotl.train.save_initial_configs:422] [PID:10047] Pre-saving model config to ./model-out...
	[2026-01-25 09:35:21,809] [INFO] [axolotl.train.save_initial_configs:426] [PID:10047] Pre-saving processor to ./model-out...
	[2026-01-25 09:35:25,524] [INFO] [axolotl.train.execute_training:212] [PID:10047] Starting trainer...
	0%\| \| 0/16 [00:00<?, ?it/s] 6%\|████████▋ \| 1/16 [00:07<01:45, 7.05s/it] {'loss': 0.0345, 'grad_norm': 61.53063201904297, 'learning_rate': 0.001, 'ppl': 1.0351, 'memory/max_active (GiB)': 20.46, 'memory/max_allocated (GiB)': 20.46, 'memory/device_reserved (GiB)': 23.4, 'tokens/train_per_sec_per_gpu': 6.187943458557129, 'tokens/total': 9152, 'tokens/trainable': 43, 'epoch': 1.0}
	6%\|████████▋ \| 1/16 [00:07<01:45, 7.05s/it] 12%\|█████████████████▍ \| 2/16 [00:13<01:31, 6.54s/it] {'loss': 0.033, 'grad_norm': 57.19621276855469, 'learning_rate': 0.001, 'ppl': 1.03355, 'memory/max_active (GiB)': 20.46, 'memory/max_allocated (GiB)': 20.46, 'memory/device_reserved (GiB)': 23.4, 'tokens/train_per_sec_per_gpu': 7.120673656463623, 'tokens/total': 18304, 'tokens/trainable': 86, 'epoch': 2.0}
	12%\|█████████████████▍ \| 2/16 [00:13<01:31, 6.54s/it] 19%\|██████████████████████████ \| 3/16 [00:19<01:23, 6.39s/it] {'loss': 0.0321, 'grad_norm': 57.623077392578125, 'learning_rate': 0.001, 'ppl': 1.03262, 'memory/max_active (GiB)': 20.46, 'memory/max_allocated (GiB)': 20.46, 'memory/device_reserved (GiB)': 23.4, 'tokens/train_per_sec_per_gpu': 7.125824928283691, 'tokens/total': 27456, 'tokens/trainable': 129, 'epoch': 3.0}
	19%\|██████████████████████████ \| 3/16 [00:19<01:23, 6.39s/it] 25%\|██████████████████████████████████▊ \| 4/16 [00:25<01:15, 6.33s/it] {'loss': 0.0299, 'grad_norm': 63.824161529541016, 'learning_rate': 0.001, 'ppl': 1.03035, 'memory/max_active (GiB)': 20.46, 'memory/max_allocated (GiB)': 20.46, 'memory/device_reserved (GiB)': 23.4, 'tokens/train_per_sec_per_gpu': 7.115628719329834, 'tokens/total': 36608, 'tokens/trainable': 172, 'epoch': 4.0}
	25%\|██████████████████████████████████▊ \| 4/16 [00:25<01:15, 6.33s/it] 31%\|███████████████████████████████████████████▍ \| 5/16 [00:31<01:09, 6.29s/it] {'loss': 0.03, 'grad_norm': 61.47892761230469, 'learning_rate': 0.001, 'ppl': 1.03045, 'memory/max_active (GiB)': 20.46, 'memory/max_allocated (GiB)': 20.46, 'memory/device_reserved (GiB)': 23.4, 'tokens/train_per_sec_per_gpu': 7.117379665374756, 'tokens/total': 45760, 'tokens/trainable': 215, 'epoch': 5.0}
	31%\|███████████████████████████████████████████▍ \| 5/16 [00:32<01:09, 6.29s/it] 38%\|████████████████████████████████████████████████████▏ \| 6/16 [00:38<01:02, 6.27s/it] {'loss': 0.0242, 'grad_norm': 40.61567687988281, 'learning_rate': 0.001, 'ppl': 1.0245, 'memory/max_active (GiB)': 20.46, 'memory/max_allocated (GiB)': 20.46, 'memory/device_reserved (GiB)': 23.4, 'tokens/train_per_sec_per_gpu': 7.108977317810059, 'tokens/total': 54912, 'tokens/trainable': 258, 'epoch': 6.0}
	38%\|████████████████████████████████████████████████████▏ \| 6/16 [00:38<01:02, 6.27s/it] 44%\|████████████████████████████████████████████████████████████▊ \| 7/16 [00:44<00:56, 6.26s/it] {'loss': 0.0225, 'grad_norm': 31.520526885986328, 'learning_rate': 0.001, 'ppl': 1.02276, 'memory/max_active (GiB)': 20.46, 'memory/max_allocated (GiB)': 20.46, 'memory/device_reserved (GiB)': 23.4, 'tokens/train_per_sec_per_gpu': 7.105500221252441, 'tokens/total': 64064, 'tokens/trainable': 301, 'epoch': 7.0}
	44%\|████████████████████████████████████████████████████████████▊ \| 7/16 [00:44<00:56, 6.26s/it] 50%\|█████████████████████████████████████████████████████████████████████▌ \| 8/16 [00:50<00:50, 6.26s/it] {'loss': 0.0217, 'grad_norm': 29.32663917541504, 'learning_rate': 0.001, 'ppl': 1.02194, 'memory/max_active (GiB)': 20.46, 'memory/max_allocated (GiB)': 20.46, 'memory/device_reserved (GiB)': 23.4, 'tokens/train_per_sec_per_gpu': 7.099156379699707, 'tokens/total': 73216, 'tokens/trainable': 344, 'epoch': 8.0}
	50%\|█████████████████████████████████████████████████████████████████████▌ \| 8/16 [00:50<00:50, 6.26s/it] 56%\|██████████████████████████████████████████████████████████████████████████████▏ \| 9/16 [00:56<00:43, 6.26s/it] {'loss': 0.0211, 'grad_norm': 26.701892852783203, 'learning_rate': 0.001, 'ppl': 1.02132, 'memory/max_active (GiB)': 20.46, 'memory/max_allocated (GiB)': 20.46, 'memory/device_reserved (GiB)': 23.4, 'tokens/train_per_sec_per_gpu': 7.0905585289001465, 'tokens/total': 82368, 'tokens/trainable': 387, 'epoch': 9.0}
	56%\|██████████████████████████████████████████████████████████████████████████████▏ \| 9/16 [00:56<00:43, 6.26s/it] 62%\|██████████████████████████████████████████████████████████████████████████████████████▎ \| 10/16 [01:03<00:37, 6.26s/it] {'loss': 0.0205, 'grad_norm': 24.277631759643555, 'learning_rate': 0.001, 'ppl': 1.02071, 'memory/max_active (GiB)': 20.46, 'memory/max_allocated (GiB)': 20.46, 'memory/device_reserved (GiB)': 23.4, 'tokens/train_per_sec_per_gpu': 7.076664447784424, 'tokens/total': 91520, 'tokens/trainable': 430, 'epoch': 10.0}
	62%\|██████████████████████████████████████████████████████████████████████████████████████▎ \| 10/16 [01:03<00:37, 6.26s/it] 69%\|██████████████████████████████████████████████████████████████████████████████████████████████▉ \| 11/16 [01:09<00:31, 6.26s/it] {'loss': 0.02, 'grad_norm': 24.709354400634766, 'learning_rate': 0.001, 'ppl': 1.0202, 'memory/max_active (GiB)': 20.46, 'memory/max_allocated (GiB)': 20.46, 'memory/device_reserved (GiB)': 23.4, 'tokens/train_per_sec_per_gpu': 7.076472759246826, 'tokens/total': 100672, 'tokens/trainable': 473, 'epoch': 11.0}
	69%\|██████████████████████████████████████████████████████████████████████████████████████████████▉ \| 11/16 [01:09<00:31, 6.26s/it] 75%\|███████████████████████████████████████████████████████████████████████████████████████████████████████▌ \| 12/16 [01:15<00:25, 6.27s/it] {'loss': 0.0187, 'grad_norm': 23.36050033569336, 'learning_rate': 0.001, 'ppl': 1.01888, 'memory/max_active (GiB)': 20.46, 'memory/max_allocated (GiB)': 20.46, 'memory/device_reserved (GiB)': 23.4, 'tokens/train_per_sec_per_gpu': 7.066161155700684, 'tokens/total': 109824, 'tokens/trainable': 516, 'epoch': 12.0}
	75%\|███████████████████████████████████████████████████████████████████████████████████████████████████████▌ \| 12/16 [01:15<00:25, 6.27s/it] 81%\|████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏ \| 13/16 [01:21<00:18, 6.27s/it] {'loss': 0.0187, 'grad_norm': 25.07172393798828, 'learning_rate': 0.001, 'ppl': 1.01888, 'memory/max_active (GiB)': 20.46, 'memory/max_allocated (GiB)': 20.46, 'memory/device_reserved (GiB)': 23.4, 'tokens/train_per_sec_per_gpu': 7.074800968170166, 'tokens/total': 118976, 'tokens/trainable': 559, 'epoch': 13.0}
	81%\|████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏ \| 13/16 [01:22<00:18, 6.27s/it] 88%\|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊ \| 14/16 [01:28<00:12, 6.27s/it] {'loss': 0.0172, 'grad_norm': 24.219331741333008, 'learning_rate': 0.001, 'ppl': 1.01735, 'memory/max_active (GiB)': 20.46, 'memory/max_allocated (GiB)': 20.46, 'memory/device_reserved (GiB)': 23.4, 'tokens/train_per_sec_per_gpu': 7.072431564331055, 'tokens/total': 128128, 'tokens/trainable': 602, 'epoch': 14.0}
	88%\|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊ \| 14/16 [01:28<00:12, 6.27s/it] 94%\|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍ \| 15/16 [01:34<00:06, 6.26s/it] {'loss': 0.0166, 'grad_norm': 23.965293884277344, 'learning_rate': 0.001, 'ppl': 1.01674, 'memory/max_active (GiB)': 20.46, 'memory/max_allocated (GiB)': 20.46, 'memory/device_reserved (GiB)': 23.4, 'tokens/train_per_sec_per_gpu': 7.07435417175293, 'tokens/total': 137280, 'tokens/trainable': 645, 'epoch': 15.0}
	94%\|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍ \| 15/16 [01:34<00:06, 6.26s/it] 100%\|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████\| 16/16 [01:40<00:00, 6.27s/it] {'loss': 0.0139, 'grad_norm': 21.725933074951172, 'learning_rate': 0.001, 'ppl': 1.014, 'memory/max_active (GiB)': 20.46, 'memory/max_allocated (GiB)': 20.46, 'memory/device_reserved (GiB)': 23.4, 'tokens/train_per_sec_per_gpu': 7.0699357986450195, 'tokens/total': 146432, 'tokens/trainable': 688, 'epoch': 16.0}
	100%\|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████\| 16/16 [01:40<00:00, 6.27s/it] {'train_runtime': 100.8727, 'train_samples_per_second': 2.062, 'train_steps_per_second': 0.159, 'train_loss': 0.023414524795953184, 'memory/max_active (GiB)': 9.29, 'memory/max_allocated (GiB)': 9.29, 'memory/device_reserved (GiB)': 23.4, 'epoch': 16.0, 'tokens/train_per_sec_per_gpu': 0.0}
	100%\|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████\| 16/16 [01:40<00:00, 6.27s/it] 100%\|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████\| 16/16 [01:40<00:00, 6.30s/it]
	[2026-01-25 09:37:06,886] [INFO] [axolotl.train.save_trained_model:233] [PID:10047] Training completed! Saving trained model to ./model-out.
	[2026-01-25 09:37:19,416] [INFO] [axolotl.train.save_trained_model:351] [PID:10047] Model successfully saved to ./model-out