Upload folder using huggingface_hub

34006e6 verified 4 months ago

8.97 kB

	[2025-10-12 23:56:52,944] [DEBUG] [axolotl.utils.config.resolve_dtype:66] [PID:25175] bf16 support detected, enabling for this configuration.
	[2025-10-12 23:56:52,947] [DEBUG] [axolotl.utils.config.log_gpu_memory_usage:127] [PID:25175] baseline 0.000GB ()
	[2025-10-12 23:56:52,947] [INFO] [axolotl.cli.config.load_cfg:248] [PID:25175] config:
	{
	"activation_offloading": false,
	"adapter": "qlora",
	"axolotl_config_path": "stage-3.yaml",
	"base_model": "./merged-stage-1",
	"base_model_config": "./merged-stage-1",
	"batch_size": 4,
	"bf16": true,
	"capabilities": {
	"bf16": true,
	"compute_capability": "sm_86",
	"fp8": false,
	"n_gpu": 1,
	"n_node": 1
	},
	"chat_template": "chatml",
	"context_parallel_size": 1,
	"cut_cross_entropy": true,
	"dataloader_num_workers": 1,
	"dataloader_pin_memory": true,
	"dataloader_prefetch_factor": 256,
	"dataset_prepared_path": "last_run_prepared",
	"dataset_processes": 24,
	"datasets": [
	{
	"chat_template": "tokenizer_default",
	"field_messages": "conversations",
	"message_property_mappings": {
	"content": "value",
	"role": "from"
	},
	"path": "little-koto-instruct.json",
	"trust_remote_code": false,
	"type": "chat_template"
	}
	],
	"ddp": false,
	"device": "cuda:0",
	"device_map": "auto",
	"dion_rank_fraction": 1.0,
	"dion_rank_multiple_of": 1,
	"env_capabilities": {
	"torch_version": "2.7.1"
	},
	"eval_batch_size": 4,
	"eval_causal_lm_metrics": [
	"sacrebleu",
	"comet",
	"ter",
	"chrf"
	],
	"eval_max_new_tokens": 128,
	"eval_steps": 0.1,
	"eval_table_size": 0,
	"evals_per_epoch": 10,
	"experimental_skip_move_to_device": true,
	"flash_attention": false,
	"fp16": false,
	"gc_steps": 10,
	"gradient_accumulation_steps": 1,
	"gradient_checkpointing": false,
	"group_by_length": false,
	"hub_model_id": "ToastyPigeon/muse-marvin-stage3-lora",
	"hub_strategy": "every_save",
	"include_tkps": true,
	"is_mistral_derived_model": true,
	"learning_rate": 2e-06,
	"liger_glu_activation": true,
	"liger_layer_norm": true,
	"liger_rms_norm": true,
	"liger_rope": true,
	"lisa_layers_attribute": "model.layers",
	"load_best_model_at_end": false,
	"load_in_4bit": false,
	"load_in_8bit": false,
	"local_rank": 0,
	"logging_steps": 1,
	"lora_alpha": 32,
	"lora_dropout": 0.1,
	"lora_r": 32,
	"lora_target_linear": true,
	"loraplus_lr_embedding": 1e-06,
	"lr_scheduler": "rex",
	"max_grad_norm": 1.0,
	"mean_resizing_embeddings": false,
	"merge_lora": true,
	"micro_batch_size": 4,
	"model_config_type": "mistral",
	"num_epochs": 1.0,
	"optimizer": "adamw_torch_fused",
	"output_dir": "ckpts-stage-2",
	"pad_to_sequence_len": false,
	"peft_use_rslora": false,
	"plugins": [
	"axolotl.integrations.liger.LigerPlugin",
	"axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin"
	],
	"pretrain_multipack_attn": true,
	"profiler_steps_start": 0,
	"qlora_sharded_model_loading": false,
	"ray_num_workers": 1,
	"resources_per_worker": {
	"GPU": 1
	},
	"sample_packing": false,
	"sample_packing_bin_size": 200,
	"sample_packing_group_size": 100000,
	"save_only_model": false,
	"save_safetensors": true,
	"save_total_limit": 1,
	"saves_per_epoch": 1,
	"seed": 69,
	"sequence_len": 4096,
	"shuffle_before_merging_datasets": false,
	"shuffle_merged_datasets": true,
	"skip_prepare_dataset": false,
	"streaming_multipack_buffer_size": 10000,
	"strict": false,
	"tensor_parallel_size": 1,
	"tiled_mlp_use_original_mlp": true,
	"tokenizer_config": "./merged-stage-1",
	"tokenizer_save_jinja_files": true,
	"torch_dtype": "torch.bfloat16",
	"train_on_inputs": false,
	"trl": {
	"log_completions": false,
	"mask_truncated_completions": false,
	"ref_model_mixup_alpha": 0.9,
	"ref_model_sync_steps": 64,
	"scale_rewards": true,
	"sync_ref_model": false,
	"use_vllm": false,
	"vllm_server_host": "0.0.0.0",
	"vllm_server_port": 8000
	},
	"use_ray": false,
	"use_wandb": true,
	"val_set_size": 0.025,
	"vllm": {
	"device": "auto",
	"dtype": "auto",
	"gpu_memory_utilization": 0.9,
	"host": "0.0.0.0",
	"port": 8000
	},
	"wandb_name": "r32-qlora-stage3",
	"wandb_project": "MuseMarvin",
	"warmup_ratio": 0.025,
	"weight_decay": 0.01,
	"world_size": 1
	}
	[2025-10-12 23:56:52,947] [INFO] [axolotl.cli.utils.load.load_model_and_tokenizer:40] [PID:25175] loading tokenizer... ./merged-stage-1
	[2025-10-12 23:56:53,424] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:278] [PID:25175] EOS: 131072 / <\|im_end\|>
	[2025-10-12 23:56:53,424] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:279] [PID:25175] BOS: 1 / <s>
	[2025-10-12 23:56:53,424] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:280] [PID:25175] PAD: 10 / <pad>
	[2025-10-12 23:56:53,424] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:281] [PID:25175] UNK: 0 / <unk>
	[2025-10-12 23:56:53,425] [INFO] [axolotl.cli.utils.load.load_model_and_tokenizer:43] [PID:25175] loading model...
	[2025-10-12 23:56:53,432] [DEBUG] [axolotl.monkeypatch.transformers.trainer_loss_calc.patch_evaluation_loop:87] [PID:25175] Patched Trainer.evaluation_loop with nanmean loss calculation
	[2025-10-12 23:56:53,433] [DEBUG] [axolotl.monkeypatch.transformers.trainer_loss_calc.patch_maybe_log_save_evaluate:138] [PID:25175] Patched Trainer._maybe_log_save_evaluate with nanmean loss calculation
	[2025-10-12 23:56:53,446] [INFO] [axolotl.integrations.liger.plugin.pre_model_load:71] [PID:25175] Applying LIGER to mistral with kwargs: {'rope': True, 'cross_entropy': None, 'fused_linear_cross_entropy': None, 'rms_norm': True, 'swiglu': True}
	[2025-10-12 23:56:53,814] [INFO] [axolotl.integrations.cut_cross_entropy.pre_model_load:94] [PID:25175] Applying Cut Cross Entropy to model type: mistral
	Loading checkpoint shards: 0%\| \| 0/5 [00:00<?, ?it/s] Loading checkpoint shards: 20%\|██████████████ \| 1/5 [00:01<00:04, 1.00s/it] Loading checkpoint shards: 40%\|████████████████████████████ \| 2/5 [00:01<00:02, 1.15it/s] Loading checkpoint shards: 60%\|██████████████████████████████████████████ \| 3/5 [00:02<00:01, 1.21it/s] Loading checkpoint shards: 80%\|████████████████████████████████████████████████████████ \| 4/5 [00:03<00:00, 1.24it/s] Loading checkpoint shards: 100%\|██████████████████████████████████████████████████████████████████████\| 5/5 [00:04<00:00, 1.27it/s] Loading checkpoint shards: 100%\|██████████████████████████████████████████████████████████████████████\| 5/5 [00:04<00:00, 1.22it/s]
	[2025-10-12 23:56:58,379] [INFO] [axolotl.loaders.model._configure_embedding_dtypes:345] [PID:25175] Converting modules to torch.bfloat16
	[2025-10-12 23:56:58,382] [DEBUG] [axolotl.loaders.model.log_gpu_memory_usage:127] [PID:25175] Memory usage after model load 12.891GB (+12.891GB allocated, +12.904GB reserved)
	[2025-10-12 23:56:58,383] [INFO] [axolotl.loaders.adapter.load_lora:80] [PID:25175] found linear modules: ['down_proj', 'gate_proj', 'k_proj', 'o_proj', 'q_proj', 'up_proj', 'v_proj']
	[2025-10-12 23:56:58,383] [DEBUG] [axolotl.loaders.adapter.load_lora:143] [PID:25175] Loading pretrained PEFT - LoRA
	trainable params: 114,032,640 \|\| all params: 12,361,835,520 \|\| trainable%: 0.9225
	[2025-10-12 23:56:59,786] [DEBUG] [axolotl.loaders.model.log_gpu_memory_usage:127] [PID:25175] after adapters 11.007GB (+11.007GB allocated, +13.242GB reserved)
	[2025-10-12 23:57:00,457] [INFO] [axolotl.cli.merge_lora.do_merge_lora:27] [PID:25175] Running merge of LoRA with base model...
	Unloading and merging model: 0%\| \| 0/767 [00:00<?, ?it/s] Unloading and merging model: 16%\|█████████▌ \| 119/767 [00:00<00:00, 1189.49it/s] Unloading and merging model: 100%\|██████████████████████████████████████████████████████████████\| 767/767 [00:00<00:00, 4667.34it/s]
	[2025-10-12 23:57:00,628] [INFO] [axolotl.cli.merge_lora.do_merge_lora:40] [PID:25175] Saving merged model to: ckpts-stage-2/merged...