[2025-10-12 23:56:52,944] [DEBUG] [axolotl.utils.config.resolve_dtype:66] [PID:25175] bf16 support detected, enabling for this configuration.
[2025-10-12 23:56:52,947] [DEBUG] [axolotl.utils.config.log_gpu_memory_usage:127] [PID:25175] baseline 0.000GB ()
[2025-10-12 23:56:52,947] [INFO] [axolotl.cli.config.load_cfg:248] [PID:25175] config:
{
  "activation_offloading": false,
  "adapter": "qlora",
  "axolotl_config_path": "stage-3.yaml",
  "base_model": "./merged-stage-1",
  "base_model_config": "./merged-stage-1",
  "batch_size": 4,
  "bf16": true,
  "capabilities": {
    "bf16": true,
    "compute_capability": "sm_86",
    "fp8": false,
    "n_gpu": 1,
    "n_node": 1
  },
  "chat_template": "chatml",
  "context_parallel_size": 1,
  "cut_cross_entropy": true,
  "dataloader_num_workers": 1,
  "dataloader_pin_memory": true,
  "dataloader_prefetch_factor": 256,
  "dataset_prepared_path": "last_run_prepared",
  "dataset_processes": 24,
  "datasets": [
    {
      "chat_template": "tokenizer_default",
      "field_messages": "conversations",
      "message_property_mappings": {
        "content": "value",
        "role": "from"
      },
      "path": "little-koto-instruct.json",
      "trust_remote_code": false,
      "type": "chat_template"
    }
  ],
  "ddp": false,
  "device": "cuda:0",
  "device_map": "auto",
  "dion_rank_fraction": 1.0,
  "dion_rank_multiple_of": 1,
  "env_capabilities": {
    "torch_version": "2.7.1"
  },
  "eval_batch_size": 4,
  "eval_causal_lm_metrics": [
    "sacrebleu",
    "comet",
    "ter",
    "chrf"
  ],
  "eval_max_new_tokens": 128,
  "eval_steps": 0.1,
  "eval_table_size": 0,
  "evals_per_epoch": 10,
  "experimental_skip_move_to_device": true,
  "flash_attention": false,
  "fp16": false,
  "gc_steps": 10,
  "gradient_accumulation_steps": 1,
  "gradient_checkpointing": false,
  "group_by_length": false,
  "hub_model_id": "ToastyPigeon/muse-marvin-stage3-lora",
  "hub_strategy": "every_save",
  "include_tkps": true,
  "is_mistral_derived_model": true,
  "learning_rate": 2e-06,
  "liger_glu_activation": true,
  "liger_layer_norm": true,
  "liger_rms_norm": true,
  "liger_rope": true,
  "lisa_layers_attribute": "model.layers",
  "load_best_model_at_end": false,
  "load_in_4bit": false,
  "load_in_8bit": false,
  "local_rank": 0,
  "logging_steps": 1,
  "lora_alpha": 32,
  "lora_dropout": 0.1,
  "lora_r": 32,
  "lora_target_linear": true,
  "loraplus_lr_embedding": 1e-06,
  "lr_scheduler": "rex",
  "max_grad_norm": 1.0,
  "mean_resizing_embeddings": false,
  "merge_lora": true,
  "micro_batch_size": 4,
  "model_config_type": "mistral",
  "num_epochs": 1.0,
  "optimizer": "adamw_torch_fused",
  "output_dir": "ckpts-stage-2",
  "pad_to_sequence_len": false,
  "peft_use_rslora": false,
  "plugins": [
    "axolotl.integrations.liger.LigerPlugin",
    "axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin"
  ],
  "pretrain_multipack_attn": true,
  "profiler_steps_start": 0,
  "qlora_sharded_model_loading": false,
  "ray_num_workers": 1,
  "resources_per_worker": {
    "GPU": 1
  },
  "sample_packing": false,
  "sample_packing_bin_size": 200,
  "sample_packing_group_size": 100000,
  "save_only_model": false,
  "save_safetensors": true,
  "save_total_limit": 1,
  "saves_per_epoch": 1,
  "seed": 69,
  "sequence_len": 4096,
  "shuffle_before_merging_datasets": false,
  "shuffle_merged_datasets": true,
  "skip_prepare_dataset": false,
  "streaming_multipack_buffer_size": 10000,
  "strict": false,
  "tensor_parallel_size": 1,
  "tiled_mlp_use_original_mlp": true,
  "tokenizer_config": "./merged-stage-1",
  "tokenizer_save_jinja_files": true,
  "torch_dtype": "torch.bfloat16",
  "train_on_inputs": false,
  "trl": {
    "log_completions": false,
    "mask_truncated_completions": false,
    "ref_model_mixup_alpha": 0.9,
    "ref_model_sync_steps": 64,
    "scale_rewards": true,
    "sync_ref_model": false,
    "use_vllm": false,
    "vllm_server_host": "0.0.0.0",
    "vllm_server_port": 8000
  },
  "use_ray": false,
  "use_wandb": true,
  "val_set_size": 0.025,
  "vllm": {
    "device": "auto",
    "dtype": "auto",
    "gpu_memory_utilization": 0.9,
    "host": "0.0.0.0",
    "port": 8000
  },
  "wandb_name": "r32-qlora-stage3",
  "wandb_project": "MuseMarvin",
  "warmup_ratio": 0.025,
  "weight_decay": 0.01,
  "world_size": 1
}
[2025-10-12 23:56:52,947] [INFO] [axolotl.cli.utils.load.load_model_and_tokenizer:40] [PID:25175] loading tokenizer... ./merged-stage-1
[2025-10-12 23:56:53,424] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:278] [PID:25175] EOS: 131072 / <|im_end|>
[2025-10-12 23:56:53,424] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:279] [PID:25175] BOS: 1 / <s>
[2025-10-12 23:56:53,424] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:280] [PID:25175] PAD: 10 / <pad>
[2025-10-12 23:56:53,424] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:281] [PID:25175] UNK: 0 / <unk>
[2025-10-12 23:56:53,425] [INFO] [axolotl.cli.utils.load.load_model_and_tokenizer:43] [PID:25175] loading model...
[2025-10-12 23:56:53,432] [DEBUG] [axolotl.monkeypatch.transformers.trainer_loss_calc.patch_evaluation_loop:87] [PID:25175] Patched Trainer.evaluation_loop with nanmean loss calculation
[2025-10-12 23:56:53,433] [DEBUG] [axolotl.monkeypatch.transformers.trainer_loss_calc.patch_maybe_log_save_evaluate:138] [PID:25175] Patched Trainer._maybe_log_save_evaluate with nanmean loss calculation
[2025-10-12 23:56:53,446] [INFO] [axolotl.integrations.liger.plugin.pre_model_load:71] [PID:25175] Applying LIGER to mistral with kwargs: {'rope': True, 'cross_entropy': None, 'fused_linear_cross_entropy': None, 'rms_norm': True, 'swiglu': True}
[2025-10-12 23:56:53,814] [INFO] [axolotl.integrations.cut_cross_entropy.pre_model_load:94] [PID:25175] Applying Cut Cross Entropy to model type: mistral
Loading checkpoint shards:   0%|                                                                              | 0/5 [00:00<?, ?it/s]Loading checkpoint shards:  20%|██████████████                                                        | 1/5 [00:01<00:04,  1.00s/it]Loading checkpoint shards:  40%|████████████████████████████                                          | 2/5 [00:01<00:02,  1.15it/s]Loading checkpoint shards:  60%|██████████████████████████████████████████                            | 3/5 [00:02<00:01,  1.21it/s]Loading checkpoint shards:  80%|████████████████████████████████████████████████████████              | 4/5 [00:03<00:00,  1.24it/s]Loading checkpoint shards: 100%|██████████████████████████████████████████████████████████████████████| 5/5 [00:04<00:00,  1.27it/s]Loading checkpoint shards: 100%|██████████████████████████████████████████████████████████████████████| 5/5 [00:04<00:00,  1.22it/s]
[2025-10-12 23:56:58,379] [INFO] [axolotl.loaders.model._configure_embedding_dtypes:345] [PID:25175] Converting modules to torch.bfloat16
[2025-10-12 23:56:58,382] [DEBUG] [axolotl.loaders.model.log_gpu_memory_usage:127] [PID:25175] Memory usage after model load 12.891GB (+12.891GB allocated, +12.904GB reserved)
[2025-10-12 23:56:58,383] [INFO] [axolotl.loaders.adapter.load_lora:80] [PID:25175] found linear modules: ['down_proj', 'gate_proj', 'k_proj', 'o_proj', 'q_proj', 'up_proj', 'v_proj']
[2025-10-12 23:56:58,383] [DEBUG] [axolotl.loaders.adapter.load_lora:143] [PID:25175] Loading pretrained PEFT - LoRA
trainable params: 114,032,640 || all params: 12,361,835,520 || trainable%: 0.9225
[2025-10-12 23:56:59,786] [DEBUG] [axolotl.loaders.model.log_gpu_memory_usage:127] [PID:25175] after adapters 11.007GB (+11.007GB allocated, +13.242GB reserved)
[2025-10-12 23:57:00,457] [INFO] [axolotl.cli.merge_lora.do_merge_lora:27] [PID:25175] Running merge of LoRA with base model...
Unloading and merging model:   0%|                                                                          | 0/767 [00:00<?, ?it/s]Unloading and merging model:  16%|█████████▌                                                    | 119/767 [00:00<00:00, 1189.49it/s]Unloading and merging model: 100%|██████████████████████████████████████████████████████████████| 767/767 [00:00<00:00, 4667.34it/s]
[2025-10-12 23:57:00,628] [INFO] [axolotl.cli.merge_lora.do_merge_lora:40] [PID:25175] Saving merged model to: ckpts-stage-2/merged...