Gege24's picture
Upload task output 1
8f7e46d verified
[2026-01-30 09:56:15,990] [DEBUG] [axolotl.utils.config.resolve_dtype:66] [PID:301] bf16 support detected, enabling for this configuration.
[2026-01-30 09:56:15,993] [DEBUG] [axolotl.utils.config.log_gpu_memory_usage:127] [PID:301] baseline 0.000GB ()
[2026-01-30 09:56:15,993] [INFO] [axolotl.cli.config.load_cfg:259] [PID:301] config:
{
"activation_offloading": false,
"adapter": "lora",
"axolotl_config_path": "/workspace/axolotl/configs/1.yml",
"base_model": "/cache/models/Qwen--Qwen2.5-3B-Instruct",
"base_model_config": "/cache/models/Qwen--Qwen2.5-3B-Instruct",
"batch_size": 12,
"bf16": true,
"capabilities": {
"bf16": true,
"compute_capability": "sm_90",
"fp8": true,
"n_gpu": 2,
"n_node": 1
},
"chat_template": "llama3",
"context_parallel_size": 1,
"dataloader_num_workers": 2,
"dataloader_pin_memory": true,
"dataloader_prefetch_factor": 256,
"dataset_num_proc": 80,
"datasets": [
{
"data_files": [
"1_train_data.json"
],
"ds_type": "json",
"message_property_mappings": {
"content": "content",
"role": "role"
},
"path": "/workspace/axolotl/data",
"split": "train",
"trust_remote_code": false
}
],
"ddp": true,
"device": "cuda:0",
"device_map": {
"": 0
},
"dion_rank_fraction": 1.0,
"dion_rank_multiple_of": 1,
"eaft_alpha": 1.0,
"eaft_k": 20,
"env_capabilities": {
"torch_version": "2.4.0"
},
"eval_batch_size": 6,
"eval_causal_lm_metrics": [
"sacrebleu",
"comet",
"ter",
"chrf"
],
"eval_max_new_tokens": 128,
"eval_strategy": "no",
"eval_table_size": 0,
"experimental_skip_move_to_device": true,
"flash_attention": false,
"fp16": false,
"gradient_accumulation_steps": 1,
"gradient_checkpointing": false,
"group_by_length": false,
"include_tkps": true,
"is_falcon_derived_model": false,
"is_llama_derived_model": false,
"is_mistral_derived_model": false,
"learning_rate": 1e-05,
"lisa_layers_attribute": "model.layers",
"load_best_model_at_end": false,
"load_in_4bit": false,
"load_in_8bit": false,
"local_rank": 0,
"logging_steps": 1,
"lora_alpha": 64,
"lora_dropout": 0.0,
"lora_r": 64,
"lora_target_linear": true,
"loraplus_lr_embedding": 1e-06,
"lr_scheduler": "cosine",
"max_grad_norm": 1.0,
"max_steps": 100000,
"mean_resizing_embeddings": false,
"micro_batch_size": 6,
"model_config_type": "qwen2",
"num_epochs": 1.0,
"optimizer": "adamw_bnb_8bit",
"otel_metrics_host": "localhost",
"otel_metrics_port": 8000,
"output_dir": "/app/checkpoints/1/environment_test_affinedasasd",
"pad_to_sequence_len": true,
"pretrain_multipack_attn": true,
"profiler_steps_start": 0,
"qlora_sharded_model_loading": false,
"ray_num_workers": 1,
"resources_per_worker": {
"GPU": 1
},
"rl": "grpo",
"sample_packing": false,
"sample_packing_bin_size": 200,
"sample_packing_group_size": 100000,
"save_only_model": false,
"save_safetensors": true,
"save_steps": 10,
"save_total_limit": 1,
"sequence_len": 24000,
"shuffle_before_merging_datasets": false,
"shuffle_merged_datasets": true,
"skip_prepare_dataset": false,
"special_tokens": {
"bos_token": "<|im_end|>"
},
"streaming_multipack_buffer_size": 10000,
"strict": false,
"tensor_parallel_size": 1,
"tf32": false,
"tiled_mlp_use_original_mlp": true,
"tokenizer_config": "/cache/models/Qwen--Qwen2.5-3B-Instruct",
"tokenizer_save_jinja_files": true,
"tokenizer_type": "AutoTokenizer",
"torch_dtype": "torch.bfloat16",
"train_on_inputs": false,
"trl": {
"beta": 0.001,
"log_completions": false,
"mask_truncated_completions": false,
"max_completion_length": 512,
"num_generations": 6,
"ref_model_mixup_alpha": 0.9,
"ref_model_sync_steps": 64,
"reward_funcs": [
"affine_game.rollout_reward_func"
],
"reward_weights": [
1.0
],
"rollout_func": "affine_game.rollout_first_prompt_and_completion",
"scale_rewards": true,
"sync_ref_model": false,
"temperature": 0.7,
"use_vllm": true,
"vllm_enable_sleep_mode": false,
"vllm_mode": "colocate",
"vllm_server_host": "0.0.0.0",
"vllm_server_port": 8000
},
"trust_remote_code": true,
"type_of_model": "AutoModelForCausalLM",
"use_mlflow": false,
"use_otel_metrics": false,
"use_ray": false,
"use_wandb": true,
"val_set_size": 0.0,
"vllm": {
"device": "auto",
"dtype": "auto",
"enable_prefix_caching": false,
"gpu_memory_utilization": 0.15,
"host": "0.0.0.0",
"max_model_len": 24000,
"port": 8000,
"tensor_parallel_size": 1
},
"wandb_mode": "online",
"wandb_name": "1_environment_test_affinedasasd",
"wandb_project": "Affine-GAME-Tests",
"warmup_steps": 20,
"weight_decay": 0.0,
"world_size": 2
}
[2026-01-30 09:56:16,721] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:285] [PID:301] EOS: 151645 / <|im_end|>
[2026-01-30 09:56:16,721] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:286] [PID:301] BOS: 151645 / <|im_end|>
[2026-01-30 09:56:16,721] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:287] [PID:301] PAD: 151643 / <|endoftext|>
[2026-01-30 09:56:16,721] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:288] [PID:301] UNK: None / None
[2026-01-30 09:57:39,263] [INFO] [axolotl.utils.data.shared.load_preprocessed_dataset:481] [PID:301] Unable to find prepared dataset in last_run_prepared/ba0ae834220c702ae7aefbdbfde66c85
[2026-01-30 09:57:40,207] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:285] [PID:301] EOS: 151645 / <|im_end|>
[2026-01-30 09:57:40,207] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:286] [PID:301] BOS: 151645 / <|im_end|>
[2026-01-30 09:57:40,207] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:287] [PID:301] PAD: 151643 / <|endoftext|>
[2026-01-30 09:57:40,207] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:288] [PID:301] UNK: None / None
Saving the dataset (0/3 shards): 0%| | 0/1000 [00:00<?, ? examples/s] Saving the dataset (0/3 shards): 33%|β–ˆβ–ˆβ–ˆβ–Ž | 334/1000 [00:00<00:00, 2874.63 examples/s] Saving the dataset (1/3 shards): 33%|β–ˆβ–ˆβ–ˆβ–Ž | 334/1000 [00:00<00:00, 2874.63 examples/s] Saving the dataset (2/3 shards): 67%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹ | 667/1000 [00:00<00:00, 2874.63 examples/s] Saving the dataset (3/3 shards): 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 1000/1000 [00:00<00:00, 2874.63 examples/s] Saving the dataset (3/3 shards): 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 1000/1000 [00:00<00:00, 4982.82 examples/s]
[2026-01-30 09:57:41,815] [DEBUG] [axolotl.train.setup_model_and_tokenizer:70] [PID:301] loading tokenizer... /cache/models/Qwen--Qwen2.5-3B-Instruct
[2026-01-30 09:57:42,451] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:285] [PID:301] EOS: 151645 / <|im_end|>
[2026-01-30 09:57:42,452] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:286] [PID:301] BOS: 151645 / <|im_end|>
[2026-01-30 09:57:42,452] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:287] [PID:301] PAD: 151643 / <|endoftext|>
[2026-01-30 09:57:42,452] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:288] [PID:301] UNK: None / None
[2026-01-30 09:57:42,452] [DEBUG] [axolotl.train.setup_model_and_tokenizer:82] [PID:301] Loading model
[2026-01-30 09:57:42,461] [DEBUG] [axolotl.monkeypatch.transformers.trainer_loss_calc.patch_evaluation_loop:87] [PID:301] Patched Trainer.evaluation_loop with nanmean loss calculation
[2026-01-30 09:57:42,463] [DEBUG] [axolotl.monkeypatch.transformers.trainer_loss_calc.patch_maybe_log_save_evaluate:138] [PID:301] Patched Trainer._maybe_log_save_evaluate with nanmean loss calculation
Loading weights: 0%| | 0/434 [00:00<?, ?it/s] Loading weights: 0%| | 1/434 [00:00<00:00, 6574.14it/s, Materializing param=model.embed_tokens.weight] Loading weights: 0%| | 1/434 [00:00<00:00, 4084.04it/s, Materializing param=model.embed_tokens.weight] Loading weights: 0%| | 2/434 [00:00<00:12, 35.85it/s, Materializing param=model.layers.0.input_layernorm.weight] Loading weights: 0%| | 2/434 [00:00<00:12, 35.74it/s, Materializing param=model.layers.0.input_layernorm.weight] Loading weights: 1%| | 3/434 [00:00<00:08, 53.43it/s, Materializing param=model.layers.0.mlp.down_proj.weight] Loading weights: 1%| | 3/434 [00:00<00:08, 53.37it/s, Materializing param=model.layers.0.mlp.down_proj.weight] Loading weights: 1%| | 4/434 [00:00<00:06, 63.18it/s, Materializing param=model.layers.0.mlp.gate_proj.weight] Loading weights: 1%| | 4/434 [00:00<00:06, 63.11it/s, Materializing param=model.layers.0.mlp.gate_proj.weight] Loading weights: 1%| | 5/434 [00:00<00:06, 70.12it/s, Materializing param=model.layers.0.mlp.up_proj.weight] Loading weights: 1%| | 5/434 [00:00<00:06, 70.07it/s, Materializing param=model.layers.0.mlp.up_proj.weight] Loading weights: 1%|▏ | 6/434 [00:00<00:05, 78.73it/s, Materializing param=model.layers.0.post_attention_layernorm.weight] Loading weights: 1%|▏ | 6/434 [00:00<00:05, 78.65it/s, Materializing param=model.layers.0.post_attention_layernorm.weight] Loading weights: 2%|▏ | 7/434 [00:00<00:04, 91.62it/s, Materializing param=model.layers.0.self_attn.k_proj.bias] Loading weights: 2%|▏ | 7/434 [00:00<00:04, 91.56it/s, Materializing param=model.layers.0.self_attn.k_proj.bias] Loading weights: 2%|▏ | 8/434 [00:00<00:04, 104.43it/s, Materializing param=model.layers.0.self_attn.k_proj.weight] Loading weights: 2%|▏ | 8/434 [00:00<00:04, 104.34it/s, Materializing param=model.layers.0.self_attn.k_proj.weight] Loading weights: 2%|▏ | 9/434 [00:00<00:03, 117.26it/s, Materializing param=model.layers.0.self_attn.o_proj.weight] Loading weights: 2%|▏ | 9/434 [00:00<00:03, 117.19it/s, Materializing param=model.layers.0.self_attn.o_proj.weight] Loading weights: 2%|▏ | 10/434 [00:00<00:03, 128.69it/s, Materializing param=model.layers.0.self_attn.q_proj.bias] Loading weights: 2%|▏ | 10/434 [00:00<00:03, 128.59it/s, Materializing param=model.layers.0.self_attn.q_proj.bias] Loading weights: 3%|β–Ž | 11/434 [00:00<00:02, 141.31it/s, Materializing param=model.layers.0.self_attn.q_proj.weight] Loading weights: 3%|β–Ž | 11/434 [00:00<00:02, 141.20it/s, Materializing param=model.layers.0.self_attn.q_proj.weight] Loading weights: 3%|β–Ž | 12/434 [00:00<00:02, 151.43it/s, Materializing param=model.layers.0.self_attn.v_proj.bias] Loading weights: 3%|β–Ž | 12/434 [00:00<00:02, 151.27it/s, Materializing param=model.layers.0.self_attn.v_proj.bias] Loading weights: 3%|β–Ž | 13/434 [00:00<00:02, 161.98it/s, Materializing param=model.layers.0.self_attn.v_proj.weight] Loading weights: 3%|β–Ž | 13/434 [00:00<00:02, 161.82it/s, Materializing param=model.layers.0.self_attn.v_proj.weight] Loading weights: 3%|β–Ž | 14/434 [00:00<00:02, 174.08it/s, Materializing param=model.layers.1.input_layernorm.weight] Loading weights: 3%|β–Ž | 14/434 [00:00<00:02, 173.99it/s, Materializing param=model.layers.1.input_layernorm.weight] Loading weights: 3%|β–Ž | 15/434 [00:00<00:02, 186.25it/s, Materializing param=model.layers.1.mlp.down_proj.weight] Loading weights: 3%|β–Ž | 15/434 [00:00<00:02, 186.16it/s, Materializing param=model.layers.1.mlp.down_proj.weight] Loading weights: 4%|β–Ž | 16/434 [00:00<00:02, 182.99it/s, Materializing param=model.layers.1.mlp.gate_proj.weight] Loading weights: 4%|β–Ž | 16/434 [00:00<00:02, 182.79it/s, Materializing param=model.layers.1.mlp.gate_proj.weight] Loading weights: 4%|▍ | 17/434 [00:00<00:02, 164.86it/s, Materializing param=model.layers.1.mlp.gate_proj.weight] Loading weights: 4%|▍ | 17/434 [00:00<00:02, 164.86it/s, Materializing param=model.layers.1.mlp.up_proj.weight] Loading weights: 4%|▍ | 17/434 [00:00<00:02, 164.86it/s, Materializing param=model.layers.1.mlp.up_proj.weight] Loading weights: 4%|▍ | 18/434 [00:00<00:02, 164.86it/s, Materializing param=model.layers.1.post_attention_layernorm.weight] Loading weights: 4%|▍ | 18/434 [00:00<00:02, 164.86it/s, Materializing param=model.layers.1.post_attention_layernorm.weight] Loading weights: 4%|▍ | 19/434 [00:00<00:02, 164.86it/s, Materializing param=model.layers.1.self_attn.k_proj.bias] Loading weights: 4%|▍ | 19/434 [00:00<00:02, 164.86it/s, Materializing param=model.layers.1.self_attn.k_proj.bias] Loading weights: 5%|▍ | 20/434 [00:00<00:02, 164.86it/s, Materializing param=model.layers.1.self_attn.k_proj.weight] Loading weights: 5%|▍ | 20/434 [00:00<00:02, 164.86it/s, Materializing param=model.layers.1.self_attn.k_proj.weight] Loading weights: 5%|▍ | 21/434 [00:00<00:02, 164.86it/s, Materializing param=model.layers.1.self_attn.o_proj.weight] Loading weights: 5%|▍ | 21/434 [00:00<00:02, 164.86it/s, Materializing param=model.layers.1.self_attn.o_proj.weight] Loading weights: 5%|β–Œ | 22/434 [00:00<00:02, 164.86it/s, Materializing param=model.layers.1.self_attn.q_proj.bias] Loading weights: 5%|β–Œ | 22/434 [00:00<00:02, 164.86it/s, Materializing param=model.layers.1.self_attn.q_proj.bias] Loading weights: 5%|β–Œ | 23/434 [00:00<00:02, 164.86it/s, Materializing param=model.layers.1.self_attn.q_proj.weight] Loading weights: 5%|β–Œ | 23/434 [00:00<00:02, 164.86it/s, Materializing param=model.layers.1.self_attn.q_proj.weight] Loading weights: 6%|β–Œ | 24/434 [00:00<00:02, 164.86it/s, Materializing param=model.layers.1.self_attn.v_proj.bias] Loading weights: 6%|β–Œ | 24/434 [00:00<00:02, 164.86it/s, Materializing param=model.layers.1.self_attn.v_proj.bias] Loading weights: 6%|β–Œ | 25/434 [00:00<00:02, 164.86it/s, Materializing param=model.layers.1.self_attn.v_proj.weight] Loading weights: 6%|β–Œ | 25/434 [00:00<00:02, 164.86it/s, Materializing param=model.layers.1.self_attn.v_proj.weight] Loading weights: 6%|β–Œ | 26/434 [00:00<00:02, 164.86it/s, Materializing param=model.layers.2.input_layernorm.weight] Loading weights: 6%|β–Œ | 26/434 [00:00<00:02, 164.86it/s, Materializing param=model.layers.2.input_layernorm.weight] Loading weights: 6%|β–Œ | 27/434 [00:00<00:02, 164.86it/s, Materializing param=model.layers.2.mlp.down_proj.weight] Loading weights: 6%|β–Œ | 27/434 [00:00<00:02, 164.86it/s, Materializing param=model.layers.2.mlp.down_proj.weight] Loading weights: 6%|β–‹ | 28/434 [00:00<00:02, 164.86it/s, Materializing param=model.layers.2.mlp.gate_proj.weight] Loading weights: 6%|β–‹ | 28/434 [00:00<00:02, 164.86it/s, Materializing param=model.layers.2.mlp.gate_proj.weight] Loading weights: 7%|β–‹ | 29/434 [00:00<00:02, 164.86it/s, Materializing param=model.layers.2.mlp.up_proj.weight] Loading weights: 7%|β–‹ | 29/434 [00:00<00:02, 164.86it/s, Materializing param=model.layers.2.mlp.up_proj.weight] Loading weights: 7%|β–‹ | 30/434 [00:00<00:02, 164.86it/s, Materializing param=model.layers.2.post_attention_layernorm.weight] Loading weights: 7%|β–‹ | 30/434 [00:00<00:02, 164.86it/s, Materializing param=model.layers.2.post_attention_layernorm.weight] Loading weights: 7%|β–‹ | 31/434 [00:00<00:02, 164.86it/s, Materializing param=model.layers.2.self_attn.k_proj.bias] Loading weights: 7%|β–‹ | 31/434 [00:00<00:02, 164.86it/s, Materializing param=model.layers.2.self_attn.k_proj.bias] Loading weights: 7%|β–‹ | 32/434 [00:00<00:02, 164.86it/s, Materializing param=model.layers.2.self_attn.k_proj.weight] Loading weights: 7%|β–‹ | 32/434 [00:00<00:02, 164.86it/s, Materializing param=model.layers.2.self_attn.k_proj.weight] Loading weights: 8%|β–Š | 33/434 [00:00<00:02, 164.86it/s, Materializing param=model.layers.2.self_attn.o_proj.weight] Loading weights: 8%|β–Š | 33/434 [00:00<00:02, 164.86it/s, Materializing param=model.layers.2.self_attn.o_proj.weight] Loading weights: 8%|β–Š | 34/434 [00:00<00:02, 164.86it/s, Materializing param=model.layers.2.self_attn.q_proj.bias] Loading weights: 8%|β–Š | 34/434 [00:00<00:02, 164.86it/s, Materializing param=model.layers.2.self_attn.q_proj.bias] Loading weights: 8%|β–Š | 35/434 [00:00<00:02, 164.86it/s, Materializing param=model.layers.2.self_attn.q_proj.weight] Loading weights: 8%|β–Š | 35/434 [00:00<00:02, 164.86it/s, Materializing param=model.layers.2.self_attn.q_proj.weight] Loading weights: 8%|β–Š | 36/434 [00:00<00:02, 164.86it/s, Materializing param=model.layers.2.self_attn.v_proj.bias] Loading weights: 8%|β–Š | 36/434 [00:00<00:02, 164.86it/s, Materializing param=model.layers.2.self_attn.v_proj.bias] Loading weights: 9%|β–Š | 37/434 [00:00<00:02, 164.86it/s, Materializing param=model.layers.2.self_attn.v_proj.weight] Loading weights: 9%|β–Š | 37/434 [00:00<00:02, 164.86it/s, Materializing param=model.layers.2.self_attn.v_proj.weight] Loading weights: 9%|β–‰ | 38/434 [00:00<00:02, 164.86it/s, Materializing param=model.layers.3.input_layernorm.weight] Loading weights: 9%|β–‰ | 38/434 [00:00<00:02, 164.86it/s, Materializing param=model.layers.3.input_layernorm.weight] Loading weights: 9%|β–‰ | 39/434 [00:00<00:02, 164.86it/s, Materializing param=model.layers.3.mlp.down_proj.weight] Loading weights: 9%|β–‰ | 39/434 [00:00<00:02, 164.86it/s, Materializing param=model.layers.3.mlp.down_proj.weight] Loading weights: 9%|β–‰ | 40/434 [00:00<00:02, 164.86it/s, Materializing param=model.layers.3.mlp.gate_proj.weight] Loading weights: 9%|β–‰ | 40/434 [00:00<00:02, 164.86it/s, Materializing param=model.layers.3.mlp.gate_proj.weight] Loading weights: 9%|β–‰ | 41/434 [00:00<00:02, 164.86it/s, Materializing param=model.layers.3.mlp.up_proj.weight] Loading weights: 9%|β–‰ | 41/434 [00:00<00:02, 164.86it/s, Materializing param=model.layers.3.mlp.up_proj.weight] Loading weights: 10%|β–‰ | 42/434 [00:00<00:02, 164.86it/s, Materializing param=model.layers.3.post_attention_layernorm.weight] Loading weights: 10%|β–‰ | 42/434 [00:00<00:02, 164.86it/s, Materializing param=model.layers.3.post_attention_layernorm.weight] Loading weights: 10%|β–‰ | 43/434 [00:00<00:02, 164.86it/s, Materializing param=model.layers.3.self_attn.k_proj.bias] Loading weights: 10%|β–‰ | 43/434 [00:00<00:02, 164.86it/s, Materializing param=model.layers.3.self_attn.k_proj.bias] Loading weights: 10%|β–ˆ | 44/434 [00:00<00:02, 164.86it/s, Materializing param=model.layers.3.self_attn.k_proj.weight] Loading weights: 10%|β–ˆ | 44/434 [00:00<00:02, 164.86it/s, Materializing param=model.layers.3.self_attn.k_proj.weight] Loading weights: 10%|β–ˆ | 45/434 [00:00<00:02, 164.86it/s, Materializing param=model.layers.3.self_attn.o_proj.weight] Loading weights: 10%|β–ˆ | 45/434 [00:00<00:02, 164.86it/s, Materializing param=model.layers.3.self_attn.o_proj.weight] Loading weights: 11%|β–ˆ | 46/434 [00:00<00:02, 164.86it/s, Materializing param=model.layers.3.self_attn.q_proj.bias] Loading weights: 11%|β–ˆ | 46/434 [00:00<00:02, 164.86it/s, Materializing param=model.layers.3.self_attn.q_proj.bias] Loading weights: 11%|β–ˆ | 47/434 [00:00<00:02, 164.86it/s, Materializing param=model.layers.3.self_attn.q_proj.weight] Loading weights: 11%|β–ˆ | 47/434 [00:00<00:02, 164.86it/s, Materializing param=model.layers.3.self_attn.q_proj.weight] Loading weights: 11%|β–ˆ | 48/434 [00:00<00:01, 248.98it/s, Materializing param=model.layers.3.self_attn.q_proj.weight] Loading weights: 11%|β–ˆ | 48/434 [00:00<00:01, 248.98it/s, Materializing param=model.layers.3.self_attn.v_proj.bias] Loading weights: 11%|β–ˆ | 48/434 [00:00<00:01, 248.98it/s, Materializing param=model.layers.3.self_attn.v_proj.bias] Loading weights: 11%|β–ˆβ– | 49/434 [00:00<00:01, 248.98it/s, Materializing param=model.layers.3.self_attn.v_proj.weight] Loading weights: 11%|β–ˆβ– | 49/434 [00:00<00:01, 248.98it/s, Materializing param=model.layers.3.self_attn.v_proj.weight] Loading weights: 12%|β–ˆβ– | 50/434 [00:00<00:01, 248.98it/s, Materializing param=model.layers.4.input_layernorm.weight] Loading weights: 12%|β–ˆβ– | 50/434 [00:00<00:01, 248.98it/s, Materializing param=model.layers.4.input_layernorm.weight] Loading weights: 12%|β–ˆβ– | 51/434 [00:00<00:01, 248.98it/s, Materializing param=model.layers.4.mlp.down_proj.weight] Loading weights: 12%|β–ˆβ– | 51/434 [00:00<00:01, 248.98it/s, Materializing param=model.layers.4.mlp.down_proj.weight] Loading weights: 12%|β–ˆβ– | 52/434 [00:00<00:01, 248.98it/s, Materializing param=model.layers.4.mlp.gate_proj.weight] Loading weights: 12%|β–ˆβ– | 52/434 [00:00<00:01, 248.98it/s, Materializing param=model.layers.4.mlp.gate_proj.weight] Loading weights: 12%|β–ˆβ– | 53/434 [00:00<00:01, 248.98it/s, Materializing param=model.layers.4.mlp.up_proj.weight] Loading weights: 12%|β–ˆβ– | 53/434 [00:00<00:01, 248.98it/s, Materializing param=model.layers.4.mlp.up_proj.weight] Loading weights: 12%|β–ˆβ– | 54/434 [00:00<00:01, 248.98it/s, Materializing param=model.layers.4.post_attention_layernorm.weight] Loading weights: 12%|β–ˆβ– | 54/434 [00:00<00:01, 248.98it/s, Materializing param=model.layers.4.post_attention_layernorm.weight] Loading weights: 13%|β–ˆβ–Ž | 55/434 [00:00<00:01, 248.98it/s, Materializing param=model.layers.4.self_attn.k_proj.bias] Loading weights: 13%|β–ˆβ–Ž | 55/434 [00:00<00:01, 248.98it/s, Materializing param=model.layers.4.self_attn.k_proj.bias] Loading weights: 13%|β–ˆβ–Ž | 56/434 [00:00<00:01, 248.98it/s, Materializing param=model.layers.4.self_attn.k_proj.weight] Loading weights: 13%|β–ˆβ–Ž | 56/434 [00:00<00:01, 248.98it/s, Materializing param=model.layers.4.self_attn.k_proj.weight] Loading weights: 13%|β–ˆβ–Ž | 57/434 [00:00<00:01, 248.98it/s, Materializing param=model.layers.4.self_attn.o_proj.weight] Loading weights: 13%|β–ˆβ–Ž | 57/434 [00:00<00:01, 248.98it/s, Materializing param=model.layers.4.self_attn.o_proj.weight] Loading weights: 13%|β–ˆβ–Ž | 58/434 [00:00<00:01, 248.98it/s, Materializing param=model.layers.4.self_attn.q_proj.bias] Loading weights: 13%|β–ˆβ–Ž | 58/434 [00:00<00:01, 248.98it/s, Materializing param=model.layers.4.self_attn.q_proj.bias] Loading weights: 14%|β–ˆβ–Ž | 59/434 [00:00<00:01, 248.98it/s, Materializing param=model.layers.4.self_attn.q_proj.weight] Loading weights: 14%|β–ˆβ–Ž | 59/434 [00:00<00:01, 248.98it/s, Materializing param=model.layers.4.self_attn.q_proj.weight] Loading weights: 14%|β–ˆβ– | 60/434 [00:00<00:01, 248.98it/s, Materializing param=model.layers.4.self_attn.v_proj.bias] Loading weights: 14%|β–ˆβ– | 60/434 [00:00<00:01, 248.98it/s, Materializing param=model.layers.4.self_attn.v_proj.bias] Loading weights: 14%|β–ˆβ– | 61/434 [00:00<00:01, 248.98it/s, Materializing param=model.layers.4.self_attn.v_proj.weight] Loading weights: 14%|β–ˆβ– | 61/434 [00:00<00:01, 248.98it/s, Materializing param=model.layers.4.self_attn.v_proj.weight] Loading weights: 14%|β–ˆβ– | 62/434 [00:00<00:01, 248.98it/s, Materializing param=model.layers.5.input_layernorm.weight] Loading weights: 14%|β–ˆβ– | 62/434 [00:00<00:01, 248.98it/s, Materializing param=model.layers.5.input_layernorm.weight] Loading weights: 15%|β–ˆβ– | 63/434 [00:00<00:01, 248.98it/s, Materializing param=model.layers.5.mlp.down_proj.weight] Loading weights: 15%|β–ˆβ– | 63/434 [00:00<00:01, 248.98it/s, Materializing param=model.layers.5.mlp.down_proj.weight] Loading weights: 15%|β–ˆβ– | 64/434 [00:00<00:01, 248.98it/s, Materializing param=model.layers.5.mlp.gate_proj.weight] Loading weights: 15%|β–ˆβ– | 64/434 [00:00<00:01, 248.98it/s, Materializing param=model.layers.5.mlp.gate_proj.weight] Loading weights: 15%|β–ˆβ– | 65/434 [00:00<00:01, 248.98it/s, Materializing param=model.layers.5.mlp.up_proj.weight] Loading weights: 15%|β–ˆβ– | 65/434 [00:00<00:01, 248.98it/s, Materializing param=model.layers.5.mlp.up_proj.weight] Loading weights: 15%|β–ˆβ–Œ | 66/434 [00:00<00:01, 248.98it/s, Materializing param=model.layers.5.post_attention_layernorm.weight] Loading weights: 15%|β–ˆβ–Œ | 66/434 [00:00<00:01, 248.98it/s, Materializing param=model.layers.5.post_attention_layernorm.weight] Loading weights: 15%|β–ˆβ–Œ | 67/434 [00:00<00:01, 248.98it/s, Materializing param=model.layers.5.self_attn.k_proj.bias] Loading weights: 15%|β–ˆβ–Œ | 67/434 [00:00<00:01, 248.98it/s, Materializing param=model.layers.5.self_attn.k_proj.bias] Loading weights: 16%|β–ˆβ–Œ | 68/434 [00:00<00:01, 248.98it/s, Materializing param=model.layers.5.self_attn.k_proj.weight] Loading weights: 16%|β–ˆβ–Œ | 68/434 [00:00<00:01, 248.98it/s, Materializing param=model.layers.5.self_attn.k_proj.weight] Loading weights: 16%|β–ˆβ–Œ | 69/434 [00:00<00:01, 248.98it/s, Materializing param=model.layers.5.self_attn.o_proj.weight] Loading weights: 16%|β–ˆβ–Œ | 69/434 [00:00<00:01, 248.98it/s, Materializing param=model.layers.5.self_attn.o_proj.weight]