| [2026-01-30 09:56:15,990] [DEBUG] [axolotl.utils.config.resolve_dtype:66] [PID:301] bf16 support detected, enabling for this configuration. |
| [2026-01-30 09:56:15,993] [DEBUG] [axolotl.utils.config.log_gpu_memory_usage:127] [PID:301] baseline 0.000GB () |
| [2026-01-30 09:56:15,993] [INFO] [axolotl.cli.config.load_cfg:259] [PID:301] config: |
| { |
| "activation_offloading": false, |
| "adapter": "lora", |
| "axolotl_config_path": "/workspace/axolotl/configs/1.yml", |
| "base_model": "/cache/models/Qwen--Qwen2.5-3B-Instruct", |
| "base_model_config": "/cache/models/Qwen--Qwen2.5-3B-Instruct", |
| "batch_size": 12, |
| "bf16": true, |
| "capabilities": { |
| "bf16": true, |
| "compute_capability": "sm_90", |
| "fp8": true, |
| "n_gpu": 2, |
| "n_node": 1 |
| }, |
| "chat_template": "llama3", |
| "context_parallel_size": 1, |
| "dataloader_num_workers": 2, |
| "dataloader_pin_memory": true, |
| "dataloader_prefetch_factor": 256, |
| "dataset_num_proc": 80, |
| "datasets": [ |
| { |
| "data_files": [ |
| "1_train_data.json" |
| ], |
| "ds_type": "json", |
| "message_property_mappings": { |
| "content": "content", |
| "role": "role" |
| }, |
| "path": "/workspace/axolotl/data", |
| "split": "train", |
| "trust_remote_code": false |
| } |
| ], |
| "ddp": true, |
| "device": "cuda:0", |
| "device_map": { |
| "": 0 |
| }, |
| "dion_rank_fraction": 1.0, |
| "dion_rank_multiple_of": 1, |
| "eaft_alpha": 1.0, |
| "eaft_k": 20, |
| "env_capabilities": { |
| "torch_version": "2.4.0" |
| }, |
| "eval_batch_size": 6, |
| "eval_causal_lm_metrics": [ |
| "sacrebleu", |
| "comet", |
| "ter", |
| "chrf" |
| ], |
| "eval_max_new_tokens": 128, |
| "eval_strategy": "no", |
| "eval_table_size": 0, |
| "experimental_skip_move_to_device": true, |
| "flash_attention": false, |
| "fp16": false, |
| "gradient_accumulation_steps": 1, |
| "gradient_checkpointing": false, |
| "group_by_length": false, |
| "include_tkps": true, |
| "is_falcon_derived_model": false, |
| "is_llama_derived_model": false, |
| "is_mistral_derived_model": false, |
| "learning_rate": 1e-05, |
| "lisa_layers_attribute": "model.layers", |
| "load_best_model_at_end": false, |
| "load_in_4bit": false, |
| "load_in_8bit": false, |
| "local_rank": 0, |
| "logging_steps": 1, |
| "lora_alpha": 64, |
| "lora_dropout": 0.0, |
| "lora_r": 64, |
| "lora_target_linear": true, |
| "loraplus_lr_embedding": 1e-06, |
| "lr_scheduler": "cosine", |
| "max_grad_norm": 1.0, |
| "max_steps": 100000, |
| "mean_resizing_embeddings": false, |
| "micro_batch_size": 6, |
| "model_config_type": "qwen2", |
| "num_epochs": 1.0, |
| "optimizer": "adamw_bnb_8bit", |
| "otel_metrics_host": "localhost", |
| "otel_metrics_port": 8000, |
| "output_dir": "/app/checkpoints/1/environment_test_affinedasasd", |
| "pad_to_sequence_len": true, |
| "pretrain_multipack_attn": true, |
| "profiler_steps_start": 0, |
| "qlora_sharded_model_loading": false, |
| "ray_num_workers": 1, |
| "resources_per_worker": { |
| "GPU": 1 |
| }, |
| "rl": "grpo", |
| "sample_packing": false, |
| "sample_packing_bin_size": 200, |
| "sample_packing_group_size": 100000, |
| "save_only_model": false, |
| "save_safetensors": true, |
| "save_steps": 10, |
| "save_total_limit": 1, |
| "sequence_len": 24000, |
| "shuffle_before_merging_datasets": false, |
| "shuffle_merged_datasets": true, |
| "skip_prepare_dataset": false, |
| "special_tokens": { |
| "bos_token": "<|im_end|>" |
| }, |
| "streaming_multipack_buffer_size": 10000, |
| "strict": false, |
| "tensor_parallel_size": 1, |
| "tf32": false, |
| "tiled_mlp_use_original_mlp": true, |
| "tokenizer_config": "/cache/models/Qwen--Qwen2.5-3B-Instruct", |
| "tokenizer_save_jinja_files": true, |
| "tokenizer_type": "AutoTokenizer", |
| "torch_dtype": "torch.bfloat16", |
| "train_on_inputs": false, |
| "trl": { |
| "beta": 0.001, |
| "log_completions": false, |
| "mask_truncated_completions": false, |
| "max_completion_length": 512, |
| "num_generations": 6, |
| "ref_model_mixup_alpha": 0.9, |
| "ref_model_sync_steps": 64, |
| "reward_funcs": [ |
| "affine_game.rollout_reward_func" |
| ], |
| "reward_weights": [ |
| 1.0 |
| ], |
| "rollout_func": "affine_game.rollout_first_prompt_and_completion", |
| "scale_rewards": true, |
| "sync_ref_model": false, |
| "temperature": 0.7, |
| "use_vllm": true, |
| "vllm_enable_sleep_mode": false, |
| "vllm_mode": "colocate", |
| "vllm_server_host": "0.0.0.0", |
| "vllm_server_port": 8000 |
| }, |
| "trust_remote_code": true, |
| "type_of_model": "AutoModelForCausalLM", |
| "use_mlflow": false, |
| "use_otel_metrics": false, |
| "use_ray": false, |
| "use_wandb": true, |
| "val_set_size": 0.0, |
| "vllm": { |
| "device": "auto", |
| "dtype": "auto", |
| "enable_prefix_caching": false, |
| "gpu_memory_utilization": 0.15, |
| "host": "0.0.0.0", |
| "max_model_len": 24000, |
| "port": 8000, |
| "tensor_parallel_size": 1 |
| }, |
| "wandb_mode": "online", |
| "wandb_name": "1_environment_test_affinedasasd", |
| "wandb_project": "Affine-GAME-Tests", |
| "warmup_steps": 20, |
| "weight_decay": 0.0, |
| "world_size": 2 |
| } |
| [2026-01-30 09:56:16,721] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:285] [PID:301] EOS: 151645 / <|im_end|> |
| [2026-01-30 09:56:16,721] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:286] [PID:301] BOS: 151645 / <|im_end|> |
| [2026-01-30 09:56:16,721] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:287] [PID:301] PAD: 151643 / <|endoftext|> |
| [2026-01-30 09:56:16,721] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:288] [PID:301] UNK: None / None |
| [2026-01-30 09:57:39,263] [INFO] [axolotl.utils.data.shared.load_preprocessed_dataset:481] [PID:301] Unable to find prepared dataset in last_run_prepared/ba0ae834220c702ae7aefbdbfde66c85 |
| [2026-01-30 09:57:40,207] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:285] [PID:301] EOS: 151645 / <|im_end|> |
| [2026-01-30 09:57:40,207] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:286] [PID:301] BOS: 151645 / <|im_end|> |
| [2026-01-30 09:57:40,207] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:287] [PID:301] PAD: 151643 / <|endoftext|> |
| [2026-01-30 09:57:40,207] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:288] [PID:301] UNK: None / None |
|
Saving the dataset (0/3 shards): 0%| | 0/1000 [00:00<?, ? examples/s]
Saving the dataset (0/3 shards): 33%|ββββ | 334/1000 [00:00<00:00, 2874.63 examples/s]
Saving the dataset (1/3 shards): 33%|ββββ | 334/1000 [00:00<00:00, 2874.63 examples/s]
Saving the dataset (2/3 shards): 67%|βββββββ | 667/1000 [00:00<00:00, 2874.63 examples/s]
Saving the dataset (3/3 shards): 100%|ββββββββββ| 1000/1000 [00:00<00:00, 2874.63 examples/s]
Saving the dataset (3/3 shards): 100%|ββββββββββ| 1000/1000 [00:00<00:00, 4982.82 examples/s] |
| [2026-01-30 09:57:41,815] [DEBUG] [axolotl.train.setup_model_and_tokenizer:70] [PID:301] loading tokenizer... /cache/models/Qwen--Qwen2.5-3B-Instruct |
| [2026-01-30 09:57:42,451] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:285] [PID:301] EOS: 151645 / <|im_end|> |
| [2026-01-30 09:57:42,452] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:286] [PID:301] BOS: 151645 / <|im_end|> |
| [2026-01-30 09:57:42,452] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:287] [PID:301] PAD: 151643 / <|endoftext|> |
| [2026-01-30 09:57:42,452] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:288] [PID:301] UNK: None / None |
| [2026-01-30 09:57:42,452] [DEBUG] [axolotl.train.setup_model_and_tokenizer:82] [PID:301] Loading model |
| [2026-01-30 09:57:42,461] [DEBUG] [axolotl.monkeypatch.transformers.trainer_loss_calc.patch_evaluation_loop:87] [PID:301] Patched Trainer.evaluation_loop with nanmean loss calculation |
| [2026-01-30 09:57:42,463] [DEBUG] [axolotl.monkeypatch.transformers.trainer_loss_calc.patch_maybe_log_save_evaluate:138] [PID:301] Patched Trainer._maybe_log_save_evaluate with nanmean loss calculation |
|
Loading weights: 0%| | 0/434 [00:00<?, ?it/s]
Loading weights: 0%| | 1/434 [00:00<00:00, 6574.14it/s, Materializing param=model.embed_tokens.weight]
Loading weights: 0%| | 1/434 [00:00<00:00, 4084.04it/s, Materializing param=model.embed_tokens.weight]
Loading weights: 0%| | 2/434 [00:00<00:12, 35.85it/s, Materializing param=model.layers.0.input_layernorm.weight]
Loading weights: 0%| | 2/434 [00:00<00:12, 35.74it/s, Materializing param=model.layers.0.input_layernorm.weight]
Loading weights: 1%| | 3/434 [00:00<00:08, 53.43it/s, Materializing param=model.layers.0.mlp.down_proj.weight]
Loading weights: 1%| | 3/434 [00:00<00:08, 53.37it/s, Materializing param=model.layers.0.mlp.down_proj.weight]
Loading weights: 1%| | 4/434 [00:00<00:06, 63.18it/s, Materializing param=model.layers.0.mlp.gate_proj.weight]
Loading weights: 1%| | 4/434 [00:00<00:06, 63.11it/s, Materializing param=model.layers.0.mlp.gate_proj.weight]
Loading weights: 1%| | 5/434 [00:00<00:06, 70.12it/s, Materializing param=model.layers.0.mlp.up_proj.weight]
Loading weights: 1%| | 5/434 [00:00<00:06, 70.07it/s, Materializing param=model.layers.0.mlp.up_proj.weight]
Loading weights: 1%|β | 6/434 [00:00<00:05, 78.73it/s, Materializing param=model.layers.0.post_attention_layernorm.weight]
Loading weights: 1%|β | 6/434 [00:00<00:05, 78.65it/s, Materializing param=model.layers.0.post_attention_layernorm.weight]
Loading weights: 2%|β | 7/434 [00:00<00:04, 91.62it/s, Materializing param=model.layers.0.self_attn.k_proj.bias]
Loading weights: 2%|β | 7/434 [00:00<00:04, 91.56it/s, Materializing param=model.layers.0.self_attn.k_proj.bias]
Loading weights: 2%|β | 8/434 [00:00<00:04, 104.43it/s, Materializing param=model.layers.0.self_attn.k_proj.weight]
Loading weights: 2%|β | 8/434 [00:00<00:04, 104.34it/s, Materializing param=model.layers.0.self_attn.k_proj.weight]
Loading weights: 2%|β | 9/434 [00:00<00:03, 117.26it/s, Materializing param=model.layers.0.self_attn.o_proj.weight]
Loading weights: 2%|β | 9/434 [00:00<00:03, 117.19it/s, Materializing param=model.layers.0.self_attn.o_proj.weight]
Loading weights: 2%|β | 10/434 [00:00<00:03, 128.69it/s, Materializing param=model.layers.0.self_attn.q_proj.bias]
Loading weights: 2%|β | 10/434 [00:00<00:03, 128.59it/s, Materializing param=model.layers.0.self_attn.q_proj.bias]
Loading weights: 3%|β | 11/434 [00:00<00:02, 141.31it/s, Materializing param=model.layers.0.self_attn.q_proj.weight]
Loading weights: 3%|β | 11/434 [00:00<00:02, 141.20it/s, Materializing param=model.layers.0.self_attn.q_proj.weight]
Loading weights: 3%|β | 12/434 [00:00<00:02, 151.43it/s, Materializing param=model.layers.0.self_attn.v_proj.bias]
Loading weights: 3%|β | 12/434 [00:00<00:02, 151.27it/s, Materializing param=model.layers.0.self_attn.v_proj.bias]
Loading weights: 3%|β | 13/434 [00:00<00:02, 161.98it/s, Materializing param=model.layers.0.self_attn.v_proj.weight]
Loading weights: 3%|β | 13/434 [00:00<00:02, 161.82it/s, Materializing param=model.layers.0.self_attn.v_proj.weight]
Loading weights: 3%|β | 14/434 [00:00<00:02, 174.08it/s, Materializing param=model.layers.1.input_layernorm.weight]
Loading weights: 3%|β | 14/434 [00:00<00:02, 173.99it/s, Materializing param=model.layers.1.input_layernorm.weight]
Loading weights: 3%|β | 15/434 [00:00<00:02, 186.25it/s, Materializing param=model.layers.1.mlp.down_proj.weight]
Loading weights: 3%|β | 15/434 [00:00<00:02, 186.16it/s, Materializing param=model.layers.1.mlp.down_proj.weight]
Loading weights: 4%|β | 16/434 [00:00<00:02, 182.99it/s, Materializing param=model.layers.1.mlp.gate_proj.weight]
Loading weights: 4%|β | 16/434 [00:00<00:02, 182.79it/s, Materializing param=model.layers.1.mlp.gate_proj.weight]
Loading weights: 4%|β | 17/434 [00:00<00:02, 164.86it/s, Materializing param=model.layers.1.mlp.gate_proj.weight]
Loading weights: 4%|β | 17/434 [00:00<00:02, 164.86it/s, Materializing param=model.layers.1.mlp.up_proj.weight]
Loading weights: 4%|β | 17/434 [00:00<00:02, 164.86it/s, Materializing param=model.layers.1.mlp.up_proj.weight]
Loading weights: 4%|β | 18/434 [00:00<00:02, 164.86it/s, Materializing param=model.layers.1.post_attention_layernorm.weight]
Loading weights: 4%|β | 18/434 [00:00<00:02, 164.86it/s, Materializing param=model.layers.1.post_attention_layernorm.weight]
Loading weights: 4%|β | 19/434 [00:00<00:02, 164.86it/s, Materializing param=model.layers.1.self_attn.k_proj.bias]
Loading weights: 4%|β | 19/434 [00:00<00:02, 164.86it/s, Materializing param=model.layers.1.self_attn.k_proj.bias]
Loading weights: 5%|β | 20/434 [00:00<00:02, 164.86it/s, Materializing param=model.layers.1.self_attn.k_proj.weight]
Loading weights: 5%|β | 20/434 [00:00<00:02, 164.86it/s, Materializing param=model.layers.1.self_attn.k_proj.weight]
Loading weights: 5%|β | 21/434 [00:00<00:02, 164.86it/s, Materializing param=model.layers.1.self_attn.o_proj.weight]
Loading weights: 5%|β | 21/434 [00:00<00:02, 164.86it/s, Materializing param=model.layers.1.self_attn.o_proj.weight]
Loading weights: 5%|β | 22/434 [00:00<00:02, 164.86it/s, Materializing param=model.layers.1.self_attn.q_proj.bias]
Loading weights: 5%|β | 22/434 [00:00<00:02, 164.86it/s, Materializing param=model.layers.1.self_attn.q_proj.bias]
Loading weights: 5%|β | 23/434 [00:00<00:02, 164.86it/s, Materializing param=model.layers.1.self_attn.q_proj.weight]
Loading weights: 5%|β | 23/434 [00:00<00:02, 164.86it/s, Materializing param=model.layers.1.self_attn.q_proj.weight]
Loading weights: 6%|β | 24/434 [00:00<00:02, 164.86it/s, Materializing param=model.layers.1.self_attn.v_proj.bias]
Loading weights: 6%|β | 24/434 [00:00<00:02, 164.86it/s, Materializing param=model.layers.1.self_attn.v_proj.bias]
Loading weights: 6%|β | 25/434 [00:00<00:02, 164.86it/s, Materializing param=model.layers.1.self_attn.v_proj.weight]
Loading weights: 6%|β | 25/434 [00:00<00:02, 164.86it/s, Materializing param=model.layers.1.self_attn.v_proj.weight]
Loading weights: 6%|β | 26/434 [00:00<00:02, 164.86it/s, Materializing param=model.layers.2.input_layernorm.weight]
Loading weights: 6%|β | 26/434 [00:00<00:02, 164.86it/s, Materializing param=model.layers.2.input_layernorm.weight]
Loading weights: 6%|β | 27/434 [00:00<00:02, 164.86it/s, Materializing param=model.layers.2.mlp.down_proj.weight]
Loading weights: 6%|β | 27/434 [00:00<00:02, 164.86it/s, Materializing param=model.layers.2.mlp.down_proj.weight]
Loading weights: 6%|β | 28/434 [00:00<00:02, 164.86it/s, Materializing param=model.layers.2.mlp.gate_proj.weight]
Loading weights: 6%|β | 28/434 [00:00<00:02, 164.86it/s, Materializing param=model.layers.2.mlp.gate_proj.weight]
Loading weights: 7%|β | 29/434 [00:00<00:02, 164.86it/s, Materializing param=model.layers.2.mlp.up_proj.weight]
Loading weights: 7%|β | 29/434 [00:00<00:02, 164.86it/s, Materializing param=model.layers.2.mlp.up_proj.weight]
Loading weights: 7%|β | 30/434 [00:00<00:02, 164.86it/s, Materializing param=model.layers.2.post_attention_layernorm.weight]
Loading weights: 7%|β | 30/434 [00:00<00:02, 164.86it/s, Materializing param=model.layers.2.post_attention_layernorm.weight]
Loading weights: 7%|β | 31/434 [00:00<00:02, 164.86it/s, Materializing param=model.layers.2.self_attn.k_proj.bias]
Loading weights: 7%|β | 31/434 [00:00<00:02, 164.86it/s, Materializing param=model.layers.2.self_attn.k_proj.bias]
Loading weights: 7%|β | 32/434 [00:00<00:02, 164.86it/s, Materializing param=model.layers.2.self_attn.k_proj.weight]
Loading weights: 7%|β | 32/434 [00:00<00:02, 164.86it/s, Materializing param=model.layers.2.self_attn.k_proj.weight]
Loading weights: 8%|β | 33/434 [00:00<00:02, 164.86it/s, Materializing param=model.layers.2.self_attn.o_proj.weight]
Loading weights: 8%|β | 33/434 [00:00<00:02, 164.86it/s, Materializing param=model.layers.2.self_attn.o_proj.weight]
Loading weights: 8%|β | 34/434 [00:00<00:02, 164.86it/s, Materializing param=model.layers.2.self_attn.q_proj.bias]
Loading weights: 8%|β | 34/434 [00:00<00:02, 164.86it/s, Materializing param=model.layers.2.self_attn.q_proj.bias]
Loading weights: 8%|β | 35/434 [00:00<00:02, 164.86it/s, Materializing param=model.layers.2.self_attn.q_proj.weight]
Loading weights: 8%|β | 35/434 [00:00<00:02, 164.86it/s, Materializing param=model.layers.2.self_attn.q_proj.weight]
Loading weights: 8%|β | 36/434 [00:00<00:02, 164.86it/s, Materializing param=model.layers.2.self_attn.v_proj.bias]
Loading weights: 8%|β | 36/434 [00:00<00:02, 164.86it/s, Materializing param=model.layers.2.self_attn.v_proj.bias]
Loading weights: 9%|β | 37/434 [00:00<00:02, 164.86it/s, Materializing param=model.layers.2.self_attn.v_proj.weight]
Loading weights: 9%|β | 37/434 [00:00<00:02, 164.86it/s, Materializing param=model.layers.2.self_attn.v_proj.weight]
Loading weights: 9%|β | 38/434 [00:00<00:02, 164.86it/s, Materializing param=model.layers.3.input_layernorm.weight]
Loading weights: 9%|β | 38/434 [00:00<00:02, 164.86it/s, Materializing param=model.layers.3.input_layernorm.weight]
Loading weights: 9%|β | 39/434 [00:00<00:02, 164.86it/s, Materializing param=model.layers.3.mlp.down_proj.weight]
Loading weights: 9%|β | 39/434 [00:00<00:02, 164.86it/s, Materializing param=model.layers.3.mlp.down_proj.weight]
Loading weights: 9%|β | 40/434 [00:00<00:02, 164.86it/s, Materializing param=model.layers.3.mlp.gate_proj.weight]
Loading weights: 9%|β | 40/434 [00:00<00:02, 164.86it/s, Materializing param=model.layers.3.mlp.gate_proj.weight]
Loading weights: 9%|β | 41/434 [00:00<00:02, 164.86it/s, Materializing param=model.layers.3.mlp.up_proj.weight]
Loading weights: 9%|β | 41/434 [00:00<00:02, 164.86it/s, Materializing param=model.layers.3.mlp.up_proj.weight]
Loading weights: 10%|β | 42/434 [00:00<00:02, 164.86it/s, Materializing param=model.layers.3.post_attention_layernorm.weight]
Loading weights: 10%|β | 42/434 [00:00<00:02, 164.86it/s, Materializing param=model.layers.3.post_attention_layernorm.weight]
Loading weights: 10%|β | 43/434 [00:00<00:02, 164.86it/s, Materializing param=model.layers.3.self_attn.k_proj.bias]
Loading weights: 10%|β | 43/434 [00:00<00:02, 164.86it/s, Materializing param=model.layers.3.self_attn.k_proj.bias]
Loading weights: 10%|β | 44/434 [00:00<00:02, 164.86it/s, Materializing param=model.layers.3.self_attn.k_proj.weight]
Loading weights: 10%|β | 44/434 [00:00<00:02, 164.86it/s, Materializing param=model.layers.3.self_attn.k_proj.weight]
Loading weights: 10%|β | 45/434 [00:00<00:02, 164.86it/s, Materializing param=model.layers.3.self_attn.o_proj.weight]
Loading weights: 10%|β | 45/434 [00:00<00:02, 164.86it/s, Materializing param=model.layers.3.self_attn.o_proj.weight]
Loading weights: 11%|β | 46/434 [00:00<00:02, 164.86it/s, Materializing param=model.layers.3.self_attn.q_proj.bias]
Loading weights: 11%|β | 46/434 [00:00<00:02, 164.86it/s, Materializing param=model.layers.3.self_attn.q_proj.bias]
Loading weights: 11%|β | 47/434 [00:00<00:02, 164.86it/s, Materializing param=model.layers.3.self_attn.q_proj.weight]
Loading weights: 11%|β | 47/434 [00:00<00:02, 164.86it/s, Materializing param=model.layers.3.self_attn.q_proj.weight]
Loading weights: 11%|β | 48/434 [00:00<00:01, 248.98it/s, Materializing param=model.layers.3.self_attn.q_proj.weight]
Loading weights: 11%|β | 48/434 [00:00<00:01, 248.98it/s, Materializing param=model.layers.3.self_attn.v_proj.bias]
Loading weights: 11%|β | 48/434 [00:00<00:01, 248.98it/s, Materializing param=model.layers.3.self_attn.v_proj.bias]
Loading weights: 11%|ββ | 49/434 [00:00<00:01, 248.98it/s, Materializing param=model.layers.3.self_attn.v_proj.weight]
Loading weights: 11%|ββ | 49/434 [00:00<00:01, 248.98it/s, Materializing param=model.layers.3.self_attn.v_proj.weight]
Loading weights: 12%|ββ | 50/434 [00:00<00:01, 248.98it/s, Materializing param=model.layers.4.input_layernorm.weight]
Loading weights: 12%|ββ | 50/434 [00:00<00:01, 248.98it/s, Materializing param=model.layers.4.input_layernorm.weight]
Loading weights: 12%|ββ | 51/434 [00:00<00:01, 248.98it/s, Materializing param=model.layers.4.mlp.down_proj.weight]
Loading weights: 12%|ββ | 51/434 [00:00<00:01, 248.98it/s, Materializing param=model.layers.4.mlp.down_proj.weight]
Loading weights: 12%|ββ | 52/434 [00:00<00:01, 248.98it/s, Materializing param=model.layers.4.mlp.gate_proj.weight]
Loading weights: 12%|ββ | 52/434 [00:00<00:01, 248.98it/s, Materializing param=model.layers.4.mlp.gate_proj.weight]
Loading weights: 12%|ββ | 53/434 [00:00<00:01, 248.98it/s, Materializing param=model.layers.4.mlp.up_proj.weight]
Loading weights: 12%|ββ | 53/434 [00:00<00:01, 248.98it/s, Materializing param=model.layers.4.mlp.up_proj.weight]
Loading weights: 12%|ββ | 54/434 [00:00<00:01, 248.98it/s, Materializing param=model.layers.4.post_attention_layernorm.weight]
Loading weights: 12%|ββ | 54/434 [00:00<00:01, 248.98it/s, Materializing param=model.layers.4.post_attention_layernorm.weight]
Loading weights: 13%|ββ | 55/434 [00:00<00:01, 248.98it/s, Materializing param=model.layers.4.self_attn.k_proj.bias]
Loading weights: 13%|ββ | 55/434 [00:00<00:01, 248.98it/s, Materializing param=model.layers.4.self_attn.k_proj.bias]
Loading weights: 13%|ββ | 56/434 [00:00<00:01, 248.98it/s, Materializing param=model.layers.4.self_attn.k_proj.weight]
Loading weights: 13%|ββ | 56/434 [00:00<00:01, 248.98it/s, Materializing param=model.layers.4.self_attn.k_proj.weight]
Loading weights: 13%|ββ | 57/434 [00:00<00:01, 248.98it/s, Materializing param=model.layers.4.self_attn.o_proj.weight]
Loading weights: 13%|ββ | 57/434 [00:00<00:01, 248.98it/s, Materializing param=model.layers.4.self_attn.o_proj.weight]
Loading weights: 13%|ββ | 58/434 [00:00<00:01, 248.98it/s, Materializing param=model.layers.4.self_attn.q_proj.bias]
Loading weights: 13%|ββ | 58/434 [00:00<00:01, 248.98it/s, Materializing param=model.layers.4.self_attn.q_proj.bias]
Loading weights: 14%|ββ | 59/434 [00:00<00:01, 248.98it/s, Materializing param=model.layers.4.self_attn.q_proj.weight]
Loading weights: 14%|ββ | 59/434 [00:00<00:01, 248.98it/s, Materializing param=model.layers.4.self_attn.q_proj.weight]
Loading weights: 14%|ββ | 60/434 [00:00<00:01, 248.98it/s, Materializing param=model.layers.4.self_attn.v_proj.bias]
Loading weights: 14%|ββ | 60/434 [00:00<00:01, 248.98it/s, Materializing param=model.layers.4.self_attn.v_proj.bias]
Loading weights: 14%|ββ | 61/434 [00:00<00:01, 248.98it/s, Materializing param=model.layers.4.self_attn.v_proj.weight]
Loading weights: 14%|ββ | 61/434 [00:00<00:01, 248.98it/s, Materializing param=model.layers.4.self_attn.v_proj.weight]
Loading weights: 14%|ββ | 62/434 [00:00<00:01, 248.98it/s, Materializing param=model.layers.5.input_layernorm.weight]
Loading weights: 14%|ββ | 62/434 [00:00<00:01, 248.98it/s, Materializing param=model.layers.5.input_layernorm.weight]
Loading weights: 15%|ββ | 63/434 [00:00<00:01, 248.98it/s, Materializing param=model.layers.5.mlp.down_proj.weight]
Loading weights: 15%|ββ | 63/434 [00:00<00:01, 248.98it/s, Materializing param=model.layers.5.mlp.down_proj.weight]
Loading weights: 15%|ββ | 64/434 [00:00<00:01, 248.98it/s, Materializing param=model.layers.5.mlp.gate_proj.weight]
Loading weights: 15%|ββ | 64/434 [00:00<00:01, 248.98it/s, Materializing param=model.layers.5.mlp.gate_proj.weight]
Loading weights: 15%|ββ | 65/434 [00:00<00:01, 248.98it/s, Materializing param=model.layers.5.mlp.up_proj.weight]
Loading weights: 15%|ββ | 65/434 [00:00<00:01, 248.98it/s, Materializing param=model.layers.5.mlp.up_proj.weight]
Loading weights: 15%|ββ | 66/434 [00:00<00:01, 248.98it/s, Materializing param=model.layers.5.post_attention_layernorm.weight]
Loading weights: 15%|ββ | 66/434 [00:00<00:01, 248.98it/s, Materializing param=model.layers.5.post_attention_layernorm.weight]
Loading weights: 15%|ββ | 67/434 [00:00<00:01, 248.98it/s, Materializing param=model.layers.5.self_attn.k_proj.bias]
Loading weights: 15%|ββ | 67/434 [00:00<00:01, 248.98it/s, Materializing param=model.layers.5.self_attn.k_proj.bias]
Loading weights: 16%|ββ | 68/434 [00:00<00:01, 248.98it/s, Materializing param=model.layers.5.self_attn.k_proj.weight]
Loading weights: 16%|ββ | 68/434 [00:00<00:01, 248.98it/s, Materializing param=model.layers.5.self_attn.k_proj.weight]
Loading weights: 16%|ββ | 69/434 [00:00<00:01, 248.98it/s, Materializing param=model.layers.5.self_attn.o_proj.weight]
Loading weights: 16%|ββ | 69/434 [00:00<00:01, 248.98it/s, Materializing param=model.layers.5.self_attn.o_proj.weight] |