[2025-10-12 02:39:16,867] [DEBUG] [axolotl.utils.config.log_gpu_memory_usage:127] [PID:1367687] baseline 0.000GB () [2025-10-12 02:39:16,867] [INFO] [axolotl.cli.config.load_cfg:248] [PID:1367687] config: { "activation_offloading": false, "auto_resume_from_checkpoints": true, "axolotl_config_path": "train_350m_multitask.yaml", "base_model": "/home/ubuntu/axolotl/out-350m-audio-pt", "base_model_config": "/home/ubuntu/axolotl/out-350m-audio-pt", "batch_size": 256, "bf16": true, "bfloat16": true, "capabilities": { "bf16": true, "compute_capability": "sm_80", "fp8": false, "n_gpu": 8, "n_node": 1 }, "context_parallel_size": 1, "cut_cross_entropy": true, "dataloader_num_workers": 8, "dataloader_pin_memory": true, "dataloader_prefetch_factor": 256, "dataset_prepared_path": "/home/ubuntu/axolotl/preprocessed-data-350m-multitask-ft", "dataset_processes": 240, "datasets": [ { "ds_type": "json", "message_property_mappings": { "content": "content", "role": "role" }, "path": "/home/ubuntu/axolotl/hackathon-train_data-s2s-jaen.jsonl", "trust_remote_code": false }, { "ds_type": "json", "message_property_mappings": { "content": "content", "role": "role" }, "path": "/home/ubuntu/axolotl/hackathon-train_data-s2s-enja.jsonl", "trust_remote_code": false }, { "ds_type": "json", "message_property_mappings": { "content": "content", "role": "role" }, "path": "/home/ubuntu/axolotl/hackathon-train_data-asr-ja.jsonl", "trust_remote_code": false }, { "ds_type": "json", "message_property_mappings": { "content": "content", "role": "role" }, "path": "/home/ubuntu/axolotl/hackathon-train_data-tts-ja.jsonl", "trust_remote_code": false }, { "ds_type": "json", "message_property_mappings": { "content": "content", "role": "role" }, "path": "/home/ubuntu/axolotl/hackathon-train_data-asr-en.jsonl", "trust_remote_code": false }, { "ds_type": "json", "message_property_mappings": { "content": "content", "role": "role" }, "path": "/home/ubuntu/axolotl/hackathon-train_data-tts-en.jsonl", "trust_remote_code": false } ], "ddp": true, "device": "cuda:0", "device_map": { "": 0 }, "dion_rank_fraction": 1.0, "dion_rank_multiple_of": 1, "env_capabilities": { "torch_version": "2.8.0" }, "eval_batch_size": 8, "eval_causal_lm_metrics": [ "sacrebleu", "comet", "ter", "chrf" ], "eval_max_new_tokens": 128, "eval_sample_packing": false, "eval_steps": 100, "eval_strategy": "steps", "eval_table_size": 0, "experimental_skip_move_to_device": true, "flash_attention": true, "fp16": false, "gradient_accumulation_steps": 1, "gradient_checkpointing": false, "group_by_length": false, "include_tkps": true, "is_falcon_derived_model": false, "is_llama_derived_model": false, "is_mistral_derived_model": false, "learning_rate": 5e-05, "lisa_layers_attribute": "model.layers", "load_best_model_at_end": false, "load_in_4bit": false, "load_in_8bit": false, "local_rank": 0, "logging_steps": 1, "loraplus_lr_embedding": 1e-06, "lr_scheduler": "cosine", "max_grad_norm": 1.0, "mean_resizing_embeddings": false, "micro_batch_size": 32, "model_config_type": "lfm2", "num_epochs": 3.0, "optimizer": "adamw_torch_fused", "output_dir": "/home/ubuntu/axolotl/out-350m-multitask-ft", "pad_to_sequence_len": true, "plugins": [ "axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin" ], "pretrain_multipack_attn": true, "profiler_steps_start": 0, "qlora_sharded_model_loading": false, "ray_num_workers": 1, "remove_unused_columns": false, "resources_per_worker": { "GPU": 1 }, "sample_packing": true, "sample_packing_bin_size": 200, "sample_packing_group_size": 100000, "save_only_model": false, "save_safetensors": true, "save_steps": 100, "save_strategy": "steps", "sequence_len": 4096, "shuffle_before_merging_datasets": false, "shuffle_merged_datasets": true, "skip_prepare_dataset": false, "streaming_multipack_buffer_size": 10000, "strict": false, "tensor_parallel_size": 1, "tf32": true, "tiled_mlp_use_original_mlp": true, "tokenizer_config": "/home/ubuntu/axolotl/out-350m-audio-pt", "tokenizer_save_jinja_files": true, "tokenizer_type": "AutoTokenizer", "torch_dtype": "torch.bfloat16", "train_on_inputs": false, "trl": { "log_completions": false, "mask_truncated_completions": false, "ref_model_mixup_alpha": 0.9, "ref_model_sync_steps": 64, "scale_rewards": true, "sync_ref_model": false, "use_vllm": false, "vllm_server_host": "0.0.0.0", "vllm_server_port": 8000 }, "type_of_model": "AutoModelForCausalLM", "use_ray": false, "use_wandb": true, "val_set_size": 0.01, "vllm": { "device": "auto", "dtype": "auto", "gpu_memory_utilization": 0.9, "host": "0.0.0.0", "port": 8000 }, "wandb_entity": "aratako-lm", "wandb_name": "350m-multitask-ft-run1", "wandb_project": "liquidai-hackathon", "warmup_ratio": 0.1, "weight_decay": 0.01, "world_size": 8 } [2025-10-12 02:40:01,705] [INFO] [axolotl.utils.data.sft._load_raw_datasets:320] [PID:1367689] Loading raw datasets... Loading dataset shards: 0%| | 0/18 [00:00 [2025-10-12 02:40:10,483] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:279] [PID:1367687] BOS: 1 / <|startoftext|> [2025-10-12 02:40:10,484] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:280] [PID:1367687] PAD: 0 / <|pad|> [2025-10-12 02:40:10,484] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:281] [PID:1367687] UNK: None / None Dropping Long Sequences (>4096) (num_proc=240): 0%| | 0/1185642 [00:004096) (num_proc=240): 0%| | 1000/1185642 [00:06<2:13:13, 148.20 examples/s] Dropping Long Sequences (>4096) (num_proc=240): 2%|▋ | 20000/1185642 [00:06<04:45, 4078.92 examples/s] Dropping Long Sequences (>4096) (num_proc=240): 3%|█▎ | 37000/1185642 [00:06<02:09, 8838.29 examples/s] Dropping Long Sequences (>4096) (num_proc=240): 5%|█▉ | 55000/1185642 [00:07<01:13, 15368.21 examples/s] Dropping Long Sequences (>4096) (num_proc=240): 6%|██▋ | 73000/1185642 [00:07<00:46, 23967.79 examples/s] Dropping Long Sequences (>4096) (num_proc=240): 8%|███▏ | 89000/1185642 [00:07<00:33, 33183.52 examples/s] Dropping Long Sequences (>4096) (num_proc=240): 9%|███▊ | 106000/1185642 [00:07<00:23, 45484.98 examples/s] Dropping Long Sequences (>4096) (num_proc=240): 11%|████▍ | 127000/1185642 [00:07<00:16, 64139.83 examples/s] Dropping Long Sequences (>4096) (num_proc=240): 13%|█████▋ | 159000/1185642 [00:07<00:10, 99657.38 examples/s] Dropping Long Sequences (>4096) (num_proc=240): 16%|██████▍ | 187000/1185642 [00:07<00:07, 129571.10 examples/s] Dropping Long Sequences (>4096) (num_proc=240): 19%|███████▌ | 220000/1185642 [00:07<00:05, 167897.73 examples/s] Dropping Long Sequences (>4096) (num_proc=240): 21%|████████▋ | 251000/1185642 [00:07<00:04, 197134.90 examples/s] Dropping Long Sequences (>4096) (num_proc=240): 24%|█████████▋ | 279000/1185642 [00:08<00:04, 211483.70 examples/s] Dropping Long Sequences (>4096) (num_proc=240): 26%|██████████▋ | 309000/1185642 [00:08<00:03, 232237.16 examples/s] Dropping Long Sequences (>4096) (num_proc=240): 29%|███████████▊ | 343000/1185642 [00:08<00:03, 260015.55 examples/s] Dropping Long Sequences (>4096) (num_proc=240): 32%|█████████████ | 377000/1185642 [00:08<00:02, 278359.02 examples/s] Dropping Long Sequences (>4096) (num_proc=240): 34%|██████████████ | 408000/1185642 [00:08<00:02, 277664.56 examples/s] Dropping Long Sequences (>4096) (num_proc=240): 37%|███████████████▏ | 438000/1185642 [00:08<00:02, 268511.49 examples/s] Dropping Long Sequences (>4096) (num_proc=240): 39%|████████████████▏ | 467000/1185642 [00:08<00:02, 254779.79 examples/s] Dropping Long Sequences (>4096) (num_proc=240): 42%|█████████████████ | 494000/1185642 [00:08<00:03, 224759.76 examples/s] Dropping Long Sequences (>4096) (num_proc=240): 44%|█████████████████▉ | 518000/1185642 [00:08<00:03, 208819.91 examples/s] Dropping Long Sequences (>4096) (num_proc=240): 46%|██████████████████▋ | 539941/1185642 [00:09<00:03, 207937.00 examples/s] Dropping Long Sequences (>4096) (num_proc=240): 48%|███████████████████▍ | 563823/1185642 [00:09<00:03, 197604.25 examples/s] Dropping Long Sequences (>4096) (num_proc=240): 50%|████████████████████▌ | 592823/1185642 [00:09<00:02, 219616.15 examples/s] Dropping Long Sequences (>4096) (num_proc=240): 52%|█████████████████████▎ | 615763/1185642 [00:09<00:02, 198801.06 examples/s] Dropping Long Sequences (>4096) (num_proc=240): 54%|██████████████████████ | 636704/1185642 [00:09<00:03, 159172.24 examples/s] Dropping Long Sequences (>4096) (num_proc=240): 56%|██████████████████████▊ | 659350/1185642 [00:09<00:03, 173269.28 examples/s] Dropping Long Sequences (>4096) (num_proc=240): 57%|███████████████████████▍ | 678818/1185642 [00:09<00:02, 170075.01 examples/s] Dropping Long Sequences (>4096) (num_proc=240): 59%|████████████████████████▎ | 703344/1185642 [00:10<00:02, 188366.46 examples/s] Dropping Long Sequences (>4096) (num_proc=240): 61%|█████████████████████████▏ | 726868/1185642 [00:10<00:02, 197958.91 examples/s] Dropping Long Sequences (>4096) (num_proc=240): 63%|█████████████████████████▉ | 748331/1185642 [00:10<00:02, 198028.04 examples/s] Dropping Long Sequences (>4096) (num_proc=240): 66%|███████████████████████████ | 783032/1185642 [00:10<00:01, 236517.53 examples/s] Dropping Long Sequences (>4096) (num_proc=240): 68%|███████████████████████████▉ | 808314/1185642 [00:10<00:01, 235742.91 examples/s] Dropping Long Sequences (>4096) (num_proc=240): 71%|████████████████████████████▉ | 836716/1185642 [00:10<00:01, 248221.17 examples/s] Dropping Long Sequences (>4096) (num_proc=240): 74%|██████████████████████████████▏ | 873177/1185642 [00:10<00:01, 280186.00 examples/s] Dropping Long Sequences (>4096) (num_proc=240): 76%|███████████████████████████████▏ | 903400/1185642 [00:10<00:00, 285986.91 examples/s] Dropping Long Sequences (>4096) (num_proc=240): 79%|████████████████████████████████▍ | 936621/1185642 [00:10<00:00, 298678.02 examples/s] Dropping Long Sequences (>4096) (num_proc=240): 82%|█████████████████████████████████▌ | 971201/1185642 [00:10<00:00, 309856.38 examples/s] Dropping Long Sequences (>4096) (num_proc=240): 85%|█████████████████████████████████▉ | 1004722/1185642 [00:11<00:00, 315287.00 examples/s] Dropping Long Sequences (>4096) (num_proc=240): 87%|██████████████████████████████████▉ | 1036642/1185642 [00:11<00:00, 274993.63 examples/s] Dropping Long Sequences (>4096) (num_proc=240): 90%|███████████████████████████████████▉ | 1065682/1185642 [00:11<00:00, 245335.45 examples/s] Dropping Long Sequences (>4096) (num_proc=240): 92%|████████████████████████████████████▊ | 1091902/1185642 [00:11<00:00, 238996.99 examples/s] Dropping Long Sequences (>4096) (num_proc=240): 94%|█████████████████████████████████████▋ | 1117362/1185642 [00:11<00:00, 236046.67 examples/s] Dropping Long Sequences (>4096) (num_proc=240): 96%|██████████████████████████████████████▌ | 1142282/1185642 [00:11<00:00, 197506.25 examples/s] Dropping Long Sequences (>4096) (num_proc=240): 98%|███████████████████████████████████████▎| 1164022/1185642 [00:11<00:00, 177990.40 examples/s] Dropping Long Sequences (>4096) (num_proc=240): 100%|███████████████████████████████████████▉| 1183762/1185642 [00:12<00:00, 116666.59 examples/s] Dropping Long Sequences (>4096) (num_proc=240): 100%|█████████████████████████████████████████| 1185642/1185642 [00:13<00:00, 89249.57 examples/s] Drop Samples with Zero Trainable Tokens (num_proc=240): 0%| | 0/1185642 [00:00 [2025-10-12 02:43:35,540] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:279] [PID:1367687] BOS: 1 / <|startoftext|> [2025-10-12 02:43:35,540] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:280] [PID:1367687] PAD: 0 / <|pad|> [2025-10-12 02:43:35,540] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:281] [PID:1367687] UNK: None / None [2025-10-12 02:43:35,540] [DEBUG] [axolotl.train.setup_model_and_tokenizer:74] [PID:1367687] Loading model [2025-10-12 02:43:35,564] [DEBUG] [axolotl.monkeypatch.transformers.trainer_loss_calc.patch_evaluation_loop:87] [PID:1367687] Patched Trainer.evaluation_loop with nanmean loss calculation [2025-10-12 02:43:35,566] [DEBUG] [axolotl.monkeypatch.transformers.trainer_loss_calc.patch_maybe_log_save_evaluate:138] [PID:1367687] Patched Trainer._maybe_log_save_evaluate with nanmean loss calculation [2025-10-12 02:43:35,567] [INFO] [axolotl.loaders.patch_manager._apply_multipack_patches:301] [PID:1367687] Applying multipack dataloader patch for sample packing... [2025-10-12 02:43:36,156] [INFO] [axolotl.integrations.cut_cross_entropy.pre_model_load:94] [PID:1367687] Applying Cut Cross Entropy to model type: lfm2 [2025-10-12 02:43:36,508] [INFO] [axolotl.loaders.model._configure_embedding_dtypes:345] [PID:1367687] Converting modules to torch.bfloat16 [2025-10-12 02:43:36,511] [DEBUG] [axolotl.loaders.model.log_gpu_memory_usage:127] [PID:1367687] Memory usage after model load 1.285GB (+1.285GB allocated, +1.303GB reserved) [2025-10-12 02:43:40,065] [INFO] [axolotl.train.save_initial_configs:402] [PID:1367687] Pre-saving tokenizer to /home/ubuntu/axolotl/out-350m-multitask-ft... [2025-10-12 02:43:40,792] [INFO] [axolotl.train.save_initial_configs:407] [PID:1367687] Pre-saving model config to /home/ubuntu/axolotl/out-350m-multitask-ft... [2025-10-12 02:43:40,795] [INFO] [axolotl.train.execute_training:196] [PID:1367687] Starting trainer... [2025-10-12 02:50:32,078] [WARNING] [py.warnings._showwarnmsg:110] [PID:1367693] /home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/distributed/distributed_c10d.py:4807: UserWarning: No device id is provided via `init_process_group` or `barrier `. Using the current device set by the user. warnings.warn( # warn only once [2025-10-12 02:50:40,609] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:1367687] generate_batches time: 2.821959972381592 [2025-10-12 02:50:42,712] [WARNING] [py.warnings._showwarnmsg:110] [PID:1367688] /home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/distributed/distributed_c10d.py:4807: UserWarning: No device id is provided via `init_process_group` or `barrier `. Using the current device set by the user. warnings.warn( # warn only once [2025-10-12 02:50:44,543] [WARNING] [py.warnings._showwarnmsg:110] [PID:1367689] /home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/distributed/distributed_c10d.py:4807: UserWarning: No device id is provided via `init_process_group` or `barrier `. Using the current device set by the user. warnings.warn( # warn only once [2025-10-12 02:50:45,500] [WARNING] [py.warnings._showwarnmsg:110] [PID:1367694] /home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/distributed/distributed_c10d.py:4807: UserWarning: No device id is provided via `init_process_group` or `barrier `. Using the current device set by the user. warnings.warn( # warn only once [2025-10-12 02:50:47,700] [WARNING] [py.warnings._showwarnmsg:110] [PID:1367690] /home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/distributed/distributed_c10d.py:4807: UserWarning: No device id is provided via `init_process_group` or `barrier `. Using the current device set by the user. warnings.warn( # warn only once [2025-10-12 02:50:52,950] [WARNING] [py.warnings._showwarnmsg:110] [PID:1367691] /home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/distributed/distributed_c10d.py:4807: UserWarning: No device id is provided via `init_process_group` or `barrier `. Using the current device set by the user. warnings.warn( # warn only once [2025-10-12 02:50:59,700] [WARNING] [py.warnings._showwarnmsg:110] [PID:1367692] /home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/distributed/distributed_c10d.py:4807: UserWarning: No device id is provided via `init_process_group` or `barrier `. Using the current device set by the user. warnings.warn( # warn only once [2025-10-12 02:50:59,701] [INFO] [axolotl.utils.samplers.multipack.calc_min_len:438] [PID:1367687] gather_len_batches: [7647, 7647, 7647, 7647, 7647, 7647, 7647, 7647] [2025-10-12 02:50:59,703] [WARNING] [py.warnings._showwarnmsg:110] [PID:1367687] /home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/distributed/distributed_c10d.py:4807: UserWarning: No device id is provided via `init_process_group` or `barrier `. Using the current device set by the user. warnings.warn( # warn only once wandb: Currently logged in as: aratako1998 (aratako-lm) to https://api.wandb.ai. Use `wandb login --relogin` to force relogin wandb: ⢿ Waiting for wandb.init()... m wandb: ⣻ Waiting for wandb.init()... m wandb: Tracking run with wandb version 0.22.2 wandb: Run data is saved locally in /home/ubuntu/axolotl/wandb/run-20251012_025059-guktalmo wandb: Run `wandb offline` to turn off syncing. wandb: Syncing run 350m-multitask-ft-run1 wandb: ⭐️ View project at https://wandb.ai/aratako-lm/liquidai-hackathon wandb: 🚀 View run at https://wandb.ai/aratako-lm/liquidai-hackathon/runs/guktalmo wandb: Detected [huggingface_hub.inference] in use. wandb: Use W&B Weave for improved LLM call tracing. Install Weave with `pip install weave` then add `import weave` to the top of your script. wandb: For more information, check out the docs at: https://weave-docs.wandb.ai/ wandb: WARNING Saving files without folders. If you want to preserve subdirectories pass base_path to wandb.save, i.e. wandb.save("/mnt/folder/file.h5", base_path="/mnt") [2025-10-12 02:51:01,203] [INFO] [axolotl.utils.callbacks.on_train_begin:757] [PID:1367687] The Axolotl config has been saved to the WandB run under files. 0%| | 0/2865 [00:00", line 198, in _run_module_as_main [rank1]: File "", line 88, in _run_code [rank1]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/axolotl/cli/train.py", line 121, in [rank1]: fire.Fire(do_cli) [rank1]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/fire/core.py", line 135, in Fire [rank1]: component_trace = _Fire(component, args, parsed_flag_args, context, name) [rank1]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ [rank1]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/fire/core.py", line 468, in _Fire [rank1]: component, remaining_args = _CallAndUpdateTrace( [rank1]: ^^^^^^^^^^^^^^^^^^^^ [rank1]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/fire/core.py", line 684, in _CallAndUpdateTrace [rank1]: component = fn(*varargs, **kwargs) [rank1]: ^^^^^^^^^^^^^^^^^^^^^^ [rank1]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/axolotl/cli/train.py", line 88, in do_cli [rank1]: return do_train(parsed_cfg, parsed_cli_args) [rank1]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ [rank1]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/axolotl/cli/train.py", line 45, in do_train [rank1]: model, tokenizer, trainer = train(cfg=cfg, dataset_meta=dataset_meta) [rank1]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ [rank1]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/axolotl/train.py", line 584, in train [rank1]: execute_training(cfg, trainer, resume_from_checkpoint) [rank1]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/axolotl/train.py", line 197, in execute_training [rank1]: trainer.train(resume_from_checkpoint=resume_from_checkpoint) [rank1]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/transformers/trainer.py", line 2224, in train [rank1]: return inner_training_loop( [rank1]: ^^^^^^^^^^^^^^^^^^^^ [rank1]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/transformers/trainer.py", line 2561, in _inner_training_loop [rank1]: tr_loss_step = self.training_step(model, inputs, num_items_in_batch) [rank1]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ [rank1]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/axolotl/core/trainers/mixins/activation_checkpointing.py", line 46, in training_step [rank1]: return super().training_step(*args, **kwargs) [rank1]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ [rank1]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/transformers/trainer.py", line 3854, in training_step [rank1]: loss = self.compute_loss(model, inputs, num_items_in_batch=num_items_in_batch) [rank1]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ [rank1]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/axolotl/core/trainers/base.py", line 367, in compute_loss [rank1]: return super().compute_loss( [rank1]: ^^^^^^^^^^^^^^^^^^^^^ [rank1]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/transformers/trainer.py", line 3936, in compute_loss [rank1]: outputs = model(**inputs) [rank1]: ^^^^^^^^^^^^^^^ [rank1]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1773, in _wrapped_call_impl [rank1]: return self._call_impl(*args, **kwargs) [rank1]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ [rank1]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1784, in _call_impl [rank1]: return forward_call(*args, **kwargs) [rank1]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ [rank1]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/nn/parallel/distributed.py", line 1648, in forward [rank1]: else self._run_ddp_forward(*inputs, **kwargs) [rank1]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ [rank1]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/nn/parallel/distributed.py", line 1474, in _run_ddp_forward [rank1]: return self.module(*inputs, **kwargs) # type: ignore[index] [rank1]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ [rank1]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1773, in _wrapped_call_impl [rank1]: return self._call_impl(*args, **kwargs) [rank1]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ [rank1]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1784, in _call_impl [rank1]: return forward_call(*args, **kwargs) [rank1]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ [rank1]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/accelerate/utils/operations.py", line 818, in forward [rank1]: return model_forward(*args, **kwargs) [rank1]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ [rank1]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/accelerate/utils/operations.py", line 806, in __call__ [rank1]: return convert_to_fp32(self.model_forward(*args, **kwargs)) [rank1]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ [rank1]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/amp/autocast_mode.py", line 44, in decorate_autocast [rank1]: return func(*args, **kwargs) [rank1]: ^^^^^^^^^^^^^^^^^^^^^ [rank1]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/cut_cross_entropy/transformers/llama.py", line 52, in cce_forward [rank1]: outputs: BaseModelOutputWithPast = self.model( [rank1]: ^^^^^^^^^^^ [rank1]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1773, in _wrapped_call_impl [rank1]: return self._call_impl(*args, **kwargs) [rank1]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ [rank1]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1784, in _call_impl [rank1]: return forward_call(*args, **kwargs) [rank1]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ [rank1]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/transformers/utils/generic.py", line 927, in wrapper [rank1]: outputs = func(self, *args, **kwargs) [rank1]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^ [rank1]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/transformers/models/lfm2/modeling_lfm2.py", line 652, in forward [rank1]: hidden_states = decoder_layer( [rank1]: ^^^^^^^^^^^^^^ [rank1]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/transformers/modeling_layers.py", line 94, in __call__ [rank1]: return super().__call__(*args, **kwargs) [rank1]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ [rank1]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1773, in _wrapped_call_impl [rank1]: return self._call_impl(*args, **kwargs) [rank1]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ [rank1]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1784, in _call_impl [rank1]: return forward_call(*args, **kwargs) [rank1]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ [rank1]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/transformers/utils/deprecation.py", line 172, in wrapped_func [rank1]: return func(*args, **kwargs) [rank1]: ^^^^^^^^^^^^^^^^^^^^^ [rank1]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/transformers/models/lfm2/modeling_lfm2.py", line 556, in forward [rank1]: hidden_states = self.conv( [rank1]: ^^^^^^^^^^ [rank1]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1773, in _wrapped_call_impl [rank1]: return self._call_impl(*args, **kwargs) [rank1]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ [rank1]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1784, in _call_impl [rank1]: return forward_call(*args, **kwargs) [rank1]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ [rank1]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/transformers/utils/deprecation.py", line 172, in wrapped_func [rank1]: return func(*args, **kwargs) [rank1]: ^^^^^^^^^^^^^^^^^^^^^ [rank1]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/transformers/models/lfm2/modeling_lfm2.py", line 517, in forward [rank1]: return self.slow_forward(hidden_states, past_key_values, cache_position, attention_mask) [rank1]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ [rank1]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/transformers/utils/deprecation.py", line 172, in wrapped_func [rank1]: return func(*args, **kwargs) [rank1]: ^^^^^^^^^^^^^^^^^^^^^ [rank1]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/transformers/models/lfm2/modeling_lfm2.py", line 500, in slow_forward [rank1]: conv_out = self.conv(Bx)[..., :seqlen] [rank1]: ^^^^^^^^^^^^^ [rank1]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1773, in _wrapped_call_impl [rank1]: return self._call_impl(*args, **kwargs) [rank1]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ [rank1]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1784, in _call_impl [rank1]: return forward_call(*args, **kwargs) [rank1]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ [rank1]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/nn/modules/conv.py", line 371, in forward [rank1]: return self._conv_forward(input, self.weight, self.bias) [rank1]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ [rank1]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/nn/modules/conv.py", line 366, in _conv_forward [rank1]: return F.conv1d( [rank1]: ^^^^^^^^^ [rank1]: torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 272.00 MiB. GPU 1 has a total capacity of 79.25 GiB of which 160.94 MiB is free. Including non-PyTorch memory, this process has 79.09 GiB memory in use. Of the allocated memory 77.68 GiB is allocated by PyTorch, and 85.86 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) Traceback (most recent call last): File "", line 198, in _run_module_as_main File "", line 88, in _run_code File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/axolotl/cli/train.py", line 121, in fire.Fire(do_cli) File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/fire/core.py", line 135, in Fire component_trace = _Fire(component, args, parsed_flag_args, context, name) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/fire/core.py", line 468, in _Fire component, remaining_args = _CallAndUpdateTrace( ^^^^^^^^^^^^^^^^^^^^ File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/fire/core.py", line 684, in _CallAndUpdateTrace component = fn(*varargs, **kwargs) ^^^^^^^^^^^^^^^^^^^^^^ File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/axolotl/cli/train.py", line 88, in do_cli return do_train(parsed_cfg, parsed_cli_args) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/axolotl/cli/train.py", line 45, in do_train model, tokenizer, trainer = train(cfg=cfg, dataset_meta=dataset_meta) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/axolotl/train.py", line 584, in train execute_training(cfg, trainer, resume_from_checkpoint) File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/axolotl/train.py", line 197, in execute_training trainer.train(resume_from_checkpoint=resume_from_checkpoint) File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/transformers/trainer.py", line 2224, in train return inner_training_loop( ^^^^^^^^^^^^^^^^^^^^ File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/transformers/trainer.py", line 2561, in _inner_training_loop tr_loss_step = self.training_step(model, inputs, num_items_in_batch) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/axolotl/core/trainers/mixins/activation_checkpointing.py", line 46, in training_step return super().training_step(*args, **kwargs) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/transformers/trainer.py", line 3854, in training_step loss = self.compute_loss(model, inputs, num_items_in_batch=num_items_in_batch) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/axolotl/core/trainers/base.py", line 367, in compute_loss return super().compute_loss( ^^^^^^^^^^^^^^^^^^^^^ File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/transformers/trainer.py", line 3936, in compute_loss outputs = model(**inputs) ^^^^^^^^^^^^^^^ File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1773, in _wrapped_call_impl return self._call_impl(*args, **kwargs) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1784, in _call_impl return forward_call(*args, **kwargs) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/nn/parallel/distributed.py", line 1648, in forward else self._run_ddp_forward(*inputs, **kwargs) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/nn/parallel/distributed.py", line 1474, in _run_ddp_forward return self.module(*inputs, **kwargs) # type: ignore[index] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1773, in _wrapped_call_impl return self._call_impl(*args, **kwargs) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1784, in _call_impl return forward_call(*args, **kwargs) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/accelerate/utils/operations.py", line 818, in forward return model_forward(*args, **kwargs) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/accelerate/utils/operations.py", line 806, in __call__ return convert_to_fp32(self.model_forward(*args, **kwargs)) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/amp/autocast_mode.py", line 44, in decorate_autocast return func(*args, **kwargs) ^^^^^^^^^^^^^^^^^^^^^ File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/cut_cross_entropy/transformers/llama.py", line 52, in cce_forward outputs: BaseModelOutputWithPast = self.model( ^^^^^^^^^^^ File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1773, in _wrapped_call_impl return self._call_impl(*args, **kwargs) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1784, in _call_impl return forward_call(*args, **kwargs) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/transformers/utils/generic.py", line 927, in wrapper outputs = func(self, *args, **kwargs) ^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/transformers/models/lfm2/modeling_lfm2.py", line 652, in forward hidden_states = decoder_layer( ^^^^^^^^^^^^^^ File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/transformers/modeling_layers.py", line 94, in __call__ return super().__call__(*args, **kwargs) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1773, in _wrapped_call_impl return self._call_impl(*args, **kwargs) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1784, in _call_impl return forward_call(*args, **kwargs) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/transformers/utils/deprecation.py", line 172, in wrapped_func return func(*args, **kwargs) ^^^^^^^^^^^^^^^^^^^^^ File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/transformers/models/lfm2/modeling_lfm2.py", line 557, in forward hidden_states=self.operator_norm(hidden_states), ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1773, in _wrapped_call_impl return self._call_impl(*args, **kwargs) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1784, in _call_impl return forward_call(*args, **kwargs) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/transformers/models/lfm2/modeling_lfm2.py", line 63, in forward return self.weight * hidden_states.to(input_dtype) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 256.00 MiB. GPU 0 has a total capacity of 79.25 GiB of which 208.94 MiB is free. Including non-PyTorch memory, this process has 79.04 GiB memory in use. Of the allocated memory 76.43 GiB is allocated by PyTorch, and 165.36 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) [rank0]: Traceback (most recent call last): [rank0]: File "", line 198, in _run_module_as_main [rank0]: File "", line 88, in _run_code [rank0]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/axolotl/cli/train.py", line 121, in [rank0]: fire.Fire(do_cli) [rank0]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/fire/core.py", line 135, in Fire [rank0]: component_trace = _Fire(component, args, parsed_flag_args, context, name) [rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ [rank0]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/fire/core.py", line 468, in _Fire [rank0]: component, remaining_args = _CallAndUpdateTrace( [rank0]: ^^^^^^^^^^^^^^^^^^^^ [rank0]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/fire/core.py", line 684, in _CallAndUpdateTrace [rank0]: component = fn(*varargs, **kwargs) [rank0]: ^^^^^^^^^^^^^^^^^^^^^^ [rank0]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/axolotl/cli/train.py", line 88, in do_cli [rank0]: return do_train(parsed_cfg, parsed_cli_args) [rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ [rank0]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/axolotl/cli/train.py", line 45, in do_train [rank0]: model, tokenizer, trainer = train(cfg=cfg, dataset_meta=dataset_meta) [rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ [rank0]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/axolotl/train.py", line 584, in train [rank0]: execute_training(cfg, trainer, resume_from_checkpoint) [rank0]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/axolotl/train.py", line 197, in execute_training [rank0]: trainer.train(resume_from_checkpoint=resume_from_checkpoint) [rank0]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/transformers/trainer.py", line 2224, in train [rank0]: return inner_training_loop( [rank0]: ^^^^^^^^^^^^^^^^^^^^ [rank0]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/transformers/trainer.py", line 2561, in _inner_training_loop [rank0]: tr_loss_step = self.training_step(model, inputs, num_items_in_batch) [rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ [rank0]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/axolotl/core/trainers/mixins/activation_checkpointing.py", line 46, in training_step [rank0]: return super().training_step(*args, **kwargs) [rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ [rank0]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/transformers/trainer.py", line 3854, in training_step [rank0]: loss = self.compute_loss(model, inputs, num_items_in_batch=num_items_in_batch) [rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ [rank0]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/axolotl/core/trainers/base.py", line 367, in compute_loss [rank0]: return super().compute_loss( [rank0]: ^^^^^^^^^^^^^^^^^^^^^ [rank0]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/transformers/trainer.py", line 3936, in compute_loss [rank0]: outputs = model(**inputs) [rank0]: ^^^^^^^^^^^^^^^ [rank0]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1773, in _wrapped_call_impl [rank0]: return self._call_impl(*args, **kwargs) [rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ [rank0]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1784, in _call_impl [rank0]: return forward_call(*args, **kwargs) [rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ [rank0]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/nn/parallel/distributed.py", line 1648, in forward [rank0]: else self._run_ddp_forward(*inputs, **kwargs) [rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ [rank0]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/nn/parallel/distributed.py", line 1474, in _run_ddp_forward [rank0]: return self.module(*inputs, **kwargs) # type: ignore[index] [rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ [rank0]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1773, in _wrapped_call_impl [rank0]: return self._call_impl(*args, **kwargs) [rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ [rank0]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1784, in _call_impl [rank0]: return forward_call(*args, **kwargs) [rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ [rank0]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/accelerate/utils/operations.py", line 818, in forward [rank0]: return model_forward(*args, **kwargs) [rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ [rank0]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/accelerate/utils/operations.py", line 806, in __call__ [rank0]: return convert_to_fp32(self.model_forward(*args, **kwargs)) [rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ [rank0]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/amp/autocast_mode.py", line 44, in decorate_autocast [rank0]: return func(*args, **kwargs) [rank0]: ^^^^^^^^^^^^^^^^^^^^^ [rank0]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/cut_cross_entropy/transformers/llama.py", line 52, in cce_forward [rank0]: outputs: BaseModelOutputWithPast = self.model( [rank0]: ^^^^^^^^^^^ [rank0]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1773, in _wrapped_call_impl [rank0]: return self._call_impl(*args, **kwargs) [rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ [rank0]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1784, in _call_impl [rank0]: return forward_call(*args, **kwargs) [rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ [rank0]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/transformers/utils/generic.py", line 927, in wrapper [rank0]: outputs = func(self, *args, **kwargs) [rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^ [rank0]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/transformers/models/lfm2/modeling_lfm2.py", line 652, in forward [rank0]: hidden_states = decoder_layer( [rank0]: ^^^^^^^^^^^^^^ [rank0]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/transformers/modeling_layers.py", line 94, in __call__ [rank0]: return super().__call__(*args, **kwargs) [rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ [rank0]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1773, in _wrapped_call_impl [rank0]: return self._call_impl(*args, **kwargs) [rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ [rank0]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1784, in _call_impl [rank0]: return forward_call(*args, **kwargs) [rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ [rank0]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/transformers/utils/deprecation.py", line 172, in wrapped_func [rank0]: return func(*args, **kwargs) [rank0]: ^^^^^^^^^^^^^^^^^^^^^ [rank0]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/transformers/models/lfm2/modeling_lfm2.py", line 557, in forward [rank0]: hidden_states=self.operator_norm(hidden_states), [rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ [rank0]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1773, in _wrapped_call_impl [rank0]: return self._call_impl(*args, **kwargs) [rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ [rank0]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1784, in _call_impl [rank0]: return forward_call(*args, **kwargs) [rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ [rank0]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/transformers/models/lfm2/modeling_lfm2.py", line 63, in forward [rank0]: return self.weight * hidden_states.to(input_dtype) [rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ [rank0]: torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 256.00 MiB. GPU 0 has a total capacity of 79.25 GiB of which 208.94 MiB is free. Including non-PyTorch memory, this process has 79.04 GiB memory in use. Of the allocated memory 76.43 GiB is allocated by PyTorch, and 165.36 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) [rank2]: Traceback (most recent call last): [rank2]: File "", line 198, in _run_module_as_main [rank2]: File "", line 88, in _run_code [rank2]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/axolotl/cli/train.py", line 121, in [rank2]: fire.Fire(do_cli) [rank2]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/fire/core.py", line 135, in Fire [rank2]: component_trace = _Fire(component, args, parsed_flag_args, context, name) [rank2]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ [rank2]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/fire/core.py", line 468, in _Fire [rank2]: component, remaining_args = _CallAndUpdateTrace( [rank2]: ^^^^^^^^^^^^^^^^^^^^ [rank2]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/fire/core.py", line 684, in _CallAndUpdateTrace [rank2]: component = fn(*varargs, **kwargs) [rank2]: ^^^^^^^^^^^^^^^^^^^^^^ [rank2]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/axolotl/cli/train.py", line 88, in do_cli [rank2]: return do_train(parsed_cfg, parsed_cli_args) [rank2]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ [rank2]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/axolotl/cli/train.py", line 45, in do_train [rank2]: model, tokenizer, trainer = train(cfg=cfg, dataset_meta=dataset_meta) [rank2]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ [rank2]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/axolotl/train.py", line 584, in train [rank2]: execute_training(cfg, trainer, resume_from_checkpoint) [rank2]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/axolotl/train.py", line 197, in execute_training [rank2]: trainer.train(resume_from_checkpoint=resume_from_checkpoint) [rank2]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/transformers/trainer.py", line 2224, in train [rank2]: return inner_training_loop( [rank2]: ^^^^^^^^^^^^^^^^^^^^ [rank2]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/transformers/trainer.py", line 2561, in _inner_training_loop [rank2]: tr_loss_step = self.training_step(model, inputs, num_items_in_batch) [rank2]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ [rank2]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/axolotl/core/trainers/mixins/activation_checkpointing.py", line 46, in training_step [rank2]: return super().training_step(*args, **kwargs) [rank2]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ [rank2]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/transformers/trainer.py", line 3854, in training_step [rank2]: loss = self.compute_loss(model, inputs, num_items_in_batch=num_items_in_batch) [rank2]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ [rank2]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/axolotl/core/trainers/base.py", line 367, in compute_loss [rank2]: return super().compute_loss( [rank2]: ^^^^^^^^^^^^^^^^^^^^^ [rank2]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/transformers/trainer.py", line 3936, in compute_loss [rank2]: outputs = model(**inputs) [rank2]: ^^^^^^^^^^^^^^^ [rank2]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1773, in _wrapped_call_impl [rank2]: return self._call_impl(*args, **kwargs) [rank2]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ [rank2]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1784, in _call_impl [rank2]: return forward_call(*args, **kwargs) [rank2]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ [rank2]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/nn/parallel/distributed.py", line 1648, in forward [rank2]: else self._run_ddp_forward(*inputs, **kwargs) [rank2]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ [rank2]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/nn/parallel/distributed.py", line 1474, in _run_ddp_forward [rank2]: return self.module(*inputs, **kwargs) # type: ignore[index] [rank2]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ [rank2]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1773, in _wrapped_call_impl [rank2]: return self._call_impl(*args, **kwargs) [rank2]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ [rank2]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1784, in _call_impl [rank2]: return forward_call(*args, **kwargs) [rank2]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ [rank2]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/accelerate/utils/operations.py", line 818, in forward [rank2]: return model_forward(*args, **kwargs) [rank2]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ [rank2]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/accelerate/utils/operations.py", line 806, in __call__ [rank2]: return convert_to_fp32(self.model_forward(*args, **kwargs)) [rank2]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ [rank2]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/amp/autocast_mode.py", line 44, in decorate_autocast [rank2]: return func(*args, **kwargs) [rank2]: ^^^^^^^^^^^^^^^^^^^^^ [rank2]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/cut_cross_entropy/transformers/llama.py", line 52, in cce_forward [rank2]: outputs: BaseModelOutputWithPast = self.model( [rank2]: ^^^^^^^^^^^ [rank2]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1773, in _wrapped_call_impl [rank2]: return self._call_impl(*args, **kwargs) [rank2]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ [rank2]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1784, in _call_impl [rank2]: return forward_call(*args, **kwargs) [rank2]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ [rank2]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/transformers/utils/generic.py", line 927, in wrapper [rank2]: outputs = func(self, *args, **kwargs) [rank2]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^ [rank2]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/transformers/models/lfm2/modeling_lfm2.py", line 652, in forward [rank2]: hidden_states = decoder_layer( [rank2]: ^^^^^^^^^^^^^^ [rank2]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/transformers/modeling_layers.py", line 94, in __call__ [rank2]: return super().__call__(*args, **kwargs) [rank2]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ [rank2]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1773, in _wrapped_call_impl [rank2]: return self._call_impl(*args, **kwargs) [rank2]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ [rank2]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1784, in _call_impl [rank2]: return forward_call(*args, **kwargs) [rank2]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ [rank2]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/transformers/utils/deprecation.py", line 172, in wrapped_func [rank2]: return func(*args, **kwargs) [rank2]: ^^^^^^^^^^^^^^^^^^^^^ [rank2]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/transformers/models/lfm2/modeling_lfm2.py", line 556, in forward [rank2]: hidden_states = self.conv( [rank2]: ^^^^^^^^^^ [rank2]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1773, in _wrapped_call_impl [rank2]: return self._call_impl(*args, **kwargs) [rank2]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ [rank2]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1784, in _call_impl [rank2]: return forward_call(*args, **kwargs) [rank2]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ [rank2]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/transformers/utils/deprecation.py", line 172, in wrapped_func [rank2]: return func(*args, **kwargs) [rank2]: ^^^^^^^^^^^^^^^^^^^^^ [rank2]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/transformers/models/lfm2/modeling_lfm2.py", line 517, in forward [rank2]: return self.slow_forward(hidden_states, past_key_values, cache_position, attention_mask) [rank2]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ [rank2]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/transformers/utils/deprecation.py", line 172, in wrapped_func [rank2]: return func(*args, **kwargs) [rank2]: ^^^^^^^^^^^^^^^^^^^^^ [rank2]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/transformers/models/lfm2/modeling_lfm2.py", line 500, in slow_forward [rank2]: conv_out = self.conv(Bx)[..., :seqlen] [rank2]: ^^^^^^^^^^^^^ [rank2]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1773, in _wrapped_call_impl [rank2]: return self._call_impl(*args, **kwargs) [rank2]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ [rank2]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1784, in _call_impl [rank2]: return forward_call(*args, **kwargs) [rank2]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ [rank2]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/nn/modules/conv.py", line 371, in forward [rank2]: return self._conv_forward(input, self.weight, self.bias) [rank2]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ [rank2]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/nn/modules/conv.py", line 366, in _conv_forward [rank2]: return F.conv1d( [rank2]: ^^^^^^^^^ [rank2]: torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 272.00 MiB. GPU 2 has a total capacity of 79.25 GiB of which 160.94 MiB is free. Including non-PyTorch memory, this process has 79.09 GiB memory in use. Of the allocated memory 77.68 GiB is allocated by PyTorch, and 85.86 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) [rank5]: Traceback (most recent call last): [rank5]: File "", line 198, in _run_module_as_main [rank5]: File "", line 88, in _run_code [rank5]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/axolotl/cli/train.py", line 121, in [rank5]: fire.Fire(do_cli) [rank5]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/fire/core.py", line 135, in Fire [rank5]: component_trace = _Fire(component, args, parsed_flag_args, context, name) [rank5]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ [rank5]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/fire/core.py", line 468, in _Fire [rank5]: component, remaining_args = _CallAndUpdateTrace( [rank5]: ^^^^^^^^^^^^^^^^^^^^ [rank5]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/fire/core.py", line 684, in _CallAndUpdateTrace [rank5]: component = fn(*varargs, **kwargs) [rank5]: ^^^^^^^^^^^^^^^^^^^^^^ [rank5]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/axolotl/cli/train.py", line 88, in do_cli [rank5]: return do_train(parsed_cfg, parsed_cli_args) [rank5]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ [rank5]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/axolotl/cli/train.py", line 45, in do_train [rank5]: model, tokenizer, trainer = train(cfg=cfg, dataset_meta=dataset_meta) [rank5]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ [rank5]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/axolotl/train.py", line 584, in train [rank5]: execute_training(cfg, trainer, resume_from_checkpoint) [rank5]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/axolotl/train.py", line 197, in execute_training [rank5]: trainer.train(resume_from_checkpoint=resume_from_checkpoint) [rank5]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/transformers/trainer.py", line 2224, in train [rank5]: return inner_training_loop( [rank5]: ^^^^^^^^^^^^^^^^^^^^ [rank5]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/transformers/trainer.py", line 2561, in _inner_training_loop [rank5]: tr_loss_step = self.training_step(model, inputs, num_items_in_batch) [rank5]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ [rank5]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/axolotl/core/trainers/mixins/activation_checkpointing.py", line 46, in training_step [rank5]: return super().training_step(*args, **kwargs) [rank5]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ [rank5]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/transformers/trainer.py", line 3854, in training_step [rank5]: loss = self.compute_loss(model, inputs, num_items_in_batch=num_items_in_batch) [rank5]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ [rank5]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/axolotl/core/trainers/base.py", line 367, in compute_loss [rank5]: return super().compute_loss( [rank5]: ^^^^^^^^^^^^^^^^^^^^^ [rank5]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/transformers/trainer.py", line 3936, in compute_loss [rank5]: outputs = model(**inputs) [rank5]: ^^^^^^^^^^^^^^^ [rank5]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1773, in _wrapped_call_impl [rank5]: return self._call_impl(*args, **kwargs) [rank5]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ [rank5]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1784, in _call_impl [rank5]: return forward_call(*args, **kwargs) [rank5]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ [rank5]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/nn/parallel/distributed.py", line 1648, in forward [rank5]: else self._run_ddp_forward(*inputs, **kwargs) [rank5]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ [rank5]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/nn/parallel/distributed.py", line 1474, in _run_ddp_forward [rank5]: return self.module(*inputs, **kwargs) # type: ignore[index] [rank5]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ [rank5]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1773, in _wrapped_call_impl [rank5]: return self._call_impl(*args, **kwargs) [rank5]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ [rank5]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1784, in _call_impl [rank5]: return forward_call(*args, **kwargs) [rank5]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ [rank5]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/accelerate/utils/operations.py", line 818, in forward [rank5]: return model_forward(*args, **kwargs) [rank5]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ [rank5]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/accelerate/utils/operations.py", line 806, in __call__ [rank5]: return convert_to_fp32(self.model_forward(*args, **kwargs)) [rank5]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ [rank5]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/amp/autocast_mode.py", line 44, in decorate_autocast [rank5]: return func(*args, **kwargs) [rank5]: ^^^^^^^^^^^^^^^^^^^^^ [rank5]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/cut_cross_entropy/transformers/llama.py", line 52, in cce_forward [rank5]: outputs: BaseModelOutputWithPast = self.model( [rank5]: ^^^^^^^^^^^ [rank5]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1773, in _wrapped_call_impl [rank5]: return self._call_impl(*args, **kwargs) [rank5]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ [rank5]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1784, in _call_impl [rank5]: return forward_call(*args, **kwargs) [rank5]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ [rank5]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/transformers/utils/generic.py", line 927, in wrapper [rank5]: outputs = func(self, *args, **kwargs) [rank5]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^ [rank5]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/transformers/models/lfm2/modeling_lfm2.py", line 652, in forward [rank5]: hidden_states = decoder_layer( [rank5]: ^^^^^^^^^^^^^^ [rank5]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/transformers/modeling_layers.py", line 94, in __call__ [rank5]: return super().__call__(*args, **kwargs) [rank5]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ [rank5]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1773, in _wrapped_call_impl [rank5]: return self._call_impl(*args, **kwargs) [rank5]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ [rank5]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1784, in _call_impl [rank5]: return forward_call(*args, **kwargs) [rank5]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ [rank5]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/transformers/utils/deprecation.py", line 172, in wrapped_func [rank5]: return func(*args, **kwargs) [rank5]: ^^^^^^^^^^^^^^^^^^^^^ [rank5]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/transformers/models/lfm2/modeling_lfm2.py", line 556, in forward [rank5]: hidden_states = self.conv( [rank5]: ^^^^^^^^^^ [rank5]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1773, in _wrapped_call_impl [rank5]: return self._call_impl(*args, **kwargs) [rank5]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ [rank5]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1784, in _call_impl [rank5]: return forward_call(*args, **kwargs) [rank5]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ [rank5]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/transformers/utils/deprecation.py", line 172, in wrapped_func [rank5]: return func(*args, **kwargs) [rank5]: ^^^^^^^^^^^^^^^^^^^^^ [rank5]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/transformers/models/lfm2/modeling_lfm2.py", line 517, in forward [rank5]: return self.slow_forward(hidden_states, past_key_values, cache_position, attention_mask) [rank5]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ [rank5]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/transformers/utils/deprecation.py", line 172, in wrapped_func [rank5]: return func(*args, **kwargs) [rank5]: ^^^^^^^^^^^^^^^^^^^^^ [rank5]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/transformers/models/lfm2/modeling_lfm2.py", line 500, in slow_forward [rank5]: conv_out = self.conv(Bx)[..., :seqlen] [rank5]: ^^^^^^^^^^^^^ [rank5]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1773, in _wrapped_call_impl [rank5]: return self._call_impl(*args, **kwargs) [rank5]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ [rank5]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1784, in _call_impl [rank5]: return forward_call(*args, **kwargs) [rank5]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ [rank5]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/nn/modules/conv.py", line 371, in forward [rank5]: return self._conv_forward(input, self.weight, self.bias) [rank5]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ [rank5]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/nn/modules/conv.py", line 366, in _conv_forward [rank5]: return F.conv1d( [rank5]: ^^^^^^^^^ [rank5]: torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 272.00 MiB. GPU 5 has a total capacity of 79.25 GiB of which 160.94 MiB is free. Including non-PyTorch memory, this process has 79.09 GiB memory in use. Of the allocated memory 77.68 GiB is allocated by PyTorch, and 85.86 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) [rank4]: Traceback (most recent call last): [rank4]: File "", line 198, in _run_module_as_main [rank4]: File "", line 88, in _run_code [rank4]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/axolotl/cli/train.py", line 121, in [rank4]: fire.Fire(do_cli) [rank4]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/fire/core.py", line 135, in Fire [rank4]: component_trace = _Fire(component, args, parsed_flag_args, context, name) [rank4]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ [rank4]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/fire/core.py", line 468, in _Fire [rank4]: component, remaining_args = _CallAndUpdateTrace( [rank4]: ^^^^^^^^^^^^^^^^^^^^ [rank4]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/fire/core.py", line 684, in _CallAndUpdateTrace [rank4]: component = fn(*varargs, **kwargs) [rank4]: ^^^^^^^^^^^^^^^^^^^^^^ [rank4]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/axolotl/cli/train.py", line 88, in do_cli [rank4]: return do_train(parsed_cfg, parsed_cli_args) [rank4]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ [rank4]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/axolotl/cli/train.py", line 45, in do_train [rank4]: model, tokenizer, trainer = train(cfg=cfg, dataset_meta=dataset_meta) [rank4]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ [rank4]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/axolotl/train.py", line 584, in train [rank4]: execute_training(cfg, trainer, resume_from_checkpoint) [rank4]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/axolotl/train.py", line 197, in execute_training [rank4]: trainer.train(resume_from_checkpoint=resume_from_checkpoint) [rank4]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/transformers/trainer.py", line 2224, in train [rank4]: return inner_training_loop( [rank4]: ^^^^^^^^^^^^^^^^^^^^ [rank4]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/transformers/trainer.py", line 2561, in _inner_training_loop [rank4]: tr_loss_step = self.training_step(model, inputs, num_items_in_batch) [rank4]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ [rank4]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/axolotl/core/trainers/mixins/activation_checkpointing.py", line 46, in training_step [rank4]: return super().training_step(*args, **kwargs) [rank4]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ [rank4]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/transformers/trainer.py", line 3854, in training_step [rank4]: loss = self.compute_loss(model, inputs, num_items_in_batch=num_items_in_batch) [rank4]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ [rank4]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/axolotl/core/trainers/base.py", line 367, in compute_loss [rank4]: return super().compute_loss( [rank4]: ^^^^^^^^^^^^^^^^^^^^^ [rank4]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/transformers/trainer.py", line 3936, in compute_loss [rank4]: outputs = model(**inputs) [rank4]: ^^^^^^^^^^^^^^^ [rank4]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1773, in _wrapped_call_impl [rank4]: return self._call_impl(*args, **kwargs) [rank4]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ [rank4]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1784, in _call_impl [rank4]: return forward_call(*args, **kwargs) [rank4]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ [rank4]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/nn/parallel/distributed.py", line 1648, in forward [rank4]: else self._run_ddp_forward(*inputs, **kwargs) [rank4]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ [rank4]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/nn/parallel/distributed.py", line 1474, in _run_ddp_forward [rank4]: return self.module(*inputs, **kwargs) # type: ignore[index] [rank4]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ [rank4]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1773, in _wrapped_call_impl [rank4]: return self._call_impl(*args, **kwargs) [rank4]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ [rank4]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1784, in _call_impl [rank4]: return forward_call(*args, **kwargs) [rank4]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ [rank4]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/accelerate/utils/operations.py", line 818, in forward [rank4]: return model_forward(*args, **kwargs) [rank4]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ [rank4]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/accelerate/utils/operations.py", line 806, in __call__ [rank4]: return convert_to_fp32(self.model_forward(*args, **kwargs)) [rank4]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ [rank4]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/amp/autocast_mode.py", line 44, in decorate_autocast [rank4]: return func(*args, **kwargs) [rank4]: ^^^^^^^^^^^^^^^^^^^^^ [rank4]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/cut_cross_entropy/transformers/llama.py", line 52, in cce_forward [rank4]: outputs: BaseModelOutputWithPast = self.model( [rank4]: ^^^^^^^^^^^ [rank4]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1773, in _wrapped_call_impl [rank4]: return self._call_impl(*args, **kwargs) [rank4]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ [rank4]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1784, in _call_impl [rank4]: return forward_call(*args, **kwargs) [rank4]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ [rank4]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/transformers/utils/generic.py", line 927, in wrapper [rank4]: outputs = func(self, *args, **kwargs) [rank4]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^ [rank4]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/transformers/models/lfm2/modeling_lfm2.py", line 652, in forward [rank4]: hidden_states = decoder_layer( [rank4]: ^^^^^^^^^^^^^^ [rank4]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/transformers/modeling_layers.py", line 94, in __call__ [rank4]: return super().__call__(*args, **kwargs) [rank4]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ [rank4]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1773, in _wrapped_call_impl [rank4]: return self._call_impl(*args, **kwargs) [rank4]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ [rank4]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1784, in _call_impl [rank4]: return forward_call(*args, **kwargs) [rank4]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ [rank4]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/transformers/utils/deprecation.py", line 172, in wrapped_func [rank4]: return func(*args, **kwargs) [rank4]: ^^^^^^^^^^^^^^^^^^^^^ [rank4]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/transformers/models/lfm2/modeling_lfm2.py", line 556, in forward [rank4]: hidden_states = self.conv( [rank4]: ^^^^^^^^^^ [rank4]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1773, in _wrapped_call_impl [rank4]: return self._call_impl(*args, **kwargs) [rank4]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ [rank4]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1784, in _call_impl [rank4]: return forward_call(*args, **kwargs) [rank4]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ [rank4]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/transformers/utils/deprecation.py", line 172, in wrapped_func [rank4]: return func(*args, **kwargs) [rank4]: ^^^^^^^^^^^^^^^^^^^^^ [rank4]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/transformers/models/lfm2/modeling_lfm2.py", line 517, in forward [rank4]: return self.slow_forward(hidden_states, past_key_values, cache_position, attention_mask) [rank4]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ [rank4]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/transformers/utils/deprecation.py", line 172, in wrapped_func [rank4]: return func(*args, **kwargs) [rank4]: ^^^^^^^^^^^^^^^^^^^^^ [rank4]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/transformers/models/lfm2/modeling_lfm2.py", line 500, in slow_forward [rank4]: conv_out = self.conv(Bx)[..., :seqlen] [rank4]: ^^^^^^^^^^^^^ [rank4]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1773, in _wrapped_call_impl [rank4]: return self._call_impl(*args, **kwargs) [rank4]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ [rank4]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1784, in _call_impl [rank4]: return forward_call(*args, **kwargs) [rank4]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ [rank4]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/nn/modules/conv.py", line 371, in forward [rank4]: return self._conv_forward(input, self.weight, self.bias) [rank4]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ [rank4]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/nn/modules/conv.py", line 366, in _conv_forward [rank4]: return F.conv1d( [rank4]: ^^^^^^^^^ [rank4]: torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 272.00 MiB. GPU 4 has a total capacity of 79.25 GiB of which 160.94 MiB is free. Including non-PyTorch memory, this process has 79.09 GiB memory in use. Of the allocated memory 77.68 GiB is allocated by PyTorch, and 85.86 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) [rank3]: Traceback (most recent call last): [rank3]: File "", line 198, in _run_module_as_main [rank3]: File "", line 88, in _run_code [rank3]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/axolotl/cli/train.py", line 121, in [rank3]: fire.Fire(do_cli) [rank3]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/fire/core.py", line 135, in Fire [rank3]: component_trace = _Fire(component, args, parsed_flag_args, context, name) [rank3]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ [rank3]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/fire/core.py", line 468, in _Fire [rank3]: component, remaining_args = _CallAndUpdateTrace( [rank3]: ^^^^^^^^^^^^^^^^^^^^ [rank3]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/fire/core.py", line 684, in _CallAndUpdateTrace [rank3]: component = fn(*varargs, **kwargs) [rank3]: ^^^^^^^^^^^^^^^^^^^^^^ [rank3]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/axolotl/cli/train.py", line 88, in do_cli [rank3]: return do_train(parsed_cfg, parsed_cli_args) [rank3]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ [rank3]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/axolotl/cli/train.py", line 45, in do_train [rank3]: model, tokenizer, trainer = train(cfg=cfg, dataset_meta=dataset_meta) [rank3]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ [rank3]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/axolotl/train.py", line 584, in train [rank3]: execute_training(cfg, trainer, resume_from_checkpoint) [rank3]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/axolotl/train.py", line 197, in execute_training [rank3]: trainer.train(resume_from_checkpoint=resume_from_checkpoint) [rank3]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/transformers/trainer.py", line 2224, in train [rank3]: return inner_training_loop( [rank3]: ^^^^^^^^^^^^^^^^^^^^ [rank3]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/transformers/trainer.py", line 2561, in _inner_training_loop [rank3]: tr_loss_step = self.training_step(model, inputs, num_items_in_batch) [rank3]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ [rank3]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/axolotl/core/trainers/mixins/activation_checkpointing.py", line 46, in training_step [rank3]: return super().training_step(*args, **kwargs) [rank3]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ [rank3]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/transformers/trainer.py", line 3854, in training_step [rank3]: loss = self.compute_loss(model, inputs, num_items_in_batch=num_items_in_batch) [rank3]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ [rank3]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/axolotl/core/trainers/base.py", line 367, in compute_loss [rank3]: return super().compute_loss( [rank3]: ^^^^^^^^^^^^^^^^^^^^^ [rank3]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/transformers/trainer.py", line 3936, in compute_loss [rank3]: outputs = model(**inputs) [rank3]: ^^^^^^^^^^^^^^^ [rank3]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1773, in _wrapped_call_impl [rank3]: return self._call_impl(*args, **kwargs) [rank3]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ [rank3]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1784, in _call_impl [rank3]: return forward_call(*args, **kwargs) [rank3]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ [rank3]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/nn/parallel/distributed.py", line 1648, in forward [rank3]: else self._run_ddp_forward(*inputs, **kwargs) [rank3]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ [rank3]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/nn/parallel/distributed.py", line 1474, in _run_ddp_forward [rank3]: return self.module(*inputs, **kwargs) # type: ignore[index] [rank3]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ [rank3]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1773, in _wrapped_call_impl [rank3]: return self._call_impl(*args, **kwargs) [rank3]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ [rank3]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1784, in _call_impl [rank3]: return forward_call(*args, **kwargs) [rank3]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ [rank3]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/accelerate/utils/operations.py", line 818, in forward [rank3]: return model_forward(*args, **kwargs) [rank3]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ [rank3]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/accelerate/utils/operations.py", line 806, in __call__ [rank3]: return convert_to_fp32(self.model_forward(*args, **kwargs)) [rank3]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ [rank3]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/amp/autocast_mode.py", line 44, in decorate_autocast [rank3]: return func(*args, **kwargs) [rank3]: ^^^^^^^^^^^^^^^^^^^^^ [rank3]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/cut_cross_entropy/transformers/llama.py", line 52, in cce_forward [rank3]: outputs: BaseModelOutputWithPast = self.model( [rank3]: ^^^^^^^^^^^ [rank3]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1773, in _wrapped_call_impl [rank3]: return self._call_impl(*args, **kwargs) [rank3]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ [rank3]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1784, in _call_impl [rank3]: return forward_call(*args, **kwargs) [rank3]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ [rank3]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/transformers/utils/generic.py", line 927, in wrapper [rank3]: outputs = func(self, *args, **kwargs) [rank3]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^ [rank3]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/transformers/models/lfm2/modeling_lfm2.py", line 652, in forward [rank3]: hidden_states = decoder_layer( [rank3]: ^^^^^^^^^^^^^^ [rank3]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/transformers/modeling_layers.py", line 94, in __call__ [rank3]: return super().__call__(*args, **kwargs) [rank3]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ [rank3]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1773, in _wrapped_call_impl [rank3]: return self._call_impl(*args, **kwargs) [rank3]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ [rank3]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1784, in _call_impl [rank3]: return forward_call(*args, **kwargs) [rank3]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ [rank3]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/transformers/utils/deprecation.py", line 172, in wrapped_func [rank3]: return func(*args, **kwargs) [rank3]: ^^^^^^^^^^^^^^^^^^^^^ [rank3]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/transformers/models/lfm2/modeling_lfm2.py", line 556, in forward [rank3]: hidden_states = self.conv( [rank3]: ^^^^^^^^^^ [rank3]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1773, in _wrapped_call_impl [rank3]: return self._call_impl(*args, **kwargs) [rank3]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ [rank3]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1784, in _call_impl [rank3]: return forward_call(*args, **kwargs) [rank3]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ [rank3]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/transformers/utils/deprecation.py", line 172, in wrapped_func [rank3]: return func(*args, **kwargs) [rank3]: ^^^^^^^^^^^^^^^^^^^^^ [rank3]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/transformers/models/lfm2/modeling_lfm2.py", line 517, in forward [rank3]: return self.slow_forward(hidden_states, past_key_values, cache_position, attention_mask) [rank3]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ [rank3]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/transformers/utils/deprecation.py", line 172, in wrapped_func [rank3]: return func(*args, **kwargs) [rank3]: ^^^^^^^^^^^^^^^^^^^^^ [rank3]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/transformers/models/lfm2/modeling_lfm2.py", line 500, in slow_forward [rank3]: conv_out = self.conv(Bx)[..., :seqlen] [rank3]: ^^^^^^^^^^^^^ [rank3]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1773, in _wrapped_call_impl [rank3]: return self._call_impl(*args, **kwargs) [rank3]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ [rank3]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1784, in _call_impl [rank3]: return forward_call(*args, **kwargs) [rank3]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ [rank3]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/nn/modules/conv.py", line 371, in forward [rank3]: return self._conv_forward(input, self.weight, self.bias) [rank3]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ [rank3]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/nn/modules/conv.py", line 366, in _conv_forward [rank3]: return F.conv1d( [rank3]: ^^^^^^^^^ [rank3]: torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 272.00 MiB. GPU 3 has a total capacity of 79.25 GiB of which 160.94 MiB is free. Including non-PyTorch memory, this process has 79.09 GiB memory in use. Of the allocated memory 77.68 GiB is allocated by PyTorch, and 85.86 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) [rank6]: Traceback (most recent call last): [rank6]: File "", line 198, in _run_module_as_main [rank6]: File "", line 88, in _run_code [rank6]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/axolotl/cli/train.py", line 121, in [rank6]: fire.Fire(do_cli) [rank6]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/fire/core.py", line 135, in Fire [rank6]: component_trace = _Fire(component, args, parsed_flag_args, context, name) [rank6]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ [rank6]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/fire/core.py", line 468, in _Fire [rank6]: component, remaining_args = _CallAndUpdateTrace( [rank6]: ^^^^^^^^^^^^^^^^^^^^ [rank6]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/fire/core.py", line 684, in _CallAndUpdateTrace [rank6]: component = fn(*varargs, **kwargs) [rank6]: ^^^^^^^^^^^^^^^^^^^^^^ [rank6]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/axolotl/cli/train.py", line 88, in do_cli [rank6]: return do_train(parsed_cfg, parsed_cli_args) [rank6]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ [rank6]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/axolotl/cli/train.py", line 45, in do_train [rank6]: model, tokenizer, trainer = train(cfg=cfg, dataset_meta=dataset_meta) [rank6]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ [rank6]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/axolotl/train.py", line 584, in train [rank6]: execute_training(cfg, trainer, resume_from_checkpoint) [rank6]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/axolotl/train.py", line 197, in execute_training [rank6]: trainer.train(resume_from_checkpoint=resume_from_checkpoint) [rank6]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/transformers/trainer.py", line 2224, in train [rank6]: return inner_training_loop( [rank6]: ^^^^^^^^^^^^^^^^^^^^ [rank6]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/transformers/trainer.py", line 2561, in _inner_training_loop [rank6]: tr_loss_step = self.training_step(model, inputs, num_items_in_batch) [rank6]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ [rank6]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/axolotl/core/trainers/mixins/activation_checkpointing.py", line 46, in training_step [rank6]: return super().training_step(*args, **kwargs) [rank6]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ [rank6]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/transformers/trainer.py", line 3854, in training_step [rank6]: loss = self.compute_loss(model, inputs, num_items_in_batch=num_items_in_batch) [rank6]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ [rank6]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/axolotl/core/trainers/base.py", line 367, in compute_loss [rank6]: return super().compute_loss( [rank6]: ^^^^^^^^^^^^^^^^^^^^^ [rank6]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/transformers/trainer.py", line 3936, in compute_loss [rank6]: outputs = model(**inputs) [rank6]: ^^^^^^^^^^^^^^^ [rank6]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1773, in _wrapped_call_impl [rank6]: return self._call_impl(*args, **kwargs) [rank6]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ [rank6]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1784, in _call_impl [rank6]: return forward_call(*args, **kwargs) [rank6]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ [rank6]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/nn/parallel/distributed.py", line 1648, in forward [rank6]: else self._run_ddp_forward(*inputs, **kwargs) [rank6]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ [rank6]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/nn/parallel/distributed.py", line 1474, in _run_ddp_forward [rank6]: return self.module(*inputs, **kwargs) # type: ignore[index] [rank6]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ [rank6]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1773, in _wrapped_call_impl [rank6]: return self._call_impl(*args, **kwargs) [rank6]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ [rank6]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1784, in _call_impl [rank6]: return forward_call(*args, **kwargs) [rank6]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ [rank6]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/accelerate/utils/operations.py", line 818, in forward [rank6]: return model_forward(*args, **kwargs) [rank6]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ [rank6]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/accelerate/utils/operations.py", line 806, in __call__ [rank6]: return convert_to_fp32(self.model_forward(*args, **kwargs)) [rank6]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ [rank6]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/amp/autocast_mode.py", line 44, in decorate_autocast [rank6]: return func(*args, **kwargs) [rank6]: ^^^^^^^^^^^^^^^^^^^^^ [rank6]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/cut_cross_entropy/transformers/llama.py", line 52, in cce_forward [rank6]: outputs: BaseModelOutputWithPast = self.model( [rank6]: ^^^^^^^^^^^ [rank6]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1773, in _wrapped_call_impl [rank6]: return self._call_impl(*args, **kwargs) [rank6]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ [rank6]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1784, in _call_impl [rank6]: return forward_call(*args, **kwargs) [rank6]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ [rank6]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/transformers/utils/generic.py", line 927, in wrapper [rank6]: outputs = func(self, *args, **kwargs) [rank6]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^ [rank6]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/transformers/models/lfm2/modeling_lfm2.py", line 652, in forward [rank6]: hidden_states = decoder_layer( [rank6]: ^^^^^^^^^^^^^^ [rank6]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/transformers/modeling_layers.py", line 94, in __call__ [rank6]: return super().__call__(*args, **kwargs) [rank6]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ [rank6]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1773, in _wrapped_call_impl [rank6]: return self._call_impl(*args, **kwargs) [rank6]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ [rank6]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1784, in _call_impl [rank6]: return forward_call(*args, **kwargs) [rank6]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ [rank6]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/transformers/utils/deprecation.py", line 172, in wrapped_func [rank6]: return func(*args, **kwargs) [rank6]: ^^^^^^^^^^^^^^^^^^^^^ [rank6]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/transformers/models/lfm2/modeling_lfm2.py", line 556, in forward [rank6]: hidden_states = self.conv( [rank6]: ^^^^^^^^^^ [rank6]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1773, in _wrapped_call_impl [rank6]: return self._call_impl(*args, **kwargs) [rank6]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ [rank6]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1784, in _call_impl [rank6]: return forward_call(*args, **kwargs) [rank6]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ [rank6]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/transformers/utils/deprecation.py", line 172, in wrapped_func [rank6]: return func(*args, **kwargs) [rank6]: ^^^^^^^^^^^^^^^^^^^^^ [rank6]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/transformers/models/lfm2/modeling_lfm2.py", line 517, in forward [rank6]: return self.slow_forward(hidden_states, past_key_values, cache_position, attention_mask) [rank6]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ [rank6]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/transformers/utils/deprecation.py", line 172, in wrapped_func [rank6]: return func(*args, **kwargs) [rank6]: ^^^^^^^^^^^^^^^^^^^^^ [rank6]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/transformers/models/lfm2/modeling_lfm2.py", line 500, in slow_forward [rank6]: conv_out = self.conv(Bx)[..., :seqlen] [rank6]: ^^^^^^^^^^^^^ [rank6]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1773, in _wrapped_call_impl [rank6]: return self._call_impl(*args, **kwargs) [rank6]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ [rank6]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1784, in _call_impl [rank6]: return forward_call(*args, **kwargs) [rank6]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ [rank6]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/nn/modules/conv.py", line 371, in forward [rank6]: return self._conv_forward(input, self.weight, self.bias) [rank6]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ [rank6]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/nn/modules/conv.py", line 366, in _conv_forward [rank6]: return F.conv1d( [rank6]: ^^^^^^^^^ [rank6]: torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 272.00 MiB. GPU 6 has a total capacity of 79.25 GiB of which 160.94 MiB is free. Including non-PyTorch memory, this process has 79.09 GiB memory in use. Of the allocated memory 77.68 GiB is allocated by PyTorch, and 85.86 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) [rank7]: Traceback (most recent call last): [rank7]: File "", line 198, in _run_module_as_main [rank7]: File "", line 88, in _run_code [rank7]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/axolotl/cli/train.py", line 121, in [rank7]: fire.Fire(do_cli) [rank7]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/fire/core.py", line 135, in Fire [rank7]: component_trace = _Fire(component, args, parsed_flag_args, context, name) [rank7]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ [rank7]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/fire/core.py", line 468, in _Fire [rank7]: component, remaining_args = _CallAndUpdateTrace( [rank7]: ^^^^^^^^^^^^^^^^^^^^ [rank7]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/fire/core.py", line 684, in _CallAndUpdateTrace [rank7]: component = fn(*varargs, **kwargs) [rank7]: ^^^^^^^^^^^^^^^^^^^^^^ [rank7]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/axolotl/cli/train.py", line 88, in do_cli [rank7]: return do_train(parsed_cfg, parsed_cli_args) [rank7]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ [rank7]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/axolotl/cli/train.py", line 45, in do_train [rank7]: model, tokenizer, trainer = train(cfg=cfg, dataset_meta=dataset_meta) [rank7]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ [rank7]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/axolotl/train.py", line 584, in train [rank7]: execute_training(cfg, trainer, resume_from_checkpoint) [rank7]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/axolotl/train.py", line 197, in execute_training [rank7]: trainer.train(resume_from_checkpoint=resume_from_checkpoint) [rank7]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/transformers/trainer.py", line 2224, in train [rank7]: return inner_training_loop( [rank7]: ^^^^^^^^^^^^^^^^^^^^ [rank7]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/transformers/trainer.py", line 2561, in _inner_training_loop [rank7]: tr_loss_step = self.training_step(model, inputs, num_items_in_batch) [rank7]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ [rank7]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/axolotl/core/trainers/mixins/activation_checkpointing.py", line 46, in training_step [rank7]: return super().training_step(*args, **kwargs) [rank7]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ [rank7]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/transformers/trainer.py", line 3854, in training_step [rank7]: loss = self.compute_loss(model, inputs, num_items_in_batch=num_items_in_batch) [rank7]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ [rank7]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/axolotl/core/trainers/base.py", line 367, in compute_loss [rank7]: return super().compute_loss( [rank7]: ^^^^^^^^^^^^^^^^^^^^^ [rank7]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/transformers/trainer.py", line 3936, in compute_loss [rank7]: outputs = model(**inputs) [rank7]: ^^^^^^^^^^^^^^^ [rank7]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1773, in _wrapped_call_impl [rank7]: return self._call_impl(*args, **kwargs) [rank7]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ [rank7]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1784, in _call_impl [rank7]: return forward_call(*args, **kwargs) [rank7]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ [rank7]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/nn/parallel/distributed.py", line 1648, in forward [rank7]: else self._run_ddp_forward(*inputs, **kwargs) [rank7]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ [rank7]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/nn/parallel/distributed.py", line 1474, in _run_ddp_forward [rank7]: return self.module(*inputs, **kwargs) # type: ignore[index] [rank7]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ [rank7]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1773, in _wrapped_call_impl [rank7]: return self._call_impl(*args, **kwargs) [rank7]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ [rank7]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1784, in _call_impl [rank7]: return forward_call(*args, **kwargs) [rank7]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ [rank7]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/accelerate/utils/operations.py", line 818, in forward [rank7]: return model_forward(*args, **kwargs) [rank7]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ [rank7]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/accelerate/utils/operations.py", line 806, in __call__ [rank7]: return convert_to_fp32(self.model_forward(*args, **kwargs)) [rank7]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ [rank7]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/amp/autocast_mode.py", line 44, in decorate_autocast [rank7]: return func(*args, **kwargs) [rank7]: ^^^^^^^^^^^^^^^^^^^^^ [rank7]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/cut_cross_entropy/transformers/llama.py", line 52, in cce_forward [rank7]: outputs: BaseModelOutputWithPast = self.model( [rank7]: ^^^^^^^^^^^ [rank7]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1773, in _wrapped_call_impl [rank7]: return self._call_impl(*args, **kwargs) [rank7]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ [rank7]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1784, in _call_impl [rank7]: return forward_call(*args, **kwargs) [rank7]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ [rank7]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/transformers/utils/generic.py", line 927, in wrapper [rank7]: outputs = func(self, *args, **kwargs) [rank7]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^ [rank7]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/transformers/models/lfm2/modeling_lfm2.py", line 652, in forward [rank7]: hidden_states = decoder_layer( [rank7]: ^^^^^^^^^^^^^^ [rank7]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/transformers/modeling_layers.py", line 94, in __call__ [rank7]: return super().__call__(*args, **kwargs) [rank7]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ [rank7]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1773, in _wrapped_call_impl [rank7]: return self._call_impl(*args, **kwargs) [rank7]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ [rank7]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1784, in _call_impl [rank7]: return forward_call(*args, **kwargs) [rank7]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ [rank7]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/transformers/utils/deprecation.py", line 172, in wrapped_func [rank7]: return func(*args, **kwargs) [rank7]: ^^^^^^^^^^^^^^^^^^^^^ [rank7]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/transformers/models/lfm2/modeling_lfm2.py", line 556, in forward [rank7]: hidden_states = self.conv( [rank7]: ^^^^^^^^^^ [rank7]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1773, in _wrapped_call_impl [rank7]: return self._call_impl(*args, **kwargs) [rank7]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ [rank7]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1784, in _call_impl [rank7]: return forward_call(*args, **kwargs) [rank7]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ [rank7]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/transformers/utils/deprecation.py", line 172, in wrapped_func [rank7]: return func(*args, **kwargs) [rank7]: ^^^^^^^^^^^^^^^^^^^^^ [rank7]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/transformers/models/lfm2/modeling_lfm2.py", line 517, in forward [rank7]: return self.slow_forward(hidden_states, past_key_values, cache_position, attention_mask) [rank7]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ [rank7]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/transformers/utils/deprecation.py", line 172, in wrapped_func [rank7]: return func(*args, **kwargs) [rank7]: ^^^^^^^^^^^^^^^^^^^^^ [rank7]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/transformers/models/lfm2/modeling_lfm2.py", line 500, in slow_forward [rank7]: conv_out = self.conv(Bx)[..., :seqlen] [rank7]: ^^^^^^^^^^^^^ [rank7]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1773, in _wrapped_call_impl [rank7]: return self._call_impl(*args, **kwargs) [rank7]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ [rank7]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1784, in _call_impl [rank7]: return forward_call(*args, **kwargs) [rank7]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ [rank7]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/nn/modules/conv.py", line 371, in forward [rank7]: return self._conv_forward(input, self.weight, self.bias) [rank7]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ [rank7]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/nn/modules/conv.py", line 366, in _conv_forward [rank7]: return F.conv1d( [rank7]: ^^^^^^^^^ [rank7]: torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 272.00 MiB. GPU 7 has a total capacity of 79.25 GiB of which 160.94 MiB is free. Including non-PyTorch memory, this process has 79.09 GiB memory in use. Of the allocated memory 77.68 GiB is allocated by PyTorch, and 85.86 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) Exception in thread Thread-8 (_pin_memory_loop): Traceback (most recent call last): File "/usr/lib/python3.12/threading.py", line 1073, in _bootstrap_inner self.run() File "/usr/lib/python3.12/threading.py", line 1010, in run self._target(*self._args, **self._kwargs) File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/utils/data/_utils/pin_memory.py", line 61, in _pin_memory_loop do_one_step() File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/utils/data/_utils/pin_memory.py", line 37, in do_one_step r = in_queue.get(timeout=MP_STATUS_CHECK_INTERVAL) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/usr/lib/python3.12/multiprocessing/queues.py", line 122, in get return _ForkingPickler.loads(res) ^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/multiprocessing/reductions.py", line 541, in rebuild_storage_fd fd = df.detach() ^^^^^^^^^^^ File "/usr/lib/python3.12/multiprocessing/resource_sharer.py", line 57, in detach with _resource_sharer.get_connection(self._id) as conn: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/usr/lib/python3.12/multiprocessing/resource_sharer.py", line 86, in get_connection c = Client(address, authkey=process.current_process().authkey) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/usr/lib/python3.12/multiprocessing/connection.py", line 526, in Client deliver_challenge(c, authkey) File "/usr/lib/python3.12/multiprocessing/connection.py", line 939, in deliver_challenge response = connection.recv_bytes(256) # reject large message ^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/usr/lib/python3.12/multiprocessing/connection.py", line 216, in recv_bytes buf = self._recv_bytes(maxlength) ^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/usr/lib/python3.12/multiprocessing/connection.py", line 430, in _recv_bytes buf = self._recv(4) ^^^^^^^^^^^^^ File "/usr/lib/python3.12/multiprocessing/connection.py", line 395, in _recv chunk = read(handle, remaining) ^^^^^^^^^^^^^^^^^^^^^^^ ConnectionResetError: [Errno 104] Connection reset by peer Exception in thread Thread-20 (_pin_memory_loop): Traceback (most recent call last): File "/usr/lib/python3.12/threading.py", line 1073, in _bootstrap_inner self.run() File "/usr/lib/python3.12/threading.py", line 1010, in run self._target(*self._args, **self._kwargs) File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/utils/data/_utils/pin_memory.py", line 61, in _pin_memory_loop do_one_step() File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/utils/data/_utils/pin_memory.py", line 37, in do_one_step r = in_queue.get(timeout=MP_STATUS_CHECK_INTERVAL) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/usr/lib/python3.12/multiprocessing/queues.py", line 122, in get return _ForkingPickler.loads(res) ^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/multiprocessing/reductions.py", line 541, in rebuild_storage_fd fd = df.detach() ^^^^^^^^^^^ File "/usr/lib/python3.12/multiprocessing/resource_sharer.py", line 57, in detach with _resource_sharer.get_connection(self._id) as conn: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/usr/lib/python3.12/multiprocessing/resource_sharer.py", line 86, in get_connection c = Client(address, authkey=process.current_process().authkey) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/usr/lib/python3.12/multiprocessing/connection.py", line 525, in Client answer_challenge(c, authkey) File "/usr/lib/python3.12/multiprocessing/connection.py", line 962, in answer_challenge response = connection.recv_bytes(256) # reject large message ^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/usr/lib/python3.12/multiprocessing/connection.py", line 216, in recv_bytes buf = self._recv_bytes(maxlength) ^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/usr/lib/python3.12/multiprocessing/connection.py", line 430, in _recv_bytes buf = self._recv(4) ^^^^^^^^^^^^^ File "/usr/lib/python3.12/multiprocessing/connection.py", line 395, in _recv chunk = read(handle, remaining) ^^^^^^^^^^^^^^^^^^^^^^^ ConnectionResetError: [Errno 104] Connection reset by peer Exception in thread Thread-8 (_pin_memory_loop): Traceback (most recent call last): File "/usr/lib/python3.12/threading.py", line 1073, in _bootstrap_inner self.run() File "/usr/lib/python3.12/threading.py", line 1010, in run self._target(*self._args, **self._kwargs) File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/utils/data/_utils/pin_memory.py", line 61, in _pin_memory_loop do_one_step() File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/utils/data/_utils/pin_memory.py", line 37, in do_one_step r = in_queue.get(timeout=MP_STATUS_CHECK_INTERVAL) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/usr/lib/python3.12/multiprocessing/queues.py", line 122, in get return _ForkingPickler.loads(res) ^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/multiprocessing/reductions.py", line 541, in rebuild_storage_fd fd = df.detach() ^^^^^^^^^^^ File "/usr/lib/python3.12/multiprocessing/resource_sharer.py", line 57, in detach with _resource_sharer.get_connection(self._id) as conn: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/usr/lib/python3.12/multiprocessing/resource_sharer.py", line 86, in get_connection c = Client(address, authkey=process.current_process().authkey) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/usr/lib/python3.12/multiprocessing/connection.py", line 525, in Client answer_challenge(c, authkey) File "/usr/lib/python3.12/multiprocessing/connection.py", line 962, in answer_challenge response = connection.recv_bytes(256) # reject large message ^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/usr/lib/python3.12/multiprocessing/connection.py", line 216, in recv_bytes buf = self._recv_bytes(maxlength) ^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/usr/lib/python3.12/multiprocessing/connection.py", line 430, in _recv_bytes buf = self._recv(4) ^^^^^^^^^^^^^ File "/usr/lib/python3.12/multiprocessing/connection.py", line 395, in _recv chunk = read(handle, remaining) ^^^^^^^^^^^^^^^^^^^^^^^ ConnectionResetError: [Errno 104] Connection reset by peer Exception in thread Thread-8 (_pin_memory_loop): Traceback (most recent call last): File "/usr/lib/python3.12/threading.py", line 1073, in _bootstrap_inner self.run() File "/usr/lib/python3.12/threading.py", line 1010, in run self._target(*self._args, **self._kwargs) File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/utils/data/_utils/pin_memory.py", line 61, in _pin_memory_loop do_one_step() File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/utils/data/_utils/pin_memory.py", line 37, in do_one_step r = in_queue.get(timeout=MP_STATUS_CHECK_INTERVAL) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/usr/lib/python3.12/multiprocessing/queues.py", line 122, in get return _ForkingPickler.loads(res) ^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/multiprocessing/reductions.py", line 541, in rebuild_storage_fd fd = df.detach() ^^^^^^^^^^^ File "/usr/lib/python3.12/multiprocessing/resource_sharer.py", line 57, in detach with _resource_sharer.get_connection(self._id) as conn: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/usr/lib/python3.12/multiprocessing/resource_sharer.py", line 86, in get_connection c = Client(address, authkey=process.current_process().authkey) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/usr/lib/python3.12/multiprocessing/connection.py", line 525, in Client answer_challenge(c, authkey) File "/usr/lib/python3.12/multiprocessing/connection.py", line 953, in answer_challenge message = connection.recv_bytes(256) # reject large message ^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/usr/lib/python3.12/multiprocessing/connection.py", line 216, in recv_bytes buf = self._recv_bytes(maxlength) ^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/usr/lib/python3.12/multiprocessing/connection.py", line 430, in _recv_bytes buf = self._recv(4) ^^^^^^^^^^^^^ File "/usr/lib/python3.12/multiprocessing/connection.py", line 399, in _recv raise EOFError EOFError Exception in thread Thread-8 (_pin_memory_loop): Traceback (most recent call last): File "/usr/lib/python3.12/threading.py", line 1073, in _bootstrap_inner self.run() File "/usr/lib/python3.12/threading.py", line 1010, in run self._target(*self._args, **self._kwargs) File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/utils/data/_utils/pin_memory.py", line 61, in _pin_memory_loop do_one_step() File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/utils/data/_utils/pin_memory.py", line 37, in do_one_step r = in_queue.get(timeout=MP_STATUS_CHECK_INTERVAL) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/usr/lib/python3.12/multiprocessing/queues.py", line 122, in get return _ForkingPickler.loads(res) ^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/multiprocessing/reductions.py", line 541, in rebuild_storage_fd fd = df.detach() ^^^^^^^^^^^ File "/usr/lib/python3.12/multiprocessing/resource_sharer.py", line 57, in detach with _resource_sharer.get_connection(self._id) as conn: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/usr/lib/python3.12/multiprocessing/resource_sharer.py", line 86, in get_connection c = Client(address, authkey=process.current_process().authkey) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/usr/lib/python3.12/multiprocessing/connection.py", line 525, in Client answer_challenge(c, authkey) File "/usr/lib/python3.12/multiprocessing/connection.py", line 953, in answer_challenge message = connection.recv_bytes(256) # reject large message ^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/usr/lib/python3.12/multiprocessing/connection.py", line 216, in recv_bytes buf = self._recv_bytes(maxlength) ^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/usr/lib/python3.12/multiprocessing/connection.py", line 430, in _recv_bytes buf = self._recv(4) ^^^^^^^^^^^^^ File "/usr/lib/python3.12/multiprocessing/connection.py", line 399, in _recv raise EOFError EOFError Exception in thread Thread-8 (_pin_memory_loop): Traceback (most recent call last): File "/usr/lib/python3.12/threading.py", line 1073, in _bootstrap_inner self.run() File "/usr/lib/python3.12/threading.py", line 1010, in run self._target(*self._args, **self._kwargs) File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/utils/data/_utils/pin_memory.py", line 61, in _pin_memory_loop do_one_step() File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/utils/data/_utils/pin_memory.py", line 37, in do_one_step r = in_queue.get(timeout=MP_STATUS_CHECK_INTERVAL) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/usr/lib/python3.12/multiprocessing/queues.py", line 122, in get return _ForkingPickler.loads(res) ^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/multiprocessing/reductions.py", line 541, in rebuild_storage_fd fd = df.detach() ^^^^^^^^^^^ File "/usr/lib/python3.12/multiprocessing/resource_sharer.py", line 57, in detach with _resource_sharer.get_connection(self._id) as conn: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/usr/lib/python3.12/multiprocessing/resource_sharer.py", line 86, in get_connection c = Client(address, authkey=process.current_process().authkey) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/usr/lib/python3.12/multiprocessing/connection.py", line 525, in Client answer_challenge(c, authkey) File "/usr/lib/python3.12/multiprocessing/connection.py", line 962, in answer_challenge response = connection.recv_bytes(256) # reject large message ^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/usr/lib/python3.12/multiprocessing/connection.py", line 216, in recv_bytes buf = self._recv_bytes(maxlength) ^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/usr/lib/python3.12/multiprocessing/connection.py", line 430, in _recv_bytes buf = self._recv(4) ^^^^^^^^^^^^^ File "/usr/lib/python3.12/multiprocessing/connection.py", line 399, in _recv raise EOFError EOFError Exception in thread Thread-8 (_pin_memory_loop): Traceback (most recent call last): File "/usr/lib/python3.12/threading.py", line 1073, in _bootstrap_inner self.run() File "/usr/lib/python3.12/threading.py", line 1010, in run self._target(*self._args, **self._kwargs) File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/utils/data/_utils/pin_memory.py", line 61, in _pin_memory_loop do_one_step() File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/utils/data/_utils/pin_memory.py", line 37, in do_one_step r = in_queue.get(timeout=MP_STATUS_CHECK_INTERVAL) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/usr/lib/python3.12/multiprocessing/queues.py", line 122, in get return _ForkingPickler.loads(res) ^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/multiprocessing/reductions.py", line 541, in rebuild_storage_fd fd = df.detach() ^^^^^^^^^^^ File "/usr/lib/python3.12/multiprocessing/resource_sharer.py", line 57, in detach with _resource_sharer.get_connection(self._id) as conn: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/usr/lib/python3.12/multiprocessing/resource_sharer.py", line 86, in get_connection c = Client(address, authkey=process.current_process().authkey) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/usr/lib/python3.12/multiprocessing/connection.py", line 525, in Client answer_challenge(c, authkey) File "/usr/lib/python3.12/multiprocessing/connection.py", line 962, in answer_challenge response = connection.recv_bytes(256) # reject large message ^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/usr/lib/python3.12/multiprocessing/connection.py", line 216, in recv_bytes buf = self._recv_bytes(maxlength) ^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/usr/lib/python3.12/multiprocessing/connection.py", line 430, in _recv_bytes buf = self._recv(4) ^^^^^^^^^^^^^ File "/usr/lib/python3.12/multiprocessing/connection.py", line 395, in _recv chunk = read(handle, remaining) ^^^^^^^^^^^^^^^^^^^^^^^ ConnectionResetError: [Errno 104] Connection reset by peer [2025-10-12 02:54:14,858] [DEBUG] [axolotl.utils.config.log_gpu_memory_usage:127] [PID:1386789] baseline 0.000GB () [2025-10-12 02:54:14,858] [INFO] [axolotl.cli.config.load_cfg:248] [PID:1386789] config: { "activation_offloading": false, "auto_resume_from_checkpoints": true, "axolotl_config_path": "train_350m_multitask.yaml", "base_model": "/home/ubuntu/axolotl/out-350m-audio-pt", "base_model_config": "/home/ubuntu/axolotl/out-350m-audio-pt", "batch_size": 256, "bf16": true, "bfloat16": true, "capabilities": { "bf16": true, "compute_capability": "sm_80", "fp8": false, "n_gpu": 8, "n_node": 1 }, "context_parallel_size": 1, "cut_cross_entropy": true, "dataloader_num_workers": 8, "dataloader_pin_memory": true, "dataloader_prefetch_factor": 256, "dataset_prepared_path": "/home/ubuntu/axolotl/preprocessed-data-350m-multitask-ft", "dataset_processes": 240, "datasets": [ { "ds_type": "json", "message_property_mappings": { "content": "content", "role": "role" }, "path": "/home/ubuntu/axolotl/hackathon-train_data-s2s-jaen.jsonl", "trust_remote_code": false }, { "ds_type": "json", "message_property_mappings": { "content": "content", "role": "role" }, "path": "/home/ubuntu/axolotl/hackathon-train_data-s2s-enja.jsonl", "trust_remote_code": false }, { "ds_type": "json", "message_property_mappings": { "content": "content", "role": "role" }, "path": "/home/ubuntu/axolotl/hackathon-train_data-asr-ja.jsonl", "trust_remote_code": false }, { "ds_type": "json", "message_property_mappings": { "content": "content", "role": "role" }, "path": "/home/ubuntu/axolotl/hackathon-train_data-tts-ja.jsonl", "trust_remote_code": false }, { "ds_type": "json", "message_property_mappings": { "content": "content", "role": "role" }, "path": "/home/ubuntu/axolotl/hackathon-train_data-asr-en.jsonl", "trust_remote_code": false }, { "ds_type": "json", "message_property_mappings": { "content": "content", "role": "role" }, "path": "/home/ubuntu/axolotl/hackathon-train_data-tts-en.jsonl", "trust_remote_code": false } ], "ddp": true, "device": "cuda:0", "device_map": { "": 0 }, "dion_rank_fraction": 1.0, "dion_rank_multiple_of": 1, "env_capabilities": { "torch_version": "2.8.0" }, "eval_batch_size": 8, "eval_causal_lm_metrics": [ "sacrebleu", "comet", "ter", "chrf" ], "eval_max_new_tokens": 128, "eval_sample_packing": false, "eval_steps": 100, "eval_strategy": "steps", "eval_table_size": 0, "experimental_skip_move_to_device": true, "flash_attention": true, "fp16": false, "gradient_accumulation_steps": 1, "gradient_checkpointing": true, "gradient_checkpointing_kwargs": { "use_reentrant": true }, "group_by_length": false, "include_tkps": true, "is_falcon_derived_model": false, "is_llama_derived_model": false, "is_mistral_derived_model": false, "learning_rate": 5e-05, "lisa_layers_attribute": "model.layers", "load_best_model_at_end": false, "load_in_4bit": false, "load_in_8bit": false, "local_rank": 0, "logging_steps": 1, "loraplus_lr_embedding": 1e-06, "lr_scheduler": "cosine", "max_grad_norm": 1.0, "mean_resizing_embeddings": false, "micro_batch_size": 32, "model_config_type": "lfm2", "num_epochs": 3.0, "optimizer": "adamw_torch_fused", "output_dir": "/home/ubuntu/axolotl/out-350m-multitask-ft", "pad_to_sequence_len": true, "plugins": [ "axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin" ], "pretrain_multipack_attn": true, "profiler_steps_start": 0, "qlora_sharded_model_loading": false, "ray_num_workers": 1, "remove_unused_columns": false, "resources_per_worker": { "GPU": 1 }, "sample_packing": true, "sample_packing_bin_size": 200, "sample_packing_group_size": 100000, "save_only_model": false, "save_safetensors": true, "save_steps": 100, "save_strategy": "steps", "sequence_len": 4096, "shuffle_before_merging_datasets": false, "shuffle_merged_datasets": true, "skip_prepare_dataset": false, "streaming_multipack_buffer_size": 10000, "strict": false, "tensor_parallel_size": 1, "tf32": true, "tiled_mlp_use_original_mlp": true, "tokenizer_config": "/home/ubuntu/axolotl/out-350m-audio-pt", "tokenizer_save_jinja_files": true, "tokenizer_type": "AutoTokenizer", "torch_dtype": "torch.bfloat16", "train_on_inputs": false, "trl": { "log_completions": false, "mask_truncated_completions": false, "ref_model_mixup_alpha": 0.9, "ref_model_sync_steps": 64, "scale_rewards": true, "sync_ref_model": false, "use_vllm": false, "vllm_server_host": "0.0.0.0", "vllm_server_port": 8000 }, "type_of_model": "AutoModelForCausalLM", "use_ray": false, "use_wandb": true, "val_set_size": 0.01, "vllm": { "device": "auto", "dtype": "auto", "gpu_memory_utilization": 0.9, "host": "0.0.0.0", "port": 8000 }, "wandb_entity": "aratako-lm", "wandb_name": "350m-multitask-ft-run1", "wandb_project": "liquidai-hackathon", "warmup_ratio": 0.1, "weight_decay": 0.01, "world_size": 8 } Loading dataset from disk: 0%| | 0/240 [00:00 [2025-10-12 02:55:31,480] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:279] [PID:1386789] BOS: 1 / <|startoftext|> [2025-10-12 02:55:31,480] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:280] [PID:1386789] PAD: 0 / <|pad|> [2025-10-12 02:55:31,480] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:281] [PID:1386789] UNK: None / None [2025-10-12 02:55:31,482] [INFO] [axolotl.utils.data.shared.load_preprocessed_dataset:470] [PID:1386789] Loading prepared dataset from disk at /home/ubuntu/axolotl/preprocessed-data-350m-multitask-ft/28514821cb56568b4099ac280cc69eed... Loading dataset from disk: 0%| | 0/240 [00:00 [2025-10-12 02:57:13,462] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:279] [PID:1386789] BOS: 1 / <|startoftext|> [2025-10-12 02:57:13,462] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:280] [PID:1386789] PAD: 0 / <|pad|> [2025-10-12 02:57:13,463] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:281] [PID:1386789] UNK: None / None [2025-10-12 02:57:13,463] [DEBUG] [axolotl.train.setup_model_and_tokenizer:74] [PID:1386789] Loading model [2025-10-12 02:57:13,475] [DEBUG] [axolotl.monkeypatch.transformers.trainer_loss_calc.patch_evaluation_loop:87] [PID:1386789] Patched Trainer.evaluation_loop with nanmean loss calculation [2025-10-12 02:57:13,476] [DEBUG] [axolotl.monkeypatch.transformers.trainer_loss_calc.patch_maybe_log_save_evaluate:138] [PID:1386789] Patched Trainer._maybe_log_save_evaluate with nanmean loss calculation [2025-10-12 02:57:13,476] [INFO] [axolotl.loaders.patch_manager._apply_multipack_patches:301] [PID:1386789] Applying multipack dataloader patch for sample packing... [2025-10-12 02:57:13,888] [INFO] [axolotl.integrations.cut_cross_entropy.pre_model_load:94] [PID:1386789] Applying Cut Cross Entropy to model type: lfm2 [2025-10-12 02:57:14,225] [INFO] [axolotl.loaders.model._configure_embedding_dtypes:345] [PID:1386789] Converting modules to torch.bfloat16 [2025-10-12 02:57:14,228] [DEBUG] [axolotl.loaders.model.log_gpu_memory_usage:127] [PID:1386789] Memory usage after model load 1.285GB (+1.285GB allocated, +1.303GB reserved) [2025-10-12 02:57:17,783] [INFO] [axolotl.train.save_initial_configs:402] [PID:1386789] Pre-saving tokenizer to /home/ubuntu/axolotl/out-350m-multitask-ft... [2025-10-12 02:57:18,497] [INFO] [axolotl.train.save_initial_configs:407] [PID:1386789] Pre-saving model config to /home/ubuntu/axolotl/out-350m-multitask-ft... [2025-10-12 02:57:18,500] [INFO] [axolotl.train.execute_training:196] [PID:1386789] Starting trainer... [2025-10-12 03:04:21,879] [WARNING] [py.warnings._showwarnmsg:110] [PID:1386790] /home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/distributed/distributed_c10d.py:4807: UserWarning: No device id is provided via `init_process_group` or `barrier `. Using the current device set by the user. warnings.warn( # warn only once [2025-10-12 03:04:22,505] [WARNING] [py.warnings._showwarnmsg:110] [PID:1386795] /home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/distributed/distributed_c10d.py:4807: UserWarning: No device id is provided via `init_process_group` or `barrier `. Using the current device set by the user. warnings.warn( # warn only once [2025-10-12 03:04:23,556] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:1386789] generate_batches time: 3.02742075920105 [2025-10-12 03:04:28,898] [WARNING] [py.warnings._showwarnmsg:110] [PID:1386794] /home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/distributed/distributed_c10d.py:4807: UserWarning: No device id is provided via `init_process_group` or `barrier `. Using the current device set by the user. warnings.warn( # warn only once [2025-10-12 03:04:50,921] [WARNING] [py.warnings._showwarnmsg:110] [PID:1386796] /home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/distributed/distributed_c10d.py:4807: UserWarning: No device id is provided via `init_process_group` or `barrier `. Using the current device set by the user. warnings.warn( # warn only once [2025-10-12 03:05:22,805] [WARNING] [py.warnings._showwarnmsg:110] [PID:1386793] /home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/distributed/distributed_c10d.py:4807: UserWarning: No device id is provided via `init_process_group` or `barrier `. Using the current device set by the user. warnings.warn( # warn only once [2025-10-12 03:08:17,541] [WARNING] [py.warnings._showwarnmsg:110] [PID:1386791] /home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/distributed/distributed_c10d.py:4807: UserWarning: No device id is provided via `init_process_group` or `barrier `. Using the current device set by the user. warnings.warn( # warn only once [2025-10-12 03:08:50,888] [WARNING] [py.warnings._showwarnmsg:110] [PID:1386792] /home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/distributed/distributed_c10d.py:4807: UserWarning: No device id is provided via `init_process_group` or `barrier `. Using the current device set by the user. warnings.warn( # warn only once [2025-10-12 03:08:50,889] [INFO] [axolotl.utils.samplers.multipack.calc_min_len:438] [PID:1386789] gather_len_batches: [7647, 7647, 7647, 7647, 7647, 7647, 7647, 7647] [2025-10-12 03:08:50,891] [WARNING] [py.warnings._showwarnmsg:110] [PID:1386789] /home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/distributed/distributed_c10d.py:4807: UserWarning: No device id is provided via `init_process_group` or `barrier `. Using the current device set by the user. warnings.warn( # warn only once wandb: Currently logged in as: aratako1998 (aratako-lm) to https://api.wandb.ai. Use `wandb login --relogin` to force relogin wandb: ⢿ Waiting for wandb.init()... m wandb: Tracking run with wandb version 0.22.2 wandb: Run data is saved locally in /home/ubuntu/axolotl/wandb/run-20251012_030851-agrcquxp wandb: Run `wandb offline` to turn off syncing. wandb: Syncing run 350m-multitask-ft-run1 wandb: ⭐️ View project at https://wandb.ai/aratako-lm/liquidai-hackathon wandb: 🚀 View run at https://wandb.ai/aratako-lm/liquidai-hackathon/runs/agrcquxp wandb: Detected [huggingface_hub.inference] in use. wandb: Use W&B Weave for improved LLM call tracing. Install Weave with `pip install weave` then add `import weave` to the top of your script. wandb: For more information, check out the docs at: https://weave-docs.wandb.ai/ wandb: WARNING Saving files without folders. If you want to preserve subdirectories pass base_path to wandb.save, i.e. wandb.save("/mnt/folder/file.h5", base_path="/mnt") [2025-10-12 03:08:52,307] [INFO] [axolotl.utils.callbacks.on_train_begin:757] [PID:1386789] The Axolotl config has been saved to the WandB run under files. 0%| | 0/2865 [00:00