diff --git "a/debug.log" "b/debug.log" new file mode 100644--- /dev/null +++ "b/debug.log" @@ -0,0 +1,10587 @@ +[2025-10-12 02:39:16,867] [DEBUG] [axolotl.utils.config.log_gpu_memory_usage:127] [PID:1367687] baseline 0.000GB () +[2025-10-12 02:39:16,867] [INFO] [axolotl.cli.config.load_cfg:248] [PID:1367687] config: +{ + "activation_offloading": false, + "auto_resume_from_checkpoints": true, + "axolotl_config_path": "train_350m_multitask.yaml", + "base_model": "/home/ubuntu/axolotl/out-350m-audio-pt", + "base_model_config": "/home/ubuntu/axolotl/out-350m-audio-pt", + "batch_size": 256, + "bf16": true, + "bfloat16": true, + "capabilities": { + "bf16": true, + "compute_capability": "sm_80", + "fp8": false, + "n_gpu": 8, + "n_node": 1 + }, + "context_parallel_size": 1, + "cut_cross_entropy": true, + "dataloader_num_workers": 8, + "dataloader_pin_memory": true, + "dataloader_prefetch_factor": 256, + "dataset_prepared_path": "/home/ubuntu/axolotl/preprocessed-data-350m-multitask-ft", + "dataset_processes": 240, + "datasets": [ + { + "ds_type": "json", + "message_property_mappings": { + "content": "content", + "role": "role" + }, + "path": "/home/ubuntu/axolotl/hackathon-train_data-s2s-jaen.jsonl", + "trust_remote_code": false + }, + { + "ds_type": "json", + "message_property_mappings": { + "content": "content", + "role": "role" + }, + "path": "/home/ubuntu/axolotl/hackathon-train_data-s2s-enja.jsonl", + "trust_remote_code": false + }, + { + "ds_type": "json", + "message_property_mappings": { + "content": "content", + "role": "role" + }, + "path": "/home/ubuntu/axolotl/hackathon-train_data-asr-ja.jsonl", + "trust_remote_code": false + }, + { + "ds_type": "json", + "message_property_mappings": { + "content": "content", + "role": "role" + }, + "path": "/home/ubuntu/axolotl/hackathon-train_data-tts-ja.jsonl", + "trust_remote_code": false + }, + { + "ds_type": "json", + "message_property_mappings": { + "content": "content", + "role": "role" + }, + "path": "/home/ubuntu/axolotl/hackathon-train_data-asr-en.jsonl", + "trust_remote_code": false + }, + { + "ds_type": "json", + "message_property_mappings": { + "content": "content", + "role": "role" + }, + "path": "/home/ubuntu/axolotl/hackathon-train_data-tts-en.jsonl", + "trust_remote_code": false + } + ], + "ddp": true, + "device": "cuda:0", + "device_map": { + "": 0 + }, + "dion_rank_fraction": 1.0, + "dion_rank_multiple_of": 1, + "env_capabilities": { + "torch_version": "2.8.0" + }, + "eval_batch_size": 8, + "eval_causal_lm_metrics": [ + "sacrebleu", + "comet", + "ter", + "chrf" + ], + "eval_max_new_tokens": 128, + "eval_sample_packing": false, + "eval_steps": 100, + "eval_strategy": "steps", + "eval_table_size": 0, + "experimental_skip_move_to_device": true, + "flash_attention": true, + "fp16": false, + "gradient_accumulation_steps": 1, + "gradient_checkpointing": false, + "group_by_length": false, + "include_tkps": true, + "is_falcon_derived_model": false, + "is_llama_derived_model": false, + "is_mistral_derived_model": false, + "learning_rate": 5e-05, + "lisa_layers_attribute": "model.layers", + "load_best_model_at_end": false, + "load_in_4bit": false, + "load_in_8bit": false, + "local_rank": 0, + "logging_steps": 1, + "loraplus_lr_embedding": 1e-06, + "lr_scheduler": "cosine", + "max_grad_norm": 1.0, + "mean_resizing_embeddings": false, + "micro_batch_size": 32, + "model_config_type": "lfm2", + "num_epochs": 3.0, + "optimizer": "adamw_torch_fused", + "output_dir": "/home/ubuntu/axolotl/out-350m-multitask-ft", + "pad_to_sequence_len": true, + "plugins": [ + "axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin" + ], + "pretrain_multipack_attn": true, + "profiler_steps_start": 0, + "qlora_sharded_model_loading": false, + "ray_num_workers": 1, + "remove_unused_columns": false, + "resources_per_worker": { + "GPU": 1 + }, + "sample_packing": true, + "sample_packing_bin_size": 200, + "sample_packing_group_size": 100000, + "save_only_model": false, + "save_safetensors": true, + "save_steps": 100, + "save_strategy": "steps", + "sequence_len": 4096, + "shuffle_before_merging_datasets": false, + "shuffle_merged_datasets": true, + "skip_prepare_dataset": false, + "streaming_multipack_buffer_size": 10000, + "strict": false, + "tensor_parallel_size": 1, + "tf32": true, + "tiled_mlp_use_original_mlp": true, + "tokenizer_config": "/home/ubuntu/axolotl/out-350m-audio-pt", + "tokenizer_save_jinja_files": true, + "tokenizer_type": "AutoTokenizer", + "torch_dtype": "torch.bfloat16", + "train_on_inputs": false, + "trl": { + "log_completions": false, + "mask_truncated_completions": false, + "ref_model_mixup_alpha": 0.9, + "ref_model_sync_steps": 64, + "scale_rewards": true, + "sync_ref_model": false, + "use_vllm": false, + "vllm_server_host": "0.0.0.0", + "vllm_server_port": 8000 + }, + "type_of_model": "AutoModelForCausalLM", + "use_ray": false, + "use_wandb": true, + "val_set_size": 0.01, + "vllm": { + "device": "auto", + "dtype": "auto", + "gpu_memory_utilization": 0.9, + "host": "0.0.0.0", + "port": 8000 + }, + "wandb_entity": "aratako-lm", + "wandb_name": "350m-multitask-ft-run1", + "wandb_project": "liquidai-hackathon", + "warmup_ratio": 0.1, + "weight_decay": 0.01, + "world_size": 8 +} +[2025-10-12 02:40:01,705] [INFO] [axolotl.utils.data.sft._load_raw_datasets:320] [PID:1367689] Loading raw datasets... + Loading dataset shards: 0%| | 0/18 [00:00 +[2025-10-12 02:40:10,483] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:279] [PID:1367687] BOS: 1 / <|startoftext|> +[2025-10-12 02:40:10,484] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:280] [PID:1367687] PAD: 0 / <|pad|> +[2025-10-12 02:40:10,484] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:281] [PID:1367687] UNK: None / None + Dropping Long Sequences (>4096) (num_proc=240): 0%| | 0/1185642 [00:004096) (num_proc=240): 0%| | 1000/1185642 [00:06<2:13:13, 148.20 examples/s] Dropping Long Sequences (>4096) (num_proc=240): 2%|▋ | 20000/1185642 [00:06<04:45, 4078.92 examples/s] Dropping Long Sequences (>4096) (num_proc=240): 3%|█▎ | 37000/1185642 [00:06<02:09, 8838.29 examples/s] Dropping Long Sequences (>4096) (num_proc=240): 5%|█▉ | 55000/1185642 [00:07<01:13, 15368.21 examples/s] Dropping Long Sequences (>4096) (num_proc=240): 6%|██▋ | 73000/1185642 [00:07<00:46, 23967.79 examples/s] Dropping Long Sequences (>4096) (num_proc=240): 8%|███▏ | 89000/1185642 [00:07<00:33, 33183.52 examples/s] Dropping Long Sequences (>4096) (num_proc=240): 9%|███▊ | 106000/1185642 [00:07<00:23, 45484.98 examples/s] Dropping Long Sequences (>4096) (num_proc=240): 11%|████▍ | 127000/1185642 [00:07<00:16, 64139.83 examples/s] Dropping Long Sequences (>4096) (num_proc=240): 13%|█████▋ | 159000/1185642 [00:07<00:10, 99657.38 examples/s] Dropping Long Sequences (>4096) (num_proc=240): 16%|██████▍ | 187000/1185642 [00:07<00:07, 129571.10 examples/s] Dropping Long Sequences (>4096) (num_proc=240): 19%|███████▌ | 220000/1185642 [00:07<00:05, 167897.73 examples/s] Dropping Long Sequences (>4096) (num_proc=240): 21%|████████▋ | 251000/1185642 [00:07<00:04, 197134.90 examples/s] Dropping Long Sequences (>4096) (num_proc=240): 24%|█████████▋ | 279000/1185642 [00:08<00:04, 211483.70 examples/s] Dropping Long Sequences (>4096) (num_proc=240): 26%|██████████▋ | 309000/1185642 [00:08<00:03, 232237.16 examples/s] Dropping Long Sequences (>4096) (num_proc=240): 29%|███████████▊ | 343000/1185642 [00:08<00:03, 260015.55 examples/s] Dropping Long Sequences (>4096) (num_proc=240): 32%|█████████████ | 377000/1185642 [00:08<00:02, 278359.02 examples/s] Dropping Long Sequences (>4096) (num_proc=240): 34%|██████████████ | 408000/1185642 [00:08<00:02, 277664.56 examples/s] Dropping Long Sequences (>4096) (num_proc=240): 37%|███████████████▏ | 438000/1185642 [00:08<00:02, 268511.49 examples/s] Dropping Long Sequences (>4096) (num_proc=240): 39%|████████████████▏ | 467000/1185642 [00:08<00:02, 254779.79 examples/s] Dropping Long Sequences (>4096) (num_proc=240): 42%|█████████████████ | 494000/1185642 [00:08<00:03, 224759.76 examples/s] Dropping Long Sequences (>4096) (num_proc=240): 44%|█████████████████▉ | 518000/1185642 [00:08<00:03, 208819.91 examples/s] Dropping Long Sequences (>4096) (num_proc=240): 46%|██████████████████▋ | 539941/1185642 [00:09<00:03, 207937.00 examples/s] Dropping Long Sequences (>4096) (num_proc=240): 48%|███████████████████▍ | 563823/1185642 [00:09<00:03, 197604.25 examples/s] Dropping Long Sequences (>4096) (num_proc=240): 50%|████████████████████▌ | 592823/1185642 [00:09<00:02, 219616.15 examples/s] Dropping Long Sequences (>4096) (num_proc=240): 52%|█████████████████████▎ | 615763/1185642 [00:09<00:02, 198801.06 examples/s] Dropping Long Sequences (>4096) (num_proc=240): 54%|██████████████████████ | 636704/1185642 [00:09<00:03, 159172.24 examples/s] Dropping Long Sequences (>4096) (num_proc=240): 56%|██████████████████████▊ | 659350/1185642 [00:09<00:03, 173269.28 examples/s] Dropping Long Sequences (>4096) (num_proc=240): 57%|███████████████████████▍ | 678818/1185642 [00:09<00:02, 170075.01 examples/s] Dropping Long Sequences (>4096) (num_proc=240): 59%|████████████████████████▎ | 703344/1185642 [00:10<00:02, 188366.46 examples/s] Dropping Long Sequences (>4096) (num_proc=240): 61%|█████████████████████████▏ | 726868/1185642 [00:10<00:02, 197958.91 examples/s] Dropping Long Sequences (>4096) (num_proc=240): 63%|█████████████████████████▉ | 748331/1185642 [00:10<00:02, 198028.04 examples/s] Dropping Long Sequences (>4096) (num_proc=240): 66%|███████████████████████████ | 783032/1185642 [00:10<00:01, 236517.53 examples/s] Dropping Long Sequences (>4096) (num_proc=240): 68%|███████████████████████████▉ | 808314/1185642 [00:10<00:01, 235742.91 examples/s] Dropping Long Sequences (>4096) (num_proc=240): 71%|████████████████████████████▉ | 836716/1185642 [00:10<00:01, 248221.17 examples/s] Dropping Long Sequences (>4096) (num_proc=240): 74%|██████████████████████████████▏ | 873177/1185642 [00:10<00:01, 280186.00 examples/s] Dropping Long Sequences (>4096) (num_proc=240): 76%|███████████████████████████████▏ | 903400/1185642 [00:10<00:00, 285986.91 examples/s] Dropping Long Sequences (>4096) (num_proc=240): 79%|████████████████████████████████▍ | 936621/1185642 [00:10<00:00, 298678.02 examples/s] Dropping Long Sequences (>4096) (num_proc=240): 82%|█████████████████████████████████▌ | 971201/1185642 [00:10<00:00, 309856.38 examples/s] Dropping Long Sequences (>4096) (num_proc=240): 85%|█████████████████████████████████▉ | 1004722/1185642 [00:11<00:00, 315287.00 examples/s] Dropping Long Sequences (>4096) (num_proc=240): 87%|██████████████████████████████████▉ | 1036642/1185642 [00:11<00:00, 274993.63 examples/s] Dropping Long Sequences (>4096) (num_proc=240): 90%|███████████████████████████████████▉ | 1065682/1185642 [00:11<00:00, 245335.45 examples/s] Dropping Long Sequences (>4096) (num_proc=240): 92%|████████████████████████████████████▊ | 1091902/1185642 [00:11<00:00, 238996.99 examples/s] Dropping Long Sequences (>4096) (num_proc=240): 94%|█████████████████████████████████████▋ | 1117362/1185642 [00:11<00:00, 236046.67 examples/s] Dropping Long Sequences (>4096) (num_proc=240): 96%|██████████████████████████████████████▌ | 1142282/1185642 [00:11<00:00, 197506.25 examples/s] Dropping Long Sequences (>4096) (num_proc=240): 98%|███████████████████████████████████████▎| 1164022/1185642 [00:11<00:00, 177990.40 examples/s] Dropping Long Sequences (>4096) (num_proc=240): 100%|███████████████████████████████████████▉| 1183762/1185642 [00:12<00:00, 116666.59 examples/s] Dropping Long Sequences (>4096) (num_proc=240): 100%|█████████████████████████████████████████| 1185642/1185642 [00:13<00:00, 89249.57 examples/s] + Drop Samples with Zero Trainable Tokens (num_proc=240): 0%| | 0/1185642 [00:00 +[2025-10-12 02:43:35,540] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:279] [PID:1367687] BOS: 1 / <|startoftext|> +[2025-10-12 02:43:35,540] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:280] [PID:1367687] PAD: 0 / <|pad|> +[2025-10-12 02:43:35,540] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:281] [PID:1367687] UNK: None / None +[2025-10-12 02:43:35,540] [DEBUG] [axolotl.train.setup_model_and_tokenizer:74] [PID:1367687] Loading model +[2025-10-12 02:43:35,564] [DEBUG] [axolotl.monkeypatch.transformers.trainer_loss_calc.patch_evaluation_loop:87] [PID:1367687] Patched Trainer.evaluation_loop with nanmean loss calculation +[2025-10-12 02:43:35,566] [DEBUG] [axolotl.monkeypatch.transformers.trainer_loss_calc.patch_maybe_log_save_evaluate:138] [PID:1367687] Patched Trainer._maybe_log_save_evaluate with nanmean loss calculation +[2025-10-12 02:43:35,567] [INFO] [axolotl.loaders.patch_manager._apply_multipack_patches:301] [PID:1367687] Applying multipack dataloader patch for sample packing... +[2025-10-12 02:43:36,156] [INFO] [axolotl.integrations.cut_cross_entropy.pre_model_load:94] [PID:1367687] Applying Cut Cross Entropy to model type: lfm2 +[2025-10-12 02:43:36,508] [INFO] [axolotl.loaders.model._configure_embedding_dtypes:345] [PID:1367687] Converting modules to torch.bfloat16 +[2025-10-12 02:43:36,511] [DEBUG] [axolotl.loaders.model.log_gpu_memory_usage:127] [PID:1367687] Memory usage after model load 1.285GB (+1.285GB allocated, +1.303GB reserved) +[2025-10-12 02:43:40,065] [INFO] [axolotl.train.save_initial_configs:402] [PID:1367687] Pre-saving tokenizer to /home/ubuntu/axolotl/out-350m-multitask-ft... +[2025-10-12 02:43:40,792] [INFO] [axolotl.train.save_initial_configs:407] [PID:1367687] Pre-saving model config to /home/ubuntu/axolotl/out-350m-multitask-ft... +[2025-10-12 02:43:40,795] [INFO] [axolotl.train.execute_training:196] [PID:1367687] Starting trainer... +[2025-10-12 02:50:32,078] [WARNING] [py.warnings._showwarnmsg:110] [PID:1367693] /home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/distributed/distributed_c10d.py:4807: UserWarning: No device id is provided via `init_process_group` or `barrier `. Using the current device set by the user. + warnings.warn( # warn only once + +[2025-10-12 02:50:40,609] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:1367687] generate_batches time: 2.821959972381592 +[2025-10-12 02:50:42,712] [WARNING] [py.warnings._showwarnmsg:110] [PID:1367688] /home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/distributed/distributed_c10d.py:4807: UserWarning: No device id is provided via `init_process_group` or `barrier `. Using the current device set by the user. + warnings.warn( # warn only once + +[2025-10-12 02:50:44,543] [WARNING] [py.warnings._showwarnmsg:110] [PID:1367689] /home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/distributed/distributed_c10d.py:4807: UserWarning: No device id is provided via `init_process_group` or `barrier `. Using the current device set by the user. + warnings.warn( # warn only once + +[2025-10-12 02:50:45,500] [WARNING] [py.warnings._showwarnmsg:110] [PID:1367694] /home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/distributed/distributed_c10d.py:4807: UserWarning: No device id is provided via `init_process_group` or `barrier `. Using the current device set by the user. + warnings.warn( # warn only once + +[2025-10-12 02:50:47,700] [WARNING] [py.warnings._showwarnmsg:110] [PID:1367690] /home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/distributed/distributed_c10d.py:4807: UserWarning: No device id is provided via `init_process_group` or `barrier `. Using the current device set by the user. + warnings.warn( # warn only once + +[2025-10-12 02:50:52,950] [WARNING] [py.warnings._showwarnmsg:110] [PID:1367691] /home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/distributed/distributed_c10d.py:4807: UserWarning: No device id is provided via `init_process_group` or `barrier `. Using the current device set by the user. + warnings.warn( # warn only once + +[2025-10-12 02:50:59,700] [WARNING] [py.warnings._showwarnmsg:110] [PID:1367692] /home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/distributed/distributed_c10d.py:4807: UserWarning: No device id is provided via `init_process_group` or `barrier `. Using the current device set by the user. + warnings.warn( # warn only once + +[2025-10-12 02:50:59,701] [INFO] [axolotl.utils.samplers.multipack.calc_min_len:438] [PID:1367687] gather_len_batches: [7647, 7647, 7647, 7647, 7647, 7647, 7647, 7647] +[2025-10-12 02:50:59,703] [WARNING] [py.warnings._showwarnmsg:110] [PID:1367687] /home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/distributed/distributed_c10d.py:4807: UserWarning: No device id is provided via `init_process_group` or `barrier `. Using the current device set by the user. + warnings.warn( # warn only once + +wandb: Currently logged in as: aratako1998 (aratako-lm) to https://api.wandb.ai. Use `wandb login --relogin` to force relogin +wandb: ⢿ Waiting for wandb.init()... + m wandb: ⣻ Waiting for wandb.init()... + m wandb: Tracking run with wandb version 0.22.2 +wandb: Run data is saved locally in /home/ubuntu/axolotl/wandb/run-20251012_025059-guktalmo +wandb: Run `wandb offline` to turn off syncing. +wandb: Syncing run 350m-multitask-ft-run1 +wandb: ⭐️ View project at https://wandb.ai/aratako-lm/liquidai-hackathon +wandb: 🚀 View run at https://wandb.ai/aratako-lm/liquidai-hackathon/runs/guktalmo +wandb: Detected [huggingface_hub.inference] in use. +wandb: Use W&B Weave for improved LLM call tracing. Install Weave with `pip install weave` then add `import weave` to the top of your script. +wandb: For more information, check out the docs at: https://weave-docs.wandb.ai/ +wandb: WARNING Saving files without folders. If you want to preserve subdirectories pass base_path to wandb.save, i.e. wandb.save("/mnt/folder/file.h5", base_path="/mnt") +[2025-10-12 02:51:01,203] [INFO] [axolotl.utils.callbacks.on_train_begin:757] [PID:1367687] The Axolotl config has been saved to the WandB run under files. + 0%| | 0/2865 [00:00", line 198, in _run_module_as_main +[rank1]: File "", line 88, in _run_code +[rank1]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/axolotl/cli/train.py", line 121, in +[rank1]: fire.Fire(do_cli) +[rank1]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/fire/core.py", line 135, in Fire +[rank1]: component_trace = _Fire(component, args, parsed_flag_args, context, name) +[rank1]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +[rank1]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/fire/core.py", line 468, in _Fire +[rank1]: component, remaining_args = _CallAndUpdateTrace( +[rank1]: ^^^^^^^^^^^^^^^^^^^^ +[rank1]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/fire/core.py", line 684, in _CallAndUpdateTrace +[rank1]: component = fn(*varargs, **kwargs) +[rank1]: ^^^^^^^^^^^^^^^^^^^^^^ +[rank1]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/axolotl/cli/train.py", line 88, in do_cli +[rank1]: return do_train(parsed_cfg, parsed_cli_args) +[rank1]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +[rank1]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/axolotl/cli/train.py", line 45, in do_train +[rank1]: model, tokenizer, trainer = train(cfg=cfg, dataset_meta=dataset_meta) +[rank1]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +[rank1]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/axolotl/train.py", line 584, in train +[rank1]: execute_training(cfg, trainer, resume_from_checkpoint) +[rank1]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/axolotl/train.py", line 197, in execute_training +[rank1]: trainer.train(resume_from_checkpoint=resume_from_checkpoint) +[rank1]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/transformers/trainer.py", line 2224, in train +[rank1]: return inner_training_loop( +[rank1]: ^^^^^^^^^^^^^^^^^^^^ +[rank1]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/transformers/trainer.py", line 2561, in _inner_training_loop +[rank1]: tr_loss_step = self.training_step(model, inputs, num_items_in_batch) +[rank1]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +[rank1]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/axolotl/core/trainers/mixins/activation_checkpointing.py", line 46, in training_step +[rank1]: return super().training_step(*args, **kwargs) +[rank1]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +[rank1]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/transformers/trainer.py", line 3854, in training_step +[rank1]: loss = self.compute_loss(model, inputs, num_items_in_batch=num_items_in_batch) +[rank1]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +[rank1]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/axolotl/core/trainers/base.py", line 367, in compute_loss +[rank1]: return super().compute_loss( +[rank1]: ^^^^^^^^^^^^^^^^^^^^^ +[rank1]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/transformers/trainer.py", line 3936, in compute_loss +[rank1]: outputs = model(**inputs) +[rank1]: ^^^^^^^^^^^^^^^ +[rank1]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1773, in _wrapped_call_impl +[rank1]: return self._call_impl(*args, **kwargs) +[rank1]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +[rank1]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1784, in _call_impl +[rank1]: return forward_call(*args, **kwargs) +[rank1]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +[rank1]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/nn/parallel/distributed.py", line 1648, in forward +[rank1]: else self._run_ddp_forward(*inputs, **kwargs) +[rank1]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +[rank1]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/nn/parallel/distributed.py", line 1474, in _run_ddp_forward +[rank1]: return self.module(*inputs, **kwargs) # type: ignore[index] +[rank1]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +[rank1]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1773, in _wrapped_call_impl +[rank1]: return self._call_impl(*args, **kwargs) +[rank1]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +[rank1]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1784, in _call_impl +[rank1]: return forward_call(*args, **kwargs) +[rank1]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +[rank1]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/accelerate/utils/operations.py", line 818, in forward +[rank1]: return model_forward(*args, **kwargs) +[rank1]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +[rank1]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/accelerate/utils/operations.py", line 806, in __call__ +[rank1]: return convert_to_fp32(self.model_forward(*args, **kwargs)) +[rank1]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +[rank1]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/amp/autocast_mode.py", line 44, in decorate_autocast +[rank1]: return func(*args, **kwargs) +[rank1]: ^^^^^^^^^^^^^^^^^^^^^ +[rank1]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/cut_cross_entropy/transformers/llama.py", line 52, in cce_forward +[rank1]: outputs: BaseModelOutputWithPast = self.model( +[rank1]: ^^^^^^^^^^^ +[rank1]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1773, in _wrapped_call_impl +[rank1]: return self._call_impl(*args, **kwargs) +[rank1]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +[rank1]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1784, in _call_impl +[rank1]: return forward_call(*args, **kwargs) +[rank1]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +[rank1]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/transformers/utils/generic.py", line 927, in wrapper +[rank1]: outputs = func(self, *args, **kwargs) +[rank1]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^ +[rank1]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/transformers/models/lfm2/modeling_lfm2.py", line 652, in forward +[rank1]: hidden_states = decoder_layer( +[rank1]: ^^^^^^^^^^^^^^ +[rank1]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/transformers/modeling_layers.py", line 94, in __call__ +[rank1]: return super().__call__(*args, **kwargs) +[rank1]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +[rank1]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1773, in _wrapped_call_impl +[rank1]: return self._call_impl(*args, **kwargs) +[rank1]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +[rank1]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1784, in _call_impl +[rank1]: return forward_call(*args, **kwargs) +[rank1]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +[rank1]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/transformers/utils/deprecation.py", line 172, in wrapped_func +[rank1]: return func(*args, **kwargs) +[rank1]: ^^^^^^^^^^^^^^^^^^^^^ +[rank1]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/transformers/models/lfm2/modeling_lfm2.py", line 556, in forward +[rank1]: hidden_states = self.conv( +[rank1]: ^^^^^^^^^^ +[rank1]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1773, in _wrapped_call_impl +[rank1]: return self._call_impl(*args, **kwargs) +[rank1]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +[rank1]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1784, in _call_impl +[rank1]: return forward_call(*args, **kwargs) +[rank1]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +[rank1]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/transformers/utils/deprecation.py", line 172, in wrapped_func +[rank1]: return func(*args, **kwargs) +[rank1]: ^^^^^^^^^^^^^^^^^^^^^ +[rank1]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/transformers/models/lfm2/modeling_lfm2.py", line 517, in forward +[rank1]: return self.slow_forward(hidden_states, past_key_values, cache_position, attention_mask) +[rank1]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +[rank1]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/transformers/utils/deprecation.py", line 172, in wrapped_func +[rank1]: return func(*args, **kwargs) +[rank1]: ^^^^^^^^^^^^^^^^^^^^^ +[rank1]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/transformers/models/lfm2/modeling_lfm2.py", line 500, in slow_forward +[rank1]: conv_out = self.conv(Bx)[..., :seqlen] +[rank1]: ^^^^^^^^^^^^^ +[rank1]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1773, in _wrapped_call_impl +[rank1]: return self._call_impl(*args, **kwargs) +[rank1]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +[rank1]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1784, in _call_impl +[rank1]: return forward_call(*args, **kwargs) +[rank1]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +[rank1]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/nn/modules/conv.py", line 371, in forward +[rank1]: return self._conv_forward(input, self.weight, self.bias) +[rank1]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +[rank1]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/nn/modules/conv.py", line 366, in _conv_forward +[rank1]: return F.conv1d( +[rank1]: ^^^^^^^^^ +[rank1]: torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 272.00 MiB. GPU 1 has a total capacity of 79.25 GiB of which 160.94 MiB is free. Including non-PyTorch memory, this process has 79.09 GiB memory in use. Of the allocated memory 77.68 GiB is allocated by PyTorch, and 85.86 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "", line 198, in _run_module_as_main + File "", line 88, in _run_code + File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/axolotl/cli/train.py", line 121, in + fire.Fire(do_cli) + File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/fire/core.py", line 135, in Fire + component_trace = _Fire(component, args, parsed_flag_args, context, name) + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/fire/core.py", line 468, in _Fire + component, remaining_args = _CallAndUpdateTrace( + ^^^^^^^^^^^^^^^^^^^^ + File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/fire/core.py", line 684, in _CallAndUpdateTrace + component = fn(*varargs, **kwargs) + ^^^^^^^^^^^^^^^^^^^^^^ + File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/axolotl/cli/train.py", line 88, in do_cli + return do_train(parsed_cfg, parsed_cli_args) + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/axolotl/cli/train.py", line 45, in do_train + model, tokenizer, trainer = train(cfg=cfg, dataset_meta=dataset_meta) + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/axolotl/train.py", line 584, in train + execute_training(cfg, trainer, resume_from_checkpoint) + File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/axolotl/train.py", line 197, in execute_training + trainer.train(resume_from_checkpoint=resume_from_checkpoint) + File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/transformers/trainer.py", line 2224, in train + return inner_training_loop( + ^^^^^^^^^^^^^^^^^^^^ + File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/transformers/trainer.py", line 2561, in _inner_training_loop + tr_loss_step = self.training_step(model, inputs, num_items_in_batch) + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/axolotl/core/trainers/mixins/activation_checkpointing.py", line 46, in training_step + return super().training_step(*args, **kwargs) + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/transformers/trainer.py", line 3854, in training_step + loss = self.compute_loss(model, inputs, num_items_in_batch=num_items_in_batch) + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/axolotl/core/trainers/base.py", line 367, in compute_loss + return super().compute_loss( + ^^^^^^^^^^^^^^^^^^^^^ + File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/transformers/trainer.py", line 3936, in compute_loss + outputs = model(**inputs) + ^^^^^^^^^^^^^^^ + File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1773, in _wrapped_call_impl + return self._call_impl(*args, **kwargs) + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1784, in _call_impl + return forward_call(*args, **kwargs) + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/nn/parallel/distributed.py", line 1648, in forward + else self._run_ddp_forward(*inputs, **kwargs) + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/nn/parallel/distributed.py", line 1474, in _run_ddp_forward + return self.module(*inputs, **kwargs) # type: ignore[index] + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1773, in _wrapped_call_impl + return self._call_impl(*args, **kwargs) + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1784, in _call_impl + return forward_call(*args, **kwargs) + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/accelerate/utils/operations.py", line 818, in forward + return model_forward(*args, **kwargs) + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/accelerate/utils/operations.py", line 806, in __call__ + return convert_to_fp32(self.model_forward(*args, **kwargs)) + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/amp/autocast_mode.py", line 44, in decorate_autocast + return func(*args, **kwargs) + ^^^^^^^^^^^^^^^^^^^^^ + File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/cut_cross_entropy/transformers/llama.py", line 52, in cce_forward + outputs: BaseModelOutputWithPast = self.model( + ^^^^^^^^^^^ + File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1773, in _wrapped_call_impl + return self._call_impl(*args, **kwargs) + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1784, in _call_impl + return forward_call(*args, **kwargs) + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/transformers/utils/generic.py", line 927, in wrapper + outputs = func(self, *args, **kwargs) + ^^^^^^^^^^^^^^^^^^^^^^^^^^^ + File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/transformers/models/lfm2/modeling_lfm2.py", line 652, in forward + hidden_states = decoder_layer( + ^^^^^^^^^^^^^^ + File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/transformers/modeling_layers.py", line 94, in __call__ + return super().__call__(*args, **kwargs) + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1773, in _wrapped_call_impl + return self._call_impl(*args, **kwargs) + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1784, in _call_impl + return forward_call(*args, **kwargs) + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/transformers/utils/deprecation.py", line 172, in wrapped_func + return func(*args, **kwargs) + ^^^^^^^^^^^^^^^^^^^^^ + File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/transformers/models/lfm2/modeling_lfm2.py", line 557, in forward + hidden_states=self.operator_norm(hidden_states), + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1773, in _wrapped_call_impl + return self._call_impl(*args, **kwargs) + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1784, in _call_impl + return forward_call(*args, **kwargs) + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/transformers/models/lfm2/modeling_lfm2.py", line 63, in forward + return self.weight * hidden_states.to(input_dtype) + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 256.00 MiB. GPU 0 has a total capacity of 79.25 GiB of which 208.94 MiB is free. Including non-PyTorch memory, this process has 79.04 GiB memory in use. Of the allocated memory 76.43 GiB is allocated by PyTorch, and 165.36 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +[rank0]: Traceback (most recent call last): +[rank0]: File "", line 198, in _run_module_as_main +[rank0]: File "", line 88, in _run_code +[rank0]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/axolotl/cli/train.py", line 121, in +[rank0]: fire.Fire(do_cli) +[rank0]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/fire/core.py", line 135, in Fire +[rank0]: component_trace = _Fire(component, args, parsed_flag_args, context, name) +[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +[rank0]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/fire/core.py", line 468, in _Fire +[rank0]: component, remaining_args = _CallAndUpdateTrace( +[rank0]: ^^^^^^^^^^^^^^^^^^^^ +[rank0]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/fire/core.py", line 684, in _CallAndUpdateTrace +[rank0]: component = fn(*varargs, **kwargs) +[rank0]: ^^^^^^^^^^^^^^^^^^^^^^ +[rank0]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/axolotl/cli/train.py", line 88, in do_cli +[rank0]: return do_train(parsed_cfg, parsed_cli_args) +[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +[rank0]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/axolotl/cli/train.py", line 45, in do_train +[rank0]: model, tokenizer, trainer = train(cfg=cfg, dataset_meta=dataset_meta) +[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +[rank0]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/axolotl/train.py", line 584, in train +[rank0]: execute_training(cfg, trainer, resume_from_checkpoint) +[rank0]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/axolotl/train.py", line 197, in execute_training +[rank0]: trainer.train(resume_from_checkpoint=resume_from_checkpoint) +[rank0]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/transformers/trainer.py", line 2224, in train +[rank0]: return inner_training_loop( +[rank0]: ^^^^^^^^^^^^^^^^^^^^ +[rank0]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/transformers/trainer.py", line 2561, in _inner_training_loop +[rank0]: tr_loss_step = self.training_step(model, inputs, num_items_in_batch) +[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +[rank0]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/axolotl/core/trainers/mixins/activation_checkpointing.py", line 46, in training_step +[rank0]: return super().training_step(*args, **kwargs) +[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +[rank0]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/transformers/trainer.py", line 3854, in training_step +[rank0]: loss = self.compute_loss(model, inputs, num_items_in_batch=num_items_in_batch) +[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +[rank0]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/axolotl/core/trainers/base.py", line 367, in compute_loss +[rank0]: return super().compute_loss( +[rank0]: ^^^^^^^^^^^^^^^^^^^^^ +[rank0]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/transformers/trainer.py", line 3936, in compute_loss +[rank0]: outputs = model(**inputs) +[rank0]: ^^^^^^^^^^^^^^^ +[rank0]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1773, in _wrapped_call_impl +[rank0]: return self._call_impl(*args, **kwargs) +[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +[rank0]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1784, in _call_impl +[rank0]: return forward_call(*args, **kwargs) +[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +[rank0]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/nn/parallel/distributed.py", line 1648, in forward +[rank0]: else self._run_ddp_forward(*inputs, **kwargs) +[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +[rank0]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/nn/parallel/distributed.py", line 1474, in _run_ddp_forward +[rank0]: return self.module(*inputs, **kwargs) # type: ignore[index] +[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +[rank0]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1773, in _wrapped_call_impl +[rank0]: return self._call_impl(*args, **kwargs) +[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +[rank0]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1784, in _call_impl +[rank0]: return forward_call(*args, **kwargs) +[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +[rank0]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/accelerate/utils/operations.py", line 818, in forward +[rank0]: return model_forward(*args, **kwargs) +[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +[rank0]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/accelerate/utils/operations.py", line 806, in __call__ +[rank0]: return convert_to_fp32(self.model_forward(*args, **kwargs)) +[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +[rank0]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/amp/autocast_mode.py", line 44, in decorate_autocast +[rank0]: return func(*args, **kwargs) +[rank0]: ^^^^^^^^^^^^^^^^^^^^^ +[rank0]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/cut_cross_entropy/transformers/llama.py", line 52, in cce_forward +[rank0]: outputs: BaseModelOutputWithPast = self.model( +[rank0]: ^^^^^^^^^^^ +[rank0]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1773, in _wrapped_call_impl +[rank0]: return self._call_impl(*args, **kwargs) +[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +[rank0]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1784, in _call_impl +[rank0]: return forward_call(*args, **kwargs) +[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +[rank0]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/transformers/utils/generic.py", line 927, in wrapper +[rank0]: outputs = func(self, *args, **kwargs) +[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^ +[rank0]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/transformers/models/lfm2/modeling_lfm2.py", line 652, in forward +[rank0]: hidden_states = decoder_layer( +[rank0]: ^^^^^^^^^^^^^^ +[rank0]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/transformers/modeling_layers.py", line 94, in __call__ +[rank0]: return super().__call__(*args, **kwargs) +[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +[rank0]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1773, in _wrapped_call_impl +[rank0]: return self._call_impl(*args, **kwargs) +[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +[rank0]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1784, in _call_impl +[rank0]: return forward_call(*args, **kwargs) +[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +[rank0]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/transformers/utils/deprecation.py", line 172, in wrapped_func +[rank0]: return func(*args, **kwargs) +[rank0]: ^^^^^^^^^^^^^^^^^^^^^ +[rank0]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/transformers/models/lfm2/modeling_lfm2.py", line 557, in forward +[rank0]: hidden_states=self.operator_norm(hidden_states), +[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +[rank0]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1773, in _wrapped_call_impl +[rank0]: return self._call_impl(*args, **kwargs) +[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +[rank0]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1784, in _call_impl +[rank0]: return forward_call(*args, **kwargs) +[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +[rank0]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/transformers/models/lfm2/modeling_lfm2.py", line 63, in forward +[rank0]: return self.weight * hidden_states.to(input_dtype) +[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +[rank0]: torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 256.00 MiB. GPU 0 has a total capacity of 79.25 GiB of which 208.94 MiB is free. Including non-PyTorch memory, this process has 79.04 GiB memory in use. Of the allocated memory 76.43 GiB is allocated by PyTorch, and 165.36 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +[rank2]: Traceback (most recent call last): +[rank2]: File "", line 198, in _run_module_as_main +[rank2]: File "", line 88, in _run_code +[rank2]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/axolotl/cli/train.py", line 121, in +[rank2]: fire.Fire(do_cli) +[rank2]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/fire/core.py", line 135, in Fire +[rank2]: component_trace = _Fire(component, args, parsed_flag_args, context, name) +[rank2]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +[rank2]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/fire/core.py", line 468, in _Fire +[rank2]: component, remaining_args = _CallAndUpdateTrace( +[rank2]: ^^^^^^^^^^^^^^^^^^^^ +[rank2]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/fire/core.py", line 684, in _CallAndUpdateTrace +[rank2]: component = fn(*varargs, **kwargs) +[rank2]: ^^^^^^^^^^^^^^^^^^^^^^ +[rank2]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/axolotl/cli/train.py", line 88, in do_cli +[rank2]: return do_train(parsed_cfg, parsed_cli_args) +[rank2]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +[rank2]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/axolotl/cli/train.py", line 45, in do_train +[rank2]: model, tokenizer, trainer = train(cfg=cfg, dataset_meta=dataset_meta) +[rank2]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +[rank2]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/axolotl/train.py", line 584, in train +[rank2]: execute_training(cfg, trainer, resume_from_checkpoint) +[rank2]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/axolotl/train.py", line 197, in execute_training +[rank2]: trainer.train(resume_from_checkpoint=resume_from_checkpoint) +[rank2]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/transformers/trainer.py", line 2224, in train +[rank2]: return inner_training_loop( +[rank2]: ^^^^^^^^^^^^^^^^^^^^ +[rank2]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/transformers/trainer.py", line 2561, in _inner_training_loop +[rank2]: tr_loss_step = self.training_step(model, inputs, num_items_in_batch) +[rank2]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +[rank2]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/axolotl/core/trainers/mixins/activation_checkpointing.py", line 46, in training_step +[rank2]: return super().training_step(*args, **kwargs) +[rank2]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +[rank2]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/transformers/trainer.py", line 3854, in training_step +[rank2]: loss = self.compute_loss(model, inputs, num_items_in_batch=num_items_in_batch) +[rank2]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +[rank2]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/axolotl/core/trainers/base.py", line 367, in compute_loss +[rank2]: return super().compute_loss( +[rank2]: ^^^^^^^^^^^^^^^^^^^^^ +[rank2]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/transformers/trainer.py", line 3936, in compute_loss +[rank2]: outputs = model(**inputs) +[rank2]: ^^^^^^^^^^^^^^^ +[rank2]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1773, in _wrapped_call_impl +[rank2]: return self._call_impl(*args, **kwargs) +[rank2]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +[rank2]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1784, in _call_impl +[rank2]: return forward_call(*args, **kwargs) +[rank2]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +[rank2]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/nn/parallel/distributed.py", line 1648, in forward +[rank2]: else self._run_ddp_forward(*inputs, **kwargs) +[rank2]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +[rank2]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/nn/parallel/distributed.py", line 1474, in _run_ddp_forward +[rank2]: return self.module(*inputs, **kwargs) # type: ignore[index] +[rank2]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +[rank2]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1773, in _wrapped_call_impl +[rank2]: return self._call_impl(*args, **kwargs) +[rank2]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +[rank2]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1784, in _call_impl +[rank2]: return forward_call(*args, **kwargs) +[rank2]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +[rank2]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/accelerate/utils/operations.py", line 818, in forward +[rank2]: return model_forward(*args, **kwargs) +[rank2]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +[rank2]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/accelerate/utils/operations.py", line 806, in __call__ +[rank2]: return convert_to_fp32(self.model_forward(*args, **kwargs)) +[rank2]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +[rank2]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/amp/autocast_mode.py", line 44, in decorate_autocast +[rank2]: return func(*args, **kwargs) +[rank2]: ^^^^^^^^^^^^^^^^^^^^^ +[rank2]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/cut_cross_entropy/transformers/llama.py", line 52, in cce_forward +[rank2]: outputs: BaseModelOutputWithPast = self.model( +[rank2]: ^^^^^^^^^^^ +[rank2]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1773, in _wrapped_call_impl +[rank2]: return self._call_impl(*args, **kwargs) +[rank2]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +[rank2]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1784, in _call_impl +[rank2]: return forward_call(*args, **kwargs) +[rank2]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +[rank2]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/transformers/utils/generic.py", line 927, in wrapper +[rank2]: outputs = func(self, *args, **kwargs) +[rank2]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^ +[rank2]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/transformers/models/lfm2/modeling_lfm2.py", line 652, in forward +[rank2]: hidden_states = decoder_layer( +[rank2]: ^^^^^^^^^^^^^^ +[rank2]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/transformers/modeling_layers.py", line 94, in __call__ +[rank2]: return super().__call__(*args, **kwargs) +[rank2]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +[rank2]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1773, in _wrapped_call_impl +[rank2]: return self._call_impl(*args, **kwargs) +[rank2]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +[rank2]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1784, in _call_impl +[rank2]: return forward_call(*args, **kwargs) +[rank2]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +[rank2]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/transformers/utils/deprecation.py", line 172, in wrapped_func +[rank2]: return func(*args, **kwargs) +[rank2]: ^^^^^^^^^^^^^^^^^^^^^ +[rank2]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/transformers/models/lfm2/modeling_lfm2.py", line 556, in forward +[rank2]: hidden_states = self.conv( +[rank2]: ^^^^^^^^^^ +[rank2]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1773, in _wrapped_call_impl +[rank2]: return self._call_impl(*args, **kwargs) +[rank2]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +[rank2]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1784, in _call_impl +[rank2]: return forward_call(*args, **kwargs) +[rank2]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +[rank2]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/transformers/utils/deprecation.py", line 172, in wrapped_func +[rank2]: return func(*args, **kwargs) +[rank2]: ^^^^^^^^^^^^^^^^^^^^^ +[rank2]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/transformers/models/lfm2/modeling_lfm2.py", line 517, in forward +[rank2]: return self.slow_forward(hidden_states, past_key_values, cache_position, attention_mask) +[rank2]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +[rank2]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/transformers/utils/deprecation.py", line 172, in wrapped_func +[rank2]: return func(*args, **kwargs) +[rank2]: ^^^^^^^^^^^^^^^^^^^^^ +[rank2]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/transformers/models/lfm2/modeling_lfm2.py", line 500, in slow_forward +[rank2]: conv_out = self.conv(Bx)[..., :seqlen] +[rank2]: ^^^^^^^^^^^^^ +[rank2]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1773, in _wrapped_call_impl +[rank2]: return self._call_impl(*args, **kwargs) +[rank2]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +[rank2]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1784, in _call_impl +[rank2]: return forward_call(*args, **kwargs) +[rank2]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +[rank2]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/nn/modules/conv.py", line 371, in forward +[rank2]: return self._conv_forward(input, self.weight, self.bias) +[rank2]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +[rank2]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/nn/modules/conv.py", line 366, in _conv_forward +[rank2]: return F.conv1d( +[rank2]: ^^^^^^^^^ +[rank2]: torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 272.00 MiB. GPU 2 has a total capacity of 79.25 GiB of which 160.94 MiB is free. Including non-PyTorch memory, this process has 79.09 GiB memory in use. Of the allocated memory 77.68 GiB is allocated by PyTorch, and 85.86 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +[rank5]: Traceback (most recent call last): +[rank5]: File "", line 198, in _run_module_as_main +[rank5]: File "", line 88, in _run_code +[rank5]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/axolotl/cli/train.py", line 121, in +[rank5]: fire.Fire(do_cli) +[rank5]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/fire/core.py", line 135, in Fire +[rank5]: component_trace = _Fire(component, args, parsed_flag_args, context, name) +[rank5]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +[rank5]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/fire/core.py", line 468, in _Fire +[rank5]: component, remaining_args = _CallAndUpdateTrace( +[rank5]: ^^^^^^^^^^^^^^^^^^^^ +[rank5]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/fire/core.py", line 684, in _CallAndUpdateTrace +[rank5]: component = fn(*varargs, **kwargs) +[rank5]: ^^^^^^^^^^^^^^^^^^^^^^ +[rank5]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/axolotl/cli/train.py", line 88, in do_cli +[rank5]: return do_train(parsed_cfg, parsed_cli_args) +[rank5]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +[rank5]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/axolotl/cli/train.py", line 45, in do_train +[rank5]: model, tokenizer, trainer = train(cfg=cfg, dataset_meta=dataset_meta) +[rank5]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +[rank5]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/axolotl/train.py", line 584, in train +[rank5]: execute_training(cfg, trainer, resume_from_checkpoint) +[rank5]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/axolotl/train.py", line 197, in execute_training +[rank5]: trainer.train(resume_from_checkpoint=resume_from_checkpoint) +[rank5]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/transformers/trainer.py", line 2224, in train +[rank5]: return inner_training_loop( +[rank5]: ^^^^^^^^^^^^^^^^^^^^ +[rank5]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/transformers/trainer.py", line 2561, in _inner_training_loop +[rank5]: tr_loss_step = self.training_step(model, inputs, num_items_in_batch) +[rank5]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +[rank5]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/axolotl/core/trainers/mixins/activation_checkpointing.py", line 46, in training_step +[rank5]: return super().training_step(*args, **kwargs) +[rank5]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +[rank5]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/transformers/trainer.py", line 3854, in training_step +[rank5]: loss = self.compute_loss(model, inputs, num_items_in_batch=num_items_in_batch) +[rank5]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +[rank5]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/axolotl/core/trainers/base.py", line 367, in compute_loss +[rank5]: return super().compute_loss( +[rank5]: ^^^^^^^^^^^^^^^^^^^^^ +[rank5]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/transformers/trainer.py", line 3936, in compute_loss +[rank5]: outputs = model(**inputs) +[rank5]: ^^^^^^^^^^^^^^^ +[rank5]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1773, in _wrapped_call_impl +[rank5]: return self._call_impl(*args, **kwargs) +[rank5]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +[rank5]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1784, in _call_impl +[rank5]: return forward_call(*args, **kwargs) +[rank5]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +[rank5]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/nn/parallel/distributed.py", line 1648, in forward +[rank5]: else self._run_ddp_forward(*inputs, **kwargs) +[rank5]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +[rank5]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/nn/parallel/distributed.py", line 1474, in _run_ddp_forward +[rank5]: return self.module(*inputs, **kwargs) # type: ignore[index] +[rank5]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +[rank5]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1773, in _wrapped_call_impl +[rank5]: return self._call_impl(*args, **kwargs) +[rank5]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +[rank5]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1784, in _call_impl +[rank5]: return forward_call(*args, **kwargs) +[rank5]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +[rank5]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/accelerate/utils/operations.py", line 818, in forward +[rank5]: return model_forward(*args, **kwargs) +[rank5]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +[rank5]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/accelerate/utils/operations.py", line 806, in __call__ +[rank5]: return convert_to_fp32(self.model_forward(*args, **kwargs)) +[rank5]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +[rank5]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/amp/autocast_mode.py", line 44, in decorate_autocast +[rank5]: return func(*args, **kwargs) +[rank5]: ^^^^^^^^^^^^^^^^^^^^^ +[rank5]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/cut_cross_entropy/transformers/llama.py", line 52, in cce_forward +[rank5]: outputs: BaseModelOutputWithPast = self.model( +[rank5]: ^^^^^^^^^^^ +[rank5]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1773, in _wrapped_call_impl +[rank5]: return self._call_impl(*args, **kwargs) +[rank5]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +[rank5]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1784, in _call_impl +[rank5]: return forward_call(*args, **kwargs) +[rank5]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +[rank5]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/transformers/utils/generic.py", line 927, in wrapper +[rank5]: outputs = func(self, *args, **kwargs) +[rank5]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^ +[rank5]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/transformers/models/lfm2/modeling_lfm2.py", line 652, in forward +[rank5]: hidden_states = decoder_layer( +[rank5]: ^^^^^^^^^^^^^^ +[rank5]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/transformers/modeling_layers.py", line 94, in __call__ +[rank5]: return super().__call__(*args, **kwargs) +[rank5]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +[rank5]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1773, in _wrapped_call_impl +[rank5]: return self._call_impl(*args, **kwargs) +[rank5]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +[rank5]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1784, in _call_impl +[rank5]: return forward_call(*args, **kwargs) +[rank5]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +[rank5]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/transformers/utils/deprecation.py", line 172, in wrapped_func +[rank5]: return func(*args, **kwargs) +[rank5]: ^^^^^^^^^^^^^^^^^^^^^ +[rank5]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/transformers/models/lfm2/modeling_lfm2.py", line 556, in forward +[rank5]: hidden_states = self.conv( +[rank5]: ^^^^^^^^^^ +[rank5]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1773, in _wrapped_call_impl +[rank5]: return self._call_impl(*args, **kwargs) +[rank5]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +[rank5]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1784, in _call_impl +[rank5]: return forward_call(*args, **kwargs) +[rank5]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +[rank5]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/transformers/utils/deprecation.py", line 172, in wrapped_func +[rank5]: return func(*args, **kwargs) +[rank5]: ^^^^^^^^^^^^^^^^^^^^^ +[rank5]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/transformers/models/lfm2/modeling_lfm2.py", line 517, in forward +[rank5]: return self.slow_forward(hidden_states, past_key_values, cache_position, attention_mask) +[rank5]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +[rank5]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/transformers/utils/deprecation.py", line 172, in wrapped_func +[rank5]: return func(*args, **kwargs) +[rank5]: ^^^^^^^^^^^^^^^^^^^^^ +[rank5]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/transformers/models/lfm2/modeling_lfm2.py", line 500, in slow_forward +[rank5]: conv_out = self.conv(Bx)[..., :seqlen] +[rank5]: ^^^^^^^^^^^^^ +[rank5]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1773, in _wrapped_call_impl +[rank5]: return self._call_impl(*args, **kwargs) +[rank5]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +[rank5]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1784, in _call_impl +[rank5]: return forward_call(*args, **kwargs) +[rank5]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +[rank5]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/nn/modules/conv.py", line 371, in forward +[rank5]: return self._conv_forward(input, self.weight, self.bias) +[rank5]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +[rank5]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/nn/modules/conv.py", line 366, in _conv_forward +[rank5]: return F.conv1d( +[rank5]: ^^^^^^^^^ +[rank5]: torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 272.00 MiB. GPU 5 has a total capacity of 79.25 GiB of which 160.94 MiB is free. Including non-PyTorch memory, this process has 79.09 GiB memory in use. Of the allocated memory 77.68 GiB is allocated by PyTorch, and 85.86 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +[rank4]: Traceback (most recent call last): +[rank4]: File "", line 198, in _run_module_as_main +[rank4]: File "", line 88, in _run_code +[rank4]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/axolotl/cli/train.py", line 121, in +[rank4]: fire.Fire(do_cli) +[rank4]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/fire/core.py", line 135, in Fire +[rank4]: component_trace = _Fire(component, args, parsed_flag_args, context, name) +[rank4]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +[rank4]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/fire/core.py", line 468, in _Fire +[rank4]: component, remaining_args = _CallAndUpdateTrace( +[rank4]: ^^^^^^^^^^^^^^^^^^^^ +[rank4]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/fire/core.py", line 684, in _CallAndUpdateTrace +[rank4]: component = fn(*varargs, **kwargs) +[rank4]: ^^^^^^^^^^^^^^^^^^^^^^ +[rank4]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/axolotl/cli/train.py", line 88, in do_cli +[rank4]: return do_train(parsed_cfg, parsed_cli_args) +[rank4]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +[rank4]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/axolotl/cli/train.py", line 45, in do_train +[rank4]: model, tokenizer, trainer = train(cfg=cfg, dataset_meta=dataset_meta) +[rank4]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +[rank4]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/axolotl/train.py", line 584, in train +[rank4]: execute_training(cfg, trainer, resume_from_checkpoint) +[rank4]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/axolotl/train.py", line 197, in execute_training +[rank4]: trainer.train(resume_from_checkpoint=resume_from_checkpoint) +[rank4]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/transformers/trainer.py", line 2224, in train +[rank4]: return inner_training_loop( +[rank4]: ^^^^^^^^^^^^^^^^^^^^ +[rank4]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/transformers/trainer.py", line 2561, in _inner_training_loop +[rank4]: tr_loss_step = self.training_step(model, inputs, num_items_in_batch) +[rank4]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +[rank4]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/axolotl/core/trainers/mixins/activation_checkpointing.py", line 46, in training_step +[rank4]: return super().training_step(*args, **kwargs) +[rank4]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +[rank4]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/transformers/trainer.py", line 3854, in training_step +[rank4]: loss = self.compute_loss(model, inputs, num_items_in_batch=num_items_in_batch) +[rank4]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +[rank4]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/axolotl/core/trainers/base.py", line 367, in compute_loss +[rank4]: return super().compute_loss( +[rank4]: ^^^^^^^^^^^^^^^^^^^^^ +[rank4]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/transformers/trainer.py", line 3936, in compute_loss +[rank4]: outputs = model(**inputs) +[rank4]: ^^^^^^^^^^^^^^^ +[rank4]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1773, in _wrapped_call_impl +[rank4]: return self._call_impl(*args, **kwargs) +[rank4]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +[rank4]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1784, in _call_impl +[rank4]: return forward_call(*args, **kwargs) +[rank4]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +[rank4]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/nn/parallel/distributed.py", line 1648, in forward +[rank4]: else self._run_ddp_forward(*inputs, **kwargs) +[rank4]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +[rank4]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/nn/parallel/distributed.py", line 1474, in _run_ddp_forward +[rank4]: return self.module(*inputs, **kwargs) # type: ignore[index] +[rank4]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +[rank4]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1773, in _wrapped_call_impl +[rank4]: return self._call_impl(*args, **kwargs) +[rank4]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +[rank4]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1784, in _call_impl +[rank4]: return forward_call(*args, **kwargs) +[rank4]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +[rank4]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/accelerate/utils/operations.py", line 818, in forward +[rank4]: return model_forward(*args, **kwargs) +[rank4]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +[rank4]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/accelerate/utils/operations.py", line 806, in __call__ +[rank4]: return convert_to_fp32(self.model_forward(*args, **kwargs)) +[rank4]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +[rank4]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/amp/autocast_mode.py", line 44, in decorate_autocast +[rank4]: return func(*args, **kwargs) +[rank4]: ^^^^^^^^^^^^^^^^^^^^^ +[rank4]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/cut_cross_entropy/transformers/llama.py", line 52, in cce_forward +[rank4]: outputs: BaseModelOutputWithPast = self.model( +[rank4]: ^^^^^^^^^^^ +[rank4]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1773, in _wrapped_call_impl +[rank4]: return self._call_impl(*args, **kwargs) +[rank4]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +[rank4]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1784, in _call_impl +[rank4]: return forward_call(*args, **kwargs) +[rank4]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +[rank4]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/transformers/utils/generic.py", line 927, in wrapper +[rank4]: outputs = func(self, *args, **kwargs) +[rank4]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^ +[rank4]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/transformers/models/lfm2/modeling_lfm2.py", line 652, in forward +[rank4]: hidden_states = decoder_layer( +[rank4]: ^^^^^^^^^^^^^^ +[rank4]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/transformers/modeling_layers.py", line 94, in __call__ +[rank4]: return super().__call__(*args, **kwargs) +[rank4]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +[rank4]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1773, in _wrapped_call_impl +[rank4]: return self._call_impl(*args, **kwargs) +[rank4]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +[rank4]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1784, in _call_impl +[rank4]: return forward_call(*args, **kwargs) +[rank4]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +[rank4]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/transformers/utils/deprecation.py", line 172, in wrapped_func +[rank4]: return func(*args, **kwargs) +[rank4]: ^^^^^^^^^^^^^^^^^^^^^ +[rank4]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/transformers/models/lfm2/modeling_lfm2.py", line 556, in forward +[rank4]: hidden_states = self.conv( +[rank4]: ^^^^^^^^^^ +[rank4]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1773, in _wrapped_call_impl +[rank4]: return self._call_impl(*args, **kwargs) +[rank4]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +[rank4]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1784, in _call_impl +[rank4]: return forward_call(*args, **kwargs) +[rank4]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +[rank4]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/transformers/utils/deprecation.py", line 172, in wrapped_func +[rank4]: return func(*args, **kwargs) +[rank4]: ^^^^^^^^^^^^^^^^^^^^^ +[rank4]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/transformers/models/lfm2/modeling_lfm2.py", line 517, in forward +[rank4]: return self.slow_forward(hidden_states, past_key_values, cache_position, attention_mask) +[rank4]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +[rank4]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/transformers/utils/deprecation.py", line 172, in wrapped_func +[rank4]: return func(*args, **kwargs) +[rank4]: ^^^^^^^^^^^^^^^^^^^^^ +[rank4]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/transformers/models/lfm2/modeling_lfm2.py", line 500, in slow_forward +[rank4]: conv_out = self.conv(Bx)[..., :seqlen] +[rank4]: ^^^^^^^^^^^^^ +[rank4]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1773, in _wrapped_call_impl +[rank4]: return self._call_impl(*args, **kwargs) +[rank4]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +[rank4]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1784, in _call_impl +[rank4]: return forward_call(*args, **kwargs) +[rank4]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +[rank4]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/nn/modules/conv.py", line 371, in forward +[rank4]: return self._conv_forward(input, self.weight, self.bias) +[rank4]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +[rank4]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/nn/modules/conv.py", line 366, in _conv_forward +[rank4]: return F.conv1d( +[rank4]: ^^^^^^^^^ +[rank4]: torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 272.00 MiB. GPU 4 has a total capacity of 79.25 GiB of which 160.94 MiB is free. Including non-PyTorch memory, this process has 79.09 GiB memory in use. Of the allocated memory 77.68 GiB is allocated by PyTorch, and 85.86 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +[rank3]: Traceback (most recent call last): +[rank3]: File "", line 198, in _run_module_as_main +[rank3]: File "", line 88, in _run_code +[rank3]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/axolotl/cli/train.py", line 121, in +[rank3]: fire.Fire(do_cli) +[rank3]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/fire/core.py", line 135, in Fire +[rank3]: component_trace = _Fire(component, args, parsed_flag_args, context, name) +[rank3]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +[rank3]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/fire/core.py", line 468, in _Fire +[rank3]: component, remaining_args = _CallAndUpdateTrace( +[rank3]: ^^^^^^^^^^^^^^^^^^^^ +[rank3]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/fire/core.py", line 684, in _CallAndUpdateTrace +[rank3]: component = fn(*varargs, **kwargs) +[rank3]: ^^^^^^^^^^^^^^^^^^^^^^ +[rank3]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/axolotl/cli/train.py", line 88, in do_cli +[rank3]: return do_train(parsed_cfg, parsed_cli_args) +[rank3]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +[rank3]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/axolotl/cli/train.py", line 45, in do_train +[rank3]: model, tokenizer, trainer = train(cfg=cfg, dataset_meta=dataset_meta) +[rank3]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +[rank3]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/axolotl/train.py", line 584, in train +[rank3]: execute_training(cfg, trainer, resume_from_checkpoint) +[rank3]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/axolotl/train.py", line 197, in execute_training +[rank3]: trainer.train(resume_from_checkpoint=resume_from_checkpoint) +[rank3]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/transformers/trainer.py", line 2224, in train +[rank3]: return inner_training_loop( +[rank3]: ^^^^^^^^^^^^^^^^^^^^ +[rank3]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/transformers/trainer.py", line 2561, in _inner_training_loop +[rank3]: tr_loss_step = self.training_step(model, inputs, num_items_in_batch) +[rank3]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +[rank3]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/axolotl/core/trainers/mixins/activation_checkpointing.py", line 46, in training_step +[rank3]: return super().training_step(*args, **kwargs) +[rank3]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +[rank3]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/transformers/trainer.py", line 3854, in training_step +[rank3]: loss = self.compute_loss(model, inputs, num_items_in_batch=num_items_in_batch) +[rank3]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +[rank3]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/axolotl/core/trainers/base.py", line 367, in compute_loss +[rank3]: return super().compute_loss( +[rank3]: ^^^^^^^^^^^^^^^^^^^^^ +[rank3]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/transformers/trainer.py", line 3936, in compute_loss +[rank3]: outputs = model(**inputs) +[rank3]: ^^^^^^^^^^^^^^^ +[rank3]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1773, in _wrapped_call_impl +[rank3]: return self._call_impl(*args, **kwargs) +[rank3]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +[rank3]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1784, in _call_impl +[rank3]: return forward_call(*args, **kwargs) +[rank3]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +[rank3]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/nn/parallel/distributed.py", line 1648, in forward +[rank3]: else self._run_ddp_forward(*inputs, **kwargs) +[rank3]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +[rank3]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/nn/parallel/distributed.py", line 1474, in _run_ddp_forward +[rank3]: return self.module(*inputs, **kwargs) # type: ignore[index] +[rank3]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +[rank3]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1773, in _wrapped_call_impl +[rank3]: return self._call_impl(*args, **kwargs) +[rank3]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +[rank3]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1784, in _call_impl +[rank3]: return forward_call(*args, **kwargs) +[rank3]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +[rank3]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/accelerate/utils/operations.py", line 818, in forward +[rank3]: return model_forward(*args, **kwargs) +[rank3]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +[rank3]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/accelerate/utils/operations.py", line 806, in __call__ +[rank3]: return convert_to_fp32(self.model_forward(*args, **kwargs)) +[rank3]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +[rank3]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/amp/autocast_mode.py", line 44, in decorate_autocast +[rank3]: return func(*args, **kwargs) +[rank3]: ^^^^^^^^^^^^^^^^^^^^^ +[rank3]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/cut_cross_entropy/transformers/llama.py", line 52, in cce_forward +[rank3]: outputs: BaseModelOutputWithPast = self.model( +[rank3]: ^^^^^^^^^^^ +[rank3]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1773, in _wrapped_call_impl +[rank3]: return self._call_impl(*args, **kwargs) +[rank3]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +[rank3]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1784, in _call_impl +[rank3]: return forward_call(*args, **kwargs) +[rank3]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +[rank3]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/transformers/utils/generic.py", line 927, in wrapper +[rank3]: outputs = func(self, *args, **kwargs) +[rank3]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^ +[rank3]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/transformers/models/lfm2/modeling_lfm2.py", line 652, in forward +[rank3]: hidden_states = decoder_layer( +[rank3]: ^^^^^^^^^^^^^^ +[rank3]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/transformers/modeling_layers.py", line 94, in __call__ +[rank3]: return super().__call__(*args, **kwargs) +[rank3]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +[rank3]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1773, in _wrapped_call_impl +[rank3]: return self._call_impl(*args, **kwargs) +[rank3]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +[rank3]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1784, in _call_impl +[rank3]: return forward_call(*args, **kwargs) +[rank3]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +[rank3]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/transformers/utils/deprecation.py", line 172, in wrapped_func +[rank3]: return func(*args, **kwargs) +[rank3]: ^^^^^^^^^^^^^^^^^^^^^ +[rank3]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/transformers/models/lfm2/modeling_lfm2.py", line 556, in forward +[rank3]: hidden_states = self.conv( +[rank3]: ^^^^^^^^^^ +[rank3]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1773, in _wrapped_call_impl +[rank3]: return self._call_impl(*args, **kwargs) +[rank3]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +[rank3]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1784, in _call_impl +[rank3]: return forward_call(*args, **kwargs) +[rank3]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +[rank3]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/transformers/utils/deprecation.py", line 172, in wrapped_func +[rank3]: return func(*args, **kwargs) +[rank3]: ^^^^^^^^^^^^^^^^^^^^^ +[rank3]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/transformers/models/lfm2/modeling_lfm2.py", line 517, in forward +[rank3]: return self.slow_forward(hidden_states, past_key_values, cache_position, attention_mask) +[rank3]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +[rank3]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/transformers/utils/deprecation.py", line 172, in wrapped_func +[rank3]: return func(*args, **kwargs) +[rank3]: ^^^^^^^^^^^^^^^^^^^^^ +[rank3]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/transformers/models/lfm2/modeling_lfm2.py", line 500, in slow_forward +[rank3]: conv_out = self.conv(Bx)[..., :seqlen] +[rank3]: ^^^^^^^^^^^^^ +[rank3]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1773, in _wrapped_call_impl +[rank3]: return self._call_impl(*args, **kwargs) +[rank3]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +[rank3]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1784, in _call_impl +[rank3]: return forward_call(*args, **kwargs) +[rank3]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +[rank3]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/nn/modules/conv.py", line 371, in forward +[rank3]: return self._conv_forward(input, self.weight, self.bias) +[rank3]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +[rank3]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/nn/modules/conv.py", line 366, in _conv_forward +[rank3]: return F.conv1d( +[rank3]: ^^^^^^^^^ +[rank3]: torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 272.00 MiB. GPU 3 has a total capacity of 79.25 GiB of which 160.94 MiB is free. Including non-PyTorch memory, this process has 79.09 GiB memory in use. Of the allocated memory 77.68 GiB is allocated by PyTorch, and 85.86 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +[rank6]: Traceback (most recent call last): +[rank6]: File "", line 198, in _run_module_as_main +[rank6]: File "", line 88, in _run_code +[rank6]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/axolotl/cli/train.py", line 121, in +[rank6]: fire.Fire(do_cli) +[rank6]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/fire/core.py", line 135, in Fire +[rank6]: component_trace = _Fire(component, args, parsed_flag_args, context, name) +[rank6]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +[rank6]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/fire/core.py", line 468, in _Fire +[rank6]: component, remaining_args = _CallAndUpdateTrace( +[rank6]: ^^^^^^^^^^^^^^^^^^^^ +[rank6]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/fire/core.py", line 684, in _CallAndUpdateTrace +[rank6]: component = fn(*varargs, **kwargs) +[rank6]: ^^^^^^^^^^^^^^^^^^^^^^ +[rank6]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/axolotl/cli/train.py", line 88, in do_cli +[rank6]: return do_train(parsed_cfg, parsed_cli_args) +[rank6]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +[rank6]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/axolotl/cli/train.py", line 45, in do_train +[rank6]: model, tokenizer, trainer = train(cfg=cfg, dataset_meta=dataset_meta) +[rank6]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +[rank6]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/axolotl/train.py", line 584, in train +[rank6]: execute_training(cfg, trainer, resume_from_checkpoint) +[rank6]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/axolotl/train.py", line 197, in execute_training +[rank6]: trainer.train(resume_from_checkpoint=resume_from_checkpoint) +[rank6]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/transformers/trainer.py", line 2224, in train +[rank6]: return inner_training_loop( +[rank6]: ^^^^^^^^^^^^^^^^^^^^ +[rank6]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/transformers/trainer.py", line 2561, in _inner_training_loop +[rank6]: tr_loss_step = self.training_step(model, inputs, num_items_in_batch) +[rank6]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +[rank6]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/axolotl/core/trainers/mixins/activation_checkpointing.py", line 46, in training_step +[rank6]: return super().training_step(*args, **kwargs) +[rank6]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +[rank6]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/transformers/trainer.py", line 3854, in training_step +[rank6]: loss = self.compute_loss(model, inputs, num_items_in_batch=num_items_in_batch) +[rank6]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +[rank6]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/axolotl/core/trainers/base.py", line 367, in compute_loss +[rank6]: return super().compute_loss( +[rank6]: ^^^^^^^^^^^^^^^^^^^^^ +[rank6]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/transformers/trainer.py", line 3936, in compute_loss +[rank6]: outputs = model(**inputs) +[rank6]: ^^^^^^^^^^^^^^^ +[rank6]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1773, in _wrapped_call_impl +[rank6]: return self._call_impl(*args, **kwargs) +[rank6]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +[rank6]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1784, in _call_impl +[rank6]: return forward_call(*args, **kwargs) +[rank6]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +[rank6]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/nn/parallel/distributed.py", line 1648, in forward +[rank6]: else self._run_ddp_forward(*inputs, **kwargs) +[rank6]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +[rank6]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/nn/parallel/distributed.py", line 1474, in _run_ddp_forward +[rank6]: return self.module(*inputs, **kwargs) # type: ignore[index] +[rank6]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +[rank6]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1773, in _wrapped_call_impl +[rank6]: return self._call_impl(*args, **kwargs) +[rank6]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +[rank6]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1784, in _call_impl +[rank6]: return forward_call(*args, **kwargs) +[rank6]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +[rank6]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/accelerate/utils/operations.py", line 818, in forward +[rank6]: return model_forward(*args, **kwargs) +[rank6]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +[rank6]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/accelerate/utils/operations.py", line 806, in __call__ +[rank6]: return convert_to_fp32(self.model_forward(*args, **kwargs)) +[rank6]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +[rank6]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/amp/autocast_mode.py", line 44, in decorate_autocast +[rank6]: return func(*args, **kwargs) +[rank6]: ^^^^^^^^^^^^^^^^^^^^^ +[rank6]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/cut_cross_entropy/transformers/llama.py", line 52, in cce_forward +[rank6]: outputs: BaseModelOutputWithPast = self.model( +[rank6]: ^^^^^^^^^^^ +[rank6]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1773, in _wrapped_call_impl +[rank6]: return self._call_impl(*args, **kwargs) +[rank6]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +[rank6]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1784, in _call_impl +[rank6]: return forward_call(*args, **kwargs) +[rank6]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +[rank6]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/transformers/utils/generic.py", line 927, in wrapper +[rank6]: outputs = func(self, *args, **kwargs) +[rank6]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^ +[rank6]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/transformers/models/lfm2/modeling_lfm2.py", line 652, in forward +[rank6]: hidden_states = decoder_layer( +[rank6]: ^^^^^^^^^^^^^^ +[rank6]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/transformers/modeling_layers.py", line 94, in __call__ +[rank6]: return super().__call__(*args, **kwargs) +[rank6]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +[rank6]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1773, in _wrapped_call_impl +[rank6]: return self._call_impl(*args, **kwargs) +[rank6]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +[rank6]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1784, in _call_impl +[rank6]: return forward_call(*args, **kwargs) +[rank6]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +[rank6]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/transformers/utils/deprecation.py", line 172, in wrapped_func +[rank6]: return func(*args, **kwargs) +[rank6]: ^^^^^^^^^^^^^^^^^^^^^ +[rank6]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/transformers/models/lfm2/modeling_lfm2.py", line 556, in forward +[rank6]: hidden_states = self.conv( +[rank6]: ^^^^^^^^^^ +[rank6]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1773, in _wrapped_call_impl +[rank6]: return self._call_impl(*args, **kwargs) +[rank6]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +[rank6]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1784, in _call_impl +[rank6]: return forward_call(*args, **kwargs) +[rank6]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +[rank6]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/transformers/utils/deprecation.py", line 172, in wrapped_func +[rank6]: return func(*args, **kwargs) +[rank6]: ^^^^^^^^^^^^^^^^^^^^^ +[rank6]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/transformers/models/lfm2/modeling_lfm2.py", line 517, in forward +[rank6]: return self.slow_forward(hidden_states, past_key_values, cache_position, attention_mask) +[rank6]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +[rank6]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/transformers/utils/deprecation.py", line 172, in wrapped_func +[rank6]: return func(*args, **kwargs) +[rank6]: ^^^^^^^^^^^^^^^^^^^^^ +[rank6]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/transformers/models/lfm2/modeling_lfm2.py", line 500, in slow_forward +[rank6]: conv_out = self.conv(Bx)[..., :seqlen] +[rank6]: ^^^^^^^^^^^^^ +[rank6]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1773, in _wrapped_call_impl +[rank6]: return self._call_impl(*args, **kwargs) +[rank6]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +[rank6]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1784, in _call_impl +[rank6]: return forward_call(*args, **kwargs) +[rank6]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +[rank6]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/nn/modules/conv.py", line 371, in forward +[rank6]: return self._conv_forward(input, self.weight, self.bias) +[rank6]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +[rank6]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/nn/modules/conv.py", line 366, in _conv_forward +[rank6]: return F.conv1d( +[rank6]: ^^^^^^^^^ +[rank6]: torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 272.00 MiB. GPU 6 has a total capacity of 79.25 GiB of which 160.94 MiB is free. Including non-PyTorch memory, this process has 79.09 GiB memory in use. Of the allocated memory 77.68 GiB is allocated by PyTorch, and 85.86 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +[rank7]: Traceback (most recent call last): +[rank7]: File "", line 198, in _run_module_as_main +[rank7]: File "", line 88, in _run_code +[rank7]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/axolotl/cli/train.py", line 121, in +[rank7]: fire.Fire(do_cli) +[rank7]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/fire/core.py", line 135, in Fire +[rank7]: component_trace = _Fire(component, args, parsed_flag_args, context, name) +[rank7]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +[rank7]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/fire/core.py", line 468, in _Fire +[rank7]: component, remaining_args = _CallAndUpdateTrace( +[rank7]: ^^^^^^^^^^^^^^^^^^^^ +[rank7]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/fire/core.py", line 684, in _CallAndUpdateTrace +[rank7]: component = fn(*varargs, **kwargs) +[rank7]: ^^^^^^^^^^^^^^^^^^^^^^ +[rank7]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/axolotl/cli/train.py", line 88, in do_cli +[rank7]: return do_train(parsed_cfg, parsed_cli_args) +[rank7]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +[rank7]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/axolotl/cli/train.py", line 45, in do_train +[rank7]: model, tokenizer, trainer = train(cfg=cfg, dataset_meta=dataset_meta) +[rank7]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +[rank7]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/axolotl/train.py", line 584, in train +[rank7]: execute_training(cfg, trainer, resume_from_checkpoint) +[rank7]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/axolotl/train.py", line 197, in execute_training +[rank7]: trainer.train(resume_from_checkpoint=resume_from_checkpoint) +[rank7]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/transformers/trainer.py", line 2224, in train +[rank7]: return inner_training_loop( +[rank7]: ^^^^^^^^^^^^^^^^^^^^ +[rank7]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/transformers/trainer.py", line 2561, in _inner_training_loop +[rank7]: tr_loss_step = self.training_step(model, inputs, num_items_in_batch) +[rank7]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +[rank7]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/axolotl/core/trainers/mixins/activation_checkpointing.py", line 46, in training_step +[rank7]: return super().training_step(*args, **kwargs) +[rank7]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +[rank7]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/transformers/trainer.py", line 3854, in training_step +[rank7]: loss = self.compute_loss(model, inputs, num_items_in_batch=num_items_in_batch) +[rank7]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +[rank7]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/axolotl/core/trainers/base.py", line 367, in compute_loss +[rank7]: return super().compute_loss( +[rank7]: ^^^^^^^^^^^^^^^^^^^^^ +[rank7]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/transformers/trainer.py", line 3936, in compute_loss +[rank7]: outputs = model(**inputs) +[rank7]: ^^^^^^^^^^^^^^^ +[rank7]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1773, in _wrapped_call_impl +[rank7]: return self._call_impl(*args, **kwargs) +[rank7]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +[rank7]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1784, in _call_impl +[rank7]: return forward_call(*args, **kwargs) +[rank7]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +[rank7]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/nn/parallel/distributed.py", line 1648, in forward +[rank7]: else self._run_ddp_forward(*inputs, **kwargs) +[rank7]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +[rank7]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/nn/parallel/distributed.py", line 1474, in _run_ddp_forward +[rank7]: return self.module(*inputs, **kwargs) # type: ignore[index] +[rank7]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +[rank7]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1773, in _wrapped_call_impl +[rank7]: return self._call_impl(*args, **kwargs) +[rank7]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +[rank7]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1784, in _call_impl +[rank7]: return forward_call(*args, **kwargs) +[rank7]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +[rank7]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/accelerate/utils/operations.py", line 818, in forward +[rank7]: return model_forward(*args, **kwargs) +[rank7]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +[rank7]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/accelerate/utils/operations.py", line 806, in __call__ +[rank7]: return convert_to_fp32(self.model_forward(*args, **kwargs)) +[rank7]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +[rank7]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/amp/autocast_mode.py", line 44, in decorate_autocast +[rank7]: return func(*args, **kwargs) +[rank7]: ^^^^^^^^^^^^^^^^^^^^^ +[rank7]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/cut_cross_entropy/transformers/llama.py", line 52, in cce_forward +[rank7]: outputs: BaseModelOutputWithPast = self.model( +[rank7]: ^^^^^^^^^^^ +[rank7]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1773, in _wrapped_call_impl +[rank7]: return self._call_impl(*args, **kwargs) +[rank7]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +[rank7]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1784, in _call_impl +[rank7]: return forward_call(*args, **kwargs) +[rank7]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +[rank7]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/transformers/utils/generic.py", line 927, in wrapper +[rank7]: outputs = func(self, *args, **kwargs) +[rank7]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^ +[rank7]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/transformers/models/lfm2/modeling_lfm2.py", line 652, in forward +[rank7]: hidden_states = decoder_layer( +[rank7]: ^^^^^^^^^^^^^^ +[rank7]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/transformers/modeling_layers.py", line 94, in __call__ +[rank7]: return super().__call__(*args, **kwargs) +[rank7]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +[rank7]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1773, in _wrapped_call_impl +[rank7]: return self._call_impl(*args, **kwargs) +[rank7]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +[rank7]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1784, in _call_impl +[rank7]: return forward_call(*args, **kwargs) +[rank7]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +[rank7]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/transformers/utils/deprecation.py", line 172, in wrapped_func +[rank7]: return func(*args, **kwargs) +[rank7]: ^^^^^^^^^^^^^^^^^^^^^ +[rank7]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/transformers/models/lfm2/modeling_lfm2.py", line 556, in forward +[rank7]: hidden_states = self.conv( +[rank7]: ^^^^^^^^^^ +[rank7]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1773, in _wrapped_call_impl +[rank7]: return self._call_impl(*args, **kwargs) +[rank7]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +[rank7]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1784, in _call_impl +[rank7]: return forward_call(*args, **kwargs) +[rank7]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +[rank7]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/transformers/utils/deprecation.py", line 172, in wrapped_func +[rank7]: return func(*args, **kwargs) +[rank7]: ^^^^^^^^^^^^^^^^^^^^^ +[rank7]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/transformers/models/lfm2/modeling_lfm2.py", line 517, in forward +[rank7]: return self.slow_forward(hidden_states, past_key_values, cache_position, attention_mask) +[rank7]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +[rank7]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/transformers/utils/deprecation.py", line 172, in wrapped_func +[rank7]: return func(*args, **kwargs) +[rank7]: ^^^^^^^^^^^^^^^^^^^^^ +[rank7]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/transformers/models/lfm2/modeling_lfm2.py", line 500, in slow_forward +[rank7]: conv_out = self.conv(Bx)[..., :seqlen] +[rank7]: ^^^^^^^^^^^^^ +[rank7]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1773, in _wrapped_call_impl +[rank7]: return self._call_impl(*args, **kwargs) +[rank7]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +[rank7]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1784, in _call_impl +[rank7]: return forward_call(*args, **kwargs) +[rank7]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +[rank7]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/nn/modules/conv.py", line 371, in forward +[rank7]: return self._conv_forward(input, self.weight, self.bias) +[rank7]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +[rank7]: File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/nn/modules/conv.py", line 366, in _conv_forward +[rank7]: return F.conv1d( +[rank7]: ^^^^^^^^^ +[rank7]: torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 272.00 MiB. GPU 7 has a total capacity of 79.25 GiB of which 160.94 MiB is free. Including non-PyTorch memory, this process has 79.09 GiB memory in use. Of the allocated memory 77.68 GiB is allocated by PyTorch, and 85.86 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Exception in thread Thread-8 (_pin_memory_loop): +Traceback (most recent call last): + File "/usr/lib/python3.12/threading.py", line 1073, in _bootstrap_inner + self.run() + File "/usr/lib/python3.12/threading.py", line 1010, in run + self._target(*self._args, **self._kwargs) + File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/utils/data/_utils/pin_memory.py", line 61, in _pin_memory_loop + do_one_step() + File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/utils/data/_utils/pin_memory.py", line 37, in do_one_step + r = in_queue.get(timeout=MP_STATUS_CHECK_INTERVAL) + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + File "/usr/lib/python3.12/multiprocessing/queues.py", line 122, in get + return _ForkingPickler.loads(res) + ^^^^^^^^^^^^^^^^^^^^^^^^^^ + File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/multiprocessing/reductions.py", line 541, in rebuild_storage_fd + fd = df.detach() + ^^^^^^^^^^^ + File "/usr/lib/python3.12/multiprocessing/resource_sharer.py", line 57, in detach + with _resource_sharer.get_connection(self._id) as conn: + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + File "/usr/lib/python3.12/multiprocessing/resource_sharer.py", line 86, in get_connection + c = Client(address, authkey=process.current_process().authkey) + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + File "/usr/lib/python3.12/multiprocessing/connection.py", line 526, in Client + deliver_challenge(c, authkey) + File "/usr/lib/python3.12/multiprocessing/connection.py", line 939, in deliver_challenge + response = connection.recv_bytes(256) # reject large message + ^^^^^^^^^^^^^^^^^^^^^^^^^^ + File "/usr/lib/python3.12/multiprocessing/connection.py", line 216, in recv_bytes + buf = self._recv_bytes(maxlength) + ^^^^^^^^^^^^^^^^^^^^^^^^^^^ + File "/usr/lib/python3.12/multiprocessing/connection.py", line 430, in _recv_bytes + buf = self._recv(4) + ^^^^^^^^^^^^^ + File "/usr/lib/python3.12/multiprocessing/connection.py", line 395, in _recv + chunk = read(handle, remaining) + ^^^^^^^^^^^^^^^^^^^^^^^ +ConnectionResetError: [Errno 104] Connection reset by peer +Exception in thread Thread-20 (_pin_memory_loop): +Traceback (most recent call last): + File "/usr/lib/python3.12/threading.py", line 1073, in _bootstrap_inner + self.run() + File "/usr/lib/python3.12/threading.py", line 1010, in run + self._target(*self._args, **self._kwargs) + File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/utils/data/_utils/pin_memory.py", line 61, in _pin_memory_loop + do_one_step() + File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/utils/data/_utils/pin_memory.py", line 37, in do_one_step + r = in_queue.get(timeout=MP_STATUS_CHECK_INTERVAL) + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + File "/usr/lib/python3.12/multiprocessing/queues.py", line 122, in get + return _ForkingPickler.loads(res) + ^^^^^^^^^^^^^^^^^^^^^^^^^^ + File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/multiprocessing/reductions.py", line 541, in rebuild_storage_fd + fd = df.detach() + ^^^^^^^^^^^ + File "/usr/lib/python3.12/multiprocessing/resource_sharer.py", line 57, in detach + with _resource_sharer.get_connection(self._id) as conn: + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + File "/usr/lib/python3.12/multiprocessing/resource_sharer.py", line 86, in get_connection + c = Client(address, authkey=process.current_process().authkey) + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + File "/usr/lib/python3.12/multiprocessing/connection.py", line 525, in Client + answer_challenge(c, authkey) + File "/usr/lib/python3.12/multiprocessing/connection.py", line 962, in answer_challenge + response = connection.recv_bytes(256) # reject large message + ^^^^^^^^^^^^^^^^^^^^^^^^^^ + File "/usr/lib/python3.12/multiprocessing/connection.py", line 216, in recv_bytes + buf = self._recv_bytes(maxlength) + ^^^^^^^^^^^^^^^^^^^^^^^^^^^ + File "/usr/lib/python3.12/multiprocessing/connection.py", line 430, in _recv_bytes + buf = self._recv(4) + ^^^^^^^^^^^^^ + File "/usr/lib/python3.12/multiprocessing/connection.py", line 395, in _recv + chunk = read(handle, remaining) + ^^^^^^^^^^^^^^^^^^^^^^^ +ConnectionResetError: [Errno 104] Connection reset by peer +Exception in thread Thread-8 (_pin_memory_loop): +Traceback (most recent call last): + File "/usr/lib/python3.12/threading.py", line 1073, in _bootstrap_inner + self.run() + File "/usr/lib/python3.12/threading.py", line 1010, in run + self._target(*self._args, **self._kwargs) + File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/utils/data/_utils/pin_memory.py", line 61, in _pin_memory_loop + do_one_step() + File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/utils/data/_utils/pin_memory.py", line 37, in do_one_step + r = in_queue.get(timeout=MP_STATUS_CHECK_INTERVAL) + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + File "/usr/lib/python3.12/multiprocessing/queues.py", line 122, in get + return _ForkingPickler.loads(res) + ^^^^^^^^^^^^^^^^^^^^^^^^^^ + File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/multiprocessing/reductions.py", line 541, in rebuild_storage_fd + fd = df.detach() + ^^^^^^^^^^^ + File "/usr/lib/python3.12/multiprocessing/resource_sharer.py", line 57, in detach + with _resource_sharer.get_connection(self._id) as conn: + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + File "/usr/lib/python3.12/multiprocessing/resource_sharer.py", line 86, in get_connection + c = Client(address, authkey=process.current_process().authkey) + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + File "/usr/lib/python3.12/multiprocessing/connection.py", line 525, in Client + answer_challenge(c, authkey) + File "/usr/lib/python3.12/multiprocessing/connection.py", line 962, in answer_challenge + response = connection.recv_bytes(256) # reject large message + ^^^^^^^^^^^^^^^^^^^^^^^^^^ + File "/usr/lib/python3.12/multiprocessing/connection.py", line 216, in recv_bytes + buf = self._recv_bytes(maxlength) + ^^^^^^^^^^^^^^^^^^^^^^^^^^^ + File "/usr/lib/python3.12/multiprocessing/connection.py", line 430, in _recv_bytes + buf = self._recv(4) + ^^^^^^^^^^^^^ + File "/usr/lib/python3.12/multiprocessing/connection.py", line 395, in _recv + chunk = read(handle, remaining) + ^^^^^^^^^^^^^^^^^^^^^^^ +ConnectionResetError: [Errno 104] Connection reset by peer +Exception in thread Thread-8 (_pin_memory_loop): +Traceback (most recent call last): + File "/usr/lib/python3.12/threading.py", line 1073, in _bootstrap_inner + self.run() + File "/usr/lib/python3.12/threading.py", line 1010, in run + self._target(*self._args, **self._kwargs) + File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/utils/data/_utils/pin_memory.py", line 61, in _pin_memory_loop + do_one_step() + File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/utils/data/_utils/pin_memory.py", line 37, in do_one_step + r = in_queue.get(timeout=MP_STATUS_CHECK_INTERVAL) + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + File "/usr/lib/python3.12/multiprocessing/queues.py", line 122, in get + return _ForkingPickler.loads(res) + ^^^^^^^^^^^^^^^^^^^^^^^^^^ + File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/multiprocessing/reductions.py", line 541, in rebuild_storage_fd + fd = df.detach() + ^^^^^^^^^^^ + File "/usr/lib/python3.12/multiprocessing/resource_sharer.py", line 57, in detach + with _resource_sharer.get_connection(self._id) as conn: + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + File "/usr/lib/python3.12/multiprocessing/resource_sharer.py", line 86, in get_connection + c = Client(address, authkey=process.current_process().authkey) + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + File "/usr/lib/python3.12/multiprocessing/connection.py", line 525, in Client + answer_challenge(c, authkey) + File "/usr/lib/python3.12/multiprocessing/connection.py", line 953, in answer_challenge + message = connection.recv_bytes(256) # reject large message + ^^^^^^^^^^^^^^^^^^^^^^^^^^ + File "/usr/lib/python3.12/multiprocessing/connection.py", line 216, in recv_bytes + buf = self._recv_bytes(maxlength) + ^^^^^^^^^^^^^^^^^^^^^^^^^^^ + File "/usr/lib/python3.12/multiprocessing/connection.py", line 430, in _recv_bytes + buf = self._recv(4) + ^^^^^^^^^^^^^ + File "/usr/lib/python3.12/multiprocessing/connection.py", line 399, in _recv + raise EOFError +EOFError +Exception in thread Thread-8 (_pin_memory_loop): +Traceback (most recent call last): + File "/usr/lib/python3.12/threading.py", line 1073, in _bootstrap_inner + self.run() + File "/usr/lib/python3.12/threading.py", line 1010, in run + self._target(*self._args, **self._kwargs) + File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/utils/data/_utils/pin_memory.py", line 61, in _pin_memory_loop + do_one_step() + File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/utils/data/_utils/pin_memory.py", line 37, in do_one_step + r = in_queue.get(timeout=MP_STATUS_CHECK_INTERVAL) + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + File "/usr/lib/python3.12/multiprocessing/queues.py", line 122, in get + return _ForkingPickler.loads(res) + ^^^^^^^^^^^^^^^^^^^^^^^^^^ + File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/multiprocessing/reductions.py", line 541, in rebuild_storage_fd + fd = df.detach() + ^^^^^^^^^^^ + File "/usr/lib/python3.12/multiprocessing/resource_sharer.py", line 57, in detach + with _resource_sharer.get_connection(self._id) as conn: + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + File "/usr/lib/python3.12/multiprocessing/resource_sharer.py", line 86, in get_connection + c = Client(address, authkey=process.current_process().authkey) + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + File "/usr/lib/python3.12/multiprocessing/connection.py", line 525, in Client + answer_challenge(c, authkey) + File "/usr/lib/python3.12/multiprocessing/connection.py", line 953, in answer_challenge + message = connection.recv_bytes(256) # reject large message + ^^^^^^^^^^^^^^^^^^^^^^^^^^ + File "/usr/lib/python3.12/multiprocessing/connection.py", line 216, in recv_bytes + buf = self._recv_bytes(maxlength) + ^^^^^^^^^^^^^^^^^^^^^^^^^^^ + File "/usr/lib/python3.12/multiprocessing/connection.py", line 430, in _recv_bytes + buf = self._recv(4) + ^^^^^^^^^^^^^ + File "/usr/lib/python3.12/multiprocessing/connection.py", line 399, in _recv + raise EOFError +EOFError +Exception in thread Thread-8 (_pin_memory_loop): +Traceback (most recent call last): + File "/usr/lib/python3.12/threading.py", line 1073, in _bootstrap_inner + self.run() + File "/usr/lib/python3.12/threading.py", line 1010, in run + self._target(*self._args, **self._kwargs) + File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/utils/data/_utils/pin_memory.py", line 61, in _pin_memory_loop + do_one_step() + File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/utils/data/_utils/pin_memory.py", line 37, in do_one_step + r = in_queue.get(timeout=MP_STATUS_CHECK_INTERVAL) + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + File "/usr/lib/python3.12/multiprocessing/queues.py", line 122, in get + return _ForkingPickler.loads(res) + ^^^^^^^^^^^^^^^^^^^^^^^^^^ + File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/multiprocessing/reductions.py", line 541, in rebuild_storage_fd + fd = df.detach() + ^^^^^^^^^^^ + File "/usr/lib/python3.12/multiprocessing/resource_sharer.py", line 57, in detach + with _resource_sharer.get_connection(self._id) as conn: + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + File "/usr/lib/python3.12/multiprocessing/resource_sharer.py", line 86, in get_connection + c = Client(address, authkey=process.current_process().authkey) + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + File "/usr/lib/python3.12/multiprocessing/connection.py", line 525, in Client + answer_challenge(c, authkey) + File "/usr/lib/python3.12/multiprocessing/connection.py", line 962, in answer_challenge + response = connection.recv_bytes(256) # reject large message + ^^^^^^^^^^^^^^^^^^^^^^^^^^ + File "/usr/lib/python3.12/multiprocessing/connection.py", line 216, in recv_bytes + buf = self._recv_bytes(maxlength) + ^^^^^^^^^^^^^^^^^^^^^^^^^^^ + File "/usr/lib/python3.12/multiprocessing/connection.py", line 430, in _recv_bytes + buf = self._recv(4) + ^^^^^^^^^^^^^ + File "/usr/lib/python3.12/multiprocessing/connection.py", line 399, in _recv + raise EOFError +EOFError +Exception in thread Thread-8 (_pin_memory_loop): +Traceback (most recent call last): + File "/usr/lib/python3.12/threading.py", line 1073, in _bootstrap_inner + self.run() + File "/usr/lib/python3.12/threading.py", line 1010, in run + self._target(*self._args, **self._kwargs) + File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/utils/data/_utils/pin_memory.py", line 61, in _pin_memory_loop + do_one_step() + File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/utils/data/_utils/pin_memory.py", line 37, in do_one_step + r = in_queue.get(timeout=MP_STATUS_CHECK_INTERVAL) + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + File "/usr/lib/python3.12/multiprocessing/queues.py", line 122, in get + return _ForkingPickler.loads(res) + ^^^^^^^^^^^^^^^^^^^^^^^^^^ + File "/home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/multiprocessing/reductions.py", line 541, in rebuild_storage_fd + fd = df.detach() + ^^^^^^^^^^^ + File "/usr/lib/python3.12/multiprocessing/resource_sharer.py", line 57, in detach + with _resource_sharer.get_connection(self._id) as conn: + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + File "/usr/lib/python3.12/multiprocessing/resource_sharer.py", line 86, in get_connection + c = Client(address, authkey=process.current_process().authkey) + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + File "/usr/lib/python3.12/multiprocessing/connection.py", line 525, in Client + answer_challenge(c, authkey) + File "/usr/lib/python3.12/multiprocessing/connection.py", line 962, in answer_challenge + response = connection.recv_bytes(256) # reject large message + ^^^^^^^^^^^^^^^^^^^^^^^^^^ + File "/usr/lib/python3.12/multiprocessing/connection.py", line 216, in recv_bytes + buf = self._recv_bytes(maxlength) + ^^^^^^^^^^^^^^^^^^^^^^^^^^^ + File "/usr/lib/python3.12/multiprocessing/connection.py", line 430, in _recv_bytes + buf = self._recv(4) + ^^^^^^^^^^^^^ + File "/usr/lib/python3.12/multiprocessing/connection.py", line 395, in _recv + chunk = read(handle, remaining) + ^^^^^^^^^^^^^^^^^^^^^^^ +ConnectionResetError: [Errno 104] Connection reset by peer +[2025-10-12 02:54:14,858] [DEBUG] [axolotl.utils.config.log_gpu_memory_usage:127] [PID:1386789] baseline 0.000GB () +[2025-10-12 02:54:14,858] [INFO] [axolotl.cli.config.load_cfg:248] [PID:1386789] config: +{ + "activation_offloading": false, + "auto_resume_from_checkpoints": true, + "axolotl_config_path": "train_350m_multitask.yaml", + "base_model": "/home/ubuntu/axolotl/out-350m-audio-pt", + "base_model_config": "/home/ubuntu/axolotl/out-350m-audio-pt", + "batch_size": 256, + "bf16": true, + "bfloat16": true, + "capabilities": { + "bf16": true, + "compute_capability": "sm_80", + "fp8": false, + "n_gpu": 8, + "n_node": 1 + }, + "context_parallel_size": 1, + "cut_cross_entropy": true, + "dataloader_num_workers": 8, + "dataloader_pin_memory": true, + "dataloader_prefetch_factor": 256, + "dataset_prepared_path": "/home/ubuntu/axolotl/preprocessed-data-350m-multitask-ft", + "dataset_processes": 240, + "datasets": [ + { + "ds_type": "json", + "message_property_mappings": { + "content": "content", + "role": "role" + }, + "path": "/home/ubuntu/axolotl/hackathon-train_data-s2s-jaen.jsonl", + "trust_remote_code": false + }, + { + "ds_type": "json", + "message_property_mappings": { + "content": "content", + "role": "role" + }, + "path": "/home/ubuntu/axolotl/hackathon-train_data-s2s-enja.jsonl", + "trust_remote_code": false + }, + { + "ds_type": "json", + "message_property_mappings": { + "content": "content", + "role": "role" + }, + "path": "/home/ubuntu/axolotl/hackathon-train_data-asr-ja.jsonl", + "trust_remote_code": false + }, + { + "ds_type": "json", + "message_property_mappings": { + "content": "content", + "role": "role" + }, + "path": "/home/ubuntu/axolotl/hackathon-train_data-tts-ja.jsonl", + "trust_remote_code": false + }, + { + "ds_type": "json", + "message_property_mappings": { + "content": "content", + "role": "role" + }, + "path": "/home/ubuntu/axolotl/hackathon-train_data-asr-en.jsonl", + "trust_remote_code": false + }, + { + "ds_type": "json", + "message_property_mappings": { + "content": "content", + "role": "role" + }, + "path": "/home/ubuntu/axolotl/hackathon-train_data-tts-en.jsonl", + "trust_remote_code": false + } + ], + "ddp": true, + "device": "cuda:0", + "device_map": { + "": 0 + }, + "dion_rank_fraction": 1.0, + "dion_rank_multiple_of": 1, + "env_capabilities": { + "torch_version": "2.8.0" + }, + "eval_batch_size": 8, + "eval_causal_lm_metrics": [ + "sacrebleu", + "comet", + "ter", + "chrf" + ], + "eval_max_new_tokens": 128, + "eval_sample_packing": false, + "eval_steps": 100, + "eval_strategy": "steps", + "eval_table_size": 0, + "experimental_skip_move_to_device": true, + "flash_attention": true, + "fp16": false, + "gradient_accumulation_steps": 1, + "gradient_checkpointing": true, + "gradient_checkpointing_kwargs": { + "use_reentrant": true + }, + "group_by_length": false, + "include_tkps": true, + "is_falcon_derived_model": false, + "is_llama_derived_model": false, + "is_mistral_derived_model": false, + "learning_rate": 5e-05, + "lisa_layers_attribute": "model.layers", + "load_best_model_at_end": false, + "load_in_4bit": false, + "load_in_8bit": false, + "local_rank": 0, + "logging_steps": 1, + "loraplus_lr_embedding": 1e-06, + "lr_scheduler": "cosine", + "max_grad_norm": 1.0, + "mean_resizing_embeddings": false, + "micro_batch_size": 32, + "model_config_type": "lfm2", + "num_epochs": 3.0, + "optimizer": "adamw_torch_fused", + "output_dir": "/home/ubuntu/axolotl/out-350m-multitask-ft", + "pad_to_sequence_len": true, + "plugins": [ + "axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin" + ], + "pretrain_multipack_attn": true, + "profiler_steps_start": 0, + "qlora_sharded_model_loading": false, + "ray_num_workers": 1, + "remove_unused_columns": false, + "resources_per_worker": { + "GPU": 1 + }, + "sample_packing": true, + "sample_packing_bin_size": 200, + "sample_packing_group_size": 100000, + "save_only_model": false, + "save_safetensors": true, + "save_steps": 100, + "save_strategy": "steps", + "sequence_len": 4096, + "shuffle_before_merging_datasets": false, + "shuffle_merged_datasets": true, + "skip_prepare_dataset": false, + "streaming_multipack_buffer_size": 10000, + "strict": false, + "tensor_parallel_size": 1, + "tf32": true, + "tiled_mlp_use_original_mlp": true, + "tokenizer_config": "/home/ubuntu/axolotl/out-350m-audio-pt", + "tokenizer_save_jinja_files": true, + "tokenizer_type": "AutoTokenizer", + "torch_dtype": "torch.bfloat16", + "train_on_inputs": false, + "trl": { + "log_completions": false, + "mask_truncated_completions": false, + "ref_model_mixup_alpha": 0.9, + "ref_model_sync_steps": 64, + "scale_rewards": true, + "sync_ref_model": false, + "use_vllm": false, + "vllm_server_host": "0.0.0.0", + "vllm_server_port": 8000 + }, + "type_of_model": "AutoModelForCausalLM", + "use_ray": false, + "use_wandb": true, + "val_set_size": 0.01, + "vllm": { + "device": "auto", + "dtype": "auto", + "gpu_memory_utilization": 0.9, + "host": "0.0.0.0", + "port": 8000 + }, + "wandb_entity": "aratako-lm", + "wandb_name": "350m-multitask-ft-run1", + "wandb_project": "liquidai-hackathon", + "warmup_ratio": 0.1, + "weight_decay": 0.01, + "world_size": 8 +} + Loading dataset from disk: 0%| | 0/240 [00:00 +[2025-10-12 02:55:31,480] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:279] [PID:1386789] BOS: 1 / <|startoftext|> +[2025-10-12 02:55:31,480] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:280] [PID:1386789] PAD: 0 / <|pad|> +[2025-10-12 02:55:31,480] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:281] [PID:1386789] UNK: None / None +[2025-10-12 02:55:31,482] [INFO] [axolotl.utils.data.shared.load_preprocessed_dataset:470] [PID:1386789] Loading prepared dataset from disk at /home/ubuntu/axolotl/preprocessed-data-350m-multitask-ft/28514821cb56568b4099ac280cc69eed... + Loading dataset from disk: 0%| | 0/240 [00:00 +[2025-10-12 02:57:13,462] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:279] [PID:1386789] BOS: 1 / <|startoftext|> +[2025-10-12 02:57:13,462] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:280] [PID:1386789] PAD: 0 / <|pad|> +[2025-10-12 02:57:13,463] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:281] [PID:1386789] UNK: None / None +[2025-10-12 02:57:13,463] [DEBUG] [axolotl.train.setup_model_and_tokenizer:74] [PID:1386789] Loading model +[2025-10-12 02:57:13,475] [DEBUG] [axolotl.monkeypatch.transformers.trainer_loss_calc.patch_evaluation_loop:87] [PID:1386789] Patched Trainer.evaluation_loop with nanmean loss calculation +[2025-10-12 02:57:13,476] [DEBUG] [axolotl.monkeypatch.transformers.trainer_loss_calc.patch_maybe_log_save_evaluate:138] [PID:1386789] Patched Trainer._maybe_log_save_evaluate with nanmean loss calculation +[2025-10-12 02:57:13,476] [INFO] [axolotl.loaders.patch_manager._apply_multipack_patches:301] [PID:1386789] Applying multipack dataloader patch for sample packing... +[2025-10-12 02:57:13,888] [INFO] [axolotl.integrations.cut_cross_entropy.pre_model_load:94] [PID:1386789] Applying Cut Cross Entropy to model type: lfm2 +[2025-10-12 02:57:14,225] [INFO] [axolotl.loaders.model._configure_embedding_dtypes:345] [PID:1386789] Converting modules to torch.bfloat16 +[2025-10-12 02:57:14,228] [DEBUG] [axolotl.loaders.model.log_gpu_memory_usage:127] [PID:1386789] Memory usage after model load 1.285GB (+1.285GB allocated, +1.303GB reserved) +[2025-10-12 02:57:17,783] [INFO] [axolotl.train.save_initial_configs:402] [PID:1386789] Pre-saving tokenizer to /home/ubuntu/axolotl/out-350m-multitask-ft... +[2025-10-12 02:57:18,497] [INFO] [axolotl.train.save_initial_configs:407] [PID:1386789] Pre-saving model config to /home/ubuntu/axolotl/out-350m-multitask-ft... +[2025-10-12 02:57:18,500] [INFO] [axolotl.train.execute_training:196] [PID:1386789] Starting trainer... +[2025-10-12 03:04:21,879] [WARNING] [py.warnings._showwarnmsg:110] [PID:1386790] /home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/distributed/distributed_c10d.py:4807: UserWarning: No device id is provided via `init_process_group` or `barrier `. Using the current device set by the user. + warnings.warn( # warn only once + +[2025-10-12 03:04:22,505] [WARNING] [py.warnings._showwarnmsg:110] [PID:1386795] /home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/distributed/distributed_c10d.py:4807: UserWarning: No device id is provided via `init_process_group` or `barrier `. Using the current device set by the user. + warnings.warn( # warn only once + +[2025-10-12 03:04:23,556] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:1386789] generate_batches time: 3.02742075920105 +[2025-10-12 03:04:28,898] [WARNING] [py.warnings._showwarnmsg:110] [PID:1386794] /home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/distributed/distributed_c10d.py:4807: UserWarning: No device id is provided via `init_process_group` or `barrier `. Using the current device set by the user. + warnings.warn( # warn only once + +[2025-10-12 03:04:50,921] [WARNING] [py.warnings._showwarnmsg:110] [PID:1386796] /home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/distributed/distributed_c10d.py:4807: UserWarning: No device id is provided via `init_process_group` or `barrier `. Using the current device set by the user. + warnings.warn( # warn only once + +[2025-10-12 03:05:22,805] [WARNING] [py.warnings._showwarnmsg:110] [PID:1386793] /home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/distributed/distributed_c10d.py:4807: UserWarning: No device id is provided via `init_process_group` or `barrier `. Using the current device set by the user. + warnings.warn( # warn only once + +[2025-10-12 03:08:17,541] [WARNING] [py.warnings._showwarnmsg:110] [PID:1386791] /home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/distributed/distributed_c10d.py:4807: UserWarning: No device id is provided via `init_process_group` or `barrier `. Using the current device set by the user. + warnings.warn( # warn only once + +[2025-10-12 03:08:50,888] [WARNING] [py.warnings._showwarnmsg:110] [PID:1386792] /home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/distributed/distributed_c10d.py:4807: UserWarning: No device id is provided via `init_process_group` or `barrier `. Using the current device set by the user. + warnings.warn( # warn only once + +[2025-10-12 03:08:50,889] [INFO] [axolotl.utils.samplers.multipack.calc_min_len:438] [PID:1386789] gather_len_batches: [7647, 7647, 7647, 7647, 7647, 7647, 7647, 7647] +[2025-10-12 03:08:50,891] [WARNING] [py.warnings._showwarnmsg:110] [PID:1386789] /home/ubuntu/axolotl/venv/lib/python3.12/site-packages/torch/distributed/distributed_c10d.py:4807: UserWarning: No device id is provided via `init_process_group` or `barrier `. Using the current device set by the user. + warnings.warn( # warn only once + +wandb: Currently logged in as: aratako1998 (aratako-lm) to https://api.wandb.ai. Use `wandb login --relogin` to force relogin +wandb: ⢿ Waiting for wandb.init()... + m wandb: Tracking run with wandb version 0.22.2 +wandb: Run data is saved locally in /home/ubuntu/axolotl/wandb/run-20251012_030851-agrcquxp +wandb: Run `wandb offline` to turn off syncing. +wandb: Syncing run 350m-multitask-ft-run1 +wandb: ⭐️ View project at https://wandb.ai/aratako-lm/liquidai-hackathon +wandb: 🚀 View run at https://wandb.ai/aratako-lm/liquidai-hackathon/runs/agrcquxp +wandb: Detected [huggingface_hub.inference] in use. +wandb: Use W&B Weave for improved LLM call tracing. Install Weave with `pip install weave` then add `import weave` to the top of your script. +wandb: For more information, check out the docs at: https://weave-docs.wandb.ai/ +wandb: WARNING Saving files without folders. If you want to preserve subdirectories pass base_path to wandb.save, i.e. wandb.save("/mnt/folder/file.h5", base_path="/mnt") +[2025-10-12 03:08:52,307] [INFO] [axolotl.utils.callbacks.on_train_begin:757] [PID:1386789] The Axolotl config has been saved to the WandB run under files. + 0%| | 0/2865 [00:00