diff --git "a/Order_Ablations/E115_smoke2gpu/train.rank0.log" "b/Order_Ablations/E115_smoke2gpu/train.rank0.log" new file mode 100644--- /dev/null +++ "b/Order_Ablations/E115_smoke2gpu/train.rank0.log" @@ -0,0 +1,338 @@ +/usr/local/lib/python3.12/dist-packages/torch/cuda/__init__.py:63: FutureWarning: The pynvml package is deprecated. Please install nvidia-ml-py instead. If you did not install pynvml directly, please report this to the maintainers of the package that installed pynvml for you. + import pynvml # type: ignore[import] +W0127 16:55:30.554000 179089 torch/distributed/run.py:803] +W0127 16:55:30.554000 179089 torch/distributed/run.py:803] ***************************************** +W0127 16:55:30.554000 179089 torch/distributed/run.py:803] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. +W0127 16:55:30.554000 179089 torch/distributed/run.py:803] ***************************************** +/usr/local/lib/python3.12/dist-packages/torch/cuda/__init__.py:63: FutureWarning: The pynvml package is deprecated. Please install nvidia-ml-py instead. If you did not install pynvml directly, please report this to the maintainers of the package that installed pynvml for you. + import pynvml # type: ignore[import] +/usr/local/lib/python3.12/dist-packages/torch/cuda/__init__.py:63: FutureWarning: The pynvml package is deprecated. Please install nvidia-ml-py instead. If you did not install pynvml directly, please report this to the maintainers of the package that installed pynvml for you. + import pynvml # type: ignore[import] +Trainer._get_train_sampler replaced with custom implementation.Trainer._get_train_sampler replaced with custom implementation. + +[2026-01-27 16:55:36,218] [INFO] [real_accelerator.py:222:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2026-01-27 16:55:36,233] [INFO] [real_accelerator.py:222:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2026-01-27 16:55:37,256] [INFO] [comm.py:658:init_distributed] cdb=None +[2026-01-27 16:55:37,433] [INFO] [comm.py:658:init_distributed] cdb=None +[2026-01-27 16:55:37,433] [INFO] [comm.py:689:init_distributed] Initializing TorchBackend in DeepSpeed with backend nccl +Warning: FlashAttention 3 is not available, falling back to PyTorch's scaled_dot_product_attention +Warning: FlashAttention 3 is not available, falling back to PyTorch's scaled_dot_product_attention +You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`. +You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`. 
+Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]
+[rank1]: Traceback (most recent call last):
+[rank1]:     train(attn_implementation="flash_attention_2")
+[rank1]:   File "/workspace/src/qwen_vl/train/train_qwen.py", line 217, in train
+[rank1]:     trainer.train()
+[rank1]:   File "/usr/local/lib/python3.12/dist-packages/transformers/trainer.py", line 2245, in train
+[rank1]:     return inner_training_loop(
+[rank1]:            ^^^^^^^^^^^^^^^^^^^^
+[rank1]:   File "/usr/local/lib/python3.12/dist-packages/transformers/trainer.py", line 2508, in _inner_training_loop
+[rank1]:     batch_samples, num_items_in_batch = self.get_batch_samples(epoch_iterator, num_batches, args.device)
+[rank1]:                                         ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+[rank1]:   File "/usr/local/lib/python3.12/dist-packages/transformers/trainer.py", line 5224, in get_batch_samples
+[rank1]:     batch_samples += [next(epoch_iterator)]
+[rank1]:                       ^^^^^^^^^^^^^^^^^^^^
+[rank1]:   File "/usr/local/lib/python3.12/dist-packages/accelerate/data_loader.py", line 575, in __iter__
+[rank1]:     next_batch = next(dataloader_iter)
+[rank1]:                  ^^^^^^^^^^^^^^^^^^^^^
+[rank1]:   File "/usr/local/lib/python3.12/dist-packages/torch/utils/data/dataloader.py", line 731, in __next__
+[rank1]:     data = self._next_data()
+[rank1]:            ^^^^^^^^^^^^^^^^^
+[rank1]:   File "/usr/local/lib/python3.12/dist-packages/torch/utils/data/dataloader.py", line 1502, in _next_data
+[rank1]:     return self._process_data(data, worker_id)
+[rank1]:            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+[rank1]:   File "/usr/local/lib/python3.12/dist-packages/torch/utils/data/dataloader.py", line 1537, in _process_data
+[rank1]:     data.reraise()
+[rank1]:   File "/usr/local/lib/python3.12/dist-packages/torch/_utils.py", line 769, in reraise
+[rank1]:     raise exception
+[rank1]: FileNotFoundError: Caught FileNotFoundError in DataLoader worker process 1.
+[rank1]: Original Traceback (most recent call last):
+[rank1]:   File "/usr/local/lib/python3.12/dist-packages/torch/utils/data/_utils/worker.py", line 349, in _worker_loop
+[rank1]:     data = fetcher.fetch(index)  # type: ignore[possibly-undefined]
+[rank1]:            ^^^^^^^^^^^^^^^^^^^^
+[rank1]:   File "/usr/local/lib/python3.12/dist-packages/torch/utils/data/_utils/fetch.py", line 52, in fetch
+[rank1]:     data = [self.dataset[idx] for idx in possibly_batched_index]
+[rank1]:             ~~~~~~~~~~~~^^^^^
+[rank1]:   File "/workspace/src/qwen_vl/data/data_qwen.py", line 336, in __getitem__
+[rank1]:     raise e
+[rank1]:   File "/workspace/src/qwen_vl/data/data_qwen.py", line 333, in __getitem__
+[rank1]:     sample = self._get_item(i)
+[rank1]:              ^^^^^^^^^^^^^^^^^
+[rank1]:   File "/workspace/src/qwen_vl/data/data_qwen.py", line 384, in _get_item
+[rank1]:     sources[0]["images"] = self.read_video_images(sources[0])
+[rank1]:                            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+[rank1]:   File "/workspace/src/qwen_vl/data/data_qwen.py", line 344, in read_video_images
+[rank1]:     raise FileNotFoundError
+[rank1]: FileNotFoundError
+
+W0127 16:57:22.373000 179089 torch/distributed/elastic/multiprocessing/api.py:906] Sending process 179158 closing signal SIGTERM
+E0127 16:57:22.387000 179089 torch/distributed/elastic/multiprocessing/api.py:880] failed (exitcode: 1) local_rank: 1 (pid: 179159) of binary: /usr/bin/python
+Traceback (most recent call last):
+  File "/usr/local/bin/torchrun", line 7, in <module>
+    sys.exit(main())
+             ^^^^^^
+  File "/usr/local/lib/python3.12/dist-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 357, in wrapper
+    return f(*args, **kwargs)
+           ^^^^^^^^^^^^^^^^^^
+  File "/usr/local/lib/python3.12/dist-packages/torch/distributed/run.py", line 936, in main
+    run(args)
+  File "/usr/local/lib/python3.12/dist-packages/torch/distributed/run.py", line 927, in run
+    elastic_launch(
+  File "/usr/local/lib/python3.12/dist-packages/torch/distributed/launcher/api.py", line 151, in __call__
+    return launch_agent(self._config, self._entrypoint, list(args))
+           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+  File "/usr/local/lib/python3.12/dist-packages/torch/distributed/launcher/api.py", line 288, in launch_agent
+    raise ChildFailedError(
+torch.distributed.elastic.multiprocessing.errors.ChildFailedError:
+============================================================
+src/qwen_vl/train/train_qwen.py FAILED
+------------------------------------------------------------
+Failures:
+  <NO_OTHER_FAILURES>
+------------------------------------------------------------
+Root Cause (first observed failure):
+[0]:
+  time      : 2026-01-27_16:57:22
+  host      : 1f486d0aa8fd
+  rank      : 1 (local_rank: 1)
+  exitcode  : 1 (pid: 179159)
+  error_file: <N/A>
+  traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
+============================================================
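
Root-cause note: the worker traceback ends in `read_video_images` (`src/qwen_vl/data/data_qwen.py:344`) raising a bare `FileNotFoundError` for a missing video, which `__getitem__` re-raises (line 336); that kills DataLoader worker 1 and, via torchrun, the whole 2-GPU smoke run. A common mitigation for smoke tests is to resample another index instead of re-raising. Below is a minimal sketch of that pattern, not the repo's actual code: the class name `VideoSFTDataset`, the `records` list with a `"video"` path per sample, and the retry bound are all hypothetical.

```python
import os
import random

from torch.utils.data import Dataset


class VideoSFTDataset(Dataset):
    """Hypothetical stand-in for the dataset class in src/qwen_vl/data/data_qwen.py."""

    def __init__(self, records, max_retries=10):
        self.records = records          # assumed: each record carries a "video" path
        self.max_retries = max_retries  # retry bound is an assumption, not repo behavior

    def __len__(self):
        return len(self.records)

    def _get_item(self, i):
        record = self.records[i]
        if not os.path.exists(record["video"]):
            # Mirrors read_video_images raising FileNotFoundError (data_qwen.py:344).
            raise FileNotFoundError(record["video"])
        return record

    def __getitem__(self, i):
        # The logged run re-raises (data_qwen.py:336), killing the DataLoader worker.
        # Resampling a different index keeps the run alive and logs the bad path.
        for _ in range(self.max_retries):
            try:
                return self._get_item(i)
            except FileNotFoundError as e:
                print(f"[dataset] missing video for index {i}: {e}; resampling")
                i = random.randrange(len(self))
        raise FileNotFoundError(f"too many consecutive missing videos near index {i}")
```

Resampling silently changes the effective data distribution, so for a real training run it is safer to validate all video paths up front and fail fast; for a short smoke run the fallback simply keeps the job from dying on a single bad sample.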