| WARNING:torch.distributed.run: | |
| ***************************************** | |
| Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. | |
| ***************************************** | |
| WARNING:torch.distributed.run: | |
| ***************************************** | |
| Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. | |
| ***************************************** | |
| [W socket.cpp:426] [c10d] The server socket has failed to bind to [::]:12345 (errno: 98 - Address already in use). | |
| [W socket.cpp:426] [c10d] The server socket has failed to bind to 0.0.0.0:12345 (errno: 98 - Address already in use). | |
| [E socket.cpp:462] [c10d] The server socket has failed to listen on any local network address. | |
| Traceback (most recent call last): | |
| File "/scratch/yuqian_fu/micromamba/envs/auto-gfmiyxa3evbd/bin/torchrun", line 8, in <module> | |
| sys.exit(main()) | |
| ^^^^^^ | |
| File "/scratch/yuqian_fu/micromamba/envs/auto-gfmiyxa3evbd/lib/python3.11/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 346, in wrapper | |
| return f(*args, **kwargs) | |
| ^^^^^^^^^^^^^^^^^^ | |
| File "/scratch/yuqian_fu/micromamba/envs/auto-gfmiyxa3evbd/lib/python3.11/site-packages/torch/distributed/run.py", line 794, in main | |
| run(args) | |
| File "/scratch/yuqian_fu/micromamba/envs/auto-gfmiyxa3evbd/lib/python3.11/site-packages/torch/distributed/run.py", line 785, in run | |
| elastic_launch( | |
| File "/scratch/yuqian_fu/micromamba/envs/auto-gfmiyxa3evbd/lib/python3.11/site-packages/torch/distributed/launcher/api.py", line 134, in __call__ | |
| return launch_agent(self._config, self._entrypoint, list(args)) | |
| ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | |
| File "/scratch/yuqian_fu/micromamba/envs/auto-gfmiyxa3evbd/lib/python3.11/site-packages/torch/distributed/launcher/api.py", line 241, in launch_agent | |
| result = agent.run() | |
| ^^^^^^^^^^^ | |
| File "/scratch/yuqian_fu/micromamba/envs/auto-gfmiyxa3evbd/lib/python3.11/site-packages/torch/distributed/elastic/metrics/api.py", line 129, in wrapper | |
| result = f(*args, **kwargs) | |
| ^^^^^^^^^^^^^^^^^^ | |
| File "/scratch/yuqian_fu/micromamba/envs/auto-gfmiyxa3evbd/lib/python3.11/site-packages/torch/distributed/elastic/agent/server/api.py", line 723, in run | |
| result = self._invoke_run(role) | |
| ^^^^^^^^^^^^^^^^^^^^^^ | |
| File "/scratch/yuqian_fu/micromamba/envs/auto-gfmiyxa3evbd/lib/python3.11/site-packages/torch/distributed/elastic/agent/server/api.py", line 858, in _invoke_run | |
| self._initialize_workers(self._worker_group) | |
| File "/scratch/yuqian_fu/micromamba/envs/auto-gfmiyxa3evbd/lib/python3.11/site-packages/torch/distributed/elastic/metrics/api.py", line 129, in wrapper | |
| result = f(*args, **kwargs) | |
| ^^^^^^^^^^^^^^^^^^ | |
| File "/scratch/yuqian_fu/micromamba/envs/auto-gfmiyxa3evbd/lib/python3.11/site-packages/torch/distributed/elastic/agent/server/api.py", line 692, in _initialize_workers | |
| self._rendezvous(worker_group) | |
| File "/scratch/yuqian_fu/micromamba/envs/auto-gfmiyxa3evbd/lib/python3.11/site-packages/torch/distributed/elastic/metrics/api.py", line 129, in wrapper | |
| result = f(*args, **kwargs) | |
| ^^^^^^^^^^^^^^^^^^ | |
| File "/scratch/yuqian_fu/micromamba/envs/auto-gfmiyxa3evbd/lib/python3.11/site-packages/torch/distributed/elastic/agent/server/api.py", line 546, in _rendezvous | |
| store, group_rank, group_world_size = spec.rdzv_handler.next_rendezvous() | |
| ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | |
| File "/scratch/yuqian_fu/micromamba/envs/auto-gfmiyxa3evbd/lib/python3.11/site-packages/torch/distributed/elastic/rendezvous/static_tcp_rendezvous.py", line 55, in next_rendezvous | |
| self._store = TCPStore( # type: ignore[call-arg] | |
| ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | |
| RuntimeError: The server socket has failed to listen on any local network address. The server socket has failed to bind to [::]:12345 (errno: 98 - Address already in use). The server socket has failed to bind to 0.0.0.0:12345 (errno: 98 - Address already in use). | |
| srun: error: gcpl4-eu-0: task 0: Exited with exit code 1 | |
| 2024-09-04 08:00:56,890 INFO Namespace(n_epoch=250, lr_schedule=[50], lr=0.0002, gpu='0', out_dir='/data/work-gcp-europe-west4-a/yuqian_fu/Ego/checkpoints/egoexo_v2_480x480', train_dir=['/data/work-gcp-europe-west4-a/yuqian_fu/Ego/data_segswap'], prob_dir=[0.5, 0.5], batch_pos=32, batch_neg=15, feat_pth='../evalBrueghel/Moco_resnet50_feat_1Scale_640p.pkl', warp_mask=False, warmUpIter=1000, resume_pth='/data/work-gcp-europe-west4-a/yuqian_fu/Ego/checkpoints/egoexo_v2_480x480/netLast.pth', resume_epoch=0, mode='small', pos_weight=0.1, feat_weight=1, dropout=0.1, activation='relu', prob_style=0.5, layer_type=['I', 'C', 'I', 'C', 'I', 'N'], drop_feat=0.1, tps_grid=[4, 6], eta_corr=8, iter_epoch=1000, iter_epoch_val=100, weight_decay=0, reverse=False) | |
| 2024-09-04 08:00:56,890 INFO Load MocoV2 pre-trained ResNet-50 feature... | |
| 2024-09-04 08:00:56,892 INFO Namespace(n_epoch=250, lr_schedule=[50], lr=0.0002, gpu='0', out_dir='/data/work-gcp-europe-west4-a/yuqian_fu/Ego/checkpoints/egoexo_v2_480x480', train_dir=['/data/work-gcp-europe-west4-a/yuqian_fu/Ego/data_segswap'], prob_dir=[0.5, 0.5], batch_pos=32, batch_neg=15, feat_pth='../evalBrueghel/Moco_resnet50_feat_1Scale_640p.pkl', warp_mask=False, warmUpIter=1000, resume_pth='/data/work-gcp-europe-west4-a/yuqian_fu/Ego/checkpoints/egoexo_v2_480x480/netLast.pth', resume_epoch=0, mode='small', pos_weight=0.1, feat_weight=1, dropout=0.1, activation='relu', prob_style=0.5, layer_type=['I', 'C', 'I', 'C', 'I', 'N'], drop_feat=0.1, tps_grid=[4, 6], eta_corr=8, iter_epoch=1000, iter_epoch_val=100, weight_decay=0, reverse=False) | |
| 2024-09-04 08:00:56,892 INFO Load MocoV2 pre-trained ResNet-50 feature... | |
| LOADING: train_egoexo_pairs.json | |
| LOADING: train_egoexo_pairs.json | |
| LOADING: LOADING: val_egoexo_pairs.jsonval_egoexo_pairs.json | |
| Traceback (most recent call last): | |
| Traceback (most recent call last): | |
| File "/home/yuqian_fu/Projects/ego-exo4d-relation/correspondence/SegSwap/train/Main_dis.py", line 186, in <module> | |
| File "/home/yuqian_fu/Projects/ego-exo4d-relation/correspondence/SegSwap/train/Main_dis.py", line 186, in <module> | |
| trainLoader.sampler = DistributedSampler(trainLoader.dataset, num_replicas=world_size, rank=rank) | |
| ^^^^^^^^^^^^^^^^^^ | |
| NameError: name 'DistributedSampler' is not defined | |
| trainLoader.sampler = DistributedSampler(trainLoader.dataset, num_replicas=world_size, rank=rank) | |
| ^^^^^^^^^^^^^^^^^^ | |
| NameError: name 'DistributedSampler' is not defined | |
| ERROR:torch.distributed.elastic.multiprocessing.api:failed (exitcode: 1) local_rank: 0 (pid: 766624) of binary: /scratch/yuqian_fu/micromamba/envs/auto-gfmiyxa3evbd/bin/python3.11 | |
| Traceback (most recent call last): | |
| File "/scratch/yuqian_fu/micromamba/envs/auto-gfmiyxa3evbd/bin/torchrun", line 8, in <module> | |
| sys.exit(main()) | |
| ^^^^^^ | |
| File "/scratch/yuqian_fu/micromamba/envs/auto-gfmiyxa3evbd/lib/python3.11/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 346, in wrapper | |
| return f(*args, **kwargs) | |
| ^^^^^^^^^^^^^^^^^^ | |
| File "/scratch/yuqian_fu/micromamba/envs/auto-gfmiyxa3evbd/lib/python3.11/site-packages/torch/distributed/run.py", line 794, in main | |
| run(args) | |
| File "/scratch/yuqian_fu/micromamba/envs/auto-gfmiyxa3evbd/lib/python3.11/site-packages/torch/distributed/run.py", line 785, in run | |
| elastic_launch( | |
| File "/scratch/yuqian_fu/micromamba/envs/auto-gfmiyxa3evbd/lib/python3.11/site-packages/torch/distributed/launcher/api.py", line 134, in __call__ | |
| return launch_agent(self._config, self._entrypoint, list(args)) | |
| ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | |
| File "/scratch/yuqian_fu/micromamba/envs/auto-gfmiyxa3evbd/lib/python3.11/site-packages/torch/distributed/launcher/api.py", line 250, in launch_agent | |
| raise ChildFailedError( | |
| torch.distributed.elastic.multiprocessing.errors.ChildFailedError: | |
| ============================================================ | |
| /home/yuqian_fu/Projects/ego-exo4d-relation/correspondence/SegSwap/train/Main_dis.py FAILED | |
| ------------------------------------------------------------ | |
| Failures: | |
| [1]: | |
| time : 2024-09-04_08:02:09 | |
| host : gcpl4-eu-0.slurm.insait.ai | |
| rank : 1 (local_rank: 1) | |
| exitcode : 1 (pid: 766625) | |
| error_file: <N/A> | |
| traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html | |
| ------------------------------------------------------------ | |
| Root Cause (first observed failure): | |
| [0]: | |
| time : 2024-09-04_08:02:09 | |
| host : gcpl4-eu-0.slurm.insait.ai | |
| rank : 0 (local_rank: 0) | |
| exitcode : 1 (pid: 766624) | |
| error_file: <N/A> | |
| traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html | |
| ============================================================ | |
| srun: error: gcpl4-eu-0: task 1: Exited with exit code 1 | |