| nohup: ignoring input |
| W1124 10:39:29.690000 58030 site-packages/torch/distributed/run.py:793] |
| W1124 10:39:29.690000 58030 site-packages/torch/distributed/run.py:793] ***************************************** |
| W1124 10:39:29.690000 58030 site-packages/torch/distributed/run.py:793] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. |
| W1124 10:39:29.690000 58030 site-packages/torch/distributed/run.py:793] ***************************************** |
| [NOTICE] The application is pending for GPU resource in asynchronous queue. The longest waiting time in queue is 1800 seconds. |
| [NOTICE] The application is pending for GPU resource in asynchronous queue. The longest waiting time in queue is 1800 seconds. |
| [NOTICE] The application is pending for GPU resource in asynchronous queue. The longest waiting time in queue is 1800 seconds. |
| [NOTICE] The application is pending for GPU resource in asynchronous queue. The longest waiting time in queue is 1800 seconds. |
| Starting rank=0, seed=0, world_size=4. |
| [[34m2025-11-24 10:39:48[0m] Experiment directory created at results/005-SiT-XL-2-Linear-velocity-None |
| [[34m2025-11-24 10:39:48[0m] Sample images will be saved to results/005-SiT-XL-2-Linear-velocity-None/pic |
| Starting rank=2, seed=2, world_size=4. |
| Starting rank=1, seed=1, world_size=4. |
| Starting rank=3, seed=3, world_size=4. |
| [[34m2025-11-24 10:40:02[0m] SiT Parameters: 675,129,632 |
| [[34m2025-11-24 10:40:04[0m] Dataset contains 1,281,167 images (/gemini/platform/public/hzh/datasets/Imagenet/train/) |
| [[34m2025-11-24 10:40:04[0m] Training for 140000 epochs... |
| [[34m2025-11-24 10:40:04[0m] Beginning epoch 0... |
| [[34m2025-11-24 10:40:24[0m] Saved checkpoint to results/005-SiT-XL-2-Linear-velocity-None/checkpoints/0000010.pt |
| [[34m2025-11-24 10:40:24[0m] Generating EMA samples... |
| [[34m2025-11-24 10:40:25[0m] Saved sample images grid to results/005-SiT-XL-2-Linear-velocity-None/pic/step_0000010_samples_grid.png |
| [[34m2025-11-24 10:40:25[0m] Generating EMA samples done. |
| W1124 10:40:39.173000 58030 site-packages/torch/distributed/elastic/agent/server/api.py:704] Received 2 death signal, shutting down workers |
| W1124 10:40:39.173000 58030 site-packages/torch/distributed/elastic/multiprocessing/api.py:897] Sending process 58079 closing signal SIGINT |
| W1124 10:40:39.174000 58030 site-packages/torch/distributed/elastic/multiprocessing/api.py:897] Sending process 58080 closing signal SIGINT |
| W1124 10:40:39.174000 58030 site-packages/torch/distributed/elastic/multiprocessing/api.py:897] Sending process 58081 closing signal SIGINT |
| W1124 10:40:39.174000 58030 site-packages/torch/distributed/elastic/multiprocessing/api.py:897] Sending process 58082 closing signal SIGINT |
| [rank0]: Traceback (most recent call last): |
| [rank0]: File "/gemini/space/gzy_new/Noise_Matching/SiT_clean/train.py", line 371, in <module> |
| [rank0]: main(args) |
| [rank0]: File "/gemini/space/gzy_new/Noise_Matching/SiT_clean/train.py", line 298, in main |
| [rank0]: torch.save(checkpoint, checkpoint_path) |
| [rank0]: File "/opt/conda/envs/SiT/lib/python3.12/site-packages/torch/serialization.py", line 850, in save |
| [rank0]: _save( |
| [rank0]: File "/opt/conda/envs/SiT/lib/python3.12/site-packages/torch/serialization.py", line 1114, in _save |
| [rank0]: zip_file.write_record(name, storage, num_bytes) |
| [rank0]: KeyboardInterrupt |
| W1124 10:40:39.380000 58030 site-packages/torch/distributed/elastic/multiprocessing/api.py:897] Sending process 58079 closing signal SIGTERM |
| W1124 10:40:39.380000 58030 site-packages/torch/distributed/elastic/multiprocessing/api.py:897] Sending process 58080 closing signal SIGTERM |
| W1124 10:40:39.381000 58030 site-packages/torch/distributed/elastic/multiprocessing/api.py:897] Sending process 58081 closing signal SIGTERM |
| W1124 10:40:39.381000 58030 site-packages/torch/distributed/elastic/multiprocessing/api.py:897] Sending process 58082 closing signal SIGTERM |
| Traceback (most recent call last): |
| File "/opt/conda/envs/SiT/lib/python3.12/site-packages/torch/distributed/elastic/agent/server/api.py", line 696, in run |
| result = self._invoke_run(role) |
| ^^^^^^^^^^^^^^^^^^^^^^ |
| File "/opt/conda/envs/SiT/lib/python3.12/site-packages/torch/distributed/elastic/agent/server/api.py", line 855, in _invoke_run |
| time.sleep(monitor_interval) |
| File "/opt/conda/envs/SiT/lib/python3.12/site-packages/torch/distributed/elastic/multiprocessing/api.py", line 84, in _terminate_process_handler |
| raise SignalException(f"Process {os.getpid()} got signal: {sigval}", sigval=sigval) |
| torch.distributed.elastic.multiprocessing.api.SignalException: Process 58030 got signal: 2 |
|
|
| During handling of the above exception, another exception occurred: |
|
|
| Traceback (most recent call last): |
| File "/opt/conda/envs/SiT/lib/python3.12/site-packages/torch/distributed/elastic/agent/server/api.py", line 705, in run |
| self._shutdown(e.sigval) |
| File "/opt/conda/envs/SiT/lib/python3.12/site-packages/torch/distributed/elastic/agent/server/local_elastic_agent.py", line 365, in _shutdown |
| self._pcontext.close(death_sig) |
| File "/opt/conda/envs/SiT/lib/python3.12/site-packages/torch/distributed/elastic/multiprocessing/api.py", line 572, in close |
| self._close(death_sig=death_sig, timeout=timeout) |
| File "/opt/conda/envs/SiT/lib/python3.12/site-packages/torch/distributed/elastic/multiprocessing/api.py", line 909, in _close |
| handler.proc.wait(time_to_wait) |
| File "/opt/conda/envs/SiT/lib/python3.12/subprocess.py", line 1266, in wait |
| return self._wait(timeout=timeout) |
| ^^^^^^^^^^^^^^^^^^^^^^^^^^^ |
| File "/opt/conda/envs/SiT/lib/python3.12/subprocess.py", line 2055, in _wait |
| time.sleep(delay) |
| File "/opt/conda/envs/SiT/lib/python3.12/site-packages/torch/distributed/elastic/multiprocessing/api.py", line 84, in _terminate_process_handler |
| raise SignalException(f"Process {os.getpid()} got signal: {sigval}", sigval=sigval) |
| torch.distributed.elastic.multiprocessing.api.SignalException: Process 58030 got signal: 2 |
|
|
| During handling of the above exception, another exception occurred: |
|
|
| Traceback (most recent call last): |
| File "/opt/conda/envs/SiT/bin/torchrun", line 33, in <module> |
| sys.exit(load_entry_point('torch==2.5.1', 'console_scripts', 'torchrun')()) |
| ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ |
| File "/opt/conda/envs/SiT/lib/python3.12/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 355, in wrapper |
| return f(*args, **kwargs) |
| ^^^^^^^^^^^^^^^^^^ |
| File "/opt/conda/envs/SiT/lib/python3.12/site-packages/torch/distributed/run.py", line 919, in main |
| run(args) |
| File "/opt/conda/envs/SiT/lib/python3.12/site-packages/torch/distributed/run.py", line 910, in run |
| elastic_launch( |
| File "/opt/conda/envs/SiT/lib/python3.12/site-packages/torch/distributed/launcher/api.py", line 138, in __call__ |
| return launch_agent(self._config, self._entrypoint, list(args)) |
| ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ |
| File "/opt/conda/envs/SiT/lib/python3.12/site-packages/torch/distributed/launcher/api.py", line 260, in launch_agent |
| result = agent.run() |
| ^^^^^^^^^^^ |
| File "/opt/conda/envs/SiT/lib/python3.12/site-packages/torch/distributed/elastic/metrics/api.py", line 137, in wrapper |
| result = f(*args, **kwargs) |
| ^^^^^^^^^^^^^^^^^^ |
| File "/opt/conda/envs/SiT/lib/python3.12/site-packages/torch/distributed/elastic/agent/server/api.py", line 710, in run |
| self._shutdown() |
| File "/opt/conda/envs/SiT/lib/python3.12/site-packages/torch/distributed/elastic/agent/server/local_elastic_agent.py", line 365, in _shutdown |
| self._pcontext.close(death_sig) |
| File "/opt/conda/envs/SiT/lib/python3.12/site-packages/torch/distributed/elastic/multiprocessing/api.py", line 572, in close |
| self._close(death_sig=death_sig, timeout=timeout) |
| File "/opt/conda/envs/SiT/lib/python3.12/site-packages/torch/distributed/elastic/multiprocessing/api.py", line 909, in _close |
| handler.proc.wait(time_to_wait) |
| File "/opt/conda/envs/SiT/lib/python3.12/subprocess.py", line 1266, in wait |
| return self._wait(timeout=timeout) |
| ^^^^^^^^^^^^^^^^^^^^^^^^^^^ |
| File "/opt/conda/envs/SiT/lib/python3.12/subprocess.py", line 2055, in _wait |
| time.sleep(delay) |
| File "/opt/conda/envs/SiT/lib/python3.12/site-packages/torch/distributed/elastic/multiprocessing/api.py", line 84, in _terminate_process_handler |
| raise SignalException(f"Process {os.getpid()} got signal: {sigval}", sigval=sigval) |
| torch.distributed.elastic.multiprocessing.api.SignalException: Process 58030 got signal: 2 |
|
|