diff --git "a/run_sd3_rectified_sampling.log" "b/run_sd3_rectified_sampling.log" new file mode 100644--- /dev/null +++ "b/run_sd3_rectified_sampling.log" @@ -0,0 +1,185 @@ +nohup: ignoring input +W0325 13:36:09.486000 7088 site-packages/torch/distributed/run.py:793] +W0325 13:36:09.486000 7088 site-packages/torch/distributed/run.py:793] ***************************************** +W0325 13:36:09.486000 7088 site-packages/torch/distributed/run.py:793] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. +W0325 13:36:09.486000 7088 site-packages/torch/distributed/run.py:793] ***************************************** +/root/miniconda3/envs/SiT/lib/python3.10/site-packages/transformers/utils/hub.py:111: FutureWarning: Using `TRANSFORMERS_CACHE` is deprecated and will be removed in v5 of Transformers. Use `HF_HOME` instead. + warnings.warn( +/root/miniconda3/envs/SiT/lib/python3.10/site-packages/transformers/utils/hub.py:111: FutureWarning: Using `TRANSFORMERS_CACHE` is deprecated and will be removed in v5 of Transformers. Use `HF_HOME` instead. + warnings.warn( +/root/miniconda3/envs/SiT/lib/python3.10/site-packages/transformers/utils/hub.py:111: FutureWarning: Using `TRANSFORMERS_CACHE` is deprecated and will be removed in v5 of Transformers. Use `HF_HOME` instead. + warnings.warn( +/root/miniconda3/envs/SiT/lib/python3.10/site-packages/transformers/utils/hub.py:111: FutureWarning: Using `TRANSFORMERS_CACHE` is deprecated and will be removed in v5 of Transformers. Use `HF_HOME` instead. + warnings.warn( +[NOTICE] The application is pending for GPU resource in asynchronous queue. The longest waiting time in queue is 1800 seconds. +[NOTICE] The application is pending for GPU resource in asynchronous queue. The longest waiting time in queue is 1800 seconds. +[NOTICE] The application is pending for GPU resource in asynchronous queue. The longest waiting time in queue is 1800 seconds. +[NOTICE] The application is pending for GPU resource in asynchronous queue. The longest waiting time in queue is 1800 seconds. +[rank0] DDP initialized, device=0, seed=168, world_size=4 +================================================================================ +参数检查: + lora_path: /gemini/space/gzy_new/models/Sida/sd3-lora-finetuned-batch-4/checkpoint-500000 + rectified_weights: /gemini/space/gzy_new/models/Sida/rectified-noise-batch-2/checkpoint-220000/sit_weights + lora_path is None: False + lora_path is empty: False + rectified_weights is None: False + rectified_weights is empty: False +================================================================================ +[rank1] DDP initialized, device=1, seed=169, world_size=4 +[rank2] DDP initialized, device=2, seed=170, world_size=4 +[rank3] DDP initialized, device=3, seed=171, world_size=4 +Loading SD3 pipeline from /gemini/space/hsd/project/pretrained_model/huggingface/hub/models--stabilityai--stable-diffusion-3-medium-diffusers/snapshots/ea42f8cef0f178587cf766dc8129abd379c90671 (dtype=torch.float16) + Loading pipeline components...: 0%| | 0/9 [00:00) +No LoRA keys associated to CLIPTextModelWithProjection found with the prefix='text_encoder'. This is safe to ignore if LoRA state dict didn't originally have any CLIPTextModelWithProjection related params. You can also try specifying `prefix=None` to resolve the warning. Otherwise, open an issue if you think it's unexpected: https://github.com/huggingface/diffusers/issues/new +[rank3] Pipeline loaded and moved to device 3 +No LoRA keys associated to CLIPTextModelWithProjection found with the prefix='text_encoder'. This is safe to ignore if LoRA state dict didn't originally have any CLIPTextModelWithProjection related params. You can also try specifying `prefix=None` to resolve the warning. Otherwise, open an issue if you think it's unexpected: https://github.com/huggingface/diffusers/issues/new +No LoRA keys associated to CLIPTextModelWithProjection found with the prefix='text_encoder_2'. This is safe to ignore if LoRA state dict didn't originally have any CLIPTextModelWithProjection related params. You can also try specifying `prefix=None` to resolve the warning. Otherwise, open an issue if you think it's unexpected: https://github.com/huggingface/diffusers/issues/new +No LoRA keys associated to CLIPTextModelWithProjection found with the prefix='text_encoder_2'. This is safe to ignore if LoRA state dict didn't originally have any CLIPTextModelWithProjection related params. You can also try specifying `prefix=None` to resolve the warning. Otherwise, open an issue if you think it's unexpected: https://github.com/huggingface/diffusers/issues/new +[rank1] RectifiedNoiseModule configuration: num_sit_layers=1 +Sample transformer param after LoRA (first 5 values): tensor([ 0.0112, -0.0003, -0.0053, 0.0274, 0.0178], device='cuda:0', + dtype=torch.float16) +Max parameter change after LoRA loading: 0.0 +⚠️ WARNING: LoRA weights may not have been applied (parameter change is very small) +✓ PEFT config found: ['default_0'] +LoRA loaded successfully from standard format. +================================================================================ +LoRA 加载验证: + ✓ PEFT config exists: ['default_0'] + ✓ Found LoRA layer: transformer_blocks.0.attn.to_q.lora_A.default_0, weight_sum=1422.000000 + ✓ Total LoRA layers found: 384 +================================================================================ +use_rectified: True, rectified_weights_path: /gemini/space/gzy_new/models/Sida/rectified-noise-batch-2/checkpoint-220000/sit_weights +Using Rectified Noise module with weights from: /gemini/space/gzy_new/models/Sida/rectified-noise-batch-2/checkpoint-220000/sit_weights +[rank0] RectifiedNoiseModule configuration: num_sit_layers=1 +[rank2] Pipeline loaded and moved to device 2 +No LoRA keys associated to CLIPTextModelWithProjection found with the prefix='text_encoder'. This is safe to ignore if LoRA state dict didn't originally have any CLIPTextModelWithProjection related params. You can also try specifying `prefix=None` to resolve the warning. Otherwise, open an issue if you think it's unexpected: https://github.com/huggingface/diffusers/issues/new +No LoRA keys associated to CLIPTextModelWithProjection found with the prefix='text_encoder'. This is safe to ignore if LoRA state dict didn't originally have any CLIPTextModelWithProjection related params. You can also try specifying `prefix=None` to resolve the warning. Otherwise, open an issue if you think it's unexpected: https://github.com/huggingface/diffusers/issues/new +No LoRA keys associated to CLIPTextModelWithProjection found with the prefix='text_encoder_2'. This is safe to ignore if LoRA state dict didn't originally have any CLIPTextModelWithProjection related params. You can also try specifying `prefix=None` to resolve the warning. Otherwise, open an issue if you think it's unexpected: https://github.com/huggingface/diffusers/issues/new +No LoRA keys associated to CLIPTextModelWithProjection found with the prefix='text_encoder_2'. This is safe to ignore if LoRA state dict didn't originally have any CLIPTextModelWithProjection related params. You can also try specifying `prefix=None` to resolve the warning. Otherwise, open an issue if you think it's unexpected: https://github.com/huggingface/diffusers/issues/new +[rank3] RectifiedNoiseModule configuration: num_sit_layers=1 +[rank2] RectifiedNoiseModule configuration: num_sit_layers=1 +Loading rectified weights from: /gemini/space/gzy_new/models/Sida/rectified-noise-batch-2/checkpoint-220000/sit_weights/pytorch_sit_weights.safetensors + Loaded rectified weights: 22 keys +✓ Successfully loaded rectified noise weights +Creating SD3WithRectifiedNoise with LoRA-enabled transformer... +[rank1]:[W325 13:36:38.278730735 ProcessGroupNCCL.cpp:4115] [PG ID 0 PG GUID 0 Rank 1] using GPU 1 to perform barrier as devices used by this process are currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect.Specify device_ids in barrier() to force use of a particular device,or call init_process_group() with a device_id. +LoRA after SD3WithRectifiedNoise wrapping: + LoRA layers: 384, Non-zero: 192 + ✓ LoRA weights preserved after wrapping + Active adapters: ['default_0'] +✓ Verified LoRA weight in transformer_blocks.0.attn.to_q.lora_A.default_0: sum=1422.000000 +Model set to eval mode, LoRA should be active during inference +[rank0]:[W325 13:36:38.321810961 ProcessGroupNCCL.cpp:4115] [PG ID 0 PG GUID 0 Rank 0] using GPU 0 to perform barrier as devices used by this process are currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect.Specify device_ids in barrier() to force use of a particular device,or call init_process_group() with a device_id. +[rank3]:[W325 13:36:39.922364388 ProcessGroupNCCL.cpp:4115] [PG ID 0 PG GUID 0 Rank 3] using GPU 3 to perform barrier as devices used by this process are currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect.Specify device_ids in barrier() to force use of a particular device,or call init_process_group() with a device_id. +[rank2]:[W325 13:36:39.935140980 ProcessGroupNCCL.cpp:4115] [PG ID 0 PG GUID 0 Rank 2] using GPU 2 to perform barrier as devices used by this process are currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect.Specify device_ids in barrier() to force use of a particular device,or call init_process_group() with a device_id. +Warning: Found 29991 files but expected 30000 (missing some indices) +Warning: Found 29991 files but expected 30000 (missing some indices) +Warning: Found 29991 files but expected 30000 (missing some indices) +Found 29991 existing samples, max index: 29999 +Need to generate 9 more samples (total needed: 30000) +Sampling remaining=9, total_samples=128, per_gpu=32, iterations=1 + 0%| | 0/1 [00:00 + 0xe413b9 (0x7f9ab3c9f3b9 in /root/miniconda3/envs/SiT/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) +frame #2: c10d::ProcessGroupNCCL::shutdown(std::optional) + 0x1f0 (0x7f9ab402ff10 in /root/miniconda3/envs/SiT/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) +frame #3: c10d::ProcessGroupNCCL::~ProcessGroupNCCL() + 0x259 (0x7f9ab4030299 in /root/miniconda3/envs/SiT/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) +frame #4: c10d::ProcessGroupNCCL::~ProcessGroupNCCL() + 0x9 (0x7f9ab40306e9 in /root/miniconda3/envs/SiT/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) +frame #5: + 0x5f9e7f5 (0x7f9aedd0b7f5 in /root/miniconda3/envs/SiT/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) +frame #6: + 0xdafdf8 (0x7f9afd74fdf8 in /root/miniconda3/envs/SiT/lib/python3.10/site-packages/torch/lib/libtorch_python.so) +frame #7: + 0xdafe5c (0x7f9afd74fe5c in /root/miniconda3/envs/SiT/lib/python3.10/site-packages/torch/lib/libtorch_python.so) +frame #8: + 0x4c90f3 (0x7f9afce690f3 in /root/miniconda3/envs/SiT/lib/python3.10/site-packages/torch/lib/libtorch_python.so) +frame #9: + 0x4c9c71 (0x7f9afce69c71 in /root/miniconda3/envs/SiT/lib/python3.10/site-packages/torch/lib/libtorch_python.so) +frame #10: + 0x129e5c (0x5628eecbee5c in /root/miniconda3/envs/SiT/bin/python3.10) +frame #11: + 0x129e2d (0x5628eecbee2d in /root/miniconda3/envs/SiT/bin/python3.10) +frame #12: + 0x14f5b0 (0x5628eece45b0 in /root/miniconda3/envs/SiT/bin/python3.10) +frame #13: + 0x11e8e7 (0x5628eecb38e7 in /root/miniconda3/envs/SiT/bin/python3.10) +frame #14: _PyModule_ClearDict + 0xd4 (0x5628eed32584 in /root/miniconda3/envs/SiT/bin/python3.10) +frame #15: + 0x207bba (0x5628eed9cbba in /root/miniconda3/envs/SiT/bin/python3.10) +frame #16: Py_FinalizeEx + 0x150 (0x5628eed9bb70 in /root/miniconda3/envs/SiT/bin/python3.10) +frame #17: + 0x94134 (0x5628eec29134 in /root/miniconda3/envs/SiT/bin/python3.10) +frame #18: Py_BytesMain + 0x37 (0x5628eed5e447 in /root/miniconda3/envs/SiT/bin/python3.10) +frame #19: + 0x3c540 (0x7f9b01e3c540 in /opt/orion/orion_runtime/gpu/cuda/preloadrun.so) +frame #20: + 0x3c5da (0x7f9b01a3c5da in /usr/lib/libra/preloadrun.so) +frame #21: + 0x29d90 (0x7f9b0171ad90 in /usr/lib/x86_64-linux-gnu/libc.so.6) +frame #22: __libc_start_main + 0x80 (0x7f9b0171ae40 in /usr/lib/x86_64-linux-gnu/libc.so.6) +frame #23: __libc_start_main + 0x1258 (0x7f9b01a3eee9 in /usr/lib/libra/preloadrun.so) +frame #24: __libc_start_main + 0x1067 (0x7f9b01e3ec83 in /opt/orion/orion_runtime/gpu/cuda/preloadrun.so) +frame #25: + 0x1c930e (0x5628eed5e30e in /root/miniconda3/envs/SiT/bin/python3.10) + +E0325 13:52:43.995000 7088 site-packages/torch/distributed/elastic/multiprocessing/api.py:869] failed (exitcode: -6) local_rank: 1 (pid: 7174) of binary: /root/miniconda3/envs/SiT/bin/python3.10 +Traceback (most recent call last): + File "/root/miniconda3/envs/SiT/bin/torchrun", line 6, in + sys.exit(main()) + File "/root/miniconda3/envs/SiT/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 355, in wrapper + return f(*args, **kwargs) + File "/root/miniconda3/envs/SiT/lib/python3.10/site-packages/torch/distributed/run.py", line 919, in main + run(args) + File "/root/miniconda3/envs/SiT/lib/python3.10/site-packages/torch/distributed/run.py", line 910, in run + elastic_launch( + File "/root/miniconda3/envs/SiT/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 138, in __call__ + return launch_agent(self._config, self._entrypoint, list(args)) + File "/root/miniconda3/envs/SiT/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 269, in launch_agent + raise ChildFailedError( +torch.distributed.elastic.multiprocessing.errors.ChildFailedError: +========================================================== +sample_sd3_rectified_ddp.py FAILED +---------------------------------------------------------- +Failures: + +---------------------------------------------------------- +Root Cause (first observed failure): +[0]: + time : 2026-03-25_13:52:43 + host : 66d2d54653616c6252364513da490658-taskrole1-0 + rank : 1 (local_rank: 1) + exitcode : -6 (pid: 7174) + error_file: + traceback : Signal 6 (SIGABRT) received by PID 7174 +========================================================== +Sampling done. Output at: ./sd3_rectified_samples_batch2_220000