Add files using upload-large-folder tool
Browse filesThis view is limited to 50 files because it contains too many changes. See raw diff
- LTA_openwebtext_dualt/logs/data_build_minbpe/build_minbpe_2k4k8k_20260517_010516.outer.log +8 -0
- LTA_openwebtext_dualt/logs/data_build_minbpe/minbpe_2k4k8k_parallel_tok100k_packfull_20260517_014346.outer.log +10 -0
- LTA_openwebtext_dualt/logs/data_build_minbpe/minbpe_2k4k8k_parallel_tok500k_packfull_20260517_011721.outer.log +22 -0
- LTA_openwebtext_dualt/logs/data_build_minbpe/owt_minbpe_v2048_len1024_train_minus_100k_20260517_010516.log +8 -0
- LTA_openwebtext_dualt/logs/data_build_minbpe/owt_minbpe_v2048_tok100k_20260517_014346.log +2 -0
- LTA_openwebtext_dualt/logs/data_build_minbpe/owt_minbpe_v2048_tok500k_20260517_011721.log +6 -0
- LTA_openwebtext_dualt/logs/data_build_minbpe/owt_minbpe_v4096_tok100k_20260517_014346.log +2 -0
- LTA_openwebtext_dualt/logs/data_build_minbpe/owt_minbpe_v4096_tok500k_20260517_011721.log +6 -0
- LTA_openwebtext_dualt/logs/data_build_minbpe/owt_minbpe_v8192_tok100k_20260517_014346.log +2 -0
- LTA_openwebtext_dualt/logs/data_build_minbpe/owt_minbpe_v8192_tok500k_20260517_011721.log +6 -0
- LTA_openwebtext_dualt/logs/elfaligned_t5tokenized_2node/lta_owt_t5_adaln_adamw_wd0p1_rollin_p50_randk0_3_uniformt_temp1_synct_gbs512_2node8gpu_1m_t-20260518223147-zmkxz.node0.log +150 -0
- LTA_openwebtext_dualt/logs/elfaligned_t5tokenized_2node/lta_owt_t5_adaln_adamw_wd0p1_rollin_p50_randk0_3_uniformt_temp1_synct_gbs512_2node8gpu_1m_t-20260518223147-zmkxz.node1.log +151 -0
- LTA_openwebtext_dualt/logs/elfaligned_t5tokenized_2node/lta_owt_t5_adaln_adamw_wd0p1_rollin_p50_randk0_3_uniformt_temp1_synct_gbs512_2node8gpu_1m_t-20260518223237-whzzv.node0.log +150 -0
- LTA_openwebtext_dualt/logs/elfaligned_t5tokenized_2node/lta_owt_t5_adaln_adamw_wd0p1_rollin_p50_randk0_3_uniformt_temp1_synct_gbs512_2node8gpu_1m_t-20260518223237-whzzv.node1.log +150 -0
- LTA_openwebtext_dualt/logs/elfaligned_t5tokenized_2node/lta_owt_t5_adaln_adamw_wd0p1_rollin_p50_randk0_3_uniformt_temp1_synct_gbs512_2node8gpu_1m_t-20260518223426-p5lfm.node0.log +150 -0
- LTA_openwebtext_dualt/logs/elfaligned_t5tokenized_2node/lta_owt_t5_adaln_adamw_wd0p1_rollin_p50_randk0_3_uniformt_temp1_synct_gbs512_2node8gpu_1m_t-20260518223426-p5lfm.node1.log +150 -0
- LTA_openwebtext_dualt/logs/elfaligned_t5tokenized_2node/lta_owt_t5_adaln_adamw_wd0p1_rollin_p50_randk0_3_uniformt_temp1_synct_gbs512_2node8gpu_1m_t-20260518223615-7lx5q.node0.log +150 -0
- LTA_openwebtext_dualt/logs/elfaligned_t5tokenized_2node/lta_owt_t5_adaln_adamw_wd0p1_rollin_p50_randk0_3_uniformt_temp1_synct_gbs512_2node8gpu_1m_t-20260518223615-7lx5q.node1.log +151 -0
- LTA_openwebtext_dualt/logs/elfaligned_t5tokenized_2node/lta_owt_t5_adaln_adamw_wd0p1_rollin_p50_randk0_3_uniformt_temp1_synct_gbs512_2node8gpu_1m_t-20260518223804-mp56c.node0.log +150 -0
- LTA_openwebtext_dualt/logs/elfaligned_t5tokenized_2node/lta_owt_t5_adaln_adamw_wd0p1_rollin_p50_randk0_3_uniformt_temp1_synct_gbs512_2node8gpu_1m_t-20260518223804-mp56c.node1.log +150 -0
- LTA_openwebtext_dualt/logs/elfaligned_t5tokenized_2node/lta_owt_t5_adaln_adamw_wd0p1_rollin_p50_randk0_3_uniformt_temp1_synct_gbs512_2node8gpu_1m_t-20260518223954-k9vcs.node0.log +0 -0
- LTA_openwebtext_dualt/logs/elfaligned_t5tokenized_2node/lta_owt_t5_adaln_adamw_wd0p1_rollin_p50_randk0_3_uniformt_temp1_synct_gbs512_2node8gpu_1m_t-20260518223954-k9vcs.node1.log +701 -0
- LTA_openwebtext_dualt/logs/elfaligned_t5tokenized_2node/lta_owt_t5_adaln_adamw_wd0p1_rollin_p50_randk0_3_uniformt_temp1_synct_gbs512_2node8gpu_1m_t-20260518224737-tftgw.node0.log +0 -0
- LTA_openwebtext_dualt/logs/elfaligned_t5tokenized_2node/lta_owt_t5_adaln_adamw_wd0p1_rollin_p50_randk0_3_uniformt_temp1_synct_gbs512_2node8gpu_1m_t-20260518224737-tftgw.node1.log +702 -0
- LTA_openwebtext_dualt/logs/elfaligned_t5tokenized_8gpu/debug_owt_t5_randk0_4_4gpu_smoke_20260518_012201.log +0 -0
- LTA_openwebtext_dualt/logs/elfaligned_t5tokenized_8gpu/lta_owt_elfofficial_t5_len1024_elfaligned_dditelf_muon_logitnormal_m1p5_s0p8_none_floor0p0_gbs512_8gpu_5epoch_20260516_013934.log +89 -0
- LTA_openwebtext_dualt/logs/elfaligned_t5tokenized_8gpu/lta_owt_elfofficial_t5_len1024_elfaligned_dditelf_muon_logitnormal_m1p5_s0p8_none_floor0p0_gbs512_8gpu_5epoch_20260516_014234.log +0 -0
- LTA_openwebtext_dualt/logs/elfaligned_t5tokenized_8gpu/lta_owt_elfofficial_t5_len1024_elfaligned_dditelf_muon_logitnormal_m1p5_s0p8_none_floor0p0_gbs512_8gpu_5epoch_20260517_003703.log +820 -0
- LTA_openwebtext_dualt/logs/elfaligned_t5tokenized_8gpu/lta_owt_t5_adaln_adamw_wd0p1_rollin_p50_randk0_3_uniformt_temp1_synct_gbs512_8gpu_1m_20260518_131947.log +364 -0
- LTA_openwebtext_dualt/logs/elfaligned_t5tokenized_8gpu/lta_owt_t5_adaln_adamw_wd0p1_rollin_p50_randk0_3_uniformt_temp1_synct_gbs512_8gpu_1m_20260518_141238.log +45 -0
- LTA_openwebtext_dualt/logs/elfaligned_t5tokenized_8gpu/lta_owt_t5_adaln_adamw_wd0p1_rollin_p50_randk0_3_uniformt_temp1_synct_gbs512_8gpu_1m_20260518_141240.log +45 -0
- LTA_openwebtext_dualt/logs/elfaligned_t5tokenized_8gpu/lta_owt_t5_linearsoftkl_m1p5_s0p8_conflinear_gbs512_8gpu_5epoch_20260516_161629.log +106 -0
- LTA_openwebtext_dualt/logs/elfaligned_t5tokenized_8gpu/lta_owt_t5_rollin_p50_path2_unif0_0p25_synct_mask1_gbs512_8gpu_20260518_024916.log +0 -0
- LTA_openwebtext_dualt/logs/elfaligned_t5tokenized_8gpu/lta_owt_t5_rollin_p50_randk0_3_uniformt_temp1_synct_mask1_gbs512_8gpu_1m_20260518_025115.log +0 -0
- LTA_openwebtext_dualt/logs/elfaligned_t5tokenized_8gpu/lta_owt_t5_rollin_p50_randk0_3_uniformt_temp1_synct_mask1_gbs512_8gpu_1m_20260518_125608.log +814 -0
- LTA_openwebtext_dualt/logs/elfaligned_t5tokenized_8gpu/lta_owt_t5_rollin_p50_randk0_3_uniformt_temp1_synct_mask1_gbs512_8gpu_1m_20260518_125609.log +814 -0
- LTA_openwebtext_dualt/logs/elfaligned_t5tokenized_8gpu/lta_owt_t5_rollin_p50_randk0_4_uniformt_temp1_synct_20260518_013432.log +0 -0
- LTA_openwebtext_dualt/logs/elfaligned_t5tokenized_8gpu/lta_owt_t5_rollin_p50_randk0_4_uniformt_temp1_synct_20260518_101300.nohup +113 -0
- LTA_openwebtext_dualt/logs/elfaligned_t5tokenized_8gpu/lta_owt_t5_rollin_p50_randk0_4_uniformt_temp1_synct_20260518_101300.pid +1 -0
- LTA_openwebtext_dualt/logs/infer_owt_t5_2node_latest_trainmatched_decode_temp1p5_n8.log +26 -0
- LTA_openwebtext_dualt/logs/lm1b_bos_ban_special_eval_newserver.log +58 -0
- LTA_openwebtext_dualt/logs/lta_lm1b_classic_dirichlet_len128_4gpu_10k_driver.log +285 -0
- LTA_openwebtext_dualt/logs/lta_lm1b_classic_dirichlet_len512_gbs512_4gpu_20k_save1k_20260523.log +0 -0
- LTA_openwebtext_dualt/logs/lta_lm1b_dirichlet_categorical_fullvocab_c1024_fullycoupled_flmpack_onehot_hardce_ddit_small_len128_gbs512_4gpu_1m_nw0.resume_20260508.nohup.log +0 -0
- LTA_openwebtext_dualt/logs/lta_lm1b_dirichlet_categorical_fullvocab_c16p0_dualt_flmpack_onehot_hardce_ddit_small_len128_gbs512_4gpu_1m_nw0.log +0 -0
- LTA_openwebtext_dualt/logs/lta_lm1b_dirichlet_len1024_Cv_to_2v_nosep_gbs512_4gpu_20k_save1k_gumbelwatch_20260525_watcher.log +5 -0
- LTA_openwebtext_dualt/logs/lta_owt_c1024_len1024_t0to1_lowk64plus_noall_buf1000_gbs128_4gpu_20k.log +109 -0
- LTA_openwebtext_dualt/logs/lta_owt_classic_fullvocab_bert_c1024_len128_gbs512_4gpu_1m_save1k_20260521_210848.log +0 -0
- LTA_openwebtext_dualt/logs/lta_owt_dirichlet_categorical_fullvocab_c1024_fullycoupled_flmpack_onehot_hardce_ddit_small_len1024_gbs512_8gpu_1m_nw4.log +0 -0
- LTA_openwebtext_dualt/logs/lta_owt_dirichlet_categorical_fullvocab_c1024_fullycoupled_online_shuffle_len128_gbs512_4gpu_1m_nw2_buf20k.log +0 -0
LTA_openwebtext_dualt/logs/data_build_minbpe/build_minbpe_2k4k8k_20260517_010516.outer.log
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[build:minbpe] vocab=2048
|
| 2 |
+
[build:minbpe] tokenizer=/e2e-data/evad-tech-vla/wanghan58/models/lta_tokenizers/owt_minbpe_v2048/tokenizer.json
|
| 3 |
+
[build:minbpe] output=/e2e-data/evad-tech-vla/wanghan58/data/embedded-language-flows/openwebtext-minbpe-v2048-len1024-train-minus-100k
|
| 4 |
+
[tokenizer] streamed_records=100000
|
| 5 |
+
[tokenizer] streamed_records=200000
|
| 6 |
+
[tokenizer] streamed_records=300000
|
| 7 |
+
[tokenizer] streamed_records=400000
|
| 8 |
+
[tokenizer] streamed_records=500000
|
LTA_openwebtext_dualt/logs/data_build_minbpe/minbpe_2k4k8k_parallel_tok100k_packfull_20260517_014346.outer.log
ADDED
|
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[stage] parallel tokenizer train max_records=100000
|
| 2 |
+
[tokenizer] streamed_records=100000
|
| 3 |
+
[tokenizer] streamed_records=100000
|
| 4 |
+
|
| 5 |
+
|
| 6 |
+
[tokenizer] streamed_records=100000
|
| 7 |
+
|
| 8 |
+
Terminated
|
| 9 |
+
Terminated
|
| 10 |
+
Terminated
|
LTA_openwebtext_dualt/logs/data_build_minbpe/minbpe_2k4k8k_parallel_tok500k_packfull_20260517_011721.outer.log
ADDED
|
@@ -0,0 +1,22 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[stage] parallel tokenizer train max_records=500000
|
| 2 |
+
[tokenizer] streamed_records=100000
|
| 3 |
+
[tokenizer] streamed_records=100000
|
| 4 |
+
[tokenizer] streamed_records=100000
|
| 5 |
+
[tokenizer] streamed_records=200000
|
| 6 |
+
[tokenizer] streamed_records=200000
|
| 7 |
+
[tokenizer] streamed_records=200000
|
| 8 |
+
[tokenizer] streamed_records=300000
|
| 9 |
+
[tokenizer] streamed_records=300000
|
| 10 |
+
[tokenizer] streamed_records=300000
|
| 11 |
+
[tokenizer] streamed_records=400000
|
| 12 |
+
[tokenizer] streamed_records=400000
|
| 13 |
+
[tokenizer] streamed_records=400000
|
| 14 |
+
[tokenizer] streamed_records=500000
|
| 15 |
+
|
| 16 |
+
[tokenizer] streamed_records=500000
|
| 17 |
+
[tokenizer] streamed_records=500000
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
Terminated
|
| 21 |
+
Terminated
|
| 22 |
+
Terminated
|
LTA_openwebtext_dualt/logs/data_build_minbpe/owt_minbpe_v2048_len1024_train_minus_100k_20260517_010516.log
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[build:minbpe] vocab=2048
|
| 2 |
+
[build:minbpe] tokenizer=/e2e-data/evad-tech-vla/wanghan58/models/lta_tokenizers/owt_minbpe_v2048/tokenizer.json
|
| 3 |
+
[build:minbpe] output=/e2e-data/evad-tech-vla/wanghan58/data/embedded-language-flows/openwebtext-minbpe-v2048-len1024-train-minus-100k
|
| 4 |
+
[tokenizer] streamed_records=100000
|
| 5 |
+
[tokenizer] streamed_records=200000
|
| 6 |
+
[tokenizer] streamed_records=300000
|
| 7 |
+
[tokenizer] streamed_records=400000
|
| 8 |
+
[tokenizer] streamed_records=500000
|
LTA_openwebtext_dualt/logs/data_build_minbpe/owt_minbpe_v2048_tok100k_20260517_014346.log
ADDED
|
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[tokenizer] streamed_records=100000
|
| 2 |
+
|
LTA_openwebtext_dualt/logs/data_build_minbpe/owt_minbpe_v2048_tok500k_20260517_011721.log
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[tokenizer] streamed_records=100000
|
| 2 |
+
[tokenizer] streamed_records=200000
|
| 3 |
+
[tokenizer] streamed_records=300000
|
| 4 |
+
[tokenizer] streamed_records=400000
|
| 5 |
+
[tokenizer] streamed_records=500000
|
| 6 |
+
|
LTA_openwebtext_dualt/logs/data_build_minbpe/owt_minbpe_v4096_tok100k_20260517_014346.log
ADDED
|
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[tokenizer] streamed_records=100000
|
| 2 |
+
|
LTA_openwebtext_dualt/logs/data_build_minbpe/owt_minbpe_v4096_tok500k_20260517_011721.log
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[tokenizer] streamed_records=100000
|
| 2 |
+
[tokenizer] streamed_records=200000
|
| 3 |
+
[tokenizer] streamed_records=300000
|
| 4 |
+
[tokenizer] streamed_records=400000
|
| 5 |
+
[tokenizer] streamed_records=500000
|
| 6 |
+
|
LTA_openwebtext_dualt/logs/data_build_minbpe/owt_minbpe_v8192_tok100k_20260517_014346.log
ADDED
|
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[tokenizer] streamed_records=100000
|
| 2 |
+
|
LTA_openwebtext_dualt/logs/data_build_minbpe/owt_minbpe_v8192_tok500k_20260517_011721.log
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[tokenizer] streamed_records=100000
|
| 2 |
+
[tokenizer] streamed_records=200000
|
| 3 |
+
[tokenizer] streamed_records=300000
|
| 4 |
+
[tokenizer] streamed_records=400000
|
| 5 |
+
[tokenizer] streamed_records=500000
|
| 6 |
+
|
LTA_openwebtext_dualt/logs/elfaligned_t5tokenized_2node/lta_owt_t5_adaln_adamw_wd0p1_rollin_p50_randk0_3_uniformt_temp1_synct_gbs512_2node8gpu_1m_t-20260518223147-zmkxz.node0.log
ADDED
|
@@ -0,0 +1,150 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
W0518 14:32:17.917000 10312 torch/distributed/run.py:852]
|
| 2 |
+
W0518 14:32:17.917000 10312 torch/distributed/run.py:852] *****************************************
|
| 3 |
+
W0518 14:32:17.917000 10312 torch/distributed/run.py:852] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed.
|
| 4 |
+
W0518 14:32:17.917000 10312 torch/distributed/run.py:852] *****************************************
|
| 5 |
+
[W518 14:32:18.112180411 socket.cpp:207] [c10d] The hostname of the client socket cannot be retrieved. err=-3
|
| 6 |
+
Traceback (most recent call last):
|
| 7 |
+
Traceback (most recent call last):
|
| 8 |
+
File "/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/train.py", line 26, in <module>
|
| 9 |
+
File "/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/train.py", line 26, in <module>
|
| 10 |
+
from flowtext_lab.data import (from flowtext_lab.data import (
|
| 11 |
+
|
| 12 |
+
File "/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/flowtext_lab/data.py", line 14, in <module>
|
| 13 |
+
File "/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/flowtext_lab/data.py", line 14, in <module>
|
| 14 |
+
Traceback (most recent call last):
|
| 15 |
+
File "/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/train.py", line 26, in <module>
|
| 16 |
+
from flowtext_lab.data import (
|
| 17 |
+
File "/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/flowtext_lab/data.py", line 14, in <module>
|
| 18 |
+
Traceback (most recent call last):
|
| 19 |
+
File "/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/train.py", line 26, in <module>
|
| 20 |
+
Traceback (most recent call last):
|
| 21 |
+
File "/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/train.py", line 26, in <module>
|
| 22 |
+
from flowtext_lab.data import (
|
| 23 |
+
File "/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/flowtext_lab/data.py", line 14, in <module>
|
| 24 |
+
from flowtext_lab.data import (
|
| 25 |
+
File "/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/flowtext_lab/data.py", line 14, in <module>
|
| 26 |
+
from .tokenization import BpeTextTokenizerfrom .tokenization import BpeTextTokenizer
|
| 27 |
+
|
| 28 |
+
from .tokenization import BpeTextTokenizer
|
| 29 |
+
File "/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/flowtext_lab/tokenization.py", line 7, in <module>
|
| 30 |
+
File "/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/flowtext_lab/tokenization.py", line 7, in <module>
|
| 31 |
+
File "/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/flowtext_lab/tokenization.py", line 7, in <module>
|
| 32 |
+
from .tokenization import BpeTextTokenizer
|
| 33 |
+
from .tokenization import BpeTextTokenizer
|
| 34 |
+
File "/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/flowtext_lab/tokenization.py", line 7, in <module>
|
| 35 |
+
File "/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/flowtext_lab/tokenization.py", line 7, in <module>
|
| 36 |
+
from tokenizers import Tokenizer
|
| 37 |
+
from tokenizers import Tokenizer
|
| 38 |
+
ModuleNotFoundErrorModuleNotFoundError: No module named 'tokenizers':
|
| 39 |
+
No module named 'tokenizers'
|
| 40 |
+
from tokenizers import Tokenizerfrom tokenizers import Tokenizerfrom tokenizers import Tokenizer
|
| 41 |
+
|
| 42 |
+
|
| 43 |
+
ModuleNotFoundErrorModuleNotFoundError: ModuleNotFoundError: No module named 'tokenizers'No module named 'tokenizers'
|
| 44 |
+
:
|
| 45 |
+
No module named 'tokenizers'
|
| 46 |
+
Traceback (most recent call last):
|
| 47 |
+
File "/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/train.py", line 26, in <module>
|
| 48 |
+
from flowtext_lab.data import (
|
| 49 |
+
File "/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/flowtext_lab/data.py", line 14, in <module>
|
| 50 |
+
from .tokenization import BpeTextTokenizer
|
| 51 |
+
File "/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/flowtext_lab/tokenization.py", line 7, in <module>
|
| 52 |
+
from tokenizers import Tokenizer
|
| 53 |
+
ModuleNotFoundError: No module named 'tokenizers'
|
| 54 |
+
Traceback (most recent call last):
|
| 55 |
+
File "/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/train.py", line 26, in <module>
|
| 56 |
+
from flowtext_lab.data import (
|
| 57 |
+
File "/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/flowtext_lab/data.py", line 14, in <module>
|
| 58 |
+
from .tokenization import BpeTextTokenizer
|
| 59 |
+
File "/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/flowtext_lab/tokenization.py", line 7, in <module>
|
| 60 |
+
from tokenizers import Tokenizer
|
| 61 |
+
ModuleNotFoundError: No module named 'tokenizers'
|
| 62 |
+
Traceback (most recent call last):
|
| 63 |
+
File "/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/train.py", line 26, in <module>
|
| 64 |
+
from flowtext_lab.data import (
|
| 65 |
+
File "/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/flowtext_lab/data.py", line 14, in <module>
|
| 66 |
+
from .tokenization import BpeTextTokenizer
|
| 67 |
+
File "/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/flowtext_lab/tokenization.py", line 7, in <module>
|
| 68 |
+
from tokenizers import Tokenizer
|
| 69 |
+
ModuleNotFoundError: No module named 'tokenizers'
|
| 70 |
+
E0518 14:32:21.168000 10312 torch/distributed/elastic/multiprocessing/api.py:984] failed (exitcode: 1) local_rank: 0 (pid: 10402) of binary: /e2e-data/evad-tech-vla/wanghan58/env/my_env/bin/python3
|
| 71 |
+
Traceback (most recent call last):
|
| 72 |
+
File "/e2e-data/evad-tech-vla/wanghan58/env/my_env/bin/torchrun", line 10, in <module>
|
| 73 |
+
sys.exit(main())
|
| 74 |
+
^^^^^^
|
| 75 |
+
File "/e2e-data/evad-tech-vla/wanghan58/env/my_env/lib/python3.12/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 362, in wrapper
|
| 76 |
+
return f(*args, **kwargs)
|
| 77 |
+
^^^^^^^^^^^^^^^^^^
|
| 78 |
+
File "/e2e-data/evad-tech-vla/wanghan58/env/my_env/lib/python3.12/site-packages/torch/distributed/run.py", line 991, in main
|
| 79 |
+
run(args)
|
| 80 |
+
File "/e2e-data/evad-tech-vla/wanghan58/env/my_env/lib/python3.12/site-packages/torch/distributed/run.py", line 982, in run
|
| 81 |
+
elastic_launch(
|
| 82 |
+
File "/e2e-data/evad-tech-vla/wanghan58/env/my_env/lib/python3.12/site-packages/torch/distributed/launcher/api.py", line 170, in __call__
|
| 83 |
+
return launch_agent(self._config, self._entrypoint, list(args))
|
| 84 |
+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 85 |
+
File "/e2e-data/evad-tech-vla/wanghan58/env/my_env/lib/python3.12/site-packages/torch/distributed/launcher/api.py", line 317, in launch_agent
|
| 86 |
+
raise ChildFailedError(
|
| 87 |
+
torch.distributed.elastic.multiprocessing.errors.ChildFailedError:
|
| 88 |
+
============================================================
|
| 89 |
+
train.py FAILED
|
| 90 |
+
------------------------------------------------------------
|
| 91 |
+
Failures:
|
| 92 |
+
[1]:
|
| 93 |
+
time : 2026-05-18_14:32:21
|
| 94 |
+
host : t-20260518223147-zmkxz-worker-0.t-20260518223147-zmkxz-worker.mlplatform-customtask.svc.cluster.local
|
| 95 |
+
rank : 1 (local_rank: 1)
|
| 96 |
+
exitcode : 1 (pid: 10403)
|
| 97 |
+
error_file: <N/A>
|
| 98 |
+
traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
|
| 99 |
+
[2]:
|
| 100 |
+
time : 2026-05-18_14:32:21
|
| 101 |
+
host : t-20260518223147-zmkxz-worker-0.t-20260518223147-zmkxz-worker.mlplatform-customtask.svc.cluster.local
|
| 102 |
+
rank : 2 (local_rank: 2)
|
| 103 |
+
exitcode : 1 (pid: 10404)
|
| 104 |
+
error_file: <N/A>
|
| 105 |
+
traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
|
| 106 |
+
[3]:
|
| 107 |
+
time : 2026-05-18_14:32:21
|
| 108 |
+
host : t-20260518223147-zmkxz-worker-0.t-20260518223147-zmkxz-worker.mlplatform-customtask.svc.cluster.local
|
| 109 |
+
rank : 3 (local_rank: 3)
|
| 110 |
+
exitcode : 1 (pid: 10405)
|
| 111 |
+
error_file: <N/A>
|
| 112 |
+
traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
|
| 113 |
+
[4]:
|
| 114 |
+
time : 2026-05-18_14:32:21
|
| 115 |
+
host : t-20260518223147-zmkxz-worker-0.t-20260518223147-zmkxz-worker.mlplatform-customtask.svc.cluster.local
|
| 116 |
+
rank : 4 (local_rank: 4)
|
| 117 |
+
exitcode : 1 (pid: 10406)
|
| 118 |
+
error_file: <N/A>
|
| 119 |
+
traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
|
| 120 |
+
[5]:
|
| 121 |
+
time : 2026-05-18_14:32:21
|
| 122 |
+
host : t-20260518223147-zmkxz-worker-0.t-20260518223147-zmkxz-worker.mlplatform-customtask.svc.cluster.local
|
| 123 |
+
rank : 5 (local_rank: 5)
|
| 124 |
+
exitcode : 1 (pid: 10407)
|
| 125 |
+
error_file: <N/A>
|
| 126 |
+
traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
|
| 127 |
+
[6]:
|
| 128 |
+
time : 2026-05-18_14:32:21
|
| 129 |
+
host : t-20260518223147-zmkxz-worker-0.t-20260518223147-zmkxz-worker.mlplatform-customtask.svc.cluster.local
|
| 130 |
+
rank : 6 (local_rank: 6)
|
| 131 |
+
exitcode : 1 (pid: 10408)
|
| 132 |
+
error_file: <N/A>
|
| 133 |
+
traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
|
| 134 |
+
[7]:
|
| 135 |
+
time : 2026-05-18_14:32:21
|
| 136 |
+
host : t-20260518223147-zmkxz-worker-0.t-20260518223147-zmkxz-worker.mlplatform-customtask.svc.cluster.local
|
| 137 |
+
rank : 7 (local_rank: 7)
|
| 138 |
+
exitcode : 1 (pid: 10409)
|
| 139 |
+
error_file: <N/A>
|
| 140 |
+
traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
|
| 141 |
+
------------------------------------------------------------
|
| 142 |
+
Root Cause (first observed failure):
|
| 143 |
+
[0]:
|
| 144 |
+
time : 2026-05-18_14:32:21
|
| 145 |
+
host : t-20260518223147-zmkxz-worker-0.t-20260518223147-zmkxz-worker.mlplatform-customtask.svc.cluster.local
|
| 146 |
+
rank : 0 (local_rank: 0)
|
| 147 |
+
exitcode : 1 (pid: 10402)
|
| 148 |
+
error_file: <N/A>
|
| 149 |
+
traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
|
| 150 |
+
============================================================
|
LTA_openwebtext_dualt/logs/elfaligned_t5tokenized_2node/lta_owt_t5_adaln_adamw_wd0p1_rollin_p50_randk0_3_uniformt_temp1_synct_gbs512_2node8gpu_1m_t-20260518223147-zmkxz.node1.log
ADDED
|
@@ -0,0 +1,151 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
W0518 14:32:18.002000 10316 torch/distributed/run.py:852]
|
| 2 |
+
W0518 14:32:18.002000 10316 torch/distributed/run.py:852] *****************************************
|
| 3 |
+
W0518 14:32:18.002000 10316 torch/distributed/run.py:852] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed.
|
| 4 |
+
W0518 14:32:18.002000 10316 torch/distributed/run.py:852] *****************************************
|
| 5 |
+
[W518 14:32:19.159050527 socket.cpp:207] [c10d] The hostname of the client socket cannot be retrieved. err=-3
|
| 6 |
+
Traceback (most recent call last):
|
| 7 |
+
File "/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/train.py", line 26, in <module>
|
| 8 |
+
from flowtext_lab.data import (
|
| 9 |
+
File "/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/flowtext_lab/data.py", line 14, in <module>
|
| 10 |
+
from .tokenization import BpeTextTokenizer
|
| 11 |
+
File "/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/flowtext_lab/tokenization.py", line 7, in <module>
|
| 12 |
+
Traceback (most recent call last):
|
| 13 |
+
File "/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/train.py", line 26, in <module>
|
| 14 |
+
from flowtext_lab.data import (
|
| 15 |
+
File "/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/flowtext_lab/data.py", line 14, in <module>
|
| 16 |
+
from .tokenization import BpeTextTokenizer
|
| 17 |
+
File "/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/flowtext_lab/tokenization.py", line 7, in <module>
|
| 18 |
+
from tokenizers import Tokenizer
|
| 19 |
+
from tokenizers import Tokenizer
|
| 20 |
+
ModuleNotFoundError: No module named 'tokenizers'
|
| 21 |
+
ModuleNotFoundError: No module named 'tokenizers'
|
| 22 |
+
Traceback (most recent call last):
|
| 23 |
+
File "/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/train.py", line 26, in <module>
|
| 24 |
+
from flowtext_lab.data import (
|
| 25 |
+
File "/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/flowtext_lab/data.py", line 14, in <module>
|
| 26 |
+
from .tokenization import BpeTextTokenizer
|
| 27 |
+
File "/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/flowtext_lab/tokenization.py", line 7, in <module>
|
| 28 |
+
from tokenizers import Tokenizer
|
| 29 |
+
ModuleNotFoundError: No module named 'tokenizers'
|
| 30 |
+
Traceback (most recent call last):
|
| 31 |
+
File "/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/train.py", line 26, in <module>
|
| 32 |
+
from flowtext_lab.data import (
|
| 33 |
+
File "/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/flowtext_lab/data.py", line 14, in <module>
|
| 34 |
+
from .tokenization import BpeTextTokenizer
|
| 35 |
+
File "/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/flowtext_lab/tokenization.py", line 7, in <module>
|
| 36 |
+
from tokenizers import Tokenizer
|
| 37 |
+
ModuleNotFoundError: No module named 'tokenizers'
|
| 38 |
+
Traceback (most recent call last):
|
| 39 |
+
File "/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/train.py", line 26, in <module>
|
| 40 |
+
from flowtext_lab.data import (
|
| 41 |
+
File "/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/flowtext_lab/data.py", line 14, in <module>
|
| 42 |
+
from .tokenization import BpeTextTokenizer
|
| 43 |
+
File "/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/flowtext_lab/tokenization.py", line 7, in <module>
|
| 44 |
+
from tokenizers import Tokenizer
|
| 45 |
+
ModuleNotFoundError: No module named 'tokenizers'
|
| 46 |
+
Traceback (most recent call last):
|
| 47 |
+
File "/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/train.py", line 26, in <module>
|
| 48 |
+
from flowtext_lab.data import (
|
| 49 |
+
File "/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/flowtext_lab/data.py", line 14, in <module>
|
| 50 |
+
from .tokenization import BpeTextTokenizer
|
| 51 |
+
File "/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/flowtext_lab/tokenization.py", line 7, in <module>
|
| 52 |
+
from tokenizers import Tokenizer
|
| 53 |
+
ModuleNotFoundError: No module named 'tokenizers'
|
| 54 |
+
Traceback (most recent call last):
|
| 55 |
+
File "/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/train.py", line 26, in <module>
|
| 56 |
+
from flowtext_lab.data import (
|
| 57 |
+
File "/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/flowtext_lab/data.py", line 14, in <module>
|
| 58 |
+
from .tokenization import BpeTextTokenizer
|
| 59 |
+
File "/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/flowtext_lab/tokenization.py", line 7, in <module>
|
| 60 |
+
from tokenizers import Tokenizer
|
| 61 |
+
ModuleNotFoundError: No module named 'tokenizers'
|
| 62 |
+
Traceback (most recent call last):
|
| 63 |
+
File "/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/train.py", line 26, in <module>
|
| 64 |
+
from flowtext_lab.data import (
|
| 65 |
+
File "/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/flowtext_lab/data.py", line 14, in <module>
|
| 66 |
+
from .tokenization import BpeTextTokenizer
|
| 67 |
+
File "/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/flowtext_lab/tokenization.py", line 7, in <module>
|
| 68 |
+
from tokenizers import Tokenizer
|
| 69 |
+
ModuleNotFoundError: No module named 'tokenizers'
|
| 70 |
+
W0518 14:32:21.067000 10316 torch/distributed/elastic/multiprocessing/api.py:1010] Sending process 10410 closing signal SIGTERM
|
| 71 |
+
E0518 14:32:21.068000 10316 torch/distributed/elastic/multiprocessing/api.py:984] failed (exitcode: 1) local_rank: 0 (pid: 10405) of binary: /e2e-data/evad-tech-vla/wanghan58/env/my_env/bin/python3
|
| 72 |
+
Traceback (most recent call last):
|
| 73 |
+
File "/e2e-data/evad-tech-vla/wanghan58/env/my_env/bin/torchrun", line 10, in <module>
|
| 74 |
+
sys.exit(main())
|
| 75 |
+
^^^^^^
|
| 76 |
+
File "/e2e-data/evad-tech-vla/wanghan58/env/my_env/lib/python3.12/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 362, in wrapper
|
| 77 |
+
return f(*args, **kwargs)
|
| 78 |
+
^^^^^^^^^^^^^^^^^^
|
| 79 |
+
File "/e2e-data/evad-tech-vla/wanghan58/env/my_env/lib/python3.12/site-packages/torch/distributed/run.py", line 991, in main
|
| 80 |
+
run(args)
|
| 81 |
+
File "/e2e-data/evad-tech-vla/wanghan58/env/my_env/lib/python3.12/site-packages/torch/distributed/run.py", line 982, in run
|
| 82 |
+
elastic_launch(
|
| 83 |
+
File "/e2e-data/evad-tech-vla/wanghan58/env/my_env/lib/python3.12/site-packages/torch/distributed/launcher/api.py", line 170, in __call__
|
| 84 |
+
return launch_agent(self._config, self._entrypoint, list(args))
|
| 85 |
+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 86 |
+
File "/e2e-data/evad-tech-vla/wanghan58/env/my_env/lib/python3.12/site-packages/torch/distributed/launcher/api.py", line 317, in launch_agent
|
| 87 |
+
raise ChildFailedError(
|
| 88 |
+
torch.distributed.elastic.multiprocessing.errors.ChildFailedError:
|
| 89 |
+
============================================================
|
| 90 |
+
train.py FAILED
|
| 91 |
+
------------------------------------------------------------
|
| 92 |
+
Failures:
|
| 93 |
+
[1]:
|
| 94 |
+
time : 2026-05-18_14:32:21
|
| 95 |
+
host : t-20260518223147-zmkxz-worker-1.t-20260518223147-zmkxz-worker.mlplatform-customtask.svc.cluster.local
|
| 96 |
+
rank : 9 (local_rank: 1)
|
| 97 |
+
exitcode : 1 (pid: 10406)
|
| 98 |
+
error_file: <N/A>
|
| 99 |
+
traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
|
| 100 |
+
[2]:
|
| 101 |
+
time : 2026-05-18_14:32:21
|
| 102 |
+
host : t-20260518223147-zmkxz-worker-1.t-20260518223147-zmkxz-worker.mlplatform-customtask.svc.cluster.local
|
| 103 |
+
rank : 10 (local_rank: 2)
|
| 104 |
+
exitcode : 1 (pid: 10407)
|
| 105 |
+
error_file: <N/A>
|
| 106 |
+
traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
|
| 107 |
+
[3]:
|
| 108 |
+
time : 2026-05-18_14:32:21
|
| 109 |
+
host : t-20260518223147-zmkxz-worker-1.t-20260518223147-zmkxz-worker.mlplatform-customtask.svc.cluster.local
|
| 110 |
+
rank : 11 (local_rank: 3)
|
| 111 |
+
exitcode : 1 (pid: 10408)
|
| 112 |
+
error_file: <N/A>
|
| 113 |
+
traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
|
| 114 |
+
[4]:
|
| 115 |
+
time : 2026-05-18_14:32:21
|
| 116 |
+
host : t-20260518223147-zmkxz-worker-1.t-20260518223147-zmkxz-worker.mlplatform-customtask.svc.cluster.local
|
| 117 |
+
rank : 12 (local_rank: 4)
|
| 118 |
+
exitcode : 1 (pid: 10409)
|
| 119 |
+
error_file: <N/A>
|
| 120 |
+
traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
|
| 121 |
+
[5]:
|
| 122 |
+
time : 2026-05-18_14:32:21
|
| 123 |
+
host : t-20260518223147-zmkxz-worker-1.t-20260518223147-zmkxz-worker.mlplatform-customtask.svc.cluster.local
|
| 124 |
+
rank : 14 (local_rank: 6)
|
| 125 |
+
exitcode : 1 (pid: 10411)
|
| 126 |
+
error_file: <N/A>
|
| 127 |
+
traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
|
| 128 |
+
[6]:
|
| 129 |
+
time : 2026-05-18_14:32:21
|
| 130 |
+
host : t-20260518223147-zmkxz-worker-1.t-20260518223147-zmkxz-worker.mlplatform-customtask.svc.cluster.local
|
| 131 |
+
rank : 15 (local_rank: 7)
|
| 132 |
+
exitcode : 1 (pid: 10412)
|
| 133 |
+
error_file: <N/A>
|
| 134 |
+
traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
|
| 135 |
+
[7]:
|
| 136 |
+
time : 2026-05-18_14:32:21
|
| 137 |
+
host : t-20260518223147-zmkxz-worker-1.t-20260518223147-zmkxz-worker.mlplatform-customtask.svc.cluster.local
|
| 138 |
+
rank : 13 (local_rank: 5)
|
| 139 |
+
exitcode : 1 (pid: 10410)
|
| 140 |
+
error_file: <N/A>
|
| 141 |
+
traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
|
| 142 |
+
------------------------------------------------------------
|
| 143 |
+
Root Cause (first observed failure):
|
| 144 |
+
[0]:
|
| 145 |
+
time : 2026-05-18_14:32:21
|
| 146 |
+
host : t-20260518223147-zmkxz-worker-1.t-20260518223147-zmkxz-worker.mlplatform-customtask.svc.cluster.local
|
| 147 |
+
rank : 8 (local_rank: 0)
|
| 148 |
+
exitcode : 1 (pid: 10405)
|
| 149 |
+
error_file: <N/A>
|
| 150 |
+
traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
|
| 151 |
+
============================================================
|
LTA_openwebtext_dualt/logs/elfaligned_t5tokenized_2node/lta_owt_t5_adaln_adamw_wd0p1_rollin_p50_randk0_3_uniformt_temp1_synct_gbs512_2node8gpu_1m_t-20260518223237-whzzv.node0.log
ADDED
|
@@ -0,0 +1,150 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
W0518 14:34:07.591000 10360 torch/distributed/run.py:852]
|
| 2 |
+
W0518 14:34:07.591000 10360 torch/distributed/run.py:852] *****************************************
|
| 3 |
+
W0518 14:34:07.591000 10360 torch/distributed/run.py:852] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed.
|
| 4 |
+
W0518 14:34:07.591000 10360 torch/distributed/run.py:852] *****************************************
|
| 5 |
+
[W518 14:34:08.346331012 socket.cpp:207] [c10d] The hostname of the client socket cannot be retrieved. err=-3
|
| 6 |
+
Traceback (most recent call last):
|
| 7 |
+
File "/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/train.py", line 26, in <module>
|
| 8 |
+
from flowtext_lab.data import (
|
| 9 |
+
File "/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/flowtext_lab/data.py", line 14, in <module>
|
| 10 |
+
from .tokenization import BpeTextTokenizer
|
| 11 |
+
File "/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/flowtext_lab/tokenization.py", line 7, in <module>
|
| 12 |
+
from tokenizers import Tokenizer
|
| 13 |
+
ModuleNotFoundError: No module named 'tokenizers'
|
| 14 |
+
Traceback (most recent call last):
|
| 15 |
+
File "/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/train.py", line 26, in <module>
|
| 16 |
+
from flowtext_lab.data import (
|
| 17 |
+
File "/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/flowtext_lab/data.py", line 14, in <module>
|
| 18 |
+
from .tokenization import BpeTextTokenizer
|
| 19 |
+
File "/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/flowtext_lab/tokenization.py", line 7, in <module>
|
| 20 |
+
from tokenizers import Tokenizer
|
| 21 |
+
ModuleNotFoundError: No module named 'tokenizers'
|
| 22 |
+
Traceback (most recent call last):
|
| 23 |
+
File "/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/train.py", line 26, in <module>
|
| 24 |
+
from flowtext_lab.data import (
|
| 25 |
+
File "/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/flowtext_lab/data.py", line 14, in <module>
|
| 26 |
+
from .tokenization import BpeTextTokenizer
|
| 27 |
+
File "/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/flowtext_lab/tokenization.py", line 7, in <module>
|
| 28 |
+
from tokenizers import Tokenizer
|
| 29 |
+
ModuleNotFoundError: No module named 'tokenizers'
|
| 30 |
+
Traceback (most recent call last):
|
| 31 |
+
File "/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/train.py", line 26, in <module>
|
| 32 |
+
from flowtext_lab.data import (
|
| 33 |
+
File "/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/flowtext_lab/data.py", line 14, in <module>
|
| 34 |
+
from .tokenization import BpeTextTokenizer
|
| 35 |
+
File "/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/flowtext_lab/tokenization.py", line 7, in <module>
|
| 36 |
+
from tokenizers import Tokenizer
|
| 37 |
+
ModuleNotFoundError: No module named 'tokenizers'
|
| 38 |
+
Traceback (most recent call last):
|
| 39 |
+
File "/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/train.py", line 26, in <module>
|
| 40 |
+
Traceback (most recent call last):
|
| 41 |
+
File "/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/train.py", line 26, in <module>
|
| 42 |
+
from flowtext_lab.data import (
|
| 43 |
+
File "/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/flowtext_lab/data.py", line 14, in <module>
|
| 44 |
+
from flowtext_lab.data import (
|
| 45 |
+
File "/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/flowtext_lab/data.py", line 14, in <module>
|
| 46 |
+
from .tokenization import BpeTextTokenizer
|
| 47 |
+
from .tokenization import BpeTextTokenizer
|
| 48 |
+
File "/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/flowtext_lab/tokenization.py", line 7, in <module>
|
| 49 |
+
File "/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/flowtext_lab/tokenization.py", line 7, in <module>
|
| 50 |
+
from tokenizers import Tokenizer
|
| 51 |
+
ModuleNotFoundError: No module named 'tokenizers'
|
| 52 |
+
from tokenizers import Tokenizer
|
| 53 |
+
ModuleNotFoundError: No module named 'tokenizers'
|
| 54 |
+
Traceback (most recent call last):
|
| 55 |
+
File "/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/train.py", line 26, in <module>
|
| 56 |
+
from flowtext_lab.data import (
|
| 57 |
+
File "/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/flowtext_lab/data.py", line 14, in <module>
|
| 58 |
+
from .tokenization import BpeTextTokenizer
|
| 59 |
+
File "/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/flowtext_lab/tokenization.py", line 7, in <module>
|
| 60 |
+
from tokenizers import Tokenizer
|
| 61 |
+
ModuleNotFoundError: No module named 'tokenizers'
|
| 62 |
+
Traceback (most recent call last):
|
| 63 |
+
File "/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/train.py", line 26, in <module>
|
| 64 |
+
from flowtext_lab.data import (
|
| 65 |
+
File "/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/flowtext_lab/data.py", line 14, in <module>
|
| 66 |
+
from .tokenization import BpeTextTokenizer
|
| 67 |
+
File "/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/flowtext_lab/tokenization.py", line 7, in <module>
|
| 68 |
+
from tokenizers import Tokenizer
|
| 69 |
+
ModuleNotFoundError: No module named 'tokenizers'
|
| 70 |
+
E0518 14:34:10.188000 10360 torch/distributed/elastic/multiprocessing/api.py:984] failed (exitcode: 1) local_rank: 0 (pid: 10427) of binary: /e2e-data/evad-tech-vla/wanghan58/env/my_env/bin/python3
|
| 71 |
+
Traceback (most recent call last):
|
| 72 |
+
File "/e2e-data/evad-tech-vla/wanghan58/env/my_env/bin/torchrun", line 10, in <module>
|
| 73 |
+
sys.exit(main())
|
| 74 |
+
^^^^^^
|
| 75 |
+
File "/e2e-data/evad-tech-vla/wanghan58/env/my_env/lib/python3.12/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 362, in wrapper
|
| 76 |
+
return f(*args, **kwargs)
|
| 77 |
+
^^^^^^^^^^^^^^^^^^
|
| 78 |
+
File "/e2e-data/evad-tech-vla/wanghan58/env/my_env/lib/python3.12/site-packages/torch/distributed/run.py", line 991, in main
|
| 79 |
+
run(args)
|
| 80 |
+
File "/e2e-data/evad-tech-vla/wanghan58/env/my_env/lib/python3.12/site-packages/torch/distributed/run.py", line 982, in run
|
| 81 |
+
elastic_launch(
|
| 82 |
+
File "/e2e-data/evad-tech-vla/wanghan58/env/my_env/lib/python3.12/site-packages/torch/distributed/launcher/api.py", line 170, in __call__
|
| 83 |
+
return launch_agent(self._config, self._entrypoint, list(args))
|
| 84 |
+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 85 |
+
File "/e2e-data/evad-tech-vla/wanghan58/env/my_env/lib/python3.12/site-packages/torch/distributed/launcher/api.py", line 317, in launch_agent
|
| 86 |
+
raise ChildFailedError(
|
| 87 |
+
torch.distributed.elastic.multiprocessing.errors.ChildFailedError:
|
| 88 |
+
============================================================
|
| 89 |
+
train.py FAILED
|
| 90 |
+
------------------------------------------------------------
|
| 91 |
+
Failures:
|
| 92 |
+
[1]:
|
| 93 |
+
time : 2026-05-18_14:34:10
|
| 94 |
+
host : t-20260518223237-whzzv-worker-0.t-20260518223237-whzzv-worker.mlplatform-customtask.svc.cluster.local
|
| 95 |
+
rank : 1 (local_rank: 1)
|
| 96 |
+
exitcode : 1 (pid: 10428)
|
| 97 |
+
error_file: <N/A>
|
| 98 |
+
traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
|
| 99 |
+
[2]:
|
| 100 |
+
time : 2026-05-18_14:34:10
|
| 101 |
+
host : t-20260518223237-whzzv-worker-0.t-20260518223237-whzzv-worker.mlplatform-customtask.svc.cluster.local
|
| 102 |
+
rank : 2 (local_rank: 2)
|
| 103 |
+
exitcode : 1 (pid: 10429)
|
| 104 |
+
error_file: <N/A>
|
| 105 |
+
traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
|
| 106 |
+
[3]:
|
| 107 |
+
time : 2026-05-18_14:34:10
|
| 108 |
+
host : t-20260518223237-whzzv-worker-0.t-20260518223237-whzzv-worker.mlplatform-customtask.svc.cluster.local
|
| 109 |
+
rank : 3 (local_rank: 3)
|
| 110 |
+
exitcode : 1 (pid: 10430)
|
| 111 |
+
error_file: <N/A>
|
| 112 |
+
traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
|
| 113 |
+
[4]:
|
| 114 |
+
time : 2026-05-18_14:34:10
|
| 115 |
+
host : t-20260518223237-whzzv-worker-0.t-20260518223237-whzzv-worker.mlplatform-customtask.svc.cluster.local
|
| 116 |
+
rank : 4 (local_rank: 4)
|
| 117 |
+
exitcode : 1 (pid: 10431)
|
| 118 |
+
error_file: <N/A>
|
| 119 |
+
traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
|
| 120 |
+
[5]:
|
| 121 |
+
time : 2026-05-18_14:34:10
|
| 122 |
+
host : t-20260518223237-whzzv-worker-0.t-20260518223237-whzzv-worker.mlplatform-customtask.svc.cluster.local
|
| 123 |
+
rank : 5 (local_rank: 5)
|
| 124 |
+
exitcode : 1 (pid: 10432)
|
| 125 |
+
error_file: <N/A>
|
| 126 |
+
traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
|
| 127 |
+
[6]:
|
| 128 |
+
time : 2026-05-18_14:34:10
|
| 129 |
+
host : t-20260518223237-whzzv-worker-0.t-20260518223237-whzzv-worker.mlplatform-customtask.svc.cluster.local
|
| 130 |
+
rank : 6 (local_rank: 6)
|
| 131 |
+
exitcode : 1 (pid: 10433)
|
| 132 |
+
error_file: <N/A>
|
| 133 |
+
traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
|
| 134 |
+
[7]:
|
| 135 |
+
time : 2026-05-18_14:34:10
|
| 136 |
+
host : t-20260518223237-whzzv-worker-0.t-20260518223237-whzzv-worker.mlplatform-customtask.svc.cluster.local
|
| 137 |
+
rank : 7 (local_rank: 7)
|
| 138 |
+
exitcode : 1 (pid: 10434)
|
| 139 |
+
error_file: <N/A>
|
| 140 |
+
traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
|
| 141 |
+
------------------------------------------------------------
|
| 142 |
+
Root Cause (first observed failure):
|
| 143 |
+
[0]:
|
| 144 |
+
time : 2026-05-18_14:34:10
|
| 145 |
+
host : t-20260518223237-whzzv-worker-0.t-20260518223237-whzzv-worker.mlplatform-customtask.svc.cluster.local
|
| 146 |
+
rank : 0 (local_rank: 0)
|
| 147 |
+
exitcode : 1 (pid: 10427)
|
| 148 |
+
error_file: <N/A>
|
| 149 |
+
traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
|
| 150 |
+
============================================================
|
LTA_openwebtext_dualt/logs/elfaligned_t5tokenized_2node/lta_owt_t5_adaln_adamw_wd0p1_rollin_p50_randk0_3_uniformt_temp1_synct_gbs512_2node8gpu_1m_t-20260518223237-whzzv.node1.log
ADDED
|
@@ -0,0 +1,150 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
W0518 14:34:07.720000 10364 torch/distributed/run.py:852]
|
| 2 |
+
W0518 14:34:07.720000 10364 torch/distributed/run.py:852] *****************************************
|
| 3 |
+
W0518 14:34:07.720000 10364 torch/distributed/run.py:852] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed.
|
| 4 |
+
W0518 14:34:07.720000 10364 torch/distributed/run.py:852] *****************************************
|
| 5 |
+
[W518 14:34:08.616680373 socket.cpp:207] [c10d] The hostname of the client socket cannot be retrieved. err=-3
|
| 6 |
+
Traceback (most recent call last):
|
| 7 |
+
File "/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/train.py", line 26, in <module>
|
| 8 |
+
from flowtext_lab.data import (
|
| 9 |
+
File "/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/flowtext_lab/data.py", line 14, in <module>
|
| 10 |
+
from .tokenization import BpeTextTokenizer
|
| 11 |
+
File "/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/flowtext_lab/tokenization.py", line 7, in <module>
|
| 12 |
+
from tokenizers import Tokenizer
|
| 13 |
+
ModuleNotFoundError: No module named 'tokenizers'
|
| 14 |
+
Traceback (most recent call last):
|
| 15 |
+
File "/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/train.py", line 26, in <module>
|
| 16 |
+
Traceback (most recent call last):
|
| 17 |
+
File "/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/train.py", line 26, in <module>
|
| 18 |
+
from flowtext_lab.data import (
|
| 19 |
+
File "/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/flowtext_lab/data.py", line 14, in <module>
|
| 20 |
+
from flowtext_lab.data import (
|
| 21 |
+
File "/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/flowtext_lab/data.py", line 14, in <module>
|
| 22 |
+
from .tokenization import BpeTextTokenizer
|
| 23 |
+
from .tokenization import BpeTextTokenizer
|
| 24 |
+
File "/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/flowtext_lab/tokenization.py", line 7, in <module>
|
| 25 |
+
File "/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/flowtext_lab/tokenization.py", line 7, in <module>
|
| 26 |
+
from tokenizers import Tokenizer
|
| 27 |
+
ModuleNotFoundError: No module named 'tokenizers'
|
| 28 |
+
from tokenizers import Tokenizer
|
| 29 |
+
ModuleNotFoundError: No module named 'tokenizers'
|
| 30 |
+
Traceback (most recent call last):
|
| 31 |
+
File "/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/train.py", line 26, in <module>
|
| 32 |
+
from flowtext_lab.data import (
|
| 33 |
+
File "/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/flowtext_lab/data.py", line 14, in <module>
|
| 34 |
+
from .tokenization import BpeTextTokenizer
|
| 35 |
+
File "/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/flowtext_lab/tokenization.py", line 7, in <module>
|
| 36 |
+
from tokenizers import Tokenizer
|
| 37 |
+
ModuleNotFoundError: No module named 'tokenizers'
|
| 38 |
+
Traceback (most recent call last):
|
| 39 |
+
File "/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/train.py", line 26, in <module>
|
| 40 |
+
from flowtext_lab.data import (
|
| 41 |
+
File "/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/flowtext_lab/data.py", line 14, in <module>
|
| 42 |
+
from .tokenization import BpeTextTokenizer
|
| 43 |
+
File "/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/flowtext_lab/tokenization.py", line 7, in <module>
|
| 44 |
+
from tokenizers import Tokenizer
|
| 45 |
+
ModuleNotFoundError: No module named 'tokenizers'
|
| 46 |
+
Traceback (most recent call last):
|
| 47 |
+
File "/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/train.py", line 26, in <module>
|
| 48 |
+
from flowtext_lab.data import (
|
| 49 |
+
File "/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/flowtext_lab/data.py", line 14, in <module>
|
| 50 |
+
from .tokenization import BpeTextTokenizer
|
| 51 |
+
File "/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/flowtext_lab/tokenization.py", line 7, in <module>
|
| 52 |
+
from tokenizers import Tokenizer
|
| 53 |
+
ModuleNotFoundError: No module named 'tokenizers'
|
| 54 |
+
Traceback (most recent call last):
|
| 55 |
+
File "/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/train.py", line 26, in <module>
|
| 56 |
+
from flowtext_lab.data import (
|
| 57 |
+
File "/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/flowtext_lab/data.py", line 14, in <module>
|
| 58 |
+
from .tokenization import BpeTextTokenizer
|
| 59 |
+
File "/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/flowtext_lab/tokenization.py", line 7, in <module>
|
| 60 |
+
from tokenizers import Tokenizer
|
| 61 |
+
ModuleNotFoundError: No module named 'tokenizers'
|
| 62 |
+
Traceback (most recent call last):
|
| 63 |
+
File "/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/train.py", line 26, in <module>
|
| 64 |
+
from flowtext_lab.data import (
|
| 65 |
+
File "/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/flowtext_lab/data.py", line 14, in <module>
|
| 66 |
+
from .tokenization import BpeTextTokenizer
|
| 67 |
+
File "/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/flowtext_lab/tokenization.py", line 7, in <module>
|
| 68 |
+
from tokenizers import Tokenizer
|
| 69 |
+
ModuleNotFoundError: No module named 'tokenizers'
|
| 70 |
+
E0518 14:34:10.088000 10364 torch/distributed/elastic/multiprocessing/api.py:984] failed (exitcode: 1) local_rank: 0 (pid: 10430) of binary: /e2e-data/evad-tech-vla/wanghan58/env/my_env/bin/python3
|
| 71 |
+
Traceback (most recent call last):
|
| 72 |
+
File "/e2e-data/evad-tech-vla/wanghan58/env/my_env/bin/torchrun", line 10, in <module>
|
| 73 |
+
sys.exit(main())
|
| 74 |
+
^^^^^^
|
| 75 |
+
File "/e2e-data/evad-tech-vla/wanghan58/env/my_env/lib/python3.12/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 362, in wrapper
|
| 76 |
+
return f(*args, **kwargs)
|
| 77 |
+
^^^^^^^^^^^^^^^^^^
|
| 78 |
+
File "/e2e-data/evad-tech-vla/wanghan58/env/my_env/lib/python3.12/site-packages/torch/distributed/run.py", line 991, in main
|
| 79 |
+
run(args)
|
| 80 |
+
File "/e2e-data/evad-tech-vla/wanghan58/env/my_env/lib/python3.12/site-packages/torch/distributed/run.py", line 982, in run
|
| 81 |
+
elastic_launch(
|
| 82 |
+
File "/e2e-data/evad-tech-vla/wanghan58/env/my_env/lib/python3.12/site-packages/torch/distributed/launcher/api.py", line 170, in __call__
|
| 83 |
+
return launch_agent(self._config, self._entrypoint, list(args))
|
| 84 |
+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 85 |
+
File "/e2e-data/evad-tech-vla/wanghan58/env/my_env/lib/python3.12/site-packages/torch/distributed/launcher/api.py", line 317, in launch_agent
|
| 86 |
+
raise ChildFailedError(
|
| 87 |
+
torch.distributed.elastic.multiprocessing.errors.ChildFailedError:
|
| 88 |
+
============================================================
|
| 89 |
+
train.py FAILED
|
| 90 |
+
------------------------------------------------------------
|
| 91 |
+
Failures:
|
| 92 |
+
[1]:
|
| 93 |
+
time : 2026-05-18_14:34:10
|
| 94 |
+
host : t-20260518223237-whzzv-worker-1.t-20260518223237-whzzv-worker.mlplatform-customtask.svc.cluster.local
|
| 95 |
+
rank : 9 (local_rank: 1)
|
| 96 |
+
exitcode : 1 (pid: 10431)
|
| 97 |
+
error_file: <N/A>
|
| 98 |
+
traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
|
| 99 |
+
[2]:
|
| 100 |
+
time : 2026-05-18_14:34:10
|
| 101 |
+
host : t-20260518223237-whzzv-worker-1.t-20260518223237-whzzv-worker.mlplatform-customtask.svc.cluster.local
|
| 102 |
+
rank : 10 (local_rank: 2)
|
| 103 |
+
exitcode : 1 (pid: 10432)
|
| 104 |
+
error_file: <N/A>
|
| 105 |
+
traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
|
| 106 |
+
[3]:
|
| 107 |
+
time : 2026-05-18_14:34:10
|
| 108 |
+
host : t-20260518223237-whzzv-worker-1.t-20260518223237-whzzv-worker.mlplatform-customtask.svc.cluster.local
|
| 109 |
+
rank : 11 (local_rank: 3)
|
| 110 |
+
exitcode : 1 (pid: 10433)
|
| 111 |
+
error_file: <N/A>
|
| 112 |
+
traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
|
| 113 |
+
[4]:
|
| 114 |
+
time : 2026-05-18_14:34:10
|
| 115 |
+
host : t-20260518223237-whzzv-worker-1.t-20260518223237-whzzv-worker.mlplatform-customtask.svc.cluster.local
|
| 116 |
+
rank : 12 (local_rank: 4)
|
| 117 |
+
exitcode : 1 (pid: 10434)
|
| 118 |
+
error_file: <N/A>
|
| 119 |
+
traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
|
| 120 |
+
[5]:
|
| 121 |
+
time : 2026-05-18_14:34:10
|
| 122 |
+
host : t-20260518223237-whzzv-worker-1.t-20260518223237-whzzv-worker.mlplatform-customtask.svc.cluster.local
|
| 123 |
+
rank : 13 (local_rank: 5)
|
| 124 |
+
exitcode : 1 (pid: 10435)
|
| 125 |
+
error_file: <N/A>
|
| 126 |
+
traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
|
| 127 |
+
[6]:
|
| 128 |
+
time : 2026-05-18_14:34:10
|
| 129 |
+
host : t-20260518223237-whzzv-worker-1.t-20260518223237-whzzv-worker.mlplatform-customtask.svc.cluster.local
|
| 130 |
+
rank : 14 (local_rank: 6)
|
| 131 |
+
exitcode : 1 (pid: 10436)
|
| 132 |
+
error_file: <N/A>
|
| 133 |
+
traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
|
| 134 |
+
[7]:
|
| 135 |
+
time : 2026-05-18_14:34:10
|
| 136 |
+
host : t-20260518223237-whzzv-worker-1.t-20260518223237-whzzv-worker.mlplatform-customtask.svc.cluster.local
|
| 137 |
+
rank : 15 (local_rank: 7)
|
| 138 |
+
exitcode : 1 (pid: 10437)
|
| 139 |
+
error_file: <N/A>
|
| 140 |
+
traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
|
| 141 |
+
------------------------------------------------------------
|
| 142 |
+
Root Cause (first observed failure):
|
| 143 |
+
[0]:
|
| 144 |
+
time : 2026-05-18_14:34:10
|
| 145 |
+
host : t-20260518223237-whzzv-worker-1.t-20260518223237-whzzv-worker.mlplatform-customtask.svc.cluster.local
|
| 146 |
+
rank : 8 (local_rank: 0)
|
| 147 |
+
exitcode : 1 (pid: 10430)
|
| 148 |
+
error_file: <N/A>
|
| 149 |
+
traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
|
| 150 |
+
============================================================
|
LTA_openwebtext_dualt/logs/elfaligned_t5tokenized_2node/lta_owt_t5_adaln_adamw_wd0p1_rollin_p50_randk0_3_uniformt_temp1_synct_gbs512_2node8gpu_1m_t-20260518223426-p5lfm.node0.log
ADDED
|
@@ -0,0 +1,150 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
W0518 14:35:55.740000 10360 torch/distributed/run.py:852]
|
| 2 |
+
W0518 14:35:55.740000 10360 torch/distributed/run.py:852] *****************************************
|
| 3 |
+
W0518 14:35:55.740000 10360 torch/distributed/run.py:852] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed.
|
| 4 |
+
W0518 14:35:55.740000 10360 torch/distributed/run.py:852] *****************************************
|
| 5 |
+
[W518 14:35:56.731399893 socket.cpp:207] [c10d] The hostname of the client socket cannot be retrieved. err=-3
|
| 6 |
+
Traceback (most recent call last):
|
| 7 |
+
Traceback (most recent call last):
|
| 8 |
+
File "/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/train.py", line 26, in <module>
|
| 9 |
+
File "/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/train.py", line 26, in <module>
|
| 10 |
+
from flowtext_lab.data import (from flowtext_lab.data import (
|
| 11 |
+
|
| 12 |
+
File "/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/flowtext_lab/data.py", line 14, in <module>
|
| 13 |
+
File "/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/flowtext_lab/data.py", line 14, in <module>
|
| 14 |
+
from .tokenization import BpeTextTokenizer
|
| 15 |
+
from .tokenization import BpeTextTokenizer
|
| 16 |
+
File "/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/flowtext_lab/tokenization.py", line 7, in <module>
|
| 17 |
+
File "/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/flowtext_lab/tokenization.py", line 7, in <module>
|
| 18 |
+
from tokenizers import Tokenizerfrom tokenizers import Tokenizer
|
| 19 |
+
|
| 20 |
+
ModuleNotFoundErrorModuleNotFoundError: : No module named 'tokenizers'No module named 'tokenizers'
|
| 21 |
+
|
| 22 |
+
Traceback (most recent call last):
|
| 23 |
+
File "/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/train.py", line 26, in <module>
|
| 24 |
+
from flowtext_lab.data import (
|
| 25 |
+
File "/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/flowtext_lab/data.py", line 14, in <module>
|
| 26 |
+
from .tokenization import BpeTextTokenizer
|
| 27 |
+
File "/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/flowtext_lab/tokenization.py", line 7, in <module>
|
| 28 |
+
from tokenizers import Tokenizer
|
| 29 |
+
ModuleNotFoundError: No module named 'tokenizers'
|
| 30 |
+
Traceback (most recent call last):
|
| 31 |
+
File "/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/train.py", line 26, in <module>
|
| 32 |
+
from flowtext_lab.data import (
|
| 33 |
+
File "/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/flowtext_lab/data.py", line 14, in <module>
|
| 34 |
+
from .tokenization import BpeTextTokenizer
|
| 35 |
+
File "/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/flowtext_lab/tokenization.py", line 7, in <module>
|
| 36 |
+
from tokenizers import Tokenizer
|
| 37 |
+
ModuleNotFoundError: No module named 'tokenizers'
|
| 38 |
+
Traceback (most recent call last):
|
| 39 |
+
File "/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/train.py", line 26, in <module>
|
| 40 |
+
from flowtext_lab.data import (
|
| 41 |
+
File "/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/flowtext_lab/data.py", line 14, in <module>
|
| 42 |
+
from .tokenization import BpeTextTokenizer
|
| 43 |
+
File "/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/flowtext_lab/tokenization.py", line 7, in <module>
|
| 44 |
+
from tokenizers import Tokenizer
|
| 45 |
+
ModuleNotFoundError: No module named 'tokenizers'
|
| 46 |
+
Traceback (most recent call last):
|
| 47 |
+
File "/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/train.py", line 26, in <module>
|
| 48 |
+
from flowtext_lab.data import (
|
| 49 |
+
File "/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/flowtext_lab/data.py", line 14, in <module>
|
| 50 |
+
from .tokenization import BpeTextTokenizer
|
| 51 |
+
File "/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/flowtext_lab/tokenization.py", line 7, in <module>
|
| 52 |
+
from tokenizers import Tokenizer
|
| 53 |
+
ModuleNotFoundError: No module named 'tokenizers'
|
| 54 |
+
Traceback (most recent call last):
|
| 55 |
+
File "/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/train.py", line 26, in <module>
|
| 56 |
+
from flowtext_lab.data import (
|
| 57 |
+
File "/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/flowtext_lab/data.py", line 14, in <module>
|
| 58 |
+
from .tokenization import BpeTextTokenizer
|
| 59 |
+
File "/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/flowtext_lab/tokenization.py", line 7, in <module>
|
| 60 |
+
from tokenizers import Tokenizer
|
| 61 |
+
ModuleNotFoundError: No module named 'tokenizers'
|
| 62 |
+
Traceback (most recent call last):
|
| 63 |
+
File "/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/train.py", line 26, in <module>
|
| 64 |
+
from flowtext_lab.data import (
|
| 65 |
+
File "/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/flowtext_lab/data.py", line 14, in <module>
|
| 66 |
+
from .tokenization import BpeTextTokenizer
|
| 67 |
+
File "/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/flowtext_lab/tokenization.py", line 7, in <module>
|
| 68 |
+
from tokenizers import Tokenizer
|
| 69 |
+
ModuleNotFoundError: No module named 'tokenizers'
|
| 70 |
+
E0518 14:35:59.194000 10360 torch/distributed/elastic/multiprocessing/api.py:984] failed (exitcode: 1) local_rank: 0 (pid: 10427) of binary: /e2e-data/evad-tech-vla/wanghan58/env/my_env/bin/python3
|
| 71 |
+
Traceback (most recent call last):
|
| 72 |
+
File "/e2e-data/evad-tech-vla/wanghan58/env/my_env/bin/torchrun", line 10, in <module>
|
| 73 |
+
sys.exit(main())
|
| 74 |
+
^^^^^^
|
| 75 |
+
File "/e2e-data/evad-tech-vla/wanghan58/env/my_env/lib/python3.12/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 362, in wrapper
|
| 76 |
+
return f(*args, **kwargs)
|
| 77 |
+
^^^^^^^^^^^^^^^^^^
|
| 78 |
+
File "/e2e-data/evad-tech-vla/wanghan58/env/my_env/lib/python3.12/site-packages/torch/distributed/run.py", line 991, in main
|
| 79 |
+
run(args)
|
| 80 |
+
File "/e2e-data/evad-tech-vla/wanghan58/env/my_env/lib/python3.12/site-packages/torch/distributed/run.py", line 982, in run
|
| 81 |
+
elastic_launch(
|
| 82 |
+
File "/e2e-data/evad-tech-vla/wanghan58/env/my_env/lib/python3.12/site-packages/torch/distributed/launcher/api.py", line 170, in __call__
|
| 83 |
+
return launch_agent(self._config, self._entrypoint, list(args))
|
| 84 |
+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 85 |
+
File "/e2e-data/evad-tech-vla/wanghan58/env/my_env/lib/python3.12/site-packages/torch/distributed/launcher/api.py", line 317, in launch_agent
|
| 86 |
+
raise ChildFailedError(
|
| 87 |
+
torch.distributed.elastic.multiprocessing.errors.ChildFailedError:
|
| 88 |
+
============================================================
|
| 89 |
+
train.py FAILED
|
| 90 |
+
------------------------------------------------------------
|
| 91 |
+
Failures:
|
| 92 |
+
[1]:
|
| 93 |
+
time : 2026-05-18_14:35:59
|
| 94 |
+
host : t-20260518223426-p5lfm-worker-0.t-20260518223426-p5lfm-worker.mlplatform-customtask.svc.cluster.local
|
| 95 |
+
rank : 1 (local_rank: 1)
|
| 96 |
+
exitcode : 1 (pid: 10428)
|
| 97 |
+
error_file: <N/A>
|
| 98 |
+
traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
|
| 99 |
+
[2]:
|
| 100 |
+
time : 2026-05-18_14:35:59
|
| 101 |
+
host : t-20260518223426-p5lfm-worker-0.t-20260518223426-p5lfm-worker.mlplatform-customtask.svc.cluster.local
|
| 102 |
+
rank : 2 (local_rank: 2)
|
| 103 |
+
exitcode : 1 (pid: 10429)
|
| 104 |
+
error_file: <N/A>
|
| 105 |
+
traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
|
| 106 |
+
[3]:
|
| 107 |
+
time : 2026-05-18_14:35:59
|
| 108 |
+
host : t-20260518223426-p5lfm-worker-0.t-20260518223426-p5lfm-worker.mlplatform-customtask.svc.cluster.local
|
| 109 |
+
rank : 3 (local_rank: 3)
|
| 110 |
+
exitcode : 1 (pid: 10430)
|
| 111 |
+
error_file: <N/A>
|
| 112 |
+
traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
|
| 113 |
+
[4]:
|
| 114 |
+
time : 2026-05-18_14:35:59
|
| 115 |
+
host : t-20260518223426-p5lfm-worker-0.t-20260518223426-p5lfm-worker.mlplatform-customtask.svc.cluster.local
|
| 116 |
+
rank : 4 (local_rank: 4)
|
| 117 |
+
exitcode : 1 (pid: 10431)
|
| 118 |
+
error_file: <N/A>
|
| 119 |
+
traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
|
| 120 |
+
[5]:
|
| 121 |
+
time : 2026-05-18_14:35:59
|
| 122 |
+
host : t-20260518223426-p5lfm-worker-0.t-20260518223426-p5lfm-worker.mlplatform-customtask.svc.cluster.local
|
| 123 |
+
rank : 5 (local_rank: 5)
|
| 124 |
+
exitcode : 1 (pid: 10432)
|
| 125 |
+
error_file: <N/A>
|
| 126 |
+
traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
|
| 127 |
+
[6]:
|
| 128 |
+
time : 2026-05-18_14:35:59
|
| 129 |
+
host : t-20260518223426-p5lfm-worker-0.t-20260518223426-p5lfm-worker.mlplatform-customtask.svc.cluster.local
|
| 130 |
+
rank : 6 (local_rank: 6)
|
| 131 |
+
exitcode : 1 (pid: 10433)
|
| 132 |
+
error_file: <N/A>
|
| 133 |
+
traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
|
| 134 |
+
[7]:
|
| 135 |
+
time : 2026-05-18_14:35:59
|
| 136 |
+
host : t-20260518223426-p5lfm-worker-0.t-20260518223426-p5lfm-worker.mlplatform-customtask.svc.cluster.local
|
| 137 |
+
rank : 7 (local_rank: 7)
|
| 138 |
+
exitcode : 1 (pid: 10434)
|
| 139 |
+
error_file: <N/A>
|
| 140 |
+
traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
|
| 141 |
+
------------------------------------------------------------
|
| 142 |
+
Root Cause (first observed failure):
|
| 143 |
+
[0]:
|
| 144 |
+
time : 2026-05-18_14:35:59
|
| 145 |
+
host : t-20260518223426-p5lfm-worker-0.t-20260518223426-p5lfm-worker.mlplatform-customtask.svc.cluster.local
|
| 146 |
+
rank : 0 (local_rank: 0)
|
| 147 |
+
exitcode : 1 (pid: 10427)
|
| 148 |
+
error_file: <N/A>
|
| 149 |
+
traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
|
| 150 |
+
============================================================
|
LTA_openwebtext_dualt/logs/elfaligned_t5tokenized_2node/lta_owt_t5_adaln_adamw_wd0p1_rollin_p50_randk0_3_uniformt_temp1_synct_gbs512_2node8gpu_1m_t-20260518223426-p5lfm.node1.log
ADDED
|
@@ -0,0 +1,150 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
W0518 14:35:54.956000 10364 torch/distributed/run.py:852]
|
| 2 |
+
W0518 14:35:54.956000 10364 torch/distributed/run.py:852] *****************************************
|
| 3 |
+
W0518 14:35:54.956000 10364 torch/distributed/run.py:852] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed.
|
| 4 |
+
W0518 14:35:54.956000 10364 torch/distributed/run.py:852] *****************************************
|
| 5 |
+
[W518 14:35:56.321557278 socket.cpp:207] [c10d] The hostname of the client socket cannot be retrieved. err=-3
|
| 6 |
+
Traceback (most recent call last):
|
| 7 |
+
File "/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/train.py", line 26, in <module>
|
| 8 |
+
from flowtext_lab.data import (
|
| 9 |
+
File "/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/flowtext_lab/data.py", line 14, in <module>
|
| 10 |
+
from .tokenization import BpeTextTokenizer
|
| 11 |
+
File "/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/flowtext_lab/tokenization.py", line 7, in <module>
|
| 12 |
+
from tokenizers import Tokenizer
|
| 13 |
+
ModuleNotFoundError: No module named 'tokenizers'
|
| 14 |
+
Traceback (most recent call last):
|
| 15 |
+
File "/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/train.py", line 26, in <module>
|
| 16 |
+
from flowtext_lab.data import (
|
| 17 |
+
File "/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/flowtext_lab/data.py", line 14, in <module>
|
| 18 |
+
from .tokenization import BpeTextTokenizer
|
| 19 |
+
Traceback (most recent call last):
|
| 20 |
+
File "/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/flowtext_lab/tokenization.py", line 7, in <module>
|
| 21 |
+
File "/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/train.py", line 26, in <module>
|
| 22 |
+
from tokenizers import Tokenizer
|
| 23 |
+
ModuleNotFoundError: No module named 'tokenizers'
|
| 24 |
+
from flowtext_lab.data import (
|
| 25 |
+
File "/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/flowtext_lab/data.py", line 14, in <module>
|
| 26 |
+
from .tokenization import BpeTextTokenizer
|
| 27 |
+
File "/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/flowtext_lab/tokenization.py", line 7, in <module>
|
| 28 |
+
from tokenizers import Tokenizer
|
| 29 |
+
ModuleNotFoundError: No module named 'tokenizers'
|
| 30 |
+
Traceback (most recent call last):
|
| 31 |
+
File "/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/train.py", line 26, in <module>
|
| 32 |
+
from flowtext_lab.data import (
|
| 33 |
+
File "/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/flowtext_lab/data.py", line 14, in <module>
|
| 34 |
+
Traceback (most recent call last):
|
| 35 |
+
File "/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/train.py", line 26, in <module>
|
| 36 |
+
from .tokenization import BpeTextTokenizer
|
| 37 |
+
File "/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/flowtext_lab/tokenization.py", line 7, in <module>
|
| 38 |
+
from tokenizers import Tokenizerfrom flowtext_lab.data import (
|
| 39 |
+
|
| 40 |
+
File "/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/flowtext_lab/data.py", line 14, in <module>
|
| 41 |
+
ModuleNotFoundError: No module named 'tokenizers'
|
| 42 |
+
from .tokenization import BpeTextTokenizer
|
| 43 |
+
File "/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/flowtext_lab/tokenization.py", line 7, in <module>
|
| 44 |
+
from tokenizers import Tokenizer
|
| 45 |
+
ModuleNotFoundError: No module named 'tokenizers'
|
| 46 |
+
Traceback (most recent call last):
|
| 47 |
+
File "/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/train.py", line 26, in <module>
|
| 48 |
+
from flowtext_lab.data import (
|
| 49 |
+
File "/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/flowtext_lab/data.py", line 14, in <module>
|
| 50 |
+
from .tokenization import BpeTextTokenizer
|
| 51 |
+
File "/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/flowtext_lab/tokenization.py", line 7, in <module>
|
| 52 |
+
from tokenizers import Tokenizer
|
| 53 |
+
ModuleNotFoundError: No module named 'tokenizers'
|
| 54 |
+
Traceback (most recent call last):
|
| 55 |
+
File "/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/train.py", line 26, in <module>
|
| 56 |
+
from flowtext_lab.data import (
|
| 57 |
+
File "/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/flowtext_lab/data.py", line 14, in <module>
|
| 58 |
+
from .tokenization import BpeTextTokenizer
|
| 59 |
+
File "/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/flowtext_lab/tokenization.py", line 7, in <module>
|
| 60 |
+
from tokenizers import Tokenizer
|
| 61 |
+
ModuleNotFoundError: No module named 'tokenizers'
|
| 62 |
+
Traceback (most recent call last):
|
| 63 |
+
File "/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/train.py", line 26, in <module>
|
| 64 |
+
from flowtext_lab.data import (
|
| 65 |
+
File "/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/flowtext_lab/data.py", line 14, in <module>
|
| 66 |
+
from .tokenization import BpeTextTokenizer
|
| 67 |
+
File "/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/flowtext_lab/tokenization.py", line 7, in <module>
|
| 68 |
+
from tokenizers import Tokenizer
|
| 69 |
+
ModuleNotFoundError: No module named 'tokenizers'
|
| 70 |
+
E0518 14:35:59.194000 10364 torch/distributed/elastic/multiprocessing/api.py:984] failed (exitcode: 1) local_rank: 0 (pid: 10430) of binary: /e2e-data/evad-tech-vla/wanghan58/env/my_env/bin/python3
|
| 71 |
+
Traceback (most recent call last):
|
| 72 |
+
File "/e2e-data/evad-tech-vla/wanghan58/env/my_env/bin/torchrun", line 10, in <module>
|
| 73 |
+
sys.exit(main())
|
| 74 |
+
^^^^^^
|
| 75 |
+
File "/e2e-data/evad-tech-vla/wanghan58/env/my_env/lib/python3.12/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 362, in wrapper
|
| 76 |
+
return f(*args, **kwargs)
|
| 77 |
+
^^^^^^^^^^^^^^^^^^
|
| 78 |
+
File "/e2e-data/evad-tech-vla/wanghan58/env/my_env/lib/python3.12/site-packages/torch/distributed/run.py", line 991, in main
|
| 79 |
+
run(args)
|
| 80 |
+
File "/e2e-data/evad-tech-vla/wanghan58/env/my_env/lib/python3.12/site-packages/torch/distributed/run.py", line 982, in run
|
| 81 |
+
elastic_launch(
|
| 82 |
+
File "/e2e-data/evad-tech-vla/wanghan58/env/my_env/lib/python3.12/site-packages/torch/distributed/launcher/api.py", line 170, in __call__
|
| 83 |
+
return launch_agent(self._config, self._entrypoint, list(args))
|
| 84 |
+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 85 |
+
File "/e2e-data/evad-tech-vla/wanghan58/env/my_env/lib/python3.12/site-packages/torch/distributed/launcher/api.py", line 317, in launch_agent
|
| 86 |
+
raise ChildFailedError(
|
| 87 |
+
torch.distributed.elastic.multiprocessing.errors.ChildFailedError:
|
| 88 |
+
============================================================
|
| 89 |
+
train.py FAILED
|
| 90 |
+
------------------------------------------------------------
|
| 91 |
+
Failures:
|
| 92 |
+
[1]:
|
| 93 |
+
time : 2026-05-18_14:35:59
|
| 94 |
+
host : t-20260518223426-p5lfm-worker-1.t-20260518223426-p5lfm-worker.mlplatform-customtask.svc.cluster.local
|
| 95 |
+
rank : 9 (local_rank: 1)
|
| 96 |
+
exitcode : 1 (pid: 10431)
|
| 97 |
+
error_file: <N/A>
|
| 98 |
+
traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
|
| 99 |
+
[2]:
|
| 100 |
+
time : 2026-05-18_14:35:59
|
| 101 |
+
host : t-20260518223426-p5lfm-worker-1.t-20260518223426-p5lfm-worker.mlplatform-customtask.svc.cluster.local
|
| 102 |
+
rank : 10 (local_rank: 2)
|
| 103 |
+
exitcode : 1 (pid: 10432)
|
| 104 |
+
error_file: <N/A>
|
| 105 |
+
traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
|
| 106 |
+
[3]:
|
| 107 |
+
time : 2026-05-18_14:35:59
|
| 108 |
+
host : t-20260518223426-p5lfm-worker-1.t-20260518223426-p5lfm-worker.mlplatform-customtask.svc.cluster.local
|
| 109 |
+
rank : 11 (local_rank: 3)
|
| 110 |
+
exitcode : 1 (pid: 10433)
|
| 111 |
+
error_file: <N/A>
|
| 112 |
+
traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
|
| 113 |
+
[4]:
|
| 114 |
+
time : 2026-05-18_14:35:59
|
| 115 |
+
host : t-20260518223426-p5lfm-worker-1.t-20260518223426-p5lfm-worker.mlplatform-customtask.svc.cluster.local
|
| 116 |
+
rank : 12 (local_rank: 4)
|
| 117 |
+
exitcode : 1 (pid: 10434)
|
| 118 |
+
error_file: <N/A>
|
| 119 |
+
traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
|
| 120 |
+
[5]:
|
| 121 |
+
time : 2026-05-18_14:35:59
|
| 122 |
+
host : t-20260518223426-p5lfm-worker-1.t-20260518223426-p5lfm-worker.mlplatform-customtask.svc.cluster.local
|
| 123 |
+
rank : 13 (local_rank: 5)
|
| 124 |
+
exitcode : 1 (pid: 10435)
|
| 125 |
+
error_file: <N/A>
|
| 126 |
+
traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
|
| 127 |
+
[6]:
|
| 128 |
+
time : 2026-05-18_14:35:59
|
| 129 |
+
host : t-20260518223426-p5lfm-worker-1.t-20260518223426-p5lfm-worker.mlplatform-customtask.svc.cluster.local
|
| 130 |
+
rank : 14 (local_rank: 6)
|
| 131 |
+
exitcode : 1 (pid: 10436)
|
| 132 |
+
error_file: <N/A>
|
| 133 |
+
traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
|
| 134 |
+
[7]:
|
| 135 |
+
time : 2026-05-18_14:35:59
|
| 136 |
+
host : t-20260518223426-p5lfm-worker-1.t-20260518223426-p5lfm-worker.mlplatform-customtask.svc.cluster.local
|
| 137 |
+
rank : 15 (local_rank: 7)
|
| 138 |
+
exitcode : 1 (pid: 10437)
|
| 139 |
+
error_file: <N/A>
|
| 140 |
+
traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
|
| 141 |
+
------------------------------------------------------------
|
| 142 |
+
Root Cause (first observed failure):
|
| 143 |
+
[0]:
|
| 144 |
+
time : 2026-05-18_14:35:59
|
| 145 |
+
host : t-20260518223426-p5lfm-worker-1.t-20260518223426-p5lfm-worker.mlplatform-customtask.svc.cluster.local
|
| 146 |
+
rank : 8 (local_rank: 0)
|
| 147 |
+
exitcode : 1 (pid: 10430)
|
| 148 |
+
error_file: <N/A>
|
| 149 |
+
traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
|
| 150 |
+
============================================================
|
LTA_openwebtext_dualt/logs/elfaligned_t5tokenized_2node/lta_owt_t5_adaln_adamw_wd0p1_rollin_p50_randk0_3_uniformt_temp1_synct_gbs512_2node8gpu_1m_t-20260518223615-7lx5q.node0.log
ADDED
|
@@ -0,0 +1,150 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
W0518 14:37:44.239000 10360 torch/distributed/run.py:852]
|
| 2 |
+
W0518 14:37:44.239000 10360 torch/distributed/run.py:852] *****************************************
|
| 3 |
+
W0518 14:37:44.239000 10360 torch/distributed/run.py:852] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed.
|
| 4 |
+
W0518 14:37:44.239000 10360 torch/distributed/run.py:852] *****************************************
|
| 5 |
+
[W518 14:37:45.596957931 socket.cpp:207] [c10d] The hostname of the client socket cannot be retrieved. err=-3
|
| 6 |
+
Traceback (most recent call last):
|
| 7 |
+
File "/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/train.py", line 26, in <module>
|
| 8 |
+
Traceback (most recent call last):
|
| 9 |
+
File "/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/train.py", line 26, in <module>
|
| 10 |
+
from flowtext_lab.data import (
|
| 11 |
+
File "/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/flowtext_lab/data.py", line 14, in <module>
|
| 12 |
+
from flowtext_lab.data import (
|
| 13 |
+
File "/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/flowtext_lab/data.py", line 14, in <module>
|
| 14 |
+
from .tokenization import BpeTextTokenizerfrom .tokenization import BpeTextTokenizer
|
| 15 |
+
|
| 16 |
+
File "/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/flowtext_lab/tokenization.py", line 7, in <module>
|
| 17 |
+
File "/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/flowtext_lab/tokenization.py", line 7, in <module>
|
| 18 |
+
from tokenizers import Tokenizer
|
| 19 |
+
from tokenizers import Tokenizer
|
| 20 |
+
ModuleNotFoundError: No module named 'tokenizers'
|
| 21 |
+
ModuleNotFoundError: No module named 'tokenizers'
|
| 22 |
+
Traceback (most recent call last):
|
| 23 |
+
File "/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/train.py", line 26, in <module>
|
| 24 |
+
from flowtext_lab.data import (
|
| 25 |
+
File "/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/flowtext_lab/data.py", line 14, in <module>
|
| 26 |
+
from .tokenization import BpeTextTokenizer
|
| 27 |
+
File "/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/flowtext_lab/tokenization.py", line 7, in <module>
|
| 28 |
+
from tokenizers import Tokenizer
|
| 29 |
+
ModuleNotFoundError: No module named 'tokenizers'
|
| 30 |
+
Traceback (most recent call last):
|
| 31 |
+
File "/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/train.py", line 26, in <module>
|
| 32 |
+
from flowtext_lab.data import (
|
| 33 |
+
File "/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/flowtext_lab/data.py", line 14, in <module>
|
| 34 |
+
from .tokenization import BpeTextTokenizer
|
| 35 |
+
Traceback (most recent call last):
|
| 36 |
+
File "/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/train.py", line 26, in <module>
|
| 37 |
+
File "/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/flowtext_lab/tokenization.py", line 7, in <module>
|
| 38 |
+
from tokenizers import Tokenizer
|
| 39 |
+
ModuleNotFoundError: No module named 'tokenizers'
|
| 40 |
+
from flowtext_lab.data import (
|
| 41 |
+
File "/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/flowtext_lab/data.py", line 14, in <module>
|
| 42 |
+
from .tokenization import BpeTextTokenizer
|
| 43 |
+
File "/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/flowtext_lab/tokenization.py", line 7, in <module>
|
| 44 |
+
from tokenizers import Tokenizer
|
| 45 |
+
ModuleNotFoundError: No module named 'tokenizers'
|
| 46 |
+
Traceback (most recent call last):
|
| 47 |
+
File "/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/train.py", line 26, in <module>
|
| 48 |
+
from flowtext_lab.data import (
|
| 49 |
+
File "/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/flowtext_lab/data.py", line 14, in <module>
|
| 50 |
+
from .tokenization import BpeTextTokenizer
|
| 51 |
+
File "/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/flowtext_lab/tokenization.py", line 7, in <module>
|
| 52 |
+
from tokenizers import Tokenizer
|
| 53 |
+
ModuleNotFoundError: No module named 'tokenizers'
|
| 54 |
+
Traceback (most recent call last):
|
| 55 |
+
File "/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/train.py", line 26, in <module>
|
| 56 |
+
from flowtext_lab.data import (
|
| 57 |
+
File "/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/flowtext_lab/data.py", line 14, in <module>
|
| 58 |
+
from .tokenization import BpeTextTokenizer
|
| 59 |
+
File "/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/flowtext_lab/tokenization.py", line 7, in <module>
|
| 60 |
+
from tokenizers import Tokenizer
|
| 61 |
+
ModuleNotFoundError: No module named 'tokenizers'
|
| 62 |
+
Traceback (most recent call last):
|
| 63 |
+
File "/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/train.py", line 26, in <module>
|
| 64 |
+
from flowtext_lab.data import (
|
| 65 |
+
File "/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/flowtext_lab/data.py", line 14, in <module>
|
| 66 |
+
from .tokenization import BpeTextTokenizer
|
| 67 |
+
File "/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/flowtext_lab/tokenization.py", line 7, in <module>
|
| 68 |
+
from tokenizers import Tokenizer
|
| 69 |
+
ModuleNotFoundError: No module named 'tokenizers'
|
| 70 |
+
E0518 14:37:47.717000 10360 torch/distributed/elastic/multiprocessing/api.py:984] failed (exitcode: 1) local_rank: 0 (pid: 10427) of binary: /e2e-data/evad-tech-vla/wanghan58/env/my_env/bin/python3
|
| 71 |
+
Traceback (most recent call last):
|
| 72 |
+
File "/e2e-data/evad-tech-vla/wanghan58/env/my_env/bin/torchrun", line 10, in <module>
|
| 73 |
+
sys.exit(main())
|
| 74 |
+
^^^^^^
|
| 75 |
+
File "/e2e-data/evad-tech-vla/wanghan58/env/my_env/lib/python3.12/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 362, in wrapper
|
| 76 |
+
return f(*args, **kwargs)
|
| 77 |
+
^^^^^^^^^^^^^^^^^^
|
| 78 |
+
File "/e2e-data/evad-tech-vla/wanghan58/env/my_env/lib/python3.12/site-packages/torch/distributed/run.py", line 991, in main
|
| 79 |
+
run(args)
|
| 80 |
+
File "/e2e-data/evad-tech-vla/wanghan58/env/my_env/lib/python3.12/site-packages/torch/distributed/run.py", line 982, in run
|
| 81 |
+
elastic_launch(
|
| 82 |
+
File "/e2e-data/evad-tech-vla/wanghan58/env/my_env/lib/python3.12/site-packages/torch/distributed/launcher/api.py", line 170, in __call__
|
| 83 |
+
return launch_agent(self._config, self._entrypoint, list(args))
|
| 84 |
+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 85 |
+
File "/e2e-data/evad-tech-vla/wanghan58/env/my_env/lib/python3.12/site-packages/torch/distributed/launcher/api.py", line 317, in launch_agent
|
| 86 |
+
raise ChildFailedError(
|
| 87 |
+
torch.distributed.elastic.multiprocessing.errors.ChildFailedError:
|
| 88 |
+
============================================================
|
| 89 |
+
train.py FAILED
|
| 90 |
+
------------------------------------------------------------
|
| 91 |
+
Failures:
|
| 92 |
+
[1]:
|
| 93 |
+
time : 2026-05-18_14:37:47
|
| 94 |
+
host : t-20260518223615-7lx5q-worker-0.t-20260518223615-7lx5q-worker.mlplatform-customtask.svc.cluster.local
|
| 95 |
+
rank : 1 (local_rank: 1)
|
| 96 |
+
exitcode : 1 (pid: 10428)
|
| 97 |
+
error_file: <N/A>
|
| 98 |
+
traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
|
| 99 |
+
[2]:
|
| 100 |
+
time : 2026-05-18_14:37:47
|
| 101 |
+
host : t-20260518223615-7lx5q-worker-0.t-20260518223615-7lx5q-worker.mlplatform-customtask.svc.cluster.local
|
| 102 |
+
rank : 2 (local_rank: 2)
|
| 103 |
+
exitcode : 1 (pid: 10429)
|
| 104 |
+
error_file: <N/A>
|
| 105 |
+
traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
|
| 106 |
+
[3]:
|
| 107 |
+
time : 2026-05-18_14:37:47
|
| 108 |
+
host : t-20260518223615-7lx5q-worker-0.t-20260518223615-7lx5q-worker.mlplatform-customtask.svc.cluster.local
|
| 109 |
+
rank : 3 (local_rank: 3)
|
| 110 |
+
exitcode : 1 (pid: 10430)
|
| 111 |
+
error_file: <N/A>
|
| 112 |
+
traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
|
| 113 |
+
[4]:
|
| 114 |
+
time : 2026-05-18_14:37:47
|
| 115 |
+
host : t-20260518223615-7lx5q-worker-0.t-20260518223615-7lx5q-worker.mlplatform-customtask.svc.cluster.local
|
| 116 |
+
rank : 4 (local_rank: 4)
|
| 117 |
+
exitcode : 1 (pid: 10431)
|
| 118 |
+
error_file: <N/A>
|
| 119 |
+
traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
|
| 120 |
+
[5]:
|
| 121 |
+
time : 2026-05-18_14:37:47
|
| 122 |
+
host : t-20260518223615-7lx5q-worker-0.t-20260518223615-7lx5q-worker.mlplatform-customtask.svc.cluster.local
|
| 123 |
+
rank : 5 (local_rank: 5)
|
| 124 |
+
exitcode : 1 (pid: 10432)
|
| 125 |
+
error_file: <N/A>
|
| 126 |
+
traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
|
| 127 |
+
[6]:
|
| 128 |
+
time : 2026-05-18_14:37:47
|
| 129 |
+
host : t-20260518223615-7lx5q-worker-0.t-20260518223615-7lx5q-worker.mlplatform-customtask.svc.cluster.local
|
| 130 |
+
rank : 6 (local_rank: 6)
|
| 131 |
+
exitcode : 1 (pid: 10433)
|
| 132 |
+
error_file: <N/A>
|
| 133 |
+
traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
|
| 134 |
+
[7]:
|
| 135 |
+
time : 2026-05-18_14:37:47
|
| 136 |
+
host : t-20260518223615-7lx5q-worker-0.t-20260518223615-7lx5q-worker.mlplatform-customtask.svc.cluster.local
|
| 137 |
+
rank : 7 (local_rank: 7)
|
| 138 |
+
exitcode : 1 (pid: 10434)
|
| 139 |
+
error_file: <N/A>
|
| 140 |
+
traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
|
| 141 |
+
------------------------------------------------------------
|
| 142 |
+
Root Cause (first observed failure):
|
| 143 |
+
[0]:
|
| 144 |
+
time : 2026-05-18_14:37:47
|
| 145 |
+
host : t-20260518223615-7lx5q-worker-0.t-20260518223615-7lx5q-worker.mlplatform-customtask.svc.cluster.local
|
| 146 |
+
rank : 0 (local_rank: 0)
|
| 147 |
+
exitcode : 1 (pid: 10427)
|
| 148 |
+
error_file: <N/A>
|
| 149 |
+
traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
|
| 150 |
+
============================================================
|
LTA_openwebtext_dualt/logs/elfaligned_t5tokenized_2node/lta_owt_t5_adaln_adamw_wd0p1_rollin_p50_randk0_3_uniformt_temp1_synct_gbs512_2node8gpu_1m_t-20260518223615-7lx5q.node1.log
ADDED
|
@@ -0,0 +1,151 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
W0518 14:37:45.329000 10364 torch/distributed/run.py:852]
|
| 2 |
+
W0518 14:37:45.329000 10364 torch/distributed/run.py:852] *****************************************
|
| 3 |
+
W0518 14:37:45.329000 10364 torch/distributed/run.py:852] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed.
|
| 4 |
+
W0518 14:37:45.329000 10364 torch/distributed/run.py:852] *****************************************
|
| 5 |
+
[W518 14:37:46.005851595 socket.cpp:207] [c10d] The hostname of the client socket cannot be retrieved. err=-3
|
| 6 |
+
Traceback (most recent call last):
|
| 7 |
+
File "/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/train.py", line 26, in <module>
|
| 8 |
+
from flowtext_lab.data import (
|
| 9 |
+
File "/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/flowtext_lab/data.py", line 14, in <module>
|
| 10 |
+
from .tokenization import BpeTextTokenizer
|
| 11 |
+
File "/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/flowtext_lab/tokenization.py", line 7, in <module>
|
| 12 |
+
from tokenizers import Tokenizer
|
| 13 |
+
ModuleNotFoundError: No module named 'tokenizers'
|
| 14 |
+
Traceback (most recent call last):
|
| 15 |
+
File "/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/train.py", line 26, in <module>
|
| 16 |
+
from flowtext_lab.data import (
|
| 17 |
+
File "/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/flowtext_lab/data.py", line 14, in <module>
|
| 18 |
+
from .tokenization import BpeTextTokenizer
|
| 19 |
+
File "/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/flowtext_lab/tokenization.py", line 7, in <module>
|
| 20 |
+
from tokenizers import Tokenizer
|
| 21 |
+
ModuleNotFoundError: No module named 'tokenizers'
|
| 22 |
+
Traceback (most recent call last):
|
| 23 |
+
File "/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/train.py", line 26, in <module>
|
| 24 |
+
from flowtext_lab.data import (
|
| 25 |
+
File "/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/flowtext_lab/data.py", line 14, in <module>
|
| 26 |
+
from .tokenization import BpeTextTokenizer
|
| 27 |
+
File "/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/flowtext_lab/tokenization.py", line 7, in <module>
|
| 28 |
+
from tokenizers import Tokenizer
|
| 29 |
+
ModuleNotFoundError: No module named 'tokenizers'
|
| 30 |
+
Traceback (most recent call last):
|
| 31 |
+
File "/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/train.py", line 26, in <module>
|
| 32 |
+
from flowtext_lab.data import (
|
| 33 |
+
File "/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/flowtext_lab/data.py", line 14, in <module>
|
| 34 |
+
from .tokenization import BpeTextTokenizer
|
| 35 |
+
File "/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/flowtext_lab/tokenization.py", line 7, in <module>
|
| 36 |
+
from tokenizers import Tokenizer
|
| 37 |
+
ModuleNotFoundError: No module named 'tokenizers'
|
| 38 |
+
Traceback (most recent call last):
|
| 39 |
+
File "/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/train.py", line 26, in <module>
|
| 40 |
+
from flowtext_lab.data import (
|
| 41 |
+
File "/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/flowtext_lab/data.py", line 14, in <module>
|
| 42 |
+
from .tokenization import BpeTextTokenizer
|
| 43 |
+
File "/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/flowtext_lab/tokenization.py", line 7, in <module>
|
| 44 |
+
from tokenizers import Tokenizer
|
| 45 |
+
ModuleNotFoundError: No module named 'tokenizers'
|
| 46 |
+
Traceback (most recent call last):
|
| 47 |
+
File "/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/train.py", line 26, in <module>
|
| 48 |
+
from flowtext_lab.data import (
|
| 49 |
+
File "/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/flowtext_lab/data.py", line 14, in <module>
|
| 50 |
+
from .tokenization import BpeTextTokenizer
|
| 51 |
+
File "/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/flowtext_lab/tokenization.py", line 7, in <module>
|
| 52 |
+
from tokenizers import Tokenizer
|
| 53 |
+
ModuleNotFoundError: No module named 'tokenizers'
|
| 54 |
+
Traceback (most recent call last):
|
| 55 |
+
File "/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/train.py", line 26, in <module>
|
| 56 |
+
from flowtext_lab.data import (
|
| 57 |
+
File "/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/flowtext_lab/data.py", line 14, in <module>
|
| 58 |
+
from .tokenization import BpeTextTokenizer
|
| 59 |
+
File "/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/flowtext_lab/tokenization.py", line 7, in <module>
|
| 60 |
+
Traceback (most recent call last):
|
| 61 |
+
File "/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/train.py", line 26, in <module>
|
| 62 |
+
from tokenizers import Tokenizer
|
| 63 |
+
ModuleNotFoundError: No module named 'tokenizers'
|
| 64 |
+
from flowtext_lab.data import (
|
| 65 |
+
File "/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/flowtext_lab/data.py", line 14, in <module>
|
| 66 |
+
from .tokenization import BpeTextTokenizer
|
| 67 |
+
File "/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/flowtext_lab/tokenization.py", line 7, in <module>
|
| 68 |
+
from tokenizers import Tokenizer
|
| 69 |
+
ModuleNotFoundError: No module named 'tokenizers'
|
| 70 |
+
W0518 14:37:47.718000 10364 torch/distributed/elastic/multiprocessing/api.py:1010] Sending process 10432 closing signal SIGTERM
|
| 71 |
+
E0518 14:37:47.720000 10364 torch/distributed/elastic/multiprocessing/api.py:984] failed (exitcode: 1) local_rank: 0 (pid: 10430) of binary: /e2e-data/evad-tech-vla/wanghan58/env/my_env/bin/python3
|
| 72 |
+
Traceback (most recent call last):
|
| 73 |
+
File "/e2e-data/evad-tech-vla/wanghan58/env/my_env/bin/torchrun", line 10, in <module>
|
| 74 |
+
sys.exit(main())
|
| 75 |
+
^^^^^^
|
| 76 |
+
File "/e2e-data/evad-tech-vla/wanghan58/env/my_env/lib/python3.12/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 362, in wrapper
|
| 77 |
+
return f(*args, **kwargs)
|
| 78 |
+
^^^^^^^^^^^^^^^^^^
|
| 79 |
+
File "/e2e-data/evad-tech-vla/wanghan58/env/my_env/lib/python3.12/site-packages/torch/distributed/run.py", line 991, in main
|
| 80 |
+
run(args)
|
| 81 |
+
File "/e2e-data/evad-tech-vla/wanghan58/env/my_env/lib/python3.12/site-packages/torch/distributed/run.py", line 982, in run
|
| 82 |
+
elastic_launch(
|
| 83 |
+
File "/e2e-data/evad-tech-vla/wanghan58/env/my_env/lib/python3.12/site-packages/torch/distributed/launcher/api.py", line 170, in __call__
|
| 84 |
+
return launch_agent(self._config, self._entrypoint, list(args))
|
| 85 |
+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 86 |
+
File "/e2e-data/evad-tech-vla/wanghan58/env/my_env/lib/python3.12/site-packages/torch/distributed/launcher/api.py", line 317, in launch_agent
|
| 87 |
+
raise ChildFailedError(
|
| 88 |
+
torch.distributed.elastic.multiprocessing.errors.ChildFailedError:
|
| 89 |
+
============================================================
|
| 90 |
+
train.py FAILED
|
| 91 |
+
------------------------------------------------------------
|
| 92 |
+
Failures:
|
| 93 |
+
[1]:
|
| 94 |
+
time : 2026-05-18_14:37:47
|
| 95 |
+
host : t-20260518223615-7lx5q-worker-1.t-20260518223615-7lx5q-worker.mlplatform-customtask.svc.cluster.local
|
| 96 |
+
rank : 9 (local_rank: 1)
|
| 97 |
+
exitcode : 1 (pid: 10431)
|
| 98 |
+
error_file: <N/A>
|
| 99 |
+
traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
|
| 100 |
+
[2]:
|
| 101 |
+
time : 2026-05-18_14:37:47
|
| 102 |
+
host : t-20260518223615-7lx5q-worker-1.t-20260518223615-7lx5q-worker.mlplatform-customtask.svc.cluster.local
|
| 103 |
+
rank : 11 (local_rank: 3)
|
| 104 |
+
exitcode : 1 (pid: 10433)
|
| 105 |
+
error_file: <N/A>
|
| 106 |
+
traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
|
| 107 |
+
[3]:
|
| 108 |
+
time : 2026-05-18_14:37:47
|
| 109 |
+
host : t-20260518223615-7lx5q-worker-1.t-20260518223615-7lx5q-worker.mlplatform-customtask.svc.cluster.local
|
| 110 |
+
rank : 12 (local_rank: 4)
|
| 111 |
+
exitcode : 1 (pid: 10434)
|
| 112 |
+
error_file: <N/A>
|
| 113 |
+
traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
|
| 114 |
+
[4]:
|
| 115 |
+
time : 2026-05-18_14:37:47
|
| 116 |
+
host : t-20260518223615-7lx5q-worker-1.t-20260518223615-7lx5q-worker.mlplatform-customtask.svc.cluster.local
|
| 117 |
+
rank : 13 (local_rank: 5)
|
| 118 |
+
exitcode : 1 (pid: 10435)
|
| 119 |
+
error_file: <N/A>
|
| 120 |
+
traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
|
| 121 |
+
[5]:
|
| 122 |
+
time : 2026-05-18_14:37:47
|
| 123 |
+
host : t-20260518223615-7lx5q-worker-1.t-20260518223615-7lx5q-worker.mlplatform-customtask.svc.cluster.local
|
| 124 |
+
rank : 14 (local_rank: 6)
|
| 125 |
+
exitcode : 1 (pid: 10436)
|
| 126 |
+
error_file: <N/A>
|
| 127 |
+
traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
|
| 128 |
+
[6]:
|
| 129 |
+
time : 2026-05-18_14:37:47
|
| 130 |
+
host : t-20260518223615-7lx5q-worker-1.t-20260518223615-7lx5q-worker.mlplatform-customtask.svc.cluster.local
|
| 131 |
+
rank : 15 (local_rank: 7)
|
| 132 |
+
exitcode : 1 (pid: 10437)
|
| 133 |
+
error_file: <N/A>
|
| 134 |
+
traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
|
| 135 |
+
[7]:
|
| 136 |
+
time : 2026-05-18_14:37:47
|
| 137 |
+
host : t-20260518223615-7lx5q-worker-1.t-20260518223615-7lx5q-worker.mlplatform-customtask.svc.cluster.local
|
| 138 |
+
rank : 10 (local_rank: 2)
|
| 139 |
+
exitcode : 1 (pid: 10432)
|
| 140 |
+
error_file: <N/A>
|
| 141 |
+
traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
|
| 142 |
+
------------------------------------------------------------
|
| 143 |
+
Root Cause (first observed failure):
|
| 144 |
+
[0]:
|
| 145 |
+
time : 2026-05-18_14:37:47
|
| 146 |
+
host : t-20260518223615-7lx5q-worker-1.t-20260518223615-7lx5q-worker.mlplatform-customtask.svc.cluster.local
|
| 147 |
+
rank : 8 (local_rank: 0)
|
| 148 |
+
exitcode : 1 (pid: 10430)
|
| 149 |
+
error_file: <N/A>
|
| 150 |
+
traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
|
| 151 |
+
============================================================
|
LTA_openwebtext_dualt/logs/elfaligned_t5tokenized_2node/lta_owt_t5_adaln_adamw_wd0p1_rollin_p50_randk0_3_uniformt_temp1_synct_gbs512_2node8gpu_1m_t-20260518223804-mp56c.node0.log
ADDED
|
@@ -0,0 +1,150 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
W0518 14:39:35.171000 10360 torch/distributed/run.py:852]
|
| 2 |
+
W0518 14:39:35.171000 10360 torch/distributed/run.py:852] *****************************************
|
| 3 |
+
W0518 14:39:35.171000 10360 torch/distributed/run.py:852] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed.
|
| 4 |
+
W0518 14:39:35.171000 10360 torch/distributed/run.py:852] *****************************************
|
| 5 |
+
[W518 14:39:35.870948832 socket.cpp:207] [c10d] The hostname of the client socket cannot be retrieved. err=-3
|
| 6 |
+
Traceback (most recent call last):
|
| 7 |
+
Traceback (most recent call last):
|
| 8 |
+
File "/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/train.py", line 26, in <module>
|
| 9 |
+
File "/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/train.py", line 26, in <module>
|
| 10 |
+
Traceback (most recent call last):
|
| 11 |
+
File "/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/train.py", line 26, in <module>
|
| 12 |
+
from flowtext_lab.data import (
|
| 13 |
+
File "/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/flowtext_lab/data.py", line 14, in <module>
|
| 14 |
+
from flowtext_lab.data import (
|
| 15 |
+
File "/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/flowtext_lab/data.py", line 14, in <module>
|
| 16 |
+
from flowtext_lab.data import (
|
| 17 |
+
File "/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/flowtext_lab/data.py", line 14, in <module>
|
| 18 |
+
from .tokenization import BpeTextTokenizerfrom .tokenization import BpeTextTokenizer
|
| 19 |
+
|
| 20 |
+
from .tokenization import BpeTextTokenizer
|
| 21 |
+
File "/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/flowtext_lab/tokenization.py", line 7, in <module>
|
| 22 |
+
File "/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/flowtext_lab/tokenization.py", line 7, in <module>
|
| 23 |
+
File "/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/flowtext_lab/tokenization.py", line 7, in <module>
|
| 24 |
+
from tokenizers import Tokenizerfrom tokenizers import Tokenizer
|
| 25 |
+
|
| 26 |
+
from tokenizers import Tokenizer
|
| 27 |
+
ModuleNotFoundErrorModuleNotFoundError: No module named 'tokenizers':
|
| 28 |
+
No module named 'tokenizers'
|
| 29 |
+
ModuleNotFoundError: No module named 'tokenizers'
|
| 30 |
+
Traceback (most recent call last):
|
| 31 |
+
File "/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/train.py", line 26, in <module>
|
| 32 |
+
from flowtext_lab.data import (
|
| 33 |
+
File "/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/flowtext_lab/data.py", line 14, in <module>
|
| 34 |
+
from .tokenization import BpeTextTokenizer
|
| 35 |
+
File "/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/flowtext_lab/tokenization.py", line 7, in <module>
|
| 36 |
+
from tokenizers import Tokenizer
|
| 37 |
+
ModuleNotFoundError: No module named 'tokenizers'
|
| 38 |
+
Traceback (most recent call last):
|
| 39 |
+
File "/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/train.py", line 26, in <module>
|
| 40 |
+
from flowtext_lab.data import (
|
| 41 |
+
File "/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/flowtext_lab/data.py", line 14, in <module>
|
| 42 |
+
from .tokenization import BpeTextTokenizer
|
| 43 |
+
File "/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/flowtext_lab/tokenization.py", line 7, in <module>
|
| 44 |
+
from tokenizers import Tokenizer
|
| 45 |
+
ModuleNotFoundError: No module named 'tokenizers'
|
| 46 |
+
Traceback (most recent call last):
|
| 47 |
+
File "/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/train.py", line 26, in <module>
|
| 48 |
+
from flowtext_lab.data import (
|
| 49 |
+
File "/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/flowtext_lab/data.py", line 14, in <module>
|
| 50 |
+
from .tokenization import BpeTextTokenizer
|
| 51 |
+
File "/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/flowtext_lab/tokenization.py", line 7, in <module>
|
| 52 |
+
from tokenizers import Tokenizer
|
| 53 |
+
ModuleNotFoundError: No module named 'tokenizers'
|
| 54 |
+
Traceback (most recent call last):
|
| 55 |
+
File "/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/train.py", line 26, in <module>
|
| 56 |
+
from flowtext_lab.data import (
|
| 57 |
+
File "/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/flowtext_lab/data.py", line 14, in <module>
|
| 58 |
+
from .tokenization import BpeTextTokenizer
|
| 59 |
+
File "/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/flowtext_lab/tokenization.py", line 7, in <module>
|
| 60 |
+
from tokenizers import Tokenizer
|
| 61 |
+
ModuleNotFoundError: No module named 'tokenizers'
|
| 62 |
+
Traceback (most recent call last):
|
| 63 |
+
File "/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/train.py", line 26, in <module>
|
| 64 |
+
from flowtext_lab.data import (
|
| 65 |
+
File "/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/flowtext_lab/data.py", line 14, in <module>
|
| 66 |
+
from .tokenization import BpeTextTokenizer
|
| 67 |
+
File "/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/flowtext_lab/tokenization.py", line 7, in <module>
|
| 68 |
+
from tokenizers import Tokenizer
|
| 69 |
+
ModuleNotFoundError: No module named 'tokenizers'
|
| 70 |
+
E0518 14:39:38.354000 10360 torch/distributed/elastic/multiprocessing/api.py:984] failed (exitcode: 1) local_rank: 0 (pid: 10427) of binary: /e2e-data/evad-tech-vla/wanghan58/env/my_env/bin/python3
|
| 71 |
+
Traceback (most recent call last):
|
| 72 |
+
File "/e2e-data/evad-tech-vla/wanghan58/env/my_env/bin/torchrun", line 10, in <module>
|
| 73 |
+
sys.exit(main())
|
| 74 |
+
^^^^^^
|
| 75 |
+
File "/e2e-data/evad-tech-vla/wanghan58/env/my_env/lib/python3.12/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 362, in wrapper
|
| 76 |
+
return f(*args, **kwargs)
|
| 77 |
+
^^^^^^^^^^^^^^^^^^
|
| 78 |
+
File "/e2e-data/evad-tech-vla/wanghan58/env/my_env/lib/python3.12/site-packages/torch/distributed/run.py", line 991, in main
|
| 79 |
+
run(args)
|
| 80 |
+
File "/e2e-data/evad-tech-vla/wanghan58/env/my_env/lib/python3.12/site-packages/torch/distributed/run.py", line 982, in run
|
| 81 |
+
elastic_launch(
|
| 82 |
+
File "/e2e-data/evad-tech-vla/wanghan58/env/my_env/lib/python3.12/site-packages/torch/distributed/launcher/api.py", line 170, in __call__
|
| 83 |
+
return launch_agent(self._config, self._entrypoint, list(args))
|
| 84 |
+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 85 |
+
File "/e2e-data/evad-tech-vla/wanghan58/env/my_env/lib/python3.12/site-packages/torch/distributed/launcher/api.py", line 317, in launch_agent
|
| 86 |
+
raise ChildFailedError(
|
| 87 |
+
torch.distributed.elastic.multiprocessing.errors.ChildFailedError:
|
| 88 |
+
============================================================
|
| 89 |
+
train.py FAILED
|
| 90 |
+
------------------------------------------------------------
|
| 91 |
+
Failures:
|
| 92 |
+
[1]:
|
| 93 |
+
time : 2026-05-18_14:39:38
|
| 94 |
+
host : t-20260518223804-mp56c-worker-0.t-20260518223804-mp56c-worker.mlplatform-customtask.svc.cluster.local
|
| 95 |
+
rank : 1 (local_rank: 1)
|
| 96 |
+
exitcode : 1 (pid: 10428)
|
| 97 |
+
error_file: <N/A>
|
| 98 |
+
traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
|
| 99 |
+
[2]:
|
| 100 |
+
time : 2026-05-18_14:39:38
|
| 101 |
+
host : t-20260518223804-mp56c-worker-0.t-20260518223804-mp56c-worker.mlplatform-customtask.svc.cluster.local
|
| 102 |
+
rank : 2 (local_rank: 2)
|
| 103 |
+
exitcode : 1 (pid: 10429)
|
| 104 |
+
error_file: <N/A>
|
| 105 |
+
traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
|
| 106 |
+
[3]:
|
| 107 |
+
time : 2026-05-18_14:39:38
|
| 108 |
+
host : t-20260518223804-mp56c-worker-0.t-20260518223804-mp56c-worker.mlplatform-customtask.svc.cluster.local
|
| 109 |
+
rank : 3 (local_rank: 3)
|
| 110 |
+
exitcode : 1 (pid: 10430)
|
| 111 |
+
error_file: <N/A>
|
| 112 |
+
traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
|
| 113 |
+
[4]:
|
| 114 |
+
time : 2026-05-18_14:39:38
|
| 115 |
+
host : t-20260518223804-mp56c-worker-0.t-20260518223804-mp56c-worker.mlplatform-customtask.svc.cluster.local
|
| 116 |
+
rank : 4 (local_rank: 4)
|
| 117 |
+
exitcode : 1 (pid: 10431)
|
| 118 |
+
error_file: <N/A>
|
| 119 |
+
traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
|
| 120 |
+
[5]:
|
| 121 |
+
time : 2026-05-18_14:39:38
|
| 122 |
+
host : t-20260518223804-mp56c-worker-0.t-20260518223804-mp56c-worker.mlplatform-customtask.svc.cluster.local
|
| 123 |
+
rank : 5 (local_rank: 5)
|
| 124 |
+
exitcode : 1 (pid: 10432)
|
| 125 |
+
error_file: <N/A>
|
| 126 |
+
traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
|
| 127 |
+
[6]:
|
| 128 |
+
time : 2026-05-18_14:39:38
|
| 129 |
+
host : t-20260518223804-mp56c-worker-0.t-20260518223804-mp56c-worker.mlplatform-customtask.svc.cluster.local
|
| 130 |
+
rank : 6 (local_rank: 6)
|
| 131 |
+
exitcode : 1 (pid: 10433)
|
| 132 |
+
error_file: <N/A>
|
| 133 |
+
traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
|
| 134 |
+
[7]:
|
| 135 |
+
time : 2026-05-18_14:39:38
|
| 136 |
+
host : t-20260518223804-mp56c-worker-0.t-20260518223804-mp56c-worker.mlplatform-customtask.svc.cluster.local
|
| 137 |
+
rank : 7 (local_rank: 7)
|
| 138 |
+
exitcode : 1 (pid: 10434)
|
| 139 |
+
error_file: <N/A>
|
| 140 |
+
traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
|
| 141 |
+
------------------------------------------------------------
|
| 142 |
+
Root Cause (first observed failure):
|
| 143 |
+
[0]:
|
| 144 |
+
time : 2026-05-18_14:39:38
|
| 145 |
+
host : t-20260518223804-mp56c-worker-0.t-20260518223804-mp56c-worker.mlplatform-customtask.svc.cluster.local
|
| 146 |
+
rank : 0 (local_rank: 0)
|
| 147 |
+
exitcode : 1 (pid: 10427)
|
| 148 |
+
error_file: <N/A>
|
| 149 |
+
traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
|
| 150 |
+
============================================================
|
LTA_openwebtext_dualt/logs/elfaligned_t5tokenized_2node/lta_owt_t5_adaln_adamw_wd0p1_rollin_p50_randk0_3_uniformt_temp1_synct_gbs512_2node8gpu_1m_t-20260518223804-mp56c.node1.log
ADDED
|
@@ -0,0 +1,150 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
W0518 14:39:34.160000 10364 torch/distributed/run.py:852]
|
| 2 |
+
W0518 14:39:34.160000 10364 torch/distributed/run.py:852] *****************************************
|
| 3 |
+
W0518 14:39:34.160000 10364 torch/distributed/run.py:852] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed.
|
| 4 |
+
W0518 14:39:34.160000 10364 torch/distributed/run.py:852] *****************************************
|
| 5 |
+
[W518 14:39:35.371298625 socket.cpp:207] [c10d] The hostname of the client socket cannot be retrieved. err=-3
|
| 6 |
+
Traceback (most recent call last):
|
| 7 |
+
File "/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/train.py", line 26, in <module>
|
| 8 |
+
Traceback (most recent call last):
|
| 9 |
+
File "/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/train.py", line 26, in <module>
|
| 10 |
+
from flowtext_lab.data import (
|
| 11 |
+
File "/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/flowtext_lab/data.py", line 14, in <module>
|
| 12 |
+
from .tokenization import BpeTextTokenizerfrom flowtext_lab.data import (
|
| 13 |
+
|
| 14 |
+
File "/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/flowtext_lab/data.py", line 14, in <module>
|
| 15 |
+
File "/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/flowtext_lab/tokenization.py", line 7, in <module>
|
| 16 |
+
from .tokenization import BpeTextTokenizer
|
| 17 |
+
from tokenizers import Tokenizer
|
| 18 |
+
ModuleNotFoundError File "/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/flowtext_lab/tokenization.py", line 7, in <module>
|
| 19 |
+
: No module named 'tokenizers'
|
| 20 |
+
from tokenizers import Tokenizer
|
| 21 |
+
ModuleNotFoundError: No module named 'tokenizers'
|
| 22 |
+
Traceback (most recent call last):
|
| 23 |
+
File "/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/train.py", line 26, in <module>
|
| 24 |
+
from flowtext_lab.data import (
|
| 25 |
+
File "/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/flowtext_lab/data.py", line 14, in <module>
|
| 26 |
+
from .tokenization import BpeTextTokenizer
|
| 27 |
+
File "/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/flowtext_lab/tokenization.py", line 7, in <module>
|
| 28 |
+
from tokenizers import Tokenizer
|
| 29 |
+
ModuleNotFoundError: No module named 'tokenizers'
|
| 30 |
+
Traceback (most recent call last):
|
| 31 |
+
File "/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/train.py", line 26, in <module>
|
| 32 |
+
from flowtext_lab.data import (
|
| 33 |
+
File "/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/flowtext_lab/data.py", line 14, in <module>
|
| 34 |
+
from .tokenization import BpeTextTokenizer
|
| 35 |
+
File "/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/flowtext_lab/tokenization.py", line 7, in <module>
|
| 36 |
+
from tokenizers import Tokenizer
|
| 37 |
+
ModuleNotFoundError: No module named 'tokenizers'
|
| 38 |
+
Traceback (most recent call last):
|
| 39 |
+
File "/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/train.py", line 26, in <module>
|
| 40 |
+
from flowtext_lab.data import (
|
| 41 |
+
File "/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/flowtext_lab/data.py", line 14, in <module>
|
| 42 |
+
from .tokenization import BpeTextTokenizer
|
| 43 |
+
File "/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/flowtext_lab/tokenization.py", line 7, in <module>
|
| 44 |
+
from tokenizers import Tokenizer
|
| 45 |
+
ModuleNotFoundError: No module named 'tokenizers'
|
| 46 |
+
Traceback (most recent call last):
|
| 47 |
+
File "/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/train.py", line 26, in <module>
|
| 48 |
+
Traceback (most recent call last):
|
| 49 |
+
File "/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/train.py", line 26, in <module>
|
| 50 |
+
from flowtext_lab.data import (
|
| 51 |
+
File "/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/flowtext_lab/data.py", line 14, in <module>
|
| 52 |
+
from .tokenization import BpeTextTokenizer
|
| 53 |
+
File "/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/flowtext_lab/tokenization.py", line 7, in <module>
|
| 54 |
+
from flowtext_lab.data import (
|
| 55 |
+
File "/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/flowtext_lab/data.py", line 14, in <module>
|
| 56 |
+
from tokenizers import Tokenizer
|
| 57 |
+
ModuleNotFoundError: No module named 'tokenizers'
|
| 58 |
+
from .tokenization import BpeTextTokenizer
|
| 59 |
+
File "/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/flowtext_lab/tokenization.py", line 7, in <module>
|
| 60 |
+
from tokenizers import Tokenizer
|
| 61 |
+
ModuleNotFoundError: No module named 'tokenizers'
|
| 62 |
+
Traceback (most recent call last):
|
| 63 |
+
File "/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/train.py", line 26, in <module>
|
| 64 |
+
from flowtext_lab.data import (
|
| 65 |
+
File "/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/flowtext_lab/data.py", line 14, in <module>
|
| 66 |
+
from .tokenization import BpeTextTokenizer
|
| 67 |
+
File "/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/flowtext_lab/tokenization.py", line 7, in <module>
|
| 68 |
+
from tokenizers import Tokenizer
|
| 69 |
+
ModuleNotFoundError: No module named 'tokenizers'
|
| 70 |
+
E0518 14:39:38.354000 10364 torch/distributed/elastic/multiprocessing/api.py:984] failed (exitcode: 1) local_rank: 0 (pid: 10430) of binary: /e2e-data/evad-tech-vla/wanghan58/env/my_env/bin/python3
|
| 71 |
+
Traceback (most recent call last):
|
| 72 |
+
File "/e2e-data/evad-tech-vla/wanghan58/env/my_env/bin/torchrun", line 10, in <module>
|
| 73 |
+
sys.exit(main())
|
| 74 |
+
^^^^^^
|
| 75 |
+
File "/e2e-data/evad-tech-vla/wanghan58/env/my_env/lib/python3.12/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 362, in wrapper
|
| 76 |
+
return f(*args, **kwargs)
|
| 77 |
+
^^^^^^^^^^^^^^^^^^
|
| 78 |
+
File "/e2e-data/evad-tech-vla/wanghan58/env/my_env/lib/python3.12/site-packages/torch/distributed/run.py", line 991, in main
|
| 79 |
+
run(args)
|
| 80 |
+
File "/e2e-data/evad-tech-vla/wanghan58/env/my_env/lib/python3.12/site-packages/torch/distributed/run.py", line 982, in run
|
| 81 |
+
elastic_launch(
|
| 82 |
+
File "/e2e-data/evad-tech-vla/wanghan58/env/my_env/lib/python3.12/site-packages/torch/distributed/launcher/api.py", line 170, in __call__
|
| 83 |
+
return launch_agent(self._config, self._entrypoint, list(args))
|
| 84 |
+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 85 |
+
File "/e2e-data/evad-tech-vla/wanghan58/env/my_env/lib/python3.12/site-packages/torch/distributed/launcher/api.py", line 317, in launch_agent
|
| 86 |
+
raise ChildFailedError(
|
| 87 |
+
torch.distributed.elastic.multiprocessing.errors.ChildFailedError:
|
| 88 |
+
============================================================
|
| 89 |
+
train.py FAILED
|
| 90 |
+
------------------------------------------------------------
|
| 91 |
+
Failures:
|
| 92 |
+
[1]:
|
| 93 |
+
time : 2026-05-18_14:39:38
|
| 94 |
+
host : t-20260518223804-mp56c-worker-1.t-20260518223804-mp56c-worker.mlplatform-customtask.svc.cluster.local
|
| 95 |
+
rank : 9 (local_rank: 1)
|
| 96 |
+
exitcode : 1 (pid: 10431)
|
| 97 |
+
error_file: <N/A>
|
| 98 |
+
traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
|
| 99 |
+
[2]:
|
| 100 |
+
time : 2026-05-18_14:39:38
|
| 101 |
+
host : t-20260518223804-mp56c-worker-1.t-20260518223804-mp56c-worker.mlplatform-customtask.svc.cluster.local
|
| 102 |
+
rank : 10 (local_rank: 2)
|
| 103 |
+
exitcode : 1 (pid: 10432)
|
| 104 |
+
error_file: <N/A>
|
| 105 |
+
traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
|
| 106 |
+
[3]:
|
| 107 |
+
time : 2026-05-18_14:39:38
|
| 108 |
+
host : t-20260518223804-mp56c-worker-1.t-20260518223804-mp56c-worker.mlplatform-customtask.svc.cluster.local
|
| 109 |
+
rank : 11 (local_rank: 3)
|
| 110 |
+
exitcode : 1 (pid: 10433)
|
| 111 |
+
error_file: <N/A>
|
| 112 |
+
traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
|
| 113 |
+
[4]:
|
| 114 |
+
time : 2026-05-18_14:39:38
|
| 115 |
+
host : t-20260518223804-mp56c-worker-1.t-20260518223804-mp56c-worker.mlplatform-customtask.svc.cluster.local
|
| 116 |
+
rank : 12 (local_rank: 4)
|
| 117 |
+
exitcode : 1 (pid: 10434)
|
| 118 |
+
error_file: <N/A>
|
| 119 |
+
traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
|
| 120 |
+
[5]:
|
| 121 |
+
time : 2026-05-18_14:39:38
|
| 122 |
+
host : t-20260518223804-mp56c-worker-1.t-20260518223804-mp56c-worker.mlplatform-customtask.svc.cluster.local
|
| 123 |
+
rank : 13 (local_rank: 5)
|
| 124 |
+
exitcode : 1 (pid: 10435)
|
| 125 |
+
error_file: <N/A>
|
| 126 |
+
traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
|
| 127 |
+
[6]:
|
| 128 |
+
time : 2026-05-18_14:39:38
|
| 129 |
+
host : t-20260518223804-mp56c-worker-1.t-20260518223804-mp56c-worker.mlplatform-customtask.svc.cluster.local
|
| 130 |
+
rank : 14 (local_rank: 6)
|
| 131 |
+
exitcode : 1 (pid: 10436)
|
| 132 |
+
error_file: <N/A>
|
| 133 |
+
traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
|
| 134 |
+
[7]:
|
| 135 |
+
time : 2026-05-18_14:39:38
|
| 136 |
+
host : t-20260518223804-mp56c-worker-1.t-20260518223804-mp56c-worker.mlplatform-customtask.svc.cluster.local
|
| 137 |
+
rank : 15 (local_rank: 7)
|
| 138 |
+
exitcode : 1 (pid: 10437)
|
| 139 |
+
error_file: <N/A>
|
| 140 |
+
traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
|
| 141 |
+
------------------------------------------------------------
|
| 142 |
+
Root Cause (first observed failure):
|
| 143 |
+
[0]:
|
| 144 |
+
time : 2026-05-18_14:39:38
|
| 145 |
+
host : t-20260518223804-mp56c-worker-1.t-20260518223804-mp56c-worker.mlplatform-customtask.svc.cluster.local
|
| 146 |
+
rank : 8 (local_rank: 0)
|
| 147 |
+
exitcode : 1 (pid: 10430)
|
| 148 |
+
error_file: <N/A>
|
| 149 |
+
traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
|
| 150 |
+
============================================================
|
LTA_openwebtext_dualt/logs/elfaligned_t5tokenized_2node/lta_owt_t5_adaln_adamw_wd0p1_rollin_p50_randk0_3_uniformt_temp1_synct_gbs512_2node8gpu_1m_t-20260518223954-k9vcs.node0.log
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
LTA_openwebtext_dualt/logs/elfaligned_t5tokenized_2node/lta_owt_t5_adaln_adamw_wd0p1_rollin_p50_randk0_3_uniformt_temp1_synct_gbs512_2node8gpu_1m_t-20260518223954-k9vcs.node1.log
ADDED
|
@@ -0,0 +1,701 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
W0518 14:41:25.942000 10427 torch/distributed/run.py:792]
|
| 2 |
+
W0518 14:41:25.942000 10427 torch/distributed/run.py:792] *****************************************
|
| 3 |
+
W0518 14:41:25.942000 10427 torch/distributed/run.py:792] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed.
|
| 4 |
+
W0518 14:41:25.942000 10427 torch/distributed/run.py:792] *****************************************
|
| 5 |
+
[W518 14:41:26.136987369 socket.cpp:202] [c10d] The hostname of the client socket cannot be retrieved. err=-3
|
| 6 |
+
[W518 14:41:30.130931725 socket.cpp:202] [c10d] The hostname of the client socket cannot be retrieved. err=-3
|
| 7 |
+
[W518 14:41:30.435544211 socket.cpp:202] [c10d] The hostname of the client socket cannot be retrieved. err=-3
|
| 8 |
+
[W518 14:41:30.436173532 socket.cpp:202] [c10d] The hostname of the client socket cannot be retrieved. err=-3
|
| 9 |
+
[W518 14:41:30.455235409 socket.cpp:202] [c10d] The hostname of the client socket cannot be retrieved. err=-3
|
| 10 |
+
[W518 14:41:30.463921318 socket.cpp:202] [c10d] The hostname of the client socket cannot be retrieved. err=-3
|
| 11 |
+
[W518 14:41:30.465008417 socket.cpp:202] [c10d] The hostname of the client socket cannot be retrieved. err=-3
|
| 12 |
+
[W518 14:41:30.466279046 socket.cpp:202] [c10d] The hostname of the client socket cannot be retrieved. err=-3
|
| 13 |
+
[W518 14:41:30.472627328 socket.cpp:202] [c10d] The hostname of the client socket cannot be retrieved. err=-3
|
| 14 |
+
t-20260518223954-k9vcs-worker-1:10493:10493 [0] NCCL INFO cudaDriverVersion 12080
|
| 15 |
+
t-20260518223954-k9vcs-worker-1:10493:10493 [0] NCCL INFO NCCL_SOCKET_IFNAME set by environment to eth1
|
| 16 |
+
t-20260518223954-k9vcs-worker-1:10497:10497 [4] NCCL INFO cudaDriverVersion 12080
|
| 17 |
+
t-20260518223954-k9vcs-worker-1:10497:10497 [4] NCCL INFO NCCL_SOCKET_IFNAME set by environment to eth1
|
| 18 |
+
t-20260518223954-k9vcs-worker-1:10500:10500 [7] NCCL INFO cudaDriverVersion 12080
|
| 19 |
+
t-20260518223954-k9vcs-worker-1:10500:10500 [7] NCCL INFO NCCL_SOCKET_IFNAME set by environment to eth1
|
| 20 |
+
t-20260518223954-k9vcs-worker-1:10493:10493 [0] NCCL INFO Bootstrap: Using eth1:10.82.80.12<0>
|
| 21 |
+
t-20260518223954-k9vcs-worker-1:10497:10497 [4] NCCL INFO Bootstrap: Using eth1:10.82.80.12<0>
|
| 22 |
+
t-20260518223954-k9vcs-worker-1:10493:10493 [0] NCCL INFO NCCL version 2.25.1+cuda12.8
|
| 23 |
+
t-20260518223954-k9vcs-worker-1:10500:10500 [7] NCCL INFO Bootstrap: Using eth1:10.82.80.12<0>
|
| 24 |
+
t-20260518223954-k9vcs-worker-1:10497:10497 [4] NCCL INFO NCCL version 2.25.1+cuda12.8
|
| 25 |
+
t-20260518223954-k9vcs-worker-1:10500:10500 [7] NCCL INFO NCCL version 2.25.1+cuda12.8
|
| 26 |
+
t-20260518223954-k9vcs-worker-1:10493:10493 [0] NCCL INFO Comm config Blocking set to 1
|
| 27 |
+
t-20260518223954-k9vcs-worker-1:10500:10500 [7] NCCL INFO Comm config Blocking set to 1
|
| 28 |
+
t-20260518223954-k9vcs-worker-1:10497:10497 [4] NCCL INFO Comm config Blocking set to 1
|
| 29 |
+
t-20260518223954-k9vcs-worker-1:10496:10496 [3] NCCL INFO cudaDriverVersion 12080
|
| 30 |
+
t-20260518223954-k9vcs-worker-1:10496:10496 [3] NCCL INFO NCCL_SOCKET_IFNAME set by environment to eth1
|
| 31 |
+
t-20260518223954-k9vcs-worker-1:10496:10496 [3] NCCL INFO Bootstrap: Using eth1:10.82.80.12<0>
|
| 32 |
+
t-20260518223954-k9vcs-worker-1:10496:10496 [3] NCCL INFO NCCL version 2.25.1+cuda12.8
|
| 33 |
+
t-20260518223954-k9vcs-worker-1:10496:10496 [3] NCCL INFO Comm config Blocking set to 1
|
| 34 |
+
t-20260518223954-k9vcs-worker-1:10493:11113 [0] NCCL INFO NET/Plugin: Loaded net plugin NCCL RDMA Plugin v9 (v9)
|
| 35 |
+
t-20260518223954-k9vcs-worker-1:10493:11113 [0] NCCL INFO NET/Plugin: Loaded collnet plugin SHARP (v9)
|
| 36 |
+
t-20260518223954-k9vcs-worker-1:10493:11113 [0] NCCL INFO Plugin Path : /opt/hpcx/nccl_rdma_sharp_plugin/lib/libnccl-net.so
|
| 37 |
+
t-20260518223954-k9vcs-worker-1:10493:11113 [0] NCCL INFO P2P plugin v9 IBext_v9
|
| 38 |
+
t-20260518223954-k9vcs-worker-1:10493:11113 [0] NCCL INFO NCCL_SOCKET_IFNAME set by environment to eth1
|
| 39 |
+
t-20260518223954-k9vcs-worker-1:10500:11114 [7] NCCL INFO NET/Plugin: Loaded net plugin NCCL RDMA Plugin v9 (v9)
|
| 40 |
+
t-20260518223954-k9vcs-worker-1:10500:11114 [7] NCCL INFO NET/Plugin: Loaded collnet plugin SHARP (v9)
|
| 41 |
+
t-20260518223954-k9vcs-worker-1:10500:11114 [7] NCCL INFO Plugin Path : /opt/hpcx/nccl_rdma_sharp_plugin/lib/libnccl-net.so
|
| 42 |
+
t-20260518223954-k9vcs-worker-1:10500:11114 [7] NCCL INFO P2P plugin v9 IBext_v9
|
| 43 |
+
t-20260518223954-k9vcs-worker-1:10500:11114 [7] NCCL INFO NCCL_SOCKET_IFNAME set by environment to eth1
|
| 44 |
+
t-20260518223954-k9vcs-worker-1:10497:11115 [4] NCCL INFO NET/Plugin: Loaded net plugin NCCL RDMA Plugin v9 (v9)
|
| 45 |
+
t-20260518223954-k9vcs-worker-1:10497:11115 [4] NCCL INFO NET/Plugin: Loaded collnet plugin SHARP (v9)
|
| 46 |
+
t-20260518223954-k9vcs-worker-1:10497:11115 [4] NCCL INFO Plugin Path : /opt/hpcx/nccl_rdma_sharp_plugin/lib/libnccl-net.so
|
| 47 |
+
t-20260518223954-k9vcs-worker-1:10497:11115 [4] NCCL INFO P2P plugin v9 IBext_v9
|
| 48 |
+
t-20260518223954-k9vcs-worker-1:10497:11115 [4] NCCL INFO NCCL_SOCKET_IFNAME set by environment to eth1
|
| 49 |
+
t-20260518223954-k9vcs-worker-1:10498:10498 [5] NCCL INFO cudaDriverVersion 12080
|
| 50 |
+
t-20260518223954-k9vcs-worker-1:10498:10498 [5] NCCL INFO NCCL_SOCKET_IFNAME set by environment to eth1
|
| 51 |
+
t-20260518223954-k9vcs-worker-1:10498:10498 [5] NCCL INFO Bootstrap: Using eth1:10.82.80.12<0>
|
| 52 |
+
t-20260518223954-k9vcs-worker-1:10498:10498 [5] NCCL INFO NCCL version 2.25.1+cuda12.8
|
| 53 |
+
t-20260518223954-k9vcs-worker-1:10498:10498 [5] NCCL INFO Comm config Blocking set to 1
|
| 54 |
+
t-20260518223954-k9vcs-worker-1:10496:11116 [3] NCCL INFO NET/Plugin: Loaded net plugin NCCL RDMA Plugin v9 (v9)
|
| 55 |
+
t-20260518223954-k9vcs-worker-1:10496:11116 [3] NCCL INFO NET/Plugin: Loaded collnet plugin SHARP (v9)
|
| 56 |
+
t-20260518223954-k9vcs-worker-1:10496:11116 [3] NCCL INFO Plugin Path : /opt/hpcx/nccl_rdma_sharp_plugin/lib/libnccl-net.so
|
| 57 |
+
t-20260518223954-k9vcs-worker-1:10496:11116 [3] NCCL INFO P2P plugin v9 IBext_v9
|
| 58 |
+
t-20260518223954-k9vcs-worker-1:10496:11116 [3] NCCL INFO NCCL_SOCKET_IFNAME set by environment to eth1
|
| 59 |
+
t-20260518223954-k9vcs-worker-1:10500:11114 [7] NCCL INFO NCCL_IB_PCI_RELAXED_ORDERING set by environment to 1.
|
| 60 |
+
t-20260518223954-k9vcs-worker-1:10500:11114 [7] NCCL INFO NET/IB : Using [0]mlx5_1:1/RoCE [1]mlx5_4:1/RoCE [2]mlx5_5:1/RoCE [3]mlx5_6:1/RoCE [4]mlx5_7:1/RoCE [5]mlx5_8:1/RoCE [6]mlx5_9:1/RoCE [7]mlx5_10:1/RoCE [RO]; OOB eth1:10.82.80.12<0>
|
| 61 |
+
t-20260518223954-k9vcs-worker-1:10493:11113 [0] NCCL INFO NCCL_IB_PCI_RELAXED_ORDERING set by environment to 1.
|
| 62 |
+
t-20260518223954-k9vcs-worker-1:10493:11113 [0] NCCL INFO NET/IB : Using [0]mlx5_1:1/RoCE [1]mlx5_4:1/RoCE [2]mlx5_5:1/RoCE [3]mlx5_6:1/RoCE [4]mlx5_7:1/RoCE [5]mlx5_8:1/RoCE [6]mlx5_9:1/RoCE [7]mlx5_10:1/RoCE [RO]; OOB eth1:10.82.80.12<0>
|
| 63 |
+
t-20260518223954-k9vcs-worker-1:10497:11115 [4] NCCL INFO NCCL_IB_PCI_RELAXED_ORDERING set by environment to 1.
|
| 64 |
+
t-20260518223954-k9vcs-worker-1:10497:11115 [4] NCCL INFO NET/IB : Using [0]mlx5_1:1/RoCE [1]mlx5_4:1/RoCE [2]mlx5_5:1/RoCE [3]mlx5_6:1/RoCE [4]mlx5_7:1/RoCE [5]mlx5_8:1/RoCE [6]mlx5_9:1/RoCE [7]mlx5_10:1/RoCE [RO]; OOB eth1:10.82.80.12<0>
|
| 65 |
+
t-20260518223954-k9vcs-worker-1:10500:11114 [7] NCCL INFO PROFILER/Plugin: Could not find: libnccl-profiler.so.
|
| 66 |
+
t-20260518223954-k9vcs-worker-1:10500:11114 [7] NCCL INFO Using network IBext_v9
|
| 67 |
+
t-20260518223954-k9vcs-worker-1:10493:11113 [0] NCCL INFO PROFILER/Plugin: Could not find: libnccl-profiler.so.
|
| 68 |
+
t-20260518223954-k9vcs-worker-1:10493:11113 [0] NCCL INFO Using network IBext_v9
|
| 69 |
+
t-20260518223954-k9vcs-worker-1:10497:11115 [4] NCCL INFO PROFILER/Plugin: Could not find: libnccl-profiler.so.
|
| 70 |
+
t-20260518223954-k9vcs-worker-1:10497:11115 [4] NCCL INFO Using network IBext_v9
|
| 71 |
+
t-20260518223954-k9vcs-worker-1:10496:11116 [3] NCCL INFO NCCL_IB_PCI_RELAXED_ORDERING set by environment to 1.
|
| 72 |
+
t-20260518223954-k9vcs-worker-1:10496:11116 [3] NCCL INFO NET/IB : Using [0]mlx5_1:1/RoCE [1]mlx5_4:1/RoCE [2]mlx5_5:1/RoCE [3]mlx5_6:1/RoCE [4]mlx5_7:1/RoCE [5]mlx5_8:1/RoCE [6]mlx5_9:1/RoCE [7]mlx5_10:1/RoCE [RO]; OOB eth1:10.82.80.12<0>
|
| 73 |
+
t-20260518223954-k9vcs-worker-1:10496:11116 [3] NCCL INFO PROFILER/Plugin: Could not find: libnccl-profiler.so.
|
| 74 |
+
t-20260518223954-k9vcs-worker-1:10496:11116 [3] NCCL INFO Using network IBext_v9
|
| 75 |
+
t-20260518223954-k9vcs-worker-1:10493:11113 [0] NCCL INFO ncclCommInitRankConfig comm 0x7f25641ba940 rank 8 nranks 16 cudaDev 0 nvmlDev 0 busId 65040 commId 0xf170bbf5c404ab97 - Init START
|
| 76 |
+
t-20260518223954-k9vcs-worker-1:10497:11115 [4] NCCL INFO ncclCommInitRankConfig comm 0xef3aa70 rank 12 nranks 16 cudaDev 4 nvmlDev 4 busId 6f020 commId 0xf170bbf5c404ab97 - Init START
|
| 77 |
+
t-20260518223954-k9vcs-worker-1:10500:11114 [7] NCCL INFO ncclCommInitRankConfig comm 0xef15590 rank 15 nranks 16 cudaDev 7 nvmlDev 7 busId 75020 commId 0xf170bbf5c404ab97 - Init START
|
| 78 |
+
t-20260518223954-k9vcs-worker-1:10496:11116 [3] NCCL INFO ncclCommInitRankConfig comm 0xe578fe0 rank 11 nranks 16 cudaDev 3 nvmlDev 3 busId 6b020 commId 0xf170bbf5c404ab97 - Init START
|
| 79 |
+
t-20260518223954-k9vcs-worker-1:10495:10495 [2] NCCL INFO cudaDriverVersion 12080
|
| 80 |
+
t-20260518223954-k9vcs-worker-1:10495:10495 [2] NCCL INFO NCCL_SOCKET_IFNAME set by environment to eth1
|
| 81 |
+
t-20260518223954-k9vcs-worker-1:10498:11123 [5] NCCL INFO NET/Plugin: Loaded net plugin NCCL RDMA Plugin v9 (v9)
|
| 82 |
+
t-20260518223954-k9vcs-worker-1:10498:11123 [5] NCCL INFO NET/Plugin: Loaded collnet plugin SHARP (v9)
|
| 83 |
+
t-20260518223954-k9vcs-worker-1:10498:11123 [5] NCCL INFO Plugin Path : /opt/hpcx/nccl_rdma_sharp_plugin/lib/libnccl-net.so
|
| 84 |
+
t-20260518223954-k9vcs-worker-1:10498:11123 [5] NCCL INFO P2P plugin v9 IBext_v9
|
| 85 |
+
t-20260518223954-k9vcs-worker-1:10498:11123 [5] NCCL INFO NCCL_SOCKET_IFNAME set by environment to eth1
|
| 86 |
+
t-20260518223954-k9vcs-worker-1:10495:10495 [2] NCCL INFO Bootstrap: Using eth1:10.82.80.12<0>
|
| 87 |
+
t-20260518223954-k9vcs-worker-1:10495:10495 [2] NCCL INFO NCCL version 2.25.1+cuda12.8
|
| 88 |
+
t-20260518223954-k9vcs-worker-1:10495:10495 [2] NCCL INFO Comm config Blocking set to 1
|
| 89 |
+
t-20260518223954-k9vcs-worker-1:10494:10494 [1] NCCL INFO cudaDriverVersion 12080
|
| 90 |
+
t-20260518223954-k9vcs-worker-1:10494:10494 [1] NCCL INFO NCCL_SOCKET_IFNAME set by environment to eth1
|
| 91 |
+
t-20260518223954-k9vcs-worker-1:10494:10494 [1] NCCL INFO Bootstrap: Using eth1:10.82.80.12<0>
|
| 92 |
+
t-20260518223954-k9vcs-worker-1:10494:10494 [1] NCCL INFO NCCL version 2.25.1+cuda12.8
|
| 93 |
+
t-20260518223954-k9vcs-worker-1:10494:10494 [1] NCCL INFO Comm config Blocking set to 1
|
| 94 |
+
t-20260518223954-k9vcs-worker-1:10498:11123 [5] NCCL INFO NCCL_IB_PCI_RELAXED_ORDERING set by environment to 1.
|
| 95 |
+
t-20260518223954-k9vcs-worker-1:10498:11123 [5] NCCL INFO NET/IB : Using [0]mlx5_1:1/RoCE [1]mlx5_4:1/RoCE [2]mlx5_5:1/RoCE [3]mlx5_6:1/RoCE [4]mlx5_7:1/RoCE [5]mlx5_8:1/RoCE [6]mlx5_9:1/RoCE [7]mlx5_10:1/RoCE [RO]; OOB eth1:10.82.80.12<0>
|
| 96 |
+
t-20260518223954-k9vcs-worker-1:10498:11123 [5] NCCL INFO PROFILER/Plugin: Could not find: libnccl-profiler.so.
|
| 97 |
+
t-20260518223954-k9vcs-worker-1:10498:11123 [5] NCCL INFO Using network IBext_v9
|
| 98 |
+
t-20260518223954-k9vcs-worker-1:10498:11123 [5] NCCL INFO ncclCommInitRankConfig comm 0xd2cdad0 rank 13 nranks 16 cudaDev 5 nvmlDev 5 busId 71020 commId 0xf170bbf5c404ab97 - Init START
|
| 99 |
+
t-20260518223954-k9vcs-worker-1:10497:11115 [4] NCCL INFO RAS client listening socket at ::1<28028>
|
| 100 |
+
t-20260518223954-k9vcs-worker-1:10495:11151 [2] NCCL INFO NET/Plugin: Loaded net plugin NCCL RDMA Plugin v9 (v9)
|
| 101 |
+
t-20260518223954-k9vcs-worker-1:10495:11151 [2] NCCL INFO NET/Plugin: Loaded collnet plugin SHARP (v9)
|
| 102 |
+
t-20260518223954-k9vcs-worker-1:10495:11151 [2] NCCL INFO Plugin Path : /opt/hpcx/nccl_rdma_sharp_plugin/lib/libnccl-net.so
|
| 103 |
+
t-20260518223954-k9vcs-worker-1:10495:11151 [2] NCCL INFO P2P plugin v9 IBext_v9
|
| 104 |
+
t-20260518223954-k9vcs-worker-1:10495:11151 [2] NCCL INFO NCCL_SOCKET_IFNAME set by environment to eth1
|
| 105 |
+
t-20260518223954-k9vcs-worker-1:10494:11156 [1] NCCL INFO NET/Plugin: Loaded net plugin NCCL RDMA Plugin v9 (v9)
|
| 106 |
+
t-20260518223954-k9vcs-worker-1:10494:11156 [1] NCCL INFO NET/Plugin: Loaded collnet plugin SHARP (v9)
|
| 107 |
+
t-20260518223954-k9vcs-worker-1:10494:11156 [1] NCCL INFO Plugin Path : /opt/hpcx/nccl_rdma_sharp_plugin/lib/libnccl-net.so
|
| 108 |
+
t-20260518223954-k9vcs-worker-1:10494:11156 [1] NCCL INFO P2P plugin v9 IBext_v9
|
| 109 |
+
t-20260518223954-k9vcs-worker-1:10494:11156 [1] NCCL INFO NCCL_SOCKET_IFNAME set by environment to eth1
|
| 110 |
+
t-20260518223954-k9vcs-worker-1:10495:11151 [2] NCCL INFO NCCL_IB_PCI_RELAXED_ORDERING set by environment to 1.
|
| 111 |
+
t-20260518223954-k9vcs-worker-1:10495:11151 [2] NCCL INFO NET/IB : Using [0]mlx5_1:1/RoCE [1]mlx5_4:1/RoCE [2]mlx5_5:1/RoCE [3]mlx5_6:1/RoCE [4]mlx5_7:1/RoCE [5]mlx5_8:1/RoCE [6]mlx5_9:1/RoCE [7]mlx5_10:1/RoCE [RO]; OOB eth1:10.82.80.12<0>
|
| 112 |
+
t-20260518223954-k9vcs-worker-1:10495:11151 [2] NCCL INFO PROFILER/Plugin: Could not find: libnccl-profiler.so.
|
| 113 |
+
t-20260518223954-k9vcs-worker-1:10495:11151 [2] NCCL INFO Using network IBext_v9
|
| 114 |
+
t-20260518223954-k9vcs-worker-1:10494:11156 [1] NCCL INFO NCCL_IB_PCI_RELAXED_ORDERING set by environment to 1.
|
| 115 |
+
t-20260518223954-k9vcs-worker-1:10494:11156 [1] NCCL INFO NET/IB : Using [0]mlx5_1:1/RoCE [1]mlx5_4:1/RoCE [2]mlx5_5:1/RoCE [3]mlx5_6:1/RoCE [4]mlx5_7:1/RoCE [5]mlx5_8:1/RoCE [6]mlx5_9:1/RoCE [7]mlx5_10:1/RoCE [RO]; OOB eth1:10.82.80.12<0>
|
| 116 |
+
t-20260518223954-k9vcs-worker-1:10494:11156 [1] NCCL INFO PROFILER/Plugin: Could not find: libnccl-profiler.so.
|
| 117 |
+
t-20260518223954-k9vcs-worker-1:10494:11156 [1] NCCL INFO Using network IBext_v9
|
| 118 |
+
t-20260518223954-k9vcs-worker-1:10495:11151 [2] NCCL INFO ncclCommInitRankConfig comm 0xec09b60 rank 10 nranks 16 cudaDev 2 nvmlDev 2 busId 69020 commId 0xf170bbf5c404ab97 - Init START
|
| 119 |
+
t-20260518223954-k9vcs-worker-1:10496:11116 [3] NCCL INFO RAS client listening socket at ::1<28028>
|
| 120 |
+
t-20260518223954-k9vcs-worker-1:10494:11156 [1] NCCL INFO ncclCommInitRankConfig comm 0xd479fb0 rank 9 nranks 16 cudaDev 1 nvmlDev 1 busId 67020 commId 0xf170bbf5c404ab97 - Init START
|
| 121 |
+
t-20260518223954-k9vcs-worker-1:10493:11113 [0] NCCL INFO RAS client listening socket at ::1<28028>
|
| 122 |
+
t-20260518223954-k9vcs-worker-1:10494:11156 [1] NCCL INFO RAS client listening socket at ::1<28028>
|
| 123 |
+
t-20260518223954-k9vcs-worker-1:10495:11151 [2] NCCL INFO RAS client listening socket at ::1<28028>
|
| 124 |
+
t-20260518223954-k9vcs-worker-1:10499:10499 [6] NCCL INFO cudaDriverVersion 12080
|
| 125 |
+
t-20260518223954-k9vcs-worker-1:10499:10499 [6] NCCL INFO NCCL_SOCKET_IFNAME set by environment to eth1
|
| 126 |
+
t-20260518223954-k9vcs-worker-1:10499:10499 [6] NCCL INFO Bootstrap: Using eth1:10.82.80.12<0>
|
| 127 |
+
t-20260518223954-k9vcs-worker-1:10499:10499 [6] NCCL INFO NCCL version 2.25.1+cuda12.8
|
| 128 |
+
t-20260518223954-k9vcs-worker-1:10499:10499 [6] NCCL INFO Comm config Blocking set to 1
|
| 129 |
+
t-20260518223954-k9vcs-worker-1:10499:11182 [6] NCCL INFO NET/Plugin: Loaded net plugin NCCL RDMA Plugin v9 (v9)
|
| 130 |
+
t-20260518223954-k9vcs-worker-1:10499:11182 [6] NCCL INFO NET/Plugin: Loaded collnet plugin SHARP (v9)
|
| 131 |
+
t-20260518223954-k9vcs-worker-1:10499:11182 [6] NCCL INFO Plugin Path : /opt/hpcx/nccl_rdma_sharp_plugin/lib/libnccl-net.so
|
| 132 |
+
t-20260518223954-k9vcs-worker-1:10499:11182 [6] NCCL INFO P2P plugin v9 IBext_v9
|
| 133 |
+
t-20260518223954-k9vcs-worker-1:10499:11182 [6] NCCL INFO NCCL_SOCKET_IFNAME set by environment to eth1
|
| 134 |
+
t-20260518223954-k9vcs-worker-1:10499:11182 [6] NCCL INFO NCCL_IB_PCI_RELAXED_ORDERING set by environment to 1.
|
| 135 |
+
t-20260518223954-k9vcs-worker-1:10499:11182 [6] NCCL INFO NET/IB : Using [0]mlx5_1:1/RoCE [1]mlx5_4:1/RoCE [2]mlx5_5:1/RoCE [3]mlx5_6:1/RoCE [4]mlx5_7:1/RoCE [5]mlx5_8:1/RoCE [6]mlx5_9:1/RoCE [7]mlx5_10:1/RoCE [RO]; OOB eth1:10.82.80.12<0>
|
| 136 |
+
t-20260518223954-k9vcs-worker-1:10499:11182 [6] NCCL INFO PROFILER/Plugin: Could not find: libnccl-profiler.so.
|
| 137 |
+
t-20260518223954-k9vcs-worker-1:10499:11182 [6] NCCL INFO Using network IBext_v9
|
| 138 |
+
t-20260518223954-k9vcs-worker-1:10499:11182 [6] NCCL INFO ncclCommInitRankConfig comm 0xee35ca0 rank 14 nranks 16 cudaDev 6 nvmlDev 6 busId 73020 commId 0xf170bbf5c404ab97 - Init START
|
| 139 |
+
t-20260518223954-k9vcs-worker-1:10498:11123 [5] NCCL INFO RAS client listening socket at ::1<28028>
|
| 140 |
+
t-20260518223954-k9vcs-worker-1:10499:11182 [6] NCCL INFO RAS client listening socket at ::1<28028>
|
| 141 |
+
t-20260518223954-k9vcs-worker-1:10500:11114 [7] NCCL INFO RAS client listening socket at ::1<28028>
|
| 142 |
+
t-20260518223954-k9vcs-worker-1:10498:11123 [5] NCCL INFO Bootstrap timings total 0.763977 (create 0.000020, send 0.000247, recv 0.762785, ring 0.000551, delay 0.000001)
|
| 143 |
+
t-20260518223954-k9vcs-worker-1:10499:11182 [6] NCCL INFO Bootstrap timings total 0.001577 (create 0.000020, send 0.000238, recv 0.000515, ring 0.000535, delay 0.000000)
|
| 144 |
+
t-20260518223954-k9vcs-worker-1:10500:11114 [7] NCCL INFO Bootstrap timings total 0.974651 (create 0.000030, send 0.000230, recv 0.163831, ring 0.000446, delay 0.000001)
|
| 145 |
+
t-20260518223954-k9vcs-worker-1:10497:11115 [4] NCCL INFO Bootstrap timings total 0.990632 (create 0.000021, send 0.000268, recv 0.226954, ring 0.763002, delay 0.000001)
|
| 146 |
+
t-20260518223954-k9vcs-worker-1:10495:11151 [2] NCCL INFO Bootstrap timings total 0.409208 (create 0.000024, send 0.000249, recv 0.000377, ring 0.399295, delay 0.000000)
|
| 147 |
+
t-20260518223954-k9vcs-worker-1:10496:11116 [3] NCCL INFO Bootstrap timings total 0.956514 (create 0.000024, send 0.000287, recv 0.000317, ring 0.408179, delay 0.000000)
|
| 148 |
+
t-20260518223954-k9vcs-worker-1:10498:11123 [5] NCCL INFO MNNVL busId 0x71020 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0
|
| 149 |
+
t-20260518223954-k9vcs-worker-1:10499:11182 [6] NCCL INFO MNNVL busId 0x73020 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0
|
| 150 |
+
t-20260518223954-k9vcs-worker-1:10500:11114 [7] NCCL INFO MNNVL busId 0x75020 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0
|
| 151 |
+
t-20260518223954-k9vcs-worker-1:10494:11156 [1] NCCL INFO Bootstrap timings total 0.400394 (create 0.000022, send 0.000238, recv 0.000480, ring 0.399367, delay 0.000001)
|
| 152 |
+
t-20260518223954-k9vcs-worker-1:10493:11113 [0] NCCL INFO Bootstrap timings total 0.996177 (create 0.000028, send 0.000542, recv 0.595830, ring 0.399409, delay 0.000001)
|
| 153 |
+
t-20260518223954-k9vcs-worker-1:10497:11115 [4] NCCL INFO MNNVL busId 0x6f020 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0
|
| 154 |
+
t-20260518223954-k9vcs-worker-1:10495:11151 [2] NCCL INFO MNNVL busId 0x69020 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0
|
| 155 |
+
t-20260518223954-k9vcs-worker-1:10496:11116 [3] NCCL INFO MNNVL busId 0x6b020 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0
|
| 156 |
+
t-20260518223954-k9vcs-worker-1:10494:11156 [1] NCCL INFO MNNVL busId 0x67020 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0
|
| 157 |
+
t-20260518223954-k9vcs-worker-1:10493:11113 [0] NCCL INFO MNNVL busId 0x65040 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0
|
| 158 |
+
t-20260518223954-k9vcs-worker-1:10499:11182 [6] NCCL INFO NCCL_TOPO_FILE set by environment to /var/run/nvidia-topologyd/virtualTopology.xml
|
| 159 |
+
t-20260518223954-k9vcs-worker-1:10494:11156 [1] NCCL INFO NCCL_TOPO_FILE set by environment to /var/run/nvidia-topologyd/virtualTopology.xml
|
| 160 |
+
t-20260518223954-k9vcs-worker-1:10495:11151 [2] NCCL INFO NCCL_TOPO_FILE set by environment to /var/run/nvidia-topologyd/virtualTopology.xml
|
| 161 |
+
t-20260518223954-k9vcs-worker-1:10493:11113 [0] NCCL INFO NCCL_TOPO_FILE set by environment to /var/run/nvidia-topologyd/virtualTopology.xml
|
| 162 |
+
t-20260518223954-k9vcs-worker-1:10498:11123 [5] NCCL INFO NCCL_TOPO_FILE set by environment to /var/run/nvidia-topologyd/virtualTopology.xml
|
| 163 |
+
t-20260518223954-k9vcs-worker-1:10497:11115 [4] NCCL INFO NCCL_TOPO_FILE set by environment to /var/run/nvidia-topologyd/virtualTopology.xml
|
| 164 |
+
t-20260518223954-k9vcs-worker-1:10500:11114 [7] NCCL INFO NCCL_TOPO_FILE set by environment to /var/run/nvidia-topologyd/virtualTopology.xml
|
| 165 |
+
t-20260518223954-k9vcs-worker-1:10496:11116 [3] NCCL INFO NCCL_TOPO_FILE set by environment to /var/run/nvidia-topologyd/virtualTopology.xml
|
| 166 |
+
t-20260518223954-k9vcs-worker-1:10498:11123 [5] NCCL INFO Setting affinity for GPU 5 to 0fffff,ffffffff,ffffffff,fc000000,00000000,00000000
|
| 167 |
+
t-20260518223954-k9vcs-worker-1:10494:11156 [1] NCCL INFO Setting affinity for GPU 1 to 03ffffff,ffffffff,ffffffff
|
| 168 |
+
t-20260518223954-k9vcs-worker-1:10498:11123 [5] NCCL INFO NVLS multicast support is available on dev 5
|
| 169 |
+
t-20260518223954-k9vcs-worker-1:10493:11113 [0] NCCL INFO Setting affinity for GPU 0 to 03ffffff,ffffffff,ffffffff
|
| 170 |
+
t-20260518223954-k9vcs-worker-1:10499:11182 [6] NCCL INFO Setting affinity for GPU 6 to 0fffff,ffffffff,ffffffff,fc000000,00000000,00000000
|
| 171 |
+
t-20260518223954-k9vcs-worker-1:10499:11182 [6] NCCL INFO NVLS multicast support is available on dev 6
|
| 172 |
+
t-20260518223954-k9vcs-worker-1:10495:11151 [2] NCCL INFO Setting affinity for GPU 2 to 03ffffff,ffffffff,ffffffff
|
| 173 |
+
t-20260518223954-k9vcs-worker-1:10497:11115 [4] NCCL INFO Setting affinity for GPU 4 to 0fffff,ffffffff,ffffffff,fc000000,00000000,00000000
|
| 174 |
+
t-20260518223954-k9vcs-worker-1:10496:11116 [3] NCCL INFO Setting affinity for GPU 3 to 03ffffff,ffffffff,ffffffff
|
| 175 |
+
t-20260518223954-k9vcs-worker-1:10500:11114 [7] NCCL INFO Setting affinity for GPU 7 to 0fffff,ffffffff,ffffffff,fc000000,00000000,00000000
|
| 176 |
+
t-20260518223954-k9vcs-worker-1:10496:11116 [3] NCCL INFO NVLS multicast support is available on dev 3
|
| 177 |
+
t-20260518223954-k9vcs-worker-1:10494:11156 [1] NCCL INFO NVLS multicast support is available on dev 1
|
| 178 |
+
t-20260518223954-k9vcs-worker-1:10493:11113 [0] NCCL INFO NVLS multicast support is available on dev 0
|
| 179 |
+
t-20260518223954-k9vcs-worker-1:10495:11151 [2] NCCL INFO NVLS multicast support is available on dev 2
|
| 180 |
+
t-20260518223954-k9vcs-worker-1:10497:11115 [4] NCCL INFO NVLS multicast support is available on dev 4
|
| 181 |
+
t-20260518223954-k9vcs-worker-1:10500:11114 [7] NCCL INFO NVLS multicast support is available on dev 7
|
| 182 |
+
t-20260518223954-k9vcs-worker-1:10500:11114 [7] NCCL INFO comm 0xef15590 rank 15 nRanks 16 nNodes 2 localRanks 8 localRank 7 MNNVL 0
|
| 183 |
+
t-20260518223954-k9vcs-worker-1:10499:11182 [6] NCCL INFO comm 0xee35ca0 rank 14 nRanks 16 nNodes 2 localRanks 8 localRank 6 MNNVL 0
|
| 184 |
+
t-20260518223954-k9vcs-worker-1:10498:11123 [5] NCCL INFO comm 0xd2cdad0 rank 13 nRanks 16 nNodes 2 localRanks 8 localRank 5 MNNVL 0
|
| 185 |
+
t-20260518223954-k9vcs-worker-1:10496:11116 [3] NCCL INFO comm 0xe578fe0 rank 11 nRanks 16 nNodes 2 localRanks 8 localRank 3 MNNVL 0
|
| 186 |
+
t-20260518223954-k9vcs-worker-1:10497:11115 [4] NCCL INFO comm 0xef3aa70 rank 12 nRanks 16 nNodes 2 localRanks 8 localRank 4 MNNVL 0
|
| 187 |
+
t-20260518223954-k9vcs-worker-1:10499:11182 [6] NCCL INFO Trees [0] 15/-1/-1->14->13 [1] 15/-1/-1->14->13 [2] 15/-1/-1->14->13 [3] 15/-1/-1->14->13 [4] 15/-1/-1->14->13 [5] 15/-1/-1->14->13 [6] 15/-1/-1->14->6 [7] -1/-1/-1->14->13 [8] 15/-1/-1->14->13 [9] 15/-1/-1->14->13 [10] 15/-1/-1->14->13 [11] 15/-1/-1->14->13 [12] 15/-1/-1->14->13 [13] 15/-1/-1->14->13 [14] 15/6/-1->14->-1 [15] -1/-1/-1->14->13
|
| 188 |
+
t-20260518223954-k9vcs-worker-1:10500:11114 [7] NCCL INFO Trees [0] -1/-1/-1->15->14 [1] 8/-1/-1->15->14 [2] 8/-1/-1->15->14 [3] 8/-1/-1->15->14 [4] 8/-1/-1->15->14 [5] 8/-1/-1->15->14 [6] 8/-1/-1->15->14 [7] 8/-1/-1->15->7 [8] -1/-1/-1->15->14 [9] 8/-1/-1->15->14 [10] 8/-1/-1->15->14 [11] 8/-1/-1->15->14 [12] 8/-1/-1->15->14 [13] 8/-1/-1->15->14 [14] 8/-1/-1->15->14 [15] 8/7/-1->15->-1
|
| 189 |
+
t-20260518223954-k9vcs-worker-1:10499:11182 [6] NCCL INFO P2P Chunksize set to 131072
|
| 190 |
+
t-20260518223954-k9vcs-worker-1:10500:11114 [7] NCCL INFO P2P Chunksize set to 131072
|
| 191 |
+
t-20260518223954-k9vcs-worker-1:10496:11116 [3] NCCL INFO Trees [0] 12/-1/-1->11->10 [1] 12/-1/-1->11->10 [2] 12/-1/-1->11->10 [3] 12/-1/-1->11->3 [4] -1/-1/-1->11->10 [5] 12/-1/-1->11->10 [6] 12/-1/-1->11->10 [7] 12/-1/-1->11->10 [8] 12/-1/-1->11->10 [9] 12/-1/-1->11->10 [10] 12/-1/-1->11->10 [11] 12/3/-1->11->-1 [12] -1/-1/-1->11->10 [13] 12/-1/-1->11->10 [14] 12/-1/-1->11->10 [15] 12/-1/-1->11->10
|
| 192 |
+
t-20260518223954-k9vcs-worker-1:10495:11151 [2] NCCL INFO comm 0xec09b60 rank 10 nRanks 16 nNodes 2 localRanks 8 localRank 2 MNNVL 0
|
| 193 |
+
t-20260518223954-k9vcs-worker-1:10496:11116 [3] NCCL INFO P2P Chunksize set to 131072
|
| 194 |
+
t-20260518223954-k9vcs-worker-1:10494:11156 [1] NCCL INFO comm 0xd479fb0 rank 9 nRanks 16 nNodes 2 localRanks 8 localRank 1 MNNVL 0
|
| 195 |
+
t-20260518223954-k9vcs-worker-1:10493:11113 [0] NCCL INFO comm 0x7f25641ba940 rank 8 nRanks 16 nNodes 2 localRanks 8 localRank 0 MNNVL 0
|
| 196 |
+
t-20260518223954-k9vcs-worker-1:10497:11115 [4] NCCL INFO Trees [0] 13/-1/-1->12->11 [1] 13/-1/-1->12->11 [2] 13/-1/-1->12->11 [3] 13/-1/-1->12->11 [4] 13/-1/-1->12->4 [5] -1/-1/-1->12->11 [6] 13/-1/-1->12->11 [7] 13/-1/-1->12->11 [8] 13/-1/-1->12->11 [9] 13/-1/-1->12->11 [10] 13/-1/-1->12->11 [11] 13/-1/-1->12->11 [12] 13/4/-1->12->-1 [13] -1/-1/-1->12->11 [14] 13/-1/-1->12->11 [15] 13/-1/-1->12->11
|
| 197 |
+
t-20260518223954-k9vcs-worker-1:10498:11123 [5] NCCL INFO Trees [0] 14/-1/-1->13->12 [1] 14/-1/-1->13->12 [2] 14/-1/-1->13->12 [3] 14/-1/-1->13->12 [4] 14/-1/-1->13->12 [5] 14/-1/-1->13->5 [6] -1/-1/-1->13->12 [7] 14/-1/-1->13->12 [8] 14/-1/-1->13->12 [9] 14/-1/-1->13->12 [10] 14/-1/-1->13->12 [11] 14/-1/-1->13->12 [12] 14/-1/-1->13->12 [13] 14/5/-1->13->-1 [14] -1/-1/-1->13->12 [15] 14/-1/-1->13->12
|
| 198 |
+
t-20260518223954-k9vcs-worker-1:10497:11115 [4] NCCL INFO P2P Chunksize set to 131072
|
| 199 |
+
t-20260518223954-k9vcs-worker-1:10498:11123 [5] NCCL INFO P2P Chunksize set to 131072
|
| 200 |
+
t-20260518223954-k9vcs-worker-1:10495:11151 [2] NCCL INFO Trees [0] 11/-1/-1->10->9 [1] 11/-1/-1->10->9 [2] 11/-1/-1->10->2 [3] -1/-1/-1->10->9 [4] 11/-1/-1->10->9 [5] 11/-1/-1->10->9 [6] 11/-1/-1->10->9 [7] 11/-1/-1->10->9 [8] 11/-1/-1->10->9 [9] 11/-1/-1->10->9 [10] 11/2/-1->10->-1 [11] -1/-1/-1->10->9 [12] 11/-1/-1->10->9 [13] 11/-1/-1->10->9 [14] 11/-1/-1->10->9 [15] 11/-1/-1->10->9
|
| 201 |
+
t-20260518223954-k9vcs-worker-1:10494:11156 [1] NCCL INFO Trees [0] 10/-1/-1->9->8 [1] 10/-1/-1->9->1 [2] -1/-1/-1->9->8 [3] 10/-1/-1->9->8 [4] 10/-1/-1->9->8 [5] 10/-1/-1->9->8 [6] 10/-1/-1->9->8 [7] 10/-1/-1->9->8 [8] 10/-1/-1->9->8 [9] 10/1/-1->9->-1 [10] -1/-1/-1->9->8 [11] 10/-1/-1->9->8 [12] 10/-1/-1->9->8 [13] 10/-1/-1->9->8 [14] 10/-1/-1->9->8 [15] 10/-1/-1->9->8
|
| 202 |
+
t-20260518223954-k9vcs-worker-1:10493:11113 [0] NCCL INFO Trees [0] 9/-1/-1->8->0 [1] -1/-1/-1->8->15 [2] 9/-1/-1->8->15 [3] 9/-1/-1->8->15 [4] 9/-1/-1->8->15 [5] 9/-1/-1->8->15 [6] 9/-1/-1->8->15 [7] 9/-1/-1->8->15 [8] 9/0/-1->8->-1 [9] -1/-1/-1->8->15 [10] 9/-1/-1->8->15 [11] 9/-1/-1->8->15 [12] 9/-1/-1->8->15 [13] 9/-1/-1->8->15 [14] 9/-1/-1->8->15 [15] 9/-1/-1->8->15
|
| 203 |
+
t-20260518223954-k9vcs-worker-1:10495:11151 [2] NCCL INFO P2P Chunksize set to 131072
|
| 204 |
+
t-20260518223954-k9vcs-worker-1:10493:11113 [0] NCCL INFO P2P Chunksize set to 131072
|
| 205 |
+
t-20260518223954-k9vcs-worker-1:10494:11156 [1] NCCL INFO P2P Chunksize set to 131072
|
| 206 |
+
t-20260518223954-k9vcs-worker-1:10499:11194 [6] NCCL INFO [Proxy Service] Device 6 CPU core 146
|
| 207 |
+
t-20260518223954-k9vcs-worker-1:10496:11195 [3] NCCL INFO [Proxy Service] Device 3 CPU core 68
|
| 208 |
+
t-20260518223954-k9vcs-worker-1:10500:11196 [7] NCCL INFO [Proxy Service] Device 7 CPU core 148
|
| 209 |
+
t-20260518223954-k9vcs-worker-1:10495:11198 [2] NCCL INFO [Proxy Service] Device 2 CPU core 24
|
| 210 |
+
t-20260518223954-k9vcs-worker-1:10494:11199 [1] NCCL INFO [Proxy Service] Device 1 CPU core 16
|
| 211 |
+
t-20260518223954-k9vcs-worker-1:10497:11202 [4] NCCL INFO [Proxy Service] Device 4 CPU core 93
|
| 212 |
+
t-20260518223954-k9vcs-worker-1:10493:11206 [0] NCCL INFO [Proxy Service] Device 0 CPU core 2
|
| 213 |
+
t-20260518223954-k9vcs-worker-1:10498:11203 [5] NCCL INFO [Proxy Service] Device 5 CPU core 92
|
| 214 |
+
t-20260518223954-k9vcs-worker-1:10496:11201 [3] NCCL INFO [Proxy Service UDS] Device 3 CPU core 70
|
| 215 |
+
t-20260518223954-k9vcs-worker-1:10497:11209 [4] NCCL INFO [Proxy Service UDS] Device 4 CPU core 95
|
| 216 |
+
t-20260518223954-k9vcs-worker-1:10495:11204 [2] NCCL INFO [Proxy Service UDS] Device 2 CPU core 28
|
| 217 |
+
t-20260518223954-k9vcs-worker-1:10493:11207 [0] NCCL INFO [Proxy Service UDS] Device 0 CPU core 4
|
| 218 |
+
t-20260518223954-k9vcs-worker-1:10499:11197 [6] NCCL INFO [Proxy Service UDS] Device 6 CPU core 149
|
| 219 |
+
t-20260518223954-k9vcs-worker-1:10500:11200 [7] NCCL INFO [Proxy Service UDS] Device 7 CPU core 148
|
| 220 |
+
t-20260518223954-k9vcs-worker-1:10498:11208 [5] NCCL INFO [Proxy Service UDS] Device 5 CPU core 94
|
| 221 |
+
t-20260518223954-k9vcs-worker-1:10494:11205 [1] NCCL INFO [Proxy Service UDS] Device 1 CPU core 27
|
| 222 |
+
t-20260518223954-k9vcs-worker-1:10493:11113 [0] NCCL INFO threadThresholds 8/8/64 | 128/8/64 | 512 | 512
|
| 223 |
+
t-20260518223954-k9vcs-worker-1:10493:11113 [0] NCCL INFO 16 coll channels, 16 collnet channels, 16 nvls channels, 16 p2p channels, 2 p2p channels per peer
|
| 224 |
+
t-20260518223954-k9vcs-worker-1:10496:11116 [3] NCCL INFO threadThresholds 8/8/64 | 128/8/64 | 512 | 512
|
| 225 |
+
t-20260518223954-k9vcs-worker-1:10496:11116 [3] NCCL INFO 16 coll channels, 16 collnet channels, 16 nvls channels, 16 p2p channels, 2 p2p channels per peer
|
| 226 |
+
t-20260518223954-k9vcs-worker-1:10497:11115 [4] NCCL INFO threadThresholds 8/8/64 | 128/8/64 | 512 | 512
|
| 227 |
+
t-20260518223954-k9vcs-worker-1:10497:11115 [4] NCCL INFO 16 coll channels, 16 collnet channels, 16 nvls channels, 16 p2p channels, 2 p2p channels per peer
|
| 228 |
+
t-20260518223954-k9vcs-worker-1:10500:11114 [7] NCCL INFO threadThresholds 8/8/64 | 128/8/64 | 512 | 512
|
| 229 |
+
t-20260518223954-k9vcs-worker-1:10500:11114 [7] NCCL INFO 16 coll channels, 16 collnet channels, 16 nvls channels, 16 p2p channels, 2 p2p channels per peer
|
| 230 |
+
t-20260518223954-k9vcs-worker-1:10495:11151 [2] NCCL INFO threadThresholds 8/8/64 | 128/8/64 | 512 | 512
|
| 231 |
+
t-20260518223954-k9vcs-worker-1:10495:11151 [2] NCCL INFO 16 coll channels, 16 collnet channels, 16 nvls channels, 16 p2p channels, 2 p2p channels per peer
|
| 232 |
+
t-20260518223954-k9vcs-worker-1:10494:11156 [1] NCCL INFO threadThresholds 8/8/64 | 128/8/64 | 512 | 512
|
| 233 |
+
t-20260518223954-k9vcs-worker-1:10494:11156 [1] NCCL INFO 16 coll channels, 16 collnet channels, 16 nvls channels, 16 p2p channels, 2 p2p channels per peer
|
| 234 |
+
t-20260518223954-k9vcs-worker-1:10498:11123 [5] NCCL INFO threadThresholds 8/8/64 | 128/8/64 | 512 | 512
|
| 235 |
+
t-20260518223954-k9vcs-worker-1:10498:11123 [5] NCCL INFO 16 coll channels, 16 collnet channels, 16 nvls channels, 16 p2p channels, 2 p2p channels per peer
|
| 236 |
+
t-20260518223954-k9vcs-worker-1:10499:11182 [6] NCCL INFO threadThresholds 8/8/64 | 128/8/64 | 512 | 512
|
| 237 |
+
t-20260518223954-k9vcs-worker-1:10499:11182 [6] NCCL INFO 16 coll channels, 16 collnet channels, 16 nvls channels, 16 p2p channels, 2 p2p channels per peer
|
| 238 |
+
t-20260518223954-k9vcs-worker-1:10499:11182 [6] NCCL INFO TUNER/Plugin: Failed to find ncclTunerPlugin_v4 symbol.
|
| 239 |
+
t-20260518223954-k9vcs-worker-1:10497:11115 [4] NCCL INFO TUNER/Plugin: Failed to find ncclTunerPlugin_v4 symbol.
|
| 240 |
+
t-20260518223954-k9vcs-worker-1:10496:11116 [3] NCCL INFO TUNER/Plugin: Failed to find ncclTunerPlugin_v4 symbol.
|
| 241 |
+
t-20260518223954-k9vcs-worker-1:10498:11123 [5] NCCL INFO TUNER/Plugin: Failed to find ncclTunerPlugin_v4 symbol.
|
| 242 |
+
t-20260518223954-k9vcs-worker-1:10500:11114 [7] NCCL INFO TUNER/Plugin: Failed to find ncclTunerPlugin_v4 symbol.
|
| 243 |
+
t-20260518223954-k9vcs-worker-1:10493:11113 [0] NCCL INFO TUNER/Plugin: Failed to find ncclTunerPlugin_v4 symbol.
|
| 244 |
+
t-20260518223954-k9vcs-worker-1:10497:11115 [4] NCCL INFO TUNER/Plugin: Failed to find ncclTunerPlugin_v3 symbol.
|
| 245 |
+
t-20260518223954-k9vcs-worker-1:10495:11151 [2] NCCL INFO TUNER/Plugin: Failed to find ncclTunerPlugin_v4 symbol.
|
| 246 |
+
t-20260518223954-k9vcs-worker-1:10494:11156 [1] NCCL INFO TUNER/Plugin: Failed to find ncclTunerPlugin_v4 symbol.
|
| 247 |
+
t-20260518223954-k9vcs-worker-1:10499:11182 [6] NCCL INFO TUNER/Plugin: Failed to find ncclTunerPlugin_v3 symbol.
|
| 248 |
+
t-20260518223954-k9vcs-worker-1:10496:11116 [3] NCCL INFO TUNER/Plugin: Failed to find ncclTunerPlugin_v3 symbol.
|
| 249 |
+
t-20260518223954-k9vcs-worker-1:10494:11156 [1] NCCL INFO TUNER/Plugin: Failed to find ncclTunerPlugin_v3 symbol.
|
| 250 |
+
t-20260518223954-k9vcs-worker-1:10498:11123 [5] NCCL INFO TUNER/Plugin: Failed to find ncclTunerPlugin_v3 symbol.
|
| 251 |
+
t-20260518223954-k9vcs-worker-1:10496:11116 [3] NCCL INFO TUNER/Plugin: Failed to find ncclTunerPlugin_v2 symbol, using internal tuner instead.
|
| 252 |
+
t-20260518223954-k9vcs-worker-1:10494:11156 [1] NCCL INFO TUNER/Plugin: Failed to find ncclTunerPlugin_v2 symbol, using internal tuner instead.
|
| 253 |
+
t-20260518223954-k9vcs-worker-1:10493:11113 [0] NCCL INFO TUNER/Plugin: Failed to find ncclTunerPlugin_v3 symbol.
|
| 254 |
+
t-20260518223954-k9vcs-worker-1:10498:11123 [5] NCCL INFO TUNER/Plugin: Failed to find ncclTunerPlugin_v2 symbol, using internal tuner instead.
|
| 255 |
+
t-20260518223954-k9vcs-worker-1:10500:11114 [7] NCCL INFO TUNER/Plugin: Failed to find ncclTunerPlugin_v3 symbol.
|
| 256 |
+
t-20260518223954-k9vcs-worker-1:10497:11115 [4] NCCL INFO TUNER/Plugin: Failed to find ncclTunerPlugin_v2 symbol, using internal tuner instead.
|
| 257 |
+
t-20260518223954-k9vcs-worker-1:10493:11113 [0] NCCL INFO TUNER/Plugin: Failed to find ncclTunerPlugin_v2 symbol, using internal tuner instead.
|
| 258 |
+
t-20260518223954-k9vcs-worker-1:10495:11151 [2] NCCL INFO TUNER/Plugin: Failed to find ncclTunerPlugin_v3 symbol.
|
| 259 |
+
t-20260518223954-k9vcs-worker-1:10499:11182 [6] NCCL INFO TUNER/Plugin: Failed to find ncclTunerPlugin_v2 symbol, using internal tuner instead.
|
| 260 |
+
t-20260518223954-k9vcs-worker-1:10496:11116 [3] NCCL INFO ncclCommInitRankConfig comm 0xe578fe0 rank 11 nranks 16 cudaDev 3 nvmlDev 3 busId 6b020 commId 0xf170bbf5c404ab97 - Init COMPLETE
|
| 261 |
+
t-20260518223954-k9vcs-worker-1:10500:11114 [7] NCCL INFO TUNER/Plugin: Failed to find ncclTunerPlugin_v2 symbol, using internal tuner instead.
|
| 262 |
+
t-20260518223954-k9vcs-worker-1:10495:11151 [2] NCCL INFO TUNER/Plugin: Failed to find ncclTunerPlugin_v2 symbol, using internal tuner instead.
|
| 263 |
+
t-20260518223954-k9vcs-worker-1:10494:11156 [1] NCCL INFO ncclCommInitRankConfig comm 0xd479fb0 rank 9 nranks 16 cudaDev 1 nvmlDev 1 busId 67020 commId 0xf170bbf5c404ab97 - Init COMPLETE
|
| 264 |
+
t-20260518223954-k9vcs-worker-1:10498:11123 [5] NCCL INFO ncclCommInitRankConfig comm 0xd2cdad0 rank 13 nranks 16 cudaDev 5 nvmlDev 5 busId 71020 commId 0xf170bbf5c404ab97 - Init COMPLETE
|
| 265 |
+
t-20260518223954-k9vcs-worker-1:10493:11113 [0] NCCL INFO ncclCommInitRankConfig comm 0x7f25641ba940 rank 8 nranks 16 cudaDev 0 nvmlDev 0 busId 65040 commId 0xf170bbf5c404ab97 - Init COMPLETE
|
| 266 |
+
t-20260518223954-k9vcs-worker-1:10497:11115 [4] NCCL INFO ncclCommInitRankConfig comm 0xef3aa70 rank 12 nranks 16 cudaDev 4 nvmlDev 4 busId 6f020 commId 0xf170bbf5c404ab97 - Init COMPLETE
|
| 267 |
+
t-20260518223954-k9vcs-worker-1:10499:11182 [6] NCCL INFO ncclCommInitRankConfig comm 0xee35ca0 rank 14 nranks 16 cudaDev 6 nvmlDev 6 busId 73020 commId 0xf170bbf5c404ab97 - Init COMPLETE
|
| 268 |
+
t-20260518223954-k9vcs-worker-1:10500:11114 [7] NCCL INFO ncclCommInitRankConfig comm 0xef15590 rank 15 nranks 16 cudaDev 7 nvmlDev 7 busId 75020 commId 0xf170bbf5c404ab97 - Init COMPLETE
|
| 269 |
+
t-20260518223954-k9vcs-worker-1:10495:11151 [2] NCCL INFO ncclCommInitRankConfig comm 0xec09b60 rank 10 nranks 16 cudaDev 2 nvmlDev 2 busId 69020 commId 0xf170bbf5c404ab97 - Init COMPLETE
|
| 270 |
+
t-20260518223954-k9vcs-worker-1:10496:11116 [3] NCCL INFO Init timings - ncclCommInitRankConfig: rank 11 nranks 16 total 2.54 (kernels 0.20, alloc 0.52, bootstrap 0.96, allgathers 0.05, topo 0.54, graphs 0.01, connections 0.26, rest 0.00)
|
| 271 |
+
t-20260518223954-k9vcs-worker-1:10498:11123 [5] NCCL INFO Init timings - ncclCommInitRankConfig: rank 13 nranks 16 total 2.36 (kernels 0.56, alloc 0.18, bootstrap 0.76, allgathers 0.04, topo 0.54, graphs 0.02, connections 0.26, rest 0.00)
|
| 272 |
+
t-20260518223954-k9vcs-worker-1:10493:11113 [0] NCCL INFO Init timings - ncclCommInitRankConfig: rank 8 nranks 16 total 2.58 (kernels 0.20, alloc 0.53, bootstrap 1.00, allgathers 0.05, topo 0.54, graphs 0.01, connections 0.26, rest 0.01)
|
| 273 |
+
t-20260518223954-k9vcs-worker-1:10494:11156 [1] NCCL INFO Init timings - ncclCommInitRankConfig: rank 9 nranks 16 total 1.77 (kernels 0.21, alloc 0.30, bootstrap 0.40, allgathers 0.05, topo 0.54, graphs 0.01, connections 0.26, rest 0.00)
|
| 274 |
+
t-20260518223954-k9vcs-worker-1:10497:11115 [4] NCCL INFO Init timings - ncclCommInitRankConfig: rank 12 nranks 16 total 2.58 (kernels 0.20, alloc 0.53, bootstrap 0.99, allgathers 0.04, topo 0.54, graphs 0.02, connections 0.26, rest 0.01)
|
| 275 |
+
t-20260518223954-k9vcs-worker-1:10499:11182 [6] NCCL INFO Init timings - ncclCommInitRankConfig: rank 14 nranks 16 total 1.24 (kernels 0.20, alloc 0.18, bootstrap 0.00, allgathers 0.05, topo 0.54, graphs 0.01, connections 0.26, rest 0.00)
|
| 276 |
+
t-20260518223954-k9vcs-worker-1:10495:11151 [2] NCCL INFO Init timings - ncclCommInitRankConfig: rank 10 nranks 16 total 1.79 (kernels 0.23, alloc 0.30, bootstrap 0.41, allgathers 0.05, topo 0.54, graphs 0.01, connections 0.26, rest 0.00)
|
| 277 |
+
t-20260518223954-k9vcs-worker-1:10500:11114 [7] NCCL INFO Init timings - ncclCommInitRankConfig: rank 15 nranks 16 total 2.58 (kernels 0.20, alloc 0.55, bootstrap 0.97, allgathers 0.05, topo 0.54, graphs 0.01, connections 0.26, rest 0.00)
|
| 278 |
+
t-20260518223954-k9vcs-worker-1:10493:11211 [0] NCCL INFO Channel 02/0 : 8[0] -> 9[1] via P2P/CUMEM
|
| 279 |
+
t-20260518223954-k9vcs-worker-1:10493:11211 [0] NCCL INFO Channel 04/0 : 8[0] -> 9[1] via P2P/CUMEM
|
| 280 |
+
t-20260518223954-k9vcs-worker-1:10493:11211 [0] NCCL INFO Channel 06/0 : 8[0] -> 9[1] via P2P/CUMEM
|
| 281 |
+
t-20260518223954-k9vcs-worker-1:10493:11211 [0] NCCL INFO Channel 10/0 : 8[0] -> 9[1] via P2P/CUMEM
|
| 282 |
+
t-20260518223954-k9vcs-worker-1:10493:11211 [0] NCCL INFO Channel 12/0 : 8[0] -> 9[1] via P2P/CUMEM
|
| 283 |
+
t-20260518223954-k9vcs-worker-1:10493:11211 [0] NCCL INFO Channel 14/0 : 8[0] -> 9[1] via P2P/CUMEM
|
| 284 |
+
t-20260518223954-k9vcs-worker-1:10494:11212 [1] NCCL INFO Channel 00/0 : 9[1] -> 10[2] via P2P/CUMEM
|
| 285 |
+
t-20260518223954-k9vcs-worker-1:10498:11214 [5] NCCL INFO Channel 00/0 : 13[5] -> 14[6] via P2P/CUMEM
|
| 286 |
+
t-20260518223954-k9vcs-worker-1:10494:11212 [1] NCCL INFO Channel 02/0 : 9[1] -> 10[2] via P2P/CUMEM
|
| 287 |
+
t-20260518223954-k9vcs-worker-1:10498:11214 [5] NCCL INFO Channel 02/0 : 13[5] -> 14[6] via P2P/CUMEM
|
| 288 |
+
t-20260518223954-k9vcs-worker-1:10497:11210 [4] NCCL INFO Channel 00/0 : 12[4] -> 13[5] via P2P/CUMEM
|
| 289 |
+
t-20260518223954-k9vcs-worker-1:10494:11212 [1] NCCL INFO Channel 04/0 : 9[1] -> 10[2] via P2P/CUMEM
|
| 290 |
+
t-20260518223954-k9vcs-worker-1:10498:11214 [5] NCCL INFO Channel 04/0 : 13[5] -> 14[6] via P2P/CUMEM
|
| 291 |
+
t-20260518223954-k9vcs-worker-1:10497:11210 [4] NCCL INFO Channel 02/0 : 12[4] -> 13[5] via P2P/CUMEM
|
| 292 |
+
t-20260518223954-k9vcs-worker-1:10499:11215 [6] NCCL INFO Channel 00/0 : 14[6] -> 15[7] via P2P/CUMEM
|
| 293 |
+
t-20260518223954-k9vcs-worker-1:10495:11216 [2] NCCL INFO Channel 00/0 : 10[2] -> 11[3] via P2P/CUMEM
|
| 294 |
+
t-20260518223954-k9vcs-worker-1:10494:11212 [1] NCCL INFO Channel 06/0 : 9[1] -> 10[2] via P2P/CUMEM
|
| 295 |
+
t-20260518223954-k9vcs-worker-1:10498:11214 [5] NCCL INFO Channel 06/0 : 13[5] -> 14[6] via P2P/CUMEM
|
| 296 |
+
t-20260518223954-k9vcs-worker-1:10497:11210 [4] NCCL INFO Channel 06/0 : 12[4] -> 13[5] via P2P/CUMEM
|
| 297 |
+
t-20260518223954-k9vcs-worker-1:10499:11215 [6] NCCL INFO Channel 02/0 : 14[6] -> 15[7] via P2P/CUMEM
|
| 298 |
+
t-20260518223954-k9vcs-worker-1:10495:11216 [2] NCCL INFO Channel 04/0 : 10[2] -> 11[3] via P2P/CUMEM
|
| 299 |
+
t-20260518223954-k9vcs-worker-1:10494:11212 [1] NCCL INFO Channel 08/0 : 9[1] -> 10[2] via P2P/CUMEM
|
| 300 |
+
t-20260518223954-k9vcs-worker-1:10498:11214 [5] NCCL INFO Channel 08/0 : 13[5] -> 14[6] via P2P/CUMEM
|
| 301 |
+
t-20260518223954-k9vcs-worker-1:10497:11210 [4] NCCL INFO Channel 08/0 : 12[4] -> 13[5] via P2P/CUMEM
|
| 302 |
+
t-20260518223954-k9vcs-worker-1:10499:11215 [6] NCCL INFO Channel 04/0 : 14[6] -> 15[7] via P2P/CUMEM
|
| 303 |
+
t-20260518223954-k9vcs-worker-1:10495:11216 [2] NCCL INFO Channel 06/0 : 10[2] -> 11[3] via P2P/CUMEM
|
| 304 |
+
t-20260518223954-k9vcs-worker-1:10494:11212 [1] NCCL INFO Channel 10/0 : 9[1] -> 10[2] via P2P/CUMEM
|
| 305 |
+
t-20260518223954-k9vcs-worker-1:10498:11214 [5] NCCL INFO Channel 10/0 : 13[5] -> 14[6] via P2P/CUMEM
|
| 306 |
+
t-20260518223954-k9vcs-worker-1:10497:11210 [4] NCCL INFO Channel 10/0 : 12[4] -> 13[5] via P2P/CUMEM
|
| 307 |
+
t-20260518223954-k9vcs-worker-1:10499:11215 [6] NCCL INFO Channel 08/0 : 14[6] -> 15[7] via P2P/CUMEM
|
| 308 |
+
t-20260518223954-k9vcs-worker-1:10495:11216 [2] NCCL INFO Channel 08/0 : 10[2] -> 11[3] via P2P/CUMEM
|
| 309 |
+
t-20260518223954-k9vcs-worker-1:10494:11212 [1] NCCL INFO Channel 12/0 : 9[1] -> 10[2] via P2P/CUMEM
|
| 310 |
+
t-20260518223954-k9vcs-worker-1:10498:11214 [5] NCCL INFO Channel 12/0 : 13[5] -> 14[6] via P2P/CUMEM
|
| 311 |
+
t-20260518223954-k9vcs-worker-1:10497:11210 [4] NCCL INFO Channel 14/0 : 12[4] -> 13[5] via P2P/CUMEM
|
| 312 |
+
t-20260518223954-k9vcs-worker-1:10499:11215 [6] NCCL INFO Channel 10/0 : 14[6] -> 15[7] via P2P/CUMEM
|
| 313 |
+
t-20260518223954-k9vcs-worker-1:10495:11216 [2] NCCL INFO Channel 12/0 : 10[2] -> 11[3] via P2P/CUMEM
|
| 314 |
+
t-20260518223954-k9vcs-worker-1:10494:11212 [1] NCCL INFO Channel 14/0 : 9[1] -> 10[2] via P2P/CUMEM
|
| 315 |
+
t-20260518223954-k9vcs-worker-1:10498:11214 [5] NCCL INFO Channel 14/0 : 13[5] -> 14[6] via P2P/CUMEM
|
| 316 |
+
t-20260518223954-k9vcs-worker-1:10499:11215 [6] NCCL INFO Channel 12/0 : 14[6] -> 15[7] via P2P/CUMEM
|
| 317 |
+
t-20260518223954-k9vcs-worker-1:10495:11216 [2] NCCL INFO Channel 14/0 : 10[2] -> 11[3] via P2P/CUMEM
|
| 318 |
+
t-20260518223954-k9vcs-worker-1:10493:11211 [0] NCCL INFO Channel 01/0 : 8[0] -> 15[7] via P2P/CUMEM
|
| 319 |
+
t-20260518223954-k9vcs-worker-1:10496:11217 [3] NCCL INFO Channel 00/0 : 11[3] -> 12[4] via P2P/CUMEM
|
| 320 |
+
t-20260518223954-k9vcs-worker-1:10493:11211 [0] NCCL INFO Channel 03/0 : 8[0] -> 15[7] via P2P/CUMEM
|
| 321 |
+
t-20260518223954-k9vcs-worker-1:10496:11217 [3] NCCL INFO Channel 02/0 : 11[3] -> 12[4] via P2P/CUMEM
|
| 322 |
+
t-20260518223954-k9vcs-worker-1:10493:11211 [0] NCCL INFO Channel 05/0 : 8[0] -> 15[7] via P2P/CUMEM
|
| 323 |
+
t-20260518223954-k9vcs-worker-1:10496:11217 [3] NCCL INFO Channel 04/0 : 11[3] -> 12[4] via P2P/CUMEM
|
| 324 |
+
t-20260518223954-k9vcs-worker-1:10493:11211 [0] NCCL INFO Channel 07/0 : 8[0] -> 15[7] via P2P/CUMEM
|
| 325 |
+
t-20260518223954-k9vcs-worker-1:10496:11217 [3] NCCL INFO Channel 06/0 : 11[3] -> 12[4] via P2P/CUMEM
|
| 326 |
+
t-20260518223954-k9vcs-worker-1:10493:11211 [0] NCCL INFO Channel 09/0 : 8[0] -> 15[7] via P2P/CUMEM
|
| 327 |
+
t-20260518223954-k9vcs-worker-1:10496:11217 [3] NCCL INFO Channel 08/0 : 11[3] -> 12[4] via P2P/CUMEM
|
| 328 |
+
t-20260518223954-k9vcs-worker-1:10493:11211 [0] NCCL INFO Channel 11/0 : 8[0] -> 15[7] via P2P/CUMEM
|
| 329 |
+
t-20260518223954-k9vcs-worker-1:10496:11217 [3] NCCL INFO Channel 10/0 : 11[3] -> 12[4] via P2P/CUMEM
|
| 330 |
+
t-20260518223954-k9vcs-worker-1:10493:11211 [0] NCCL INFO Channel 13/0 : 8[0] -> 15[7] via P2P/CUMEM
|
| 331 |
+
t-20260518223954-k9vcs-worker-1:10496:11217 [3] NCCL INFO Channel 12/0 : 11[3] -> 12[4] via P2P/CUMEM
|
| 332 |
+
t-20260518223954-k9vcs-worker-1:10493:11211 [0] NCCL INFO Channel 15/0 : 8[0] -> 15[7] via P2P/CUMEM
|
| 333 |
+
t-20260518223954-k9vcs-worker-1:10496:11217 [3] NCCL INFO Channel 14/0 : 11[3] -> 12[4] via P2P/CUMEM
|
| 334 |
+
t-20260518223954-k9vcs-worker-1:10498:11218 [5] NCCL INFO [Proxy Progress] Device 5 CPU core 98
|
| 335 |
+
t-20260518223954-k9vcs-worker-1:10498:11214 [5] NCCL INFO Channel 04/0 : 5[5] -> 13[5] [receive] via NET/IBext_v9/13/GDRDMA
|
| 336 |
+
t-20260518223954-k9vcs-worker-1:10498:11214 [5] NCCL INFO Channel 12/0 : 5[5] -> 13[5] [receive] via NET/IBext_v9/13/GDRDMA
|
| 337 |
+
t-20260518223954-k9vcs-worker-1:10494:11219 [1] NCCL INFO [Proxy Progress] Device 1 CPU core 19
|
| 338 |
+
t-20260518223954-k9vcs-worker-1:10498:11214 [5] NCCL INFO Channel 05/0 : 13[5] -> 5[5] [send] via NET/IBext_v9/13/GDRDMA
|
| 339 |
+
t-20260518223954-k9vcs-worker-1:10494:11212 [1] NCCL INFO Channel 00/0 : 1[1] -> 9[1] [receive] via NET/IBext_v9/9/GDRDMA
|
| 340 |
+
t-20260518223954-k9vcs-worker-1:10499:11220 [6] NCCL INFO [Proxy Progress] Device 6 CPU core 150
|
| 341 |
+
t-20260518223954-k9vcs-worker-1:10498:11214 [5] NCCL INFO Channel 13/0 : 13[5] -> 5[5] [send] via NET/IBext_v9/13/GDRDMA
|
| 342 |
+
t-20260518223954-k9vcs-worker-1:10499:11215 [6] NCCL INFO Channel 07/0 : 6[6] -> 14[6] [receive] via NET/IBext_v9/14/GDRDMA
|
| 343 |
+
t-20260518223954-k9vcs-worker-1:10494:11212 [1] NCCL INFO Channel 08/0 : 1[1] -> 9[1] [receive] via NET/IBext_v9/9/GDRDMA
|
| 344 |
+
t-20260518223954-k9vcs-worker-1:10499:11215 [6] NCCL INFO Channel 15/0 : 6[6] -> 14[6] [receive] via NET/IBext_v9/14/GDRDMA
|
| 345 |
+
t-20260518223954-k9vcs-worker-1:10494:11212 [1] NCCL INFO Channel 01/0 : 9[1] -> 1[1] [send] via NET/IBext_v9/9/GDRDMA
|
| 346 |
+
t-20260518223954-k9vcs-worker-1:10499:11215 [6] NCCL INFO Channel 06/0 : 14[6] -> 6[6] [send] via NET/IBext_v9/14/GDRDMA
|
| 347 |
+
t-20260518223954-k9vcs-worker-1:10494:11212 [1] NCCL INFO Channel 09/0 : 9[1] -> 1[1] [send] via NET/IBext_v9/9/GDRDMA
|
| 348 |
+
t-20260518223954-k9vcs-worker-1:10499:11215 [6] NCCL INFO Channel 14/0 : 14[6] -> 6[6] [send] via NET/IBext_v9/14/GDRDMA
|
| 349 |
+
t-20260518223954-k9vcs-worker-1:10500:11221 [7] NCCL INFO [Proxy Progress] Device 7 CPU core 150
|
| 350 |
+
t-20260518223954-k9vcs-worker-1:10500:11213 [7] NCCL INFO Channel 06/0 : 7[7] -> 15[7] [receive] via NET/IBext_v9/15/GDRDMA
|
| 351 |
+
t-20260518223954-k9vcs-worker-1:10500:11213 [7] NCCL INFO Channel 14/0 : 7[7] -> 15[7] [receive] via NET/IBext_v9/15/GDRDMA
|
| 352 |
+
t-20260518223954-k9vcs-worker-1:10500:11213 [7] NCCL INFO Channel 07/0 : 15[7] -> 7[7] [send] via NET/IBext_v9/15/GDRDMA
|
| 353 |
+
t-20260518223954-k9vcs-worker-1:10500:11213 [7] NCCL INFO Channel 15/0 : 15[7] -> 7[7] [send] via NET/IBext_v9/15/GDRDMA
|
| 354 |
+
t-20260518223954-k9vcs-worker-1:10495:11222 [2] NCCL INFO [Proxy Progress] Device 2 CPU core 26
|
| 355 |
+
t-20260518223954-k9vcs-worker-1:10495:11216 [2] NCCL INFO Channel 03/0 : 2[2] -> 10[2] [receive] via NET/IBext_v9/10/GDRDMA
|
| 356 |
+
t-20260518223954-k9vcs-worker-1:10496:11223 [3] NCCL INFO [Proxy Progress] Device 3 CPU core 76
|
| 357 |
+
t-20260518223954-k9vcs-worker-1:10496:11217 [3] NCCL INFO Channel 02/0 : 3[3] -> 11[3] [receive] via NET/IBext_v9/11/GDRDMA
|
| 358 |
+
t-20260518223954-k9vcs-worker-1:10496:11217 [3] NCCL INFO Channel 10/0 : 3[3] -> 11[3] [receive] via NET/IBext_v9/11/GDRDMA
|
| 359 |
+
t-20260518223954-k9vcs-worker-1:10496:11217 [3] NCCL INFO Channel 03/0 : 11[3] -> 3[3] [send] via NET/IBext_v9/11/GDRDMA
|
| 360 |
+
t-20260518223954-k9vcs-worker-1:10495:11216 [2] NCCL INFO Channel 11/0 : 2[2] -> 10[2] [receive] via NET/IBext_v9/10/GDRDMA
|
| 361 |
+
t-20260518223954-k9vcs-worker-1:10496:11217 [3] NCCL INFO Channel 11/0 : 11[3] -> 3[3] [send] via NET/IBext_v9/11/GDRDMA
|
| 362 |
+
t-20260518223954-k9vcs-worker-1:10495:11216 [2] NCCL INFO Channel 02/0 : 10[2] -> 2[2] [send] via NET/IBext_v9/10/GDRDMA
|
| 363 |
+
t-20260518223954-k9vcs-worker-1:10495:11216 [2] NCCL INFO Channel 10/0 : 10[2] -> 2[2] [send] via NET/IBext_v9/10/GDRDMA
|
| 364 |
+
t-20260518223954-k9vcs-worker-1:10493:11224 [0] NCCL INFO [Proxy Progress] Device 0 CPU core 4
|
| 365 |
+
t-20260518223954-k9vcs-worker-1:10493:11211 [0] NCCL INFO Channel 01/0 : 0[0] -> 8[0] [receive] via NET/IBext_v9/8/GDRDMA
|
| 366 |
+
t-20260518223954-k9vcs-worker-1:10493:11211 [0] NCCL INFO Channel 09/0 : 0[0] -> 8[0] [receive] via NET/IBext_v9/8/GDRDMA
|
| 367 |
+
t-20260518223954-k9vcs-worker-1:10493:11211 [0] NCCL INFO Channel 00/0 : 8[0] -> 0[0] [send] via NET/IBext_v9/8/GDRDMA
|
| 368 |
+
t-20260518223954-k9vcs-worker-1:10493:11211 [0] NCCL INFO Channel 08/0 : 8[0] -> 0[0] [send] via NET/IBext_v9/8/GDRDMA
|
| 369 |
+
t-20260518223954-k9vcs-worker-1:10497:11225 [4] NCCL INFO [Proxy Progress] Device 4 CPU core 95
|
| 370 |
+
t-20260518223954-k9vcs-worker-1:10497:11210 [4] NCCL INFO Channel 05/0 : 4[4] -> 12[4] [receive] via NET/IBext_v9/12/GDRDMA
|
| 371 |
+
t-20260518223954-k9vcs-worker-1:10497:11210 [4] NCCL INFO Channel 13/0 : 4[4] -> 12[4] [receive] via NET/IBext_v9/12/GDRDMA
|
| 372 |
+
t-20260518223954-k9vcs-worker-1:10497:11210 [4] NCCL INFO Channel 04/0 : 12[4] -> 4[4] [send] via NET/IBext_v9/12/GDRDMA
|
| 373 |
+
t-20260518223954-k9vcs-worker-1:10497:11210 [4] NCCL INFO Channel 12/0 : 12[4] -> 4[4] [send] via NET/IBext_v9/12/GDRDMA
|
| 374 |
+
t-20260518223954-k9vcs-worker-1:10500:11213 [7] NCCL INFO Channel 00/0 : 15[7] -> 8[0] via P2P/CUMEM
|
| 375 |
+
t-20260518223954-k9vcs-worker-1:10500:11213 [7] NCCL INFO Channel 02/0 : 15[7] -> 8[0] via P2P/CUMEM
|
| 376 |
+
t-20260518223954-k9vcs-worker-1:10500:11213 [7] NCCL INFO Channel 04/0 : 15[7] -> 8[0] via P2P/CUMEM
|
| 377 |
+
t-20260518223954-k9vcs-worker-1:10500:11213 [7] NCCL INFO Channel 06/0 : 15[7] -> 8[0] via P2P/CUMEM
|
| 378 |
+
t-20260518223954-k9vcs-worker-1:10500:11213 [7] NCCL INFO Channel 08/0 : 15[7] -> 8[0] via P2P/CUMEM
|
| 379 |
+
t-20260518223954-k9vcs-worker-1:10500:11213 [7] NCCL INFO Channel 10/0 : 15[7] -> 8[0] via P2P/CUMEM
|
| 380 |
+
t-20260518223954-k9vcs-worker-1:10498:11214 [5] NCCL INFO Channel 01/0 : 13[5] -> 12[4] via P2P/CUMEM
|
| 381 |
+
t-20260518223954-k9vcs-worker-1:10500:11213 [7] NCCL INFO Channel 12/0 : 15[7] -> 8[0] via P2P/CUMEM
|
| 382 |
+
t-20260518223954-k9vcs-worker-1:10498:11214 [5] NCCL INFO Channel 03/0 : 13[5] -> 12[4] via P2P/CUMEM
|
| 383 |
+
t-20260518223954-k9vcs-worker-1:10500:11213 [7] NCCL INFO Channel 14/0 : 15[7] -> 8[0] via P2P/CUMEM
|
| 384 |
+
t-20260518223954-k9vcs-worker-1:10498:11214 [5] NCCL INFO Channel 07/0 : 13[5] -> 12[4] via P2P/CUMEM
|
| 385 |
+
t-20260518223954-k9vcs-worker-1:10498:11214 [5] NCCL INFO Channel 09/0 : 13[5] -> 12[4] via P2P/CUMEM
|
| 386 |
+
t-20260518223954-k9vcs-worker-1:10500:11213 [7] NCCL INFO Channel 01/0 : 15[7] -> 14[6] via P2P/CUMEM
|
| 387 |
+
t-20260518223954-k9vcs-worker-1:10498:11214 [5] NCCL INFO Channel 11/0 : 13[5] -> 12[4] via P2P/CUMEM
|
| 388 |
+
t-20260518223954-k9vcs-worker-1:10500:11213 [7] NCCL INFO Channel 03/0 : 15[7] -> 14[6] via P2P/CUMEM
|
| 389 |
+
t-20260518223954-k9vcs-worker-1:10498:11214 [5] NCCL INFO Channel 15/0 : 13[5] -> 12[4] via P2P/CUMEM
|
| 390 |
+
t-20260518223954-k9vcs-worker-1:10500:11213 [7] NCCL INFO Channel 05/0 : 15[7] -> 14[6] via P2P/CUMEM
|
| 391 |
+
t-20260518223954-k9vcs-worker-1:10500:11213 [7] NCCL INFO Channel 09/0 : 15[7] -> 14[6] via P2P/CUMEM
|
| 392 |
+
t-20260518223954-k9vcs-worker-1:10500:11213 [7] NCCL INFO Channel 11/0 : 15[7] -> 14[6] via P2P/CUMEM
|
| 393 |
+
t-20260518223954-k9vcs-worker-1:10500:11213 [7] NCCL INFO Channel 13/0 : 15[7] -> 14[6] via P2P/CUMEM
|
| 394 |
+
t-20260518223954-k9vcs-worker-1:10494:11212 [1] NCCL INFO Channel 03/0 : 9[1] -> 8[0] via P2P/CUMEM
|
| 395 |
+
t-20260518223954-k9vcs-worker-1:10494:11212 [1] NCCL INFO Channel 05/0 : 9[1] -> 8[0] via P2P/CUMEM
|
| 396 |
+
t-20260518223954-k9vcs-worker-1:10494:11212 [1] NCCL INFO Channel 07/0 : 9[1] -> 8[0] via P2P/CUMEM
|
| 397 |
+
t-20260518223954-k9vcs-worker-1:10494:11212 [1] NCCL INFO Channel 11/0 : 9[1] -> 8[0] via P2P/CUMEM
|
| 398 |
+
t-20260518223954-k9vcs-worker-1:10494:11212 [1] NCCL INFO Channel 13/0 : 9[1] -> 8[0] via P2P/CUMEM
|
| 399 |
+
t-20260518223954-k9vcs-worker-1:10494:11212 [1] NCCL INFO Channel 15/0 : 9[1] -> 8[0] via P2P/CUMEM
|
| 400 |
+
t-20260518223954-k9vcs-worker-1:10499:11215 [6] NCCL INFO Channel 01/0 : 14[6] -> 13[5] via P2P/CUMEM
|
| 401 |
+
t-20260518223954-k9vcs-worker-1:10499:11215 [6] NCCL INFO Channel 03/0 : 14[6] -> 13[5] via P2P/CUMEM
|
| 402 |
+
t-20260518223954-k9vcs-worker-1:10496:11217 [3] NCCL INFO Channel 01/0 : 11[3] -> 10[2] via P2P/CUMEM
|
| 403 |
+
t-20260518223954-k9vcs-worker-1:10499:11215 [6] NCCL INFO Channel 05/0 : 14[6] -> 13[5] via P2P/CUMEM
|
| 404 |
+
t-20260518223954-k9vcs-worker-1:10496:11217 [3] NCCL INFO Channel 05/0 : 11[3] -> 10[2] via P2P/CUMEM
|
| 405 |
+
t-20260518223954-k9vcs-worker-1:10499:11215 [6] NCCL INFO Channel 07/0 : 14[6] -> 13[5] via P2P/CUMEM
|
| 406 |
+
t-20260518223954-k9vcs-worker-1:10496:11217 [3] NCCL INFO Channel 07/0 : 11[3] -> 10[2] via P2P/CUMEM
|
| 407 |
+
t-20260518223954-k9vcs-worker-1:10499:11215 [6] NCCL INFO Channel 09/0 : 14[6] -> 13[5] via P2P/CUMEM
|
| 408 |
+
t-20260518223954-k9vcs-worker-1:10496:11217 [3] NCCL INFO Channel 09/0 : 11[3] -> 10[2] via P2P/CUMEM
|
| 409 |
+
t-20260518223954-k9vcs-worker-1:10499:11215 [6] NCCL INFO Channel 11/0 : 14[6] -> 13[5] via P2P/CUMEM
|
| 410 |
+
t-20260518223954-k9vcs-worker-1:10496:11217 [3] NCCL INFO Channel 13/0 : 11[3] -> 10[2] via P2P/CUMEM
|
| 411 |
+
t-20260518223954-k9vcs-worker-1:10499:11215 [6] NCCL INFO Channel 13/0 : 14[6] -> 13[5] via P2P/CUMEM
|
| 412 |
+
t-20260518223954-k9vcs-worker-1:10496:11217 [3] NCCL INFO Channel 15/0 : 11[3] -> 10[2] via P2P/CUMEM
|
| 413 |
+
t-20260518223954-k9vcs-worker-1:10499:11215 [6] NCCL INFO Channel 15/0 : 14[6] -> 13[5] via P2P/CUMEM
|
| 414 |
+
t-20260518223954-k9vcs-worker-1:10497:11210 [4] NCCL INFO Channel 01/0 : 12[4] -> 11[3] via P2P/CUMEM
|
| 415 |
+
t-20260518223954-k9vcs-worker-1:10497:11210 [4] NCCL INFO Channel 03/0 : 12[4] -> 11[3] via P2P/CUMEM
|
| 416 |
+
t-20260518223954-k9vcs-worker-1:10497:11210 [4] NCCL INFO Channel 05/0 : 12[4] -> 11[3] via P2P/CUMEM
|
| 417 |
+
t-20260518223954-k9vcs-worker-1:10497:11210 [4] NCCL INFO Channel 07/0 : 12[4] -> 11[3] via P2P/CUMEM
|
| 418 |
+
t-20260518223954-k9vcs-worker-1:10497:11210 [4] NCCL INFO Channel 09/0 : 12[4] -> 11[3] via P2P/CUMEM
|
| 419 |
+
t-20260518223954-k9vcs-worker-1:10497:11210 [4] NCCL INFO Channel 11/0 : 12[4] -> 11[3] via P2P/CUMEM
|
| 420 |
+
t-20260518223954-k9vcs-worker-1:10497:11210 [4] NCCL INFO Channel 13/0 : 12[4] -> 11[3] via P2P/CUMEM
|
| 421 |
+
t-20260518223954-k9vcs-worker-1:10497:11210 [4] NCCL INFO Channel 15/0 : 12[4] -> 11[3] via P2P/CUMEM
|
| 422 |
+
t-20260518223954-k9vcs-worker-1:10495:11216 [2] NCCL INFO Channel 01/0 : 10[2] -> 9[1] via P2P/CUMEM
|
| 423 |
+
t-20260518223954-k9vcs-worker-1:10495:11216 [2] NCCL INFO Channel 03/0 : 10[2] -> 9[1] via P2P/CUMEM
|
| 424 |
+
t-20260518223954-k9vcs-worker-1:10495:11216 [2] NCCL INFO Channel 05/0 : 10[2] -> 9[1] via P2P/CUMEM
|
| 425 |
+
t-20260518223954-k9vcs-worker-1:10495:11216 [2] NCCL INFO Channel 07/0 : 10[2] -> 9[1] via P2P/CUMEM
|
| 426 |
+
t-20260518223954-k9vcs-worker-1:10495:11216 [2] NCCL INFO Channel 09/0 : 10[2] -> 9[1] via P2P/CUMEM
|
| 427 |
+
t-20260518223954-k9vcs-worker-1:10495:11216 [2] NCCL INFO Channel 11/0 : 10[2] -> 9[1] via P2P/CUMEM
|
| 428 |
+
t-20260518223954-k9vcs-worker-1:10495:11216 [2] NCCL INFO Channel 13/0 : 10[2] -> 9[1] via P2P/CUMEM
|
| 429 |
+
t-20260518223954-k9vcs-worker-1:10495:11216 [2] NCCL INFO Channel 15/0 : 10[2] -> 9[1] via P2P/CUMEM
|
| 430 |
+
t-20260518223954-k9vcs-worker-1:10500:11196 [7] NCCL INFO NCCL_IB_GID_INDEX set by environment to 7.
|
| 431 |
+
t-20260518223954-k9vcs-worker-1:10500:11196 [7] NCCL INFO NCCL_IB_TIMEOUT set by environment to 23.
|
| 432 |
+
t-20260518223954-k9vcs-worker-1:10500:11196 [7] NCCL INFO NCCL_IB_RETRY_CNT set by environment to 7.
|
| 433 |
+
t-20260518223954-k9vcs-worker-1:10493:11206 [0] NCCL INFO NCCL_IB_GID_INDEX set by environment to 7.
|
| 434 |
+
t-20260518223954-k9vcs-worker-1:10493:11206 [0] NCCL INFO NCCL_IB_TIMEOUT set by environment to 23.
|
| 435 |
+
t-20260518223954-k9vcs-worker-1:10493:11206 [0] NCCL INFO NCCL_IB_RETRY_CNT set by environment to 7.
|
| 436 |
+
t-20260518223954-k9vcs-worker-1:10499:11194 [6] NCCL INFO NCCL_IB_GID_INDEX set by environment to 7.
|
| 437 |
+
t-20260518223954-k9vcs-worker-1:10499:11194 [6] NCCL INFO NCCL_IB_TIMEOUT set by environment to 23.
|
| 438 |
+
t-20260518223954-k9vcs-worker-1:10499:11194 [6] NCCL INFO NCCL_IB_RETRY_CNT set by environment to 7.
|
| 439 |
+
t-20260518223954-k9vcs-worker-1:10498:11203 [5] NCCL INFO NCCL_IB_GID_INDEX set by environment to 7.
|
| 440 |
+
t-20260518223954-k9vcs-worker-1:10498:11203 [5] NCCL INFO NCCL_IB_TIMEOUT set by environment to 23.
|
| 441 |
+
t-20260518223954-k9vcs-worker-1:10498:11203 [5] NCCL INFO NCCL_IB_RETRY_CNT set by environment to 7.
|
| 442 |
+
t-20260518223954-k9vcs-worker-1:10497:11202 [4] NCCL INFO NCCL_IB_GID_INDEX set by environment to 7.
|
| 443 |
+
t-20260518223954-k9vcs-worker-1:10497:11202 [4] NCCL INFO NCCL_IB_TIMEOUT set by environment to 23.
|
| 444 |
+
t-20260518223954-k9vcs-worker-1:10497:11202 [4] NCCL INFO NCCL_IB_RETRY_CNT set by environment to 7.
|
| 445 |
+
t-20260518223954-k9vcs-worker-1:10495:11198 [2] NCCL INFO NCCL_IB_GID_INDEX set by environment to 7.
|
| 446 |
+
t-20260518223954-k9vcs-worker-1:10494:11199 [1] NCCL INFO NCCL_IB_GID_INDEX set by environment to 7.
|
| 447 |
+
t-20260518223954-k9vcs-worker-1:10495:11198 [2] NCCL INFO NCCL_IB_TIMEOUT set by environment to 23.
|
| 448 |
+
t-20260518223954-k9vcs-worker-1:10494:11199 [1] NCCL INFO NCCL_IB_TIMEOUT set by environment to 23.
|
| 449 |
+
t-20260518223954-k9vcs-worker-1:10495:11198 [2] NCCL INFO NCCL_IB_RETRY_CNT set by environment to 7.
|
| 450 |
+
t-20260518223954-k9vcs-worker-1:10494:11199 [1] NCCL INFO NCCL_IB_RETRY_CNT set by environment to 7.
|
| 451 |
+
t-20260518223954-k9vcs-worker-1:10496:11195 [3] NCCL INFO NCCL_IB_GID_INDEX set by environment to 7.
|
| 452 |
+
t-20260518223954-k9vcs-worker-1:10496:11195 [3] NCCL INFO NCCL_IB_TIMEOUT set by environment to 23.
|
| 453 |
+
t-20260518223954-k9vcs-worker-1:10496:11195 [3] NCCL INFO NCCL_IB_RETRY_CNT set by environment to 7.
|
| 454 |
+
t-20260518223954-k9vcs-worker-1:10500:11213 [7] NCCL INFO Connected all rings, use ring PXN 0 GDR 1
|
| 455 |
+
t-20260518223954-k9vcs-worker-1:10495:11216 [2] NCCL INFO Connected all rings, use ring PXN 0 GDR 1
|
| 456 |
+
t-20260518223954-k9vcs-worker-1:10494:11212 [1] NCCL INFO Connected all rings, use ring PXN 0 GDR 1
|
| 457 |
+
t-20260518223954-k9vcs-worker-1:10498:11214 [5] NCCL INFO Connected all rings, use ring PXN 0 GDR 1
|
| 458 |
+
t-20260518223954-k9vcs-worker-1:10497:11210 [4] NCCL INFO Connected all rings, use ring PXN 0 GDR 1
|
| 459 |
+
t-20260518223954-k9vcs-worker-1:10499:11215 [6] NCCL INFO Connected all rings, use ring PXN 0 GDR 1
|
| 460 |
+
t-20260518223954-k9vcs-worker-1:10496:11217 [3] NCCL INFO Connected all rings, use ring PXN 0 GDR 1
|
| 461 |
+
t-20260518223954-k9vcs-worker-1:10493:11211 [0] NCCL INFO Connected all rings, use ring PXN 0 GDR 1
|
| 462 |
+
t-20260518223954-k9vcs-worker-1:10497:11544 [4] NCCL INFO NVLS comm 0xef3aa70 headRank 4 nHeads 8 buffSize 1048576 nvlsPerRankSize 33554432 nvlsTotalSize 268435456
|
| 463 |
+
t-20260518223954-k9vcs-worker-1:10496:11545 [3] NCCL INFO NVLS comm 0xe578fe0 headRank 3 nHeads 8 buffSize 1048576 nvlsPerRankSize 33554432 nvlsTotalSize 268435456
|
| 464 |
+
t-20260518223954-k9vcs-worker-1:10494:11546 [1] NCCL INFO NVLS comm 0xd479fb0 headRank 1 nHeads 8 buffSize 1048576 nvlsPerRankSize 33554432 nvlsTotalSize 268435456
|
| 465 |
+
t-20260518223954-k9vcs-worker-1:10495:11547 [2] NCCL INFO NVLS comm 0xec09b60 headRank 2 nHeads 8 buffSize 1048576 nvlsPerRankSize 33554432 nvlsTotalSize 268435456
|
| 466 |
+
t-20260518223954-k9vcs-worker-1:10500:11630 [7] NCCL INFO NVLS comm 0xef15590 headRank 7 nHeads 8 buffSize 1048576 nvlsPerRankSize 33554432 nvlsTotalSize 268435456
|
| 467 |
+
t-20260518223954-k9vcs-worker-1:10498:11629 [5] NCCL INFO NVLS comm 0xd2cdad0 headRank 5 nHeads 8 buffSize 1048576 nvlsPerRankSize 33554432 nvlsTotalSize 268435456
|
| 468 |
+
t-20260518223954-k9vcs-worker-1:10493:11632 [0] NCCL INFO NVLS comm 0x7f25641ba940 headRank 0 nHeads 8 buffSize 1048576 nvlsPerRankSize 33554432 nvlsTotalSize 268435456
|
| 469 |
+
t-20260518223954-k9vcs-worker-1:10499:11631 [6] NCCL INFO NVLS comm 0xee35ca0 headRank 6 nHeads 8 buffSize 1048576 nvlsPerRankSize 33554432 nvlsTotalSize 268435456
|
| 470 |
+
t-20260518223954-k9vcs-worker-1:10494:11546 [1] NCCL INFO Channel 01/0 : 1[1] -> 9[1] [receive] via NET/IBext_v9/9/GDRDMA
|
| 471 |
+
t-20260518223954-k9vcs-worker-1:10500:11630 [7] NCCL INFO Channel 00/0 : 7[7] -> 15[7] [receive] via NET/IBext_v9/15/GDRDMA
|
| 472 |
+
t-20260518223954-k9vcs-worker-1:10497:11544 [4] NCCL INFO Channel 00/0 : 4[4] -> 12[4] [receive] via NET/IBext_v9/12/GDRDMA
|
| 473 |
+
t-20260518223954-k9vcs-worker-1:10493:11632 [0] NCCL INFO Channel 00/0 : 0[0] -> 8[0] [receive] via NET/IBext_v9/8/GDRDMA
|
| 474 |
+
t-20260518223954-k9vcs-worker-1:10496:11545 [3] NCCL INFO Channel 00/0 : 3[3] -> 11[3] [receive] via NET/IBext_v9/11/GDRDMA
|
| 475 |
+
t-20260518223954-k9vcs-worker-1:10499:11631 [6] NCCL INFO Channel 00/0 : 6[6] -> 14[6] [receive] via NET/IBext_v9/14/GDRDMA
|
| 476 |
+
t-20260518223954-k9vcs-worker-1:10495:11547 [2] NCCL INFO Channel 00/0 : 2[2] -> 10[2] [receive] via NET/IBext_v9/10/GDRDMA
|
| 477 |
+
t-20260518223954-k9vcs-worker-1:10498:11629 [5] NCCL INFO Channel 00/0 : 5[5] -> 13[5] [receive] via NET/IBext_v9/13/GDRDMA
|
| 478 |
+
t-20260518223954-k9vcs-worker-1:10494:11546 [1] NCCL INFO Channel 02/0 : 1[1] -> 9[1] [receive] via NET/IBext_v9/9/GDRDMA
|
| 479 |
+
t-20260518223954-k9vcs-worker-1:10500:11630 [7] NCCL INFO Channel 01/0 : 7[7] -> 15[7] [receive] via NET/IBext_v9/15/GDRDMA
|
| 480 |
+
t-20260518223954-k9vcs-worker-1:10493:11632 [0] NCCL INFO Channel 02/0 : 0[0] -> 8[0] [receive] via NET/IBext_v9/8/GDRDMA
|
| 481 |
+
t-20260518223954-k9vcs-worker-1:10499:11631 [6] NCCL INFO Channel 01/0 : 6[6] -> 14[6] [receive] via NET/IBext_v9/14/GDRDMA
|
| 482 |
+
t-20260518223954-k9vcs-worker-1:10495:11547 [2] NCCL INFO Channel 01/0 : 2[2] -> 10[2] [receive] via NET/IBext_v9/10/GDRDMA
|
| 483 |
+
t-20260518223954-k9vcs-worker-1:10496:11545 [3] NCCL INFO Channel 01/0 : 3[3] -> 11[3] [receive] via NET/IBext_v9/11/GDRDMA
|
| 484 |
+
t-20260518223954-k9vcs-worker-1:10498:11629 [5] NCCL INFO Channel 01/0 : 5[5] -> 13[5] [receive] via NET/IBext_v9/13/GDRDMA
|
| 485 |
+
t-20260518223954-k9vcs-worker-1:10497:11544 [4] NCCL INFO Channel 01/0 : 4[4] -> 12[4] [receive] via NET/IBext_v9/12/GDRDMA
|
| 486 |
+
t-20260518223954-k9vcs-worker-1:10494:11546 [1] NCCL INFO Channel 03/0 : 1[1] -> 9[1] [receive] via NET/IBext_v9/9/GDRDMA
|
| 487 |
+
t-20260518223954-k9vcs-worker-1:10500:11630 [7] NCCL INFO Channel 02/0 : 7[7] -> 15[7] [receive] via NET/IBext_v9/15/GDRDMA
|
| 488 |
+
t-20260518223954-k9vcs-worker-1:10493:11632 [0] NCCL INFO Channel 03/0 : 0[0] -> 8[0] [receive] via NET/IBext_v9/8/GDRDMA
|
| 489 |
+
t-20260518223954-k9vcs-worker-1:10499:11631 [6] NCCL INFO Channel 02/0 : 6[6] -> 14[6] [receive] via NET/IBext_v9/14/GDRDMA
|
| 490 |
+
t-20260518223954-k9vcs-worker-1:10495:11547 [2] NCCL INFO Channel 02/0 : 2[2] -> 10[2] [receive] via NET/IBext_v9/10/GDRDMA
|
| 491 |
+
t-20260518223954-k9vcs-worker-1:10494:11546 [1] NCCL INFO Channel 04/0 : 1[1] -> 9[1] [receive] via NET/IBext_v9/9/GDRDMA
|
| 492 |
+
t-20260518223954-k9vcs-worker-1:10500:11630 [7] NCCL INFO Channel 03/0 : 7[7] -> 15[7] [receive] via NET/IBext_v9/15/GDRDMA
|
| 493 |
+
t-20260518223954-k9vcs-worker-1:10498:11629 [5] NCCL INFO Channel 02/0 : 5[5] -> 13[5] [receive] via NET/IBext_v9/13/GDRDMA
|
| 494 |
+
t-20260518223954-k9vcs-worker-1:10496:11545 [3] NCCL INFO Channel 03/0 : 3[3] -> 11[3] [receive] via NET/IBext_v9/11/GDRDMA
|
| 495 |
+
t-20260518223954-k9vcs-worker-1:10497:11544 [4] NCCL INFO Channel 02/0 : 4[4] -> 12[4] [receive] via NET/IBext_v9/12/GDRDMA
|
| 496 |
+
t-20260518223954-k9vcs-worker-1:10493:11632 [0] NCCL INFO Channel 04/0 : 0[0] -> 8[0] [receive] via NET/IBext_v9/8/GDRDMA
|
| 497 |
+
t-20260518223954-k9vcs-worker-1:10499:11631 [6] NCCL INFO Channel 03/0 : 6[6] -> 14[6] [receive] via NET/IBext_v9/14/GDRDMA
|
| 498 |
+
t-20260518223954-k9vcs-worker-1:10500:11630 [7] NCCL INFO Channel 04/0 : 7[7] -> 15[7] [receive] via NET/IBext_v9/15/GDRDMA
|
| 499 |
+
t-20260518223954-k9vcs-worker-1:10494:11546 [1] NCCL INFO Channel 05/0 : 1[1] -> 9[1] [receive] via NET/IBext_v9/9/GDRDMA
|
| 500 |
+
t-20260518223954-k9vcs-worker-1:10496:11545 [3] NCCL INFO Channel 04/0 : 3[3] -> 11[3] [receive] via NET/IBext_v9/11/GDRDMA
|
| 501 |
+
t-20260518223954-k9vcs-worker-1:10498:11629 [5] NCCL INFO Channel 03/0 : 5[5] -> 13[5] [receive] via NET/IBext_v9/13/GDRDMA
|
| 502 |
+
t-20260518223954-k9vcs-worker-1:10495:11547 [2] NCCL INFO Channel 04/0 : 2[2] -> 10[2] [receive] via NET/IBext_v9/10/GDRDMA
|
| 503 |
+
t-20260518223954-k9vcs-worker-1:10493:11632 [0] NCCL INFO Channel 05/0 : 0[0] -> 8[0] [receive] via NET/IBext_v9/8/GDRDMA
|
| 504 |
+
t-20260518223954-k9vcs-worker-1:10497:11544 [4] NCCL INFO Channel 03/0 : 4[4] -> 12[4] [receive] via NET/IBext_v9/12/GDRDMA
|
| 505 |
+
t-20260518223954-k9vcs-worker-1:10500:11630 [7] NCCL INFO Channel 05/0 : 7[7] -> 15[7] [receive] via NET/IBext_v9/15/GDRDMA
|
| 506 |
+
t-20260518223954-k9vcs-worker-1:10499:11631 [6] NCCL INFO Channel 04/0 : 6[6] -> 14[6] [receive] via NET/IBext_v9/14/GDRDMA
|
| 507 |
+
t-20260518223954-k9vcs-worker-1:10494:11546 [1] NCCL INFO Channel 06/0 : 1[1] -> 9[1] [receive] via NET/IBext_v9/9/GDRDMA
|
| 508 |
+
t-20260518223954-k9vcs-worker-1:10498:11629 [5] NCCL INFO Channel 05/0 : 5[5] -> 13[5] [receive] via NET/IBext_v9/13/GDRDMA
|
| 509 |
+
t-20260518223954-k9vcs-worker-1:10496:11545 [3] NCCL INFO Channel 05/0 : 3[3] -> 11[3] [receive] via NET/IBext_v9/11/GDRDMA
|
| 510 |
+
t-20260518223954-k9vcs-worker-1:10495:11547 [2] NCCL INFO Channel 05/0 : 2[2] -> 10[2] [receive] via NET/IBext_v9/10/GDRDMA
|
| 511 |
+
t-20260518223954-k9vcs-worker-1:10493:11632 [0] NCCL INFO Channel 06/0 : 0[0] -> 8[0] [receive] via NET/IBext_v9/8/GDRDMA
|
| 512 |
+
t-20260518223954-k9vcs-worker-1:10500:11630 [7] NCCL INFO Channel 07/0 : 7[7] -> 15[7] [receive] via NET/IBext_v9/15/GDRDMA
|
| 513 |
+
t-20260518223954-k9vcs-worker-1:10497:11544 [4] NCCL INFO Channel 04/0 : 4[4] -> 12[4] [receive] via NET/IBext_v9/12/GDRDMA
|
| 514 |
+
t-20260518223954-k9vcs-worker-1:10499:11631 [6] NCCL INFO Channel 05/0 : 6[6] -> 14[6] [receive] via NET/IBext_v9/14/GDRDMA
|
| 515 |
+
t-20260518223954-k9vcs-worker-1:10494:11546 [1] NCCL INFO Channel 07/0 : 1[1] -> 9[1] [receive] via NET/IBext_v9/9/GDRDMA
|
| 516 |
+
t-20260518223954-k9vcs-worker-1:10498:11629 [5] NCCL INFO Channel 06/0 : 5[5] -> 13[5] [receive] via NET/IBext_v9/13/GDRDMA
|
| 517 |
+
t-20260518223954-k9vcs-worker-1:10496:11545 [3] NCCL INFO Channel 06/0 : 3[3] -> 11[3] [receive] via NET/IBext_v9/11/GDRDMA
|
| 518 |
+
t-20260518223954-k9vcs-worker-1:10493:11632 [0] NCCL INFO Channel 07/0 : 0[0] -> 8[0] [receive] via NET/IBext_v9/8/GDRDMA
|
| 519 |
+
t-20260518223954-k9vcs-worker-1:10500:11630 [7] NCCL INFO Channel 08/0 : 7[7] -> 15[7] [receive] via NET/IBext_v9/15/GDRDMA
|
| 520 |
+
t-20260518223954-k9vcs-worker-1:10495:11547 [2] NCCL INFO Channel 06/0 : 2[2] -> 10[2] [receive] via NET/IBext_v9/10/GDRDMA
|
| 521 |
+
t-20260518223954-k9vcs-worker-1:10494:11546 [1] NCCL INFO Channel 09/0 : 1[1] -> 9[1] [receive] via NET/IBext_v9/9/GDRDMA
|
| 522 |
+
t-20260518223954-k9vcs-worker-1:10499:11631 [6] NCCL INFO Channel 06/0 : 6[6] -> 14[6] [receive] via NET/IBext_v9/14/GDRDMA
|
| 523 |
+
t-20260518223954-k9vcs-worker-1:10497:11544 [4] NCCL INFO Channel 06/0 : 4[4] -> 12[4] [receive] via NET/IBext_v9/12/GDRDMA
|
| 524 |
+
t-20260518223954-k9vcs-worker-1:10498:11629 [5] NCCL INFO Channel 07/0 : 5[5] -> 13[5] [receive] via NET/IBext_v9/13/GDRDMA
|
| 525 |
+
t-20260518223954-k9vcs-worker-1:10500:11630 [7] NCCL INFO Channel 09/0 : 7[7] -> 15[7] [receive] via NET/IBext_v9/15/GDRDMA
|
| 526 |
+
t-20260518223954-k9vcs-worker-1:10496:11545 [3] NCCL INFO Channel 07/0 : 3[3] -> 11[3] [receive] via NET/IBext_v9/11/GDRDMA
|
| 527 |
+
t-20260518223954-k9vcs-worker-1:10493:11632 [0] NCCL INFO Channel 08/0 : 0[0] -> 8[0] [receive] via NET/IBext_v9/8/GDRDMA
|
| 528 |
+
t-20260518223954-k9vcs-worker-1:10494:11546 [1] NCCL INFO Channel 10/0 : 1[1] -> 9[1] [receive] via NET/IBext_v9/9/GDRDMA
|
| 529 |
+
t-20260518223954-k9vcs-worker-1:10497:11544 [4] NCCL INFO Channel 07/0 : 4[4] -> 12[4] [receive] via NET/IBext_v9/12/GDRDMA
|
| 530 |
+
t-20260518223954-k9vcs-worker-1:10499:11631 [6] NCCL INFO Channel 08/0 : 6[6] -> 14[6] [receive] via NET/IBext_v9/14/GDRDMA
|
| 531 |
+
t-20260518223954-k9vcs-worker-1:10495:11547 [2] NCCL INFO Channel 07/0 : 2[2] -> 10[2] [receive] via NET/IBext_v9/10/GDRDMA
|
| 532 |
+
t-20260518223954-k9vcs-worker-1:10498:11629 [5] NCCL INFO Channel 08/0 : 5[5] -> 13[5] [receive] via NET/IBext_v9/13/GDRDMA
|
| 533 |
+
t-20260518223954-k9vcs-worker-1:10500:11630 [7] NCCL INFO Channel 10/0 : 7[7] -> 15[7] [receive] via NET/IBext_v9/15/GDRDMA
|
| 534 |
+
t-20260518223954-k9vcs-worker-1:10496:11545 [3] NCCL INFO Channel 08/0 : 3[3] -> 11[3] [receive] via NET/IBext_v9/11/GDRDMA
|
| 535 |
+
t-20260518223954-k9vcs-worker-1:10494:11546 [1] NCCL INFO Channel 11/0 : 1[1] -> 9[1] [receive] via NET/IBext_v9/9/GDRDMA
|
| 536 |
+
t-20260518223954-k9vcs-worker-1:10493:11632 [0] NCCL INFO Channel 10/0 : 0[0] -> 8[0] [receive] via NET/IBext_v9/8/GDRDMA
|
| 537 |
+
t-20260518223954-k9vcs-worker-1:10497:11544 [4] NCCL INFO Channel 08/0 : 4[4] -> 12[4] [receive] via NET/IBext_v9/12/GDRDMA
|
| 538 |
+
t-20260518223954-k9vcs-worker-1:10499:11631 [6] NCCL INFO Channel 09/0 : 6[6] -> 14[6] [receive] via NET/IBext_v9/14/GDRDMA
|
| 539 |
+
t-20260518223954-k9vcs-worker-1:10498:11629 [5] NCCL INFO Channel 09/0 : 5[5] -> 13[5] [receive] via NET/IBext_v9/13/GDRDMA
|
| 540 |
+
t-20260518223954-k9vcs-worker-1:10495:11547 [2] NCCL INFO Channel 08/0 : 2[2] -> 10[2] [receive] via NET/IBext_v9/10/GDRDMA
|
| 541 |
+
t-20260518223954-k9vcs-worker-1:10500:11630 [7] NCCL INFO Channel 11/0 : 7[7] -> 15[7] [receive] via NET/IBext_v9/15/GDRDMA
|
| 542 |
+
t-20260518223954-k9vcs-worker-1:10496:11545 [3] NCCL INFO Channel 09/0 : 3[3] -> 11[3] [receive] via NET/IBext_v9/11/GDRDMA
|
| 543 |
+
t-20260518223954-k9vcs-worker-1:10494:11546 [1] NCCL INFO Channel 12/0 : 1[1] -> 9[1] [receive] via NET/IBext_v9/9/GDRDMA
|
| 544 |
+
t-20260518223954-k9vcs-worker-1:10497:11544 [4] NCCL INFO Channel 09/0 : 4[4] -> 12[4] [receive] via NET/IBext_v9/12/GDRDMA
|
| 545 |
+
t-20260518223954-k9vcs-worker-1:10493:11632 [0] NCCL INFO Channel 11/0 : 0[0] -> 8[0] [receive] via NET/IBext_v9/8/GDRDMA
|
| 546 |
+
t-20260518223954-k9vcs-worker-1:10499:11631 [6] NCCL INFO Channel 10/0 : 6[6] -> 14[6] [receive] via NET/IBext_v9/14/GDRDMA
|
| 547 |
+
t-20260518223954-k9vcs-worker-1:10500:11630 [7] NCCL INFO Channel 12/0 : 7[7] -> 15[7] [receive] via NET/IBext_v9/15/GDRDMA
|
| 548 |
+
t-20260518223954-k9vcs-worker-1:10498:11629 [5] NCCL INFO Channel 10/0 : 5[5] -> 13[5] [receive] via NET/IBext_v9/13/GDRDMA
|
| 549 |
+
t-20260518223954-k9vcs-worker-1:10496:11545 [3] NCCL INFO Channel 11/0 : 3[3] -> 11[3] [receive] via NET/IBext_v9/11/GDRDMA
|
| 550 |
+
t-20260518223954-k9vcs-worker-1:10495:11547 [2] NCCL INFO Channel 09/0 : 2[2] -> 10[2] [receive] via NET/IBext_v9/10/GDRDMA
|
| 551 |
+
t-20260518223954-k9vcs-worker-1:10494:11546 [1] NCCL INFO Channel 13/0 : 1[1] -> 9[1] [receive] via NET/IBext_v9/9/GDRDMA
|
| 552 |
+
t-20260518223954-k9vcs-worker-1:10493:11632 [0] NCCL INFO Channel 12/0 : 0[0] -> 8[0] [receive] via NET/IBext_v9/8/GDRDMA
|
| 553 |
+
t-20260518223954-k9vcs-worker-1:10497:11544 [4] NCCL INFO Channel 10/0 : 4[4] -> 12[4] [receive] via NET/IBext_v9/12/GDRDMA
|
| 554 |
+
t-20260518223954-k9vcs-worker-1:10499:11631 [6] NCCL INFO Channel 11/0 : 6[6] -> 14[6] [receive] via NET/IBext_v9/14/GDRDMA
|
| 555 |
+
t-20260518223954-k9vcs-worker-1:10500:11630 [7] NCCL INFO Channel 13/0 : 7[7] -> 15[7] [receive] via NET/IBext_v9/15/GDRDMA
|
| 556 |
+
t-20260518223954-k9vcs-worker-1:10498:11629 [5] NCCL INFO Channel 11/0 : 5[5] -> 13[5] [receive] via NET/IBext_v9/13/GDRDMA
|
| 557 |
+
t-20260518223954-k9vcs-worker-1:10494:11546 [1] NCCL INFO Channel 14/0 : 1[1] -> 9[1] [receive] via NET/IBext_v9/9/GDRDMA
|
| 558 |
+
t-20260518223954-k9vcs-worker-1:10496:11545 [3] NCCL INFO Channel 12/0 : 3[3] -> 11[3] [receive] via NET/IBext_v9/11/GDRDMA
|
| 559 |
+
t-20260518223954-k9vcs-worker-1:10495:11547 [2] NCCL INFO Channel 10/0 : 2[2] -> 10[2] [receive] via NET/IBext_v9/10/GDRDMA
|
| 560 |
+
t-20260518223954-k9vcs-worker-1:10497:11544 [4] NCCL INFO Channel 11/0 : 4[4] -> 12[4] [receive] via NET/IBext_v9/12/GDRDMA
|
| 561 |
+
t-20260518223954-k9vcs-worker-1:10493:11632 [0] NCCL INFO Channel 13/0 : 0[0] -> 8[0] [receive] via NET/IBext_v9/8/GDRDMA
|
| 562 |
+
t-20260518223954-k9vcs-worker-1:10499:11631 [6] NCCL INFO Channel 12/0 : 6[6] -> 14[6] [receive] via NET/IBext_v9/14/GDRDMA
|
| 563 |
+
t-20260518223954-k9vcs-worker-1:10500:11630 [7] NCCL INFO Channel 15/0 : 7[7] -> 15[7] [receive] via NET/IBext_v9/15/GDRDMA
|
| 564 |
+
t-20260518223954-k9vcs-worker-1:10498:11629 [5] NCCL INFO Channel 13/0 : 5[5] -> 13[5] [receive] via NET/IBext_v9/13/GDRDMA
|
| 565 |
+
t-20260518223954-k9vcs-worker-1:10494:11546 [1] NCCL INFO Channel 15/0 : 1[1] -> 9[1] [receive] via NET/IBext_v9/9/GDRDMA
|
| 566 |
+
t-20260518223954-k9vcs-worker-1:10496:11545 [3] NCCL INFO Channel 13/0 : 3[3] -> 11[3] [receive] via NET/IBext_v9/11/GDRDMA
|
| 567 |
+
t-20260518223954-k9vcs-worker-1:10495:11547 [2] NCCL INFO Channel 12/0 : 2[2] -> 10[2] [receive] via NET/IBext_v9/10/GDRDMA
|
| 568 |
+
t-20260518223954-k9vcs-worker-1:10493:11632 [0] NCCL INFO Channel 14/0 : 0[0] -> 8[0] [receive] via NET/IBext_v9/8/GDRDMA
|
| 569 |
+
t-20260518223954-k9vcs-worker-1:10497:11544 [4] NCCL INFO Channel 12/0 : 4[4] -> 12[4] [receive] via NET/IBext_v9/12/GDRDMA
|
| 570 |
+
t-20260518223954-k9vcs-worker-1:10500:11630 [7] NCCL INFO Channel 00/0 : 15[7] -> 7[7] [send] via NET/IBext_v9/15/GDRDMA
|
| 571 |
+
t-20260518223954-k9vcs-worker-1:10494:11546 [1] NCCL INFO Channel 00/0 : 9[1] -> 1[1] [send] via NET/IBext_v9/9/GDRDMA
|
| 572 |
+
t-20260518223954-k9vcs-worker-1:10499:11631 [6] NCCL INFO Channel 13/0 : 6[6] -> 14[6] [receive] via NET/IBext_v9/14/GDRDMA
|
| 573 |
+
t-20260518223954-k9vcs-worker-1:10498:11629 [5] NCCL INFO Channel 14/0 : 5[5] -> 13[5] [receive] via NET/IBext_v9/13/GDRDMA
|
| 574 |
+
t-20260518223954-k9vcs-worker-1:10496:11545 [3] NCCL INFO Channel 14/0 : 3[3] -> 11[3] [receive] via NET/IBext_v9/11/GDRDMA
|
| 575 |
+
t-20260518223954-k9vcs-worker-1:10500:11630 [7] NCCL INFO Channel 01/0 : 15[7] -> 7[7] [send] via NET/IBext_v9/15/GDRDMA
|
| 576 |
+
t-20260518223954-k9vcs-worker-1:10495:11547 [2] NCCL INFO Channel 13/0 : 2[2] -> 10[2] [receive] via NET/IBext_v9/10/GDRDMA
|
| 577 |
+
t-20260518223954-k9vcs-worker-1:10493:11632 [0] NCCL INFO Channel 15/0 : 0[0] -> 8[0] [receive] via NET/IBext_v9/8/GDRDMA
|
| 578 |
+
t-20260518223954-k9vcs-worker-1:10494:11546 [1] NCCL INFO Channel 02/0 : 9[1] -> 1[1] [send] via NET/IBext_v9/9/GDRDMA
|
| 579 |
+
t-20260518223954-k9vcs-worker-1:10497:11544 [4] NCCL INFO Channel 14/0 : 4[4] -> 12[4] [receive] via NET/IBext_v9/12/GDRDMA
|
| 580 |
+
t-20260518223954-k9vcs-worker-1:10500:11630 [7] NCCL INFO Channel 02/0 : 15[7] -> 7[7] [send] via NET/IBext_v9/15/GDRDMA
|
| 581 |
+
t-20260518223954-k9vcs-worker-1:10499:11631 [6] NCCL INFO Channel 14/0 : 6[6] -> 14[6] [receive] via NET/IBext_v9/14/GDRDMA
|
| 582 |
+
t-20260518223954-k9vcs-worker-1:10498:11629 [5] NCCL INFO Channel 15/0 : 5[5] -> 13[5] [receive] via NET/IBext_v9/13/GDRDMA
|
| 583 |
+
t-20260518223954-k9vcs-worker-1:10494:11546 [1] NCCL INFO Channel 03/0 : 9[1] -> 1[1] [send] via NET/IBext_v9/9/GDRDMA
|
| 584 |
+
t-20260518223954-k9vcs-worker-1:10493:11632 [0] NCCL INFO Channel 01/0 : 8[0] -> 0[0] [send] via NET/IBext_v9/8/GDRDMA
|
| 585 |
+
t-20260518223954-k9vcs-worker-1:10495:11547 [2] NCCL INFO Channel 14/0 : 2[2] -> 10[2] [receive] via NET/IBext_v9/10/GDRDMA
|
| 586 |
+
t-20260518223954-k9vcs-worker-1:10497:11544 [4] NCCL INFO Channel 15/0 : 4[4] -> 12[4] [receive] via NET/IBext_v9/12/GDRDMA
|
| 587 |
+
t-20260518223954-k9vcs-worker-1:10500:11630 [7] NCCL INFO Channel 03/0 : 15[7] -> 7[7] [send] via NET/IBext_v9/15/GDRDMA
|
| 588 |
+
t-20260518223954-k9vcs-worker-1:10499:11631 [6] NCCL INFO Channel 00/0 : 14[6] -> 6[6] [send] via NET/IBext_v9/14/GDRDMA
|
| 589 |
+
t-20260518223954-k9vcs-worker-1:10498:11629 [5] NCCL INFO Channel 00/0 : 13[5] -> 5[5] [send] via NET/IBext_v9/13/GDRDMA
|
| 590 |
+
t-20260518223954-k9vcs-worker-1:10494:11546 [1] NCCL INFO Channel 04/0 : 9[1] -> 1[1] [send] via NET/IBext_v9/9/GDRDMA
|
| 591 |
+
t-20260518223954-k9vcs-worker-1:10493:11632 [0] NCCL INFO Channel 02/0 : 8[0] -> 0[0] [send] via NET/IBext_v9/8/GDRDMA
|
| 592 |
+
t-20260518223954-k9vcs-worker-1:10496:11545 [3] NCCL INFO Channel 15/0 : 3[3] -> 11[3] [receive] via NET/IBext_v9/11/GDRDMA
|
| 593 |
+
t-20260518223954-k9vcs-worker-1:10495:11547 [2] NCCL INFO Channel 15/0 : 2[2] -> 10[2] [receive] via NET/IBext_v9/10/GDRDMA
|
| 594 |
+
t-20260518223954-k9vcs-worker-1:10500:11630 [7] NCCL INFO Channel 04/0 : 15[7] -> 7[7] [send] via NET/IBext_v9/15/GDRDMA
|
| 595 |
+
t-20260518223954-k9vcs-worker-1:10497:11544 [4] NCCL INFO Channel 00/0 : 12[4] -> 4[4] [send] via NET/IBext_v9/12/GDRDMA
|
| 596 |
+
t-20260518223954-k9vcs-worker-1:10498:11629 [5] NCCL INFO Channel 01/0 : 13[5] -> 5[5] [send] via NET/IBext_v9/13/GDRDMA
|
| 597 |
+
t-20260518223954-k9vcs-worker-1:10499:11631 [6] NCCL INFO Channel 01/0 : 14[6] -> 6[6] [send] via NET/IBext_v9/14/GDRDMA
|
| 598 |
+
t-20260518223954-k9vcs-worker-1:10494:11546 [1] NCCL INFO Channel 05/0 : 9[1] -> 1[1] [send] via NET/IBext_v9/9/GDRDMA
|
| 599 |
+
t-20260518223954-k9vcs-worker-1:10495:11547 [2] NCCL INFO Channel 00/0 : 10[2] -> 2[2] [send] via NET/IBext_v9/10/GDRDMA
|
| 600 |
+
t-20260518223954-k9vcs-worker-1:10500:11630 [7] NCCL INFO Channel 05/0 : 15[7] -> 7[7] [send] via NET/IBext_v9/15/GDRDMA
|
| 601 |
+
t-20260518223954-k9vcs-worker-1:10493:11632 [0] NCCL INFO Channel 03/0 : 8[0] -> 0[0] [send] via NET/IBext_v9/8/GDRDMA
|
| 602 |
+
t-20260518223954-k9vcs-worker-1:10496:11545 [3] NCCL INFO Channel 00/0 : 11[3] -> 3[3] [send] via NET/IBext_v9/11/GDRDMA
|
| 603 |
+
t-20260518223954-k9vcs-worker-1:10497:11544 [4] NCCL INFO Channel 01/0 : 12[4] -> 4[4] [send] via NET/IBext_v9/12/GDRDMA
|
| 604 |
+
t-20260518223954-k9vcs-worker-1:10498:11629 [5] NCCL INFO Channel 02/0 : 13[5] -> 5[5] [send] via NET/IBext_v9/13/GDRDMA
|
| 605 |
+
t-20260518223954-k9vcs-worker-1:10494:11546 [1] NCCL INFO Channel 06/0 : 9[1] -> 1[1] [send] via NET/IBext_v9/9/GDRDMA
|
| 606 |
+
t-20260518223954-k9vcs-worker-1:10499:11631 [6] NCCL INFO Channel 02/0 : 14[6] -> 6[6] [send] via NET/IBext_v9/14/GDRDMA
|
| 607 |
+
t-20260518223954-k9vcs-worker-1:10500:11630 [7] NCCL INFO Channel 06/0 : 15[7] -> 7[7] [send] via NET/IBext_v9/15/GDRDMA
|
| 608 |
+
t-20260518223954-k9vcs-worker-1:10493:11632 [0] NCCL INFO Channel 04/0 : 8[0] -> 0[0] [send] via NET/IBext_v9/8/GDRDMA
|
| 609 |
+
t-20260518223954-k9vcs-worker-1:10495:11547 [2] NCCL INFO Channel 01/0 : 10[2] -> 2[2] [send] via NET/IBext_v9/10/GDRDMA
|
| 610 |
+
t-20260518223954-k9vcs-worker-1:10497:11544 [4] NCCL INFO Channel 02/0 : 12[4] -> 4[4] [send] via NET/IBext_v9/12/GDRDMA
|
| 611 |
+
t-20260518223954-k9vcs-worker-1:10496:11545 [3] NCCL INFO Channel 01/0 : 11[3] -> 3[3] [send] via NET/IBext_v9/11/GDRDMA
|
| 612 |
+
t-20260518223954-k9vcs-worker-1:10498:11629 [5] NCCL INFO Channel 03/0 : 13[5] -> 5[5] [send] via NET/IBext_v9/13/GDRDMA
|
| 613 |
+
t-20260518223954-k9vcs-worker-1:10494:11546 [1] NCCL INFO Channel 07/0 : 9[1] -> 1[1] [send] via NET/IBext_v9/9/GDRDMA
|
| 614 |
+
t-20260518223954-k9vcs-worker-1:10499:11631 [6] NCCL INFO Channel 03/0 : 14[6] -> 6[6] [send] via NET/IBext_v9/14/GDRDMA
|
| 615 |
+
t-20260518223954-k9vcs-worker-1:10500:11630 [7] NCCL INFO Channel 08/0 : 15[7] -> 7[7] [send] via NET/IBext_v9/15/GDRDMA
|
| 616 |
+
t-20260518223954-k9vcs-worker-1:10493:11632 [0] NCCL INFO Channel 05/0 : 8[0] -> 0[0] [send] via NET/IBext_v9/8/GDRDMA
|
| 617 |
+
t-20260518223954-k9vcs-worker-1:10497:11544 [4] NCCL INFO Channel 03/0 : 12[4] -> 4[4] [send] via NET/IBext_v9/12/GDRDMA
|
| 618 |
+
t-20260518223954-k9vcs-worker-1:10496:11545 [3] NCCL INFO Channel 02/0 : 11[3] -> 3[3] [send] via NET/IBext_v9/11/GDRDMA
|
| 619 |
+
t-20260518223954-k9vcs-worker-1:10494:11546 [1] NCCL INFO Channel 08/0 : 9[1] -> 1[1] [send] via NET/IBext_v9/9/GDRDMA
|
| 620 |
+
t-20260518223954-k9vcs-worker-1:10498:11629 [5] NCCL INFO Channel 04/0 : 13[5] -> 5[5] [send] via NET/IBext_v9/13/GDRDMA
|
| 621 |
+
t-20260518223954-k9vcs-worker-1:10495:11547 [2] NCCL INFO Channel 03/0 : 10[2] -> 2[2] [send] via NET/IBext_v9/10/GDRDMA
|
| 622 |
+
t-20260518223954-k9vcs-worker-1:10499:11631 [6] NCCL INFO Channel 04/0 : 14[6] -> 6[6] [send] via NET/IBext_v9/14/GDRDMA
|
| 623 |
+
t-20260518223954-k9vcs-worker-1:10500:11630 [7] NCCL INFO Channel 09/0 : 15[7] -> 7[7] [send] via NET/IBext_v9/15/GDRDMA
|
| 624 |
+
t-20260518223954-k9vcs-worker-1:10493:11632 [0] NCCL INFO Channel 06/0 : 8[0] -> 0[0] [send] via NET/IBext_v9/8/GDRDMA
|
| 625 |
+
t-20260518223954-k9vcs-worker-1:10497:11544 [4] NCCL INFO Channel 05/0 : 12[4] -> 4[4] [send] via NET/IBext_v9/12/GDRDMA
|
| 626 |
+
t-20260518223954-k9vcs-worker-1:10494:11546 [1] NCCL INFO Channel 10/0 : 9[1] -> 1[1] [send] via NET/IBext_v9/9/GDRDMA
|
| 627 |
+
t-20260518223954-k9vcs-worker-1:10498:11629 [5] NCCL INFO Channel 06/0 : 13[5] -> 5[5] [send] via NET/IBext_v9/13/GDRDMA
|
| 628 |
+
t-20260518223954-k9vcs-worker-1:10496:11545 [3] NCCL INFO Channel 04/0 : 11[3] -> 3[3] [send] via NET/IBext_v9/11/GDRDMA
|
| 629 |
+
t-20260518223954-k9vcs-worker-1:10495:11547 [2] NCCL INFO Channel 04/0 : 10[2] -> 2[2] [send] via NET/IBext_v9/10/GDRDMA
|
| 630 |
+
t-20260518223954-k9vcs-worker-1:10500:11630 [7] NCCL INFO Channel 10/0 : 15[7] -> 7[7] [send] via NET/IBext_v9/15/GDRDMA
|
| 631 |
+
t-20260518223954-k9vcs-worker-1:10499:11631 [6] NCCL INFO Channel 05/0 : 14[6] -> 6[6] [send] via NET/IBext_v9/14/GDRDMA
|
| 632 |
+
t-20260518223954-k9vcs-worker-1:10497:11544 [4] NCCL INFO Channel 06/0 : 12[4] -> 4[4] [send] via NET/IBext_v9/12/GDRDMA
|
| 633 |
+
t-20260518223954-k9vcs-worker-1:10493:11632 [0] NCCL INFO Channel 07/0 : 8[0] -> 0[0] [send] via NET/IBext_v9/8/GDRDMA
|
| 634 |
+
t-20260518223954-k9vcs-worker-1:10494:11546 [1] NCCL INFO Channel 11/0 : 9[1] -> 1[1] [send] via NET/IBext_v9/9/GDRDMA
|
| 635 |
+
t-20260518223954-k9vcs-worker-1:10498:11629 [5] NCCL INFO Channel 07/0 : 13[5] -> 5[5] [send] via NET/IBext_v9/13/GDRDMA
|
| 636 |
+
t-20260518223954-k9vcs-worker-1:10496:11545 [3] NCCL INFO Channel 05/0 : 11[3] -> 3[3] [send] via NET/IBext_v9/11/GDRDMA
|
| 637 |
+
t-20260518223954-k9vcs-worker-1:10500:11630 [7] NCCL INFO Channel 11/0 : 15[7] -> 7[7] [send] via NET/IBext_v9/15/GDRDMA
|
| 638 |
+
t-20260518223954-k9vcs-worker-1:10495:11547 [2] NCCL INFO Channel 05/0 : 10[2] -> 2[2] [send] via NET/IBext_v9/10/GDRDMA
|
| 639 |
+
t-20260518223954-k9vcs-worker-1:10499:11631 [6] NCCL INFO Channel 07/0 : 14[6] -> 6[6] [send] via NET/IBext_v9/14/GDRDMA
|
| 640 |
+
t-20260518223954-k9vcs-worker-1:10494:11546 [1] NCCL INFO Channel 12/0 : 9[1] -> 1[1] [send] via NET/IBext_v9/9/GDRDMA
|
| 641 |
+
t-20260518223954-k9vcs-worker-1:10497:11544 [4] NCCL INFO Channel 07/0 : 12[4] -> 4[4] [send] via NET/IBext_v9/12/GDRDMA
|
| 642 |
+
t-20260518223954-k9vcs-worker-1:10498:11629 [5] NCCL INFO Channel 08/0 : 13[5] -> 5[5] [send] via NET/IBext_v9/13/GDRDMA
|
| 643 |
+
t-20260518223954-k9vcs-worker-1:10500:11630 [7] NCCL INFO Channel 12/0 : 15[7] -> 7[7] [send] via NET/IBext_v9/15/GDRDMA
|
| 644 |
+
t-20260518223954-k9vcs-worker-1:10493:11632 [0] NCCL INFO Channel 09/0 : 8[0] -> 0[0] [send] via NET/IBext_v9/8/GDRDMA
|
| 645 |
+
t-20260518223954-k9vcs-worker-1:10496:11545 [3] NCCL INFO Channel 06/0 : 11[3] -> 3[3] [send] via NET/IBext_v9/11/GDRDMA
|
| 646 |
+
t-20260518223954-k9vcs-worker-1:10495:11547 [2] NCCL INFO Channel 06/0 : 10[2] -> 2[2] [send] via NET/IBext_v9/10/GDRDMA
|
| 647 |
+
t-20260518223954-k9vcs-worker-1:10499:11631 [6] NCCL INFO Channel 08/0 : 14[6] -> 6[6] [send] via NET/IBext_v9/14/GDRDMA
|
| 648 |
+
t-20260518223954-k9vcs-worker-1:10494:11546 [1] NCCL INFO Channel 13/0 : 9[1] -> 1[1] [send] via NET/IBext_v9/9/GDRDMA
|
| 649 |
+
t-20260518223954-k9vcs-worker-1:10498:11629 [5] NCCL INFO Channel 09/0 : 13[5] -> 5[5] [send] via NET/IBext_v9/13/GDRDMA
|
| 650 |
+
t-20260518223954-k9vcs-worker-1:10497:11544 [4] NCCL INFO Channel 08/0 : 12[4] -> 4[4] [send] via NET/IBext_v9/12/GDRDMA
|
| 651 |
+
t-20260518223954-k9vcs-worker-1:10500:11630 [7] NCCL INFO Channel 13/0 : 15[7] -> 7[7] [send] via NET/IBext_v9/15/GDRDMA
|
| 652 |
+
t-20260518223954-k9vcs-worker-1:10493:11632 [0] NCCL INFO Channel 10/0 : 8[0] -> 0[0] [send] via NET/IBext_v9/8/GDRDMA
|
| 653 |
+
t-20260518223954-k9vcs-worker-1:10496:11545 [3] NCCL INFO Channel 07/0 : 11[3] -> 3[3] [send] via NET/IBext_v9/11/GDRDMA
|
| 654 |
+
t-20260518223954-k9vcs-worker-1:10499:11631 [6] NCCL INFO Channel 09/0 : 14[6] -> 6[6] [send] via NET/IBext_v9/14/GDRDMA
|
| 655 |
+
t-20260518223954-k9vcs-worker-1:10494:11546 [1] NCCL INFO Channel 14/0 : 9[1] -> 1[1] [send] via NET/IBext_v9/9/GDRDMA
|
| 656 |
+
t-20260518223954-k9vcs-worker-1:10495:11547 [2] NCCL INFO Channel 07/0 : 10[2] -> 2[2] [send] via NET/IBext_v9/10/GDRDMA
|
| 657 |
+
t-20260518223954-k9vcs-worker-1:10498:11629 [5] NCCL INFO Channel 10/0 : 13[5] -> 5[5] [send] via NET/IBext_v9/13/GDRDMA
|
| 658 |
+
t-20260518223954-k9vcs-worker-1:10500:11630 [7] NCCL INFO Channel 14/0 : 15[7] -> 7[7] [send] via NET/IBext_v9/15/GDRDMA
|
| 659 |
+
t-20260518223954-k9vcs-worker-1:10497:11544 [4] NCCL INFO Channel 09/0 : 12[4] -> 4[4] [send] via NET/IBext_v9/12/GDRDMA
|
| 660 |
+
t-20260518223954-k9vcs-worker-1:10493:11632 [0] NCCL INFO Channel 11/0 : 8[0] -> 0[0] [send] via NET/IBext_v9/8/GDRDMA
|
| 661 |
+
t-20260518223954-k9vcs-worker-1:10496:11545 [3] NCCL INFO Channel 08/0 : 11[3] -> 3[3] [send] via NET/IBext_v9/11/GDRDMA
|
| 662 |
+
t-20260518223954-k9vcs-worker-1:10494:11546 [1] NCCL INFO Channel 15/0 : 9[1] -> 1[1] [send] via NET/IBext_v9/9/GDRDMA
|
| 663 |
+
t-20260518223954-k9vcs-worker-1:10498:11629 [5] NCCL INFO Channel 11/0 : 13[5] -> 5[5] [send] via NET/IBext_v9/13/GDRDMA
|
| 664 |
+
t-20260518223954-k9vcs-worker-1:10499:11631 [6] NCCL INFO Channel 10/0 : 14[6] -> 6[6] [send] via NET/IBext_v9/14/GDRDMA
|
| 665 |
+
t-20260518223954-k9vcs-worker-1:10497:11544 [4] NCCL INFO Channel 10/0 : 12[4] -> 4[4] [send] via NET/IBext_v9/12/GDRDMA
|
| 666 |
+
t-20260518223954-k9vcs-worker-1:10495:11547 [2] NCCL INFO Channel 08/0 : 10[2] -> 2[2] [send] via NET/IBext_v9/10/GDRDMA
|
| 667 |
+
t-20260518223954-k9vcs-worker-1:10493:11632 [0] NCCL INFO Channel 12/0 : 8[0] -> 0[0] [send] via NET/IBext_v9/8/GDRDMA
|
| 668 |
+
t-20260518223954-k9vcs-worker-1:10496:11545 [3] NCCL INFO Channel 09/0 : 11[3] -> 3[3] [send] via NET/IBext_v9/11/GDRDMA
|
| 669 |
+
t-20260518223954-k9vcs-worker-1:10498:11629 [5] NCCL INFO Channel 12/0 : 13[5] -> 5[5] [send] via NET/IBext_v9/13/GDRDMA
|
| 670 |
+
t-20260518223954-k9vcs-worker-1:10499:11631 [6] NCCL INFO Channel 11/0 : 14[6] -> 6[6] [send] via NET/IBext_v9/14/GDRDMA
|
| 671 |
+
t-20260518223954-k9vcs-worker-1:10497:11544 [4] NCCL INFO Channel 11/0 : 12[4] -> 4[4] [send] via NET/IBext_v9/12/GDRDMA
|
| 672 |
+
t-20260518223954-k9vcs-worker-1:10495:11547 [2] NCCL INFO Channel 09/0 : 10[2] -> 2[2] [send] via NET/IBext_v9/10/GDRDMA
|
| 673 |
+
t-20260518223954-k9vcs-worker-1:10493:11632 [0] NCCL INFO Channel 13/0 : 8[0] -> 0[0] [send] via NET/IBext_v9/8/GDRDMA
|
| 674 |
+
t-20260518223954-k9vcs-worker-1:10496:11545 [3] NCCL INFO Channel 10/0 : 11[3] -> 3[3] [send] via NET/IBext_v9/11/GDRDMA
|
| 675 |
+
t-20260518223954-k9vcs-worker-1:10498:11629 [5] NCCL INFO Channel 14/0 : 13[5] -> 5[5] [send] via NET/IBext_v9/13/GDRDMA
|
| 676 |
+
t-20260518223954-k9vcs-worker-1:10499:11631 [6] NCCL INFO Channel 12/0 : 14[6] -> 6[6] [send] via NET/IBext_v9/14/GDRDMA
|
| 677 |
+
t-20260518223954-k9vcs-worker-1:10497:11544 [4] NCCL INFO Channel 13/0 : 12[4] -> 4[4] [send] via NET/IBext_v9/12/GDRDMA
|
| 678 |
+
t-20260518223954-k9vcs-worker-1:10495:11547 [2] NCCL INFO Channel 11/0 : 10[2] -> 2[2] [send] via NET/IBext_v9/10/GDRDMA
|
| 679 |
+
t-20260518223954-k9vcs-worker-1:10493:11632 [0] NCCL INFO Channel 14/0 : 8[0] -> 0[0] [send] via NET/IBext_v9/8/GDRDMA
|
| 680 |
+
t-20260518223954-k9vcs-worker-1:10496:11545 [3] NCCL INFO Channel 12/0 : 11[3] -> 3[3] [send] via NET/IBext_v9/11/GDRDMA
|
| 681 |
+
t-20260518223954-k9vcs-worker-1:10498:11629 [5] NCCL INFO Channel 15/0 : 13[5] -> 5[5] [send] via NET/IBext_v9/13/GDRDMA
|
| 682 |
+
t-20260518223954-k9vcs-worker-1:10499:11631 [6] NCCL INFO Channel 13/0 : 14[6] -> 6[6] [send] via NET/IBext_v9/14/GDRDMA
|
| 683 |
+
t-20260518223954-k9vcs-worker-1:10497:11544 [4] NCCL INFO Channel 14/0 : 12[4] -> 4[4] [send] via NET/IBext_v9/12/GDRDMA
|
| 684 |
+
t-20260518223954-k9vcs-worker-1:10495:11547 [2] NCCL INFO Channel 12/0 : 10[2] -> 2[2] [send] via NET/IBext_v9/10/GDRDMA
|
| 685 |
+
t-20260518223954-k9vcs-worker-1:10493:11632 [0] NCCL INFO Channel 15/0 : 8[0] -> 0[0] [send] via NET/IBext_v9/8/GDRDMA
|
| 686 |
+
t-20260518223954-k9vcs-worker-1:10496:11545 [3] NCCL INFO Channel 13/0 : 11[3] -> 3[3] [send] via NET/IBext_v9/11/GDRDMA
|
| 687 |
+
t-20260518223954-k9vcs-worker-1:10497:11544 [4] NCCL INFO Channel 15/0 : 12[4] -> 4[4] [send] via NET/IBext_v9/12/GDRDMA
|
| 688 |
+
t-20260518223954-k9vcs-worker-1:10499:11631 [6] NCCL INFO Channel 15/0 : 14[6] -> 6[6] [send] via NET/IBext_v9/14/GDRDMA
|
| 689 |
+
t-20260518223954-k9vcs-worker-1:10495:11547 [2] NCCL INFO Channel 13/0 : 10[2] -> 2[2] [send] via NET/IBext_v9/10/GDRDMA
|
| 690 |
+
t-20260518223954-k9vcs-worker-1:10496:11545 [3] NCCL INFO Channel 14/0 : 11[3] -> 3[3] [send] via NET/IBext_v9/11/GDRDMA
|
| 691 |
+
t-20260518223954-k9vcs-worker-1:10495:11547 [2] NCCL INFO Channel 14/0 : 10[2] -> 2[2] [send] via NET/IBext_v9/10/GDRDMA
|
| 692 |
+
t-20260518223954-k9vcs-worker-1:10496:11545 [3] NCCL INFO Channel 15/0 : 11[3] -> 3[3] [send] via NET/IBext_v9/11/GDRDMA
|
| 693 |
+
t-20260518223954-k9vcs-worker-1:10495:11547 [2] NCCL INFO Channel 15/0 : 10[2] -> 2[2] [send] via NET/IBext_v9/10/GDRDMA
|
| 694 |
+
t-20260518223954-k9vcs-worker-1:10494:11546 [1] NCCL INFO Connected NVLS tree
|
| 695 |
+
t-20260518223954-k9vcs-worker-1:10493:11632 [0] NCCL INFO Connected NVLS tree
|
| 696 |
+
t-20260518223954-k9vcs-worker-1:10495:11547 [2] NCCL INFO Connected NVLS tree
|
| 697 |
+
t-20260518223954-k9vcs-worker-1:10499:11631 [6] NCCL INFO Connected NVLS tree
|
| 698 |
+
t-20260518223954-k9vcs-worker-1:10497:11544 [4] NCCL INFO Connected NVLS tree
|
| 699 |
+
t-20260518223954-k9vcs-worker-1:10500:11630 [7] NCCL INFO Connected NVLS tree
|
| 700 |
+
t-20260518223954-k9vcs-worker-1:10496:11545 [3] NCCL INFO Connected NVLS tree
|
| 701 |
+
t-20260518223954-k9vcs-worker-1:10498:11629 [5] NCCL INFO Connected NVLS tree
|
LTA_openwebtext_dualt/logs/elfaligned_t5tokenized_2node/lta_owt_t5_adaln_adamw_wd0p1_rollin_p50_randk0_3_uniformt_temp1_synct_gbs512_2node8gpu_1m_t-20260518224737-tftgw.node0.log
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
LTA_openwebtext_dualt/logs/elfaligned_t5tokenized_2node/lta_owt_t5_adaln_adamw_wd0p1_rollin_p50_randk0_3_uniformt_temp1_synct_gbs512_2node8gpu_1m_t-20260518224737-tftgw.node1.log
ADDED
|
@@ -0,0 +1,702 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
W0518 14:48:23.455000 10379 torch/distributed/run.py:792]
|
| 2 |
+
W0518 14:48:23.455000 10379 torch/distributed/run.py:792] *****************************************
|
| 3 |
+
W0518 14:48:23.455000 10379 torch/distributed/run.py:792] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed.
|
| 4 |
+
W0518 14:48:23.455000 10379 torch/distributed/run.py:792] *****************************************
|
| 5 |
+
[W518 14:48:24.269417104 socket.cpp:202] [c10d] The hostname of the client socket cannot be retrieved. err=-3
|
| 6 |
+
[W518 14:48:24.631721318 socket.cpp:202] [c10d] The hostname of the client socket cannot be retrieved. err=-3
|
| 7 |
+
[W518 14:48:28.779341843 socket.cpp:202] [c10d] The hostname of the client socket cannot be retrieved. err=-3
|
| 8 |
+
[W518 14:48:28.833270982 socket.cpp:202] [c10d] The hostname of the client socket cannot be retrieved. err=-3
|
| 9 |
+
[W518 14:48:28.833805587 socket.cpp:202] [c10d] The hostname of the client socket cannot be retrieved. err=-3
|
| 10 |
+
[W518 14:48:28.892846221 socket.cpp:202] [c10d] The hostname of the client socket cannot be retrieved. err=-3
|
| 11 |
+
[W518 14:48:28.937191928 socket.cpp:202] [c10d] The hostname of the client socket cannot be retrieved. err=-3
|
| 12 |
+
[W518 14:48:28.964785152 socket.cpp:202] [c10d] The hostname of the client socket cannot be retrieved. err=-3
|
| 13 |
+
[W518 14:48:28.965141345 socket.cpp:202] [c10d] The hostname of the client socket cannot be retrieved. err=-3
|
| 14 |
+
[W518 14:48:28.965341972 socket.cpp:202] [c10d] The hostname of the client socket cannot be retrieved. err=-3
|
| 15 |
+
t-20260518224737-tftgw-worker-1:10446:10446 [1] NCCL INFO cudaDriverVersion 12080
|
| 16 |
+
t-20260518224737-tftgw-worker-1:10446:10446 [1] NCCL INFO NCCL_SOCKET_IFNAME set by environment to eth1
|
| 17 |
+
t-20260518224737-tftgw-worker-1:10446:10446 [1] NCCL INFO Bootstrap: Using eth1:10.82.80.5<0>
|
| 18 |
+
t-20260518224737-tftgw-worker-1:10446:10446 [1] NCCL INFO NCCL version 2.25.1+cuda12.8
|
| 19 |
+
t-20260518224737-tftgw-worker-1:10452:10452 [7] NCCL INFO cudaDriverVersion 12080
|
| 20 |
+
t-20260518224737-tftgw-worker-1:10452:10452 [7] NCCL INFO NCCL_SOCKET_IFNAME set by environment to eth1
|
| 21 |
+
t-20260518224737-tftgw-worker-1:10452:10452 [7] NCCL INFO Bootstrap: Using eth1:10.82.80.5<0>
|
| 22 |
+
t-20260518224737-tftgw-worker-1:10452:10452 [7] NCCL INFO NCCL version 2.25.1+cuda12.8
|
| 23 |
+
t-20260518224737-tftgw-worker-1:10447:10447 [2] NCCL INFO cudaDriverVersion 12080
|
| 24 |
+
t-20260518224737-tftgw-worker-1:10447:10447 [2] NCCL INFO NCCL_SOCKET_IFNAME set by environment to eth1
|
| 25 |
+
t-20260518224737-tftgw-worker-1:10449:10449 [4] NCCL INFO cudaDriverVersion 12080
|
| 26 |
+
t-20260518224737-tftgw-worker-1:10449:10449 [4] NCCL INFO NCCL_SOCKET_IFNAME set by environment to eth1
|
| 27 |
+
t-20260518224737-tftgw-worker-1:10446:10446 [1] NCCL INFO Comm config Blocking set to 1
|
| 28 |
+
t-20260518224737-tftgw-worker-1:10447:10447 [2] NCCL INFO Bootstrap: Using eth1:10.82.80.5<0>
|
| 29 |
+
t-20260518224737-tftgw-worker-1:10449:10449 [4] NCCL INFO Bootstrap: Using eth1:10.82.80.5<0>
|
| 30 |
+
t-20260518224737-tftgw-worker-1:10449:10449 [4] NCCL INFO NCCL version 2.25.1+cuda12.8
|
| 31 |
+
t-20260518224737-tftgw-worker-1:10447:10447 [2] NCCL INFO NCCL version 2.25.1+cuda12.8
|
| 32 |
+
t-20260518224737-tftgw-worker-1:10448:10448 [3] NCCL INFO cudaDriverVersion 12080
|
| 33 |
+
t-20260518224737-tftgw-worker-1:10448:10448 [3] NCCL INFO NCCL_SOCKET_IFNAME set by environment to eth1
|
| 34 |
+
t-20260518224737-tftgw-worker-1:10452:10452 [7] NCCL INFO Comm config Blocking set to 1
|
| 35 |
+
t-20260518224737-tftgw-worker-1:10448:10448 [3] NCCL INFO Bootstrap: Using eth1:10.82.80.5<0>
|
| 36 |
+
t-20260518224737-tftgw-worker-1:10448:10448 [3] NCCL INFO NCCL version 2.25.1+cuda12.8
|
| 37 |
+
t-20260518224737-tftgw-worker-1:10449:10449 [4] NCCL INFO Comm config Blocking set to 1
|
| 38 |
+
t-20260518224737-tftgw-worker-1:10447:10447 [2] NCCL INFO Comm config Blocking set to 1
|
| 39 |
+
t-20260518224737-tftgw-worker-1:10445:10445 [0] NCCL INFO cudaDriverVersion 12080
|
| 40 |
+
t-20260518224737-tftgw-worker-1:10445:10445 [0] NCCL INFO NCCL_SOCKET_IFNAME set by environment to eth1
|
| 41 |
+
t-20260518224737-tftgw-worker-1:10445:10445 [0] NCCL INFO Bootstrap: Using eth1:10.82.80.5<0>
|
| 42 |
+
t-20260518224737-tftgw-worker-1:10445:10445 [0] NCCL INFO NCCL version 2.25.1+cuda12.8
|
| 43 |
+
t-20260518224737-tftgw-worker-1:10450:10450 [5] NCCL INFO cudaDriverVersion 12080
|
| 44 |
+
t-20260518224737-tftgw-worker-1:10450:10450 [5] NCCL INFO NCCL_SOCKET_IFNAME set by environment to eth1
|
| 45 |
+
t-20260518224737-tftgw-worker-1:10448:10448 [3] NCCL INFO Comm config Blocking set to 1
|
| 46 |
+
t-20260518224737-tftgw-worker-1:10451:10451 [6] NCCL INFO cudaDriverVersion 12080
|
| 47 |
+
t-20260518224737-tftgw-worker-1:10451:10451 [6] NCCL INFO NCCL_SOCKET_IFNAME set by environment to eth1
|
| 48 |
+
t-20260518224737-tftgw-worker-1:10450:10450 [5] NCCL INFO Bootstrap: Using eth1:10.82.80.5<0>
|
| 49 |
+
t-20260518224737-tftgw-worker-1:10450:10450 [5] NCCL INFO NCCL version 2.25.1+cuda12.8
|
| 50 |
+
t-20260518224737-tftgw-worker-1:10451:10451 [6] NCCL INFO Bootstrap: Using eth1:10.82.80.5<0>
|
| 51 |
+
t-20260518224737-tftgw-worker-1:10451:10451 [6] NCCL INFO NCCL version 2.25.1+cuda12.8
|
| 52 |
+
t-20260518224737-tftgw-worker-1:10445:10445 [0] NCCL INFO Comm config Blocking set to 1
|
| 53 |
+
t-20260518224737-tftgw-worker-1:10450:10450 [5] NCCL INFO Comm config Blocking set to 1
|
| 54 |
+
t-20260518224737-tftgw-worker-1:10451:10451 [6] NCCL INFO Comm config Blocking set to 1
|
| 55 |
+
t-20260518224737-tftgw-worker-1:10446:11066 [1] NCCL INFO NET/Plugin: Loaded net plugin NCCL RDMA Plugin v9 (v9)
|
| 56 |
+
t-20260518224737-tftgw-worker-1:10446:11066 [1] NCCL INFO NET/Plugin: Loaded collnet plugin SHARP (v9)
|
| 57 |
+
t-20260518224737-tftgw-worker-1:10446:11066 [1] NCCL INFO Plugin Path : /opt/hpcx/nccl_rdma_sharp_plugin/lib/libnccl-net.so
|
| 58 |
+
t-20260518224737-tftgw-worker-1:10446:11066 [1] NCCL INFO P2P plugin v9 IBext_v9
|
| 59 |
+
t-20260518224737-tftgw-worker-1:10446:11066 [1] NCCL INFO NCCL_SOCKET_IFNAME set by environment to eth1
|
| 60 |
+
t-20260518224737-tftgw-worker-1:10447:11069 [2] NCCL INFO NET/Plugin: Loaded net plugin NCCL RDMA Plugin v9 (v9)
|
| 61 |
+
t-20260518224737-tftgw-worker-1:10447:11069 [2] NCCL INFO NET/Plugin: Loaded collnet plugin SHARP (v9)
|
| 62 |
+
t-20260518224737-tftgw-worker-1:10447:11069 [2] NCCL INFO Plugin Path : /opt/hpcx/nccl_rdma_sharp_plugin/lib/libnccl-net.so
|
| 63 |
+
t-20260518224737-tftgw-worker-1:10447:11069 [2] NCCL INFO P2P plugin v9 IBext_v9
|
| 64 |
+
t-20260518224737-tftgw-worker-1:10447:11069 [2] NCCL INFO NCCL_SOCKET_IFNAME set by environment to eth1
|
| 65 |
+
t-20260518224737-tftgw-worker-1:10450:11072 [5] NCCL INFO NET/Plugin: Loaded net plugin NCCL RDMA Plugin v9 (v9)
|
| 66 |
+
t-20260518224737-tftgw-worker-1:10450:11072 [5] NCCL INFO NET/Plugin: Loaded collnet plugin SHARP (v9)
|
| 67 |
+
t-20260518224737-tftgw-worker-1:10450:11072 [5] NCCL INFO Plugin Path : /opt/hpcx/nccl_rdma_sharp_plugin/lib/libnccl-net.so
|
| 68 |
+
t-20260518224737-tftgw-worker-1:10450:11072 [5] NCCL INFO P2P plugin v9 IBext_v9
|
| 69 |
+
t-20260518224737-tftgw-worker-1:10450:11072 [5] NCCL INFO NCCL_SOCKET_IFNAME set by environment to eth1
|
| 70 |
+
t-20260518224737-tftgw-worker-1:10452:11067 [7] NCCL INFO NET/Plugin: Loaded net plugin NCCL RDMA Plugin v9 (v9)
|
| 71 |
+
t-20260518224737-tftgw-worker-1:10452:11067 [7] NCCL INFO NET/Plugin: Loaded collnet plugin SHARP (v9)
|
| 72 |
+
t-20260518224737-tftgw-worker-1:10452:11067 [7] NCCL INFO Plugin Path : /opt/hpcx/nccl_rdma_sharp_plugin/lib/libnccl-net.so
|
| 73 |
+
t-20260518224737-tftgw-worker-1:10452:11067 [7] NCCL INFO P2P plugin v9 IBext_v9
|
| 74 |
+
t-20260518224737-tftgw-worker-1:10452:11067 [7] NCCL INFO NCCL_SOCKET_IFNAME set by environment to eth1
|
| 75 |
+
t-20260518224737-tftgw-worker-1:10448:11070 [3] NCCL INFO NET/Plugin: Loaded net plugin NCCL RDMA Plugin v9 (v9)
|
| 76 |
+
t-20260518224737-tftgw-worker-1:10448:11070 [3] NCCL INFO NET/Plugin: Loaded collnet plugin SHARP (v9)
|
| 77 |
+
t-20260518224737-tftgw-worker-1:10448:11070 [3] NCCL INFO Plugin Path : /opt/hpcx/nccl_rdma_sharp_plugin/lib/libnccl-net.so
|
| 78 |
+
t-20260518224737-tftgw-worker-1:10448:11070 [3] NCCL INFO P2P plugin v9 IBext_v9
|
| 79 |
+
t-20260518224737-tftgw-worker-1:10448:11070 [3] NCCL INFO NCCL_SOCKET_IFNAME set by environment to eth1
|
| 80 |
+
t-20260518224737-tftgw-worker-1:10445:11071 [0] NCCL INFO NET/Plugin: Loaded net plugin NCCL RDMA Plugin v9 (v9)
|
| 81 |
+
t-20260518224737-tftgw-worker-1:10451:11073 [6] NCCL INFO NET/Plugin: Loaded net plugin NCCL RDMA Plugin v9 (v9)
|
| 82 |
+
t-20260518224737-tftgw-worker-1:10445:11071 [0] NCCL INFO NET/Plugin: Loaded collnet plugin SHARP (v9)
|
| 83 |
+
t-20260518224737-tftgw-worker-1:10451:11073 [6] NCCL INFO NET/Plugin: Loaded collnet plugin SHARP (v9)
|
| 84 |
+
t-20260518224737-tftgw-worker-1:10445:11071 [0] NCCL INFO Plugin Path : /opt/hpcx/nccl_rdma_sharp_plugin/lib/libnccl-net.so
|
| 85 |
+
t-20260518224737-tftgw-worker-1:10451:11073 [6] NCCL INFO Plugin Path : /opt/hpcx/nccl_rdma_sharp_plugin/lib/libnccl-net.so
|
| 86 |
+
t-20260518224737-tftgw-worker-1:10445:11071 [0] NCCL INFO P2P plugin v9 IBext_v9
|
| 87 |
+
t-20260518224737-tftgw-worker-1:10451:11073 [6] NCCL INFO P2P plugin v9 IBext_v9
|
| 88 |
+
t-20260518224737-tftgw-worker-1:10451:11073 [6] NCCL INFO NCCL_SOCKET_IFNAME set by environment to eth1
|
| 89 |
+
t-20260518224737-tftgw-worker-1:10445:11071 [0] NCCL INFO NCCL_SOCKET_IFNAME set by environment to eth1
|
| 90 |
+
t-20260518224737-tftgw-worker-1:10449:11068 [4] NCCL INFO NET/Plugin: Loaded net plugin NCCL RDMA Plugin v9 (v9)
|
| 91 |
+
t-20260518224737-tftgw-worker-1:10449:11068 [4] NCCL INFO NET/Plugin: Loaded collnet plugin SHARP (v9)
|
| 92 |
+
t-20260518224737-tftgw-worker-1:10449:11068 [4] NCCL INFO Plugin Path : /opt/hpcx/nccl_rdma_sharp_plugin/lib/libnccl-net.so
|
| 93 |
+
t-20260518224737-tftgw-worker-1:10449:11068 [4] NCCL INFO P2P plugin v9 IBext_v9
|
| 94 |
+
t-20260518224737-tftgw-worker-1:10449:11068 [4] NCCL INFO NCCL_SOCKET_IFNAME set by environment to eth1
|
| 95 |
+
t-20260518224737-tftgw-worker-1:10446:11066 [1] NCCL INFO NCCL_IB_PCI_RELAXED_ORDERING set by environment to 1.
|
| 96 |
+
t-20260518224737-tftgw-worker-1:10446:11066 [1] NCCL INFO NET/IB : Using [0]mlx5_1:1/RoCE [1]mlx5_4:1/RoCE [2]mlx5_5:1/RoCE [3]mlx5_6:1/RoCE [4]mlx5_7:1/RoCE [5]mlx5_8:1/RoCE [6]mlx5_9:1/RoCE [7]mlx5_10:1/RoCE [RO]; OOB eth1:10.82.80.5<0>
|
| 97 |
+
t-20260518224737-tftgw-worker-1:10447:11069 [2] NCCL INFO NCCL_IB_PCI_RELAXED_ORDERING set by environment to 1.
|
| 98 |
+
t-20260518224737-tftgw-worker-1:10447:11069 [2] NCCL INFO NET/IB : Using [0]mlx5_1:1/RoCE [1]mlx5_4:1/RoCE [2]mlx5_5:1/RoCE [3]mlx5_6:1/RoCE [4]mlx5_7:1/RoCE [5]mlx5_8:1/RoCE [6]mlx5_9:1/RoCE [7]mlx5_10:1/RoCE [RO]; OOB eth1:10.82.80.5<0>
|
| 99 |
+
t-20260518224737-tftgw-worker-1:10447:11069 [2] NCCL INFO PROFILER/Plugin: Could not find: libnccl-profiler.so.
|
| 100 |
+
t-20260518224737-tftgw-worker-1:10447:11069 [2] NCCL INFO Using network IBext_v9
|
| 101 |
+
t-20260518224737-tftgw-worker-1:10446:11066 [1] NCCL INFO PROFILER/Plugin: Could not find: libnccl-profiler.so.
|
| 102 |
+
t-20260518224737-tftgw-worker-1:10446:11066 [1] NCCL INFO Using network IBext_v9
|
| 103 |
+
t-20260518224737-tftgw-worker-1:10445:11071 [0] NCCL INFO NCCL_IB_PCI_RELAXED_ORDERING set by environment to 1.
|
| 104 |
+
t-20260518224737-tftgw-worker-1:10445:11071 [0] NCCL INFO NET/IB : Using [0]mlx5_1:1/RoCE [1]mlx5_4:1/RoCE [2]mlx5_5:1/RoCE [3]mlx5_6:1/RoCE [4]mlx5_7:1/RoCE [5]mlx5_8:1/RoCE [6]mlx5_9:1/RoCE [7]mlx5_10:1/RoCE [RO]; OOB eth1:10.82.80.5<0>
|
| 105 |
+
t-20260518224737-tftgw-worker-1:10449:11068 [4] NCCL INFO NCCL_IB_PCI_RELAXED_ORDERING set by environment to 1.
|
| 106 |
+
t-20260518224737-tftgw-worker-1:10449:11068 [4] NCCL INFO NET/IB : Using [0]mlx5_1:1/RoCE [1]mlx5_4:1/RoCE [2]mlx5_5:1/RoCE [3]mlx5_6:1/RoCE [4]mlx5_7:1/RoCE [5]mlx5_8:1/RoCE [6]mlx5_9:1/RoCE [7]mlx5_10:1/RoCE [RO]; OOB eth1:10.82.80.5<0>
|
| 107 |
+
t-20260518224737-tftgw-worker-1:10450:11072 [5] NCCL INFO NCCL_IB_PCI_RELAXED_ORDERING set by environment to 1.
|
| 108 |
+
t-20260518224737-tftgw-worker-1:10450:11072 [5] NCCL INFO NET/IB : Using [0]mlx5_1:1/RoCE [1]mlx5_4:1/RoCE [2]mlx5_5:1/RoCE [3]mlx5_6:1/RoCE [4]mlx5_7:1/RoCE [5]mlx5_8:1/RoCE [6]mlx5_9:1/RoCE [7]mlx5_10:1/RoCE [RO]; OOB eth1:10.82.80.5<0>
|
| 109 |
+
t-20260518224737-tftgw-worker-1:10448:11070 [3] NCCL INFO NCCL_IB_PCI_RELAXED_ORDERING set by environment to 1.
|
| 110 |
+
t-20260518224737-tftgw-worker-1:10448:11070 [3] NCCL INFO NET/IB : Using [0]mlx5_1:1/RoCE [1]mlx5_4:1/RoCE [2]mlx5_5:1/RoCE [3]mlx5_6:1/RoCE [4]mlx5_7:1/RoCE [5]mlx5_8:1/RoCE [6]mlx5_9:1/RoCE [7]mlx5_10:1/RoCE [RO]; OOB eth1:10.82.80.5<0>
|
| 111 |
+
t-20260518224737-tftgw-worker-1:10452:11067 [7] NCCL INFO NCCL_IB_PCI_RELAXED_ORDERING set by environment to 1.
|
| 112 |
+
t-20260518224737-tftgw-worker-1:10452:11067 [7] NCCL INFO NET/IB : Using [0]mlx5_1:1/RoCE [1]mlx5_4:1/RoCE [2]mlx5_5:1/RoCE [3]mlx5_6:1/RoCE [4]mlx5_7:1/RoCE [5]mlx5_8:1/RoCE [6]mlx5_9:1/RoCE [7]mlx5_10:1/RoCE [RO]; OOB eth1:10.82.80.5<0>
|
| 113 |
+
t-20260518224737-tftgw-worker-1:10449:11068 [4] NCCL INFO PROFILER/Plugin: Could not find: libnccl-profiler.so.
|
| 114 |
+
t-20260518224737-tftgw-worker-1:10449:11068 [4] NCCL INFO Using network IBext_v9
|
| 115 |
+
t-20260518224737-tftgw-worker-1:10445:11071 [0] NCCL INFO PROFILER/Plugin: Could not find: libnccl-profiler.so.
|
| 116 |
+
t-20260518224737-tftgw-worker-1:10445:11071 [0] NCCL INFO Using network IBext_v9
|
| 117 |
+
t-20260518224737-tftgw-worker-1:10450:11072 [5] NCCL INFO PROFILER/Plugin: Could not find: libnccl-profiler.so.
|
| 118 |
+
t-20260518224737-tftgw-worker-1:10450:11072 [5] NCCL INFO Using network IBext_v9
|
| 119 |
+
t-20260518224737-tftgw-worker-1:10448:11070 [3] NCCL INFO PROFILER/Plugin: Could not find: libnccl-profiler.so.
|
| 120 |
+
t-20260518224737-tftgw-worker-1:10448:11070 [3] NCCL INFO Using network IBext_v9
|
| 121 |
+
t-20260518224737-tftgw-worker-1:10452:11067 [7] NCCL INFO PROFILER/Plugin: Could not find: libnccl-profiler.so.
|
| 122 |
+
t-20260518224737-tftgw-worker-1:10452:11067 [7] NCCL INFO Using network IBext_v9
|
| 123 |
+
t-20260518224737-tftgw-worker-1:10451:11073 [6] NCCL INFO NCCL_IB_PCI_RELAXED_ORDERING set by environment to 1.
|
| 124 |
+
t-20260518224737-tftgw-worker-1:10451:11073 [6] NCCL INFO NET/IB : Using [0]mlx5_1:1/RoCE [1]mlx5_4:1/RoCE [2]mlx5_5:1/RoCE [3]mlx5_6:1/RoCE [4]mlx5_7:1/RoCE [5]mlx5_8:1/RoCE [6]mlx5_9:1/RoCE [7]mlx5_10:1/RoCE [RO]; OOB eth1:10.82.80.5<0>
|
| 125 |
+
t-20260518224737-tftgw-worker-1:10451:11073 [6] NCCL INFO PROFILER/Plugin: Could not find: libnccl-profiler.so.
|
| 126 |
+
t-20260518224737-tftgw-worker-1:10451:11073 [6] NCCL INFO Using network IBext_v9
|
| 127 |
+
t-20260518224737-tftgw-worker-1:10446:11066 [1] NCCL INFO ncclCommInitRankConfig comm 0xf150bb0 rank 9 nranks 16 cudaDev 1 nvmlDev 1 busId 67020 commId 0x2ef33c22d6d166f7 - Init START
|
| 128 |
+
t-20260518224737-tftgw-worker-1:10447:11069 [2] NCCL INFO ncclCommInitRankConfig comm 0xedfad70 rank 10 nranks 16 cudaDev 2 nvmlDev 2 busId 69020 commId 0x2ef33c22d6d166f7 - Init START
|
| 129 |
+
t-20260518224737-tftgw-worker-1:10450:11072 [5] NCCL INFO ncclCommInitRankConfig comm 0xd90a1d0 rank 13 nranks 16 cudaDev 5 nvmlDev 5 busId 71020 commId 0x2ef33c22d6d166f7 - Init START
|
| 130 |
+
t-20260518224737-tftgw-worker-1:10452:11067 [7] NCCL INFO ncclCommInitRankConfig comm 0xed06ec0 rank 15 nranks 16 cudaDev 7 nvmlDev 7 busId 75020 commId 0x2ef33c22d6d166f7 - Init START
|
| 131 |
+
t-20260518224737-tftgw-worker-1:10451:11073 [6] NCCL INFO ncclCommInitRankConfig comm 0xf242a00 rank 14 nranks 16 cudaDev 6 nvmlDev 6 busId 73020 commId 0x2ef33c22d6d166f7 - Init START
|
| 132 |
+
t-20260518224737-tftgw-worker-1:10448:11070 [3] NCCL INFO ncclCommInitRankConfig comm 0xe08a1f0 rank 11 nranks 16 cudaDev 3 nvmlDev 3 busId 6b020 commId 0x2ef33c22d6d166f7 - Init START
|
| 133 |
+
t-20260518224737-tftgw-worker-1:10449:11068 [4] NCCL INFO ncclCommInitRankConfig comm 0xda93a60 rank 12 nranks 16 cudaDev 4 nvmlDev 4 busId 6f020 commId 0x2ef33c22d6d166f7 - Init START
|
| 134 |
+
t-20260518224737-tftgw-worker-1:10451:11073 [6] NCCL INFO RAS client listening socket at ::1<28028>
|
| 135 |
+
t-20260518224737-tftgw-worker-1:10452:11067 [7] NCCL INFO RAS client listening socket at ::1<28028>
|
| 136 |
+
t-20260518224737-tftgw-worker-1:10445:11071 [0] NCCL INFO ncclCommInitRankConfig comm 0x7f6eab060080 rank 8 nranks 16 cudaDev 0 nvmlDev 0 busId 65040 commId 0x2ef33c22d6d166f7 - Init START
|
| 137 |
+
t-20260518224737-tftgw-worker-1:10447:11069 [2] NCCL INFO RAS client listening socket at ::1<28028>
|
| 138 |
+
t-20260518224737-tftgw-worker-1:10448:11070 [3] NCCL INFO RAS client listening socket at ::1<28028>
|
| 139 |
+
t-20260518224737-tftgw-worker-1:10450:11072 [5] NCCL INFO RAS client listening socket at ::1<28028>
|
| 140 |
+
t-20260518224737-tftgw-worker-1:10449:11068 [4] NCCL INFO RAS client listening socket at ::1<28028>
|
| 141 |
+
t-20260518224737-tftgw-worker-1:10446:11066 [1] NCCL INFO RAS client listening socket at ::1<28028>
|
| 142 |
+
t-20260518224737-tftgw-worker-1:10445:11071 [0] NCCL INFO RAS client listening socket at ::1<28028>
|
| 143 |
+
t-20260518224737-tftgw-worker-1:10451:11073 [6] NCCL INFO Bootstrap timings total 0.779562 (create 0.000022, send 0.000700, recv 0.001140, ring 0.777362, delay 0.000001)
|
| 144 |
+
t-20260518224737-tftgw-worker-1:10447:11069 [2] NCCL INFO Bootstrap timings total 0.802562 (create 0.000025, send 0.000260, recv 0.024782, ring 0.777170, delay 0.000001)
|
| 145 |
+
t-20260518224737-tftgw-worker-1:10449:11068 [4] NCCL INFO Bootstrap timings total 0.777968 (create 0.000022, send 0.000226, recv 0.000578, ring 0.776819, delay 0.000001)
|
| 146 |
+
t-20260518224737-tftgw-worker-1:10445:11071 [0] NCCL INFO Bootstrap timings total 0.777407 (create 0.000020, send 0.000221, recv 0.000359, ring 0.474152, delay 0.000001)
|
| 147 |
+
t-20260518224737-tftgw-worker-1:10452:11067 [7] NCCL INFO Bootstrap timings total 0.780613 (create 0.000028, send 0.000234, recv 0.000293, ring 0.777344, delay 0.000001)
|
| 148 |
+
t-20260518224737-tftgw-worker-1:10448:11070 [3] NCCL INFO Bootstrap timings total 0.778550 (create 0.000021, send 0.000230, recv 0.000999, ring 0.776960, delay 0.000001)
|
| 149 |
+
t-20260518224737-tftgw-worker-1:10446:11066 [1] NCCL INFO Bootstrap timings total 0.804136 (create 0.000023, send 0.000528, recv 0.001682, ring 0.776495, delay 0.000001)
|
| 150 |
+
t-20260518224737-tftgw-worker-1:10450:11072 [5] NCCL INFO Bootstrap timings total 0.784705 (create 0.000021, send 0.000275, recv 0.006419, ring 0.776841, delay 0.000000)
|
| 151 |
+
t-20260518224737-tftgw-worker-1:10451:11073 [6] NCCL INFO MNNVL busId 0x73020 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0
|
| 152 |
+
t-20260518224737-tftgw-worker-1:10449:11068 [4] NCCL INFO MNNVL busId 0x6f020 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0
|
| 153 |
+
t-20260518224737-tftgw-worker-1:10447:11069 [2] NCCL INFO MNNVL busId 0x69020 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0
|
| 154 |
+
t-20260518224737-tftgw-worker-1:10445:11071 [0] NCCL INFO MNNVL busId 0x65040 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0
|
| 155 |
+
t-20260518224737-tftgw-worker-1:10448:11070 [3] NCCL INFO MNNVL busId 0x6b020 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0
|
| 156 |
+
t-20260518224737-tftgw-worker-1:10452:11067 [7] NCCL INFO MNNVL busId 0x75020 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0
|
| 157 |
+
t-20260518224737-tftgw-worker-1:10446:11066 [1] NCCL INFO MNNVL busId 0x67020 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0
|
| 158 |
+
t-20260518224737-tftgw-worker-1:10450:11072 [5] NCCL INFO MNNVL busId 0x71020 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0
|
| 159 |
+
t-20260518224737-tftgw-worker-1:10450:11072 [5] NCCL INFO NCCL_TOPO_FILE set by environment to /var/run/nvidia-topologyd/virtualTopology.xml
|
| 160 |
+
t-20260518224737-tftgw-worker-1:10449:11068 [4] NCCL INFO NCCL_TOPO_FILE set by environment to /var/run/nvidia-topologyd/virtualTopology.xml
|
| 161 |
+
t-20260518224737-tftgw-worker-1:10452:11067 [7] NCCL INFO NCCL_TOPO_FILE set by environment to /var/run/nvidia-topologyd/virtualTopology.xml
|
| 162 |
+
t-20260518224737-tftgw-worker-1:10451:11073 [6] NCCL INFO NCCL_TOPO_FILE set by environment to /var/run/nvidia-topologyd/virtualTopology.xml
|
| 163 |
+
t-20260518224737-tftgw-worker-1:10448:11070 [3] NCCL INFO NCCL_TOPO_FILE set by environment to /var/run/nvidia-topologyd/virtualTopology.xml
|
| 164 |
+
t-20260518224737-tftgw-worker-1:10447:11069 [2] NCCL INFO NCCL_TOPO_FILE set by environment to /var/run/nvidia-topologyd/virtualTopology.xml
|
| 165 |
+
t-20260518224737-tftgw-worker-1:10446:11066 [1] NCCL INFO NCCL_TOPO_FILE set by environment to /var/run/nvidia-topologyd/virtualTopology.xml
|
| 166 |
+
t-20260518224737-tftgw-worker-1:10445:11071 [0] NCCL INFO NCCL_TOPO_FILE set by environment to /var/run/nvidia-topologyd/virtualTopology.xml
|
| 167 |
+
t-20260518224737-tftgw-worker-1:10452:11067 [7] NCCL INFO Setting affinity for GPU 7 to 0fffff,ffffffff,ffffffff,fc000000,00000000,00000000
|
| 168 |
+
t-20260518224737-tftgw-worker-1:10452:11067 [7] NCCL INFO NVLS multicast support is available on dev 7
|
| 169 |
+
t-20260518224737-tftgw-worker-1:10448:11070 [3] NCCL INFO Setting affinity for GPU 3 to 03ffffff,ffffffff,ffffffff
|
| 170 |
+
t-20260518224737-tftgw-worker-1:10450:11072 [5] NCCL INFO Setting affinity for GPU 5 to 0fffff,ffffffff,ffffffff,fc000000,00000000,00000000
|
| 171 |
+
t-20260518224737-tftgw-worker-1:10449:11068 [4] NCCL INFO Setting affinity for GPU 4 to 0fffff,ffffffff,ffffffff,fc000000,00000000,00000000
|
| 172 |
+
t-20260518224737-tftgw-worker-1:10450:11072 [5] NCCL INFO NVLS multicast support is available on dev 5
|
| 173 |
+
t-20260518224737-tftgw-worker-1:10447:11069 [2] NCCL INFO Setting affinity for GPU 2 to 03ffffff,ffffffff,ffffffff
|
| 174 |
+
t-20260518224737-tftgw-worker-1:10447:11069 [2] NCCL INFO NVLS multicast support is available on dev 2
|
| 175 |
+
t-20260518224737-tftgw-worker-1:10451:11073 [6] NCCL INFO Setting affinity for GPU 6 to 0fffff,ffffffff,ffffffff,fc000000,00000000,00000000
|
| 176 |
+
t-20260518224737-tftgw-worker-1:10448:11070 [3] NCCL INFO NVLS multicast support is available on dev 3
|
| 177 |
+
t-20260518224737-tftgw-worker-1:10446:11066 [1] NCCL INFO Setting affinity for GPU 1 to 03ffffff,ffffffff,ffffffff
|
| 178 |
+
t-20260518224737-tftgw-worker-1:10445:11071 [0] NCCL INFO Setting affinity for GPU 0 to 03ffffff,ffffffff,ffffffff
|
| 179 |
+
t-20260518224737-tftgw-worker-1:10446:11066 [1] NCCL INFO NVLS multicast support is available on dev 1
|
| 180 |
+
t-20260518224737-tftgw-worker-1:10449:11068 [4] NCCL INFO NVLS multicast support is available on dev 4
|
| 181 |
+
t-20260518224737-tftgw-worker-1:10451:11073 [6] NCCL INFO NVLS multicast support is available on dev 6
|
| 182 |
+
t-20260518224737-tftgw-worker-1:10445:11071 [0] NCCL INFO NVLS multicast support is available on dev 0
|
| 183 |
+
t-20260518224737-tftgw-worker-1:10452:11067 [7] NCCL INFO comm 0xed06ec0 rank 15 nRanks 16 nNodes 2 localRanks 8 localRank 7 MNNVL 0
|
| 184 |
+
t-20260518224737-tftgw-worker-1:10451:11073 [6] NCCL INFO comm 0xf242a00 rank 14 nRanks 16 nNodes 2 localRanks 8 localRank 6 MNNVL 0
|
| 185 |
+
t-20260518224737-tftgw-worker-1:10450:11072 [5] NCCL INFO comm 0xd90a1d0 rank 13 nRanks 16 nNodes 2 localRanks 8 localRank 5 MNNVL 0
|
| 186 |
+
t-20260518224737-tftgw-worker-1:10450:11072 [5] NCCL INFO Trees [0] 14/-1/-1->13->12 [1] 14/-1/-1->13->12 [2] 14/-1/-1->13->12 [3] 14/-1/-1->13->12 [4] 14/-1/-1->13->12 [5] 14/-1/-1->13->5 [6] -1/-1/-1->13->12 [7] 14/-1/-1->13->12 [8] 14/-1/-1->13->12 [9] 14/-1/-1->13->12 [10] 14/-1/-1->13->12 [11] 14/-1/-1->13->12 [12] 14/-1/-1->13->12 [13] 14/5/-1->13->-1 [14] -1/-1/-1->13->12 [15] 14/-1/-1->13->12
|
| 187 |
+
t-20260518224737-tftgw-worker-1:10451:11073 [6] NCCL INFO Trees [0] 15/-1/-1->14->13 [1] 15/-1/-1->14->13 [2] 15/-1/-1->14->13 [3] 15/-1/-1->14->13 [4] 15/-1/-1->14->13 [5] 15/-1/-1->14->13 [6] 15/-1/-1->14->6 [7] -1/-1/-1->14->13 [8] 15/-1/-1->14->13 [9] 15/-1/-1->14->13 [10] 15/-1/-1->14->13 [11] 15/-1/-1->14->13 [12] 15/-1/-1->14->13 [13] 15/-1/-1->14->13 [14] 15/6/-1->14->-1 [15] -1/-1/-1->14->13
|
| 188 |
+
t-20260518224737-tftgw-worker-1:10452:11067 [7] NCCL INFO Trees [0] -1/-1/-1->15->14 [1] 8/-1/-1->15->14 [2] 8/-1/-1->15->14 [3] 8/-1/-1->15->14 [4] 8/-1/-1->15->14 [5] 8/-1/-1->15->14 [6] 8/-1/-1->15->14 [7] 8/-1/-1->15->7 [8] -1/-1/-1->15->14 [9] 8/-1/-1->15->14 [10] 8/-1/-1->15->14 [11] 8/-1/-1->15->14 [12] 8/-1/-1->15->14 [13] 8/-1/-1->15->14 [14] 8/-1/-1->15->14 [15] 8/7/-1->15->-1
|
| 189 |
+
t-20260518224737-tftgw-worker-1:10450:11072 [5] NCCL INFO P2P Chunksize set to 131072
|
| 190 |
+
t-20260518224737-tftgw-worker-1:10451:11073 [6] NCCL INFO P2P Chunksize set to 131072
|
| 191 |
+
t-20260518224737-tftgw-worker-1:10452:11067 [7] NCCL INFO P2P Chunksize set to 131072
|
| 192 |
+
t-20260518224737-tftgw-worker-1:10449:11068 [4] NCCL INFO comm 0xda93a60 rank 12 nRanks 16 nNodes 2 localRanks 8 localRank 4 MNNVL 0
|
| 193 |
+
t-20260518224737-tftgw-worker-1:10447:11069 [2] NCCL INFO comm 0xedfad70 rank 10 nRanks 16 nNodes 2 localRanks 8 localRank 2 MNNVL 0
|
| 194 |
+
t-20260518224737-tftgw-worker-1:10445:11071 [0] NCCL INFO comm 0x7f6eab060080 rank 8 nRanks 16 nNodes 2 localRanks 8 localRank 0 MNNVL 0
|
| 195 |
+
t-20260518224737-tftgw-worker-1:10446:11066 [1] NCCL INFO comm 0xf150bb0 rank 9 nRanks 16 nNodes 2 localRanks 8 localRank 1 MNNVL 0
|
| 196 |
+
t-20260518224737-tftgw-worker-1:10448:11070 [3] NCCL INFO comm 0xe08a1f0 rank 11 nRanks 16 nNodes 2 localRanks 8 localRank 3 MNNVL 0
|
| 197 |
+
t-20260518224737-tftgw-worker-1:10447:11069 [2] NCCL INFO Trees [0] 11/-1/-1->10->9 [1] 11/-1/-1->10->9 [2] 11/-1/-1->10->2 [3] -1/-1/-1->10->9 [4] 11/-1/-1->10->9 [5] 11/-1/-1->10->9 [6] 11/-1/-1->10->9 [7] 11/-1/-1->10->9 [8] 11/-1/-1->10->9 [9] 11/-1/-1->10->9 [10] 11/2/-1->10->-1 [11] -1/-1/-1->10->9 [12] 11/-1/-1->10->9 [13] 11/-1/-1->10->9 [14] 11/-1/-1->10->9 [15] 11/-1/-1->10->9
|
| 198 |
+
t-20260518224737-tftgw-worker-1:10446:11066 [1] NCCL INFO Trees [0] 10/-1/-1->9->8 [1] 10/-1/-1->9->1 [2] -1/-1/-1->9->8 [3] 10/-1/-1->9->8 [4] 10/-1/-1->9->8 [5] 10/-1/-1->9->8 [6] 10/-1/-1->9->8 [7] 10/-1/-1->9->8 [8] 10/-1/-1->9->8 [9] 10/1/-1->9->-1 [10] -1/-1/-1->9->8 [11] 10/-1/-1->9->8 [12] 10/-1/-1->9->8 [13] 10/-1/-1->9->8 [14] 10/-1/-1->9->8 [15] 10/-1/-1->9->8
|
| 199 |
+
t-20260518224737-tftgw-worker-1:10445:11071 [0] NCCL INFO Trees [0] 9/-1/-1->8->0 [1] -1/-1/-1->8->15 [2] 9/-1/-1->8->15 [3] 9/-1/-1->8->15 [4] 9/-1/-1->8->15 [5] 9/-1/-1->8->15 [6] 9/-1/-1->8->15 [7] 9/-1/-1->8->15 [8] 9/0/-1->8->-1 [9] -1/-1/-1->8->15 [10] 9/-1/-1->8->15 [11] 9/-1/-1->8->15 [12] 9/-1/-1->8->15 [13] 9/-1/-1->8->15 [14] 9/-1/-1->8->15 [15] 9/-1/-1->8->15
|
| 200 |
+
t-20260518224737-tftgw-worker-1:10448:11070 [3] NCCL INFO Trees [0] 12/-1/-1->11->10 [1] 12/-1/-1->11->10 [2] 12/-1/-1->11->10 [3] 12/-1/-1->11->3 [4] -1/-1/-1->11->10 [5] 12/-1/-1->11->10 [6] 12/-1/-1->11->10 [7] 12/-1/-1->11->10 [8] 12/-1/-1->11->10 [9] 12/-1/-1->11->10 [10] 12/-1/-1->11->10 [11] 12/3/-1->11->-1 [12] -1/-1/-1->11->10 [13] 12/-1/-1->11->10 [14] 12/-1/-1->11->10 [15] 12/-1/-1->11->10
|
| 201 |
+
t-20260518224737-tftgw-worker-1:10449:11068 [4] NCCL INFO Trees [0] 13/-1/-1->12->11 [1] 13/-1/-1->12->11 [2] 13/-1/-1->12->11 [3] 13/-1/-1->12->11 [4] 13/-1/-1->12->4 [5] -1/-1/-1->12->11 [6] 13/-1/-1->12->11 [7] 13/-1/-1->12->11 [8] 13/-1/-1->12->11 [9] 13/-1/-1->12->11 [10] 13/-1/-1->12->11 [11] 13/-1/-1->12->11 [12] 13/4/-1->12->-1 [13] -1/-1/-1->12->11 [14] 13/-1/-1->12->11 [15] 13/-1/-1->12->11
|
| 202 |
+
t-20260518224737-tftgw-worker-1:10446:11066 [1] NCCL INFO P2P Chunksize set to 131072
|
| 203 |
+
t-20260518224737-tftgw-worker-1:10447:11069 [2] NCCL INFO P2P Chunksize set to 131072
|
| 204 |
+
t-20260518224737-tftgw-worker-1:10448:11070 [3] NCCL INFO P2P Chunksize set to 131072
|
| 205 |
+
t-20260518224737-tftgw-worker-1:10445:11071 [0] NCCL INFO P2P Chunksize set to 131072
|
| 206 |
+
t-20260518224737-tftgw-worker-1:10449:11068 [4] NCCL INFO P2P Chunksize set to 131072
|
| 207 |
+
t-20260518224737-tftgw-worker-1:10450:11168 [5] NCCL INFO [Proxy Service] Device 5 CPU core 136
|
| 208 |
+
t-20260518224737-tftgw-worker-1:10451:11171 [6] NCCL INFO [Proxy Service] Device 6 CPU core 92
|
| 209 |
+
t-20260518224737-tftgw-worker-1:10447:11172 [2] NCCL INFO [Proxy Service] Device 2 CPU core 54
|
| 210 |
+
t-20260518224737-tftgw-worker-1:10446:11173 [1] NCCL INFO [Proxy Service] Device 1 CPU core 68
|
| 211 |
+
t-20260518224737-tftgw-worker-1:10445:11174 [0] NCCL INFO [Proxy Service] Device 0 CPU core 3
|
| 212 |
+
t-20260518224737-tftgw-worker-1:10448:11175 [3] NCCL INFO [Proxy Service] Device 3 CPU core 77
|
| 213 |
+
t-20260518224737-tftgw-worker-1:10449:11177 [4] NCCL INFO [Proxy Service] Device 4 CPU core 162
|
| 214 |
+
t-20260518224737-tftgw-worker-1:10452:11170 [7] NCCL INFO [Proxy Service] Device 7 CPU core 92
|
| 215 |
+
t-20260518224737-tftgw-worker-1:10450:11169 [5] NCCL INFO [Proxy Service UDS] Device 5 CPU core 138
|
| 216 |
+
t-20260518224737-tftgw-worker-1:10449:11183 [4] NCCL INFO [Proxy Service UDS] Device 4 CPU core 165
|
| 217 |
+
t-20260518224737-tftgw-worker-1:10451:11176 [6] NCCL INFO [Proxy Service UDS] Device 6 CPU core 94
|
| 218 |
+
t-20260518224737-tftgw-worker-1:10448:11181 [3] NCCL INFO [Proxy Service UDS] Device 3 CPU core 78
|
| 219 |
+
t-20260518224737-tftgw-worker-1:10447:11179 [2] NCCL INFO [Proxy Service UDS] Device 2 CPU core 60
|
| 220 |
+
t-20260518224737-tftgw-worker-1:10452:11178 [7] NCCL INFO [Proxy Service UDS] Device 7 CPU core 96
|
| 221 |
+
t-20260518224737-tftgw-worker-1:10446:11180 [1] NCCL INFO [Proxy Service UDS] Device 1 CPU core 70
|
| 222 |
+
t-20260518224737-tftgw-worker-1:10445:11182 [0] NCCL INFO [Proxy Service UDS] Device 0 CPU core 4
|
| 223 |
+
t-20260518224737-tftgw-worker-1:10446:11066 [1] NCCL INFO threadThresholds 8/8/64 | 128/8/64 | 512 | 512
|
| 224 |
+
t-20260518224737-tftgw-worker-1:10446:11066 [1] NCCL INFO 16 coll channels, 16 collnet channels, 16 nvls channels, 16 p2p channels, 2 p2p channels per peer
|
| 225 |
+
t-20260518224737-tftgw-worker-1:10447:11069 [2] NCCL INFO threadThresholds 8/8/64 | 128/8/64 | 512 | 512
|
| 226 |
+
t-20260518224737-tftgw-worker-1:10447:11069 [2] NCCL INFO 16 coll channels, 16 collnet channels, 16 nvls channels, 16 p2p channels, 2 p2p channels per peer
|
| 227 |
+
t-20260518224737-tftgw-worker-1:10448:11070 [3] NCCL INFO threadThresholds 8/8/64 | 128/8/64 | 512 | 512
|
| 228 |
+
t-20260518224737-tftgw-worker-1:10448:11070 [3] NCCL INFO 16 coll channels, 16 collnet channels, 16 nvls channels, 16 p2p channels, 2 p2p channels per peer
|
| 229 |
+
t-20260518224737-tftgw-worker-1:10449:11068 [4] NCCL INFO threadThresholds 8/8/64 | 128/8/64 | 512 | 512
|
| 230 |
+
t-20260518224737-tftgw-worker-1:10449:11068 [4] NCCL INFO 16 coll channels, 16 collnet channels, 16 nvls channels, 16 p2p channels, 2 p2p channels per peer
|
| 231 |
+
t-20260518224737-tftgw-worker-1:10450:11072 [5] NCCL INFO threadThresholds 8/8/64 | 128/8/64 | 512 | 512
|
| 232 |
+
t-20260518224737-tftgw-worker-1:10450:11072 [5] NCCL INFO 16 coll channels, 16 collnet channels, 16 nvls channels, 16 p2p channels, 2 p2p channels per peer
|
| 233 |
+
t-20260518224737-tftgw-worker-1:10451:11073 [6] NCCL INFO threadThresholds 8/8/64 | 128/8/64 | 512 | 512
|
| 234 |
+
t-20260518224737-tftgw-worker-1:10451:11073 [6] NCCL INFO 16 coll channels, 16 collnet channels, 16 nvls channels, 16 p2p channels, 2 p2p channels per peer
|
| 235 |
+
t-20260518224737-tftgw-worker-1:10445:11071 [0] NCCL INFO threadThresholds 8/8/64 | 128/8/64 | 512 | 512
|
| 236 |
+
t-20260518224737-tftgw-worker-1:10445:11071 [0] NCCL INFO 16 coll channels, 16 collnet channels, 16 nvls channels, 16 p2p channels, 2 p2p channels per peer
|
| 237 |
+
t-20260518224737-tftgw-worker-1:10452:11067 [7] NCCL INFO threadThresholds 8/8/64 | 128/8/64 | 512 | 512
|
| 238 |
+
t-20260518224737-tftgw-worker-1:10452:11067 [7] NCCL INFO 16 coll channels, 16 collnet channels, 16 nvls channels, 16 p2p channels, 2 p2p channels per peer
|
| 239 |
+
t-20260518224737-tftgw-worker-1:10448:11070 [3] NCCL INFO TUNER/Plugin: Failed to find ncclTunerPlugin_v4 symbol.
|
| 240 |
+
t-20260518224737-tftgw-worker-1:10449:11068 [4] NCCL INFO TUNER/Plugin: Failed to find ncclTunerPlugin_v4 symbol.
|
| 241 |
+
t-20260518224737-tftgw-worker-1:10447:11069 [2] NCCL INFO TUNER/Plugin: Failed to find ncclTunerPlugin_v4 symbol.
|
| 242 |
+
t-20260518224737-tftgw-worker-1:10449:11068 [4] NCCL INFO TUNER/Plugin: Failed to find ncclTunerPlugin_v3 symbol.
|
| 243 |
+
t-20260518224737-tftgw-worker-1:10450:11072 [5] NCCL INFO TUNER/Plugin: Failed to find ncclTunerPlugin_v4 symbol.
|
| 244 |
+
t-20260518224737-tftgw-worker-1:10445:11071 [0] NCCL INFO TUNER/Plugin: Failed to find ncclTunerPlugin_v4 symbol.
|
| 245 |
+
t-20260518224737-tftgw-worker-1:10447:11069 [2] NCCL INFO TUNER/Plugin: Failed to find ncclTunerPlugin_v3 symbol.
|
| 246 |
+
t-20260518224737-tftgw-worker-1:10452:11067 [7] NCCL INFO TUNER/Plugin: Failed to find ncclTunerPlugin_v4 symbol.
|
| 247 |
+
t-20260518224737-tftgw-worker-1:10448:11070 [3] NCCL INFO TUNER/Plugin: Failed to find ncclTunerPlugin_v3 symbol.
|
| 248 |
+
t-20260518224737-tftgw-worker-1:10447:11069 [2] NCCL INFO TUNER/Plugin: Failed to find ncclTunerPlugin_v2 symbol, using internal tuner instead.
|
| 249 |
+
t-20260518224737-tftgw-worker-1:10451:11073 [6] NCCL INFO TUNER/Plugin: Failed to find ncclTunerPlugin_v4 symbol.
|
| 250 |
+
t-20260518224737-tftgw-worker-1:10452:11067 [7] NCCL INFO TUNER/Plugin: Failed to find ncclTunerPlugin_v3 symbol.
|
| 251 |
+
t-20260518224737-tftgw-worker-1:10449:11068 [4] NCCL INFO TUNER/Plugin: Failed to find ncclTunerPlugin_v2 symbol, using internal tuner instead.
|
| 252 |
+
t-20260518224737-tftgw-worker-1:10446:11066 [1] NCCL INFO TUNER/Plugin: Failed to find ncclTunerPlugin_v4 symbol.
|
| 253 |
+
t-20260518224737-tftgw-worker-1:10450:11072 [5] NCCL INFO TUNER/Plugin: Failed to find ncclTunerPlugin_v3 symbol.
|
| 254 |
+
t-20260518224737-tftgw-worker-1:10452:11067 [7] NCCL INFO TUNER/Plugin: Failed to find ncclTunerPlugin_v2 symbol, using internal tuner instead.
|
| 255 |
+
t-20260518224737-tftgw-worker-1:10445:11071 [0] NCCL INFO TUNER/Plugin: Failed to find ncclTunerPlugin_v3 symbol.
|
| 256 |
+
t-20260518224737-tftgw-worker-1:10448:11070 [3] NCCL INFO TUNER/Plugin: Failed to find ncclTunerPlugin_v2 symbol, using internal tuner instead.
|
| 257 |
+
t-20260518224737-tftgw-worker-1:10447:11069 [2] NCCL INFO ncclCommInitRankConfig comm 0xedfad70 rank 10 nranks 16 cudaDev 2 nvmlDev 2 busId 69020 commId 0x2ef33c22d6d166f7 - Init COMPLETE
|
| 258 |
+
t-20260518224737-tftgw-worker-1:10451:11073 [6] NCCL INFO TUNER/Plugin: Failed to find ncclTunerPlugin_v3 symbol.
|
| 259 |
+
t-20260518224737-tftgw-worker-1:10449:11068 [4] NCCL INFO ncclCommInitRankConfig comm 0xda93a60 rank 12 nranks 16 cudaDev 4 nvmlDev 4 busId 6f020 commId 0x2ef33c22d6d166f7 - Init COMPLETE
|
| 260 |
+
t-20260518224737-tftgw-worker-1:10450:11072 [5] NCCL INFO TUNER/Plugin: Failed to find ncclTunerPlugin_v2 symbol, using internal tuner instead.
|
| 261 |
+
t-20260518224737-tftgw-worker-1:10446:11066 [1] NCCL INFO TUNER/Plugin: Failed to find ncclTunerPlugin_v3 symbol.
|
| 262 |
+
t-20260518224737-tftgw-worker-1:10452:11067 [7] NCCL INFO ncclCommInitRankConfig comm 0xed06ec0 rank 15 nranks 16 cudaDev 7 nvmlDev 7 busId 75020 commId 0x2ef33c22d6d166f7 - Init COMPLETE
|
| 263 |
+
t-20260518224737-tftgw-worker-1:10445:11071 [0] NCCL INFO TUNER/Plugin: Failed to find ncclTunerPlugin_v2 symbol, using internal tuner instead.
|
| 264 |
+
t-20260518224737-tftgw-worker-1:10451:11073 [6] NCCL INFO TUNER/Plugin: Failed to find ncclTunerPlugin_v2 symbol, using internal tuner instead.
|
| 265 |
+
t-20260518224737-tftgw-worker-1:10448:11070 [3] NCCL INFO ncclCommInitRankConfig comm 0xe08a1f0 rank 11 nranks 16 cudaDev 3 nvmlDev 3 busId 6b020 commId 0x2ef33c22d6d166f7 - Init COMPLETE
|
| 266 |
+
t-20260518224737-tftgw-worker-1:10447:11069 [2] NCCL INFO Init timings - ncclCommInitRankConfig: rank 10 nranks 16 total 2.93 (kernels 0.20, alloc 1.05, bootstrap 0.80, allgathers 0.01, topo 0.56, graphs 0.01, connections 0.29, rest 0.00)
|
| 267 |
+
t-20260518224737-tftgw-worker-1:10450:11072 [5] NCCL INFO ncclCommInitRankConfig comm 0xd90a1d0 rank 13 nranks 16 cudaDev 5 nvmlDev 5 busId 71020 commId 0x2ef33c22d6d166f7 - Init COMPLETE
|
| 268 |
+
t-20260518224737-tftgw-worker-1:10446:11066 [1] NCCL INFO TUNER/Plugin: Failed to find ncclTunerPlugin_v2 symbol, using internal tuner instead.
|
| 269 |
+
t-20260518224737-tftgw-worker-1:10449:11068 [4] NCCL INFO Init timings - ncclCommInitRankConfig: rank 12 nranks 16 total 2.93 (kernels 0.20, alloc 1.07, bootstrap 0.78, allgathers 0.01, topo 0.56, graphs 0.01, connections 0.29, rest 0.00)
|
| 270 |
+
t-20260518224737-tftgw-worker-1:10445:11071 [0] NCCL INFO ncclCommInitRankConfig comm 0x7f6eab060080 rank 8 nranks 16 cudaDev 0 nvmlDev 0 busId 65040 commId 0x2ef33c22d6d166f7 - Init COMPLETE
|
| 271 |
+
t-20260518224737-tftgw-worker-1:10451:11073 [6] NCCL INFO ncclCommInitRankConfig comm 0xf242a00 rank 14 nranks 16 cudaDev 6 nvmlDev 6 busId 73020 commId 0x2ef33c22d6d166f7 - Init COMPLETE
|
| 272 |
+
t-20260518224737-tftgw-worker-1:10452:11067 [7] NCCL INFO Init timings - ncclCommInitRankConfig: rank 15 nranks 16 total 2.93 (kernels 0.20, alloc 1.07, bootstrap 0.78, allgathers 0.01, topo 0.56, graphs 0.02, connections 0.29, rest 0.00)
|
| 273 |
+
t-20260518224737-tftgw-worker-1:10448:11070 [3] NCCL INFO Init timings - ncclCommInitRankConfig: rank 11 nranks 16 total 2.92 (kernels 0.20, alloc 1.07, bootstrap 0.78, allgathers 0.01, topo 0.56, graphs 0.01, connections 0.29, rest 0.00)
|
| 274 |
+
t-20260518224737-tftgw-worker-1:10446:11066 [1] NCCL INFO ncclCommInitRankConfig comm 0xf150bb0 rank 9 nranks 16 cudaDev 1 nvmlDev 1 busId 67020 commId 0x2ef33c22d6d166f7 - Init COMPLETE
|
| 275 |
+
t-20260518224737-tftgw-worker-1:10450:11072 [5] NCCL INFO Init timings - ncclCommInitRankConfig: rank 13 nranks 16 total 2.92 (kernels 0.20, alloc 1.07, bootstrap 0.78, allgathers 0.01, topo 0.56, graphs 0.01, connections 0.29, rest 0.00)
|
| 276 |
+
t-20260518224737-tftgw-worker-1:10445:11071 [0] NCCL INFO Init timings - ncclCommInitRankConfig: rank 8 nranks 16 total 2.92 (kernels 0.20, alloc 1.07, bootstrap 0.78, allgathers 0.00, topo 0.56, graphs 0.01, connections 0.29, rest 0.01)
|
| 277 |
+
t-20260518224737-tftgw-worker-1:10451:11073 [6] NCCL INFO Init timings - ncclCommInitRankConfig: rank 14 nranks 16 total 2.92 (kernels 0.20, alloc 1.07, bootstrap 0.78, allgathers 0.01, topo 0.56, graphs 0.02, connections 0.29, rest 0.00)
|
| 278 |
+
t-20260518224737-tftgw-worker-1:10446:11066 [1] NCCL INFO Init timings - ncclCommInitRankConfig: rank 9 nranks 16 total 2.93 (kernels 0.20, alloc 1.05, bootstrap 0.80, allgathers 0.01, topo 0.56, graphs 0.01, connections 0.29, rest 0.00)
|
| 279 |
+
t-20260518224737-tftgw-worker-1:10445:11188 [0] NCCL INFO Channel 02/0 : 8[0] -> 9[1] via P2P/CUMEM
|
| 280 |
+
t-20260518224737-tftgw-worker-1:10445:11188 [0] NCCL INFO Channel 04/0 : 8[0] -> 9[1] via P2P/CUMEM
|
| 281 |
+
t-20260518224737-tftgw-worker-1:10445:11188 [0] NCCL INFO Channel 06/0 : 8[0] -> 9[1] via P2P/CUMEM
|
| 282 |
+
t-20260518224737-tftgw-worker-1:10445:11188 [0] NCCL INFO Channel 10/0 : 8[0] -> 9[1] via P2P/CUMEM
|
| 283 |
+
t-20260518224737-tftgw-worker-1:10445:11188 [0] NCCL INFO Channel 12/0 : 8[0] -> 9[1] via P2P/CUMEM
|
| 284 |
+
t-20260518224737-tftgw-worker-1:10445:11188 [0] NCCL INFO Channel 14/0 : 8[0] -> 9[1] via P2P/CUMEM
|
| 285 |
+
t-20260518224737-tftgw-worker-1:10450:11190 [5] NCCL INFO Channel 00/0 : 13[5] -> 14[6] via P2P/CUMEM
|
| 286 |
+
t-20260518224737-tftgw-worker-1:10448:11189 [3] NCCL INFO Channel 00/0 : 11[3] -> 12[4] via P2P/CUMEM
|
| 287 |
+
t-20260518224737-tftgw-worker-1:10449:11184 [4] NCCL INFO Channel 00/0 : 12[4] -> 13[5] via P2P/CUMEM
|
| 288 |
+
t-20260518224737-tftgw-worker-1:10447:11185 [2] NCCL INFO Channel 00/0 : 10[2] -> 11[3] via P2P/CUMEM
|
| 289 |
+
t-20260518224737-tftgw-worker-1:10450:11190 [5] NCCL INFO Channel 02/0 : 13[5] -> 14[6] via P2P/CUMEM
|
| 290 |
+
t-20260518224737-tftgw-worker-1:10451:11186 [6] NCCL INFO Channel 00/0 : 14[6] -> 15[7] via P2P/CUMEM
|
| 291 |
+
t-20260518224737-tftgw-worker-1:10446:11187 [1] NCCL INFO Channel 00/0 : 9[1] -> 10[2] via P2P/CUMEM
|
| 292 |
+
t-20260518224737-tftgw-worker-1:10448:11189 [3] NCCL INFO Channel 02/0 : 11[3] -> 12[4] via P2P/CUMEM
|
| 293 |
+
t-20260518224737-tftgw-worker-1:10449:11184 [4] NCCL INFO Channel 02/0 : 12[4] -> 13[5] via P2P/CUMEM
|
| 294 |
+
t-20260518224737-tftgw-worker-1:10447:11185 [2] NCCL INFO Channel 04/0 : 10[2] -> 11[3] via P2P/CUMEM
|
| 295 |
+
t-20260518224737-tftgw-worker-1:10450:11190 [5] NCCL INFO Channel 04/0 : 13[5] -> 14[6] via P2P/CUMEM
|
| 296 |
+
t-20260518224737-tftgw-worker-1:10451:11186 [6] NCCL INFO Channel 02/0 : 14[6] -> 15[7] via P2P/CUMEM
|
| 297 |
+
t-20260518224737-tftgw-worker-1:10446:11187 [1] NCCL INFO Channel 02/0 : 9[1] -> 10[2] via P2P/CUMEM
|
| 298 |
+
t-20260518224737-tftgw-worker-1:10448:11189 [3] NCCL INFO Channel 04/0 : 11[3] -> 12[4] via P2P/CUMEM
|
| 299 |
+
t-20260518224737-tftgw-worker-1:10449:11184 [4] NCCL INFO Channel 06/0 : 12[4] -> 13[5] via P2P/CUMEM
|
| 300 |
+
t-20260518224737-tftgw-worker-1:10447:11185 [2] NCCL INFO Channel 06/0 : 10[2] -> 11[3] via P2P/CUMEM
|
| 301 |
+
t-20260518224737-tftgw-worker-1:10450:11190 [5] NCCL INFO Channel 06/0 : 13[5] -> 14[6] via P2P/CUMEM
|
| 302 |
+
t-20260518224737-tftgw-worker-1:10451:11186 [6] NCCL INFO Channel 04/0 : 14[6] -> 15[7] via P2P/CUMEM
|
| 303 |
+
t-20260518224737-tftgw-worker-1:10446:11187 [1] NCCL INFO Channel 04/0 : 9[1] -> 10[2] via P2P/CUMEM
|
| 304 |
+
t-20260518224737-tftgw-worker-1:10448:11189 [3] NCCL INFO Channel 06/0 : 11[3] -> 12[4] via P2P/CUMEM
|
| 305 |
+
t-20260518224737-tftgw-worker-1:10449:11184 [4] NCCL INFO Channel 08/0 : 12[4] -> 13[5] via P2P/CUMEM
|
| 306 |
+
t-20260518224737-tftgw-worker-1:10447:11185 [2] NCCL INFO Channel 08/0 : 10[2] -> 11[3] via P2P/CUMEM
|
| 307 |
+
t-20260518224737-tftgw-worker-1:10450:11190 [5] NCCL INFO Channel 08/0 : 13[5] -> 14[6] via P2P/CUMEM
|
| 308 |
+
t-20260518224737-tftgw-worker-1:10451:11186 [6] NCCL INFO Channel 08/0 : 14[6] -> 15[7] via P2P/CUMEM
|
| 309 |
+
t-20260518224737-tftgw-worker-1:10446:11187 [1] NCCL INFO Channel 06/0 : 9[1] -> 10[2] via P2P/CUMEM
|
| 310 |
+
t-20260518224737-tftgw-worker-1:10448:11189 [3] NCCL INFO Channel 08/0 : 11[3] -> 12[4] via P2P/CUMEM
|
| 311 |
+
t-20260518224737-tftgw-worker-1:10449:11184 [4] NCCL INFO Channel 10/0 : 12[4] -> 13[5] via P2P/CUMEM
|
| 312 |
+
t-20260518224737-tftgw-worker-1:10447:11185 [2] NCCL INFO Channel 12/0 : 10[2] -> 11[3] via P2P/CUMEM
|
| 313 |
+
t-20260518224737-tftgw-worker-1:10450:11190 [5] NCCL INFO Channel 10/0 : 13[5] -> 14[6] via P2P/CUMEM
|
| 314 |
+
t-20260518224737-tftgw-worker-1:10451:11186 [6] NCCL INFO Channel 10/0 : 14[6] -> 15[7] via P2P/CUMEM
|
| 315 |
+
t-20260518224737-tftgw-worker-1:10446:11187 [1] NCCL INFO Channel 08/0 : 9[1] -> 10[2] via P2P/CUMEM
|
| 316 |
+
t-20260518224737-tftgw-worker-1:10448:11189 [3] NCCL INFO Channel 10/0 : 11[3] -> 12[4] via P2P/CUMEM
|
| 317 |
+
t-20260518224737-tftgw-worker-1:10449:11184 [4] NCCL INFO Channel 14/0 : 12[4] -> 13[5] via P2P/CUMEM
|
| 318 |
+
t-20260518224737-tftgw-worker-1:10447:11185 [2] NCCL INFO Channel 14/0 : 10[2] -> 11[3] via P2P/CUMEM
|
| 319 |
+
t-20260518224737-tftgw-worker-1:10450:11190 [5] NCCL INFO Channel 12/0 : 13[5] -> 14[6] via P2P/CUMEM
|
| 320 |
+
t-20260518224737-tftgw-worker-1:10451:11186 [6] NCCL INFO Channel 12/0 : 14[6] -> 15[7] via P2P/CUMEM
|
| 321 |
+
t-20260518224737-tftgw-worker-1:10446:11187 [1] NCCL INFO Channel 10/0 : 9[1] -> 10[2] via P2P/CUMEM
|
| 322 |
+
t-20260518224737-tftgw-worker-1:10448:11189 [3] NCCL INFO Channel 12/0 : 11[3] -> 12[4] via P2P/CUMEM
|
| 323 |
+
t-20260518224737-tftgw-worker-1:10450:11190 [5] NCCL INFO Channel 14/0 : 13[5] -> 14[6] via P2P/CUMEM
|
| 324 |
+
t-20260518224737-tftgw-worker-1:10446:11187 [1] NCCL INFO Channel 12/0 : 9[1] -> 10[2] via P2P/CUMEM
|
| 325 |
+
t-20260518224737-tftgw-worker-1:10448:11189 [3] NCCL INFO Channel 14/0 : 11[3] -> 12[4] via P2P/CUMEM
|
| 326 |
+
t-20260518224737-tftgw-worker-1:10446:11187 [1] NCCL INFO Channel 14/0 : 9[1] -> 10[2] via P2P/CUMEM
|
| 327 |
+
t-20260518224737-tftgw-worker-1:10445:11188 [0] NCCL INFO Channel 01/0 : 8[0] -> 15[7] via P2P/CUMEM
|
| 328 |
+
t-20260518224737-tftgw-worker-1:10445:11188 [0] NCCL INFO Channel 03/0 : 8[0] -> 15[7] via P2P/CUMEM
|
| 329 |
+
t-20260518224737-tftgw-worker-1:10445:11188 [0] NCCL INFO Channel 05/0 : 8[0] -> 15[7] via P2P/CUMEM
|
| 330 |
+
t-20260518224737-tftgw-worker-1:10445:11188 [0] NCCL INFO Channel 07/0 : 8[0] -> 15[7] via P2P/CUMEM
|
| 331 |
+
t-20260518224737-tftgw-worker-1:10445:11188 [0] NCCL INFO Channel 09/0 : 8[0] -> 15[7] via P2P/CUMEM
|
| 332 |
+
t-20260518224737-tftgw-worker-1:10445:11188 [0] NCCL INFO Channel 11/0 : 8[0] -> 15[7] via P2P/CUMEM
|
| 333 |
+
t-20260518224737-tftgw-worker-1:10445:11188 [0] NCCL INFO Channel 13/0 : 8[0] -> 15[7] via P2P/CUMEM
|
| 334 |
+
t-20260518224737-tftgw-worker-1:10445:11188 [0] NCCL INFO Channel 15/0 : 8[0] -> 15[7] via P2P/CUMEM
|
| 335 |
+
t-20260518224737-tftgw-worker-1:10447:11192 [2] NCCL INFO [Proxy Progress] Device 2 CPU core 58
|
| 336 |
+
t-20260518224737-tftgw-worker-1:10447:11185 [2] NCCL INFO Channel 03/0 : 2[2] -> 10[2] [receive] via NET/IBext_v9/10/GDRDMA
|
| 337 |
+
t-20260518224737-tftgw-worker-1:10447:11185 [2] NCCL INFO Channel 11/0 : 2[2] -> 10[2] [receive] via NET/IBext_v9/10/GDRDMA
|
| 338 |
+
t-20260518224737-tftgw-worker-1:10451:11193 [6] NCCL INFO [Proxy Progress] Device 6 CPU core 97
|
| 339 |
+
t-20260518224737-tftgw-worker-1:10451:11186 [6] NCCL INFO Channel 07/0 : 6[6] -> 14[6] [receive] via NET/IBext_v9/14/GDRDMA
|
| 340 |
+
t-20260518224737-tftgw-worker-1:10447:11185 [2] NCCL INFO Channel 02/0 : 10[2] -> 2[2] [send] via NET/IBext_v9/10/GDRDMA
|
| 341 |
+
t-20260518224737-tftgw-worker-1:10451:11186 [6] NCCL INFO Channel 15/0 : 6[6] -> 14[6] [receive] via NET/IBext_v9/14/GDRDMA
|
| 342 |
+
t-20260518224737-tftgw-worker-1:10447:11185 [2] NCCL INFO Channel 10/0 : 10[2] -> 2[2] [send] via NET/IBext_v9/10/GDRDMA
|
| 343 |
+
t-20260518224737-tftgw-worker-1:10450:11194 [5] NCCL INFO [Proxy Progress] Device 5 CPU core 138
|
| 344 |
+
t-20260518224737-tftgw-worker-1:10451:11186 [6] NCCL INFO Channel 06/0 : 14[6] -> 6[6] [send] via NET/IBext_v9/14/GDRDMA
|
| 345 |
+
t-20260518224737-tftgw-worker-1:10450:11190 [5] NCCL INFO Channel 04/0 : 5[5] -> 13[5] [receive] via NET/IBext_v9/13/GDRDMA
|
| 346 |
+
t-20260518224737-tftgw-worker-1:10451:11186 [6] NCCL INFO Channel 14/0 : 14[6] -> 6[6] [send] via NET/IBext_v9/14/GDRDMA
|
| 347 |
+
t-20260518224737-tftgw-worker-1:10450:11190 [5] NCCL INFO Channel 12/0 : 5[5] -> 13[5] [receive] via NET/IBext_v9/13/GDRDMA
|
| 348 |
+
t-20260518224737-tftgw-worker-1:10449:11195 [4] NCCL INFO [Proxy Progress] Device 4 CPU core 164
|
| 349 |
+
t-20260518224737-tftgw-worker-1:10450:11190 [5] NCCL INFO Channel 05/0 : 13[5] -> 5[5] [send] via NET/IBext_v9/13/GDRDMA
|
| 350 |
+
t-20260518224737-tftgw-worker-1:10449:11184 [4] NCCL INFO Channel 05/0 : 4[4] -> 12[4] [receive] via NET/IBext_v9/12/GDRDMA
|
| 351 |
+
t-20260518224737-tftgw-worker-1:10450:11190 [5] NCCL INFO Channel 13/0 : 13[5] -> 5[5] [send] via NET/IBext_v9/13/GDRDMA
|
| 352 |
+
t-20260518224737-tftgw-worker-1:10449:11184 [4] NCCL INFO Channel 13/0 : 4[4] -> 12[4] [receive] via NET/IBext_v9/12/GDRDMA
|
| 353 |
+
t-20260518224737-tftgw-worker-1:10448:11196 [3] NCCL INFO [Proxy Progress] Device 3 CPU core 78
|
| 354 |
+
t-20260518224737-tftgw-worker-1:10449:11184 [4] NCCL INFO Channel 04/0 : 12[4] -> 4[4] [send] via NET/IBext_v9/12/GDRDMA
|
| 355 |
+
t-20260518224737-tftgw-worker-1:10448:11189 [3] NCCL INFO Channel 02/0 : 3[3] -> 11[3] [receive] via NET/IBext_v9/11/GDRDMA
|
| 356 |
+
t-20260518224737-tftgw-worker-1:10449:11184 [4] NCCL INFO Channel 12/0 : 12[4] -> 4[4] [send] via NET/IBext_v9/12/GDRDMA
|
| 357 |
+
t-20260518224737-tftgw-worker-1:10448:11189 [3] NCCL INFO Channel 10/0 : 3[3] -> 11[3] [receive] via NET/IBext_v9/11/GDRDMA
|
| 358 |
+
t-20260518224737-tftgw-worker-1:10448:11189 [3] NCCL INFO Channel 03/0 : 11[3] -> 3[3] [send] via NET/IBext_v9/11/GDRDMA
|
| 359 |
+
t-20260518224737-tftgw-worker-1:10448:11189 [3] NCCL INFO Channel 11/0 : 11[3] -> 3[3] [send] via NET/IBext_v9/11/GDRDMA
|
| 360 |
+
t-20260518224737-tftgw-worker-1:10445:11197 [0] NCCL INFO [Proxy Progress] Device 0 CPU core 4
|
| 361 |
+
t-20260518224737-tftgw-worker-1:10445:11188 [0] NCCL INFO Channel 01/0 : 0[0] -> 8[0] [receive] via NET/IBext_v9/8/GDRDMA
|
| 362 |
+
t-20260518224737-tftgw-worker-1:10445:11188 [0] NCCL INFO Channel 09/0 : 0[0] -> 8[0] [receive] via NET/IBext_v9/8/GDRDMA
|
| 363 |
+
t-20260518224737-tftgw-worker-1:10445:11188 [0] NCCL INFO Channel 00/0 : 8[0] -> 0[0] [send] via NET/IBext_v9/8/GDRDMA
|
| 364 |
+
t-20260518224737-tftgw-worker-1:10445:11188 [0] NCCL INFO Channel 08/0 : 8[0] -> 0[0] [send] via NET/IBext_v9/8/GDRDMA
|
| 365 |
+
t-20260518224737-tftgw-worker-1:10452:11198 [7] NCCL INFO [Proxy Progress] Device 7 CPU core 96
|
| 366 |
+
t-20260518224737-tftgw-worker-1:10452:11191 [7] NCCL INFO Channel 06/0 : 7[7] -> 15[7] [receive] via NET/IBext_v9/15/GDRDMA
|
| 367 |
+
t-20260518224737-tftgw-worker-1:10452:11191 [7] NCCL INFO Channel 14/0 : 7[7] -> 15[7] [receive] via NET/IBext_v9/15/GDRDMA
|
| 368 |
+
t-20260518224737-tftgw-worker-1:10452:11191 [7] NCCL INFO Channel 07/0 : 15[7] -> 7[7] [send] via NET/IBext_v9/15/GDRDMA
|
| 369 |
+
t-20260518224737-tftgw-worker-1:10452:11191 [7] NCCL INFO Channel 15/0 : 15[7] -> 7[7] [send] via NET/IBext_v9/15/GDRDMA
|
| 370 |
+
t-20260518224737-tftgw-worker-1:10446:11199 [1] NCCL INFO [Proxy Progress] Device 1 CPU core 70
|
| 371 |
+
t-20260518224737-tftgw-worker-1:10446:11187 [1] NCCL INFO Channel 00/0 : 1[1] -> 9[1] [receive] via NET/IBext_v9/9/GDRDMA
|
| 372 |
+
t-20260518224737-tftgw-worker-1:10446:11187 [1] NCCL INFO Channel 08/0 : 1[1] -> 9[1] [receive] via NET/IBext_v9/9/GDRDMA
|
| 373 |
+
t-20260518224737-tftgw-worker-1:10446:11187 [1] NCCL INFO Channel 01/0 : 9[1] -> 1[1] [send] via NET/IBext_v9/9/GDRDMA
|
| 374 |
+
t-20260518224737-tftgw-worker-1:10446:11187 [1] NCCL INFO Channel 09/0 : 9[1] -> 1[1] [send] via NET/IBext_v9/9/GDRDMA
|
| 375 |
+
t-20260518224737-tftgw-worker-1:10452:11191 [7] NCCL INFO Channel 00/0 : 15[7] -> 8[0] via P2P/CUMEM
|
| 376 |
+
t-20260518224737-tftgw-worker-1:10452:11191 [7] NCCL INFO Channel 02/0 : 15[7] -> 8[0] via P2P/CUMEM
|
| 377 |
+
t-20260518224737-tftgw-worker-1:10452:11191 [7] NCCL INFO Channel 04/0 : 15[7] -> 8[0] via P2P/CUMEM
|
| 378 |
+
t-20260518224737-tftgw-worker-1:10452:11191 [7] NCCL INFO Channel 06/0 : 15[7] -> 8[0] via P2P/CUMEM
|
| 379 |
+
t-20260518224737-tftgw-worker-1:10452:11191 [7] NCCL INFO Channel 08/0 : 15[7] -> 8[0] via P2P/CUMEM
|
| 380 |
+
t-20260518224737-tftgw-worker-1:10452:11191 [7] NCCL INFO Channel 10/0 : 15[7] -> 8[0] via P2P/CUMEM
|
| 381 |
+
t-20260518224737-tftgw-worker-1:10451:11186 [6] NCCL INFO Channel 01/0 : 14[6] -> 13[5] via P2P/CUMEM
|
| 382 |
+
t-20260518224737-tftgw-worker-1:10447:11185 [2] NCCL INFO Channel 01/0 : 10[2] -> 9[1] via P2P/CUMEM
|
| 383 |
+
t-20260518224737-tftgw-worker-1:10452:11191 [7] NCCL INFO Channel 12/0 : 15[7] -> 8[0] via P2P/CUMEM
|
| 384 |
+
t-20260518224737-tftgw-worker-1:10449:11184 [4] NCCL INFO Channel 01/0 : 12[4] -> 11[3] via P2P/CUMEM
|
| 385 |
+
t-20260518224737-tftgw-worker-1:10451:11186 [6] NCCL INFO Channel 03/0 : 14[6] -> 13[5] via P2P/CUMEM
|
| 386 |
+
t-20260518224737-tftgw-worker-1:10447:11185 [2] NCCL INFO Channel 03/0 : 10[2] -> 9[1] via P2P/CUMEM
|
| 387 |
+
t-20260518224737-tftgw-worker-1:10452:11191 [7] NCCL INFO Channel 14/0 : 15[7] -> 8[0] via P2P/CUMEM
|
| 388 |
+
t-20260518224737-tftgw-worker-1:10449:11184 [4] NCCL INFO Channel 03/0 : 12[4] -> 11[3] via P2P/CUMEM
|
| 389 |
+
t-20260518224737-tftgw-worker-1:10451:11186 [6] NCCL INFO Channel 05/0 : 14[6] -> 13[5] via P2P/CUMEM
|
| 390 |
+
t-20260518224737-tftgw-worker-1:10447:11185 [2] NCCL INFO Channel 05/0 : 10[2] -> 9[1] via P2P/CUMEM
|
| 391 |
+
t-20260518224737-tftgw-worker-1:10449:11184 [4] NCCL INFO Channel 05/0 : 12[4] -> 11[3] via P2P/CUMEM
|
| 392 |
+
t-20260518224737-tftgw-worker-1:10452:11191 [7] NCCL INFO Channel 01/0 : 15[7] -> 14[6] via P2P/CUMEM
|
| 393 |
+
t-20260518224737-tftgw-worker-1:10446:11187 [1] NCCL INFO Channel 03/0 : 9[1] -> 8[0] via P2P/CUMEM
|
| 394 |
+
t-20260518224737-tftgw-worker-1:10451:11186 [6] NCCL INFO Channel 07/0 : 14[6] -> 13[5] via P2P/CUMEM
|
| 395 |
+
t-20260518224737-tftgw-worker-1:10447:11185 [2] NCCL INFO Channel 07/0 : 10[2] -> 9[1] via P2P/CUMEM
|
| 396 |
+
t-20260518224737-tftgw-worker-1:10449:11184 [4] NCCL INFO Channel 07/0 : 12[4] -> 11[3] via P2P/CUMEM
|
| 397 |
+
t-20260518224737-tftgw-worker-1:10452:11191 [7] NCCL INFO Channel 03/0 : 15[7] -> 14[6] via P2P/CUMEM
|
| 398 |
+
t-20260518224737-tftgw-worker-1:10450:11190 [5] NCCL INFO Channel 01/0 : 13[5] -> 12[4] via P2P/CUMEM
|
| 399 |
+
t-20260518224737-tftgw-worker-1:10448:11189 [3] NCCL INFO Channel 01/0 : 11[3] -> 10[2] via P2P/CUMEM
|
| 400 |
+
t-20260518224737-tftgw-worker-1:10446:11187 [1] NCCL INFO Channel 05/0 : 9[1] -> 8[0] via P2P/CUMEM
|
| 401 |
+
t-20260518224737-tftgw-worker-1:10451:11186 [6] NCCL INFO Channel 09/0 : 14[6] -> 13[5] via P2P/CUMEM
|
| 402 |
+
t-20260518224737-tftgw-worker-1:10447:11185 [2] NCCL INFO Channel 09/0 : 10[2] -> 9[1] via P2P/CUMEM
|
| 403 |
+
t-20260518224737-tftgw-worker-1:10452:11191 [7] NCCL INFO Channel 05/0 : 15[7] -> 14[6] via P2P/CUMEM
|
| 404 |
+
t-20260518224737-tftgw-worker-1:10450:11190 [5] NCCL INFO Channel 03/0 : 13[5] -> 12[4] via P2P/CUMEM
|
| 405 |
+
t-20260518224737-tftgw-worker-1:10449:11184 [4] NCCL INFO Channel 09/0 : 12[4] -> 11[3] via P2P/CUMEM
|
| 406 |
+
t-20260518224737-tftgw-worker-1:10448:11189 [3] NCCL INFO Channel 05/0 : 11[3] -> 10[2] via P2P/CUMEM
|
| 407 |
+
t-20260518224737-tftgw-worker-1:10446:11187 [1] NCCL INFO Channel 07/0 : 9[1] -> 8[0] via P2P/CUMEM
|
| 408 |
+
t-20260518224737-tftgw-worker-1:10451:11186 [6] NCCL INFO Channel 11/0 : 14[6] -> 13[5] via P2P/CUMEM
|
| 409 |
+
t-20260518224737-tftgw-worker-1:10447:11185 [2] NCCL INFO Channel 11/0 : 10[2] -> 9[1] via P2P/CUMEM
|
| 410 |
+
t-20260518224737-tftgw-worker-1:10452:11191 [7] NCCL INFO Channel 09/0 : 15[7] -> 14[6] via P2P/CUMEM
|
| 411 |
+
t-20260518224737-tftgw-worker-1:10450:11190 [5] NCCL INFO Channel 07/0 : 13[5] -> 12[4] via P2P/CUMEM
|
| 412 |
+
t-20260518224737-tftgw-worker-1:10449:11184 [4] NCCL INFO Channel 11/0 : 12[4] -> 11[3] via P2P/CUMEM
|
| 413 |
+
t-20260518224737-tftgw-worker-1:10448:11189 [3] NCCL INFO Channel 07/0 : 11[3] -> 10[2] via P2P/CUMEM
|
| 414 |
+
t-20260518224737-tftgw-worker-1:10446:11187 [1] NCCL INFO Channel 11/0 : 9[1] -> 8[0] via P2P/CUMEM
|
| 415 |
+
t-20260518224737-tftgw-worker-1:10451:11186 [6] NCCL INFO Channel 13/0 : 14[6] -> 13[5] via P2P/CUMEM
|
| 416 |
+
t-20260518224737-tftgw-worker-1:10447:11185 [2] NCCL INFO Channel 13/0 : 10[2] -> 9[1] via P2P/CUMEM
|
| 417 |
+
t-20260518224737-tftgw-worker-1:10452:11191 [7] NCCL INFO Channel 11/0 : 15[7] -> 14[6] via P2P/CUMEM
|
| 418 |
+
t-20260518224737-tftgw-worker-1:10450:11190 [5] NCCL INFO Channel 09/0 : 13[5] -> 12[4] via P2P/CUMEM
|
| 419 |
+
t-20260518224737-tftgw-worker-1:10449:11184 [4] NCCL INFO Channel 13/0 : 12[4] -> 11[3] via P2P/CUMEM
|
| 420 |
+
t-20260518224737-tftgw-worker-1:10448:11189 [3] NCCL INFO Channel 09/0 : 11[3] -> 10[2] via P2P/CUMEM
|
| 421 |
+
t-20260518224737-tftgw-worker-1:10446:11187 [1] NCCL INFO Channel 13/0 : 9[1] -> 8[0] via P2P/CUMEM
|
| 422 |
+
t-20260518224737-tftgw-worker-1:10451:11186 [6] NCCL INFO Channel 15/0 : 14[6] -> 13[5] via P2P/CUMEM
|
| 423 |
+
t-20260518224737-tftgw-worker-1:10447:11185 [2] NCCL INFO Channel 15/0 : 10[2] -> 9[1] via P2P/CUMEM
|
| 424 |
+
t-20260518224737-tftgw-worker-1:10452:11191 [7] NCCL INFO Channel 13/0 : 15[7] -> 14[6] via P2P/CUMEM
|
| 425 |
+
t-20260518224737-tftgw-worker-1:10450:11190 [5] NCCL INFO Channel 11/0 : 13[5] -> 12[4] via P2P/CUMEM
|
| 426 |
+
t-20260518224737-tftgw-worker-1:10449:11184 [4] NCCL INFO Channel 15/0 : 12[4] -> 11[3] via P2P/CUMEM
|
| 427 |
+
t-20260518224737-tftgw-worker-1:10448:11189 [3] NCCL INFO Channel 13/0 : 11[3] -> 10[2] via P2P/CUMEM
|
| 428 |
+
t-20260518224737-tftgw-worker-1:10446:11187 [1] NCCL INFO Channel 15/0 : 9[1] -> 8[0] via P2P/CUMEM
|
| 429 |
+
t-20260518224737-tftgw-worker-1:10450:11190 [5] NCCL INFO Channel 15/0 : 13[5] -> 12[4] via P2P/CUMEM
|
| 430 |
+
t-20260518224737-tftgw-worker-1:10448:11189 [3] NCCL INFO Channel 15/0 : 11[3] -> 10[2] via P2P/CUMEM
|
| 431 |
+
t-20260518224737-tftgw-worker-1:10447:11172 [2] NCCL INFO NCCL_IB_GID_INDEX set by environment to 7.
|
| 432 |
+
t-20260518224737-tftgw-worker-1:10447:11172 [2] NCCL INFO NCCL_IB_TIMEOUT set by environment to 23.
|
| 433 |
+
t-20260518224737-tftgw-worker-1:10447:11172 [2] NCCL INFO NCCL_IB_RETRY_CNT set by environment to 7.
|
| 434 |
+
t-20260518224737-tftgw-worker-1:10449:11177 [4] NCCL INFO NCCL_IB_GID_INDEX set by environment to 7.
|
| 435 |
+
t-20260518224737-tftgw-worker-1:10451:11171 [6] NCCL INFO NCCL_IB_GID_INDEX set by environment to 7.
|
| 436 |
+
t-20260518224737-tftgw-worker-1:10451:11171 [6] NCCL INFO NCCL_IB_TIMEOUT set by environment to 23.
|
| 437 |
+
t-20260518224737-tftgw-worker-1:10451:11171 [6] NCCL INFO NCCL_IB_RETRY_CNT set by environment to 7.
|
| 438 |
+
t-20260518224737-tftgw-worker-1:10449:11177 [4] NCCL INFO NCCL_IB_TIMEOUT set by environment to 23.
|
| 439 |
+
t-20260518224737-tftgw-worker-1:10449:11177 [4] NCCL INFO NCCL_IB_RETRY_CNT set by environment to 7.
|
| 440 |
+
t-20260518224737-tftgw-worker-1:10452:11170 [7] NCCL INFO NCCL_IB_GID_INDEX set by environment to 7.
|
| 441 |
+
t-20260518224737-tftgw-worker-1:10446:11173 [1] NCCL INFO NCCL_IB_GID_INDEX set by environment to 7.
|
| 442 |
+
t-20260518224737-tftgw-worker-1:10452:11170 [7] NCCL INFO NCCL_IB_TIMEOUT set by environment to 23.
|
| 443 |
+
t-20260518224737-tftgw-worker-1:10452:11170 [7] NCCL INFO NCCL_IB_RETRY_CNT set by environment to 7.
|
| 444 |
+
t-20260518224737-tftgw-worker-1:10448:11175 [3] NCCL INFO NCCL_IB_GID_INDEX set by environment to 7.
|
| 445 |
+
t-20260518224737-tftgw-worker-1:10446:11173 [1] NCCL INFO NCCL_IB_TIMEOUT set by environment to 23.
|
| 446 |
+
t-20260518224737-tftgw-worker-1:10446:11173 [1] NCCL INFO NCCL_IB_RETRY_CNT set by environment to 7.
|
| 447 |
+
t-20260518224737-tftgw-worker-1:10448:11175 [3] NCCL INFO NCCL_IB_TIMEOUT set by environment to 23.
|
| 448 |
+
t-20260518224737-tftgw-worker-1:10448:11175 [3] NCCL INFO NCCL_IB_RETRY_CNT set by environment to 7.
|
| 449 |
+
t-20260518224737-tftgw-worker-1:10450:11168 [5] NCCL INFO NCCL_IB_GID_INDEX set by environment to 7.
|
| 450 |
+
t-20260518224737-tftgw-worker-1:10450:11168 [5] NCCL INFO NCCL_IB_TIMEOUT set by environment to 23.
|
| 451 |
+
t-20260518224737-tftgw-worker-1:10450:11168 [5] NCCL INFO NCCL_IB_RETRY_CNT set by environment to 7.
|
| 452 |
+
t-20260518224737-tftgw-worker-1:10445:11174 [0] NCCL INFO NCCL_IB_GID_INDEX set by environment to 7.
|
| 453 |
+
t-20260518224737-tftgw-worker-1:10445:11174 [0] NCCL INFO NCCL_IB_TIMEOUT set by environment to 23.
|
| 454 |
+
t-20260518224737-tftgw-worker-1:10445:11174 [0] NCCL INFO NCCL_IB_RETRY_CNT set by environment to 7.
|
| 455 |
+
t-20260518224737-tftgw-worker-1:10451:11186 [6] NCCL INFO Connected all rings, use ring PXN 0 GDR 1
|
| 456 |
+
t-20260518224737-tftgw-worker-1:10452:11191 [7] NCCL INFO Connected all rings, use ring PXN 0 GDR 1
|
| 457 |
+
t-20260518224737-tftgw-worker-1:10448:11189 [3] NCCL INFO Connected all rings, use ring PXN 0 GDR 1
|
| 458 |
+
t-20260518224737-tftgw-worker-1:10450:11190 [5] NCCL INFO Connected all rings, use ring PXN 0 GDR 1
|
| 459 |
+
t-20260518224737-tftgw-worker-1:10449:11184 [4] NCCL INFO Connected all rings, use ring PXN 0 GDR 1
|
| 460 |
+
t-20260518224737-tftgw-worker-1:10445:11188 [0] NCCL INFO Connected all rings, use ring PXN 0 GDR 1
|
| 461 |
+
t-20260518224737-tftgw-worker-1:10447:11185 [2] NCCL INFO Connected all rings, use ring PXN 0 GDR 1
|
| 462 |
+
t-20260518224737-tftgw-worker-1:10446:11187 [1] NCCL INFO Connected all rings, use ring PXN 0 GDR 1
|
| 463 |
+
t-20260518224737-tftgw-worker-1:10450:11577 [5] NCCL INFO NVLS comm 0xd90a1d0 headRank 5 nHeads 8 buffSize 1048576 nvlsPerRankSize 33554432 nvlsTotalSize 268435456
|
| 464 |
+
t-20260518224737-tftgw-worker-1:10451:11578 [6] NCCL INFO NVLS comm 0xf242a00 headRank 6 nHeads 8 buffSize 1048576 nvlsPerRankSize 33554432 nvlsTotalSize 268435456
|
| 465 |
+
t-20260518224737-tftgw-worker-1:10452:11579 [7] NCCL INFO NVLS comm 0xed06ec0 headRank 7 nHeads 8 buffSize 1048576 nvlsPerRankSize 33554432 nvlsTotalSize 268435456
|
| 466 |
+
t-20260518224737-tftgw-worker-1:10445:11580 [0] NCCL INFO NVLS comm 0x7f6eab060080 headRank 0 nHeads 8 buffSize 1048576 nvlsPerRankSize 33554432 nvlsTotalSize 268435456
|
| 467 |
+
t-20260518224737-tftgw-worker-1:10449:11581 [4] NCCL INFO NVLS comm 0xda93a60 headRank 4 nHeads 8 buffSize 1048576 nvlsPerRankSize 33554432 nvlsTotalSize 268435456
|
| 468 |
+
t-20260518224737-tftgw-worker-1:10448:11582 [3] NCCL INFO NVLS comm 0xe08a1f0 headRank 3 nHeads 8 buffSize 1048576 nvlsPerRankSize 33554432 nvlsTotalSize 268435456
|
| 469 |
+
t-20260518224737-tftgw-worker-1:10447:11584 [2] NCCL INFO NVLS comm 0xedfad70 headRank 2 nHeads 8 buffSize 1048576 nvlsPerRankSize 33554432 nvlsTotalSize 268435456
|
| 470 |
+
t-20260518224737-tftgw-worker-1:10446:11583 [1] NCCL INFO NVLS comm 0xf150bb0 headRank 1 nHeads 8 buffSize 1048576 nvlsPerRankSize 33554432 nvlsTotalSize 268435456
|
| 471 |
+
t-20260518224737-tftgw-worker-1:10447:11584 [2] NCCL INFO Channel 00/0 : 2[2] -> 10[2] [receive] via NET/IBext_v9/10/GDRDMA
|
| 472 |
+
t-20260518224737-tftgw-worker-1:10451:11578 [6] NCCL INFO Channel 00/0 : 6[6] -> 14[6] [receive] via NET/IBext_v9/14/GDRDMA
|
| 473 |
+
t-20260518224737-tftgw-worker-1:10449:11581 [4] NCCL INFO Channel 00/0 : 4[4] -> 12[4] [receive] via NET/IBext_v9/12/GDRDMA
|
| 474 |
+
t-20260518224737-tftgw-worker-1:10446:11583 [1] NCCL INFO Channel 01/0 : 1[1] -> 9[1] [receive] via NET/IBext_v9/9/GDRDMA
|
| 475 |
+
t-20260518224737-tftgw-worker-1:10445:11580 [0] NCCL INFO Channel 00/0 : 0[0] -> 8[0] [receive] via NET/IBext_v9/8/GDRDMA
|
| 476 |
+
t-20260518224737-tftgw-worker-1:10450:11577 [5] NCCL INFO Channel 00/0 : 5[5] -> 13[5] [receive] via NET/IBext_v9/13/GDRDMA
|
| 477 |
+
t-20260518224737-tftgw-worker-1:10447:11584 [2] NCCL INFO Channel 01/0 : 2[2] -> 10[2] [receive] via NET/IBext_v9/10/GDRDMA
|
| 478 |
+
t-20260518224737-tftgw-worker-1:10448:11582 [3] NCCL INFO Channel 00/0 : 3[3] -> 11[3] [receive] via NET/IBext_v9/11/GDRDMA
|
| 479 |
+
t-20260518224737-tftgw-worker-1:10451:11578 [6] NCCL INFO Channel 01/0 : 6[6] -> 14[6] [receive] via NET/IBext_v9/14/GDRDMA
|
| 480 |
+
t-20260518224737-tftgw-worker-1:10449:11581 [4] NCCL INFO Channel 01/0 : 4[4] -> 12[4] [receive] via NET/IBext_v9/12/GDRDMA
|
| 481 |
+
t-20260518224737-tftgw-worker-1:10445:11580 [0] NCCL INFO Channel 02/0 : 0[0] -> 8[0] [receive] via NET/IBext_v9/8/GDRDMA
|
| 482 |
+
t-20260518224737-tftgw-worker-1:10452:11579 [7] NCCL INFO Channel 00/0 : 7[7] -> 15[7] [receive] via NET/IBext_v9/15/GDRDMA
|
| 483 |
+
t-20260518224737-tftgw-worker-1:10446:11583 [1] NCCL INFO Channel 02/0 : 1[1] -> 9[1] [receive] via NET/IBext_v9/9/GDRDMA
|
| 484 |
+
t-20260518224737-tftgw-worker-1:10447:11584 [2] NCCL INFO Channel 02/0 : 2[2] -> 10[2] [receive] via NET/IBext_v9/10/GDRDMA
|
| 485 |
+
t-20260518224737-tftgw-worker-1:10451:11578 [6] NCCL INFO Channel 02/0 : 6[6] -> 14[6] [receive] via NET/IBext_v9/14/GDRDMA
|
| 486 |
+
t-20260518224737-tftgw-worker-1:10450:11577 [5] NCCL INFO Channel 01/0 : 5[5] -> 13[5] [receive] via NET/IBext_v9/13/GDRDMA
|
| 487 |
+
t-20260518224737-tftgw-worker-1:10449:11581 [4] NCCL INFO Channel 02/0 : 4[4] -> 12[4] [receive] via NET/IBext_v9/12/GDRDMA
|
| 488 |
+
t-20260518224737-tftgw-worker-1:10448:11582 [3] NCCL INFO Channel 01/0 : 3[3] -> 11[3] [receive] via NET/IBext_v9/11/GDRDMA
|
| 489 |
+
t-20260518224737-tftgw-worker-1:10445:11580 [0] NCCL INFO Channel 03/0 : 0[0] -> 8[0] [receive] via NET/IBext_v9/8/GDRDMA
|
| 490 |
+
t-20260518224737-tftgw-worker-1:10446:11583 [1] NCCL INFO Channel 03/0 : 1[1] -> 9[1] [receive] via NET/IBext_v9/9/GDRDMA
|
| 491 |
+
t-20260518224737-tftgw-worker-1:10447:11584 [2] NCCL INFO Channel 04/0 : 2[2] -> 10[2] [receive] via NET/IBext_v9/10/GDRDMA
|
| 492 |
+
t-20260518224737-tftgw-worker-1:10452:11579 [7] NCCL INFO Channel 01/0 : 7[7] -> 15[7] [receive] via NET/IBext_v9/15/GDRDMA
|
| 493 |
+
t-20260518224737-tftgw-worker-1:10451:11578 [6] NCCL INFO Channel 03/0 : 6[6] -> 14[6] [receive] via NET/IBext_v9/14/GDRDMA
|
| 494 |
+
t-20260518224737-tftgw-worker-1:10445:11580 [0] NCCL INFO Channel 04/0 : 0[0] -> 8[0] [receive] via NET/IBext_v9/8/GDRDMA
|
| 495 |
+
t-20260518224737-tftgw-worker-1:10449:11581 [4] NCCL INFO Channel 03/0 : 4[4] -> 12[4] [receive] via NET/IBext_v9/12/GDRDMA
|
| 496 |
+
t-20260518224737-tftgw-worker-1:10448:11582 [3] NCCL INFO Channel 03/0 : 3[3] -> 11[3] [receive] via NET/IBext_v9/11/GDRDMA
|
| 497 |
+
t-20260518224737-tftgw-worker-1:10446:11583 [1] NCCL INFO Channel 04/0 : 1[1] -> 9[1] [receive] via NET/IBext_v9/9/GDRDMA
|
| 498 |
+
t-20260518224737-tftgw-worker-1:10450:11577 [5] NCCL INFO Channel 02/0 : 5[5] -> 13[5] [receive] via NET/IBext_v9/13/GDRDMA
|
| 499 |
+
t-20260518224737-tftgw-worker-1:10447:11584 [2] NCCL INFO Channel 05/0 : 2[2] -> 10[2] [receive] via NET/IBext_v9/10/GDRDMA
|
| 500 |
+
t-20260518224737-tftgw-worker-1:10451:11578 [6] NCCL INFO Channel 04/0 : 6[6] -> 14[6] [receive] via NET/IBext_v9/14/GDRDMA
|
| 501 |
+
t-20260518224737-tftgw-worker-1:10445:11580 [0] NCCL INFO Channel 05/0 : 0[0] -> 8[0] [receive] via NET/IBext_v9/8/GDRDMA
|
| 502 |
+
t-20260518224737-tftgw-worker-1:10446:11583 [1] NCCL INFO Channel 05/0 : 1[1] -> 9[1] [receive] via NET/IBext_v9/9/GDRDMA
|
| 503 |
+
t-20260518224737-tftgw-worker-1:10448:11582 [3] NCCL INFO Channel 04/0 : 3[3] -> 11[3] [receive] via NET/IBext_v9/11/GDRDMA
|
| 504 |
+
t-20260518224737-tftgw-worker-1:10447:11584 [2] NCCL INFO Channel 06/0 : 2[2] -> 10[2] [receive] via NET/IBext_v9/10/GDRDMA
|
| 505 |
+
t-20260518224737-tftgw-worker-1:10452:11579 [7] NCCL INFO Channel 02/0 : 7[7] -> 15[7] [receive] via NET/IBext_v9/15/GDRDMA
|
| 506 |
+
t-20260518224737-tftgw-worker-1:10449:11581 [4] NCCL INFO Channel 04/0 : 4[4] -> 12[4] [receive] via NET/IBext_v9/12/GDRDMA
|
| 507 |
+
t-20260518224737-tftgw-worker-1:10445:11580 [0] NCCL INFO Channel 06/0 : 0[0] -> 8[0] [receive] via NET/IBext_v9/8/GDRDMA
|
| 508 |
+
t-20260518224737-tftgw-worker-1:10451:11578 [6] NCCL INFO Channel 05/0 : 6[6] -> 14[6] [receive] via NET/IBext_v9/14/GDRDMA
|
| 509 |
+
t-20260518224737-tftgw-worker-1:10446:11583 [1] NCCL INFO Channel 06/0 : 1[1] -> 9[1] [receive] via NET/IBext_v9/9/GDRDMA
|
| 510 |
+
t-20260518224737-tftgw-worker-1:10450:11577 [5] NCCL INFO Channel 03/0 : 5[5] -> 13[5] [receive] via NET/IBext_v9/13/GDRDMA
|
| 511 |
+
t-20260518224737-tftgw-worker-1:10448:11582 [3] NCCL INFO Channel 05/0 : 3[3] -> 11[3] [receive] via NET/IBext_v9/11/GDRDMA
|
| 512 |
+
t-20260518224737-tftgw-worker-1:10447:11584 [2] NCCL INFO Channel 07/0 : 2[2] -> 10[2] [receive] via NET/IBext_v9/10/GDRDMA
|
| 513 |
+
t-20260518224737-tftgw-worker-1:10449:11581 [4] NCCL INFO Channel 06/0 : 4[4] -> 12[4] [receive] via NET/IBext_v9/12/GDRDMA
|
| 514 |
+
t-20260518224737-tftgw-worker-1:10452:11579 [7] NCCL INFO Channel 03/0 : 7[7] -> 15[7] [receive] via NET/IBext_v9/15/GDRDMA
|
| 515 |
+
t-20260518224737-tftgw-worker-1:10446:11583 [1] NCCL INFO Channel 07/0 : 1[1] -> 9[1] [receive] via NET/IBext_v9/9/GDRDMA
|
| 516 |
+
t-20260518224737-tftgw-worker-1:10445:11580 [0] NCCL INFO Channel 07/0 : 0[0] -> 8[0] [receive] via NET/IBext_v9/8/GDRDMA
|
| 517 |
+
t-20260518224737-tftgw-worker-1:10451:11578 [6] NCCL INFO Channel 06/0 : 6[6] -> 14[6] [receive] via NET/IBext_v9/14/GDRDMA
|
| 518 |
+
t-20260518224737-tftgw-worker-1:10447:11584 [2] NCCL INFO Channel 08/0 : 2[2] -> 10[2] [receive] via NET/IBext_v9/10/GDRDMA
|
| 519 |
+
t-20260518224737-tftgw-worker-1:10450:11577 [5] NCCL INFO Channel 05/0 : 5[5] -> 13[5] [receive] via NET/IBext_v9/13/GDRDMA
|
| 520 |
+
t-20260518224737-tftgw-worker-1:10448:11582 [3] NCCL INFO Channel 06/0 : 3[3] -> 11[3] [receive] via NET/IBext_v9/11/GDRDMA
|
| 521 |
+
t-20260518224737-tftgw-worker-1:10449:11581 [4] NCCL INFO Channel 07/0 : 4[4] -> 12[4] [receive] via NET/IBext_v9/12/GDRDMA
|
| 522 |
+
t-20260518224737-tftgw-worker-1:10452:11579 [7] NCCL INFO Channel 04/0 : 7[7] -> 15[7] [receive] via NET/IBext_v9/15/GDRDMA
|
| 523 |
+
t-20260518224737-tftgw-worker-1:10445:11580 [0] NCCL INFO Channel 08/0 : 0[0] -> 8[0] [receive] via NET/IBext_v9/8/GDRDMA
|
| 524 |
+
t-20260518224737-tftgw-worker-1:10446:11583 [1] NCCL INFO Channel 09/0 : 1[1] -> 9[1] [receive] via NET/IBext_v9/9/GDRDMA
|
| 525 |
+
t-20260518224737-tftgw-worker-1:10451:11578 [6] NCCL INFO Channel 08/0 : 6[6] -> 14[6] [receive] via NET/IBext_v9/14/GDRDMA
|
| 526 |
+
t-20260518224737-tftgw-worker-1:10448:11582 [3] NCCL INFO Channel 07/0 : 3[3] -> 11[3] [receive] via NET/IBext_v9/11/GDRDMA
|
| 527 |
+
t-20260518224737-tftgw-worker-1:10447:11584 [2] NCCL INFO Channel 09/0 : 2[2] -> 10[2] [receive] via NET/IBext_v9/10/GDRDMA
|
| 528 |
+
t-20260518224737-tftgw-worker-1:10450:11577 [5] NCCL INFO Channel 06/0 : 5[5] -> 13[5] [receive] via NET/IBext_v9/13/GDRDMA
|
| 529 |
+
t-20260518224737-tftgw-worker-1:10449:11581 [4] NCCL INFO Channel 08/0 : 4[4] -> 12[4] [receive] via NET/IBext_v9/12/GDRDMA
|
| 530 |
+
t-20260518224737-tftgw-worker-1:10445:11580 [0] NCCL INFO Channel 10/0 : 0[0] -> 8[0] [receive] via NET/IBext_v9/8/GDRDMA
|
| 531 |
+
t-20260518224737-tftgw-worker-1:10446:11583 [1] NCCL INFO Channel 10/0 : 1[1] -> 9[1] [receive] via NET/IBext_v9/9/GDRDMA
|
| 532 |
+
t-20260518224737-tftgw-worker-1:10452:11579 [7] NCCL INFO Channel 05/0 : 7[7] -> 15[7] [receive] via NET/IBext_v9/15/GDRDMA
|
| 533 |
+
t-20260518224737-tftgw-worker-1:10451:11578 [6] NCCL INFO Channel 09/0 : 6[6] -> 14[6] [receive] via NET/IBext_v9/14/GDRDMA
|
| 534 |
+
t-20260518224737-tftgw-worker-1:10448:11582 [3] NCCL INFO Channel 08/0 : 3[3] -> 11[3] [receive] via NET/IBext_v9/11/GDRDMA
|
| 535 |
+
t-20260518224737-tftgw-worker-1:10447:11584 [2] NCCL INFO Channel 10/0 : 2[2] -> 10[2] [receive] via NET/IBext_v9/10/GDRDMA
|
| 536 |
+
t-20260518224737-tftgw-worker-1:10450:11577 [5] NCCL INFO Channel 07/0 : 5[5] -> 13[5] [receive] via NET/IBext_v9/13/GDRDMA
|
| 537 |
+
t-20260518224737-tftgw-worker-1:10449:11581 [4] NCCL INFO Channel 09/0 : 4[4] -> 12[4] [receive] via NET/IBext_v9/12/GDRDMA
|
| 538 |
+
t-20260518224737-tftgw-worker-1:10445:11580 [0] NCCL INFO Channel 11/0 : 0[0] -> 8[0] [receive] via NET/IBext_v9/8/GDRDMA
|
| 539 |
+
t-20260518224737-tftgw-worker-1:10446:11583 [1] NCCL INFO Channel 11/0 : 1[1] -> 9[1] [receive] via NET/IBext_v9/9/GDRDMA
|
| 540 |
+
t-20260518224737-tftgw-worker-1:10452:11579 [7] NCCL INFO Channel 07/0 : 7[7] -> 15[7] [receive] via NET/IBext_v9/15/GDRDMA
|
| 541 |
+
t-20260518224737-tftgw-worker-1:10451:11578 [6] NCCL INFO Channel 10/0 : 6[6] -> 14[6] [receive] via NET/IBext_v9/14/GDRDMA
|
| 542 |
+
t-20260518224737-tftgw-worker-1:10447:11584 [2] NCCL INFO Channel 12/0 : 2[2] -> 10[2] [receive] via NET/IBext_v9/10/GDRDMA
|
| 543 |
+
t-20260518224737-tftgw-worker-1:10448:11582 [3] NCCL INFO Channel 09/0 : 3[3] -> 11[3] [receive] via NET/IBext_v9/11/GDRDMA
|
| 544 |
+
t-20260518224737-tftgw-worker-1:10450:11577 [5] NCCL INFO Channel 08/0 : 5[5] -> 13[5] [receive] via NET/IBext_v9/13/GDRDMA
|
| 545 |
+
t-20260518224737-tftgw-worker-1:10445:11580 [0] NCCL INFO Channel 12/0 : 0[0] -> 8[0] [receive] via NET/IBext_v9/8/GDRDMA
|
| 546 |
+
t-20260518224737-tftgw-worker-1:10449:11581 [4] NCCL INFO Channel 10/0 : 4[4] -> 12[4] [receive] via NET/IBext_v9/12/GDRDMA
|
| 547 |
+
t-20260518224737-tftgw-worker-1:10446:11583 [1] NCCL INFO Channel 12/0 : 1[1] -> 9[1] [receive] via NET/IBext_v9/9/GDRDMA
|
| 548 |
+
t-20260518224737-tftgw-worker-1:10448:11582 [3] NCCL INFO Channel 11/0 : 3[3] -> 11[3] [receive] via NET/IBext_v9/11/GDRDMA
|
| 549 |
+
t-20260518224737-tftgw-worker-1:10452:11579 [7] NCCL INFO Channel 08/0 : 7[7] -> 15[7] [receive] via NET/IBext_v9/15/GDRDMA
|
| 550 |
+
t-20260518224737-tftgw-worker-1:10451:11578 [6] NCCL INFO Channel 11/0 : 6[6] -> 14[6] [receive] via NET/IBext_v9/14/GDRDMA
|
| 551 |
+
t-20260518224737-tftgw-worker-1:10447:11584 [2] NCCL INFO Channel 13/0 : 2[2] -> 10[2] [receive] via NET/IBext_v9/10/GDRDMA
|
| 552 |
+
t-20260518224737-tftgw-worker-1:10449:11581 [4] NCCL INFO Channel 11/0 : 4[4] -> 12[4] [receive] via NET/IBext_v9/12/GDRDMA
|
| 553 |
+
t-20260518224737-tftgw-worker-1:10450:11577 [5] NCCL INFO Channel 09/0 : 5[5] -> 13[5] [receive] via NET/IBext_v9/13/GDRDMA
|
| 554 |
+
t-20260518224737-tftgw-worker-1:10445:11580 [0] NCCL INFO Channel 13/0 : 0[0] -> 8[0] [receive] via NET/IBext_v9/8/GDRDMA
|
| 555 |
+
t-20260518224737-tftgw-worker-1:10446:11583 [1] NCCL INFO Channel 13/0 : 1[1] -> 9[1] [receive] via NET/IBext_v9/9/GDRDMA
|
| 556 |
+
t-20260518224737-tftgw-worker-1:10448:11582 [3] NCCL INFO Channel 12/0 : 3[3] -> 11[3] [receive] via NET/IBext_v9/11/GDRDMA
|
| 557 |
+
t-20260518224737-tftgw-worker-1:10447:11584 [2] NCCL INFO Channel 14/0 : 2[2] -> 10[2] [receive] via NET/IBext_v9/10/GDRDMA
|
| 558 |
+
t-20260518224737-tftgw-worker-1:10451:11578 [6] NCCL INFO Channel 12/0 : 6[6] -> 14[6] [receive] via NET/IBext_v9/14/GDRDMA
|
| 559 |
+
t-20260518224737-tftgw-worker-1:10449:11581 [4] NCCL INFO Channel 12/0 : 4[4] -> 12[4] [receive] via NET/IBext_v9/12/GDRDMA
|
| 560 |
+
t-20260518224737-tftgw-worker-1:10452:11579 [7] NCCL INFO Channel 09/0 : 7[7] -> 15[7] [receive] via NET/IBext_v9/15/GDRDMA
|
| 561 |
+
t-20260518224737-tftgw-worker-1:10446:11583 [1] NCCL INFO Channel 14/0 : 1[1] -> 9[1] [receive] via NET/IBext_v9/9/GDRDMA
|
| 562 |
+
t-20260518224737-tftgw-worker-1:10445:11580 [0] NCCL INFO Channel 14/0 : 0[0] -> 8[0] [receive] via NET/IBext_v9/8/GDRDMA
|
| 563 |
+
t-20260518224737-tftgw-worker-1:10450:11577 [5] NCCL INFO Channel 10/0 : 5[5] -> 13[5] [receive] via NET/IBext_v9/13/GDRDMA
|
| 564 |
+
t-20260518224737-tftgw-worker-1:10448:11582 [3] NCCL INFO Channel 13/0 : 3[3] -> 11[3] [receive] via NET/IBext_v9/11/GDRDMA
|
| 565 |
+
t-20260518224737-tftgw-worker-1:10447:11584 [2] NCCL INFO Channel 15/0 : 2[2] -> 10[2] [receive] via NET/IBext_v9/10/GDRDMA
|
| 566 |
+
t-20260518224737-tftgw-worker-1:10451:11578 [6] NCCL INFO Channel 13/0 : 6[6] -> 14[6] [receive] via NET/IBext_v9/14/GDRDMA
|
| 567 |
+
t-20260518224737-tftgw-worker-1:10446:11583 [1] NCCL INFO Channel 15/0 : 1[1] -> 9[1] [receive] via NET/IBext_v9/9/GDRDMA
|
| 568 |
+
t-20260518224737-tftgw-worker-1:10449:11581 [4] NCCL INFO Channel 14/0 : 4[4] -> 12[4] [receive] via NET/IBext_v9/12/GDRDMA
|
| 569 |
+
t-20260518224737-tftgw-worker-1:10445:11580 [0] NCCL INFO Channel 15/0 : 0[0] -> 8[0] [receive] via NET/IBext_v9/8/GDRDMA
|
| 570 |
+
t-20260518224737-tftgw-worker-1:10452:11579 [7] NCCL INFO Channel 10/0 : 7[7] -> 15[7] [receive] via NET/IBext_v9/15/GDRDMA
|
| 571 |
+
t-20260518224737-tftgw-worker-1:10450:11577 [5] NCCL INFO Channel 11/0 : 5[5] -> 13[5] [receive] via NET/IBext_v9/13/GDRDMA
|
| 572 |
+
t-20260518224737-tftgw-worker-1:10447:11584 [2] NCCL INFO Channel 00/0 : 10[2] -> 2[2] [send] via NET/IBext_v9/10/GDRDMA
|
| 573 |
+
t-20260518224737-tftgw-worker-1:10448:11582 [3] NCCL INFO Channel 14/0 : 3[3] -> 11[3] [receive] via NET/IBext_v9/11/GDRDMA
|
| 574 |
+
t-20260518224737-tftgw-worker-1:10451:11578 [6] NCCL INFO Channel 14/0 : 6[6] -> 14[6] [receive] via NET/IBext_v9/14/GDRDMA
|
| 575 |
+
t-20260518224737-tftgw-worker-1:10445:11580 [0] NCCL INFO Channel 01/0 : 8[0] -> 0[0] [send] via NET/IBext_v9/8/GDRDMA
|
| 576 |
+
t-20260518224737-tftgw-worker-1:10449:11581 [4] NCCL INFO Channel 15/0 : 4[4] -> 12[4] [receive] via NET/IBext_v9/12/GDRDMA
|
| 577 |
+
t-20260518224737-tftgw-worker-1:10446:11583 [1] NCCL INFO Channel 00/0 : 9[1] -> 1[1] [send] via NET/IBext_v9/9/GDRDMA
|
| 578 |
+
t-20260518224737-tftgw-worker-1:10447:11584 [2] NCCL INFO Channel 01/0 : 10[2] -> 2[2] [send] via NET/IBext_v9/10/GDRDMA
|
| 579 |
+
t-20260518224737-tftgw-worker-1:10448:11582 [3] NCCL INFO Channel 15/0 : 3[3] -> 11[3] [receive] via NET/IBext_v9/11/GDRDMA
|
| 580 |
+
t-20260518224737-tftgw-worker-1:10452:11579 [7] NCCL INFO Channel 11/0 : 7[7] -> 15[7] [receive] via NET/IBext_v9/15/GDRDMA
|
| 581 |
+
t-20260518224737-tftgw-worker-1:10450:11577 [5] NCCL INFO Channel 13/0 : 5[5] -> 13[5] [receive] via NET/IBext_v9/13/GDRDMA
|
| 582 |
+
t-20260518224737-tftgw-worker-1:10445:11580 [0] NCCL INFO Channel 02/0 : 8[0] -> 0[0] [send] via NET/IBext_v9/8/GDRDMA
|
| 583 |
+
t-20260518224737-tftgw-worker-1:10449:11581 [4] NCCL INFO Channel 00/0 : 12[4] -> 4[4] [send] via NET/IBext_v9/12/GDRDMA
|
| 584 |
+
t-20260518224737-tftgw-worker-1:10446:11583 [1] NCCL INFO Channel 02/0 : 9[1] -> 1[1] [send] via NET/IBext_v9/9/GDRDMA
|
| 585 |
+
t-20260518224737-tftgw-worker-1:10451:11578 [6] NCCL INFO Channel 00/0 : 14[6] -> 6[6] [send] via NET/IBext_v9/14/GDRDMA
|
| 586 |
+
t-20260518224737-tftgw-worker-1:10447:11584 [2] NCCL INFO Channel 03/0 : 10[2] -> 2[2] [send] via NET/IBext_v9/10/GDRDMA
|
| 587 |
+
t-20260518224737-tftgw-worker-1:10448:11582 [3] NCCL INFO Channel 00/0 : 11[3] -> 3[3] [send] via NET/IBext_v9/11/GDRDMA
|
| 588 |
+
t-20260518224737-tftgw-worker-1:10445:11580 [0] NCCL INFO Channel 03/0 : 8[0] -> 0[0] [send] via NET/IBext_v9/8/GDRDMA
|
| 589 |
+
t-20260518224737-tftgw-worker-1:10452:11579 [7] NCCL INFO Channel 12/0 : 7[7] -> 15[7] [receive] via NET/IBext_v9/15/GDRDMA
|
| 590 |
+
t-20260518224737-tftgw-worker-1:10449:11581 [4] NCCL INFO Channel 01/0 : 12[4] -> 4[4] [send] via NET/IBext_v9/12/GDRDMA
|
| 591 |
+
t-20260518224737-tftgw-worker-1:10446:11583 [1] NCCL INFO Channel 03/0 : 9[1] -> 1[1] [send] via NET/IBext_v9/9/GDRDMA
|
| 592 |
+
t-20260518224737-tftgw-worker-1:10450:11577 [5] NCCL INFO Channel 14/0 : 5[5] -> 13[5] [receive] via NET/IBext_v9/13/GDRDMA
|
| 593 |
+
t-20260518224737-tftgw-worker-1:10451:11578 [6] NCCL INFO Channel 01/0 : 14[6] -> 6[6] [send] via NET/IBext_v9/14/GDRDMA
|
| 594 |
+
t-20260518224737-tftgw-worker-1:10447:11584 [2] NCCL INFO Channel 04/0 : 10[2] -> 2[2] [send] via NET/IBext_v9/10/GDRDMA
|
| 595 |
+
t-20260518224737-tftgw-worker-1:10448:11582 [3] NCCL INFO Channel 01/0 : 11[3] -> 3[3] [send] via NET/IBext_v9/11/GDRDMA
|
| 596 |
+
t-20260518224737-tftgw-worker-1:10445:11580 [0] NCCL INFO Channel 04/0 : 8[0] -> 0[0] [send] via NET/IBext_v9/8/GDRDMA
|
| 597 |
+
t-20260518224737-tftgw-worker-1:10446:11583 [1] NCCL INFO Channel 04/0 : 9[1] -> 1[1] [send] via NET/IBext_v9/9/GDRDMA
|
| 598 |
+
t-20260518224737-tftgw-worker-1:10449:11581 [4] NCCL INFO Channel 02/0 : 12[4] -> 4[4] [send] via NET/IBext_v9/12/GDRDMA
|
| 599 |
+
t-20260518224737-tftgw-worker-1:10447:11584 [2] NCCL INFO Channel 05/0 : 10[2] -> 2[2] [send] via NET/IBext_v9/10/GDRDMA
|
| 600 |
+
t-20260518224737-tftgw-worker-1:10451:11578 [6] NCCL INFO Channel 02/0 : 14[6] -> 6[6] [send] via NET/IBext_v9/14/GDRDMA
|
| 601 |
+
t-20260518224737-tftgw-worker-1:10452:11579 [7] NCCL INFO Channel 13/0 : 7[7] -> 15[7] [receive] via NET/IBext_v9/15/GDRDMA
|
| 602 |
+
t-20260518224737-tftgw-worker-1:10450:11577 [5] NCCL INFO Channel 15/0 : 5[5] -> 13[5] [receive] via NET/IBext_v9/13/GDRDMA
|
| 603 |
+
t-20260518224737-tftgw-worker-1:10448:11582 [3] NCCL INFO Channel 02/0 : 11[3] -> 3[3] [send] via NET/IBext_v9/11/GDRDMA
|
| 604 |
+
t-20260518224737-tftgw-worker-1:10445:11580 [0] NCCL INFO Channel 05/0 : 8[0] -> 0[0] [send] via NET/IBext_v9/8/GDRDMA
|
| 605 |
+
t-20260518224737-tftgw-worker-1:10446:11583 [1] NCCL INFO Channel 05/0 : 9[1] -> 1[1] [send] via NET/IBext_v9/9/GDRDMA
|
| 606 |
+
t-20260518224737-tftgw-worker-1:10449:11581 [4] NCCL INFO Channel 03/0 : 12[4] -> 4[4] [send] via NET/IBext_v9/12/GDRDMA
|
| 607 |
+
t-20260518224737-tftgw-worker-1:10447:11584 [2] NCCL INFO Channel 06/0 : 10[2] -> 2[2] [send] via NET/IBext_v9/10/GDRDMA
|
| 608 |
+
t-20260518224737-tftgw-worker-1:10451:11578 [6] NCCL INFO Channel 03/0 : 14[6] -> 6[6] [send] via NET/IBext_v9/14/GDRDMA
|
| 609 |
+
t-20260518224737-tftgw-worker-1:10448:11582 [3] NCCL INFO Channel 04/0 : 11[3] -> 3[3] [send] via NET/IBext_v9/11/GDRDMA
|
| 610 |
+
t-20260518224737-tftgw-worker-1:10450:11577 [5] NCCL INFO Channel 00/0 : 13[5] -> 5[5] [send] via NET/IBext_v9/13/GDRDMA
|
| 611 |
+
t-20260518224737-tftgw-worker-1:10452:11579 [7] NCCL INFO Channel 15/0 : 7[7] -> 15[7] [receive] via NET/IBext_v9/15/GDRDMA
|
| 612 |
+
t-20260518224737-tftgw-worker-1:10446:11583 [1] NCCL INFO Channel 06/0 : 9[1] -> 1[1] [send] via NET/IBext_v9/9/GDRDMA
|
| 613 |
+
t-20260518224737-tftgw-worker-1:10445:11580 [0] NCCL INFO Channel 06/0 : 8[0] -> 0[0] [send] via NET/IBext_v9/8/GDRDMA
|
| 614 |
+
t-20260518224737-tftgw-worker-1:10449:11581 [4] NCCL INFO Channel 05/0 : 12[4] -> 4[4] [send] via NET/IBext_v9/12/GDRDMA
|
| 615 |
+
t-20260518224737-tftgw-worker-1:10447:11584 [2] NCCL INFO Channel 07/0 : 10[2] -> 2[2] [send] via NET/IBext_v9/10/GDRDMA
|
| 616 |
+
t-20260518224737-tftgw-worker-1:10451:11578 [6] NCCL INFO Channel 04/0 : 14[6] -> 6[6] [send] via NET/IBext_v9/14/GDRDMA
|
| 617 |
+
t-20260518224737-tftgw-worker-1:10448:11582 [3] NCCL INFO Channel 05/0 : 11[3] -> 3[3] [send] via NET/IBext_v9/11/GDRDMA
|
| 618 |
+
t-20260518224737-tftgw-worker-1:10446:11583 [1] NCCL INFO Channel 07/0 : 9[1] -> 1[1] [send] via NET/IBext_v9/9/GDRDMA
|
| 619 |
+
t-20260518224737-tftgw-worker-1:10445:11580 [0] NCCL INFO Channel 07/0 : 8[0] -> 0[0] [send] via NET/IBext_v9/8/GDRDMA
|
| 620 |
+
t-20260518224737-tftgw-worker-1:10452:11579 [7] NCCL INFO Channel 00/0 : 15[7] -> 7[7] [send] via NET/IBext_v9/15/GDRDMA
|
| 621 |
+
t-20260518224737-tftgw-worker-1:10450:11577 [5] NCCL INFO Channel 01/0 : 13[5] -> 5[5] [send] via NET/IBext_v9/13/GDRDMA
|
| 622 |
+
t-20260518224737-tftgw-worker-1:10449:11581 [4] NCCL INFO Channel 06/0 : 12[4] -> 4[4] [send] via NET/IBext_v9/12/GDRDMA
|
| 623 |
+
t-20260518224737-tftgw-worker-1:10447:11584 [2] NCCL INFO Channel 08/0 : 10[2] -> 2[2] [send] via NET/IBext_v9/10/GDRDMA
|
| 624 |
+
t-20260518224737-tftgw-worker-1:10451:11578 [6] NCCL INFO Channel 05/0 : 14[6] -> 6[6] [send] via NET/IBext_v9/14/GDRDMA
|
| 625 |
+
t-20260518224737-tftgw-worker-1:10448:11582 [3] NCCL INFO Channel 06/0 : 11[3] -> 3[3] [send] via NET/IBext_v9/11/GDRDMA
|
| 626 |
+
t-20260518224737-tftgw-worker-1:10446:11583 [1] NCCL INFO Channel 08/0 : 9[1] -> 1[1] [send] via NET/IBext_v9/9/GDRDMA
|
| 627 |
+
t-20260518224737-tftgw-worker-1:10452:11579 [7] NCCL INFO Channel 01/0 : 15[7] -> 7[7] [send] via NET/IBext_v9/15/GDRDMA
|
| 628 |
+
t-20260518224737-tftgw-worker-1:10445:11580 [0] NCCL INFO Channel 09/0 : 8[0] -> 0[0] [send] via NET/IBext_v9/8/GDRDMA
|
| 629 |
+
t-20260518224737-tftgw-worker-1:10450:11577 [5] NCCL INFO Channel 02/0 : 13[5] -> 5[5] [send] via NET/IBext_v9/13/GDRDMA
|
| 630 |
+
t-20260518224737-tftgw-worker-1:10449:11581 [4] NCCL INFO Channel 07/0 : 12[4] -> 4[4] [send] via NET/IBext_v9/12/GDRDMA
|
| 631 |
+
t-20260518224737-tftgw-worker-1:10447:11584 [2] NCCL INFO Channel 09/0 : 10[2] -> 2[2] [send] via NET/IBext_v9/10/GDRDMA
|
| 632 |
+
t-20260518224737-tftgw-worker-1:10451:11578 [6] NCCL INFO Channel 07/0 : 14[6] -> 6[6] [send] via NET/IBext_v9/14/GDRDMA
|
| 633 |
+
t-20260518224737-tftgw-worker-1:10448:11582 [3] NCCL INFO Channel 07/0 : 11[3] -> 3[3] [send] via NET/IBext_v9/11/GDRDMA
|
| 634 |
+
t-20260518224737-tftgw-worker-1:10446:11583 [1] NCCL INFO Channel 10/0 : 9[1] -> 1[1] [send] via NET/IBext_v9/9/GDRDMA
|
| 635 |
+
t-20260518224737-tftgw-worker-1:10445:11580 [0] NCCL INFO Channel 10/0 : 8[0] -> 0[0] [send] via NET/IBext_v9/8/GDRDMA
|
| 636 |
+
t-20260518224737-tftgw-worker-1:10452:11579 [7] NCCL INFO Channel 02/0 : 15[7] -> 7[7] [send] via NET/IBext_v9/15/GDRDMA
|
| 637 |
+
t-20260518224737-tftgw-worker-1:10449:11581 [4] NCCL INFO Channel 08/0 : 12[4] -> 4[4] [send] via NET/IBext_v9/12/GDRDMA
|
| 638 |
+
t-20260518224737-tftgw-worker-1:10450:11577 [5] NCCL INFO Channel 03/0 : 13[5] -> 5[5] [send] via NET/IBext_v9/13/GDRDMA
|
| 639 |
+
t-20260518224737-tftgw-worker-1:10447:11584 [2] NCCL INFO Channel 11/0 : 10[2] -> 2[2] [send] via NET/IBext_v9/10/GDRDMA
|
| 640 |
+
t-20260518224737-tftgw-worker-1:10451:11578 [6] NCCL INFO Channel 08/0 : 14[6] -> 6[6] [send] via NET/IBext_v9/14/GDRDMA
|
| 641 |
+
t-20260518224737-tftgw-worker-1:10446:11583 [1] NCCL INFO Channel 11/0 : 9[1] -> 1[1] [send] via NET/IBext_v9/9/GDRDMA
|
| 642 |
+
t-20260518224737-tftgw-worker-1:10445:11580 [0] NCCL INFO Channel 11/0 : 8[0] -> 0[0] [send] via NET/IBext_v9/8/GDRDMA
|
| 643 |
+
t-20260518224737-tftgw-worker-1:10448:11582 [3] NCCL INFO Channel 08/0 : 11[3] -> 3[3] [send] via NET/IBext_v9/11/GDRDMA
|
| 644 |
+
t-20260518224737-tftgw-worker-1:10449:11581 [4] NCCL INFO Channel 09/0 : 12[4] -> 4[4] [send] via NET/IBext_v9/12/GDRDMA
|
| 645 |
+
t-20260518224737-tftgw-worker-1:10452:11579 [7] NCCL INFO Channel 03/0 : 15[7] -> 7[7] [send] via NET/IBext_v9/15/GDRDMA
|
| 646 |
+
t-20260518224737-tftgw-worker-1:10450:11577 [5] NCCL INFO Channel 04/0 : 13[5] -> 5[5] [send] via NET/IBext_v9/13/GDRDMA
|
| 647 |
+
t-20260518224737-tftgw-worker-1:10447:11584 [2] NCCL INFO Channel 12/0 : 10[2] -> 2[2] [send] via NET/IBext_v9/10/GDRDMA
|
| 648 |
+
t-20260518224737-tftgw-worker-1:10446:11583 [1] NCCL INFO Channel 12/0 : 9[1] -> 1[1] [send] via NET/IBext_v9/9/GDRDMA
|
| 649 |
+
t-20260518224737-tftgw-worker-1:10445:11580 [0] NCCL INFO Channel 12/0 : 8[0] -> 0[0] [send] via NET/IBext_v9/8/GDRDMA
|
| 650 |
+
t-20260518224737-tftgw-worker-1:10451:11578 [6] NCCL INFO Channel 09/0 : 14[6] -> 6[6] [send] via NET/IBext_v9/14/GDRDMA
|
| 651 |
+
t-20260518224737-tftgw-worker-1:10448:11582 [3] NCCL INFO Channel 09/0 : 11[3] -> 3[3] [send] via NET/IBext_v9/11/GDRDMA
|
| 652 |
+
t-20260518224737-tftgw-worker-1:10449:11581 [4] NCCL INFO Channel 10/0 : 12[4] -> 4[4] [send] via NET/IBext_v9/12/GDRDMA
|
| 653 |
+
t-20260518224737-tftgw-worker-1:10452:11579 [7] NCCL INFO Channel 04/0 : 15[7] -> 7[7] [send] via NET/IBext_v9/15/GDRDMA
|
| 654 |
+
t-20260518224737-tftgw-worker-1:10447:11584 [2] NCCL INFO Channel 13/0 : 10[2] -> 2[2] [send] via NET/IBext_v9/10/GDRDMA
|
| 655 |
+
t-20260518224737-tftgw-worker-1:10450:11577 [5] NCCL INFO Channel 06/0 : 13[5] -> 5[5] [send] via NET/IBext_v9/13/GDRDMA
|
| 656 |
+
t-20260518224737-tftgw-worker-1:10446:11583 [1] NCCL INFO Channel 13/0 : 9[1] -> 1[1] [send] via NET/IBext_v9/9/GDRDMA
|
| 657 |
+
t-20260518224737-tftgw-worker-1:10445:11580 [0] NCCL INFO Channel 13/0 : 8[0] -> 0[0] [send] via NET/IBext_v9/8/GDRDMA
|
| 658 |
+
t-20260518224737-tftgw-worker-1:10451:11578 [6] NCCL INFO Channel 10/0 : 14[6] -> 6[6] [send] via NET/IBext_v9/14/GDRDMA
|
| 659 |
+
t-20260518224737-tftgw-worker-1:10448:11582 [3] NCCL INFO Channel 10/0 : 11[3] -> 3[3] [send] via NET/IBext_v9/11/GDRDMA
|
| 660 |
+
t-20260518224737-tftgw-worker-1:10449:11581 [4] NCCL INFO Channel 11/0 : 12[4] -> 4[4] [send] via NET/IBext_v9/12/GDRDMA
|
| 661 |
+
t-20260518224737-tftgw-worker-1:10452:11579 [7] NCCL INFO Channel 05/0 : 15[7] -> 7[7] [send] via NET/IBext_v9/15/GDRDMA
|
| 662 |
+
t-20260518224737-tftgw-worker-1:10447:11584 [2] NCCL INFO Channel 14/0 : 10[2] -> 2[2] [send] via NET/IBext_v9/10/GDRDMA
|
| 663 |
+
t-20260518224737-tftgw-worker-1:10450:11577 [5] NCCL INFO Channel 07/0 : 13[5] -> 5[5] [send] via NET/IBext_v9/13/GDRDMA
|
| 664 |
+
t-20260518224737-tftgw-worker-1:10446:11583 [1] NCCL INFO Channel 14/0 : 9[1] -> 1[1] [send] via NET/IBext_v9/9/GDRDMA
|
| 665 |
+
t-20260518224737-tftgw-worker-1:10445:11580 [0] NCCL INFO Channel 14/0 : 8[0] -> 0[0] [send] via NET/IBext_v9/8/GDRDMA
|
| 666 |
+
t-20260518224737-tftgw-worker-1:10451:11578 [6] NCCL INFO Channel 11/0 : 14[6] -> 6[6] [send] via NET/IBext_v9/14/GDRDMA
|
| 667 |
+
t-20260518224737-tftgw-worker-1:10448:11582 [3] NCCL INFO Channel 12/0 : 11[3] -> 3[3] [send] via NET/IBext_v9/11/GDRDMA
|
| 668 |
+
t-20260518224737-tftgw-worker-1:10449:11581 [4] NCCL INFO Channel 13/0 : 12[4] -> 4[4] [send] via NET/IBext_v9/12/GDRDMA
|
| 669 |
+
t-20260518224737-tftgw-worker-1:10447:11584 [2] NCCL INFO Channel 15/0 : 10[2] -> 2[2] [send] via NET/IBext_v9/10/GDRDMA
|
| 670 |
+
t-20260518224737-tftgw-worker-1:10452:11579 [7] NCCL INFO Channel 06/0 : 15[7] -> 7[7] [send] via NET/IBext_v9/15/GDRDMA
|
| 671 |
+
t-20260518224737-tftgw-worker-1:10450:11577 [5] NCCL INFO Channel 08/0 : 13[5] -> 5[5] [send] via NET/IBext_v9/13/GDRDMA
|
| 672 |
+
t-20260518224737-tftgw-worker-1:10446:11583 [1] NCCL INFO Channel 15/0 : 9[1] -> 1[1] [send] via NET/IBext_v9/9/GDRDMA
|
| 673 |
+
t-20260518224737-tftgw-worker-1:10445:11580 [0] NCCL INFO Channel 15/0 : 8[0] -> 0[0] [send] via NET/IBext_v9/8/GDRDMA
|
| 674 |
+
t-20260518224737-tftgw-worker-1:10451:11578 [6] NCCL INFO Channel 12/0 : 14[6] -> 6[6] [send] via NET/IBext_v9/14/GDRDMA
|
| 675 |
+
t-20260518224737-tftgw-worker-1:10448:11582 [3] NCCL INFO Channel 13/0 : 11[3] -> 3[3] [send] via NET/IBext_v9/11/GDRDMA
|
| 676 |
+
t-20260518224737-tftgw-worker-1:10452:11579 [7] NCCL INFO Channel 08/0 : 15[7] -> 7[7] [send] via NET/IBext_v9/15/GDRDMA
|
| 677 |
+
t-20260518224737-tftgw-worker-1:10449:11581 [4] NCCL INFO Channel 14/0 : 12[4] -> 4[4] [send] via NET/IBext_v9/12/GDRDMA
|
| 678 |
+
t-20260518224737-tftgw-worker-1:10450:11577 [5] NCCL INFO Channel 09/0 : 13[5] -> 5[5] [send] via NET/IBext_v9/13/GDRDMA
|
| 679 |
+
t-20260518224737-tftgw-worker-1:10448:11582 [3] NCCL INFO Channel 14/0 : 11[3] -> 3[3] [send] via NET/IBext_v9/11/GDRDMA
|
| 680 |
+
t-20260518224737-tftgw-worker-1:10451:11578 [6] NCCL INFO Channel 13/0 : 14[6] -> 6[6] [send] via NET/IBext_v9/14/GDRDMA
|
| 681 |
+
t-20260518224737-tftgw-worker-1:10449:11581 [4] NCCL INFO Channel 15/0 : 12[4] -> 4[4] [send] via NET/IBext_v9/12/GDRDMA
|
| 682 |
+
t-20260518224737-tftgw-worker-1:10452:11579 [7] NCCL INFO Channel 09/0 : 15[7] -> 7[7] [send] via NET/IBext_v9/15/GDRDMA
|
| 683 |
+
t-20260518224737-tftgw-worker-1:10450:11577 [5] NCCL INFO Channel 10/0 : 13[5] -> 5[5] [send] via NET/IBext_v9/13/GDRDMA
|
| 684 |
+
t-20260518224737-tftgw-worker-1:10448:11582 [3] NCCL INFO Channel 15/0 : 11[3] -> 3[3] [send] via NET/IBext_v9/11/GDRDMA
|
| 685 |
+
t-20260518224737-tftgw-worker-1:10451:11578 [6] NCCL INFO Channel 15/0 : 14[6] -> 6[6] [send] via NET/IBext_v9/14/GDRDMA
|
| 686 |
+
t-20260518224737-tftgw-worker-1:10452:11579 [7] NCCL INFO Channel 10/0 : 15[7] -> 7[7] [send] via NET/IBext_v9/15/GDRDMA
|
| 687 |
+
t-20260518224737-tftgw-worker-1:10450:11577 [5] NCCL INFO Channel 11/0 : 13[5] -> 5[5] [send] via NET/IBext_v9/13/GDRDMA
|
| 688 |
+
t-20260518224737-tftgw-worker-1:10452:11579 [7] NCCL INFO Channel 11/0 : 15[7] -> 7[7] [send] via NET/IBext_v9/15/GDRDMA
|
| 689 |
+
t-20260518224737-tftgw-worker-1:10450:11577 [5] NCCL INFO Channel 12/0 : 13[5] -> 5[5] [send] via NET/IBext_v9/13/GDRDMA
|
| 690 |
+
t-20260518224737-tftgw-worker-1:10452:11579 [7] NCCL INFO Channel 12/0 : 15[7] -> 7[7] [send] via NET/IBext_v9/15/GDRDMA
|
| 691 |
+
t-20260518224737-tftgw-worker-1:10450:11577 [5] NCCL INFO Channel 14/0 : 13[5] -> 5[5] [send] via NET/IBext_v9/13/GDRDMA
|
| 692 |
+
t-20260518224737-tftgw-worker-1:10452:11579 [7] NCCL INFO Channel 13/0 : 15[7] -> 7[7] [send] via NET/IBext_v9/15/GDRDMA
|
| 693 |
+
t-20260518224737-tftgw-worker-1:10450:11577 [5] NCCL INFO Channel 15/0 : 13[5] -> 5[5] [send] via NET/IBext_v9/13/GDRDMA
|
| 694 |
+
t-20260518224737-tftgw-worker-1:10452:11579 [7] NCCL INFO Channel 14/0 : 15[7] -> 7[7] [send] via NET/IBext_v9/15/GDRDMA
|
| 695 |
+
t-20260518224737-tftgw-worker-1:10448:11582 [3] NCCL INFO Connected NVLS tree
|
| 696 |
+
t-20260518224737-tftgw-worker-1:10446:11583 [1] NCCL INFO Connected NVLS tree
|
| 697 |
+
t-20260518224737-tftgw-worker-1:10447:11584 [2] NCCL INFO Connected NVLS tree
|
| 698 |
+
t-20260518224737-tftgw-worker-1:10445:11580 [0] NCCL INFO Connected NVLS tree
|
| 699 |
+
t-20260518224737-tftgw-worker-1:10452:11579 [7] NCCL INFO Connected NVLS tree
|
| 700 |
+
t-20260518224737-tftgw-worker-1:10450:11577 [5] NCCL INFO Connected NVLS tree
|
| 701 |
+
t-20260518224737-tftgw-worker-1:10449:11581 [4] NCCL INFO Connected NVLS tree
|
| 702 |
+
t-20260518224737-tftgw-worker-1:10451:11578 [6] NCCL INFO Connected NVLS tree
|
LTA_openwebtext_dualt/logs/elfaligned_t5tokenized_8gpu/debug_owt_t5_randk0_4_4gpu_smoke_20260518_012201.log
ADDED
|
File without changes
|
LTA_openwebtext_dualt/logs/elfaligned_t5tokenized_8gpu/lta_owt_elfofficial_t5_len1024_elfaligned_dditelf_muon_logitnormal_m1p5_s0p8_none_floor0p0_gbs512_8gpu_5epoch_20260516_013934.log
ADDED
|
@@ -0,0 +1,89 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
Traceback (most recent call last):
|
| 2 |
+
File "/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/train.py", line 2490, in <module>
|
| 3 |
+
main()
|
| 4 |
+
File "/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/train.py", line 1643, in main
|
| 5 |
+
torch.cuda.set_device(local_rank)
|
| 6 |
+
File "/usr/local/lib/python3.12/dist-packages/torch/cuda/__init__.py", line 477, in set_device
|
| 7 |
+
torch._C._cuda_setDevice(device)
|
| 8 |
+
RuntimeError: CUDA error: invalid device ordinal
|
| 9 |
+
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
|
| 10 |
+
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
|
| 11 |
+
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.
|
| 12 |
+
|
| 13 |
+
Traceback (most recent call last):
|
| 14 |
+
File "/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/train.py", line 2490, in <module>
|
| 15 |
+
main()
|
| 16 |
+
File "/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/train.py", line 1643, in main
|
| 17 |
+
torch.cuda.set_device(local_rank)
|
| 18 |
+
File "/usr/local/lib/python3.12/dist-packages/torch/cuda/__init__.py", line 477, in set_device
|
| 19 |
+
torch._C._cuda_setDevice(device)
|
| 20 |
+
RuntimeError: CUDA error: invalid device ordinal
|
| 21 |
+
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
|
| 22 |
+
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
|
| 23 |
+
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.
|
| 24 |
+
|
| 25 |
+
Traceback (most recent call last):
|
| 26 |
+
File "/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/train.py", line 2490, in <module>
|
| 27 |
+
main()
|
| 28 |
+
File "/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/train.py", line 1643, in main
|
| 29 |
+
torch.cuda.set_device(local_rank)
|
| 30 |
+
File "/usr/local/lib/python3.12/dist-packages/torch/cuda/__init__.py", line 477, in set_device
|
| 31 |
+
torch._C._cuda_setDevice(device)
|
| 32 |
+
RuntimeError: CUDA error: invalid device ordinal
|
| 33 |
+
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
|
| 34 |
+
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
|
| 35 |
+
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.
|
| 36 |
+
|
| 37 |
+
Traceback (most recent call last):
|
| 38 |
+
File "/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/train.py", line 2490, in <module>
|
| 39 |
+
main()
|
| 40 |
+
File "/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/train.py", line 1643, in main
|
| 41 |
+
torch.cuda.set_device(local_rank)
|
| 42 |
+
File "/usr/local/lib/python3.12/dist-packages/torch/cuda/__init__.py", line 477, in set_device
|
| 43 |
+
torch._C._cuda_setDevice(device)
|
| 44 |
+
RuntimeError: CUDA error: invalid device ordinal
|
| 45 |
+
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
|
| 46 |
+
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
|
| 47 |
+
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.
|
| 48 |
+
|
| 49 |
+
W0516 01:39:39.521000 1526576 torch/distributed/elastic/multiprocessing/api.py:898] Sending process 1526580 closing signal SIGTERM
|
| 50 |
+
W0516 01:39:39.522000 1526576 torch/distributed/elastic/multiprocessing/api.py:898] Sending process 1526581 closing signal SIGTERM
|
| 51 |
+
W0516 01:39:39.522000 1526576 torch/distributed/elastic/multiprocessing/api.py:898] Sending process 1526582 closing signal SIGTERM
|
| 52 |
+
W0516 01:39:39.523000 1526576 torch/distributed/elastic/multiprocessing/api.py:898] Sending process 1526583 closing signal SIGTERM
|
| 53 |
+
W0516 01:39:39.523000 1526576 torch/distributed/elastic/multiprocessing/api.py:898] Sending process 1526584 closing signal SIGTERM
|
| 54 |
+
W0516 01:39:39.523000 1526576 torch/distributed/elastic/multiprocessing/api.py:898] Sending process 1526585 closing signal SIGTERM
|
| 55 |
+
W0516 01:39:39.524000 1526576 torch/distributed/elastic/multiprocessing/api.py:898] Sending process 1526586 closing signal SIGTERM
|
| 56 |
+
E0516 01:39:39.623000 1526576 torch/distributed/elastic/multiprocessing/api.py:870] failed (exitcode: 1) local_rank: 7 (pid: 1526587) of binary: /usr/bin/python
|
| 57 |
+
Traceback (most recent call last):
|
| 58 |
+
File "<frozen runpy>", line 198, in _run_module_as_main
|
| 59 |
+
File "<frozen runpy>", line 88, in _run_code
|
| 60 |
+
File "/usr/local/lib/python3.12/dist-packages/torch/distributed/run.py", line 922, in <module>
|
| 61 |
+
main()
|
| 62 |
+
File "/usr/local/lib/python3.12/dist-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 355, in wrapper
|
| 63 |
+
return f(*args, **kwargs)
|
| 64 |
+
^^^^^^^^^^^^^^^^^^
|
| 65 |
+
File "/usr/local/lib/python3.12/dist-packages/torch/distributed/run.py", line 918, in main
|
| 66 |
+
run(args)
|
| 67 |
+
File "/usr/local/lib/python3.12/dist-packages/torch/distributed/run.py", line 909, in run
|
| 68 |
+
elastic_launch(
|
| 69 |
+
File "/usr/local/lib/python3.12/dist-packages/torch/distributed/launcher/api.py", line 139, in __call__
|
| 70 |
+
return launch_agent(self._config, self._entrypoint, list(args))
|
| 71 |
+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 72 |
+
File "/usr/local/lib/python3.12/dist-packages/torch/distributed/launcher/api.py", line 270, in launch_agent
|
| 73 |
+
raise ChildFailedError(
|
| 74 |
+
torch.distributed.elastic.multiprocessing.errors.ChildFailedError:
|
| 75 |
+
============================================================
|
| 76 |
+
train.py FAILED
|
| 77 |
+
------------------------------------------------------------
|
| 78 |
+
Failures:
|
| 79 |
+
<NO_OTHER_FAILURES>
|
| 80 |
+
------------------------------------------------------------
|
| 81 |
+
Root Cause (first observed failure):
|
| 82 |
+
[0]:
|
| 83 |
+
time : 2026-05-16_01:39:39
|
| 84 |
+
host : localhost
|
| 85 |
+
rank : 7 (local_rank: 7)
|
| 86 |
+
exitcode : 1 (pid: 1526587)
|
| 87 |
+
error_file: <N/A>
|
| 88 |
+
traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
|
| 89 |
+
============================================================
|
LTA_openwebtext_dualt/logs/elfaligned_t5tokenized_8gpu/lta_owt_elfofficial_t5_len1024_elfaligned_dditelf_muon_logitnormal_m1p5_s0p8_none_floor0p0_gbs512_8gpu_5epoch_20260516_014234.log
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
LTA_openwebtext_dualt/logs/elfaligned_t5tokenized_8gpu/lta_owt_elfofficial_t5_len1024_elfaligned_dditelf_muon_logitnormal_m1p5_s0p8_none_floor0p0_gbs512_8gpu_5epoch_20260517_003703.log
ADDED
|
@@ -0,0 +1,820 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
t-20260517083536-fd9c2-worker-0:10331:10331 [0] NCCL INFO NCCL_SOCKET_IFNAME set by environment to eth1
|
| 2 |
+
t-20260517083536-fd9c2-worker-0:10331:10331 [0] NCCL INFO Bootstrap: Using eth1:10.82.40.2<0>
|
| 3 |
+
t-20260517083536-fd9c2-worker-0:10331:10331 [0] NCCL INFO cudaDriverVersion 12080
|
| 4 |
+
t-20260517083536-fd9c2-worker-0:10331:10331 [0] NCCL INFO NCCL version 2.25.1+cuda12.8
|
| 5 |
+
t-20260517083536-fd9c2-worker-0:10331:10331 [0] NCCL INFO Comm config Blocking set to 1
|
| 6 |
+
t-20260517083536-fd9c2-worker-0:10331:10928 [0] NCCL INFO NET/Plugin: Loaded net plugin NCCL RDMA Plugin v9 (v9)
|
| 7 |
+
t-20260517083536-fd9c2-worker-0:10331:10928 [0] NCCL INFO NET/Plugin: Loaded collnet plugin SHARP (v9)
|
| 8 |
+
t-20260517083536-fd9c2-worker-0:10331:10928 [0] NCCL INFO Plugin Path : /opt/hpcx/nccl_rdma_sharp_plugin/lib/libnccl-net.so
|
| 9 |
+
t-20260517083536-fd9c2-worker-0:10331:10928 [0] NCCL INFO P2P plugin v9 IBext_v9
|
| 10 |
+
t-20260517083536-fd9c2-worker-0:10331:10928 [0] NCCL INFO NCCL_SOCKET_IFNAME set by environment to eth1
|
| 11 |
+
t-20260517083536-fd9c2-worker-0:10331:10928 [0] NCCL INFO NCCL_IB_PCI_RELAXED_ORDERING set by environment to 1.
|
| 12 |
+
t-20260517083536-fd9c2-worker-0:10331:10928 [0] NCCL INFO NET/IB : Using [0]mlx5_1:1/RoCE [1]mlx5_4:1/RoCE [2]mlx5_5:1/RoCE [3]mlx5_6:1/RoCE [4]mlx5_7:1/RoCE [5]mlx5_8:1/RoCE [6]mlx5_9:1/RoCE [7]mlx5_10:1/RoCE [RO]; OOB eth1:10.82.40.2<0>
|
| 13 |
+
t-20260517083536-fd9c2-worker-0:10331:10928 [0] NCCL INFO PROFILER/Plugin: Could not find: libnccl-profiler.so.
|
| 14 |
+
t-20260517083536-fd9c2-worker-0:10331:10928 [0] NCCL INFO Using network IBext_v9
|
| 15 |
+
t-20260517083536-fd9c2-worker-0:10331:10928 [0] NCCL INFO ncclCommInitRankConfig comm 0xc57c980 rank 0 nranks 8 cudaDev 0 nvmlDev 0 busId 65040 commId 0x4a1a7f483027896c - Init START
|
| 16 |
+
t-20260517083536-fd9c2-worker-0:10335:10335 [4] NCCL INFO cudaDriverVersion 12080
|
| 17 |
+
t-20260517083536-fd9c2-worker-0:10335:10335 [4] NCCL INFO NCCL_SOCKET_IFNAME set by environment to eth1
|
| 18 |
+
t-20260517083536-fd9c2-worker-0:10335:10335 [4] NCCL INFO Bootstrap: Using eth1:10.82.40.2<0>
|
| 19 |
+
t-20260517083536-fd9c2-worker-0:10335:10335 [4] NCCL INFO NCCL version 2.25.1+cuda12.8
|
| 20 |
+
t-20260517083536-fd9c2-worker-0:10335:10335 [4] NCCL INFO Comm config Blocking set to 1
|
| 21 |
+
t-20260517083536-fd9c2-worker-0:10335:10939 [4] NCCL INFO NET/Plugin: Loaded net plugin NCCL RDMA Plugin v9 (v9)
|
| 22 |
+
t-20260517083536-fd9c2-worker-0:10335:10939 [4] NCCL INFO NET/Plugin: Loaded collnet plugin SHARP (v9)
|
| 23 |
+
t-20260517083536-fd9c2-worker-0:10335:10939 [4] NCCL INFO Plugin Path : /opt/hpcx/nccl_rdma_sharp_plugin/lib/libnccl-net.so
|
| 24 |
+
t-20260517083536-fd9c2-worker-0:10335:10939 [4] NCCL INFO P2P plugin v9 IBext_v9
|
| 25 |
+
t-20260517083536-fd9c2-worker-0:10335:10939 [4] NCCL INFO NCCL_SOCKET_IFNAME set by environment to eth1
|
| 26 |
+
t-20260517083536-fd9c2-worker-0:10335:10939 [4] NCCL INFO NCCL_IB_PCI_RELAXED_ORDERING set by environment to 1.
|
| 27 |
+
t-20260517083536-fd9c2-worker-0:10335:10939 [4] NCCL INFO NET/IB : Using [0]mlx5_1:1/RoCE [1]mlx5_4:1/RoCE [2]mlx5_5:1/RoCE [3]mlx5_6:1/RoCE [4]mlx5_7:1/RoCE [5]mlx5_8:1/RoCE [6]mlx5_9:1/RoCE [7]mlx5_10:1/RoCE [RO]; OOB eth1:10.82.40.2<0>
|
| 28 |
+
t-20260517083536-fd9c2-worker-0:10335:10939 [4] NCCL INFO PROFILER/Plugin: Could not find: libnccl-profiler.so.
|
| 29 |
+
t-20260517083536-fd9c2-worker-0:10335:10939 [4] NCCL INFO Using network IBext_v9
|
| 30 |
+
t-20260517083536-fd9c2-worker-0:10335:10939 [4] NCCL INFO ncclCommInitRankConfig comm 0xe76ac40 rank 4 nranks 8 cudaDev 4 nvmlDev 4 busId 6f020 commId 0x4a1a7f483027896c - Init START
|
| 31 |
+
t-20260517083536-fd9c2-worker-0:10338:10338 [7] NCCL INFO cudaDriverVersion 12080
|
| 32 |
+
t-20260517083536-fd9c2-worker-0:10338:10338 [7] NCCL INFO NCCL_SOCKET_IFNAME set by environment to eth1
|
| 33 |
+
t-20260517083536-fd9c2-worker-0:10338:10338 [7] NCCL INFO Bootstrap: Using eth1:10.82.40.2<0>
|
| 34 |
+
t-20260517083536-fd9c2-worker-0:10338:10338 [7] NCCL INFO NCCL version 2.25.1+cuda12.8
|
| 35 |
+
t-20260517083536-fd9c2-worker-0:10338:10338 [7] NCCL INFO Comm config Blocking set to 1
|
| 36 |
+
t-20260517083536-fd9c2-worker-0:10334:10334 [3] NCCL INFO cudaDriverVersion 12080
|
| 37 |
+
t-20260517083536-fd9c2-worker-0:10334:10334 [3] NCCL INFO NCCL_SOCKET_IFNAME set by environment to eth1
|
| 38 |
+
t-20260517083536-fd9c2-worker-0:10334:10334 [3] NCCL INFO Bootstrap: Using eth1:10.82.40.2<0>
|
| 39 |
+
t-20260517083536-fd9c2-worker-0:10334:10334 [3] NCCL INFO NCCL version 2.25.1+cuda12.8
|
| 40 |
+
t-20260517083536-fd9c2-worker-0:10334:10334 [3] NCCL INFO Comm config Blocking set to 1
|
| 41 |
+
t-20260517083536-fd9c2-worker-0:10338:10948 [7] NCCL INFO NET/Plugin: Loaded net plugin NCCL RDMA Plugin v9 (v9)
|
| 42 |
+
t-20260517083536-fd9c2-worker-0:10338:10948 [7] NCCL INFO NET/Plugin: Loaded collnet plugin SHARP (v9)
|
| 43 |
+
t-20260517083536-fd9c2-worker-0:10338:10948 [7] NCCL INFO Plugin Path : /opt/hpcx/nccl_rdma_sharp_plugin/lib/libnccl-net.so
|
| 44 |
+
t-20260517083536-fd9c2-worker-0:10338:10948 [7] NCCL INFO P2P plugin v9 IBext_v9
|
| 45 |
+
t-20260517083536-fd9c2-worker-0:10338:10948 [7] NCCL INFO NCCL_SOCKET_IFNAME set by environment to eth1
|
| 46 |
+
t-20260517083536-fd9c2-worker-0:10338:10948 [7] NCCL INFO NCCL_IB_PCI_RELAXED_ORDERING set by environment to 1.
|
| 47 |
+
t-20260517083536-fd9c2-worker-0:10338:10948 [7] NCCL INFO NET/IB : Using [0]mlx5_1:1/RoCE [1]mlx5_4:1/RoCE [2]mlx5_5:1/RoCE [3]mlx5_6:1/RoCE [4]mlx5_7:1/RoCE [5]mlx5_8:1/RoCE [6]mlx5_9:1/RoCE [7]mlx5_10:1/RoCE [RO]; OOB eth1:10.82.40.2<0>
|
| 48 |
+
t-20260517083536-fd9c2-worker-0:10338:10948 [7] NCCL INFO PROFILER/Plugin: Could not find: libnccl-profiler.so.
|
| 49 |
+
t-20260517083536-fd9c2-worker-0:10338:10948 [7] NCCL INFO Using network IBext_v9
|
| 50 |
+
t-20260517083536-fd9c2-worker-0:10337:10337 [6] NCCL INFO cudaDriverVersion 12080
|
| 51 |
+
t-20260517083536-fd9c2-worker-0:10337:10337 [6] NCCL INFO NCCL_SOCKET_IFNAME set by environment to eth1
|
| 52 |
+
t-20260517083536-fd9c2-worker-0:10337:10337 [6] NCCL INFO Bootstrap: Using eth1:10.82.40.2<0>
|
| 53 |
+
t-20260517083536-fd9c2-worker-0:10337:10337 [6] NCCL INFO NCCL version 2.25.1+cuda12.8
|
| 54 |
+
t-20260517083536-fd9c2-worker-0:10337:10337 [6] NCCL INFO Comm config Blocking set to 1
|
| 55 |
+
t-20260517083536-fd9c2-worker-0:10338:10948 [7] NCCL INFO ncclCommInitRankConfig comm 0xf0141a0 rank 7 nranks 8 cudaDev 7 nvmlDev 7 busId 75020 commId 0x4a1a7f483027896c - Init START
|
| 56 |
+
t-20260517083536-fd9c2-worker-0:10334:10949 [3] NCCL INFO NET/Plugin: Loaded net plugin NCCL RDMA Plugin v9 (v9)
|
| 57 |
+
t-20260517083536-fd9c2-worker-0:10334:10949 [3] NCCL INFO NET/Plugin: Loaded collnet plugin SHARP (v9)
|
| 58 |
+
t-20260517083536-fd9c2-worker-0:10334:10949 [3] NCCL INFO Plugin Path : /opt/hpcx/nccl_rdma_sharp_plugin/lib/libnccl-net.so
|
| 59 |
+
t-20260517083536-fd9c2-worker-0:10334:10949 [3] NCCL INFO P2P plugin v9 IBext_v9
|
| 60 |
+
t-20260517083536-fd9c2-worker-0:10334:10949 [3] NCCL INFO NCCL_SOCKET_IFNAME set by environment to eth1
|
| 61 |
+
t-20260517083536-fd9c2-worker-0:10332:10332 [1] NCCL INFO cudaDriverVersion 12080
|
| 62 |
+
t-20260517083536-fd9c2-worker-0:10332:10332 [1] NCCL INFO NCCL_SOCKET_IFNAME set by environment to eth1
|
| 63 |
+
t-20260517083536-fd9c2-worker-0:10332:10332 [1] NCCL INFO Bootstrap: Using eth1:10.82.40.2<0>
|
| 64 |
+
t-20260517083536-fd9c2-worker-0:10332:10332 [1] NCCL INFO NCCL version 2.25.1+cuda12.8
|
| 65 |
+
t-20260517083536-fd9c2-worker-0:10332:10332 [1] NCCL INFO Comm config Blocking set to 1
|
| 66 |
+
t-20260517083536-fd9c2-worker-0:10334:10949 [3] NCCL INFO NCCL_IB_PCI_RELAXED_ORDERING set by environment to 1.
|
| 67 |
+
t-20260517083536-fd9c2-worker-0:10334:10949 [3] NCCL INFO NET/IB : Using [0]mlx5_1:1/RoCE [1]mlx5_4:1/RoCE [2]mlx5_5:1/RoCE [3]mlx5_6:1/RoCE [4]mlx5_7:1/RoCE [5]mlx5_8:1/RoCE [6]mlx5_9:1/RoCE [7]mlx5_10:1/RoCE [RO]; OOB eth1:10.82.40.2<0>
|
| 68 |
+
t-20260517083536-fd9c2-worker-0:10334:10949 [3] NCCL INFO PROFILER/Plugin: Could not find: libnccl-profiler.so.
|
| 69 |
+
t-20260517083536-fd9c2-worker-0:10334:10949 [3] NCCL INFO Using network IBext_v9
|
| 70 |
+
t-20260517083536-fd9c2-worker-0:10337:10958 [6] NCCL INFO NET/Plugin: Loaded net plugin NCCL RDMA Plugin v9 (v9)
|
| 71 |
+
t-20260517083536-fd9c2-worker-0:10337:10958 [6] NCCL INFO NET/Plugin: Loaded collnet plugin SHARP (v9)
|
| 72 |
+
t-20260517083536-fd9c2-worker-0:10337:10958 [6] NCCL INFO Plugin Path : /opt/hpcx/nccl_rdma_sharp_plugin/lib/libnccl-net.so
|
| 73 |
+
t-20260517083536-fd9c2-worker-0:10337:10958 [6] NCCL INFO P2P plugin v9 IBext_v9
|
| 74 |
+
t-20260517083536-fd9c2-worker-0:10337:10958 [6] NCCL INFO NCCL_SOCKET_IFNAME set by environment to eth1
|
| 75 |
+
t-20260517083536-fd9c2-worker-0:10334:10949 [3] NCCL INFO ncclCommInitRankConfig comm 0xdf2c660 rank 3 nranks 8 cudaDev 3 nvmlDev 3 busId 6b020 commId 0x4a1a7f483027896c - Init START
|
| 76 |
+
t-20260517083536-fd9c2-worker-0:10337:10958 [6] NCCL INFO NCCL_IB_PCI_RELAXED_ORDERING set by environment to 1.
|
| 77 |
+
t-20260517083536-fd9c2-worker-0:10337:10958 [6] NCCL INFO NET/IB : Using [0]mlx5_1:1/RoCE [1]mlx5_4:1/RoCE [2]mlx5_5:1/RoCE [3]mlx5_6:1/RoCE [4]mlx5_7:1/RoCE [5]mlx5_8:1/RoCE [6]mlx5_9:1/RoCE [7]mlx5_10:1/RoCE [RO]; OOB eth1:10.82.40.2<0>
|
| 78 |
+
t-20260517083536-fd9c2-worker-0:10337:10958 [6] NCCL INFO PROFILER/Plugin: Could not find: libnccl-profiler.so.
|
| 79 |
+
t-20260517083536-fd9c2-worker-0:10337:10958 [6] NCCL INFO Using network IBext_v9
|
| 80 |
+
t-20260517083536-fd9c2-worker-0:10332:10959 [1] NCCL INFO NET/Plugin: Loaded net plugin NCCL RDMA Plugin v9 (v9)
|
| 81 |
+
t-20260517083536-fd9c2-worker-0:10332:10959 [1] NCCL INFO NET/Plugin: Loaded collnet plugin SHARP (v9)
|
| 82 |
+
t-20260517083536-fd9c2-worker-0:10332:10959 [1] NCCL INFO Plugin Path : /opt/hpcx/nccl_rdma_sharp_plugin/lib/libnccl-net.so
|
| 83 |
+
t-20260517083536-fd9c2-worker-0:10332:10959 [1] NCCL INFO P2P plugin v9 IBext_v9
|
| 84 |
+
t-20260517083536-fd9c2-worker-0:10332:10959 [1] NCCL INFO NCCL_SOCKET_IFNAME set by environment to eth1
|
| 85 |
+
t-20260517083536-fd9c2-worker-0:10337:10958 [6] NCCL INFO ncclCommInitRankConfig comm 0xe813ad0 rank 6 nranks 8 cudaDev 6 nvmlDev 6 busId 73020 commId 0x4a1a7f483027896c - Init START
|
| 86 |
+
t-20260517083536-fd9c2-worker-0:10338:10948 [7] NCCL INFO RAS client listening socket at ::1<28028>
|
| 87 |
+
t-20260517083536-fd9c2-worker-0:10332:10959 [1] NCCL INFO NCCL_IB_PCI_RELAXED_ORDERING set by environment to 1.
|
| 88 |
+
t-20260517083536-fd9c2-worker-0:10332:10959 [1] NCCL INFO NET/IB : Using [0]mlx5_1:1/RoCE [1]mlx5_4:1/RoCE [2]mlx5_5:1/RoCE [3]mlx5_6:1/RoCE [4]mlx5_7:1/RoCE [5]mlx5_8:1/RoCE [6]mlx5_9:1/RoCE [7]mlx5_10:1/RoCE [RO]; OOB eth1:10.82.40.2<0>
|
| 89 |
+
t-20260517083536-fd9c2-worker-0:10332:10959 [1] NCCL INFO PROFILER/Plugin: Could not find: libnccl-profiler.so.
|
| 90 |
+
t-20260517083536-fd9c2-worker-0:10332:10959 [1] NCCL INFO Using network IBext_v9
|
| 91 |
+
t-20260517083536-fd9c2-worker-0:10332:10959 [1] NCCL INFO ncclCommInitRankConfig comm 0xdfb2ce0 rank 1 nranks 8 cudaDev 1 nvmlDev 1 busId 67020 commId 0x4a1a7f483027896c - Init START
|
| 92 |
+
t-20260517083536-fd9c2-worker-0:10331:10928 [0] NCCL INFO RAS client listening socket at ::1<28028>
|
| 93 |
+
t-20260517083536-fd9c2-worker-0:10336:10336 [5] NCCL INFO cudaDriverVersion 12080
|
| 94 |
+
t-20260517083536-fd9c2-worker-0:10336:10336 [5] NCCL INFO NCCL_SOCKET_IFNAME set by environment to eth1
|
| 95 |
+
t-20260517083536-fd9c2-worker-0:10336:10336 [5] NCCL INFO Bootstrap: Using eth1:10.82.40.2<0>
|
| 96 |
+
t-20260517083536-fd9c2-worker-0:10336:10336 [5] NCCL INFO NCCL version 2.25.1+cuda12.8
|
| 97 |
+
t-20260517083536-fd9c2-worker-0:10336:10336 [5] NCCL INFO Comm config Blocking set to 1
|
| 98 |
+
t-20260517083536-fd9c2-worker-0:10336:10986 [5] NCCL INFO NET/Plugin: Loaded net plugin NCCL RDMA Plugin v9 (v9)
|
| 99 |
+
t-20260517083536-fd9c2-worker-0:10336:10986 [5] NCCL INFO NET/Plugin: Loaded collnet plugin SHARP (v9)
|
| 100 |
+
t-20260517083536-fd9c2-worker-0:10336:10986 [5] NCCL INFO Plugin Path : /opt/hpcx/nccl_rdma_sharp_plugin/lib/libnccl-net.so
|
| 101 |
+
t-20260517083536-fd9c2-worker-0:10336:10986 [5] NCCL INFO P2P plugin v9 IBext_v9
|
| 102 |
+
t-20260517083536-fd9c2-worker-0:10336:10986 [5] NCCL INFO NCCL_SOCKET_IFNAME set by environment to eth1
|
| 103 |
+
t-20260517083536-fd9c2-worker-0:10336:10986 [5] NCCL INFO NCCL_IB_PCI_RELAXED_ORDERING set by environment to 1.
|
| 104 |
+
t-20260517083536-fd9c2-worker-0:10336:10986 [5] NCCL INFO NET/IB : Using [0]mlx5_1:1/RoCE [1]mlx5_4:1/RoCE [2]mlx5_5:1/RoCE [3]mlx5_6:1/RoCE [4]mlx5_7:1/RoCE [5]mlx5_8:1/RoCE [6]mlx5_9:1/RoCE [7]mlx5_10:1/RoCE [RO]; OOB eth1:10.82.40.2<0>
|
| 105 |
+
t-20260517083536-fd9c2-worker-0:10336:10986 [5] NCCL INFO PROFILER/Plugin: Could not find: libnccl-profiler.so.
|
| 106 |
+
t-20260517083536-fd9c2-worker-0:10336:10986 [5] NCCL INFO Using network IBext_v9
|
| 107 |
+
t-20260517083536-fd9c2-worker-0:10336:10986 [5] NCCL INFO ncclCommInitRankConfig comm 0xe38ef50 rank 5 nranks 8 cudaDev 5 nvmlDev 5 busId 71020 commId 0x4a1a7f483027896c - Init START
|
| 108 |
+
t-20260517083536-fd9c2-worker-0:10337:10958 [6] NCCL INFO RAS client listening socket at ::1<28028>
|
| 109 |
+
t-20260517083536-fd9c2-worker-0:10336:10986 [5] NCCL INFO RAS client listening socket at ::1<28028>
|
| 110 |
+
t-20260517083536-fd9c2-worker-0:10335:10939 [4] NCCL INFO RAS client listening socket at ::1<28028>
|
| 111 |
+
t-20260517083536-fd9c2-worker-0:10333:10333 [2] NCCL INFO cudaDriverVersion 12080
|
| 112 |
+
t-20260517083536-fd9c2-worker-0:10333:10333 [2] NCCL INFO NCCL_SOCKET_IFNAME set by environment to eth1
|
| 113 |
+
t-20260517083536-fd9c2-worker-0:10333:10333 [2] NCCL INFO Bootstrap: Using eth1:10.82.40.2<0>
|
| 114 |
+
t-20260517083536-fd9c2-worker-0:10333:10333 [2] NCCL INFO NCCL version 2.25.1+cuda12.8
|
| 115 |
+
t-20260517083536-fd9c2-worker-0:10333:10333 [2] NCCL INFO Comm config Blocking set to 1
|
| 116 |
+
t-20260517083536-fd9c2-worker-0:10333:11020 [2] NCCL INFO NET/Plugin: Loaded net plugin NCCL RDMA Plugin v9 (v9)
|
| 117 |
+
t-20260517083536-fd9c2-worker-0:10333:11020 [2] NCCL INFO NET/Plugin: Loaded collnet plugin SHARP (v9)
|
| 118 |
+
t-20260517083536-fd9c2-worker-0:10333:11020 [2] NCCL INFO Plugin Path : /opt/hpcx/nccl_rdma_sharp_plugin/lib/libnccl-net.so
|
| 119 |
+
t-20260517083536-fd9c2-worker-0:10333:11020 [2] NCCL INFO P2P plugin v9 IBext_v9
|
| 120 |
+
t-20260517083536-fd9c2-worker-0:10333:11020 [2] NCCL INFO NCCL_SOCKET_IFNAME set by environment to eth1
|
| 121 |
+
t-20260517083536-fd9c2-worker-0:10333:11020 [2] NCCL INFO NCCL_IB_PCI_RELAXED_ORDERING set by environment to 1.
|
| 122 |
+
t-20260517083536-fd9c2-worker-0:10333:11020 [2] NCCL INFO NET/IB : Using [0]mlx5_1:1/RoCE [1]mlx5_4:1/RoCE [2]mlx5_5:1/RoCE [3]mlx5_6:1/RoCE [4]mlx5_7:1/RoCE [5]mlx5_8:1/RoCE [6]mlx5_9:1/RoCE [7]mlx5_10:1/RoCE [RO]; OOB eth1:10.82.40.2<0>
|
| 123 |
+
t-20260517083536-fd9c2-worker-0:10333:11020 [2] NCCL INFO PROFILER/Plugin: Could not find: libnccl-profiler.so.
|
| 124 |
+
t-20260517083536-fd9c2-worker-0:10333:11020 [2] NCCL INFO Using network IBext_v9
|
| 125 |
+
t-20260517083536-fd9c2-worker-0:10333:11020 [2] NCCL INFO ncclCommInitRankConfig comm 0xdbf3fd0 rank 2 nranks 8 cudaDev 2 nvmlDev 2 busId 69020 commId 0x4a1a7f483027896c - Init START
|
| 126 |
+
t-20260517083536-fd9c2-worker-0:10333:11020 [2] NCCL INFO RAS client listening socket at ::1<28028>
|
| 127 |
+
t-20260517083536-fd9c2-worker-0:10332:10959 [1] NCCL INFO RAS client listening socket at ::1<28028>
|
| 128 |
+
t-20260517083536-fd9c2-worker-0:10334:10949 [3] NCCL INFO RAS client listening socket at ::1<28028>
|
| 129 |
+
t-20260517083536-fd9c2-worker-0:10332:10959 [1] NCCL INFO Bootstrap timings total 1.469858 (create 0.000020, send 0.000066, recv 1.469219, ring 0.000135, delay 0.000001)
|
| 130 |
+
t-20260517083536-fd9c2-worker-0:10333:11020 [2] NCCL INFO Bootstrap timings total 0.000786 (create 0.000023, send 0.000071, recv 0.000138, ring 0.000261, delay 0.000000)
|
| 131 |
+
t-20260517083536-fd9c2-worker-0:10336:10986 [5] NCCL INFO Bootstrap timings total 0.796941 (create 0.000027, send 0.000112, recv 0.000142, ring 0.796302, delay 0.000001)
|
| 132 |
+
t-20260517083536-fd9c2-worker-0:10331:10928 [0] NCCL INFO Bootstrap timings total 4.065658 (create 0.000028, send 0.000075, recv 2.595893, ring 1.469256, delay 0.000001)
|
| 133 |
+
t-20260517083536-fd9c2-worker-0:10334:10949 [3] NCCL INFO Bootstrap timings total 1.757197 (create 0.000026, send 0.000084, recv 0.000104, ring 0.000136, delay 0.000000)
|
| 134 |
+
t-20260517083536-fd9c2-worker-0:10337:10958 [6] NCCL INFO Bootstrap timings total 1.606946 (create 0.000025, send 0.000086, recv 0.000088, ring 0.796308, delay 0.000000)
|
| 135 |
+
t-20260517083536-fd9c2-worker-0:10338:10948 [7] NCCL INFO Bootstrap timings total 1.998810 (create 0.000026, send 0.000073, recv 0.000104, ring 1.606370, delay 0.000001)
|
| 136 |
+
t-20260517083536-fd9c2-worker-0:10335:10939 [4] NCCL INFO Bootstrap timings total 2.424964 (create 0.000030, send 0.000112, recv 1.628158, ring 0.796199, delay 0.000001)
|
| 137 |
+
t-20260517083536-fd9c2-worker-0:10333:11020 [2] NCCL INFO MNNVL busId 0x69020 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0
|
| 138 |
+
t-20260517083536-fd9c2-worker-0:10331:10928 [0] NCCL INFO MNNVL busId 0x65040 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0
|
| 139 |
+
t-20260517083536-fd9c2-worker-0:10336:10986 [5] NCCL INFO MNNVL busId 0x71020 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0
|
| 140 |
+
t-20260517083536-fd9c2-worker-0:10332:10959 [1] NCCL INFO MNNVL busId 0x67020 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0
|
| 141 |
+
t-20260517083536-fd9c2-worker-0:10335:10939 [4] NCCL INFO MNNVL busId 0x6f020 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0
|
| 142 |
+
t-20260517083536-fd9c2-worker-0:10337:10958 [6] NCCL INFO MNNVL busId 0x73020 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0
|
| 143 |
+
t-20260517083536-fd9c2-worker-0:10334:10949 [3] NCCL INFO MNNVL busId 0x6b020 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0
|
| 144 |
+
t-20260517083536-fd9c2-worker-0:10338:10948 [7] NCCL INFO MNNVL busId 0x75020 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0
|
| 145 |
+
t-20260517083536-fd9c2-worker-0:10337:10958 [6] NCCL INFO NCCL_TOPO_FILE set by environment to /var/run/nvidia-topologyd/virtualTopology.xml
|
| 146 |
+
t-20260517083536-fd9c2-worker-0:10338:10948 [7] NCCL INFO NCCL_TOPO_FILE set by environment to /var/run/nvidia-topologyd/virtualTopology.xml
|
| 147 |
+
t-20260517083536-fd9c2-worker-0:10336:10986 [5] NCCL INFO NCCL_TOPO_FILE set by environment to /var/run/nvidia-topologyd/virtualTopology.xml
|
| 148 |
+
t-20260517083536-fd9c2-worker-0:10331:10928 [0] NCCL INFO NCCL_TOPO_FILE set by environment to /var/run/nvidia-topologyd/virtualTopology.xml
|
| 149 |
+
t-20260517083536-fd9c2-worker-0:10333:11020 [2] NCCL INFO NCCL_TOPO_FILE set by environment to /var/run/nvidia-topologyd/virtualTopology.xml
|
| 150 |
+
t-20260517083536-fd9c2-worker-0:10332:10959 [1] NCCL INFO NCCL_TOPO_FILE set by environment to /var/run/nvidia-topologyd/virtualTopology.xml
|
| 151 |
+
t-20260517083536-fd9c2-worker-0:10335:10939 [4] NCCL INFO NCCL_TOPO_FILE set by environment to /var/run/nvidia-topologyd/virtualTopology.xml
|
| 152 |
+
t-20260517083536-fd9c2-worker-0:10334:10949 [3] NCCL INFO NCCL_TOPO_FILE set by environment to /var/run/nvidia-topologyd/virtualTopology.xml
|
| 153 |
+
t-20260517083536-fd9c2-worker-0:10338:10948 [7] NCCL INFO Setting affinity for GPU 7 to 0fffff,ffffffff,ffffffff,fc000000,00000000,00000000
|
| 154 |
+
t-20260517083536-fd9c2-worker-0:10338:10948 [7] NCCL INFO NVLS multicast support is available on dev 7
|
| 155 |
+
t-20260517083536-fd9c2-worker-0:10331:10928 [0] NCCL INFO Setting affinity for GPU 0 to 03ffffff,ffffffff,ffffffff
|
| 156 |
+
t-20260517083536-fd9c2-worker-0:10331:10928 [0] NCCL INFO NVLS multicast support is available on dev 0
|
| 157 |
+
t-20260517083536-fd9c2-worker-0:10337:10958 [6] NCCL INFO Setting affinity for GPU 6 to 0fffff,ffffffff,ffffffff,fc000000,00000000,00000000
|
| 158 |
+
t-20260517083536-fd9c2-worker-0:10337:10958 [6] NCCL INFO NVLS multicast support is available on dev 6
|
| 159 |
+
t-20260517083536-fd9c2-worker-0:10336:10986 [5] NCCL INFO Setting affinity for GPU 5 to 0fffff,ffffffff,ffffffff,fc000000,00000000,00000000
|
| 160 |
+
t-20260517083536-fd9c2-worker-0:10336:10986 [5] NCCL INFO NVLS multicast support is available on dev 5
|
| 161 |
+
t-20260517083536-fd9c2-worker-0:10335:10939 [4] NCCL INFO Setting affinity for GPU 4 to 0fffff,ffffffff,ffffffff,fc000000,00000000,00000000
|
| 162 |
+
t-20260517083536-fd9c2-worker-0:10333:11020 [2] NCCL INFO Setting affinity for GPU 2 to 03ffffff,ffffffff,ffffffff
|
| 163 |
+
t-20260517083536-fd9c2-worker-0:10334:10949 [3] NCCL INFO Setting affinity for GPU 3 to 03ffffff,ffffffff,ffffffff
|
| 164 |
+
t-20260517083536-fd9c2-worker-0:10332:10959 [1] NCCL INFO Setting affinity for GPU 1 to 03ffffff,ffffffff,ffffffff
|
| 165 |
+
t-20260517083536-fd9c2-worker-0:10332:10959 [1] NCCL INFO NVLS multicast support is available on dev 1
|
| 166 |
+
t-20260517083536-fd9c2-worker-0:10335:10939 [4] NCCL INFO NVLS multicast support is available on dev 4
|
| 167 |
+
t-20260517083536-fd9c2-worker-0:10333:11020 [2] NCCL INFO NVLS multicast support is available on dev 2
|
| 168 |
+
t-20260517083536-fd9c2-worker-0:10334:10949 [3] NCCL INFO NVLS multicast support is available on dev 3
|
| 169 |
+
t-20260517083536-fd9c2-worker-0:10337:10958 [6] NCCL INFO comm 0xe813ad0 rank 6 nRanks 8 nNodes 1 localRanks 8 localRank 6 MNNVL 0
|
| 170 |
+
t-20260517083536-fd9c2-worker-0:10337:10958 [6] NCCL INFO Trees [0] 7/-1/-1->6->5 [1] 7/-1/-1->6->5 [2] 7/-1/-1->6->5 [3] 7/-1/-1->6->5 [4] 7/-1/-1->6->5 [5] 7/-1/-1->6->5 [6] 7/-1/-1->6->5 [7] 7/-1/-1->6->5 [8] 7/-1/-1->6->5 [9] 7/-1/-1->6->5 [10] 7/-1/-1->6->5 [11] 7/-1/-1->6->5 [12] 7/-1/-1->6->5 [13] 7/-1/-1->6->5 [14] 7/-1/-1->6->5 [15] 7/-1/-1->6->5 [16] 7/-1/-1->6->5 [17] 7/-1/-1->6->5 [18] 7/-1/-1->6->5 [19] 7/-1/-1->6->5 [20] 7/-1/-1->6->5 [21] 7/-1/-1->6->5 [22] 7/-1/-1->6->5 [23] 7/-1/-1->6->5
|
| 171 |
+
t-20260517083536-fd9c2-worker-0:10337:10958 [6] NCCL INFO P2P Chunksize set to 524288
|
| 172 |
+
t-20260517083536-fd9c2-worker-0:10336:10986 [5] NCCL INFO comm 0xe38ef50 rank 5 nRanks 8 nNodes 1 localRanks 8 localRank 5 MNNVL 0
|
| 173 |
+
t-20260517083536-fd9c2-worker-0:10335:10939 [4] NCCL INFO comm 0xe76ac40 rank 4 nRanks 8 nNodes 1 localRanks 8 localRank 4 MNNVL 0
|
| 174 |
+
t-20260517083536-fd9c2-worker-0:10334:10949 [3] NCCL INFO comm 0xdf2c660 rank 3 nRanks 8 nNodes 1 localRanks 8 localRank 3 MNNVL 0
|
| 175 |
+
t-20260517083536-fd9c2-worker-0:10332:10959 [1] NCCL INFO comm 0xdfb2ce0 rank 1 nRanks 8 nNodes 1 localRanks 8 localRank 1 MNNVL 0
|
| 176 |
+
t-20260517083536-fd9c2-worker-0:10333:11020 [2] NCCL INFO comm 0xdbf3fd0 rank 2 nRanks 8 nNodes 1 localRanks 8 localRank 2 MNNVL 0
|
| 177 |
+
t-20260517083536-fd9c2-worker-0:10331:10928 [0] NCCL INFO comm 0xc57c980 rank 0 nRanks 8 nNodes 1 localRanks 8 localRank 0 MNNVL 0
|
| 178 |
+
t-20260517083536-fd9c2-worker-0:10338:10948 [7] NCCL INFO comm 0xf0141a0 rank 7 nRanks 8 nNodes 1 localRanks 8 localRank 7 MNNVL 0
|
| 179 |
+
t-20260517083536-fd9c2-worker-0:10336:10986 [5] NCCL INFO Trees [0] 6/-1/-1->5->4 [1] 6/-1/-1->5->4 [2] 6/-1/-1->5->4 [3] 6/-1/-1->5->4 [4] 6/-1/-1->5->4 [5] 6/-1/-1->5->4 [6] 6/-1/-1->5->4 [7] 6/-1/-1->5->4 [8] 6/-1/-1->5->4 [9] 6/-1/-1->5->4 [10] 6/-1/-1->5->4 [11] 6/-1/-1->5->4 [12] 6/-1/-1->5->4 [13] 6/-1/-1->5->4 [14] 6/-1/-1->5->4 [15] 6/-1/-1->5->4 [16] 6/-1/-1->5->4 [17] 6/-1/-1->5->4 [18] 6/-1/-1->5->4 [19] 6/-1/-1->5->4 [20] 6/-1/-1->5->4 [21] 6/-1/-1->5->4 [22] 6/-1/-1->5->4 [23] 6/-1/-1->5->4
|
| 180 |
+
t-20260517083536-fd9c2-worker-0:10331:10928 [0] NCCL INFO Channel 00/24 : 0 1 2 3 4 5 6 7
|
| 181 |
+
t-20260517083536-fd9c2-worker-0:10336:10986 [5] NCCL INFO P2P Chunksize set to 524288
|
| 182 |
+
t-20260517083536-fd9c2-worker-0:10331:10928 [0] NCCL INFO Channel 01/24 : 0 1 2 3 4 5 6 7
|
| 183 |
+
t-20260517083536-fd9c2-worker-0:10331:10928 [0] NCCL INFO Channel 02/24 : 0 1 2 3 4 5 6 7
|
| 184 |
+
t-20260517083536-fd9c2-worker-0:10334:10949 [3] NCCL INFO Trees [0] 4/-1/-1->3->2 [1] 4/-1/-1->3->2 [2] 4/-1/-1->3->2 [3] 4/-1/-1->3->2 [4] 4/-1/-1->3->2 [5] 4/-1/-1->3->2 [6] 4/-1/-1->3->2 [7] 4/-1/-1->3->2 [8] 4/-1/-1->3->2 [9] 4/-1/-1->3->2 [10] 4/-1/-1->3->2 [11] 4/-1/-1->3->2 [12] 4/-1/-1->3->2 [13] 4/-1/-1->3->2 [14] 4/-1/-1->3->2 [15] 4/-1/-1->3->2 [16] 4/-1/-1->3->2 [17] 4/-1/-1->3->2 [18] 4/-1/-1->3->2 [19] 4/-1/-1->3->2 [20] 4/-1/-1->3->2 [21] 4/-1/-1->3->2 [22] 4/-1/-1->3->2 [23] 4/-1/-1->3->2
|
| 185 |
+
t-20260517083536-fd9c2-worker-0:10332:10959 [1] NCCL INFO Trees [0] 2/-1/-1->1->0 [1] 2/-1/-1->1->0 [2] 2/-1/-1->1->0 [3] 2/-1/-1->1->0 [4] 2/-1/-1->1->0 [5] 2/-1/-1->1->0 [6] 2/-1/-1->1->0 [7] 2/-1/-1->1->0 [8] 2/-1/-1->1->0 [9] 2/-1/-1->1->0 [10] 2/-1/-1->1->0 [11] 2/-1/-1->1->0 [12] 2/-1/-1->1->0 [13] 2/-1/-1->1->0 [14] 2/-1/-1->1->0 [15] 2/-1/-1->1->0 [16] 2/-1/-1->1->0 [17] 2/-1/-1->1->0 [18] 2/-1/-1->1->0 [19] 2/-1/-1->1->0 [20] 2/-1/-1->1->0 [21] 2/-1/-1->1->0 [22] 2/-1/-1->1->0 [23] 2/-1/-1->1->0
|
| 186 |
+
t-20260517083536-fd9c2-worker-0:10331:10928 [0] NCCL INFO Channel 03/24 : 0 1 2 3 4 5 6 7
|
| 187 |
+
t-20260517083536-fd9c2-worker-0:10334:10949 [3] NCCL INFO P2P Chunksize set to 524288
|
| 188 |
+
t-20260517083536-fd9c2-worker-0:10332:10959 [1] NCCL INFO P2P Chunksize set to 524288
|
| 189 |
+
t-20260517083536-fd9c2-worker-0:10335:10939 [4] NCCL INFO Trees [0] 5/-1/-1->4->3 [1] 5/-1/-1->4->3 [2] 5/-1/-1->4->3 [3] 5/-1/-1->4->3 [4] 5/-1/-1->4->3 [5] 5/-1/-1->4->3 [6] 5/-1/-1->4->3 [7] 5/-1/-1->4->3 [8] 5/-1/-1->4->3 [9] 5/-1/-1->4->3 [10] 5/-1/-1->4->3 [11] 5/-1/-1->4->3 [12] 5/-1/-1->4->3 [13] 5/-1/-1->4->3 [14] 5/-1/-1->4->3 [15] 5/-1/-1->4->3 [16] 5/-1/-1->4->3 [17] 5/-1/-1->4->3 [18] 5/-1/-1->4->3 [19] 5/-1/-1->4->3 [20] 5/-1/-1->4->3 [21] 5/-1/-1->4->3 [22] 5/-1/-1->4->3 [23] 5/-1/-1->4->3
|
| 190 |
+
t-20260517083536-fd9c2-worker-0:10331:10928 [0] NCCL INFO Channel 04/24 : 0 1 2 3 4 5 6 7
|
| 191 |
+
t-20260517083536-fd9c2-worker-0:10333:11020 [2] NCCL INFO Trees [0] 3/-1/-1->2->1 [1] 3/-1/-1->2->1 [2] 3/-1/-1->2->1 [3] 3/-1/-1->2->1 [4] 3/-1/-1->2->1 [5] 3/-1/-1->2->1 [6] 3/-1/-1->2->1 [7] 3/-1/-1->2->1 [8] 3/-1/-1->2->1 [9] 3/-1/-1->2->1 [10] 3/-1/-1->2->1 [11] 3/-1/-1->2->1 [12] 3/-1/-1->2->1 [13] 3/-1/-1->2->1 [14] 3/-1/-1->2->1 [15] 3/-1/-1->2->1 [16] 3/-1/-1->2->1 [17] 3/-1/-1->2->1 [18] 3/-1/-1->2->1 [19] 3/-1/-1->2->1 [20] 3/-1/-1->2->1 [21] 3/-1/-1->2->1 [22] 3/-1/-1->2->1 [23] 3/-1/-1->2->1
|
| 192 |
+
t-20260517083536-fd9c2-worker-0:10338:10948 [7] NCCL INFO Trees [0] -1/-1/-1->7->6 [1] -1/-1/-1->7->6 [2] -1/-1/-1->7->6 [3] -1/-1/-1->7->6 [4] -1/-1/-1->7->6 [5] -1/-1/-1->7->6 [6] -1/-1/-1->7->6 [7] -1/-1/-1->7->6 [8] -1/-1/-1->7->6 [9] -1/-1/-1->7->6 [10] -1/-1/-1->7->6 [11] -1/-1/-1->7->6 [12] -1/-1/-1->7->6 [13] -1/-1/-1->7->6 [14] -1/-1/-1->7->6 [15] -1/-1/-1->7->6 [16] -1/-1/-1->7->6 [17] -1/-1/-1->7->6 [18] -1/-1/-1->7->6 [19] -1/-1/-1->7->6 [20] -1/-1/-1->7->6 [21] -1/-1/-1->7->6 [22] -1/-1/-1->7->6 [23] -1/-1/-1->7->6
|
| 193 |
+
t-20260517083536-fd9c2-worker-0:10331:10928 [0] NCCL INFO Channel 05/24 : 0 1 2 3 4 5 6 7
|
| 194 |
+
t-20260517083536-fd9c2-worker-0:10335:10939 [4] NCCL INFO P2P Chunksize set to 524288
|
| 195 |
+
t-20260517083536-fd9c2-worker-0:10333:11020 [2] NCCL INFO P2P Chunksize set to 524288
|
| 196 |
+
t-20260517083536-fd9c2-worker-0:10338:10948 [7] NCCL INFO P2P Chunksize set to 524288
|
| 197 |
+
t-20260517083536-fd9c2-worker-0:10331:10928 [0] NCCL INFO Channel 06/24 : 0 1 2 3 4 5 6 7
|
| 198 |
+
t-20260517083536-fd9c2-worker-0:10331:10928 [0] NCCL INFO Channel 07/24 : 0 1 2 3 4 5 6 7
|
| 199 |
+
t-20260517083536-fd9c2-worker-0:10331:10928 [0] NCCL INFO Channel 08/24 : 0 1 2 3 4 5 6 7
|
| 200 |
+
t-20260517083536-fd9c2-worker-0:10331:10928 [0] NCCL INFO Channel 09/24 : 0 1 2 3 4 5 6 7
|
| 201 |
+
t-20260517083536-fd9c2-worker-0:10331:10928 [0] NCCL INFO Channel 10/24 : 0 1 2 3 4 5 6 7
|
| 202 |
+
t-20260517083536-fd9c2-worker-0:10331:10928 [0] NCCL INFO Channel 11/24 : 0 1 2 3 4 5 6 7
|
| 203 |
+
t-20260517083536-fd9c2-worker-0:10331:10928 [0] NCCL INFO Channel 12/24 : 0 1 2 3 4 5 6 7
|
| 204 |
+
t-20260517083536-fd9c2-worker-0:10331:10928 [0] NCCL INFO Channel 13/24 : 0 1 2 3 4 5 6 7
|
| 205 |
+
t-20260517083536-fd9c2-worker-0:10331:10928 [0] NCCL INFO Channel 14/24 : 0 1 2 3 4 5 6 7
|
| 206 |
+
t-20260517083536-fd9c2-worker-0:10331:10928 [0] NCCL INFO Channel 15/24 : 0 1 2 3 4 5 6 7
|
| 207 |
+
t-20260517083536-fd9c2-worker-0:10331:10928 [0] NCCL INFO Channel 16/24 : 0 1 2 3 4 5 6 7
|
| 208 |
+
t-20260517083536-fd9c2-worker-0:10331:10928 [0] NCCL INFO Channel 17/24 : 0 1 2 3 4 5 6 7
|
| 209 |
+
t-20260517083536-fd9c2-worker-0:10331:10928 [0] NCCL INFO Channel 18/24 : 0 1 2 3 4 5 6 7
|
| 210 |
+
t-20260517083536-fd9c2-worker-0:10331:10928 [0] NCCL INFO Channel 19/24 : 0 1 2 3 4 5 6 7
|
| 211 |
+
t-20260517083536-fd9c2-worker-0:10331:10928 [0] NCCL INFO Channel 20/24 : 0 1 2 3 4 5 6 7
|
| 212 |
+
t-20260517083536-fd9c2-worker-0:10331:10928 [0] NCCL INFO Channel 21/24 : 0 1 2 3 4 5 6 7
|
| 213 |
+
t-20260517083536-fd9c2-worker-0:10331:10928 [0] NCCL INFO Channel 22/24 : 0 1 2 3 4 5 6 7
|
| 214 |
+
t-20260517083536-fd9c2-worker-0:10331:10928 [0] NCCL INFO Channel 23/24 : 0 1 2 3 4 5 6 7
|
| 215 |
+
t-20260517083536-fd9c2-worker-0:10331:10928 [0] NCCL INFO Trees [0] 1/-1/-1->0->-1 [1] 1/-1/-1->0->-1 [2] 1/-1/-1->0->-1 [3] 1/-1/-1->0->-1 [4] 1/-1/-1->0->-1 [5] 1/-1/-1->0->-1 [6] 1/-1/-1->0->-1 [7] 1/-1/-1->0->-1 [8] 1/-1/-1->0->-1 [9] 1/-1/-1->0->-1 [10] 1/-1/-1->0->-1 [11] 1/-1/-1->0->-1 [12] 1/-1/-1->0->-1 [13] 1/-1/-1->0->-1 [14] 1/-1/-1->0->-1 [15] 1/-1/-1->0->-1 [16] 1/-1/-1->0->-1 [17] 1/-1/-1->0->-1 [18] 1/-1/-1->0->-1 [19] 1/-1/-1->0->-1 [20] 1/-1/-1->0->-1 [21] 1/-1/-1->0->-1 [22] 1/-1/-1->0->-1 [23] 1/-1/-1->0->-1
|
| 216 |
+
t-20260517083536-fd9c2-worker-0:10331:10928 [0] NCCL INFO P2P Chunksize set to 524288
|
| 217 |
+
t-20260517083536-fd9c2-worker-0:10336:11032 [5] NCCL INFO [Proxy Service] Device 5 CPU core 136
|
| 218 |
+
t-20260517083536-fd9c2-worker-0:10336:11033 [5] NCCL INFO [Proxy Service UDS] Device 5 CPU core 138
|
| 219 |
+
t-20260517083536-fd9c2-worker-0:10331:10928 [0] NCCL INFO Check P2P Type intraNodeP2pSupport 1 directMode 0
|
| 220 |
+
t-20260517083536-fd9c2-worker-0:10331:11035 [0] NCCL INFO [Proxy Service] Device 0 CPU core 2
|
| 221 |
+
t-20260517083536-fd9c2-worker-0:10333:11036 [2] NCCL INFO [Proxy Service UDS] Device 2 CPU core 4
|
| 222 |
+
t-20260517083536-fd9c2-worker-0:10337:11038 [6] NCCL INFO [Proxy Service] Device 6 CPU core 168
|
| 223 |
+
t-20260517083536-fd9c2-worker-0:10333:11034 [2] NCCL INFO [Proxy Service] Device 2 CPU core 2
|
| 224 |
+
t-20260517083536-fd9c2-worker-0:10331:11037 [0] NCCL INFO [Proxy Service UDS] Device 0 CPU core 6
|
| 225 |
+
t-20260517083536-fd9c2-worker-0:10337:11039 [6] NCCL INFO [Proxy Service UDS] Device 6 CPU core 170
|
| 226 |
+
t-20260517083536-fd9c2-worker-0:10334:11040 [3] NCCL INFO [Proxy Service] Device 3 CPU core 18
|
| 227 |
+
t-20260517083536-fd9c2-worker-0:10334:11041 [3] NCCL INFO [Proxy Service UDS] Device 3 CPU core 20
|
| 228 |
+
t-20260517083536-fd9c2-worker-0:10332:11042 [1] NCCL INFO [Proxy Service] Device 1 CPU core 84
|
| 229 |
+
t-20260517083536-fd9c2-worker-0:10332:11043 [1] NCCL INFO [Proxy Service UDS] Device 1 CPU core 87
|
| 230 |
+
t-20260517083536-fd9c2-worker-0:10338:11044 [7] NCCL INFO [Proxy Service] Device 7 CPU core 92
|
| 231 |
+
t-20260517083536-fd9c2-worker-0:10338:11045 [7] NCCL INFO [Proxy Service UDS] Device 7 CPU core 94
|
| 232 |
+
t-20260517083536-fd9c2-worker-0:10335:11046 [4] NCCL INFO [Proxy Service] Device 4 CPU core 92
|
| 233 |
+
t-20260517083536-fd9c2-worker-0:10335:11047 [4] NCCL INFO [Proxy Service UDS] Device 4 CPU core 94
|
| 234 |
+
t-20260517083536-fd9c2-worker-0:10331:10928 [0] NCCL INFO threadThresholds 8/8/64 | 64/8/64 | 512 | 512
|
| 235 |
+
t-20260517083536-fd9c2-worker-0:10331:10928 [0] NCCL INFO 24 coll channels, 24 collnet channels, 16 nvls channels, 32 p2p channels, 32 p2p channels per peer
|
| 236 |
+
t-20260517083536-fd9c2-worker-0:10331:10928 [0] NCCL INFO CC Off, workFifoBytes 1048576
|
| 237 |
+
t-20260517083536-fd9c2-worker-0:10337:10958 [6] NCCL INFO threadThresholds 8/8/64 | 64/8/64 | 512 | 512
|
| 238 |
+
t-20260517083536-fd9c2-worker-0:10334:10949 [3] NCCL INFO threadThresholds 8/8/64 | 64/8/64 | 512 | 512
|
| 239 |
+
t-20260517083536-fd9c2-worker-0:10332:10959 [1] NCCL INFO threadThresholds 8/8/64 | 64/8/64 | 512 | 512
|
| 240 |
+
t-20260517083536-fd9c2-worker-0:10337:10958 [6] NCCL INFO 24 coll channels, 24 collnet channels, 16 nvls channels, 32 p2p channels, 32 p2p channels per peer
|
| 241 |
+
t-20260517083536-fd9c2-worker-0:10334:10949 [3] NCCL INFO 24 coll channels, 24 collnet channels, 16 nvls channels, 32 p2p channels, 32 p2p channels per peer
|
| 242 |
+
t-20260517083536-fd9c2-worker-0:10332:10959 [1] NCCL INFO 24 coll channels, 24 collnet channels, 16 nvls channels, 32 p2p channels, 32 p2p channels per peer
|
| 243 |
+
t-20260517083536-fd9c2-worker-0:10338:10948 [7] NCCL INFO threadThresholds 8/8/64 | 64/8/64 | 512 | 512
|
| 244 |
+
t-20260517083536-fd9c2-worker-0:10338:10948 [7] NCCL INFO 24 coll channels, 24 collnet channels, 16 nvls channels, 32 p2p channels, 32 p2p channels per peer
|
| 245 |
+
t-20260517083536-fd9c2-worker-0:10333:11020 [2] NCCL INFO threadThresholds 8/8/64 | 64/8/64 | 512 | 512
|
| 246 |
+
t-20260517083536-fd9c2-worker-0:10333:11020 [2] NCCL INFO 24 coll channels, 24 collnet channels, 16 nvls channels, 32 p2p channels, 32 p2p channels per peer
|
| 247 |
+
t-20260517083536-fd9c2-worker-0:10336:10986 [5] NCCL INFO threadThresholds 8/8/64 | 64/8/64 | 512 | 512
|
| 248 |
+
t-20260517083536-fd9c2-worker-0:10336:10986 [5] NCCL INFO 24 coll channels, 24 collnet channels, 16 nvls channels, 32 p2p channels, 32 p2p channels per peer
|
| 249 |
+
t-20260517083536-fd9c2-worker-0:10335:10939 [4] NCCL INFO threadThresholds 8/8/64 | 64/8/64 | 512 | 512
|
| 250 |
+
t-20260517083536-fd9c2-worker-0:10335:10939 [4] NCCL INFO 24 coll channels, 24 collnet channels, 16 nvls channels, 32 p2p channels, 32 p2p channels per peer
|
| 251 |
+
t-20260517083536-fd9c2-worker-0:10337:10958 [6] NCCL INFO TUNER/Plugin: Failed to find ncclTunerPlugin_v4 symbol.
|
| 252 |
+
t-20260517083536-fd9c2-worker-0:10334:10949 [3] NCCL INFO TUNER/Plugin: Failed to find ncclTunerPlugin_v4 symbol.
|
| 253 |
+
t-20260517083536-fd9c2-worker-0:10335:10939 [4] NCCL INFO TUNER/Plugin: Failed to find ncclTunerPlugin_v4 symbol.
|
| 254 |
+
t-20260517083536-fd9c2-worker-0:10332:10959 [1] NCCL INFO TUNER/Plugin: Failed to find ncclTunerPlugin_v4 symbol.
|
| 255 |
+
t-20260517083536-fd9c2-worker-0:10331:10928 [0] NCCL INFO TUNER/Plugin: Failed to find ncclTunerPlugin_v4 symbol.
|
| 256 |
+
t-20260517083536-fd9c2-worker-0:10338:10948 [7] NCCL INFO TUNER/Plugin: Failed to find ncclTunerPlugin_v4 symbol.
|
| 257 |
+
t-20260517083536-fd9c2-worker-0:10333:11020 [2] NCCL INFO TUNER/Plugin: Failed to find ncclTunerPlugin_v4 symbol.
|
| 258 |
+
t-20260517083536-fd9c2-worker-0:10337:10958 [6] NCCL INFO TUNER/Plugin: Failed to find ncclTunerPlugin_v3 symbol.
|
| 259 |
+
t-20260517083536-fd9c2-worker-0:10335:10939 [4] NCCL INFO TUNER/Plugin: Failed to find ncclTunerPlugin_v3 symbol.
|
| 260 |
+
t-20260517083536-fd9c2-worker-0:10334:10949 [3] NCCL INFO TUNER/Plugin: Failed to find ncclTunerPlugin_v3 symbol.
|
| 261 |
+
t-20260517083536-fd9c2-worker-0:10332:10959 [1] NCCL INFO TUNER/Plugin: Failed to find ncclTunerPlugin_v3 symbol.
|
| 262 |
+
t-20260517083536-fd9c2-worker-0:10331:10928 [0] NCCL INFO TUNER/Plugin: Failed to find ncclTunerPlugin_v3 symbol.
|
| 263 |
+
t-20260517083536-fd9c2-worker-0:10335:10939 [4] NCCL INFO TUNER/Plugin: Failed to find ncclTunerPlugin_v2 symbol, using internal tuner instead.
|
| 264 |
+
t-20260517083536-fd9c2-worker-0:10334:10949 [3] NCCL INFO TUNER/Plugin: Failed to find ncclTunerPlugin_v2 symbol, using internal tuner instead.
|
| 265 |
+
t-20260517083536-fd9c2-worker-0:10338:10948 [7] NCCL INFO TUNER/Plugin: Failed to find ncclTunerPlugin_v3 symbol.
|
| 266 |
+
t-20260517083536-fd9c2-worker-0:10333:11020 [2] NCCL INFO TUNER/Plugin: Failed to find ncclTunerPlugin_v3 symbol.
|
| 267 |
+
t-20260517083536-fd9c2-worker-0:10332:10959 [1] NCCL INFO TUNER/Plugin: Failed to find ncclTunerPlugin_v2 symbol, using internal tuner instead.
|
| 268 |
+
t-20260517083536-fd9c2-worker-0:10338:10948 [7] NCCL INFO TUNER/Plugin: Failed to find ncclTunerPlugin_v2 symbol, using internal tuner instead.
|
| 269 |
+
t-20260517083536-fd9c2-worker-0:10337:10958 [6] NCCL INFO TUNER/Plugin: Failed to find ncclTunerPlugin_v2 symbol, using internal tuner instead.
|
| 270 |
+
t-20260517083536-fd9c2-worker-0:10333:11020 [2] NCCL INFO TUNER/Plugin: Failed to find ncclTunerPlugin_v2 symbol, using internal tuner instead.
|
| 271 |
+
t-20260517083536-fd9c2-worker-0:10338:10948 [7] NCCL INFO ncclCommInitRankConfig comm 0xf0141a0 rank 7 nranks 8 cudaDev 7 nvmlDev 7 busId 75020 commId 0x4a1a7f483027896c - Init COMPLETE
|
| 272 |
+
t-20260517083536-fd9c2-worker-0:10331:10928 [0] NCCL INFO TUNER/Plugin: Failed to find ncclTunerPlugin_v2 symbol, using internal tuner instead.
|
| 273 |
+
t-20260517083536-fd9c2-worker-0:10335:10939 [4] NCCL INFO ncclCommInitRankConfig comm 0xe76ac40 rank 4 nranks 8 cudaDev 4 nvmlDev 4 busId 6f020 commId 0x4a1a7f483027896c - Init COMPLETE
|
| 274 |
+
t-20260517083536-fd9c2-worker-0:10334:10949 [3] NCCL INFO ncclCommInitRankConfig comm 0xdf2c660 rank 3 nranks 8 cudaDev 3 nvmlDev 3 busId 6b020 commId 0x4a1a7f483027896c - Init COMPLETE
|
| 275 |
+
t-20260517083536-fd9c2-worker-0:10332:10959 [1] NCCL INFO ncclCommInitRankConfig comm 0xdfb2ce0 rank 1 nranks 8 cudaDev 1 nvmlDev 1 busId 67020 commId 0x4a1a7f483027896c - Init COMPLETE
|
| 276 |
+
t-20260517083536-fd9c2-worker-0:10337:10958 [6] NCCL INFO ncclCommInitRankConfig comm 0xe813ad0 rank 6 nranks 8 cudaDev 6 nvmlDev 6 busId 73020 commId 0x4a1a7f483027896c - Init COMPLETE
|
| 277 |
+
t-20260517083536-fd9c2-worker-0:10333:11020 [2] NCCL INFO ncclCommInitRankConfig comm 0xdbf3fd0 rank 2 nranks 8 cudaDev 2 nvmlDev 2 busId 69020 commId 0x4a1a7f483027896c - Init COMPLETE
|
| 278 |
+
t-20260517083536-fd9c2-worker-0:10331:10928 [0] NCCL INFO ncclCommInitRankConfig comm 0xc57c980 rank 0 nranks 8 cudaDev 0 nvmlDev 0 busId 65040 commId 0x4a1a7f483027896c - Init COMPLETE
|
| 279 |
+
t-20260517083536-fd9c2-worker-0:10338:10948 [7] NCCL INFO Init timings - ncclCommInitRankConfig: rank 7 nranks 8 total 3.34 (kernels 0.20, alloc 0.18, bootstrap 2.00, allgathers 0.02, topo 0.53, graphs 0.01, connections 0.36, rest 0.03)
|
| 280 |
+
t-20260517083536-fd9c2-worker-0:10335:10939 [4] NCCL INFO Init timings - ncclCommInitRankConfig: rank 4 nranks 8 total 3.80 (kernels 0.24, alloc 0.18, bootstrap 2.42, allgathers 0.01, topo 0.54, graphs 0.01, connections 0.36, rest 0.04)
|
| 281 |
+
t-20260517083536-fd9c2-worker-0:10332:10959 [1] NCCL INFO Init timings - ncclCommInitRankConfig: rank 1 nranks 8 total 2.89 (kernels 0.29, alloc 0.18, bootstrap 1.47, allgathers 0.02, topo 0.54, graphs 0.01, connections 0.36, rest 0.03)
|
| 282 |
+
t-20260517083536-fd9c2-worker-0:10334:10949 [3] NCCL INFO Init timings - ncclCommInitRankConfig: rank 3 nranks 8 total 3.15 (kernels 0.25, alloc 0.19, bootstrap 1.76, allgathers 0.01, topo 0.54, graphs 0.01, connections 0.37, rest 0.03)
|
| 283 |
+
t-20260517083536-fd9c2-worker-0:10337:10958 [6] NCCL INFO Init timings - ncclCommInitRankConfig: rank 6 nranks 8 total 3.00 (kernels 0.26, alloc 0.18, bootstrap 1.61, allgathers 0.02, topo 0.53, graphs 0.01, connections 0.37, rest 0.02)
|
| 284 |
+
t-20260517083536-fd9c2-worker-0:10331:10928 [0] NCCL INFO Init timings - ncclCommInitRankConfig: rank 0 nranks 8 total 5.41 (kernels 0.21, alloc 0.18, bootstrap 4.07, allgathers 0.02, topo 0.53, graphs 0.01, connections 0.37, rest 0.03)
|
| 285 |
+
t-20260517083536-fd9c2-worker-0:10333:11020 [2] NCCL INFO Init timings - ncclCommInitRankConfig: rank 2 nranks 8 total 1.31 (kernels 0.18, alloc 0.17, bootstrap 0.00, allgathers 0.01, topo 0.54, graphs 0.01, connections 0.37, rest 0.03)
|
| 286 |
+
t-20260517083536-fd9c2-worker-0:10336:10986 [5] NCCL INFO TUNER/Plugin: Failed to find ncclTunerPlugin_v4 symbol.
|
| 287 |
+
t-20260517083536-fd9c2-worker-0:10336:10986 [5] NCCL INFO TUNER/Plugin: Failed to find ncclTunerPlugin_v3 symbol.
|
| 288 |
+
t-20260517083536-fd9c2-worker-0:10336:10986 [5] NCCL INFO TUNER/Plugin: Failed to find ncclTunerPlugin_v2 symbol, using internal tuner instead.
|
| 289 |
+
t-20260517083536-fd9c2-worker-0:10336:10986 [5] NCCL INFO ncclCommInitRankConfig comm 0xe38ef50 rank 5 nranks 8 cudaDev 5 nvmlDev 5 busId 71020 commId 0x4a1a7f483027896c - Init COMPLETE
|
| 290 |
+
t-20260517083536-fd9c2-worker-0:10336:10986 [5] NCCL INFO Init timings - ncclCommInitRankConfig: rank 5 nranks 8 total 2.15 (kernels 0.18, alloc 0.21, bootstrap 0.80, allgathers 0.02, topo 0.54, graphs 0.01, connections 0.38, rest 0.02)
|
| 291 |
+
t-20260517083536-fd9c2-worker-0:10332:11049 [1] NCCL INFO Channel 00/0 : 1[1] -> 2[2] via P2P/CUMEM
|
| 292 |
+
t-20260517083536-fd9c2-worker-0:10334:11050 [3] NCCL INFO Channel 00/0 : 3[3] -> 4[4] via P2P/CUMEM
|
| 293 |
+
t-20260517083536-fd9c2-worker-0:10336:11053 [5] NCCL INFO Channel 00/0 : 5[5] -> 6[6] via P2P/CUMEM
|
| 294 |
+
t-20260517083536-fd9c2-worker-0:10336:11053 [5] NCCL INFO Channel 01/0 : 5[5] -> 6[6] via P2P/CUMEM
|
| 295 |
+
t-20260517083536-fd9c2-worker-0:10331:11056 [0] NCCL INFO Channel 00/0 : 0[0] -> 1[1] via P2P/CUMEM
|
| 296 |
+
t-20260517083536-fd9c2-worker-0:10336:11053 [5] NCCL INFO Channel 02/0 : 5[5] -> 6[6] via P2P/CUMEM
|
| 297 |
+
t-20260517083536-fd9c2-worker-0:10332:11049 [1] NCCL INFO Channel 01/0 : 1[1] -> 2[2] via P2P/CUMEM
|
| 298 |
+
t-20260517083536-fd9c2-worker-0:10331:11056 [0] NCCL INFO Channel 01/0 : 0[0] -> 1[1] via P2P/CUMEM
|
| 299 |
+
t-20260517083536-fd9c2-worker-0:10336:11053 [5] NCCL INFO Channel 03/0 : 5[5] -> 6[6] via P2P/CUMEM
|
| 300 |
+
t-20260517083536-fd9c2-worker-0:10335:11055 [4] NCCL INFO Channel 00/0 : 4[4] -> 5[5] via P2P/CUMEM
|
| 301 |
+
t-20260517083536-fd9c2-worker-0:10332:11049 [1] NCCL INFO Channel 02/0 : 1[1] -> 2[2] via P2P/CUMEM
|
| 302 |
+
t-20260517083536-fd9c2-worker-0:10331:11056 [0] NCCL INFO Channel 02/0 : 0[0] -> 1[1] via P2P/CUMEM
|
| 303 |
+
t-20260517083536-fd9c2-worker-0:10336:11053 [5] NCCL INFO Channel 04/0 : 5[5] -> 6[6] via P2P/CUMEM
|
| 304 |
+
t-20260517083536-fd9c2-worker-0:10335:11055 [4] NCCL INFO Channel 01/0 : 4[4] -> 5[5] via P2P/CUMEM
|
| 305 |
+
t-20260517083536-fd9c2-worker-0:10332:11049 [1] NCCL INFO Channel 03/0 : 1[1] -> 2[2] via P2P/CUMEM
|
| 306 |
+
t-20260517083536-fd9c2-worker-0:10331:11056 [0] NCCL INFO Channel 03/0 : 0[0] -> 1[1] via P2P/CUMEM
|
| 307 |
+
t-20260517083536-fd9c2-worker-0:10336:11053 [5] NCCL INFO Channel 05/0 : 5[5] -> 6[6] via P2P/CUMEM
|
| 308 |
+
t-20260517083536-fd9c2-worker-0:10335:11055 [4] NCCL INFO Channel 02/0 : 4[4] -> 5[5] via P2P/CUMEM
|
| 309 |
+
t-20260517083536-fd9c2-worker-0:10332:11049 [1] NCCL INFO Channel 04/0 : 1[1] -> 2[2] via P2P/CUMEM
|
| 310 |
+
t-20260517083536-fd9c2-worker-0:10338:11054 [7] NCCL INFO Channel 00/0 : 7[7] -> 0[0] via P2P/CUMEM
|
| 311 |
+
t-20260517083536-fd9c2-worker-0:10338:11054 [7] NCCL INFO Channel 01/0 : 7[7] -> 0[0] via P2P/CUMEM
|
| 312 |
+
t-20260517083536-fd9c2-worker-0:10338:11054 [7] NCCL INFO Channel 02/0 : 7[7] -> 0[0] via P2P/CUMEM
|
| 313 |
+
t-20260517083536-fd9c2-worker-0:10338:11054 [7] NCCL INFO Channel 03/0 : 7[7] -> 0[0] via P2P/CUMEM
|
| 314 |
+
t-20260517083536-fd9c2-worker-0:10337:11052 [6] NCCL INFO Channel 00/0 : 6[6] -> 7[7] via P2P/CUMEM
|
| 315 |
+
t-20260517083536-fd9c2-worker-0:10338:11054 [7] NCCL INFO Channel 04/0 : 7[7] -> 0[0] via P2P/CUMEM
|
| 316 |
+
t-20260517083536-fd9c2-worker-0:10337:11052 [6] NCCL INFO Channel 01/0 : 6[6] -> 7[7] via P2P/CUMEM
|
| 317 |
+
t-20260517083536-fd9c2-worker-0:10333:11051 [2] NCCL INFO Channel 00/0 : 2[2] -> 3[3] via P2P/CUMEM
|
| 318 |
+
t-20260517083536-fd9c2-worker-0:10338:11054 [7] NCCL INFO Channel 05/0 : 7[7] -> 0[0] via P2P/CUMEM
|
| 319 |
+
t-20260517083536-fd9c2-worker-0:10337:11052 [6] NCCL INFO Channel 02/0 : 6[6] -> 7[7] via P2P/CUMEM
|
| 320 |
+
t-20260517083536-fd9c2-worker-0:10333:11051 [2] NCCL INFO Channel 01/0 : 2[2] -> 3[3] via P2P/CUMEM
|
| 321 |
+
t-20260517083536-fd9c2-worker-0:10338:11054 [7] NCCL INFO Channel 06/0 : 7[7] -> 0[0] via P2P/CUMEM
|
| 322 |
+
t-20260517083536-fd9c2-worker-0:10332:11049 [1] NCCL INFO Channel 05/0 : 1[1] -> 2[2] via P2P/CUMEM
|
| 323 |
+
t-20260517083536-fd9c2-worker-0:10337:11052 [6] NCCL INFO Channel 03/0 : 6[6] -> 7[7] via P2P/CUMEM
|
| 324 |
+
t-20260517083536-fd9c2-worker-0:10333:11051 [2] NCCL INFO Channel 02/0 : 2[2] -> 3[3] via P2P/CUMEM
|
| 325 |
+
t-20260517083536-fd9c2-worker-0:10338:11054 [7] NCCL INFO Channel 07/0 : 7[7] -> 0[0] via P2P/CUMEM
|
| 326 |
+
t-20260517083536-fd9c2-worker-0:10332:11049 [1] NCCL INFO Channel 06/0 : 1[1] -> 2[2] via P2P/CUMEM
|
| 327 |
+
t-20260517083536-fd9c2-worker-0:10337:11052 [6] NCCL INFO Channel 04/0 : 6[6] -> 7[7] via P2P/CUMEM
|
| 328 |
+
t-20260517083536-fd9c2-worker-0:10333:11051 [2] NCCL INFO Channel 03/0 : 2[2] -> 3[3] via P2P/CUMEM
|
| 329 |
+
t-20260517083536-fd9c2-worker-0:10338:11054 [7] NCCL INFO Channel 08/0 : 7[7] -> 0[0] via P2P/CUMEM
|
| 330 |
+
t-20260517083536-fd9c2-worker-0:10332:11049 [1] NCCL INFO Channel 07/0 : 1[1] -> 2[2] via P2P/CUMEM
|
| 331 |
+
t-20260517083536-fd9c2-worker-0:10337:11052 [6] NCCL INFO Channel 05/0 : 6[6] -> 7[7] via P2P/CUMEM
|
| 332 |
+
t-20260517083536-fd9c2-worker-0:10333:11051 [2] NCCL INFO Channel 04/0 : 2[2] -> 3[3] via P2P/CUMEM
|
| 333 |
+
t-20260517083536-fd9c2-worker-0:10338:11054 [7] NCCL INFO Channel 09/0 : 7[7] -> 0[0] via P2P/CUMEM
|
| 334 |
+
t-20260517083536-fd9c2-worker-0:10332:11049 [1] NCCL INFO Channel 08/0 : 1[1] -> 2[2] via P2P/CUMEM
|
| 335 |
+
t-20260517083536-fd9c2-worker-0:10331:11056 [0] NCCL INFO Channel 04/0 : 0[0] -> 1[1] via P2P/CUMEM
|
| 336 |
+
t-20260517083536-fd9c2-worker-0:10337:11052 [6] NCCL INFO Channel 06/0 : 6[6] -> 7[7] via P2P/CUMEM
|
| 337 |
+
t-20260517083536-fd9c2-worker-0:10336:11053 [5] NCCL INFO Channel 06/0 : 5[5] -> 6[6] via P2P/CUMEM
|
| 338 |
+
t-20260517083536-fd9c2-worker-0:10334:11050 [3] NCCL INFO Channel 01/0 : 3[3] -> 4[4] via P2P/CUMEM
|
| 339 |
+
t-20260517083536-fd9c2-worker-0:10333:11051 [2] NCCL INFO Channel 05/0 : 2[2] -> 3[3] via P2P/CUMEM
|
| 340 |
+
t-20260517083536-fd9c2-worker-0:10335:11055 [4] NCCL INFO Channel 03/0 : 4[4] -> 5[5] via P2P/CUMEM
|
| 341 |
+
t-20260517083536-fd9c2-worker-0:10338:11054 [7] NCCL INFO Channel 10/0 : 7[7] -> 0[0] via P2P/CUMEM
|
| 342 |
+
t-20260517083536-fd9c2-worker-0:10332:11049 [1] NCCL INFO Channel 09/0 : 1[1] -> 2[2] via P2P/CUMEM
|
| 343 |
+
t-20260517083536-fd9c2-worker-0:10331:11056 [0] NCCL INFO Channel 05/0 : 0[0] -> 1[1] via P2P/CUMEM
|
| 344 |
+
t-20260517083536-fd9c2-worker-0:10337:11052 [6] NCCL INFO Channel 07/0 : 6[6] -> 7[7] via P2P/CUMEM
|
| 345 |
+
t-20260517083536-fd9c2-worker-0:10336:11053 [5] NCCL INFO Channel 07/0 : 5[5] -> 6[6] via P2P/CUMEM
|
| 346 |
+
t-20260517083536-fd9c2-worker-0:10334:11050 [3] NCCL INFO Channel 02/0 : 3[3] -> 4[4] via P2P/CUMEM
|
| 347 |
+
t-20260517083536-fd9c2-worker-0:10333:11051 [2] NCCL INFO Channel 06/0 : 2[2] -> 3[3] via P2P/CUMEM
|
| 348 |
+
t-20260517083536-fd9c2-worker-0:10335:11055 [4] NCCL INFO Channel 04/0 : 4[4] -> 5[5] via P2P/CUMEM
|
| 349 |
+
t-20260517083536-fd9c2-worker-0:10338:11054 [7] NCCL INFO Channel 11/0 : 7[7] -> 0[0] via P2P/CUMEM
|
| 350 |
+
t-20260517083536-fd9c2-worker-0:10332:11049 [1] NCCL INFO Channel 10/0 : 1[1] -> 2[2] via P2P/CUMEM
|
| 351 |
+
t-20260517083536-fd9c2-worker-0:10331:11056 [0] NCCL INFO Channel 06/0 : 0[0] -> 1[1] via P2P/CUMEM
|
| 352 |
+
t-20260517083536-fd9c2-worker-0:10337:11052 [6] NCCL INFO Channel 08/0 : 6[6] -> 7[7] via P2P/CUMEM
|
| 353 |
+
t-20260517083536-fd9c2-worker-0:10336:11053 [5] NCCL INFO Channel 08/0 : 5[5] -> 6[6] via P2P/CUMEM
|
| 354 |
+
t-20260517083536-fd9c2-worker-0:10334:11050 [3] NCCL INFO Channel 03/0 : 3[3] -> 4[4] via P2P/CUMEM
|
| 355 |
+
t-20260517083536-fd9c2-worker-0:10333:11051 [2] NCCL INFO Channel 07/0 : 2[2] -> 3[3] via P2P/CUMEM
|
| 356 |
+
t-20260517083536-fd9c2-worker-0:10335:11055 [4] NCCL INFO Channel 05/0 : 4[4] -> 5[5] via P2P/CUMEM
|
| 357 |
+
t-20260517083536-fd9c2-worker-0:10338:11054 [7] NCCL INFO Channel 12/0 : 7[7] -> 0[0] via P2P/CUMEM
|
| 358 |
+
t-20260517083536-fd9c2-worker-0:10332:11049 [1] NCCL INFO Channel 11/0 : 1[1] -> 2[2] via P2P/CUMEM
|
| 359 |
+
t-20260517083536-fd9c2-worker-0:10331:11056 [0] NCCL INFO Channel 07/0 : 0[0] -> 1[1] via P2P/CUMEM
|
| 360 |
+
t-20260517083536-fd9c2-worker-0:10337:11052 [6] NCCL INFO Channel 09/0 : 6[6] -> 7[7] via P2P/CUMEM
|
| 361 |
+
t-20260517083536-fd9c2-worker-0:10336:11053 [5] NCCL INFO Channel 09/0 : 5[5] -> 6[6] via P2P/CUMEM
|
| 362 |
+
t-20260517083536-fd9c2-worker-0:10334:11050 [3] NCCL INFO Channel 04/0 : 3[3] -> 4[4] via P2P/CUMEM
|
| 363 |
+
t-20260517083536-fd9c2-worker-0:10333:11051 [2] NCCL INFO Channel 08/0 : 2[2] -> 3[3] via P2P/CUMEM
|
| 364 |
+
t-20260517083536-fd9c2-worker-0:10335:11055 [4] NCCL INFO Channel 06/0 : 4[4] -> 5[5] via P2P/CUMEM
|
| 365 |
+
t-20260517083536-fd9c2-worker-0:10338:11054 [7] NCCL INFO Channel 13/0 : 7[7] -> 0[0] via P2P/CUMEM
|
| 366 |
+
t-20260517083536-fd9c2-worker-0:10332:11049 [1] NCCL INFO Channel 12/0 : 1[1] -> 2[2] via P2P/CUMEM
|
| 367 |
+
t-20260517083536-fd9c2-worker-0:10331:11056 [0] NCCL INFO Channel 08/0 : 0[0] -> 1[1] via P2P/CUMEM
|
| 368 |
+
t-20260517083536-fd9c2-worker-0:10337:11052 [6] NCCL INFO Channel 10/0 : 6[6] -> 7[7] via P2P/CUMEM
|
| 369 |
+
t-20260517083536-fd9c2-worker-0:10336:11053 [5] NCCL INFO Channel 10/0 : 5[5] -> 6[6] via P2P/CUMEM
|
| 370 |
+
t-20260517083536-fd9c2-worker-0:10334:11050 [3] NCCL INFO Channel 05/0 : 3[3] -> 4[4] via P2P/CUMEM
|
| 371 |
+
t-20260517083536-fd9c2-worker-0:10333:11051 [2] NCCL INFO Channel 09/0 : 2[2] -> 3[3] via P2P/CUMEM
|
| 372 |
+
t-20260517083536-fd9c2-worker-0:10335:11055 [4] NCCL INFO Channel 07/0 : 4[4] -> 5[5] via P2P/CUMEM
|
| 373 |
+
t-20260517083536-fd9c2-worker-0:10338:11054 [7] NCCL INFO Channel 14/0 : 7[7] -> 0[0] via P2P/CUMEM
|
| 374 |
+
t-20260517083536-fd9c2-worker-0:10332:11049 [1] NCCL INFO Channel 13/0 : 1[1] -> 2[2] via P2P/CUMEM
|
| 375 |
+
t-20260517083536-fd9c2-worker-0:10331:11056 [0] NCCL INFO Channel 09/0 : 0[0] -> 1[1] via P2P/CUMEM
|
| 376 |
+
t-20260517083536-fd9c2-worker-0:10337:11052 [6] NCCL INFO Channel 11/0 : 6[6] -> 7[7] via P2P/CUMEM
|
| 377 |
+
t-20260517083536-fd9c2-worker-0:10336:11053 [5] NCCL INFO Channel 11/0 : 5[5] -> 6[6] via P2P/CUMEM
|
| 378 |
+
t-20260517083536-fd9c2-worker-0:10334:11050 [3] NCCL INFO Channel 06/0 : 3[3] -> 4[4] via P2P/CUMEM
|
| 379 |
+
t-20260517083536-fd9c2-worker-0:10333:11051 [2] NCCL INFO Channel 10/0 : 2[2] -> 3[3] via P2P/CUMEM
|
| 380 |
+
t-20260517083536-fd9c2-worker-0:10335:11055 [4] NCCL INFO Channel 08/0 : 4[4] -> 5[5] via P2P/CUMEM
|
| 381 |
+
t-20260517083536-fd9c2-worker-0:10338:11054 [7] NCCL INFO Channel 15/0 : 7[7] -> 0[0] via P2P/CUMEM
|
| 382 |
+
t-20260517083536-fd9c2-worker-0:10332:11049 [1] NCCL INFO Channel 14/0 : 1[1] -> 2[2] via P2P/CUMEM
|
| 383 |
+
t-20260517083536-fd9c2-worker-0:10331:11056 [0] NCCL INFO Channel 10/0 : 0[0] -> 1[1] via P2P/CUMEM
|
| 384 |
+
t-20260517083536-fd9c2-worker-0:10337:11052 [6] NCCL INFO Channel 12/0 : 6[6] -> 7[7] via P2P/CUMEM
|
| 385 |
+
t-20260517083536-fd9c2-worker-0:10336:11053 [5] NCCL INFO Channel 12/0 : 5[5] -> 6[6] via P2P/CUMEM
|
| 386 |
+
t-20260517083536-fd9c2-worker-0:10334:11050 [3] NCCL INFO Channel 07/0 : 3[3] -> 4[4] via P2P/CUMEM
|
| 387 |
+
t-20260517083536-fd9c2-worker-0:10333:11051 [2] NCCL INFO Channel 11/0 : 2[2] -> 3[3] via P2P/CUMEM
|
| 388 |
+
t-20260517083536-fd9c2-worker-0:10335:11055 [4] NCCL INFO Channel 09/0 : 4[4] -> 5[5] via P2P/CUMEM
|
| 389 |
+
t-20260517083536-fd9c2-worker-0:10338:11054 [7] NCCL INFO Channel 16/0 : 7[7] -> 0[0] via P2P/CUMEM
|
| 390 |
+
t-20260517083536-fd9c2-worker-0:10332:11049 [1] NCCL INFO Channel 15/0 : 1[1] -> 2[2] via P2P/CUMEM
|
| 391 |
+
t-20260517083536-fd9c2-worker-0:10331:11056 [0] NCCL INFO Channel 11/0 : 0[0] -> 1[1] via P2P/CUMEM
|
| 392 |
+
t-20260517083536-fd9c2-worker-0:10337:11052 [6] NCCL INFO Channel 13/0 : 6[6] -> 7[7] via P2P/CUMEM
|
| 393 |
+
t-20260517083536-fd9c2-worker-0:10336:11053 [5] NCCL INFO Channel 13/0 : 5[5] -> 6[6] via P2P/CUMEM
|
| 394 |
+
t-20260517083536-fd9c2-worker-0:10334:11050 [3] NCCL INFO Channel 08/0 : 3[3] -> 4[4] via P2P/CUMEM
|
| 395 |
+
t-20260517083536-fd9c2-worker-0:10333:11051 [2] NCCL INFO Channel 12/0 : 2[2] -> 3[3] via P2P/CUMEM
|
| 396 |
+
t-20260517083536-fd9c2-worker-0:10335:11055 [4] NCCL INFO Channel 10/0 : 4[4] -> 5[5] via P2P/CUMEM
|
| 397 |
+
t-20260517083536-fd9c2-worker-0:10338:11054 [7] NCCL INFO Channel 17/0 : 7[7] -> 0[0] via P2P/CUMEM
|
| 398 |
+
t-20260517083536-fd9c2-worker-0:10332:11049 [1] NCCL INFO Channel 16/0 : 1[1] -> 2[2] via P2P/CUMEM
|
| 399 |
+
t-20260517083536-fd9c2-worker-0:10331:11056 [0] NCCL INFO Channel 12/0 : 0[0] -> 1[1] via P2P/CUMEM
|
| 400 |
+
t-20260517083536-fd9c2-worker-0:10337:11052 [6] NCCL INFO Channel 14/0 : 6[6] -> 7[7] via P2P/CUMEM
|
| 401 |
+
t-20260517083536-fd9c2-worker-0:10336:11053 [5] NCCL INFO Channel 14/0 : 5[5] -> 6[6] via P2P/CUMEM
|
| 402 |
+
t-20260517083536-fd9c2-worker-0:10334:11050 [3] NCCL INFO Channel 09/0 : 3[3] -> 4[4] via P2P/CUMEM
|
| 403 |
+
t-20260517083536-fd9c2-worker-0:10333:11051 [2] NCCL INFO Channel 13/0 : 2[2] -> 3[3] via P2P/CUMEM
|
| 404 |
+
t-20260517083536-fd9c2-worker-0:10335:11055 [4] NCCL INFO Channel 11/0 : 4[4] -> 5[5] via P2P/CUMEM
|
| 405 |
+
t-20260517083536-fd9c2-worker-0:10338:11054 [7] NCCL INFO Channel 18/0 : 7[7] -> 0[0] via P2P/CUMEM
|
| 406 |
+
t-20260517083536-fd9c2-worker-0:10332:11049 [1] NCCL INFO Channel 17/0 : 1[1] -> 2[2] via P2P/CUMEM
|
| 407 |
+
t-20260517083536-fd9c2-worker-0:10331:11056 [0] NCCL INFO Channel 13/0 : 0[0] -> 1[1] via P2P/CUMEM
|
| 408 |
+
t-20260517083536-fd9c2-worker-0:10337:11052 [6] NCCL INFO Channel 15/0 : 6[6] -> 7[7] via P2P/CUMEM
|
| 409 |
+
t-20260517083536-fd9c2-worker-0:10336:11053 [5] NCCL INFO Channel 15/0 : 5[5] -> 6[6] via P2P/CUMEM
|
| 410 |
+
t-20260517083536-fd9c2-worker-0:10334:11050 [3] NCCL INFO Channel 10/0 : 3[3] -> 4[4] via P2P/CUMEM
|
| 411 |
+
t-20260517083536-fd9c2-worker-0:10333:11051 [2] NCCL INFO Channel 14/0 : 2[2] -> 3[3] via P2P/CUMEM
|
| 412 |
+
t-20260517083536-fd9c2-worker-0:10335:11055 [4] NCCL INFO Channel 12/0 : 4[4] -> 5[5] via P2P/CUMEM
|
| 413 |
+
t-20260517083536-fd9c2-worker-0:10338:11054 [7] NCCL INFO Channel 19/0 : 7[7] -> 0[0] via P2P/CUMEM
|
| 414 |
+
t-20260517083536-fd9c2-worker-0:10332:11049 [1] NCCL INFO Channel 18/0 : 1[1] -> 2[2] via P2P/CUMEM
|
| 415 |
+
t-20260517083536-fd9c2-worker-0:10331:11056 [0] NCCL INFO Channel 14/0 : 0[0] -> 1[1] via P2P/CUMEM
|
| 416 |
+
t-20260517083536-fd9c2-worker-0:10337:11052 [6] NCCL INFO Channel 16/0 : 6[6] -> 7[7] via P2P/CUMEM
|
| 417 |
+
t-20260517083536-fd9c2-worker-0:10336:11053 [5] NCCL INFO Channel 16/0 : 5[5] -> 6[6] via P2P/CUMEM
|
| 418 |
+
t-20260517083536-fd9c2-worker-0:10334:11050 [3] NCCL INFO Channel 11/0 : 3[3] -> 4[4] via P2P/CUMEM
|
| 419 |
+
t-20260517083536-fd9c2-worker-0:10333:11051 [2] NCCL INFO Channel 15/0 : 2[2] -> 3[3] via P2P/CUMEM
|
| 420 |
+
t-20260517083536-fd9c2-worker-0:10335:11055 [4] NCCL INFO Channel 13/0 : 4[4] -> 5[5] via P2P/CUMEM
|
| 421 |
+
t-20260517083536-fd9c2-worker-0:10338:11054 [7] NCCL INFO Channel 20/0 : 7[7] -> 0[0] via P2P/CUMEM
|
| 422 |
+
t-20260517083536-fd9c2-worker-0:10332:11049 [1] NCCL INFO Channel 19/0 : 1[1] -> 2[2] via P2P/CUMEM
|
| 423 |
+
t-20260517083536-fd9c2-worker-0:10331:11056 [0] NCCL INFO Channel 15/0 : 0[0] -> 1[1] via P2P/CUMEM
|
| 424 |
+
t-20260517083536-fd9c2-worker-0:10337:11052 [6] NCCL INFO Channel 17/0 : 6[6] -> 7[7] via P2P/CUMEM
|
| 425 |
+
t-20260517083536-fd9c2-worker-0:10336:11053 [5] NCCL INFO Channel 17/0 : 5[5] -> 6[6] via P2P/CUMEM
|
| 426 |
+
t-20260517083536-fd9c2-worker-0:10334:11050 [3] NCCL INFO Channel 12/0 : 3[3] -> 4[4] via P2P/CUMEM
|
| 427 |
+
t-20260517083536-fd9c2-worker-0:10333:11051 [2] NCCL INFO Channel 16/0 : 2[2] -> 3[3] via P2P/CUMEM
|
| 428 |
+
t-20260517083536-fd9c2-worker-0:10335:11055 [4] NCCL INFO Channel 14/0 : 4[4] -> 5[5] via P2P/CUMEM
|
| 429 |
+
t-20260517083536-fd9c2-worker-0:10332:11049 [1] NCCL INFO Channel 20/0 : 1[1] -> 2[2] via P2P/CUMEM
|
| 430 |
+
t-20260517083536-fd9c2-worker-0:10338:11054 [7] NCCL INFO Channel 21/0 : 7[7] -> 0[0] via P2P/CUMEM
|
| 431 |
+
t-20260517083536-fd9c2-worker-0:10331:11056 [0] NCCL INFO Channel 16/0 : 0[0] -> 1[1] via P2P/CUMEM
|
| 432 |
+
t-20260517083536-fd9c2-worker-0:10337:11052 [6] NCCL INFO Channel 18/0 : 6[6] -> 7[7] via P2P/CUMEM
|
| 433 |
+
t-20260517083536-fd9c2-worker-0:10336:11053 [5] NCCL INFO Channel 18/0 : 5[5] -> 6[6] via P2P/CUMEM
|
| 434 |
+
t-20260517083536-fd9c2-worker-0:10334:11050 [3] NCCL INFO Channel 13/0 : 3[3] -> 4[4] via P2P/CUMEM
|
| 435 |
+
t-20260517083536-fd9c2-worker-0:10333:11051 [2] NCCL INFO Channel 17/0 : 2[2] -> 3[3] via P2P/CUMEM
|
| 436 |
+
t-20260517083536-fd9c2-worker-0:10335:11055 [4] NCCL INFO Channel 15/0 : 4[4] -> 5[5] via P2P/CUMEM
|
| 437 |
+
t-20260517083536-fd9c2-worker-0:10332:11049 [1] NCCL INFO Channel 21/0 : 1[1] -> 2[2] via P2P/CUMEM
|
| 438 |
+
t-20260517083536-fd9c2-worker-0:10338:11054 [7] NCCL INFO Channel 22/0 : 7[7] -> 0[0] via P2P/CUMEM
|
| 439 |
+
t-20260517083536-fd9c2-worker-0:10331:11056 [0] NCCL INFO Channel 17/0 : 0[0] -> 1[1] via P2P/CUMEM
|
| 440 |
+
t-20260517083536-fd9c2-worker-0:10337:11052 [6] NCCL INFO Channel 19/0 : 6[6] -> 7[7] via P2P/CUMEM
|
| 441 |
+
t-20260517083536-fd9c2-worker-0:10336:11053 [5] NCCL INFO Channel 19/0 : 5[5] -> 6[6] via P2P/CUMEM
|
| 442 |
+
t-20260517083536-fd9c2-worker-0:10334:11050 [3] NCCL INFO Channel 14/0 : 3[3] -> 4[4] via P2P/CUMEM
|
| 443 |
+
t-20260517083536-fd9c2-worker-0:10333:11051 [2] NCCL INFO Channel 18/0 : 2[2] -> 3[3] via P2P/CUMEM
|
| 444 |
+
t-20260517083536-fd9c2-worker-0:10335:11055 [4] NCCL INFO Channel 16/0 : 4[4] -> 5[5] via P2P/CUMEM
|
| 445 |
+
t-20260517083536-fd9c2-worker-0:10332:11049 [1] NCCL INFO Channel 22/0 : 1[1] -> 2[2] via P2P/CUMEM
|
| 446 |
+
t-20260517083536-fd9c2-worker-0:10338:11054 [7] NCCL INFO Channel 23/0 : 7[7] -> 0[0] via P2P/CUMEM
|
| 447 |
+
t-20260517083536-fd9c2-worker-0:10331:11056 [0] NCCL INFO Channel 18/0 : 0[0] -> 1[1] via P2P/CUMEM
|
| 448 |
+
t-20260517083536-fd9c2-worker-0:10337:11052 [6] NCCL INFO Channel 20/0 : 6[6] -> 7[7] via P2P/CUMEM
|
| 449 |
+
t-20260517083536-fd9c2-worker-0:10336:11053 [5] NCCL INFO Channel 20/0 : 5[5] -> 6[6] via P2P/CUMEM
|
| 450 |
+
t-20260517083536-fd9c2-worker-0:10334:11050 [3] NCCL INFO Channel 15/0 : 3[3] -> 4[4] via P2P/CUMEM
|
| 451 |
+
t-20260517083536-fd9c2-worker-0:10333:11051 [2] NCCL INFO Channel 19/0 : 2[2] -> 3[3] via P2P/CUMEM
|
| 452 |
+
t-20260517083536-fd9c2-worker-0:10335:11055 [4] NCCL INFO Channel 17/0 : 4[4] -> 5[5] via P2P/CUMEM
|
| 453 |
+
t-20260517083536-fd9c2-worker-0:10332:11049 [1] NCCL INFO Channel 23/0 : 1[1] -> 2[2] via P2P/CUMEM
|
| 454 |
+
t-20260517083536-fd9c2-worker-0:10331:11056 [0] NCCL INFO Channel 19/0 : 0[0] -> 1[1] via P2P/CUMEM
|
| 455 |
+
t-20260517083536-fd9c2-worker-0:10337:11052 [6] NCCL INFO Channel 21/0 : 6[6] -> 7[7] via P2P/CUMEM
|
| 456 |
+
t-20260517083536-fd9c2-worker-0:10336:11053 [5] NCCL INFO Channel 21/0 : 5[5] -> 6[6] via P2P/CUMEM
|
| 457 |
+
t-20260517083536-fd9c2-worker-0:10334:11050 [3] NCCL INFO Channel 16/0 : 3[3] -> 4[4] via P2P/CUMEM
|
| 458 |
+
t-20260517083536-fd9c2-worker-0:10333:11051 [2] NCCL INFO Channel 20/0 : 2[2] -> 3[3] via P2P/CUMEM
|
| 459 |
+
t-20260517083536-fd9c2-worker-0:10335:11055 [4] NCCL INFO Channel 18/0 : 4[4] -> 5[5] via P2P/CUMEM
|
| 460 |
+
t-20260517083536-fd9c2-worker-0:10331:11056 [0] NCCL INFO Channel 20/0 : 0[0] -> 1[1] via P2P/CUMEM
|
| 461 |
+
t-20260517083536-fd9c2-worker-0:10337:11052 [6] NCCL INFO Channel 22/0 : 6[6] -> 7[7] via P2P/CUMEM
|
| 462 |
+
t-20260517083536-fd9c2-worker-0:10336:11053 [5] NCCL INFO Channel 22/0 : 5[5] -> 6[6] via P2P/CUMEM
|
| 463 |
+
t-20260517083536-fd9c2-worker-0:10334:11050 [3] NCCL INFO Channel 17/0 : 3[3] -> 4[4] via P2P/CUMEM
|
| 464 |
+
t-20260517083536-fd9c2-worker-0:10333:11051 [2] NCCL INFO Channel 21/0 : 2[2] -> 3[3] via P2P/CUMEM
|
| 465 |
+
t-20260517083536-fd9c2-worker-0:10335:11055 [4] NCCL INFO Channel 19/0 : 4[4] -> 5[5] via P2P/CUMEM
|
| 466 |
+
t-20260517083536-fd9c2-worker-0:10331:11056 [0] NCCL INFO Channel 21/0 : 0[0] -> 1[1] via P2P/CUMEM
|
| 467 |
+
t-20260517083536-fd9c2-worker-0:10337:11052 [6] NCCL INFO Channel 23/0 : 6[6] -> 7[7] via P2P/CUMEM
|
| 468 |
+
t-20260517083536-fd9c2-worker-0:10336:11053 [5] NCCL INFO Channel 23/0 : 5[5] -> 6[6] via P2P/CUMEM
|
| 469 |
+
t-20260517083536-fd9c2-worker-0:10334:11050 [3] NCCL INFO Channel 18/0 : 3[3] -> 4[4] via P2P/CUMEM
|
| 470 |
+
t-20260517083536-fd9c2-worker-0:10333:11051 [2] NCCL INFO Channel 22/0 : 2[2] -> 3[3] via P2P/CUMEM
|
| 471 |
+
t-20260517083536-fd9c2-worker-0:10335:11055 [4] NCCL INFO Channel 20/0 : 4[4] -> 5[5] via P2P/CUMEM
|
| 472 |
+
t-20260517083536-fd9c2-worker-0:10331:11056 [0] NCCL INFO Channel 22/0 : 0[0] -> 1[1] via P2P/CUMEM
|
| 473 |
+
t-20260517083536-fd9c2-worker-0:10334:11050 [3] NCCL INFO Channel 19/0 : 3[3] -> 4[4] via P2P/CUMEM
|
| 474 |
+
t-20260517083536-fd9c2-worker-0:10333:11051 [2] NCCL INFO Channel 23/0 : 2[2] -> 3[3] via P2P/CUMEM
|
| 475 |
+
t-20260517083536-fd9c2-worker-0:10335:11055 [4] NCCL INFO Channel 21/0 : 4[4] -> 5[5] via P2P/CUMEM
|
| 476 |
+
t-20260517083536-fd9c2-worker-0:10331:11056 [0] NCCL INFO Channel 23/0 : 0[0] -> 1[1] via P2P/CUMEM
|
| 477 |
+
t-20260517083536-fd9c2-worker-0:10335:11055 [4] NCCL INFO Channel 22/0 : 4[4] -> 5[5] via P2P/CUMEM
|
| 478 |
+
t-20260517083536-fd9c2-worker-0:10334:11050 [3] NCCL INFO Channel 20/0 : 3[3] -> 4[4] via P2P/CUMEM
|
| 479 |
+
t-20260517083536-fd9c2-worker-0:10335:11055 [4] NCCL INFO Channel 23/0 : 4[4] -> 5[5] via P2P/CUMEM
|
| 480 |
+
t-20260517083536-fd9c2-worker-0:10334:11050 [3] NCCL INFO Channel 21/0 : 3[3] -> 4[4] via P2P/CUMEM
|
| 481 |
+
t-20260517083536-fd9c2-worker-0:10334:11050 [3] NCCL INFO Channel 22/0 : 3[3] -> 4[4] via P2P/CUMEM
|
| 482 |
+
t-20260517083536-fd9c2-worker-0:10334:11050 [3] NCCL INFO Channel 23/0 : 3[3] -> 4[4] via P2P/CUMEM
|
| 483 |
+
t-20260517083536-fd9c2-worker-0:10332:11049 [1] NCCL INFO Connected all rings, use ring PXN 0 GDR 1
|
| 484 |
+
t-20260517083536-fd9c2-worker-0:10331:11056 [0] NCCL INFO Connected all rings, use ring PXN 0 GDR 1
|
| 485 |
+
t-20260517083536-fd9c2-worker-0:10333:11051 [2] NCCL INFO Connected all rings, use ring PXN 0 GDR 1
|
| 486 |
+
t-20260517083536-fd9c2-worker-0:10338:11054 [7] NCCL INFO Connected all rings, use ring PXN 0 GDR 1
|
| 487 |
+
t-20260517083536-fd9c2-worker-0:10337:11052 [6] NCCL INFO Connected all rings, use ring PXN 0 GDR 1
|
| 488 |
+
t-20260517083536-fd9c2-worker-0:10336:11053 [5] NCCL INFO Connected all rings, use ring PXN 0 GDR 1
|
| 489 |
+
t-20260517083536-fd9c2-worker-0:10334:11050 [3] NCCL INFO Connected all rings, use ring PXN 0 GDR 1
|
| 490 |
+
t-20260517083536-fd9c2-worker-0:10335:11055 [4] NCCL INFO Connected all rings, use ring PXN 0 GDR 1
|
| 491 |
+
{
|
| 492 |
+
"device": "cuda:0",
|
| 493 |
+
"rank": 0,
|
| 494 |
+
"world_size": 8,
|
| 495 |
+
"samples": "tokenized_hf:9737184:pad=0",
|
| 496 |
+
"vocab_size": 32100,
|
| 497 |
+
"tokenizer_vocab_size": 32100,
|
| 498 |
+
"save_dir": "runs/lta_owt_elfofficial_t5_len1024_elfaligned_dditelf_muon_logitnormal_m1p5_s0p8_none_floor0p0_gbs512_8gpu_5epoch_20260517_003703",
|
| 499 |
+
"batch_size": 32,
|
| 500 |
+
"grad_accum": 2,
|
| 501 |
+
"effective_batch_size": 512,
|
| 502 |
+
"global_batch_size": 512,
|
| 503 |
+
"lr_schedule": "constant_warmup",
|
| 504 |
+
"optimizer": "muon",
|
| 505 |
+
"epochs": 5.0,
|
| 506 |
+
"steps_per_epoch": 19018,
|
| 507 |
+
"total_steps": 95090,
|
| 508 |
+
"warmup_steps": 9509,
|
| 509 |
+
"warmup_epochs": 0.5,
|
| 510 |
+
"min_lr": 0.0,
|
| 511 |
+
"weight_decay": 0.1,
|
| 512 |
+
"output_weight_decay": -1.0,
|
| 513 |
+
"adamw_param_groups": "nanogpt",
|
| 514 |
+
"adam_beta1": 0.9,
|
| 515 |
+
"adam_beta2": 0.999,
|
| 516 |
+
"adam_eps": 1e-08,
|
| 517 |
+
"muon_impl": "optax",
|
| 518 |
+
"muon_momentum": 0.95,
|
| 519 |
+
"muon_ns_steps": 5,
|
| 520 |
+
"muon_update_scale": 1.0,
|
| 521 |
+
"muon_nesterov": true,
|
| 522 |
+
"muon_width_scale": true,
|
| 523 |
+
"muon_grouping": "hidden_2d",
|
| 524 |
+
"muon_param_count": 84934656,
|
| 525 |
+
"muon_adam_param_count": 50212608,
|
| 526 |
+
"muon_param_names": [
|
| 527 |
+
"blocks.0.attn_qkv.weight",
|
| 528 |
+
"blocks.0.attn_out.weight",
|
| 529 |
+
"blocks.0.mlp.w12.weight",
|
| 530 |
+
"blocks.0.mlp.w3.weight",
|
| 531 |
+
"blocks.1.attn_qkv.weight",
|
| 532 |
+
"blocks.1.attn_out.weight",
|
| 533 |
+
"blocks.1.mlp.w12.weight",
|
| 534 |
+
"blocks.1.mlp.w3.weight",
|
| 535 |
+
"blocks.2.attn_qkv.weight",
|
| 536 |
+
"blocks.2.attn_out.weight",
|
| 537 |
+
"blocks.2.mlp.w12.weight",
|
| 538 |
+
"blocks.2.mlp.w3.weight",
|
| 539 |
+
"blocks.3.attn_qkv.weight",
|
| 540 |
+
"blocks.3.attn_out.weight",
|
| 541 |
+
"blocks.3.mlp.w12.weight",
|
| 542 |
+
"blocks.3.mlp.w3.weight",
|
| 543 |
+
"blocks.4.attn_qkv.weight",
|
| 544 |
+
"blocks.4.attn_out.weight",
|
| 545 |
+
"blocks.4.mlp.w12.weight",
|
| 546 |
+
"blocks.4.mlp.w3.weight",
|
| 547 |
+
"blocks.5.attn_qkv.weight",
|
| 548 |
+
"blocks.5.attn_out.weight",
|
| 549 |
+
"blocks.5.mlp.w12.weight",
|
| 550 |
+
"blocks.5.mlp.w3.weight",
|
| 551 |
+
"blocks.6.attn_qkv.weight",
|
| 552 |
+
"blocks.6.attn_out.weight",
|
| 553 |
+
"blocks.6.mlp.w12.weight",
|
| 554 |
+
"blocks.6.mlp.w3.weight",
|
| 555 |
+
"blocks.7.attn_qkv.weight",
|
| 556 |
+
"blocks.7.attn_out.weight",
|
| 557 |
+
"blocks.7.mlp.w12.weight",
|
| 558 |
+
"blocks.7.mlp.w3.weight",
|
| 559 |
+
"blocks.8.attn_qkv.weight",
|
| 560 |
+
"blocks.8.attn_out.weight",
|
| 561 |
+
"blocks.8.mlp.w12.weight",
|
| 562 |
+
"blocks.8.mlp.w3.weight",
|
| 563 |
+
"blocks.9.attn_qkv.weight",
|
| 564 |
+
"blocks.9.attn_out.weight",
|
| 565 |
+
"blocks.9.mlp.w12.weight",
|
| 566 |
+
"blocks.9.mlp.w3.weight",
|
| 567 |
+
"blocks.10.attn_qkv.weight",
|
| 568 |
+
"blocks.10.attn_out.weight",
|
| 569 |
+
"blocks.10.mlp.w12.weight",
|
| 570 |
+
"blocks.10.mlp.w3.weight",
|
| 571 |
+
"blocks.11.attn_qkv.weight",
|
| 572 |
+
"blocks.11.attn_out.weight",
|
| 573 |
+
"blocks.11.mlp.w12.weight",
|
| 574 |
+
"blocks.11.mlp.w3.weight"
|
| 575 |
+
],
|
| 576 |
+
"muon_adam_param_names": [
|
| 577 |
+
"time_tokens",
|
| 578 |
+
"vocab_embed.embedding",
|
| 579 |
+
"sigma_map.net.0.weight",
|
| 580 |
+
"sigma_map.net.0.bias",
|
| 581 |
+
"sigma_map.net.2.weight",
|
| 582 |
+
"sigma_map.net.2.bias",
|
| 583 |
+
"blocks.0.norm1.weight",
|
| 584 |
+
"blocks.0.attn_qkv.bias",
|
| 585 |
+
"blocks.0.attn_out.bias",
|
| 586 |
+
"blocks.0.q_norm.weight",
|
| 587 |
+
"blocks.0.k_norm.weight",
|
| 588 |
+
"blocks.0.norm2.weight",
|
| 589 |
+
"blocks.0.mlp.w12.bias",
|
| 590 |
+
"blocks.0.mlp.w3.bias",
|
| 591 |
+
"blocks.1.norm1.weight",
|
| 592 |
+
"blocks.1.attn_qkv.bias",
|
| 593 |
+
"blocks.1.attn_out.bias",
|
| 594 |
+
"blocks.1.q_norm.weight",
|
| 595 |
+
"blocks.1.k_norm.weight",
|
| 596 |
+
"blocks.1.norm2.weight",
|
| 597 |
+
"blocks.1.mlp.w12.bias",
|
| 598 |
+
"blocks.1.mlp.w3.bias",
|
| 599 |
+
"blocks.2.norm1.weight",
|
| 600 |
+
"blocks.2.attn_qkv.bias",
|
| 601 |
+
"blocks.2.attn_out.bias",
|
| 602 |
+
"blocks.2.q_norm.weight",
|
| 603 |
+
"blocks.2.k_norm.weight",
|
| 604 |
+
"blocks.2.norm2.weight",
|
| 605 |
+
"blocks.2.mlp.w12.bias",
|
| 606 |
+
"blocks.2.mlp.w3.bias",
|
| 607 |
+
"blocks.3.norm1.weight",
|
| 608 |
+
"blocks.3.attn_qkv.bias",
|
| 609 |
+
"blocks.3.attn_out.bias",
|
| 610 |
+
"blocks.3.q_norm.weight",
|
| 611 |
+
"blocks.3.k_norm.weight",
|
| 612 |
+
"blocks.3.norm2.weight",
|
| 613 |
+
"blocks.3.mlp.w12.bias",
|
| 614 |
+
"blocks.3.mlp.w3.bias",
|
| 615 |
+
"blocks.4.norm1.weight",
|
| 616 |
+
"blocks.4.attn_qkv.bias",
|
| 617 |
+
"blocks.4.attn_out.bias",
|
| 618 |
+
"blocks.4.q_norm.weight",
|
| 619 |
+
"blocks.4.k_norm.weight",
|
| 620 |
+
"blocks.4.norm2.weight",
|
| 621 |
+
"blocks.4.mlp.w12.bias",
|
| 622 |
+
"blocks.4.mlp.w3.bias",
|
| 623 |
+
"blocks.5.norm1.weight",
|
| 624 |
+
"blocks.5.attn_qkv.bias",
|
| 625 |
+
"blocks.5.attn_out.bias",
|
| 626 |
+
"blocks.5.q_norm.weight",
|
| 627 |
+
"blocks.5.k_norm.weight",
|
| 628 |
+
"blocks.5.norm2.weight",
|
| 629 |
+
"blocks.5.mlp.w12.bias",
|
| 630 |
+
"blocks.5.mlp.w3.bias",
|
| 631 |
+
"blocks.6.norm1.weight",
|
| 632 |
+
"blocks.6.attn_qkv.bias",
|
| 633 |
+
"blocks.6.attn_out.bias",
|
| 634 |
+
"blocks.6.q_norm.weight",
|
| 635 |
+
"blocks.6.k_norm.weight",
|
| 636 |
+
"blocks.6.norm2.weight",
|
| 637 |
+
"blocks.6.mlp.w12.bias",
|
| 638 |
+
"blocks.6.mlp.w3.bias",
|
| 639 |
+
"blocks.7.norm1.weight",
|
| 640 |
+
"blocks.7.attn_qkv.bias",
|
| 641 |
+
"blocks.7.attn_out.bias",
|
| 642 |
+
"blocks.7.q_norm.weight",
|
| 643 |
+
"blocks.7.k_norm.weight",
|
| 644 |
+
"blocks.7.norm2.weight",
|
| 645 |
+
"blocks.7.mlp.w12.bias",
|
| 646 |
+
"blocks.7.mlp.w3.bias",
|
| 647 |
+
"blocks.8.norm1.weight",
|
| 648 |
+
"blocks.8.attn_qkv.bias",
|
| 649 |
+
"blocks.8.attn_out.bias",
|
| 650 |
+
"blocks.8.q_norm.weight",
|
| 651 |
+
"blocks.8.k_norm.weight",
|
| 652 |
+
"blocks.8.norm2.weight",
|
| 653 |
+
"blocks.8.mlp.w12.bias",
|
| 654 |
+
"blocks.8.mlp.w3.bias",
|
| 655 |
+
"blocks.9.norm1.weight",
|
| 656 |
+
"blocks.9.attn_qkv.bias",
|
| 657 |
+
"blocks.9.attn_out.bias",
|
| 658 |
+
"blocks.9.q_norm.weight",
|
| 659 |
+
"blocks.9.k_norm.weight",
|
| 660 |
+
"blocks.9.norm2.weight",
|
| 661 |
+
"blocks.9.mlp.w12.bias",
|
| 662 |
+
"blocks.9.mlp.w3.bias",
|
| 663 |
+
"blocks.10.norm1.weight",
|
| 664 |
+
"blocks.10.attn_qkv.bias",
|
| 665 |
+
"blocks.10.attn_out.bias",
|
| 666 |
+
"blocks.10.q_norm.weight",
|
| 667 |
+
"blocks.10.k_norm.weight",
|
| 668 |
+
"blocks.10.norm2.weight",
|
| 669 |
+
"blocks.10.mlp.w12.bias",
|
| 670 |
+
"blocks.10.mlp.w3.bias",
|
| 671 |
+
"blocks.11.norm1.weight",
|
| 672 |
+
"blocks.11.attn_qkv.bias",
|
| 673 |
+
"blocks.11.attn_out.bias",
|
| 674 |
+
"blocks.11.q_norm.weight",
|
| 675 |
+
"blocks.11.k_norm.weight",
|
| 676 |
+
"blocks.11.norm2.weight",
|
| 677 |
+
"blocks.11.mlp.w12.bias",
|
| 678 |
+
"blocks.11.mlp.w3.bias",
|
| 679 |
+
"output_layer.norm_final.weight",
|
| 680 |
+
"output_layer.linear.weight"
|
| 681 |
+
],
|
| 682 |
+
"muon_effective_nesterov": true,
|
| 683 |
+
"muon_effective_width_scale": true,
|
| 684 |
+
"muon_effective_weight_decay": 0.0,
|
| 685 |
+
"muon_adam_fallback_nesterov": true,
|
| 686 |
+
"muon_adam_fallback_weight_decay": 0.1,
|
| 687 |
+
"ema_decay": 0.9999,
|
| 688 |
+
"ema_start_step": 0,
|
| 689 |
+
"model_type": "ddit_elf",
|
| 690 |
+
"elf_num_time_tokens": 4,
|
| 691 |
+
"elf_num_model_mode_tokens": 0,
|
| 692 |
+
"qk_norm": true,
|
| 693 |
+
"output_bias": false,
|
| 694 |
+
"output_init_std": 0.0,
|
| 695 |
+
"norm_type": "rmsnorm",
|
| 696 |
+
"target_loss": "hard_ce",
|
| 697 |
+
"linear_soft_target_power": 1.0,
|
| 698 |
+
"linear_soft_target_min_conf": 0.0,
|
| 699 |
+
"linear_soft_target_max_conf": 1.0,
|
| 700 |
+
"t_sampling_mode": "logit_normal",
|
| 701 |
+
"t_sampling_power": 1.0,
|
| 702 |
+
"t_sampling_eps": 0.0001,
|
| 703 |
+
"t_sampling_logit_mean": -1.5,
|
| 704 |
+
"t_sampling_logit_std": 0.8,
|
| 705 |
+
"dual_t": true,
|
| 706 |
+
"corrupt_t_mode": "same",
|
| 707 |
+
"corrupt_min_t": 0.0,
|
| 708 |
+
"corrupt_max_t": 1.0,
|
| 709 |
+
"prefix_block_prob": 0.0,
|
| 710 |
+
"prefix_block_len": 128,
|
| 711 |
+
"mask_ratio_floor_schedule": "none",
|
| 712 |
+
"dirichlet_endpoint_mode": "categorical_dual_t",
|
| 713 |
+
"dirichlet_semantic_t_mode": "same",
|
| 714 |
+
"dirichlet_semantic_t_value": 0.0,
|
| 715 |
+
"dirichlet_semantic_t_curve": "linear",
|
| 716 |
+
"dirichlet_semantic_t_power": 1.0,
|
| 717 |
+
"endpoint_sequence_random_prob_alpha": 0.0,
|
| 718 |
+
"categorical_wrong_from_full_vocab": true,
|
| 719 |
+
"categorical_wrong_from_batch_valid_tokens": false,
|
| 720 |
+
"categorical_wrong_basin_token_ids": "",
|
| 721 |
+
"categorical_wrong_basin_prob": 0.0,
|
| 722 |
+
"categorical_wrong_unigram_prob": 0.0,
|
| 723 |
+
"categorical_wrong_uniform_prob": 0.0,
|
| 724 |
+
"categorical_wrong_corpus_unigram_path": "",
|
| 725 |
+
"categorical_wrong_corpus_unigram_alpha": 1.0,
|
| 726 |
+
"categorical_wrong_basin_shared_prob": 0.0,
|
| 727 |
+
"categorical_wrong_unigram_shared_prob": 0.0,
|
| 728 |
+
"mask_mixture_original_prob": 0.0,
|
| 729 |
+
"mask_mixture_lowk_prob": 0.0,
|
| 730 |
+
"mask_mixture_lowcorrupt_prob": 0.0,
|
| 731 |
+
"mask_mixture_block_prob": 0.0,
|
| 732 |
+
"mask_mixture_all_prob": 0.0,
|
| 733 |
+
"mask_mixture_lowk_clean_tokens": "1,2,4,8,16,32,64",
|
| 734 |
+
"mask_mixture_lowcorrupt_tokens": "1,2,4,8,16,32,64",
|
| 735 |
+
"mask_mixture_block_tokens": "64,128",
|
| 736 |
+
"simplex_bridge_sampler": "dirichlet",
|
| 737 |
+
"logistic_normal_sigma_min": 0.18,
|
| 738 |
+
"logistic_normal_sigma_max": 2.2,
|
| 739 |
+
"logistic_normal_tau_min": 0.65,
|
| 740 |
+
"logistic_normal_tau_max": 1.15,
|
| 741 |
+
"torch_compile": false,
|
| 742 |
+
"compile_mode": "max-autotune",
|
| 743 |
+
"state_format": "prob",
|
| 744 |
+
"meanflow_weight": 0.0,
|
| 745 |
+
"rollout_train_prob": 0.0,
|
| 746 |
+
"rollout_train_steps": 1,
|
| 747 |
+
"rollout_train_infer_steps": 64,
|
| 748 |
+
"rollout_train_temp": 1.45,
|
| 749 |
+
"rollout_train_max_gamma": 1.0,
|
| 750 |
+
"rollout_train_corrupt_only": true,
|
| 751 |
+
"rollout_train_samplewise": false,
|
| 752 |
+
"rollout_train_compute_always": false,
|
| 753 |
+
"bridge_noise_init": "logistic_normal",
|
| 754 |
+
"noise_sigma": -1.0,
|
| 755 |
+
"allow_tf32": true,
|
| 756 |
+
"activation_checkpointing": true,
|
| 757 |
+
"activation_checkpoint_interval": 1,
|
| 758 |
+
"activation_checkpoint_scope": "mlp",
|
| 759 |
+
"ddp_static_graph": false,
|
| 760 |
+
"ddp_gradient_as_bucket_view": true,
|
| 761 |
+
"blocking_data_transfer": false,
|
| 762 |
+
"dataloader_prefetch_factor": 4,
|
| 763 |
+
"full_train_stats": false,
|
| 764 |
+
"tokenized_hf": true,
|
| 765 |
+
"tokenized_pad_token": "pad",
|
| 766 |
+
"elf_conditional_hf": false,
|
| 767 |
+
"record_pad_truncate": false,
|
| 768 |
+
"record_add_eos": false,
|
| 769 |
+
"record_add_special_tokens": false,
|
| 770 |
+
"record_pad_token": "pad",
|
| 771 |
+
"record_shuffle_buffer": 10000,
|
| 772 |
+
"wrap": false,
|
| 773 |
+
"wrap_mode": "stream",
|
| 774 |
+
"wrap_record_buffer_size": 200,
|
| 775 |
+
"owt_cached_chunks": false,
|
| 776 |
+
"owt_chunk_cache_dir": "",
|
| 777 |
+
"owt_chunk_cache_rebuild": false,
|
| 778 |
+
"owt_chunk_cache_write_batch": 4096,
|
| 779 |
+
"owt_exact_repeat_per_chunk": 0,
|
| 780 |
+
"online_chunk_shuffle": false,
|
| 781 |
+
"online_chunk_shuffle_buffer": 10000,
|
| 782 |
+
"openwebtext_split": "all",
|
| 783 |
+
"detokenizer": "auto",
|
| 784 |
+
"resolved_detokenizer": null,
|
| 785 |
+
"num_workers": 8,
|
| 786 |
+
"latest_every": 1000,
|
| 787 |
+
"resume_path": ""
|
| 788 |
+
}
|
| 789 |
+
t-20260517083536-fd9c2-worker-0:10334:11432 [3] NCCL INFO NVLS comm 0xdf2c660 headRank 3 nHeads 8 buffSize 1048576 nvlsPerRankSize 33554432 nvlsTotalSize 268435456
|
| 790 |
+
t-20260517083536-fd9c2-worker-0:10337:11433 [6] NCCL INFO NVLS comm 0xe813ad0 headRank 6 nHeads 8 buffSize 1048576 nvlsPerRankSize 33554432 nvlsTotalSize 268435456
|
| 791 |
+
t-20260517083536-fd9c2-worker-0:10333:11434 [2] NCCL INFO NVLS comm 0xdbf3fd0 headRank 2 nHeads 8 buffSize 1048576 nvlsPerRankSize 33554432 nvlsTotalSize 268435456
|
| 792 |
+
t-20260517083536-fd9c2-worker-0:10338:11435 [7] NCCL INFO NVLS comm 0xf0141a0 headRank 7 nHeads 8 buffSize 1048576 nvlsPerRankSize 33554432 nvlsTotalSize 268435456
|
| 793 |
+
t-20260517083536-fd9c2-worker-0:10332:11436 [1] NCCL INFO NVLS comm 0xdfb2ce0 headRank 1 nHeads 8 buffSize 1048576 nvlsPerRankSize 33554432 nvlsTotalSize 268435456
|
| 794 |
+
t-20260517083536-fd9c2-worker-0:10335:11437 [4] NCCL INFO NVLS comm 0xe76ac40 headRank 4 nHeads 8 buffSize 1048576 nvlsPerRankSize 33554432 nvlsTotalSize 268435456
|
| 795 |
+
t-20260517083536-fd9c2-worker-0:10336:11438 [5] NCCL INFO NVLS comm 0xe38ef50 headRank 5 nHeads 8 buffSize 1048576 nvlsPerRankSize 33554432 nvlsTotalSize 268435456
|
| 796 |
+
t-20260517083536-fd9c2-worker-0:10331:11439 [0] NCCL INFO NVLS comm 0xc57c980 headRank 0 nHeads 8 buffSize 1048576 nvlsPerRankSize 33554432 nvlsTotalSize 268435456
|
| 797 |
+
step=100 epoch=1/5 epoch_step=100/19018 micro_steps=200 elapsed=86.3s lr=2.124303e-05 loss=10.1077 loss_recon=10.1077 loss_meanflow=0.0000 mean_model_t=0.2081 mean_corrupt_t=0.2081 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.0255 corrupt_frac=0.5548 acc_corrupt=0.0228 loss_corrupt=10.1077 wrong_frac=0.7923 init_acc_corrupt=0.1144 acc_corrupt_t_0p0_0p2=0.0217 corrupt_frac_t_0p0_0p2=0.5662 acc_corrupt_t_0p2_0p4=0.0237 corrupt_frac_t_0p2_0p4=0.3453 out_w_norm=1.3776 out_g_norm=1.5106 acc_corrupt_t_0p4_0p6=0.0254 corrupt_frac_t_0p4_0p6=0.0863 acc_corrupt_t_0p6_0p8=0.0310 corrupt_frac_t_0p6_0p8=0.0340 acc_corrupt_t_0p8_1p0=0.0201 corrupt_frac_t_0p8_1p0=0.0110 loss_all=9.5265 init_gold_top10=0.2126 init_gold_top100=0.4678
|
| 798 |
+
step=200 epoch=1/5 epoch_step=200/19018 micro_steps=400 elapsed=86.1s lr=4.227574e-05 loss=8.5844 loss_recon=8.5844 loss_meanflow=0.0000 mean_model_t=0.2078 mean_corrupt_t=0.2078 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.0421 corrupt_frac=0.5447 acc_corrupt=0.0405 loss_corrupt=8.5844 wrong_frac=0.7915 init_acc_corrupt=0.1157 acc_corrupt_t_0p0_0p2=0.0401 corrupt_frac_t_0p0_0p2=0.5629 acc_corrupt_t_0p2_0p4=0.0409 corrupt_frac_t_0p2_0p4=0.3516 acc_corrupt_t_0p4_0p6=0.0414 corrupt_frac_t_0p4_0p6=0.0826 out_w_norm=9.1712 out_g_norm=1.7873 acc_corrupt_t_0p6_0p8=0.0414 corrupt_frac_t_0p6_0p8=0.0381 acc_corrupt_t_0p8_1p0=0.0559 corrupt_frac_t_0p8_1p0=0.0262 loss_all=7.6974 init_gold_top10=0.2071 init_gold_top100=0.4612
|
| 799 |
+
step=300 epoch=1/5 epoch_step=300/19018 micro_steps=600 elapsed=86.3s lr=6.330844e-05 loss=7.3203 loss_recon=7.3203 loss_meanflow=0.0000 mean_model_t=0.2077 mean_corrupt_t=0.2077 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.0441 corrupt_frac=0.5468 acc_corrupt=0.0442 loss_corrupt=7.3203 wrong_frac=0.7917 init_acc_corrupt=0.1151 acc_corrupt_t_0p0_0p2=0.0446 corrupt_frac_t_0p0_0p2=0.5541 acc_corrupt_t_0p2_0p4=0.0438 corrupt_frac_t_0p2_0p4=0.3616 acc_corrupt_t_0p4_0p6=0.0428 corrupt_frac_t_0p4_0p6=0.0820 acc_corrupt_t_0p6_0p8=0.0454 corrupt_frac_t_0p6_0p8=0.0386 out_w_norm=19.5004 out_g_norm=0.6168 acc_corrupt_t_0p8_1p0=0.0462 corrupt_frac_t_0p8_1p0=0.0114 loss_all=7.1118 init_gold_top10=0.1727 init_gold_top100=0.4789
|
| 800 |
+
step=400 epoch=1/5 epoch_step=400/19018 micro_steps=800 elapsed=86.7s lr=8.434115e-05 loss=7.2051 loss_recon=7.2051 loss_meanflow=0.0000 mean_model_t=0.2110 mean_corrupt_t=0.2110 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.0436 corrupt_frac=0.5491 acc_corrupt=0.0436 loss_corrupt=7.2051 wrong_frac=0.7892 init_acc_corrupt=0.1185 acc_corrupt_t_0p0_0p2=0.0437 corrupt_frac_t_0p0_0p2=0.5498 acc_corrupt_t_0p2_0p4=0.0433 corrupt_frac_t_0p2_0p4=0.3595 acc_corrupt_t_0p4_0p6=0.0445 corrupt_frac_t_0p4_0p6=0.0896 out_w_norm=23.9056 out_g_norm=0.1402 acc_corrupt_t_0p6_0p8=0.0409 corrupt_frac_t_0p6_0p8=0.0372 loss_all=7.0927 init_gold_top10=0.2047 init_gold_top100=0.4522
|
| 801 |
+
step=500 epoch=1/5 epoch_step=500/19018 micro_steps=1000 elapsed=86.3s lr=1.053739e-04 loss=7.1409 loss_recon=7.1409 loss_meanflow=0.0000 mean_model_t=0.2087 mean_corrupt_t=0.2087 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.0460 corrupt_frac=0.5436 acc_corrupt=0.0449 loss_corrupt=7.1409 wrong_frac=0.7898 init_acc_corrupt=0.1159 acc_corrupt_t_0p0_0p2=0.0443 corrupt_frac_t_0p0_0p2=0.5427 acc_corrupt_t_0p2_0p4=0.0455 corrupt_frac_t_0p2_0p4=0.3736 acc_corrupt_t_0p4_0p6=0.0459 corrupt_frac_t_0p4_0p6=0.0827 out_w_norm=27.0306 out_g_norm=0.1499 acc_corrupt_t_0p6_0p8=0.0458 corrupt_frac_t_0p6_0p8=0.0390 loss_all=7.0390 init_gold_top10=0.1650 init_gold_top100=0.4711
|
| 802 |
+
step=600 epoch=1/5 epoch_step=600/19018 micro_steps=1200 elapsed=86.4s lr=1.264066e-04 loss=7.0747 loss_recon=7.0747 loss_meanflow=0.0000 mean_model_t=0.2077 mean_corrupt_t=0.2077 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.0459 corrupt_frac=0.5511 acc_corrupt=0.0449 loss_corrupt=7.0747 wrong_frac=0.7918 init_acc_corrupt=0.1153 acc_corrupt_t_0p0_0p2=0.0445 corrupt_frac_t_0p0_0p2=0.5495 acc_corrupt_t_0p2_0p4=0.0453 corrupt_frac_t_0p2_0p4=0.3691 acc_corrupt_t_0p4_0p6=0.0458 corrupt_frac_t_0p4_0p6=0.0786 out_w_norm=30.6778 out_g_norm=0.1513 acc_corrupt_t_0p6_0p8=0.0456 corrupt_frac_t_0p6_0p8=0.0374 loss_all=6.9273 init_gold_top10=0.1367 init_gold_top100=0.4541
|
| 803 |
+
step=700 epoch=1/5 epoch_step=700/19018 micro_steps=1400 elapsed=85.9s lr=1.474393e-04 loss=6.9957 loss_recon=6.9957 loss_meanflow=0.0000 mean_model_t=0.2085 mean_corrupt_t=0.2085 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.0463 corrupt_frac=0.5509 acc_corrupt=0.0456 loss_corrupt=6.9957 wrong_frac=0.7912 init_acc_corrupt=0.1160 acc_corrupt_t_0p0_0p2=0.0458 corrupt_frac_t_0p0_0p2=0.5576 acc_corrupt_t_0p2_0p4=0.0454 corrupt_frac_t_0p2_0p4=0.3640 acc_corrupt_t_0p4_0p6=0.0457 corrupt_frac_t_0p4_0p6=0.0787 out_w_norm=34.4484 out_g_norm=0.1570 acc_corrupt_t_0p6_0p8=0.0495 corrupt_frac_t_0p6_0p8=0.0367 acc_corrupt_t_0p8_1p0=0.0278 corrupt_frac_t_0p8_1p0=0.0398 loss_all=6.8398 init_gold_top10=0.1595 init_gold_top100=0.4627
|
| 804 |
+
step=800 epoch=1/5 epoch_step=800/19018 micro_steps=1600 elapsed=85.9s lr=1.684720e-04 loss=6.9104 loss_recon=6.9104 loss_meanflow=0.0000 mean_model_t=0.2077 mean_corrupt_t=0.2077 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.0876 corrupt_frac=0.5501 acc_corrupt=0.0561 loss_corrupt=6.9104 wrong_frac=0.7947 init_acc_corrupt=0.1120 acc_corrupt_t_0p0_0p2=0.0474 corrupt_frac_t_0p0_0p2=0.5739 acc_corrupt_t_0p2_0p4=0.0627 corrupt_frac_t_0p2_0p4=0.3437 acc_corrupt_t_0p4_0p6=0.0868 corrupt_frac_t_0p4_0p6=0.0774 out_w_norm=37.6699 out_g_norm=0.1574 acc_corrupt_t_0p6_0p8=0.1051 corrupt_frac_t_0p6_0p8=0.0360 acc_corrupt_t_0p8_1p0=0.0511 corrupt_frac_t_0p8_1p0=0.0395 loss_all=6.3633 init_gold_top10=0.2347 init_gold_top100=0.4596
|
| 805 |
+
step=900 epoch=1/5 epoch_step=900/19018 micro_steps=1800 elapsed=85.9s lr=1.895047e-04 loss=6.7716 loss_recon=6.7716 loss_meanflow=0.0000 mean_model_t=0.2096 mean_corrupt_t=0.2096 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.1951 corrupt_frac=0.5524 acc_corrupt=0.0868 loss_corrupt=6.7716 wrong_frac=0.7910 init_acc_corrupt=0.1160 acc_corrupt_t_0p0_0p2=0.0557 corrupt_frac_t_0p0_0p2=0.5488 acc_corrupt_t_0p2_0p4=0.1100 corrupt_frac_t_0p2_0p4=0.3689 out_w_norm=40.5496 out_g_norm=0.1613 acc_corrupt_t_0p4_0p6=0.1827 corrupt_frac_t_0p4_0p6=0.0812 acc_corrupt_t_0p6_0p8=0.2635 corrupt_frac_t_0p6_0p8=0.0376 loss_all=5.8418 init_gold_top10=0.1503 init_gold_top100=0.4575
|
| 806 |
+
step=1000 epoch=1/5 epoch_step=1000/19018 micro_steps=2000 elapsed=86.1s lr=2.105374e-04 loss=6.6095 loss_recon=6.6095 loss_meanflow=0.0000 mean_model_t=0.2079 mean_corrupt_t=0.2079 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.2832 corrupt_frac=0.5550 acc_corrupt=0.1135 loss_corrupt=6.6095 wrong_frac=0.7930 init_acc_corrupt=0.1136 acc_corrupt_t_0p0_0p2=0.0646 corrupt_frac_t_0p0_0p2=0.5682 acc_corrupt_t_0p2_0p4=0.1546 corrupt_frac_t_0p2_0p4=0.3522 acc_corrupt_t_0p4_0p6=0.2699 corrupt_frac_t_0p4_0p6=0.0801 out_w_norm=43.5751 out_g_norm=0.1417 acc_corrupt_t_0p6_0p8=0.3703 corrupt_frac_t_0p6_0p8=0.0341 acc_corrupt_t_0p8_1p0=0.3959 corrupt_frac_t_0p8_1p0=0.0305 loss_all=5.1801 init_gold_top10=0.1761 init_gold_top100=0.4380
|
| 807 |
+
step=1100 epoch=1/5 epoch_step=1100/19018 micro_steps=2200 elapsed=87.4s lr=2.315701e-04 loss=6.4173 loss_recon=6.4173 loss_meanflow=0.0000 mean_model_t=0.2072 mean_corrupt_t=0.2072 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.3526 corrupt_frac=0.5497 acc_corrupt=0.1407 loss_corrupt=6.4173 wrong_frac=0.7923 init_acc_corrupt=0.1151 acc_corrupt_t_0p0_0p2=0.0777 corrupt_frac_t_0p0_0p2=0.5694 acc_corrupt_t_0p2_0p4=0.1921 corrupt_frac_t_0p2_0p4=0.3438 acc_corrupt_t_0p4_0p6=0.3376 corrupt_frac_t_0p4_0p6=0.0849 out_w_norm=46.9110 out_g_norm=0.1352 acc_corrupt_t_0p6_0p8=0.4598 corrupt_frac_t_0p6_0p8=0.0385 loss_all=4.8849 init_gold_top10=0.1511 init_gold_top100=0.4778
|
| 808 |
+
step=1200 epoch=1/5 epoch_step=1200/19018 micro_steps=2400 elapsed=85.9s lr=2.526028e-04 loss=6.2068 loss_recon=6.2068 loss_meanflow=0.0000 mean_model_t=0.2077 mean_corrupt_t=0.2077 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.3919 corrupt_frac=0.5524 acc_corrupt=0.1651 loss_corrupt=6.2068 wrong_frac=0.7930 init_acc_corrupt=0.1134 acc_corrupt_t_0p0_0p2=0.0967 corrupt_frac_t_0p0_0p2=0.5750 acc_corrupt_t_0p2_0p4=0.2213 corrupt_frac_t_0p2_0p4=0.3357 acc_corrupt_t_0p4_0p6=0.3767 corrupt_frac_t_0p4_0p6=0.0855 out_w_norm=50.0839 out_g_norm=0.1395 acc_corrupt_t_0p6_0p8=0.5168 corrupt_frac_t_0p6_0p8=0.0380 acc_corrupt_t_0p8_1p0=0.5729 corrupt_frac_t_0p8_1p0=0.0484 loss_all=4.5517 init_gold_top10=0.1819 init_gold_top100=0.4666
|
| 809 |
+
step=1300 epoch=1/5 epoch_step=1300/19018 micro_steps=2600 elapsed=85.9s lr=2.736355e-04 loss=5.9556 loss_recon=5.9556 loss_meanflow=0.0000 mean_model_t=0.2102 mean_corrupt_t=0.2102 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.4193 corrupt_frac=0.5542 acc_corrupt=0.1918 loss_corrupt=5.9556 wrong_frac=0.7897 init_acc_corrupt=0.1177 acc_corrupt_t_0p0_0p2=0.1161 corrupt_frac_t_0p0_0p2=0.5479 acc_corrupt_t_0p2_0p4=0.2493 corrupt_frac_t_0p2_0p4=0.3646 acc_corrupt_t_0p4_0p6=0.4164 corrupt_frac_t_0p4_0p6=0.0845 out_w_norm=53.3269 out_g_norm=0.1334 acc_corrupt_t_0p6_0p8=0.5293 corrupt_frac_t_0p6_0p8=0.0343 loss_all=4.7193 init_gold_top10=0.1507 init_gold_top100=0.4609
|
| 810 |
+
step=1400 epoch=1/5 epoch_step=1400/19018 micro_steps=2800 elapsed=85.9s lr=2.946682e-04 loss=5.7526 loss_recon=5.7526 loss_meanflow=0.0000 mean_model_t=0.2084 mean_corrupt_t=0.2084 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.4395 corrupt_frac=0.5537 acc_corrupt=0.2098 loss_corrupt=5.7526 wrong_frac=0.7928 init_acc_corrupt=0.1137 acc_corrupt_t_0p0_0p2=0.1344 corrupt_frac_t_0p0_0p2=0.5623 acc_corrupt_t_0p2_0p4=0.2720 corrupt_frac_t_0p2_0p4=0.3551 acc_corrupt_t_0p4_0p6=0.4460 corrupt_frac_t_0p4_0p6=0.0832 acc_corrupt_t_0p6_0p8=0.5683 corrupt_frac_t_0p6_0p8=0.0317 out_w_norm=57.0235 out_g_norm=0.1265 acc_corrupt_t_0p8_1p0=0.6662 corrupt_frac_t_0p8_1p0=0.0463 loss_all=4.3451 init_gold_top10=0.1726 init_gold_top100=0.4774
|
| 811 |
+
step=1500 epoch=1/5 epoch_step=1500/19018 micro_steps=3000 elapsed=86.3s lr=3.157009e-04 loss=5.5515 loss_recon=5.5515 loss_meanflow=0.0000 mean_model_t=0.2085 mean_corrupt_t=0.2085 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.4654 corrupt_frac=0.5434 acc_corrupt=0.2269 loss_corrupt=5.5515 wrong_frac=0.7929 init_acc_corrupt=0.1132 acc_corrupt_t_0p0_0p2=0.1506 corrupt_frac_t_0p0_0p2=0.5609 acc_corrupt_t_0p2_0p4=0.2881 corrupt_frac_t_0p2_0p4=0.3579 acc_corrupt_t_0p4_0p6=0.4685 corrupt_frac_t_0p4_0p6=0.0780 acc_corrupt_t_0p6_0p8=0.5979 corrupt_frac_t_0p6_0p8=0.0346 out_w_norm=61.3916 out_g_norm=0.1185 acc_corrupt_t_0p8_1p0=0.7810 corrupt_frac_t_0p8_1p0=0.0066 loss_all=4.3082 init_gold_top10=0.1572 init_gold_top100=0.4663
|
| 812 |
+
step=1600 epoch=1/5 epoch_step=1600/19018 micro_steps=3200 elapsed=86.7s lr=3.367336e-04 loss=5.3904 loss_recon=5.3904 loss_meanflow=0.0000 mean_model_t=0.2097 mean_corrupt_t=0.2097 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.4776 corrupt_frac=0.5508 acc_corrupt=0.2403 loss_corrupt=5.3904 wrong_frac=0.7905 init_acc_corrupt=0.1175 acc_corrupt_t_0p0_0p2=0.1576 corrupt_frac_t_0p0_0p2=0.5568 acc_corrupt_t_0p2_0p4=0.3064 corrupt_frac_t_0p2_0p4=0.3575 out_w_norm=66.2681 out_g_norm=0.1160 acc_corrupt_t_0p4_0p6=0.4879 corrupt_frac_t_0p4_0p6=0.0852 acc_corrupt_t_0p6_0p8=0.6338 corrupt_frac_t_0p6_0p8=0.0348 acc_corrupt_t_0p8_1p0=0.7339 corrupt_frac_t_0p8_1p0=0.0238 loss_all=3.8151 init_gold_top10=0.2121 init_gold_top100=0.4529
|
| 813 |
+
step=1700 epoch=1/5 epoch_step=1700/19018 micro_steps=3400 elapsed=86.3s lr=3.577663e-04 loss=5.2517 loss_recon=5.2517 loss_meanflow=0.0000 mean_model_t=0.2118 mean_corrupt_t=0.2118 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.4935 corrupt_frac=0.5511 acc_corrupt=0.2518 loss_corrupt=5.2517 wrong_frac=0.7888 init_acc_corrupt=0.1174 acc_corrupt_t_0p0_0p2=0.1685 corrupt_frac_t_0p0_0p2=0.5502 acc_corrupt_t_0p2_0p4=0.3159 corrupt_frac_t_0p2_0p4=0.3645 acc_corrupt_t_0p4_0p6=0.5030 corrupt_frac_t_0p4_0p6=0.0842 out_w_norm=71.3707 out_g_norm=0.1136 acc_corrupt_t_0p6_0p8=0.6402 corrupt_frac_t_0p6_0p8=0.0337 loss_all=3.5473 init_gold_top10=0.1698 init_gold_top100=0.4573
|
| 814 |
+
step=1800 epoch=1/5 epoch_step=1800/19018 micro_steps=3600 elapsed=86.4s lr=3.787990e-04 loss=5.1378 loss_recon=5.1378 loss_meanflow=0.0000 mean_model_t=0.2103 mean_corrupt_t=0.2103 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.5055 corrupt_frac=0.5522 acc_corrupt=0.2607 loss_corrupt=5.1378 wrong_frac=0.7897 init_acc_corrupt=0.1177 acc_corrupt_t_0p0_0p2=0.1739 corrupt_frac_t_0p0_0p2=0.5549 acc_corrupt_t_0p2_0p4=0.3273 corrupt_frac_t_0p2_0p4=0.3554 acc_corrupt_t_0p4_0p6=0.5213 corrupt_frac_t_0p4_0p6=0.0899 acc_corrupt_t_0p6_0p8=0.6580 corrupt_frac_t_0p6_0p8=0.0324 out_w_norm=76.5382 out_g_norm=0.1128 loss_all=3.4692 init_gold_top10=0.2563 init_gold_top100=0.4698
|
| 815 |
+
step=1900 epoch=1/5 epoch_step=1900/19018 micro_steps=3800 elapsed=86.4s lr=3.998317e-04 loss=5.0430 loss_recon=5.0430 loss_meanflow=0.0000 mean_model_t=0.2088 mean_corrupt_t=0.2088 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.5182 corrupt_frac=0.5512 acc_corrupt=0.2685 loss_corrupt=5.0430 wrong_frac=0.7900 init_acc_corrupt=0.1165 acc_corrupt_t_0p0_0p2=0.1826 corrupt_frac_t_0p0_0p2=0.5467 acc_corrupt_t_0p2_0p4=0.3356 corrupt_frac_t_0p2_0p4=0.3741 acc_corrupt_t_0p4_0p6=0.5292 corrupt_frac_t_0p4_0p6=0.0824 out_w_norm=81.7069 out_g_norm=0.1132 acc_corrupt_t_0p6_0p8=0.6771 corrupt_frac_t_0p6_0p8=0.0363 acc_corrupt_t_0p8_1p0=0.7918 corrupt_frac_t_0p8_1p0=0.0307 loss_all=3.8845 init_gold_top10=0.1785 init_gold_top100=0.4700
|
| 816 |
+
step=2000 epoch=1/5 epoch_step=2000/19018 micro_steps=4000 elapsed=86.4s lr=4.208644e-04 loss=4.9254 loss_recon=4.9254 loss_meanflow=0.0000 mean_model_t=0.2125 mean_corrupt_t=0.2125 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.5342 corrupt_frac=0.5505 acc_corrupt=0.2795 loss_corrupt=4.9254 wrong_frac=0.7867 init_acc_corrupt=0.1206 acc_corrupt_t_0p0_0p2=0.1900 corrupt_frac_t_0p0_0p2=0.5481 acc_corrupt_t_0p2_0p4=0.3454 corrupt_frac_t_0p2_0p4=0.3608 acc_corrupt_t_0p4_0p6=0.5432 corrupt_frac_t_0p4_0p6=0.0870 out_w_norm=86.7303 out_g_norm=0.1127 acc_corrupt_t_0p6_0p8=0.6953 corrupt_frac_t_0p6_0p8=0.0326 acc_corrupt_t_0p8_1p0=0.7696 corrupt_frac_t_0p8_1p0=0.0254 loss_all=2.6664 init_gold_top10=0.1917 init_gold_top100=0.4474
|
| 817 |
+
step=2100 epoch=1/5 epoch_step=2100/19018 micro_steps=4200 elapsed=87.4s lr=4.418972e-04 loss=4.8926 loss_recon=4.8926 loss_meanflow=0.0000 mean_model_t=0.2082 mean_corrupt_t=0.2082 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.5393 corrupt_frac=0.5524 acc_corrupt=0.2798 loss_corrupt=4.8926 wrong_frac=0.7932 init_acc_corrupt=0.1133 acc_corrupt_t_0p0_0p2=0.1928 corrupt_frac_t_0p0_0p2=0.5635 acc_corrupt_t_0p2_0p4=0.3508 corrupt_frac_t_0p2_0p4=0.3539 acc_corrupt_t_0p4_0p6=0.5531 corrupt_frac_t_0p4_0p6=0.0766 out_w_norm=91.6862 out_g_norm=0.1134 acc_corrupt_t_0p6_0p8=0.6963 corrupt_frac_t_0p6_0p8=0.0353 acc_corrupt_t_0p8_1p0=0.8406 corrupt_frac_t_0p8_1p0=0.0306 loss_all=2.8325 init_gold_top10=0.2096 init_gold_top100=0.4672
|
| 818 |
+
step=2200 epoch=1/5 epoch_step=2200/19018 micro_steps=4400 elapsed=86.2s lr=4.629299e-04 loss=4.7971 loss_recon=4.7971 loss_meanflow=0.0000 mean_model_t=0.2067 mean_corrupt_t=0.2067 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.5552 corrupt_frac=0.5460 acc_corrupt=0.2891 loss_corrupt=4.7971 wrong_frac=0.7917 init_acc_corrupt=0.1156 acc_corrupt_t_0p0_0p2=0.2001 corrupt_frac_t_0p0_0p2=0.5635 acc_corrupt_t_0p2_0p4=0.3597 corrupt_frac_t_0p2_0p4=0.3476 acc_corrupt_t_0p4_0p6=0.5594 corrupt_frac_t_0p4_0p6=0.0854 out_w_norm=96.5034 out_g_norm=0.1139 acc_corrupt_t_0p6_0p8=0.7225 corrupt_frac_t_0p6_0p8=0.0415 acc_corrupt_t_0p8_1p0=0.8267 corrupt_frac_t_0p8_1p0=0.0117 loss_all=3.0001 init_gold_top10=0.1882 init_gold_top100=0.4571
|
| 819 |
+
step=2300 epoch=1/5 epoch_step=2300/19018 micro_steps=4600 elapsed=85.9s lr=4.839626e-04 loss=4.7369 loss_recon=4.7369 loss_meanflow=0.0000 mean_model_t=0.2098 mean_corrupt_t=0.2098 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.5626 corrupt_frac=0.5493 acc_corrupt=0.2948 loss_corrupt=4.7369 wrong_frac=0.7902 init_acc_corrupt=0.1172 acc_corrupt_t_0p0_0p2=0.2043 corrupt_frac_t_0p0_0p2=0.5548 acc_corrupt_t_0p2_0p4=0.3644 corrupt_frac_t_0p2_0p4=0.3564 acc_corrupt_t_0p4_0p6=0.5671 corrupt_frac_t_0p4_0p6=0.0882 out_w_norm=101.1737 out_g_norm=0.1138 acc_corrupt_t_0p6_0p8=0.7210 corrupt_frac_t_0p6_0p8=0.0353 acc_corrupt_t_0p8_1p0=0.8434 corrupt_frac_t_0p8_1p0=0.0226 loss_all=2.7683 init_gold_top10=0.1874 init_gold_top100=0.4397
|
| 820 |
+
step=2400 epoch=1/5 epoch_step=2400/19018 micro_steps=4800 elapsed=86.0s lr=5.049953e-04 loss=4.6552 loss_recon=4.6552 loss_meanflow=0.0000 mean_model_t=0.2112 mean_corrupt_t=0.2112 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.5733 corrupt_frac=0.5491 acc_corrupt=0.3030 loss_corrupt=4.6552 wrong_frac=0.7880 init_acc_corrupt=0.1193 acc_corrupt_t_0p0_0p2=0.2098 corrupt_frac_t_0p0_0p2=0.5425 acc_corrupt_t_0p2_0p4=0.3687 corrupt_frac_t_0p2_0p4=0.3685 acc_corrupt_t_0p4_0p6=0.5783 corrupt_frac_t_0p4_0p6=0.0812 out_w_norm=105.6680 out_g_norm=0.1144 acc_corrupt_t_0p6_0p8=0.7404 corrupt_frac_t_0p6_0p8=0.0383 loss_all=3.0119 init_gold_top10=0.1810 init_gold_top100=0.4601
|
LTA_openwebtext_dualt/logs/elfaligned_t5tokenized_8gpu/lta_owt_t5_adaln_adamw_wd0p1_rollin_p50_randk0_3_uniformt_temp1_synct_gbs512_8gpu_1m_20260518_131947.log
ADDED
|
@@ -0,0 +1,364 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
*****************************************
|
| 3 |
+
Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed.
|
| 4 |
+
*****************************************
|
| 5 |
+
|
| 6 |
+
*****************************************
|
| 7 |
+
Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed.
|
| 8 |
+
*****************************************
|
| 9 |
+
NCCL version 2.25.1+cuda12.8
|
| 10 |
+
NCCL version 2.25.1+cuda12.8
|
| 11 |
+
{
|
| 12 |
+
"device": "cuda:0",
|
| 13 |
+
"rank": 0,
|
| 14 |
+
"world_size": 8,
|
| 15 |
+
"samples": "tokenized_hf:9737184:pad=0",
|
| 16 |
+
"vocab_size": 32100,
|
| 17 |
+
"tokenizer_vocab_size": 32100,
|
| 18 |
+
"save_dir": "runs/lta_owt_t5_adaln_adamw_wd0p1_rollin_p50_randk0_3_uniformt_temp1_synct_gbs512_8gpu_1m_20260518_131947",
|
| 19 |
+
"batch_size": 32,
|
| 20 |
+
"grad_accum": 2,
|
| 21 |
+
"effective_batch_size": 512,
|
| 22 |
+
"global_batch_size": 512,
|
| 23 |
+
"lr_schedule": "cosine",
|
| 24 |
+
"optimizer": "adamw",
|
| 25 |
+
"epochs": 0.0,
|
| 26 |
+
"steps_per_epoch": 19018,
|
| 27 |
+
"total_steps": 1000000,
|
| 28 |
+
"warmup_steps": 1000,
|
| 29 |
+
"warmup_epochs": -1.0,
|
| 30 |
+
"min_lr": 6e-05,
|
| 31 |
+
"weight_decay": 0.1,
|
| 32 |
+
"output_weight_decay": -1.0,
|
| 33 |
+
"adamw_param_groups": "nanogpt",
|
| 34 |
+
"adam_beta1": 0.9,
|
| 35 |
+
"adam_beta2": 0.999,
|
| 36 |
+
"adam_eps": 1e-08,
|
| 37 |
+
"muon_impl": "legacy",
|
| 38 |
+
"muon_momentum": 0.95,
|
| 39 |
+
"muon_ns_steps": 5,
|
| 40 |
+
"muon_update_scale": 1.0,
|
| 41 |
+
"muon_nesterov": false,
|
| 42 |
+
"muon_width_scale": false,
|
| 43 |
+
"muon_grouping": "",
|
| 44 |
+
"muon_param_count": 0,
|
| 45 |
+
"muon_adam_param_count": 0,
|
| 46 |
+
"muon_param_names": [],
|
| 47 |
+
"muon_adam_param_names": [],
|
| 48 |
+
"muon_effective_nesterov": false,
|
| 49 |
+
"muon_effective_width_scale": false,
|
| 50 |
+
"muon_effective_weight_decay": 0.1,
|
| 51 |
+
"muon_adam_fallback_nesterov": false,
|
| 52 |
+
"muon_adam_fallback_weight_decay": 0.1,
|
| 53 |
+
"ema_decay": 0.9999,
|
| 54 |
+
"ema_start_step": 0,
|
| 55 |
+
"model_type": "ddit",
|
| 56 |
+
"ddit_mlp_type": "swiglu",
|
| 57 |
+
"elf_num_time_tokens": 4,
|
| 58 |
+
"elf_num_model_mode_tokens": 0,
|
| 59 |
+
"qk_norm": true,
|
| 60 |
+
"output_bias": false,
|
| 61 |
+
"output_init_std": -1.0,
|
| 62 |
+
"norm_type": "rmsnorm",
|
| 63 |
+
"target_loss": "hard_ce",
|
| 64 |
+
"linear_soft_target_power": 1.0,
|
| 65 |
+
"linear_soft_target_min_conf": 0.0,
|
| 66 |
+
"linear_soft_target_max_conf": 1.0,
|
| 67 |
+
"t_sampling_mode": "uniform",
|
| 68 |
+
"t_sampling_power": 1.0,
|
| 69 |
+
"t_sampling_eps": 0.0001,
|
| 70 |
+
"t_sampling_logit_mean": -1.5,
|
| 71 |
+
"t_sampling_logit_std": 0.8,
|
| 72 |
+
"dual_t": true,
|
| 73 |
+
"corrupt_t_mode": "same",
|
| 74 |
+
"corrupt_min_t": 0.0,
|
| 75 |
+
"corrupt_max_t": 1.0,
|
| 76 |
+
"prefix_block_prob": 0.0,
|
| 77 |
+
"prefix_block_len": 128,
|
| 78 |
+
"mask_ratio_floor_schedule": "none",
|
| 79 |
+
"dirichlet_endpoint_mode": "categorical_dual_t",
|
| 80 |
+
"dirichlet_semantic_t_mode": "same",
|
| 81 |
+
"dirichlet_semantic_t_value": 0.0,
|
| 82 |
+
"dirichlet_semantic_t_curve": "linear",
|
| 83 |
+
"dirichlet_semantic_t_power": 1.0,
|
| 84 |
+
"endpoint_sequence_random_prob_alpha": 0.0,
|
| 85 |
+
"categorical_wrong_from_full_vocab": true,
|
| 86 |
+
"categorical_wrong_from_batch_valid_tokens": false,
|
| 87 |
+
"categorical_wrong_basin_token_ids": "",
|
| 88 |
+
"categorical_wrong_basin_prob": 0.0,
|
| 89 |
+
"categorical_wrong_unigram_prob": 0.0,
|
| 90 |
+
"categorical_wrong_uniform_prob": 0.0,
|
| 91 |
+
"categorical_wrong_prob_floor": 0.0,
|
| 92 |
+
"categorical_wrong_corpus_unigram_path": "",
|
| 93 |
+
"categorical_wrong_corpus_unigram_alpha": 1.0,
|
| 94 |
+
"categorical_wrong_basin_shared_prob": 0.0,
|
| 95 |
+
"categorical_wrong_unigram_shared_prob": 0.0,
|
| 96 |
+
"mask_mixture_original_prob": 0.0,
|
| 97 |
+
"mask_mixture_lowk_prob": 0.0,
|
| 98 |
+
"mask_mixture_lowcorrupt_prob": 0.0,
|
| 99 |
+
"mask_mixture_block_prob": 0.0,
|
| 100 |
+
"mask_mixture_all_prob": 1.0,
|
| 101 |
+
"mask_mixture_lowk_clean_tokens": "1,2,4,8,16,32,64",
|
| 102 |
+
"mask_mixture_lowcorrupt_tokens": "1,2,4,8,16,32,64",
|
| 103 |
+
"mask_mixture_block_tokens": "64,128",
|
| 104 |
+
"simplex_bridge_sampler": "dirichlet",
|
| 105 |
+
"logistic_normal_sigma_min": 0.18,
|
| 106 |
+
"logistic_normal_sigma_max": 2.2,
|
| 107 |
+
"logistic_normal_tau_min": 0.65,
|
| 108 |
+
"logistic_normal_tau_max": 1.15,
|
| 109 |
+
"torch_compile": false,
|
| 110 |
+
"compile_mode": "max-autotune",
|
| 111 |
+
"state_format": "prob",
|
| 112 |
+
"meanflow_weight": 0.0,
|
| 113 |
+
"rollout_train_prob": 0.5,
|
| 114 |
+
"rollout_train_steps": 3,
|
| 115 |
+
"rollout_train_steps_min": 0,
|
| 116 |
+
"rollout_train_infer_steps": 1,
|
| 117 |
+
"rollout_train_time_mode": "sampled_path",
|
| 118 |
+
"rollout_train_s_dist": "uniform",
|
| 119 |
+
"rollout_train_s_min_frac": 0.0,
|
| 120 |
+
"rollout_train_s_max_frac": 0.25,
|
| 121 |
+
"rollout_train_s_beta_alpha": 2.0,
|
| 122 |
+
"rollout_train_s_beta_beta": 6.0,
|
| 123 |
+
"rollout_train_temp": 1.0,
|
| 124 |
+
"rollout_train_max_gamma": 1.0,
|
| 125 |
+
"rollout_train_corrupt_only": true,
|
| 126 |
+
"rollout_train_samplewise": true,
|
| 127 |
+
"rollout_train_compute_always": false,
|
| 128 |
+
"rollout_train_sync_t": true,
|
| 129 |
+
"bridge_noise_init": "logistic_normal",
|
| 130 |
+
"noise_sigma": -1.0,
|
| 131 |
+
"allow_tf32": true,
|
| 132 |
+
"activation_checkpointing": true,
|
| 133 |
+
"activation_checkpoint_interval": 1,
|
| 134 |
+
"activation_checkpoint_scope": "mlp",
|
| 135 |
+
"ddp_static_graph": false,
|
| 136 |
+
"ddp_gradient_as_bucket_view": true,
|
| 137 |
+
"blocking_data_transfer": false,
|
| 138 |
+
"dataloader_prefetch_factor": 4,
|
| 139 |
+
"full_train_stats": false,
|
| 140 |
+
"tokenized_hf": true,
|
| 141 |
+
"tokenized_pad_token": "pad",
|
| 142 |
+
"elf_conditional_hf": false,
|
| 143 |
+
"record_pad_truncate": false,
|
| 144 |
+
"record_add_eos": false,
|
| 145 |
+
"record_add_special_tokens": false,
|
| 146 |
+
"record_pad_token": "pad",
|
| 147 |
+
"record_shuffle_buffer": 10000,
|
| 148 |
+
"wrap": false,
|
| 149 |
+
"wrap_mode": "stream",
|
| 150 |
+
"wrap_record_buffer_size": 200,
|
| 151 |
+
"owt_cached_chunks": false,
|
| 152 |
+
"owt_chunk_cache_dir": "",
|
| 153 |
+
"owt_chunk_cache_rebuild": false,
|
| 154 |
+
"owt_chunk_cache_write_batch": 4096,
|
| 155 |
+
"owt_exact_repeat_per_chunk": 0,
|
| 156 |
+
"online_chunk_shuffle": false,
|
| 157 |
+
"online_chunk_shuffle_buffer": 10000,
|
| 158 |
+
"openwebtext_split": "all",
|
| 159 |
+
"detokenizer": "auto",
|
| 160 |
+
"resolved_detokenizer": null,
|
| 161 |
+
"num_workers": 8,
|
| 162 |
+
"latest_every": 1000,
|
| 163 |
+
"resume_path": ""
|
| 164 |
+
}
|
| 165 |
+
{
|
| 166 |
+
"device": "cuda:0",
|
| 167 |
+
"rank": 0,
|
| 168 |
+
"world_size": 8,
|
| 169 |
+
"samples": "tokenized_hf:9737184:pad=0",
|
| 170 |
+
"vocab_size": 32100,
|
| 171 |
+
"tokenizer_vocab_size": 32100,
|
| 172 |
+
"save_dir": "runs/lta_owt_t5_adaln_adamw_wd0p1_rollin_p50_randk0_3_uniformt_temp1_synct_gbs512_8gpu_1m_20260518_131947",
|
| 173 |
+
"batch_size": 32,
|
| 174 |
+
"grad_accum": 2,
|
| 175 |
+
"effective_batch_size": 512,
|
| 176 |
+
"global_batch_size": 512,
|
| 177 |
+
"lr_schedule": "cosine",
|
| 178 |
+
"optimizer": "adamw",
|
| 179 |
+
"epochs": 0.0,
|
| 180 |
+
"steps_per_epoch": 19018,
|
| 181 |
+
"total_steps": 1000000,
|
| 182 |
+
"warmup_steps": 1000,
|
| 183 |
+
"warmup_epochs": -1.0,
|
| 184 |
+
"min_lr": 6e-05,
|
| 185 |
+
"weight_decay": 0.1,
|
| 186 |
+
"output_weight_decay": -1.0,
|
| 187 |
+
"adamw_param_groups": "nanogpt",
|
| 188 |
+
"adam_beta1": 0.9,
|
| 189 |
+
"adam_beta2": 0.999,
|
| 190 |
+
"adam_eps": 1e-08,
|
| 191 |
+
"muon_impl": "legacy",
|
| 192 |
+
"muon_momentum": 0.95,
|
| 193 |
+
"muon_ns_steps": 5,
|
| 194 |
+
"muon_update_scale": 1.0,
|
| 195 |
+
"muon_nesterov": false,
|
| 196 |
+
"muon_width_scale": false,
|
| 197 |
+
"muon_grouping": "",
|
| 198 |
+
"muon_param_count": 0,
|
| 199 |
+
"muon_adam_param_count": 0,
|
| 200 |
+
"muon_param_names": [],
|
| 201 |
+
"muon_adam_param_names": [],
|
| 202 |
+
"muon_effective_nesterov": false,
|
| 203 |
+
"muon_effective_width_scale": false,
|
| 204 |
+
"muon_effective_weight_decay": 0.1,
|
| 205 |
+
"muon_adam_fallback_nesterov": false,
|
| 206 |
+
"muon_adam_fallback_weight_decay": 0.1,
|
| 207 |
+
"ema_decay": 0.9999,
|
| 208 |
+
"ema_start_step": 0,
|
| 209 |
+
"model_type": "ddit",
|
| 210 |
+
"ddit_mlp_type": "swiglu",
|
| 211 |
+
"elf_num_time_tokens": 4,
|
| 212 |
+
"elf_num_model_mode_tokens": 0,
|
| 213 |
+
"qk_norm": true,
|
| 214 |
+
"output_bias": false,
|
| 215 |
+
"output_init_std": -1.0,
|
| 216 |
+
"norm_type": "rmsnorm",
|
| 217 |
+
"target_loss": "hard_ce",
|
| 218 |
+
"linear_soft_target_power": 1.0,
|
| 219 |
+
"linear_soft_target_min_conf": 0.0,
|
| 220 |
+
"linear_soft_target_max_conf": 1.0,
|
| 221 |
+
"t_sampling_mode": "uniform",
|
| 222 |
+
"t_sampling_power": 1.0,
|
| 223 |
+
"t_sampling_eps": 0.0001,
|
| 224 |
+
"t_sampling_logit_mean": -1.5,
|
| 225 |
+
"t_sampling_logit_std": 0.8,
|
| 226 |
+
"dual_t": true,
|
| 227 |
+
"corrupt_t_mode": "same",
|
| 228 |
+
"corrupt_min_t": 0.0,
|
| 229 |
+
"corrupt_max_t": 1.0,
|
| 230 |
+
"prefix_block_prob": 0.0,
|
| 231 |
+
"prefix_block_len": 128,
|
| 232 |
+
"mask_ratio_floor_schedule": "none",
|
| 233 |
+
"dirichlet_endpoint_mode": "categorical_dual_t",
|
| 234 |
+
"dirichlet_semantic_t_mode": "same",
|
| 235 |
+
"dirichlet_semantic_t_value": 0.0,
|
| 236 |
+
"dirichlet_semantic_t_curve": "linear",
|
| 237 |
+
"dirichlet_semantic_t_power": 1.0,
|
| 238 |
+
"endpoint_sequence_random_prob_alpha": 0.0,
|
| 239 |
+
"categorical_wrong_from_full_vocab": true,
|
| 240 |
+
"categorical_wrong_from_batch_valid_tokens": false,
|
| 241 |
+
"categorical_wrong_basin_token_ids": "",
|
| 242 |
+
"categorical_wrong_basin_prob": 0.0,
|
| 243 |
+
"categorical_wrong_unigram_prob": 0.0,
|
| 244 |
+
"categorical_wrong_uniform_prob": 0.0,
|
| 245 |
+
"categorical_wrong_prob_floor": 0.0,
|
| 246 |
+
"categorical_wrong_corpus_unigram_path": "",
|
| 247 |
+
"categorical_wrong_corpus_unigram_alpha": 1.0,
|
| 248 |
+
"categorical_wrong_basin_shared_prob": 0.0,
|
| 249 |
+
"categorical_wrong_unigram_shared_prob": 0.0,
|
| 250 |
+
"mask_mixture_original_prob": 0.0,
|
| 251 |
+
"mask_mixture_lowk_prob": 0.0,
|
| 252 |
+
"mask_mixture_lowcorrupt_prob": 0.0,
|
| 253 |
+
"mask_mixture_block_prob": 0.0,
|
| 254 |
+
"mask_mixture_all_prob": 1.0,
|
| 255 |
+
"mask_mixture_lowk_clean_tokens": "1,2,4,8,16,32,64",
|
| 256 |
+
"mask_mixture_lowcorrupt_tokens": "1,2,4,8,16,32,64",
|
| 257 |
+
"mask_mixture_block_tokens": "64,128",
|
| 258 |
+
"simplex_bridge_sampler": "dirichlet",
|
| 259 |
+
"logistic_normal_sigma_min": 0.18,
|
| 260 |
+
"logistic_normal_sigma_max": 2.2,
|
| 261 |
+
"logistic_normal_tau_min": 0.65,
|
| 262 |
+
"logistic_normal_tau_max": 1.15,
|
| 263 |
+
"torch_compile": false,
|
| 264 |
+
"compile_mode": "max-autotune",
|
| 265 |
+
"state_format": "prob",
|
| 266 |
+
"meanflow_weight": 0.0,
|
| 267 |
+
"rollout_train_prob": 0.5,
|
| 268 |
+
"rollout_train_steps": 3,
|
| 269 |
+
"rollout_train_steps_min": 0,
|
| 270 |
+
"rollout_train_infer_steps": 1,
|
| 271 |
+
"rollout_train_time_mode": "sampled_path",
|
| 272 |
+
"rollout_train_s_dist": "uniform",
|
| 273 |
+
"rollout_train_s_min_frac": 0.0,
|
| 274 |
+
"rollout_train_s_max_frac": 0.25,
|
| 275 |
+
"rollout_train_s_beta_alpha": 2.0,
|
| 276 |
+
"rollout_train_s_beta_beta": 6.0,
|
| 277 |
+
"rollout_train_temp": 1.0,
|
| 278 |
+
"rollout_train_max_gamma": 1.0,
|
| 279 |
+
"rollout_train_corrupt_only": true,
|
| 280 |
+
"rollout_train_samplewise": true,
|
| 281 |
+
"rollout_train_compute_always": false,
|
| 282 |
+
"rollout_train_sync_t": true,
|
| 283 |
+
"bridge_noise_init": "logistic_normal",
|
| 284 |
+
"noise_sigma": -1.0,
|
| 285 |
+
"allow_tf32": true,
|
| 286 |
+
"activation_checkpointing": true,
|
| 287 |
+
"activation_checkpoint_interval": 1,
|
| 288 |
+
"activation_checkpoint_scope": "mlp",
|
| 289 |
+
"ddp_static_graph": false,
|
| 290 |
+
"ddp_gradient_as_bucket_view": true,
|
| 291 |
+
"blocking_data_transfer": false,
|
| 292 |
+
"dataloader_prefetch_factor": 4,
|
| 293 |
+
"full_train_stats": false,
|
| 294 |
+
"tokenized_hf": true,
|
| 295 |
+
"tokenized_pad_token": "pad",
|
| 296 |
+
"elf_conditional_hf": false,
|
| 297 |
+
"record_pad_truncate": false,
|
| 298 |
+
"record_add_eos": false,
|
| 299 |
+
"record_add_special_tokens": false,
|
| 300 |
+
"record_pad_token": "pad",
|
| 301 |
+
"record_shuffle_buffer": 10000,
|
| 302 |
+
"wrap": false,
|
| 303 |
+
"wrap_mode": "stream",
|
| 304 |
+
"wrap_record_buffer_size": 200,
|
| 305 |
+
"owt_cached_chunks": false,
|
| 306 |
+
"owt_chunk_cache_dir": "",
|
| 307 |
+
"owt_chunk_cache_rebuild": false,
|
| 308 |
+
"owt_chunk_cache_write_batch": 4096,
|
| 309 |
+
"owt_exact_repeat_per_chunk": 0,
|
| 310 |
+
"online_chunk_shuffle": false,
|
| 311 |
+
"online_chunk_shuffle_buffer": 10000,
|
| 312 |
+
"openwebtext_split": "all",
|
| 313 |
+
"detokenizer": "auto",
|
| 314 |
+
"resolved_detokenizer": null,
|
| 315 |
+
"num_workers": 8,
|
| 316 |
+
"latest_every": 1000,
|
| 317 |
+
"resume_path": ""
|
| 318 |
+
}
|
| 319 |
+
step=100 epoch=1/53 epoch_step=100/19018 micro_steps=200 elapsed=128.4s lr=6.060000e-05 loss=9.5003 loss_recon=9.5003 loss_meanflow=0.0000 mean_model_t=0.5010 mean_corrupt_t=0.5010 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.5009 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.2643 corrupt_frac=1.0000 acc_corrupt=0.2643 loss_corrupt=9.5003 wrong_frac=0.4987 init_acc_corrupt=0.4678 acc_corrupt_t_0p0_0p2=0.0404 corrupt_frac_t_0p0_0p2=0.1988 acc_corrupt_t_0p2_0p4=0.1347 corrupt_frac_t_0p2_0p4=0.1957 acc_corrupt_t_0p4_0p6=0.2571 corrupt_frac_t_0p4_0p6=0.2070 acc_corrupt_t_0p6_0p8=0.3793 corrupt_frac_t_0p6_0p8=0.1999 acc_corrupt_t_0p8_1p0=0.5058 corrupt_frac_t_0p8_1p0=0.2007 out_w_norm=4.8209 out_g_norm=1.6074 loss_all=7.7145 init_gold_top10=0.4554 init_gold_top100=0.5816 rollout_applied_pos_frac=0.4999 init_acc_rollout_applied=0.5138 init_acc_rollout_kept=0.3459 logit_acc_rollout_applied=0.1651 logit_acc_rollout_kept=0.1267
|
| 320 |
+
step=100 epoch=1/53 epoch_step=100/19018 micro_steps=200 elapsed=128.3s lr=6.060000e-05 loss=9.5003 loss_recon=9.5003 loss_meanflow=0.0000 mean_model_t=0.5010 mean_corrupt_t=0.5010 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.5009 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.2643 corrupt_frac=1.0000 acc_corrupt=0.2643 loss_corrupt=9.5003 wrong_frac=0.4987 init_acc_corrupt=0.4678 acc_corrupt_t_0p0_0p2=0.0404 corrupt_frac_t_0p0_0p2=0.1988 acc_corrupt_t_0p2_0p4=0.1347 corrupt_frac_t_0p2_0p4=0.1957 acc_corrupt_t_0p4_0p6=0.2571 corrupt_frac_t_0p4_0p6=0.2070 acc_corrupt_t_0p6_0p8=0.3793 corrupt_frac_t_0p6_0p8=0.1999 acc_corrupt_t_0p8_1p0=0.5058 corrupt_frac_t_0p8_1p0=0.2007 out_w_norm=4.8209 out_g_norm=1.6074 loss_all=7.7145 init_gold_top10=0.4554 init_gold_top100=0.5816 rollout_applied_pos_frac=0.4999 init_acc_rollout_applied=0.5138 init_acc_rollout_kept=0.3459 logit_acc_rollout_applied=0.1651 logit_acc_rollout_kept=0.1267
|
| 321 |
+
step=200 epoch=1/53 epoch_step=200/19018 micro_steps=400 elapsed=127.2s lr=1.206000e-04 loss=5.6868 loss_recon=5.6868 loss_meanflow=0.0000 mean_model_t=0.5028 mean_corrupt_t=0.5028 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.5070 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.2920 corrupt_frac=1.0000 acc_corrupt=0.2920 loss_corrupt=5.6868 wrong_frac=0.4971 init_acc_corrupt=0.4702 acc_corrupt_t_0p0_0p2=0.0526 corrupt_frac_t_0p0_0p2=0.1981 acc_corrupt_t_0p2_0p4=0.1478 corrupt_frac_t_0p2_0p4=0.1973 acc_corrupt_t_0p4_0p6=0.2880 corrupt_frac_t_0p4_0p6=0.1975 acc_corrupt_t_0p6_0p8=0.4161 corrupt_frac_t_0p6_0p8=0.2050 acc_corrupt_t_0p8_1p0=0.5429 corrupt_frac_t_0p8_1p0=0.2031 out_w_norm=19.3508 out_g_norm=1.0913 loss_all=3.9103 init_gold_top10=0.5571 init_gold_top100=0.6678 rollout_applied_pos_frac=0.5905 init_acc_rollout_applied=0.5359 init_acc_rollout_kept=0.5080 logit_acc_rollout_applied=0.5153 logit_acc_rollout_kept=0.4977
|
| 322 |
+
step=200 epoch=1/53 epoch_step=200/19018 micro_steps=400 elapsed=127.9s lr=1.206000e-04 loss=5.6913 loss_recon=5.6913 loss_meanflow=0.0000 mean_model_t=0.5028 mean_corrupt_t=0.5028 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.5070 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.2918 corrupt_frac=1.0000 acc_corrupt=0.2918 loss_corrupt=5.6913 wrong_frac=0.4971 init_acc_corrupt=0.4702 acc_corrupt_t_0p0_0p2=0.0522 corrupt_frac_t_0p0_0p2=0.1981 acc_corrupt_t_0p2_0p4=0.1477 corrupt_frac_t_0p2_0p4=0.1973 acc_corrupt_t_0p4_0p6=0.2875 corrupt_frac_t_0p4_0p6=0.1975 acc_corrupt_t_0p6_0p8=0.4163 corrupt_frac_t_0p6_0p8=0.2050 acc_corrupt_t_0p8_1p0=0.5427 corrupt_frac_t_0p8_1p0=0.2031 out_w_norm=19.3841 out_g_norm=1.1588 loss_all=3.8633 init_gold_top10=0.5519 init_gold_top100=0.6667 rollout_applied_pos_frac=0.5905 init_acc_rollout_applied=0.5352 init_acc_rollout_kept=0.5080 logit_acc_rollout_applied=0.5211 logit_acc_rollout_kept=0.5013
|
| 323 |
+
step=300 epoch=1/53 epoch_step=300/19018 micro_steps=600 elapsed=126.9s lr=1.806000e-04 loss=4.1153 loss_recon=4.1153 loss_meanflow=0.0000 mean_model_t=0.5000 mean_corrupt_t=0.5000 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.5058 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.4752 corrupt_frac=1.0000 acc_corrupt=0.4752 loss_corrupt=4.1153 wrong_frac=0.5004 init_acc_corrupt=0.4675 acc_corrupt_t_0p0_0p2=0.0600 corrupt_frac_t_0p0_0p2=0.1989 acc_corrupt_t_0p2_0p4=0.2380 corrupt_frac_t_0p2_0p4=0.2006 acc_corrupt_t_0p4_0p6=0.4932 corrupt_frac_t_0p4_0p6=0.2021 acc_corrupt_t_0p6_0p8=0.6947 corrupt_frac_t_0p6_0p8=0.2005 acc_corrupt_t_0p8_1p0=0.8922 corrupt_frac_t_0p8_1p0=0.1989 out_w_norm=26.8347 out_g_norm=0.4770 loss_all=4.6517 init_gold_top10=0.4153 init_gold_top100=0.5641 rollout_applied_pos_frac=0.5393 init_acc_rollout_applied=0.3474 init_acc_rollout_kept=0.4090 logit_acc_rollout_applied=0.3711 logit_acc_rollout_kept=0.4269
|
| 324 |
+
step=300 epoch=1/53 epoch_step=300/19018 micro_steps=600 elapsed=127.6s lr=1.806000e-04 loss=4.1237 loss_recon=4.1237 loss_meanflow=0.0000 mean_model_t=0.5000 mean_corrupt_t=0.5000 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.5058 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.4746 corrupt_frac=1.0000 acc_corrupt=0.4746 loss_corrupt=4.1237 wrong_frac=0.5004 init_acc_corrupt=0.4675 acc_corrupt_t_0p0_0p2=0.0584 corrupt_frac_t_0p0_0p2=0.1989 acc_corrupt_t_0p2_0p4=0.2373 corrupt_frac_t_0p2_0p4=0.2006 acc_corrupt_t_0p4_0p6=0.4926 corrupt_frac_t_0p4_0p6=0.2021 acc_corrupt_t_0p6_0p8=0.6945 corrupt_frac_t_0p6_0p8=0.2005 acc_corrupt_t_0p8_1p0=0.8921 corrupt_frac_t_0p8_1p0=0.1989 out_w_norm=26.9208 out_g_norm=0.4931 loss_all=4.6661 init_gold_top10=0.4163 init_gold_top100=0.5642 rollout_applied_pos_frac=0.5393 init_acc_rollout_applied=0.3467 init_acc_rollout_kept=0.4090 logit_acc_rollout_applied=0.3643 logit_acc_rollout_kept=0.4245
|
| 325 |
+
step=400 epoch=1/53 epoch_step=400/19018 micro_steps=800 elapsed=127.3s lr=2.406000e-04 loss=3.9789 loss_recon=3.9789 loss_meanflow=0.0000 mean_model_t=0.5038 mean_corrupt_t=0.5038 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.5031 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.4898 corrupt_frac=1.0000 acc_corrupt=0.4898 loss_corrupt=3.9789 wrong_frac=0.4963 init_acc_corrupt=0.4727 acc_corrupt_t_0p0_0p2=0.0644 corrupt_frac_t_0p0_0p2=0.1947 acc_corrupt_t_0p2_0p4=0.2612 corrupt_frac_t_0p2_0p4=0.1999 acc_corrupt_t_0p4_0p6=0.5032 corrupt_frac_t_0p4_0p6=0.2013 acc_corrupt_t_0p6_0p8=0.7038 corrupt_frac_t_0p6_0p8=0.2029 acc_corrupt_t_0p8_1p0=0.8992 corrupt_frac_t_0p8_1p0=0.2012 out_w_norm=30.7268 out_g_norm=0.3372 loss_all=4.2426 init_gold_top10=0.4804 init_gold_top100=0.6334 rollout_applied_pos_frac=0.4657 init_acc_rollout_applied=0.3259 init_acc_rollout_kept=0.4833 logit_acc_rollout_applied=0.3522 logit_acc_rollout_kept=0.5186
|
| 326 |
+
step=400 epoch=1/53 epoch_step=400/19018 micro_steps=800 elapsed=127.9s lr=2.406000e-04 loss=3.9821 loss_recon=3.9821 loss_meanflow=0.0000 mean_model_t=0.5038 mean_corrupt_t=0.5038 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.5031 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.4893 corrupt_frac=1.0000 acc_corrupt=0.4893 loss_corrupt=3.9821 wrong_frac=0.4963 init_acc_corrupt=0.4727 acc_corrupt_t_0p0_0p2=0.0644 corrupt_frac_t_0p0_0p2=0.1947 acc_corrupt_t_0p2_0p4=0.2594 corrupt_frac_t_0p2_0p4=0.1999 acc_corrupt_t_0p4_0p6=0.5025 corrupt_frac_t_0p4_0p6=0.2013 acc_corrupt_t_0p6_0p8=0.7038 corrupt_frac_t_0p6_0p8=0.2029 acc_corrupt_t_0p8_1p0=0.8992 corrupt_frac_t_0p8_1p0=0.2012 out_w_norm=30.9578 out_g_norm=0.3447 loss_all=4.2420 init_gold_top10=0.4803 init_gold_top100=0.6319 rollout_applied_pos_frac=0.4657 init_acc_rollout_applied=0.3257 init_acc_rollout_kept=0.4833 logit_acc_rollout_applied=0.3503 logit_acc_rollout_kept=0.5222
|
| 327 |
+
step=500 epoch=1/53 epoch_step=500/19018 micro_steps=1000 elapsed=127.4s lr=3.006000e-04 loss=3.8254 loss_recon=3.8254 loss_meanflow=0.0000 mean_model_t=0.4995 mean_corrupt_t=0.4995 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.5047 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.4965 corrupt_frac=1.0000 acc_corrupt=0.4965 loss_corrupt=3.8254 wrong_frac=0.5003 init_acc_corrupt=0.4677 acc_corrupt_t_0p0_0p2=0.0693 corrupt_frac_t_0p0_0p2=0.1982 acc_corrupt_t_0p2_0p4=0.2756 corrupt_frac_t_0p2_0p4=0.2092 acc_corrupt_t_0p4_0p6=0.5185 corrupt_frac_t_0p4_0p6=0.1874 acc_corrupt_t_0p6_0p8=0.7155 corrupt_frac_t_0p6_0p8=0.2040 acc_corrupt_t_0p8_1p0=0.9043 corrupt_frac_t_0p8_1p0=0.2032 out_w_norm=34.7422 out_g_norm=0.3654 loss_all=3.8885 init_gold_top10=0.4804 init_gold_top100=0.6383 rollout_applied_pos_frac=0.4496 init_acc_rollout_applied=0.4577 init_acc_rollout_kept=0.4245 logit_acc_rollout_applied=0.4934 logit_acc_rollout_kept=0.4638
|
| 328 |
+
step=500 epoch=1/53 epoch_step=500/19018 micro_steps=1000 elapsed=127.9s lr=3.006000e-04 loss=3.7790 loss_recon=3.7790 loss_meanflow=0.0000 mean_model_t=0.4995 mean_corrupt_t=0.4995 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.5047 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.4998 corrupt_frac=1.0000 acc_corrupt=0.4998 loss_corrupt=3.7790 wrong_frac=0.5003 init_acc_corrupt=0.4678 acc_corrupt_t_0p0_0p2=0.0704 corrupt_frac_t_0p0_0p2=0.1982 acc_corrupt_t_0p2_0p4=0.2810 corrupt_frac_t_0p2_0p4=0.2092 acc_corrupt_t_0p4_0p6=0.5232 corrupt_frac_t_0p4_0p6=0.1874 acc_corrupt_t_0p6_0p8=0.7193 corrupt_frac_t_0p6_0p8=0.2040 acc_corrupt_t_0p8_1p0=0.9058 corrupt_frac_t_0p8_1p0=0.2032 out_w_norm=35.3866 out_g_norm=0.3337 loss_all=3.8471 init_gold_top10=0.4821 init_gold_top100=0.6405 rollout_applied_pos_frac=0.4496 init_acc_rollout_applied=0.4577 init_acc_rollout_kept=0.4245 logit_acc_rollout_applied=0.4993 logit_acc_rollout_kept=0.4680
|
| 329 |
+
step=600 epoch=1/53 epoch_step=600/19018 micro_steps=1200 elapsed=126.8s lr=3.606000e-04 loss=3.5795 loss_recon=3.5795 loss_meanflow=0.0000 mean_model_t=0.5014 mean_corrupt_t=0.5014 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.4886 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.5160 corrupt_frac=1.0000 acc_corrupt=0.5160 loss_corrupt=3.5795 wrong_frac=0.4982 init_acc_corrupt=0.4713 acc_corrupt_t_0p0_0p2=0.0722 corrupt_frac_t_0p0_0p2=0.2012 acc_corrupt_t_0p2_0p4=0.3011 corrupt_frac_t_0p2_0p4=0.1954 acc_corrupt_t_0p4_0p6=0.5494 corrupt_frac_t_0p4_0p6=0.1994 acc_corrupt_t_0p6_0p8=0.7374 corrupt_frac_t_0p6_0p8=0.2062 acc_corrupt_t_0p8_1p0=0.9154 corrupt_frac_t_0p8_1p0=0.1977 out_w_norm=39.6572 out_g_norm=0.3169 loss_all=3.4472 init_gold_top10=0.5308 init_gold_top100=0.6502 rollout_applied_pos_frac=0.3817 init_acc_rollout_applied=0.4244 init_acc_rollout_kept=0.5100 logit_acc_rollout_applied=0.4826 logit_acc_rollout_kept=0.5568
|
| 330 |
+
step=600 epoch=1/53 epoch_step=600/19018 micro_steps=1200 elapsed=127.4s lr=3.606000e-04 loss=3.5474 loss_recon=3.5474 loss_meanflow=0.0000 mean_model_t=0.5014 mean_corrupt_t=0.5014 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.4886 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.5181 corrupt_frac=1.0000 acc_corrupt=0.5181 loss_corrupt=3.5474 wrong_frac=0.4982 init_acc_corrupt=0.4714 acc_corrupt_t_0p0_0p2=0.0731 corrupt_frac_t_0p0_0p2=0.2012 acc_corrupt_t_0p2_0p4=0.3045 corrupt_frac_t_0p2_0p4=0.1954 acc_corrupt_t_0p4_0p6=0.5522 corrupt_frac_t_0p4_0p6=0.1994 acc_corrupt_t_0p6_0p8=0.7398 corrupt_frac_t_0p6_0p8=0.2062 acc_corrupt_t_0p8_1p0=0.9164 corrupt_frac_t_0p8_1p0=0.1977 out_w_norm=40.2621 out_g_norm=0.3165 loss_all=3.4160 init_gold_top10=0.5290 init_gold_top100=0.6505 rollout_applied_pos_frac=0.3817 init_acc_rollout_applied=0.4238 init_acc_rollout_kept=0.5100 logit_acc_rollout_applied=0.4840 logit_acc_rollout_kept=0.5613
|
| 331 |
+
step=700 epoch=1/53 epoch_step=700/19018 micro_steps=1400 elapsed=127.6s lr=4.206000e-04 loss=3.4168 loss_recon=3.4168 loss_meanflow=0.0000 mean_model_t=0.5075 mean_corrupt_t=0.5075 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.4975 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.5308 corrupt_frac=1.0000 acc_corrupt=0.5308 loss_corrupt=3.4168 wrong_frac=0.4922 init_acc_corrupt=0.4784 acc_corrupt_t_0p0_0p2=0.0743 corrupt_frac_t_0p0_0p2=0.1963 acc_corrupt_t_0p2_0p4=0.3119 corrupt_frac_t_0p2_0p4=0.1915 acc_corrupt_t_0p4_0p6=0.5582 corrupt_frac_t_0p4_0p6=0.1964 acc_corrupt_t_0p6_0p8=0.7481 corrupt_frac_t_0p6_0p8=0.2077 acc_corrupt_t_0p8_1p0=0.9199 corrupt_frac_t_0p8_1p0=0.2082 out_w_norm=45.1937 out_g_norm=0.2718 loss_all=3.3184 init_gold_top10=0.5668 init_gold_top100=0.6837 rollout_applied_pos_frac=0.4076 init_acc_rollout_applied=0.5451 init_acc_rollout_kept=0.4696 logit_acc_rollout_applied=0.5799 logit_acc_rollout_kept=0.5275
|
| 332 |
+
step=700 epoch=1/53 epoch_step=700/19018 micro_steps=1400 elapsed=128.2s lr=4.206000e-04 loss=3.3933 loss_recon=3.3933 loss_meanflow=0.0000 mean_model_t=0.5075 mean_corrupt_t=0.5075 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.4975 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.5325 corrupt_frac=1.0000 acc_corrupt=0.5325 loss_corrupt=3.3933 wrong_frac=0.4922 init_acc_corrupt=0.4785 acc_corrupt_t_0p0_0p2=0.0751 corrupt_frac_t_0p0_0p2=0.1963 acc_corrupt_t_0p2_0p4=0.3143 corrupt_frac_t_0p2_0p4=0.1915 acc_corrupt_t_0p4_0p6=0.5607 corrupt_frac_t_0p4_0p6=0.1964 acc_corrupt_t_0p6_0p8=0.7500 corrupt_frac_t_0p6_0p8=0.2077 acc_corrupt_t_0p8_1p0=0.9207 corrupt_frac_t_0p8_1p0=0.2082 out_w_norm=45.5791 out_g_norm=0.2697 loss_all=3.2746 init_gold_top10=0.5672 init_gold_top100=0.6836 rollout_applied_pos_frac=0.4076 init_acc_rollout_applied=0.5446 init_acc_rollout_kept=0.4696 logit_acc_rollout_applied=0.5851 logit_acc_rollout_kept=0.5295
|
| 333 |
+
step=800 epoch=1/53 epoch_step=800/19018 micro_steps=1600 elapsed=127.5s lr=4.806000e-04 loss=3.3896 loss_recon=3.3896 loss_meanflow=0.0000 mean_model_t=0.5023 mean_corrupt_t=0.5023 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.5081 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.5297 corrupt_frac=1.0000 acc_corrupt=0.5297 loss_corrupt=3.3896 wrong_frac=0.4983 init_acc_corrupt=0.4719 acc_corrupt_t_0p0_0p2=0.0787 corrupt_frac_t_0p0_0p2=0.1966 acc_corrupt_t_0p2_0p4=0.3183 corrupt_frac_t_0p2_0p4=0.2016 acc_corrupt_t_0p4_0p6=0.5683 corrupt_frac_t_0p4_0p6=0.2038 acc_corrupt_t_0p6_0p8=0.7586 corrupt_frac_t_0p6_0p8=0.1993 acc_corrupt_t_0p8_1p0=0.9213 corrupt_frac_t_0p8_1p0=0.1987 out_w_norm=51.3081 out_g_norm=0.2355 loss_all=3.1923 init_gold_top10=0.5511 init_gold_top100=0.6648 rollout_applied_pos_frac=0.4373 init_acc_rollout_applied=0.4910 init_acc_rollout_kept=0.4966 logit_acc_rollout_applied=0.5452 logit_acc_rollout_kept=0.5639
|
| 334 |
+
step=800 epoch=1/53 epoch_step=800/19018 micro_steps=1600 elapsed=128.1s lr=4.806000e-04 loss=3.3687 loss_recon=3.3687 loss_meanflow=0.0000 mean_model_t=0.5023 mean_corrupt_t=0.5023 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.5081 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.5313 corrupt_frac=1.0000 acc_corrupt=0.5313 loss_corrupt=3.3687 wrong_frac=0.4983 init_acc_corrupt=0.4719 acc_corrupt_t_0p0_0p2=0.0792 corrupt_frac_t_0p0_0p2=0.1966 acc_corrupt_t_0p2_0p4=0.3204 corrupt_frac_t_0p2_0p4=0.2016 acc_corrupt_t_0p4_0p6=0.5705 corrupt_frac_t_0p4_0p6=0.2038 acc_corrupt_t_0p6_0p8=0.7605 corrupt_frac_t_0p6_0p8=0.1993 acc_corrupt_t_0p8_1p0=0.9222 corrupt_frac_t_0p8_1p0=0.1987 out_w_norm=51.2100 out_g_norm=0.2397 loss_all=3.1635 init_gold_top10=0.5527 init_gold_top100=0.6649 rollout_applied_pos_frac=0.4373 init_acc_rollout_applied=0.4907 init_acc_rollout_kept=0.4966 logit_acc_rollout_applied=0.5439 logit_acc_rollout_kept=0.5658
|
| 335 |
+
step=900 epoch=1/53 epoch_step=900/19018 micro_steps=1800 elapsed=127.6s lr=5.406000e-04 loss=3.3800 loss_recon=3.3800 loss_meanflow=0.0000 mean_model_t=0.4970 mean_corrupt_t=0.4970 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.4934 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.5283 corrupt_frac=1.0000 acc_corrupt=0.5283 loss_corrupt=3.3800 wrong_frac=0.5021 init_acc_corrupt=0.4667 acc_corrupt_t_0p0_0p2=0.0809 corrupt_frac_t_0p0_0p2=0.2070 acc_corrupt_t_0p2_0p4=0.3199 corrupt_frac_t_0p2_0p4=0.1987 acc_corrupt_t_0p4_0p6=0.5709 corrupt_frac_t_0p4_0p6=0.1973 acc_corrupt_t_0p6_0p8=0.7617 corrupt_frac_t_0p6_0p8=0.1991 acc_corrupt_t_0p8_1p0=0.9234 corrupt_frac_t_0p8_1p0=0.1990 out_w_norm=57.2804 out_g_norm=0.2021 loss_all=3.5181 init_gold_top10=0.5200 init_gold_top100=0.6706 rollout_applied_pos_frac=0.5583 init_acc_rollout_applied=0.4290 init_acc_rollout_kept=0.4709 logit_acc_rollout_applied=0.4850 logit_acc_rollout_kept=0.5429
|
| 336 |
+
step=900 epoch=1/53 epoch_step=900/19018 micro_steps=1800 elapsed=128.1s lr=5.406000e-04 loss=3.3642 loss_recon=3.3642 loss_meanflow=0.0000 mean_model_t=0.4970 mean_corrupt_t=0.4970 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.4934 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.5295 corrupt_frac=1.0000 acc_corrupt=0.5295 loss_corrupt=3.3642 wrong_frac=0.5021 init_acc_corrupt=0.4668 acc_corrupt_t_0p0_0p2=0.0810 corrupt_frac_t_0p0_0p2=0.2070 acc_corrupt_t_0p2_0p4=0.3212 corrupt_frac_t_0p2_0p4=0.1987 acc_corrupt_t_0p4_0p6=0.5727 corrupt_frac_t_0p4_0p6=0.1973 acc_corrupt_t_0p6_0p8=0.7637 corrupt_frac_t_0p6_0p8=0.1991 acc_corrupt_t_0p8_1p0=0.9242 corrupt_frac_t_0p8_1p0=0.1990 out_w_norm=56.7702 out_g_norm=0.2153 loss_all=3.5128 init_gold_top10=0.5177 init_gold_top100=0.6706 rollout_applied_pos_frac=0.5583 init_acc_rollout_applied=0.4285 init_acc_rollout_kept=0.4709 logit_acc_rollout_applied=0.4858 logit_acc_rollout_kept=0.5437
|
| 337 |
+
step=1000 epoch=1/53 epoch_step=1000/19018 micro_steps=2000 elapsed=127.8s lr=6.000000e-04 loss=3.2956 loss_recon=3.2956 loss_meanflow=0.0000 mean_model_t=0.5031 mean_corrupt_t=0.5031 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.4886 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.5371 corrupt_frac=1.0000 acc_corrupt=0.5371 loss_corrupt=3.2956 wrong_frac=0.4971 init_acc_corrupt=0.4735 acc_corrupt_t_0p0_0p2=0.0794 corrupt_frac_t_0p0_0p2=0.1959 acc_corrupt_t_0p2_0p4=0.3220 corrupt_frac_t_0p2_0p4=0.1971 acc_corrupt_t_0p4_0p6=0.5750 corrupt_frac_t_0p4_0p6=0.2072 acc_corrupt_t_0p6_0p8=0.7645 corrupt_frac_t_0p6_0p8=0.2000 acc_corrupt_t_0p8_1p0=0.9280 corrupt_frac_t_0p8_1p0=0.2009 out_w_norm=63.0428 out_g_norm=0.1939 loss_all=3.4591 init_gold_top10=0.4828 init_gold_top100=0.6040 rollout_applied_pos_frac=0.3761 init_acc_rollout_applied=0.4586 init_acc_rollout_kept=0.4105 logit_acc_rollout_applied=0.5317 logit_acc_rollout_kept=0.4963
|
| 338 |
+
step=1000 epoch=1/53 epoch_step=1000/19018 micro_steps=2000 elapsed=128.4s lr=6.000000e-04 loss=3.2780 loss_recon=3.2780 loss_meanflow=0.0000 mean_model_t=0.5031 mean_corrupt_t=0.5031 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.4886 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.5387 corrupt_frac=1.0000 acc_corrupt=0.5387 loss_corrupt=3.2780 wrong_frac=0.4971 init_acc_corrupt=0.4735 acc_corrupt_t_0p0_0p2=0.0798 corrupt_frac_t_0p0_0p2=0.1959 acc_corrupt_t_0p2_0p4=0.3236 corrupt_frac_t_0p2_0p4=0.1971 acc_corrupt_t_0p4_0p6=0.5777 corrupt_frac_t_0p4_0p6=0.2072 acc_corrupt_t_0p6_0p8=0.7669 corrupt_frac_t_0p6_0p8=0.2000 acc_corrupt_t_0p8_1p0=0.9289 corrupt_frac_t_0p8_1p0=0.2009 out_w_norm=62.2229 out_g_norm=0.1988 loss_all=3.4433 init_gold_top10=0.4828 init_gold_top100=0.6034 rollout_applied_pos_frac=0.3761 init_acc_rollout_applied=0.4587 init_acc_rollout_kept=0.4105 logit_acc_rollout_applied=0.5336 logit_acc_rollout_kept=0.4975
|
| 339 |
+
step=1100 epoch=1/53 epoch_step=1100/19018 micro_steps=2200 elapsed=129.4s lr=6.000000e-04 loss=3.2449 loss_recon=3.2449 loss_meanflow=0.0000 mean_model_t=0.5050 mean_corrupt_t=0.5050 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.5053 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.5419 corrupt_frac=1.0000 acc_corrupt=0.5419 loss_corrupt=3.2449 wrong_frac=0.4952 init_acc_corrupt=0.4758 acc_corrupt_t_0p0_0p2=0.0809 corrupt_frac_t_0p0_0p2=0.1969 acc_corrupt_t_0p2_0p4=0.3281 corrupt_frac_t_0p2_0p4=0.1912 acc_corrupt_t_0p4_0p6=0.5813 corrupt_frac_t_0p4_0p6=0.2092 acc_corrupt_t_0p6_0p8=0.7677 corrupt_frac_t_0p6_0p8=0.1987 acc_corrupt_t_0p8_1p0=0.9268 corrupt_frac_t_0p8_1p0=0.2040 out_w_norm=68.5109 out_g_norm=0.1733 loss_all=2.9512 init_gold_top10=0.5514 init_gold_top100=0.6563 rollout_applied_pos_frac=0.5130 init_acc_rollout_applied=0.5195 init_acc_rollout_kept=0.4711 logit_acc_rollout_applied=0.5940 logit_acc_rollout_kept=0.5528
|
| 340 |
+
step=1100 epoch=1/53 epoch_step=1100/19018 micro_steps=2200 elapsed=129.8s lr=6.000000e-04 loss=3.2255 loss_recon=3.2255 loss_meanflow=0.0000 mean_model_t=0.5050 mean_corrupt_t=0.5050 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.5053 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.5438 corrupt_frac=1.0000 acc_corrupt=0.5438 loss_corrupt=3.2255 wrong_frac=0.4952 init_acc_corrupt=0.4759 acc_corrupt_t_0p0_0p2=0.0811 corrupt_frac_t_0p0_0p2=0.1969 acc_corrupt_t_0p2_0p4=0.3301 corrupt_frac_t_0p2_0p4=0.1912 acc_corrupt_t_0p4_0p6=0.5846 corrupt_frac_t_0p4_0p6=0.2092 acc_corrupt_t_0p6_0p8=0.7704 corrupt_frac_t_0p6_0p8=0.1987 acc_corrupt_t_0p8_1p0=0.9280 corrupt_frac_t_0p8_1p0=0.2040 out_w_norm=67.4424 out_g_norm=0.1738 loss_all=2.9186 init_gold_top10=0.5529 init_gold_top100=0.6574 rollout_applied_pos_frac=0.5130 init_acc_rollout_applied=0.5197 init_acc_rollout_kept=0.4711 logit_acc_rollout_applied=0.5948 logit_acc_rollout_kept=0.5529
|
| 341 |
+
step=1200 epoch=1/53 epoch_step=1200/19018 micro_steps=2400 elapsed=127.2s lr=5.999999e-04 loss=3.2320 loss_recon=3.2320 loss_meanflow=0.0000 mean_model_t=0.5023 mean_corrupt_t=0.5023 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.4956 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.5417 corrupt_frac=1.0000 acc_corrupt=0.5417 loss_corrupt=3.2320 wrong_frac=0.4979 init_acc_corrupt=0.4728 acc_corrupt_t_0p0_0p2=0.0807 corrupt_frac_t_0p0_0p2=0.1923 acc_corrupt_t_0p2_0p4=0.3326 corrupt_frac_t_0p2_0p4=0.2041 acc_corrupt_t_0p4_0p6=0.5850 corrupt_frac_t_0p4_0p6=0.2073 acc_corrupt_t_0p6_0p8=0.7735 corrupt_frac_t_0p6_0p8=0.2005 acc_corrupt_t_0p8_1p0=0.9276 corrupt_frac_t_0p8_1p0=0.1968 out_w_norm=73.2782 out_g_norm=0.1615 loss_all=2.5402 init_gold_top10=0.6131 init_gold_top100=0.7025 rollout_applied_pos_frac=0.5764 init_acc_rollout_applied=0.5912 init_acc_rollout_kept=0.5207 logit_acc_rollout_applied=0.6531 logit_acc_rollout_kept=0.6049
|
| 342 |
+
step=1200 epoch=1/53 epoch_step=1200/19018 micro_steps=2400 elapsed=127.5s lr=5.999999e-04 loss=3.2110 loss_recon=3.2110 loss_meanflow=0.0000 mean_model_t=0.5023 mean_corrupt_t=0.5023 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.4956 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.5437 corrupt_frac=1.0000 acc_corrupt=0.5437 loss_corrupt=3.2110 wrong_frac=0.4979 init_acc_corrupt=0.4729 acc_corrupt_t_0p0_0p2=0.0809 corrupt_frac_t_0p0_0p2=0.1923 acc_corrupt_t_0p2_0p4=0.3347 corrupt_frac_t_0p2_0p4=0.2041 acc_corrupt_t_0p4_0p6=0.5885 corrupt_frac_t_0p4_0p6=0.2073 acc_corrupt_t_0p6_0p8=0.7765 corrupt_frac_t_0p6_0p8=0.2005 acc_corrupt_t_0p8_1p0=0.9289 corrupt_frac_t_0p8_1p0=0.1968 out_w_norm=72.0315 out_g_norm=0.1619 loss_all=2.5216 init_gold_top10=0.6132 init_gold_top100=0.7034 rollout_applied_pos_frac=0.5764 init_acc_rollout_applied=0.5913 init_acc_rollout_kept=0.5207 logit_acc_rollout_applied=0.6556 logit_acc_rollout_kept=0.6066
|
| 343 |
+
step=1300 epoch=1/53 epoch_step=1300/19018 micro_steps=2600 elapsed=127.5s lr=5.999999e-04 loss=3.2152 loss_recon=3.2152 loss_meanflow=0.0000 mean_model_t=0.5025 mean_corrupt_t=0.5025 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.4969 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.5433 corrupt_frac=1.0000 acc_corrupt=0.5433 loss_corrupt=3.2152 wrong_frac=0.4975 init_acc_corrupt=0.4729 acc_corrupt_t_0p0_0p2=0.0827 corrupt_frac_t_0p0_0p2=0.2025 acc_corrupt_t_0p2_0p4=0.3329 corrupt_frac_t_0p2_0p4=0.1954 acc_corrupt_t_0p4_0p6=0.5878 corrupt_frac_t_0p4_0p6=0.1956 acc_corrupt_t_0p6_0p8=0.7703 corrupt_frac_t_0p6_0p8=0.2056 acc_corrupt_t_0p8_1p0=0.9285 corrupt_frac_t_0p8_1p0=0.2039 out_w_norm=77.4548 out_g_norm=0.1491 loss_all=3.4487 init_gold_top10=0.4904 init_gold_top100=0.6381 rollout_applied_pos_frac=0.4005 init_acc_rollout_applied=0.4760 init_acc_rollout_kept=0.4166 logit_acc_rollout_applied=0.5335 logit_acc_rollout_kept=0.4978
|
| 344 |
+
step=1300 epoch=1/53 epoch_step=1300/19018 micro_steps=2600 elapsed=128.0s lr=5.999999e-04 loss=3.1942 loss_recon=3.1942 loss_meanflow=0.0000 mean_model_t=0.5025 mean_corrupt_t=0.5025 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.4969 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.5452 corrupt_frac=1.0000 acc_corrupt=0.5452 loss_corrupt=3.1942 wrong_frac=0.4975 init_acc_corrupt=0.4729 acc_corrupt_t_0p0_0p2=0.0827 corrupt_frac_t_0p0_0p2=0.2025 acc_corrupt_t_0p2_0p4=0.3349 corrupt_frac_t_0p2_0p4=0.1954 acc_corrupt_t_0p4_0p6=0.5907 corrupt_frac_t_0p4_0p6=0.1956 acc_corrupt_t_0p6_0p8=0.7736 corrupt_frac_t_0p6_0p8=0.2056 acc_corrupt_t_0p8_1p0=0.9299 corrupt_frac_t_0p8_1p0=0.2039 out_w_norm=76.0696 out_g_norm=0.1530 loss_all=3.4418 init_gold_top10=0.4899 init_gold_top100=0.6381 rollout_applied_pos_frac=0.4005 init_acc_rollout_applied=0.4759 init_acc_rollout_kept=0.4166 logit_acc_rollout_applied=0.5321 logit_acc_rollout_kept=0.4991
|
| 345 |
+
step=1400 epoch=1/53 epoch_step=1400/19018 micro_steps=2800 elapsed=127.1s lr=5.999998e-04 loss=3.2361 loss_recon=3.2361 loss_meanflow=0.0000 mean_model_t=0.4974 mean_corrupt_t=0.4974 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.5067 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.5392 corrupt_frac=1.0000 acc_corrupt=0.5392 loss_corrupt=3.2361 wrong_frac=0.5026 init_acc_corrupt=0.4666 acc_corrupt_t_0p0_0p2=0.0831 corrupt_frac_t_0p0_0p2=0.1994 acc_corrupt_t_0p2_0p4=0.3288 corrupt_frac_t_0p2_0p4=0.2079 acc_corrupt_t_0p4_0p6=0.5895 corrupt_frac_t_0p4_0p6=0.1916 acc_corrupt_t_0p6_0p8=0.7743 corrupt_frac_t_0p6_0p8=0.2056 acc_corrupt_t_0p8_1p0=0.9317 corrupt_frac_t_0p8_1p0=0.1955 out_w_norm=81.2184 out_g_norm=0.1432 loss_all=2.8166 init_gold_top10=0.5512 init_gold_top100=0.6352 rollout_applied_pos_frac=0.4372 init_acc_rollout_applied=0.4944 init_acc_rollout_kept=0.5085 logit_acc_rollout_applied=0.5776 logit_acc_rollout_kept=0.5902
|
| 346 |
+
step=1400 epoch=1/53 epoch_step=1400/19018 micro_steps=2800 elapsed=127.7s lr=5.999998e-04 loss=3.2167 loss_recon=3.2167 loss_meanflow=0.0000 mean_model_t=0.4974 mean_corrupt_t=0.4974 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.5067 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.5414 corrupt_frac=1.0000 acc_corrupt=0.5414 loss_corrupt=3.2167 wrong_frac=0.5026 init_acc_corrupt=0.4667 acc_corrupt_t_0p0_0p2=0.0837 corrupt_frac_t_0p0_0p2=0.1994 acc_corrupt_t_0p2_0p4=0.3303 corrupt_frac_t_0p2_0p4=0.2079 acc_corrupt_t_0p4_0p6=0.5929 corrupt_frac_t_0p4_0p6=0.1916 acc_corrupt_t_0p6_0p8=0.7779 corrupt_frac_t_0p6_0p8=0.2056 acc_corrupt_t_0p8_1p0=0.9334 corrupt_frac_t_0p8_1p0=0.1955 out_w_norm=79.7282 out_g_norm=0.1481 loss_all=2.8050 init_gold_top10=0.5512 init_gold_top100=0.6349 rollout_applied_pos_frac=0.4372 init_acc_rollout_applied=0.4948 init_acc_rollout_kept=0.5085 logit_acc_rollout_applied=0.5813 logit_acc_rollout_kept=0.5930
|
| 347 |
+
step=1500 epoch=1/53 epoch_step=1500/19018 micro_steps=3000 elapsed=127.2s lr=5.999996e-04 loss=3.1995 loss_recon=3.1995 loss_meanflow=0.0000 mean_model_t=0.4991 mean_corrupt_t=0.4991 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.4941 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.5430 corrupt_frac=1.0000 acc_corrupt=0.5430 loss_corrupt=3.1995 wrong_frac=0.5014 init_acc_corrupt=0.4691 acc_corrupt_t_0p0_0p2=0.0819 corrupt_frac_t_0p0_0p2=0.1995 acc_corrupt_t_0p2_0p4=0.3365 corrupt_frac_t_0p2_0p4=0.2006 acc_corrupt_t_0p4_0p6=0.5896 corrupt_frac_t_0p4_0p6=0.2030 acc_corrupt_t_0p6_0p8=0.7790 corrupt_frac_t_0p6_0p8=0.1989 acc_corrupt_t_0p8_1p0=0.9319 corrupt_frac_t_0p8_1p0=0.1979 out_w_norm=84.6963 out_g_norm=0.1391 loss_all=3.0060 init_gold_top10=0.5632 init_gold_top100=0.6890 rollout_applied_pos_frac=0.5283 init_acc_rollout_applied=0.5095 init_acc_rollout_kept=0.5024 logit_acc_rollout_applied=0.5697 logit_acc_rollout_kept=0.5750
|
| 348 |
+
step=1500 epoch=1/53 epoch_step=1500/19018 micro_steps=3000 elapsed=127.7s lr=5.999996e-04 loss=3.1799 loss_recon=3.1799 loss_meanflow=0.0000 mean_model_t=0.4991 mean_corrupt_t=0.4991 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.4941 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.5450 corrupt_frac=1.0000 acc_corrupt=0.5450 loss_corrupt=3.1799 wrong_frac=0.5014 init_acc_corrupt=0.4692 acc_corrupt_t_0p0_0p2=0.0821 corrupt_frac_t_0p0_0p2=0.1995 acc_corrupt_t_0p2_0p4=0.3380 corrupt_frac_t_0p2_0p4=0.2006 acc_corrupt_t_0p4_0p6=0.5930 corrupt_frac_t_0p4_0p6=0.2030 acc_corrupt_t_0p6_0p8=0.7823 corrupt_frac_t_0p6_0p8=0.1989 acc_corrupt_t_0p8_1p0=0.9337 corrupt_frac_t_0p8_1p0=0.1979 out_w_norm=83.1280 out_g_norm=0.1422 loss_all=2.9766 init_gold_top10=0.5644 init_gold_top100=0.6913 rollout_applied_pos_frac=0.5283 init_acc_rollout_applied=0.5100 init_acc_rollout_kept=0.5024 logit_acc_rollout_applied=0.5709 logit_acc_rollout_kept=0.5777
|
| 349 |
+
step=1600 epoch=1/53 epoch_step=1600/19018 micro_steps=3200 elapsed=127.0s lr=5.999995e-04 loss=3.1767 loss_recon=3.1767 loss_meanflow=0.0000 mean_model_t=0.4989 mean_corrupt_t=0.4989 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.4963 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.5450 corrupt_frac=1.0000 acc_corrupt=0.5450 loss_corrupt=3.1767 wrong_frac=0.5019 init_acc_corrupt=0.4690 acc_corrupt_t_0p0_0p2=0.0848 corrupt_frac_t_0p0_0p2=0.2021 acc_corrupt_t_0p2_0p4=0.3384 corrupt_frac_t_0p2_0p4=0.1920 acc_corrupt_t_0p4_0p6=0.5927 corrupt_frac_t_0p4_0p6=0.2102 acc_corrupt_t_0p6_0p8=0.7818 corrupt_frac_t_0p6_0p8=0.2050 acc_corrupt_t_0p8_1p0=0.9309 corrupt_frac_t_0p8_1p0=0.1916 out_w_norm=87.9033 out_g_norm=0.1348 loss_all=2.5706 init_gold_top10=0.6111 init_gold_top100=0.7061 rollout_applied_pos_frac=0.4444 init_acc_rollout_applied=0.4904 init_acc_rollout_kept=0.6049 logit_acc_rollout_applied=0.5641 logit_acc_rollout_kept=0.6768
|
| 350 |
+
step=1600 epoch=1/53 epoch_step=1600/19018 micro_steps=3200 elapsed=127.5s lr=5.999995e-04 loss=3.1560 loss_recon=3.1560 loss_meanflow=0.0000 mean_model_t=0.4989 mean_corrupt_t=0.4989 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.4963 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.5474 corrupt_frac=1.0000 acc_corrupt=0.5474 loss_corrupt=3.1560 wrong_frac=0.5019 init_acc_corrupt=0.4691 acc_corrupt_t_0p0_0p2=0.0851 corrupt_frac_t_0p0_0p2=0.2021 acc_corrupt_t_0p2_0p4=0.3395 corrupt_frac_t_0p2_0p4=0.1920 acc_corrupt_t_0p4_0p6=0.5968 corrupt_frac_t_0p4_0p6=0.2102 acc_corrupt_t_0p6_0p8=0.7858 corrupt_frac_t_0p6_0p8=0.2050 acc_corrupt_t_0p8_1p0=0.9329 corrupt_frac_t_0p8_1p0=0.1916 out_w_norm=86.2745 out_g_norm=0.1375 loss_all=2.5237 init_gold_top10=0.6124 init_gold_top100=0.7088 rollout_applied_pos_frac=0.4444 init_acc_rollout_applied=0.4913 init_acc_rollout_kept=0.6049 logit_acc_rollout_applied=0.5665 logit_acc_rollout_kept=0.6802
|
| 351 |
+
step=1700 epoch=1/53 epoch_step=1700/19018 micro_steps=3400 elapsed=127.8s lr=5.999993e-04 loss=3.1293 loss_recon=3.1293 loss_meanflow=0.0000 mean_model_t=0.5049 mean_corrupt_t=0.5049 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.5033 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.5517 corrupt_frac=1.0000 acc_corrupt=0.5517 loss_corrupt=3.1293 wrong_frac=0.4953 init_acc_corrupt=0.4755 acc_corrupt_t_0p0_0p2=0.0863 corrupt_frac_t_0p0_0p2=0.1999 acc_corrupt_t_0p2_0p4=0.3354 corrupt_frac_t_0p2_0p4=0.1910 acc_corrupt_t_0p4_0p6=0.5951 corrupt_frac_t_0p4_0p6=0.1981 acc_corrupt_t_0p6_0p8=0.7814 corrupt_frac_t_0p6_0p8=0.2052 acc_corrupt_t_0p8_1p0=0.9338 corrupt_frac_t_0p8_1p0=0.2059 out_w_norm=90.9278 out_g_norm=0.1294 loss_all=2.7756 init_gold_top10=0.5805 init_gold_top100=0.6956 rollout_applied_pos_frac=0.5994 init_acc_rollout_applied=0.5599 init_acc_rollout_kept=0.4380 logit_acc_rollout_applied=0.6295 logit_acc_rollout_kept=0.5353
|
| 352 |
+
step=1700 epoch=1/53 epoch_step=1700/19018 micro_steps=3400 elapsed=128.3s lr=5.999993e-04 loss=3.1095 loss_recon=3.1095 loss_meanflow=0.0000 mean_model_t=0.5049 mean_corrupt_t=0.5049 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.5033 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.5541 corrupt_frac=1.0000 acc_corrupt=0.5541 loss_corrupt=3.1095 wrong_frac=0.4953 init_acc_corrupt=0.4755 acc_corrupt_t_0p0_0p2=0.0866 corrupt_frac_t_0p0_0p2=0.1999 acc_corrupt_t_0p2_0p4=0.3368 corrupt_frac_t_0p2_0p4=0.1910 acc_corrupt_t_0p4_0p6=0.5985 corrupt_frac_t_0p4_0p6=0.1981 acc_corrupt_t_0p6_0p8=0.7858 corrupt_frac_t_0p6_0p8=0.2052 acc_corrupt_t_0p8_1p0=0.9360 corrupt_frac_t_0p8_1p0=0.2059 out_w_norm=89.2557 out_g_norm=0.1323 loss_all=2.7520 init_gold_top10=0.5821 init_gold_top100=0.6966 rollout_applied_pos_frac=0.5994 init_acc_rollout_applied=0.5601 init_acc_rollout_kept=0.4380 logit_acc_rollout_applied=0.6317 logit_acc_rollout_kept=0.5375
|
| 353 |
+
step=1800 epoch=1/53 epoch_step=1800/19018 micro_steps=3600 elapsed=127.2s lr=5.999991e-04 loss=3.1210 loss_recon=3.1210 loss_meanflow=0.0000 mean_model_t=0.5023 mean_corrupt_t=0.5023 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.4923 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.5515 corrupt_frac=1.0000 acc_corrupt=0.5515 loss_corrupt=3.1210 wrong_frac=0.4983 init_acc_corrupt=0.4730 acc_corrupt_t_0p0_0p2=0.0834 corrupt_frac_t_0p0_0p2=0.1957 acc_corrupt_t_0p2_0p4=0.3410 corrupt_frac_t_0p2_0p4=0.1997 acc_corrupt_t_0p4_0p6=0.6024 corrupt_frac_t_0p4_0p6=0.2041 acc_corrupt_t_0p6_0p8=0.7874 corrupt_frac_t_0p6_0p8=0.2044 acc_corrupt_t_0p8_1p0=0.9340 corrupt_frac_t_0p8_1p0=0.1961 out_w_norm=93.8140 out_g_norm=0.1263 loss_all=2.6400 init_gold_top10=0.6065 init_gold_top100=0.7083 rollout_applied_pos_frac=0.5344 init_acc_rollout_applied=0.4650 init_acc_rollout_kept=0.6534 logit_acc_rollout_applied=0.5328 logit_acc_rollout_kept=0.7193
|
| 354 |
+
step=1800 epoch=1/53 epoch_step=1800/19018 micro_steps=3600 elapsed=127.7s lr=5.999991e-04 loss=3.1013 loss_recon=3.1013 loss_meanflow=0.0000 mean_model_t=0.5023 mean_corrupt_t=0.5023 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.4923 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.5539 corrupt_frac=1.0000 acc_corrupt=0.5539 loss_corrupt=3.1013 wrong_frac=0.4983 init_acc_corrupt=0.4731 acc_corrupt_t_0p0_0p2=0.0837 corrupt_frac_t_0p0_0p2=0.1957 acc_corrupt_t_0p2_0p4=0.3426 corrupt_frac_t_0p2_0p4=0.1997 acc_corrupt_t_0p4_0p6=0.6059 corrupt_frac_t_0p4_0p6=0.2041 acc_corrupt_t_0p6_0p8=0.7916 corrupt_frac_t_0p6_0p8=0.2044 acc_corrupt_t_0p8_1p0=0.9363 corrupt_frac_t_0p8_1p0=0.1961 out_w_norm=92.0965 out_g_norm=0.1290 loss_all=2.6217 init_gold_top10=0.6068 init_gold_top100=0.7091 rollout_applied_pos_frac=0.5344 init_acc_rollout_applied=0.4649 init_acc_rollout_kept=0.6534 logit_acc_rollout_applied=0.5336 logit_acc_rollout_kept=0.7230
|
| 355 |
+
step=1900 epoch=1/53 epoch_step=1900/19018 micro_steps=3800 elapsed=127.7s lr=5.999988e-04 loss=3.0754 loss_recon=3.0754 loss_meanflow=0.0000 mean_model_t=0.5062 mean_corrupt_t=0.5062 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.5055 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.5579 corrupt_frac=1.0000 acc_corrupt=0.5579 loss_corrupt=3.0754 wrong_frac=0.4937 init_acc_corrupt=0.4789 acc_corrupt_t_0p0_0p2=0.0825 corrupt_frac_t_0p0_0p2=0.1996 acc_corrupt_t_0p2_0p4=0.3456 corrupt_frac_t_0p2_0p4=0.1884 acc_corrupt_t_0p4_0p6=0.6070 corrupt_frac_t_0p4_0p6=0.1962 acc_corrupt_t_0p6_0p8=0.7863 corrupt_frac_t_0p6_0p8=0.2107 acc_corrupt_t_0p8_1p0=0.9340 corrupt_frac_t_0p8_1p0=0.2051 out_w_norm=96.5305 out_g_norm=0.1207 loss_all=2.6528 init_gold_top10=0.6098 init_gold_top100=0.7289 rollout_applied_pos_frac=0.5880 init_acc_rollout_applied=0.5277 init_acc_rollout_kept=0.5785 logit_acc_rollout_applied=0.5996 logit_acc_rollout_kept=0.6461
|
| 356 |
+
step=1900 epoch=1/53 epoch_step=1900/19018 micro_steps=3800 elapsed=128.2s lr=5.999988e-04 loss=3.0616 loss_recon=3.0616 loss_meanflow=0.0000 mean_model_t=0.5062 mean_corrupt_t=0.5062 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.5055 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.5598 corrupt_frac=1.0000 acc_corrupt=0.5598 loss_corrupt=3.0616 wrong_frac=0.4937 init_acc_corrupt=0.4790 acc_corrupt_t_0p0_0p2=0.0828 corrupt_frac_t_0p0_0p2=0.1996 acc_corrupt_t_0p2_0p4=0.3465 corrupt_frac_t_0p2_0p4=0.1884 acc_corrupt_t_0p4_0p6=0.6095 corrupt_frac_t_0p4_0p6=0.1962 acc_corrupt_t_0p6_0p8=0.7899 corrupt_frac_t_0p6_0p8=0.2107 acc_corrupt_t_0p8_1p0=0.9360 corrupt_frac_t_0p8_1p0=0.2051 out_w_norm=94.7721 out_g_norm=0.1230 loss_all=2.6349 init_gold_top10=0.6110 init_gold_top100=0.7280 rollout_applied_pos_frac=0.5880 init_acc_rollout_applied=0.5277 init_acc_rollout_kept=0.5785 logit_acc_rollout_applied=0.6021 logit_acc_rollout_kept=0.6479
|
| 357 |
+
step=2000 epoch=1/53 epoch_step=2000/19018 micro_steps=4000 elapsed=127.3s lr=5.999985e-04 loss=3.0852 loss_recon=3.0852 loss_meanflow=0.0000 mean_model_t=0.5024 mean_corrupt_t=0.5024 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.5095 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.5557 corrupt_frac=1.0000 acc_corrupt=0.5557 loss_corrupt=3.0852 wrong_frac=0.4973 init_acc_corrupt=0.4743 acc_corrupt_t_0p0_0p2=0.0850 corrupt_frac_t_0p0_0p2=0.2017 acc_corrupt_t_0p2_0p4=0.3441 corrupt_frac_t_0p2_0p4=0.1943 acc_corrupt_t_0p4_0p6=0.6058 corrupt_frac_t_0p4_0p6=0.1988 acc_corrupt_t_0p6_0p8=0.7913 corrupt_frac_t_0p6_0p8=0.1987 acc_corrupt_t_0p8_1p0=0.9399 corrupt_frac_t_0p8_1p0=0.2064 out_w_norm=99.1620 out_g_norm=0.1176 loss_all=3.3511 init_gold_top10=0.5152 init_gold_top100=0.6804 rollout_applied_pos_frac=0.5950 init_acc_rollout_applied=0.4153 init_acc_rollout_kept=0.4654 logit_acc_rollout_applied=0.4880 logit_acc_rollout_kept=0.5615
|
| 358 |
+
step=2000 epoch=1/53 epoch_step=2000/19018 micro_steps=4000 elapsed=127.7s lr=5.999985e-04 loss=3.0783 loss_recon=3.0783 loss_meanflow=0.0000 mean_model_t=0.5024 mean_corrupt_t=0.5024 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.5095 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.5566 corrupt_frac=1.0000 acc_corrupt=0.5566 loss_corrupt=3.0783 wrong_frac=0.4973 init_acc_corrupt=0.4743 acc_corrupt_t_0p0_0p2=0.0852 corrupt_frac_t_0p0_0p2=0.2017 acc_corrupt_t_0p2_0p4=0.3449 corrupt_frac_t_0p2_0p4=0.1943 acc_corrupt_t_0p4_0p6=0.6068 corrupt_frac_t_0p4_0p6=0.1988 acc_corrupt_t_0p6_0p8=0.7930 corrupt_frac_t_0p6_0p8=0.1987 acc_corrupt_t_0p8_1p0=0.9407 corrupt_frac_t_0p8_1p0=0.2064 out_w_norm=97.3511 out_g_norm=0.1222 loss_all=3.3459 init_gold_top10=0.5135 init_gold_top100=0.6806 rollout_applied_pos_frac=0.5950 init_acc_rollout_applied=0.4147 init_acc_rollout_kept=0.4654 logit_acc_rollout_applied=0.4868 logit_acc_rollout_kept=0.5633
|
| 359 |
+
step=2100 epoch=1/53 epoch_step=2100/19018 micro_steps=4200 elapsed=129.1s lr=5.999982e-04 loss=3.1091 loss_recon=3.1091 loss_meanflow=0.0000 mean_model_t=0.4979 mean_corrupt_t=0.4979 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.5027 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.5523 corrupt_frac=1.0000 acc_corrupt=0.5523 loss_corrupt=3.1091 wrong_frac=0.5021 init_acc_corrupt=0.4681 acc_corrupt_t_0p0_0p2=0.0856 corrupt_frac_t_0p0_0p2=0.2061 acc_corrupt_t_0p2_0p4=0.3426 corrupt_frac_t_0p2_0p4=0.1972 acc_corrupt_t_0p4_0p6=0.6059 corrupt_frac_t_0p4_0p6=0.1939 acc_corrupt_t_0p6_0p8=0.7978 corrupt_frac_t_0p6_0p8=0.2057 acc_corrupt_t_0p8_1p0=0.9412 corrupt_frac_t_0p8_1p0=0.1971 out_w_norm=101.6800 out_g_norm=0.1155 loss_all=2.6443 init_gold_top10=0.5803 init_gold_top100=0.6744 rollout_applied_pos_frac=0.3753 init_acc_rollout_applied=0.5105 init_acc_rollout_kept=0.5390 logit_acc_rollout_applied=0.5912 logit_acc_rollout_kept=0.6303
|
| 360 |
+
step=2100 epoch=1/53 epoch_step=2100/19018 micro_steps=4200 elapsed=129.5s lr=5.999982e-04 loss=3.1091 loss_recon=3.1091 loss_meanflow=0.0000 mean_model_t=0.4979 mean_corrupt_t=0.4979 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.5027 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.5525 corrupt_frac=1.0000 acc_corrupt=0.5525 loss_corrupt=3.1091 wrong_frac=0.5021 init_acc_corrupt=0.4681 acc_corrupt_t_0p0_0p2=0.0859 corrupt_frac_t_0p0_0p2=0.2061 acc_corrupt_t_0p2_0p4=0.3426 corrupt_frac_t_0p2_0p4=0.1972 acc_corrupt_t_0p4_0p6=0.6056 corrupt_frac_t_0p4_0p6=0.1939 acc_corrupt_t_0p6_0p8=0.7981 corrupt_frac_t_0p6_0p8=0.2057 acc_corrupt_t_0p8_1p0=0.9416 corrupt_frac_t_0p8_1p0=0.1971 out_w_norm=99.8153 out_g_norm=0.1182 loss_all=2.6301 init_gold_top10=0.5809 init_gold_top100=0.6752 rollout_applied_pos_frac=0.3753 init_acc_rollout_applied=0.5105 init_acc_rollout_kept=0.5390 logit_acc_rollout_applied=0.5939 logit_acc_rollout_kept=0.6297
|
| 361 |
+
step=2200 epoch=1/53 epoch_step=2200/19018 micro_steps=4400 elapsed=127.7s lr=5.999979e-04 loss=3.1361 loss_recon=3.1361 loss_meanflow=0.0000 mean_model_t=0.4905 mean_corrupt_t=0.4905 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.5077 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.5479 corrupt_frac=1.0000 acc_corrupt=0.5479 loss_corrupt=3.1361 wrong_frac=0.5086 init_acc_corrupt=0.4620 acc_corrupt_t_0p0_0p2=0.0829 corrupt_frac_t_0p0_0p2=0.2076 acc_corrupt_t_0p2_0p4=0.3503 corrupt_frac_t_0p2_0p4=0.2062 acc_corrupt_t_0p4_0p6=0.6152 corrupt_frac_t_0p4_0p6=0.2026 acc_corrupt_t_0p6_0p8=0.7997 corrupt_frac_t_0p6_0p8=0.1937 acc_corrupt_t_0p8_1p0=0.9401 corrupt_frac_t_0p8_1p0=0.1910 out_w_norm=104.1354 out_g_norm=0.1123 loss_all=3.7217 init_gold_top10=0.4500 init_gold_top100=0.6129 rollout_applied_pos_frac=0.5646 init_acc_rollout_applied=0.4158 init_acc_rollout_kept=0.3285 logit_acc_rollout_applied=0.5008 logit_acc_rollout_kept=0.4152
|
| 362 |
+
step=2200 epoch=1/53 epoch_step=2200/19018 micro_steps=4400 elapsed=128.1s lr=5.999979e-04 loss=3.1425 loss_recon=3.1425 loss_meanflow=0.0000 mean_model_t=0.4905 mean_corrupt_t=0.4905 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.5077 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.5470 corrupt_frac=1.0000 acc_corrupt=0.5470 loss_corrupt=3.1425 wrong_frac=0.5086 init_acc_corrupt=0.4620 acc_corrupt_t_0p0_0p2=0.0830 corrupt_frac_t_0p0_0p2=0.2076 acc_corrupt_t_0p2_0p4=0.3496 corrupt_frac_t_0p2_0p4=0.2062 acc_corrupt_t_0p4_0p6=0.6133 corrupt_frac_t_0p4_0p6=0.2026 acc_corrupt_t_0p6_0p8=0.7986 corrupt_frac_t_0p6_0p8=0.1937 acc_corrupt_t_0p8_1p0=0.9396 corrupt_frac_t_0p8_1p0=0.1910 out_w_norm=102.2337 out_g_norm=0.1171 loss_all=3.7222 init_gold_top10=0.4493 init_gold_top100=0.6117 rollout_applied_pos_frac=0.5646 init_acc_rollout_applied=0.4159 init_acc_rollout_kept=0.3285 logit_acc_rollout_applied=0.5014 logit_acc_rollout_kept=0.4169
|
| 363 |
+
step=2300 epoch=1/53 epoch_step=2300/19018 micro_steps=4600 elapsed=126.8s lr=5.999975e-04 loss=3.0584 loss_recon=3.0584 loss_meanflow=0.0000 mean_model_t=0.5001 mean_corrupt_t=0.5001 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.4963 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.5586 corrupt_frac=1.0000 acc_corrupt=0.5586 loss_corrupt=3.0584 wrong_frac=0.5000 init_acc_corrupt=0.4709 acc_corrupt_t_0p0_0p2=0.0863 corrupt_frac_t_0p0_0p2=0.2006 acc_corrupt_t_0p2_0p4=0.3533 corrupt_frac_t_0p2_0p4=0.2021 acc_corrupt_t_0p4_0p6=0.6143 corrupt_frac_t_0p4_0p6=0.1997 acc_corrupt_t_0p6_0p8=0.7992 corrupt_frac_t_0p6_0p8=0.1953 acc_corrupt_t_0p8_1p0=0.9446 corrupt_frac_t_0p8_1p0=0.2023 out_w_norm=106.4809 out_g_norm=0.1095 loss_all=2.2770 init_gold_top10=0.6389 init_gold_top100=0.7380 rollout_applied_pos_frac=0.4648 init_acc_rollout_applied=0.5768 init_acc_rollout_kept=0.5722 logit_acc_rollout_applied=0.6729 logit_acc_rollout_kept=0.6538
|
| 364 |
+
step=2300 epoch=1/53 epoch_step=2300/19018 micro_steps=4600 elapsed=127.4s lr=5.999975e-04 loss=3.0690 loss_recon=3.0690 loss_meanflow=0.0000 mean_model_t=0.5001 mean_corrupt_t=0.5001 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.4963 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.5572 corrupt_frac=1.0000 acc_corrupt=0.5572 loss_corrupt=3.0690 wrong_frac=0.5000 init_acc_corrupt=0.4708 acc_corrupt_t_0p0_0p2=0.0865 corrupt_frac_t_0p0_0p2=0.2006 acc_corrupt_t_0p2_0p4=0.3520 corrupt_frac_t_0p2_0p4=0.2021 acc_corrupt_t_0p4_0p6=0.6112 corrupt_frac_t_0p4_0p6=0.1997 acc_corrupt_t_0p6_0p8=0.7971 corrupt_frac_t_0p6_0p8=0.1953 acc_corrupt_t_0p8_1p0=0.9439 corrupt_frac_t_0p8_1p0=0.2023 out_w_norm=104.5593 out_g_norm=0.1119 loss_all=2.2810 init_gold_top10=0.6389 init_gold_top100=0.7385 rollout_applied_pos_frac=0.4648 init_acc_rollout_applied=0.5763 init_acc_rollout_kept=0.5722 logit_acc_rollout_applied=0.6731 logit_acc_rollout_kept=0.6514
|
LTA_openwebtext_dualt/logs/elfaligned_t5tokenized_8gpu/lta_owt_t5_adaln_adamw_wd0p1_rollin_p50_randk0_3_uniformt_temp1_synct_gbs512_8gpu_1m_20260518_141238.log
ADDED
|
@@ -0,0 +1,45 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
*****************************************
|
| 3 |
+
Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed.
|
| 4 |
+
*****************************************
|
| 5 |
+
Traceback (most recent call last):
|
| 6 |
+
File "<frozen runpy>", line 198, in _run_module_as_main
|
| 7 |
+
File "<frozen runpy>", line 88, in _run_code
|
| 8 |
+
File "/usr/local/lib/python3.12/dist-packages/torch/distributed/run.py", line 922, in <module>
|
| 9 |
+
main()
|
| 10 |
+
File "/usr/local/lib/python3.12/dist-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 355, in wrapper
|
| 11 |
+
return f(*args, **kwargs)
|
| 12 |
+
^^^^^^^^^^^^^^^^^^
|
| 13 |
+
File "/usr/local/lib/python3.12/dist-packages/torch/distributed/run.py", line 918, in main
|
| 14 |
+
run(args)
|
| 15 |
+
File "/usr/local/lib/python3.12/dist-packages/torch/distributed/run.py", line 909, in run
|
| 16 |
+
elastic_launch(
|
| 17 |
+
File "/usr/local/lib/python3.12/dist-packages/torch/distributed/launcher/api.py", line 139, in __call__
|
| 18 |
+
return launch_agent(self._config, self._entrypoint, list(args))
|
| 19 |
+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 20 |
+
File "/usr/local/lib/python3.12/dist-packages/torch/distributed/launcher/api.py", line 261, in launch_agent
|
| 21 |
+
result = agent.run()
|
| 22 |
+
^^^^^^^^^^^
|
| 23 |
+
File "/usr/local/lib/python3.12/dist-packages/torch/distributed/elastic/metrics/api.py", line 137, in wrapper
|
| 24 |
+
result = f(*args, **kwargs)
|
| 25 |
+
^^^^^^^^^^^^^^^^^^
|
| 26 |
+
File "/usr/local/lib/python3.12/dist-packages/torch/distributed/elastic/agent/server/api.py", line 711, in run
|
| 27 |
+
result = self._invoke_run(role)
|
| 28 |
+
^^^^^^^^^^^^^^^^^^^^^^
|
| 29 |
+
File "/usr/local/lib/python3.12/dist-packages/torch/distributed/elastic/agent/server/api.py", line 864, in _invoke_run
|
| 30 |
+
self._initialize_workers(self._worker_group)
|
| 31 |
+
File "/usr/local/lib/python3.12/dist-packages/torch/distributed/elastic/metrics/api.py", line 137, in wrapper
|
| 32 |
+
result = f(*args, **kwargs)
|
| 33 |
+
^^^^^^^^^^^^^^^^^^
|
| 34 |
+
File "/usr/local/lib/python3.12/dist-packages/torch/distributed/elastic/agent/server/api.py", line 683, in _initialize_workers
|
| 35 |
+
self._rendezvous(worker_group)
|
| 36 |
+
File "/usr/local/lib/python3.12/dist-packages/torch/distributed/elastic/metrics/api.py", line 137, in wrapper
|
| 37 |
+
result = f(*args, **kwargs)
|
| 38 |
+
^^^^^^^^^^^^^^^^^^
|
| 39 |
+
File "/usr/local/lib/python3.12/dist-packages/torch/distributed/elastic/agent/server/api.py", line 500, in _rendezvous
|
| 40 |
+
rdzv_info = spec.rdzv_handler.next_rendezvous()
|
| 41 |
+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 42 |
+
File "/usr/local/lib/python3.12/dist-packages/torch/distributed/elastic/rendezvous/static_tcp_rendezvous.py", line 67, in next_rendezvous
|
| 43 |
+
self._store = TCPStore( # type: ignore[call-arg]
|
| 44 |
+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 45 |
+
torch.distributed.DistStoreError: Timed out after 901 seconds waiting for clients. 1/2 clients joined.
|
LTA_openwebtext_dualt/logs/elfaligned_t5tokenized_8gpu/lta_owt_t5_adaln_adamw_wd0p1_rollin_p50_randk0_3_uniformt_temp1_synct_gbs512_8gpu_1m_20260518_141240.log
ADDED
|
@@ -0,0 +1,45 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
*****************************************
|
| 3 |
+
Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed.
|
| 4 |
+
*****************************************
|
| 5 |
+
Traceback (most recent call last):
|
| 6 |
+
File "<frozen runpy>", line 198, in _run_module_as_main
|
| 7 |
+
File "<frozen runpy>", line 88, in _run_code
|
| 8 |
+
File "/usr/local/lib/python3.12/dist-packages/torch/distributed/run.py", line 922, in <module>
|
| 9 |
+
main()
|
| 10 |
+
File "/usr/local/lib/python3.12/dist-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 355, in wrapper
|
| 11 |
+
return f(*args, **kwargs)
|
| 12 |
+
^^^^^^^^^^^^^^^^^^
|
| 13 |
+
File "/usr/local/lib/python3.12/dist-packages/torch/distributed/run.py", line 918, in main
|
| 14 |
+
run(args)
|
| 15 |
+
File "/usr/local/lib/python3.12/dist-packages/torch/distributed/run.py", line 909, in run
|
| 16 |
+
elastic_launch(
|
| 17 |
+
File "/usr/local/lib/python3.12/dist-packages/torch/distributed/launcher/api.py", line 139, in __call__
|
| 18 |
+
return launch_agent(self._config, self._entrypoint, list(args))
|
| 19 |
+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 20 |
+
File "/usr/local/lib/python3.12/dist-packages/torch/distributed/launcher/api.py", line 261, in launch_agent
|
| 21 |
+
result = agent.run()
|
| 22 |
+
^^^^^^^^^^^
|
| 23 |
+
File "/usr/local/lib/python3.12/dist-packages/torch/distributed/elastic/metrics/api.py", line 137, in wrapper
|
| 24 |
+
result = f(*args, **kwargs)
|
| 25 |
+
^^^^^^^^^^^^^^^^^^
|
| 26 |
+
File "/usr/local/lib/python3.12/dist-packages/torch/distributed/elastic/agent/server/api.py", line 711, in run
|
| 27 |
+
result = self._invoke_run(role)
|
| 28 |
+
^^^^^^^^^^^^^^^^^^^^^^
|
| 29 |
+
File "/usr/local/lib/python3.12/dist-packages/torch/distributed/elastic/agent/server/api.py", line 864, in _invoke_run
|
| 30 |
+
self._initialize_workers(self._worker_group)
|
| 31 |
+
File "/usr/local/lib/python3.12/dist-packages/torch/distributed/elastic/metrics/api.py", line 137, in wrapper
|
| 32 |
+
result = f(*args, **kwargs)
|
| 33 |
+
^^^^^^^^^^^^^^^^^^
|
| 34 |
+
File "/usr/local/lib/python3.12/dist-packages/torch/distributed/elastic/agent/server/api.py", line 683, in _initialize_workers
|
| 35 |
+
self._rendezvous(worker_group)
|
| 36 |
+
File "/usr/local/lib/python3.12/dist-packages/torch/distributed/elastic/metrics/api.py", line 137, in wrapper
|
| 37 |
+
result = f(*args, **kwargs)
|
| 38 |
+
^^^^^^^^^^^^^^^^^^
|
| 39 |
+
File "/usr/local/lib/python3.12/dist-packages/torch/distributed/elastic/agent/server/api.py", line 500, in _rendezvous
|
| 40 |
+
rdzv_info = spec.rdzv_handler.next_rendezvous()
|
| 41 |
+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 42 |
+
File "/usr/local/lib/python3.12/dist-packages/torch/distributed/elastic/rendezvous/static_tcp_rendezvous.py", line 67, in next_rendezvous
|
| 43 |
+
self._store = TCPStore( # type: ignore[call-arg]
|
| 44 |
+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 45 |
+
torch.distributed.DistStoreError: Timed out after 901 seconds waiting for clients. 1/2 clients joined.
|
LTA_openwebtext_dualt/logs/elfaligned_t5tokenized_8gpu/lta_owt_t5_linearsoftkl_m1p5_s0p8_conflinear_gbs512_8gpu_5epoch_20260516_161629.log
ADDED
|
@@ -0,0 +1,106 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
Traceback (most recent call last):
|
| 2 |
+
File "/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/train.py", line 2600, in <module>
|
| 3 |
+
main()
|
| 4 |
+
File "/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/train.py", line 1722, in main
|
| 5 |
+
torch.cuda.set_device(local_rank)
|
| 6 |
+
File "/usr/local/lib/python3.12/dist-packages/torch/cuda/__init__.py", line 477, in set_device
|
| 7 |
+
torch._C._cuda_setDevice(device)
|
| 8 |
+
RuntimeError: CUDA error: invalid device ordinal
|
| 9 |
+
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
|
| 10 |
+
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
|
| 11 |
+
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.
|
| 12 |
+
|
| 13 |
+
Traceback (most recent call last):
|
| 14 |
+
File "/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/train.py", line 2600, in <module>
|
| 15 |
+
main()
|
| 16 |
+
File "/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/train.py", line 1722, in main
|
| 17 |
+
torch.cuda.set_device(local_rank)
|
| 18 |
+
File "/usr/local/lib/python3.12/dist-packages/torch/cuda/__init__.py", line 477, in set_device
|
| 19 |
+
torch._C._cuda_setDevice(device)
|
| 20 |
+
RuntimeError: CUDA error: invalid device ordinal
|
| 21 |
+
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
|
| 22 |
+
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
|
| 23 |
+
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.
|
| 24 |
+
|
| 25 |
+
Traceback (most recent call last):
|
| 26 |
+
File "/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/train.py", line 2600, in <module>
|
| 27 |
+
main()
|
| 28 |
+
File "/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/train.py", line 1722, in main
|
| 29 |
+
torch.cuda.set_device(local_rank)
|
| 30 |
+
File "/usr/local/lib/python3.12/dist-packages/torch/cuda/__init__.py", line 477, in set_device
|
| 31 |
+
torch._C._cuda_setDevice(device)
|
| 32 |
+
RuntimeError: CUDA error: invalid device ordinal
|
| 33 |
+
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
|
| 34 |
+
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
|
| 35 |
+
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.
|
| 36 |
+
|
| 37 |
+
Traceback (most recent call last):
|
| 38 |
+
File "/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/train.py", line 2600, in <module>
|
| 39 |
+
main()
|
| 40 |
+
File "/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/train.py", line 1722, in main
|
| 41 |
+
torch.cuda.set_device(local_rank)
|
| 42 |
+
File "/usr/local/lib/python3.12/dist-packages/torch/cuda/__init__.py", line 477, in set_device
|
| 43 |
+
torch._C._cuda_setDevice(device)
|
| 44 |
+
RuntimeError: CUDA error: invalid device ordinal
|
| 45 |
+
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
|
| 46 |
+
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
|
| 47 |
+
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.
|
| 48 |
+
|
| 49 |
+
W0516 16:16:34.753000 1539 torch/distributed/elastic/multiprocessing/api.py:898] Sending process 1543 closing signal SIGTERM
|
| 50 |
+
W0516 16:16:34.754000 1539 torch/distributed/elastic/multiprocessing/api.py:898] Sending process 1544 closing signal SIGTERM
|
| 51 |
+
W0516 16:16:34.754000 1539 torch/distributed/elastic/multiprocessing/api.py:898] Sending process 1545 closing signal SIGTERM
|
| 52 |
+
W0516 16:16:34.754000 1539 torch/distributed/elastic/multiprocessing/api.py:898] Sending process 1546 closing signal SIGTERM
|
| 53 |
+
E0516 16:16:34.938000 1539 torch/distributed/elastic/multiprocessing/api.py:870] failed (exitcode: 1) local_rank: 4 (pid: 1547) of binary: /usr/bin/python
|
| 54 |
+
Traceback (most recent call last):
|
| 55 |
+
File "<frozen runpy>", line 198, in _run_module_as_main
|
| 56 |
+
File "<frozen runpy>", line 88, in _run_code
|
| 57 |
+
File "/usr/local/lib/python3.12/dist-packages/torch/distributed/run.py", line 922, in <module>
|
| 58 |
+
main()
|
| 59 |
+
File "/usr/local/lib/python3.12/dist-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 355, in wrapper
|
| 60 |
+
return f(*args, **kwargs)
|
| 61 |
+
^^^^^^^^^^^^^^^^^^
|
| 62 |
+
File "/usr/local/lib/python3.12/dist-packages/torch/distributed/run.py", line 918, in main
|
| 63 |
+
run(args)
|
| 64 |
+
File "/usr/local/lib/python3.12/dist-packages/torch/distributed/run.py", line 909, in run
|
| 65 |
+
elastic_launch(
|
| 66 |
+
File "/usr/local/lib/python3.12/dist-packages/torch/distributed/launcher/api.py", line 139, in __call__
|
| 67 |
+
return launch_agent(self._config, self._entrypoint, list(args))
|
| 68 |
+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 69 |
+
File "/usr/local/lib/python3.12/dist-packages/torch/distributed/launcher/api.py", line 270, in launch_agent
|
| 70 |
+
raise ChildFailedError(
|
| 71 |
+
torch.distributed.elastic.multiprocessing.errors.ChildFailedError:
|
| 72 |
+
============================================================
|
| 73 |
+
train.py FAILED
|
| 74 |
+
------------------------------------------------------------
|
| 75 |
+
Failures:
|
| 76 |
+
[1]:
|
| 77 |
+
time : 2026-05-16_16:16:34
|
| 78 |
+
host : t-20260517001513-7pn5c-worker-0.t-20260517001513-7pn5c-worker.mlplatform-customtask.svc.cluster.local
|
| 79 |
+
rank : 5 (local_rank: 5)
|
| 80 |
+
exitcode : 1 (pid: 1548)
|
| 81 |
+
error_file: <N/A>
|
| 82 |
+
traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
|
| 83 |
+
[2]:
|
| 84 |
+
time : 2026-05-16_16:16:34
|
| 85 |
+
host : t-20260517001513-7pn5c-worker-0.t-20260517001513-7pn5c-worker.mlplatform-customtask.svc.cluster.local
|
| 86 |
+
rank : 6 (local_rank: 6)
|
| 87 |
+
exitcode : 1 (pid: 1549)
|
| 88 |
+
error_file: <N/A>
|
| 89 |
+
traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
|
| 90 |
+
[3]:
|
| 91 |
+
time : 2026-05-16_16:16:34
|
| 92 |
+
host : t-20260517001513-7pn5c-worker-0.t-20260517001513-7pn5c-worker.mlplatform-customtask.svc.cluster.local
|
| 93 |
+
rank : 7 (local_rank: 7)
|
| 94 |
+
exitcode : 1 (pid: 1550)
|
| 95 |
+
error_file: <N/A>
|
| 96 |
+
traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
|
| 97 |
+
------------------------------------------------------------
|
| 98 |
+
Root Cause (first observed failure):
|
| 99 |
+
[0]:
|
| 100 |
+
time : 2026-05-16_16:16:34
|
| 101 |
+
host : t-20260517001513-7pn5c-worker-0.t-20260517001513-7pn5c-worker.mlplatform-customtask.svc.cluster.local
|
| 102 |
+
rank : 4 (local_rank: 4)
|
| 103 |
+
exitcode : 1 (pid: 1547)
|
| 104 |
+
error_file: <N/A>
|
| 105 |
+
traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
|
| 106 |
+
============================================================
|
LTA_openwebtext_dualt/logs/elfaligned_t5tokenized_8gpu/lta_owt_t5_rollin_p50_path2_unif0_0p25_synct_mask1_gbs512_8gpu_20260518_024916.log
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
LTA_openwebtext_dualt/logs/elfaligned_t5tokenized_8gpu/lta_owt_t5_rollin_p50_randk0_3_uniformt_temp1_synct_mask1_gbs512_8gpu_1m_20260518_025115.log
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
LTA_openwebtext_dualt/logs/elfaligned_t5tokenized_8gpu/lta_owt_t5_rollin_p50_randk0_3_uniformt_temp1_synct_mask1_gbs512_8gpu_1m_20260518_125608.log
ADDED
|
@@ -0,0 +1,814 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
t-20260518193619-wnzpp-worker-0:10394:10394 [0] NCCL INFO NCCL_SOCKET_IFNAME set by environment to eth1
|
| 2 |
+
t-20260518193619-wnzpp-worker-0:10394:10394 [0] NCCL INFO Bootstrap: Using eth1:10.82.72.17<0>
|
| 3 |
+
t-20260518193619-wnzpp-worker-0:10394:10394 [0] NCCL INFO cudaDriverVersion 12080
|
| 4 |
+
t-20260518193619-wnzpp-worker-0:10394:10394 [0] NCCL INFO NCCL version 2.25.1+cuda12.8
|
| 5 |
+
t-20260518193619-wnzpp-worker-0:10394:10394 [0] NCCL INFO Comm config Blocking set to 1
|
| 6 |
+
t-20260518193619-wnzpp-worker-0:10400:10400 [6] NCCL INFO cudaDriverVersion 12080
|
| 7 |
+
t-20260518193619-wnzpp-worker-0:10400:10400 [6] NCCL INFO NCCL_SOCKET_IFNAME set by environment to eth1
|
| 8 |
+
t-20260518193619-wnzpp-worker-0:10400:10400 [6] NCCL INFO Bootstrap: Using eth1:10.82.72.17<0>
|
| 9 |
+
t-20260518193619-wnzpp-worker-0:10400:10400 [6] NCCL INFO NCCL version 2.25.1+cuda12.8
|
| 10 |
+
t-20260518193619-wnzpp-worker-0:10399:10399 [5] NCCL INFO cudaDriverVersion 12080
|
| 11 |
+
t-20260518193619-wnzpp-worker-0:10399:10399 [5] NCCL INFO NCCL_SOCKET_IFNAME set by environment to eth1
|
| 12 |
+
t-20260518193619-wnzpp-worker-0:10400:10400 [6] NCCL INFO Comm config Blocking set to 1
|
| 13 |
+
t-20260518193619-wnzpp-worker-0:10399:10399 [5] NCCL INFO Bootstrap: Using eth1:10.82.72.17<0>
|
| 14 |
+
t-20260518193619-wnzpp-worker-0:10399:10399 [5] NCCL INFO NCCL version 2.25.1+cuda12.8
|
| 15 |
+
t-20260518193619-wnzpp-worker-0:10397:10397 [3] NCCL INFO cudaDriverVersion 12080
|
| 16 |
+
t-20260518193619-wnzpp-worker-0:10397:10397 [3] NCCL INFO NCCL_SOCKET_IFNAME set by environment to eth1
|
| 17 |
+
t-20260518193619-wnzpp-worker-0:10398:10398 [4] NCCL INFO cudaDriverVersion 12080
|
| 18 |
+
t-20260518193619-wnzpp-worker-0:10398:10398 [4] NCCL INFO NCCL_SOCKET_IFNAME set by environment to eth1
|
| 19 |
+
t-20260518193619-wnzpp-worker-0:10401:10401 [7] NCCL INFO cudaDriverVersion 12080
|
| 20 |
+
t-20260518193619-wnzpp-worker-0:10401:10401 [7] NCCL INFO NCCL_SOCKET_IFNAME set by environment to eth1
|
| 21 |
+
t-20260518193619-wnzpp-worker-0:10399:10399 [5] NCCL INFO Comm config Blocking set to 1
|
| 22 |
+
t-20260518193619-wnzpp-worker-0:10397:10397 [3] NCCL INFO Bootstrap: Using eth1:10.82.72.17<0>
|
| 23 |
+
t-20260518193619-wnzpp-worker-0:10398:10398 [4] NCCL INFO Bootstrap: Using eth1:10.82.72.17<0>
|
| 24 |
+
t-20260518193619-wnzpp-worker-0:10401:10401 [7] NCCL INFO Bootstrap: Using eth1:10.82.72.17<0>
|
| 25 |
+
t-20260518193619-wnzpp-worker-0:10398:10398 [4] NCCL INFO NCCL version 2.25.1+cuda12.8
|
| 26 |
+
t-20260518193619-wnzpp-worker-0:10401:10401 [7] NCCL INFO NCCL version 2.25.1+cuda12.8
|
| 27 |
+
t-20260518193619-wnzpp-worker-0:10397:10397 [3] NCCL INFO NCCL version 2.25.1+cuda12.8
|
| 28 |
+
t-20260518193619-wnzpp-worker-0:10397:10397 [3] NCCL INFO Comm config Blocking set to 1
|
| 29 |
+
t-20260518193619-wnzpp-worker-0:10401:10401 [7] NCCL INFO Comm config Blocking set to 1
|
| 30 |
+
t-20260518193619-wnzpp-worker-0:10398:10398 [4] NCCL INFO Comm config Blocking set to 1
|
| 31 |
+
t-20260518193619-wnzpp-worker-0:10396:10396 [2] NCCL INFO cudaDriverVersion 12080
|
| 32 |
+
t-20260518193619-wnzpp-worker-0:10396:10396 [2] NCCL INFO NCCL_SOCKET_IFNAME set by environment to eth1
|
| 33 |
+
t-20260518193619-wnzpp-worker-0:10396:10396 [2] NCCL INFO Bootstrap: Using eth1:10.82.72.17<0>
|
| 34 |
+
t-20260518193619-wnzpp-worker-0:10396:10396 [2] NCCL INFO NCCL version 2.25.1+cuda12.8
|
| 35 |
+
t-20260518193619-wnzpp-worker-0:10396:10396 [2] NCCL INFO Comm config Blocking set to 1
|
| 36 |
+
t-20260518193619-wnzpp-worker-0:10394:11252 [0] NCCL INFO NET/Plugin: Loaded net plugin NCCL RDMA Plugin v9 (v9)
|
| 37 |
+
t-20260518193619-wnzpp-worker-0:10394:11252 [0] NCCL INFO NET/Plugin: Loaded collnet plugin SHARP (v9)
|
| 38 |
+
t-20260518193619-wnzpp-worker-0:10394:11252 [0] NCCL INFO Plugin Path : /opt/hpcx/nccl_rdma_sharp_plugin/lib/libnccl-net.so
|
| 39 |
+
t-20260518193619-wnzpp-worker-0:10394:11252 [0] NCCL INFO P2P plugin v9 IBext_v9
|
| 40 |
+
t-20260518193619-wnzpp-worker-0:10394:11252 [0] NCCL INFO NCCL_SOCKET_IFNAME set by environment to eth1
|
| 41 |
+
t-20260518193619-wnzpp-worker-0:10400:11253 [6] NCCL INFO NET/Plugin: Loaded net plugin NCCL RDMA Plugin v9 (v9)
|
| 42 |
+
t-20260518193619-wnzpp-worker-0:10400:11253 [6] NCCL INFO NET/Plugin: Loaded collnet plugin SHARP (v9)
|
| 43 |
+
t-20260518193619-wnzpp-worker-0:10400:11253 [6] NCCL INFO Plugin Path : /opt/hpcx/nccl_rdma_sharp_plugin/lib/libnccl-net.so
|
| 44 |
+
t-20260518193619-wnzpp-worker-0:10400:11253 [6] NCCL INFO P2P plugin v9 IBext_v9
|
| 45 |
+
t-20260518193619-wnzpp-worker-0:10400:11253 [6] NCCL INFO NCCL_SOCKET_IFNAME set by environment to eth1
|
| 46 |
+
t-20260518193619-wnzpp-worker-0:10399:11254 [5] NCCL INFO NET/Plugin: Loaded net plugin NCCL RDMA Plugin v9 (v9)
|
| 47 |
+
t-20260518193619-wnzpp-worker-0:10399:11254 [5] NCCL INFO NET/Plugin: Loaded collnet plugin SHARP (v9)
|
| 48 |
+
t-20260518193619-wnzpp-worker-0:10399:11254 [5] NCCL INFO Plugin Path : /opt/hpcx/nccl_rdma_sharp_plugin/lib/libnccl-net.so
|
| 49 |
+
t-20260518193619-wnzpp-worker-0:10399:11254 [5] NCCL INFO P2P plugin v9 IBext_v9
|
| 50 |
+
t-20260518193619-wnzpp-worker-0:10399:11254 [5] NCCL INFO NCCL_SOCKET_IFNAME set by environment to eth1
|
| 51 |
+
t-20260518193619-wnzpp-worker-0:10398:11257 [4] NCCL INFO NET/Plugin: Loaded net plugin NCCL RDMA Plugin v9 (v9)
|
| 52 |
+
t-20260518193619-wnzpp-worker-0:10398:11257 [4] NCCL INFO NET/Plugin: Loaded collnet plugin SHARP (v9)
|
| 53 |
+
t-20260518193619-wnzpp-worker-0:10398:11257 [4] NCCL INFO Plugin Path : /opt/hpcx/nccl_rdma_sharp_plugin/lib/libnccl-net.so
|
| 54 |
+
t-20260518193619-wnzpp-worker-0:10398:11257 [4] NCCL INFO P2P plugin v9 IBext_v9
|
| 55 |
+
t-20260518193619-wnzpp-worker-0:10398:11257 [4] NCCL INFO NCCL_SOCKET_IFNAME set by environment to eth1
|
| 56 |
+
t-20260518193619-wnzpp-worker-0:10401:11256 [7] NCCL INFO NET/Plugin: Loaded net plugin NCCL RDMA Plugin v9 (v9)
|
| 57 |
+
t-20260518193619-wnzpp-worker-0:10401:11256 [7] NCCL INFO NET/Plugin: Loaded collnet plugin SHARP (v9)
|
| 58 |
+
t-20260518193619-wnzpp-worker-0:10401:11256 [7] NCCL INFO Plugin Path : /opt/hpcx/nccl_rdma_sharp_plugin/lib/libnccl-net.so
|
| 59 |
+
t-20260518193619-wnzpp-worker-0:10401:11256 [7] NCCL INFO P2P plugin v9 IBext_v9
|
| 60 |
+
t-20260518193619-wnzpp-worker-0:10401:11256 [7] NCCL INFO NCCL_SOCKET_IFNAME set by environment to eth1
|
| 61 |
+
t-20260518193619-wnzpp-worker-0:10397:11255 [3] NCCL INFO NET/Plugin: Loaded net plugin NCCL RDMA Plugin v9 (v9)
|
| 62 |
+
t-20260518193619-wnzpp-worker-0:10397:11255 [3] NCCL INFO NET/Plugin: Loaded collnet plugin SHARP (v9)
|
| 63 |
+
t-20260518193619-wnzpp-worker-0:10397:11255 [3] NCCL INFO Plugin Path : /opt/hpcx/nccl_rdma_sharp_plugin/lib/libnccl-net.so
|
| 64 |
+
t-20260518193619-wnzpp-worker-0:10397:11255 [3] NCCL INFO P2P plugin v9 IBext_v9
|
| 65 |
+
t-20260518193619-wnzpp-worker-0:10397:11255 [3] NCCL INFO NCCL_SOCKET_IFNAME set by environment to eth1
|
| 66 |
+
t-20260518193619-wnzpp-worker-0:10394:11252 [0] NCCL INFO NCCL_IB_PCI_RELAXED_ORDERING set by environment to 1.
|
| 67 |
+
t-20260518193619-wnzpp-worker-0:10394:11252 [0] NCCL INFO NET/IB : Using [0]mlx5_1:1/RoCE [1]mlx5_4:1/RoCE [2]mlx5_5:1/RoCE [3]mlx5_6:1/RoCE [4]mlx5_7:1/RoCE [5]mlx5_8:1/RoCE [6]mlx5_9:1/RoCE [7]mlx5_10:1/RoCE [RO]; OOB eth1:10.82.72.17<0>
|
| 68 |
+
t-20260518193619-wnzpp-worker-0:10394:11252 [0] NCCL INFO PROFILER/Plugin: Could not find: libnccl-profiler.so.
|
| 69 |
+
t-20260518193619-wnzpp-worker-0:10394:11252 [0] NCCL INFO Using network IBext_v9
|
| 70 |
+
t-20260518193619-wnzpp-worker-0:10399:11254 [5] NCCL INFO NCCL_IB_PCI_RELAXED_ORDERING set by environment to 1.
|
| 71 |
+
t-20260518193619-wnzpp-worker-0:10399:11254 [5] NCCL INFO NET/IB : Using [0]mlx5_1:1/RoCE [1]mlx5_4:1/RoCE [2]mlx5_5:1/RoCE [3]mlx5_6:1/RoCE [4]mlx5_7:1/RoCE [5]mlx5_8:1/RoCE [6]mlx5_9:1/RoCE [7]mlx5_10:1/RoCE [RO]; OOB eth1:10.82.72.17<0>
|
| 72 |
+
t-20260518193619-wnzpp-worker-0:10399:11254 [5] NCCL INFO PROFILER/Plugin: Could not find: libnccl-profiler.so.
|
| 73 |
+
t-20260518193619-wnzpp-worker-0:10399:11254 [5] NCCL INFO Using network IBext_v9
|
| 74 |
+
t-20260518193619-wnzpp-worker-0:10397:11255 [3] NCCL INFO NCCL_IB_PCI_RELAXED_ORDERING set by environment to 1.
|
| 75 |
+
t-20260518193619-wnzpp-worker-0:10397:11255 [3] NCCL INFO NET/IB : Using [0]mlx5_1:1/RoCE [1]mlx5_4:1/RoCE [2]mlx5_5:1/RoCE [3]mlx5_6:1/RoCE [4]mlx5_7:1/RoCE [5]mlx5_8:1/RoCE [6]mlx5_9:1/RoCE [7]mlx5_10:1/RoCE [RO]; OOB eth1:10.82.72.17<0>
|
| 76 |
+
t-20260518193619-wnzpp-worker-0:10400:11253 [6] NCCL INFO NCCL_IB_PCI_RELAXED_ORDERING set by environment to 1.
|
| 77 |
+
t-20260518193619-wnzpp-worker-0:10400:11253 [6] NCCL INFO NET/IB : Using [0]mlx5_1:1/RoCE [1]mlx5_4:1/RoCE [2]mlx5_5:1/RoCE [3]mlx5_6:1/RoCE [4]mlx5_7:1/RoCE [5]mlx5_8:1/RoCE [6]mlx5_9:1/RoCE [7]mlx5_10:1/RoCE [RO]; OOB eth1:10.82.72.17<0>
|
| 78 |
+
t-20260518193619-wnzpp-worker-0:10401:11256 [7] NCCL INFO NCCL_IB_PCI_RELAXED_ORDERING set by environment to 1.
|
| 79 |
+
t-20260518193619-wnzpp-worker-0:10401:11256 [7] NCCL INFO NET/IB : Using [0]mlx5_1:1/RoCE [1]mlx5_4:1/RoCE [2]mlx5_5:1/RoCE [3]mlx5_6:1/RoCE [4]mlx5_7:1/RoCE [5]mlx5_8:1/RoCE [6]mlx5_9:1/RoCE [7]mlx5_10:1/RoCE [RO]; OOB eth1:10.82.72.17<0>
|
| 80 |
+
t-20260518193619-wnzpp-worker-0:10398:11257 [4] NCCL INFO NCCL_IB_PCI_RELAXED_ORDERING set by environment to 1.
|
| 81 |
+
t-20260518193619-wnzpp-worker-0:10398:11257 [4] NCCL INFO NET/IB : Using [0]mlx5_1:1/RoCE [1]mlx5_4:1/RoCE [2]mlx5_5:1/RoCE [3]mlx5_6:1/RoCE [4]mlx5_7:1/RoCE [5]mlx5_8:1/RoCE [6]mlx5_9:1/RoCE [7]mlx5_10:1/RoCE [RO]; OOB eth1:10.82.72.17<0>
|
| 82 |
+
t-20260518193619-wnzpp-worker-0:10397:11255 [3] NCCL INFO PROFILER/Plugin: Could not find: libnccl-profiler.so.
|
| 83 |
+
t-20260518193619-wnzpp-worker-0:10397:11255 [3] NCCL INFO Using network IBext_v9
|
| 84 |
+
t-20260518193619-wnzpp-worker-0:10401:11256 [7] NCCL INFO PROFILER/Plugin: Could not find: libnccl-profiler.so.
|
| 85 |
+
t-20260518193619-wnzpp-worker-0:10401:11256 [7] NCCL INFO Using network IBext_v9
|
| 86 |
+
t-20260518193619-wnzpp-worker-0:10398:11257 [4] NCCL INFO PROFILER/Plugin: Could not find: libnccl-profiler.so.
|
| 87 |
+
t-20260518193619-wnzpp-worker-0:10398:11257 [4] NCCL INFO Using network IBext_v9
|
| 88 |
+
t-20260518193619-wnzpp-worker-0:10400:11253 [6] NCCL INFO PROFILER/Plugin: Could not find: libnccl-profiler.so.
|
| 89 |
+
t-20260518193619-wnzpp-worker-0:10400:11253 [6] NCCL INFO Using network IBext_v9
|
| 90 |
+
t-20260518193619-wnzpp-worker-0:10396:11258 [2] NCCL INFO NET/Plugin: Loaded net plugin NCCL RDMA Plugin v9 (v9)
|
| 91 |
+
t-20260518193619-wnzpp-worker-0:10396:11258 [2] NCCL INFO NET/Plugin: Loaded collnet plugin SHARP (v9)
|
| 92 |
+
t-20260518193619-wnzpp-worker-0:10396:11258 [2] NCCL INFO Plugin Path : /opt/hpcx/nccl_rdma_sharp_plugin/lib/libnccl-net.so
|
| 93 |
+
t-20260518193619-wnzpp-worker-0:10396:11258 [2] NCCL INFO P2P plugin v9 IBext_v9
|
| 94 |
+
t-20260518193619-wnzpp-worker-0:10396:11258 [2] NCCL INFO NCCL_SOCKET_IFNAME set by environment to eth1
|
| 95 |
+
t-20260518193619-wnzpp-worker-0:10396:11258 [2] NCCL INFO NCCL_IB_PCI_RELAXED_ORDERING set by environment to 1.
|
| 96 |
+
t-20260518193619-wnzpp-worker-0:10396:11258 [2] NCCL INFO NET/IB : Using [0]mlx5_1:1/RoCE [1]mlx5_4:1/RoCE [2]mlx5_5:1/RoCE [3]mlx5_6:1/RoCE [4]mlx5_7:1/RoCE [5]mlx5_8:1/RoCE [6]mlx5_9:1/RoCE [7]mlx5_10:1/RoCE [RO]; OOB eth1:10.82.72.17<0>
|
| 97 |
+
t-20260518193619-wnzpp-worker-0:10396:11258 [2] NCCL INFO PROFILER/Plugin: Could not find: libnccl-profiler.so.
|
| 98 |
+
t-20260518193619-wnzpp-worker-0:10396:11258 [2] NCCL INFO Using network IBext_v9
|
| 99 |
+
t-20260518193619-wnzpp-worker-0:10400:11253 [6] NCCL INFO ncclCommInitRankConfig comm 0xd982790 rank 6 nranks 8 cudaDev 6 nvmlDev 6 busId 73020 commId 0x2b45641809baa172 - Init START
|
| 100 |
+
t-20260518193619-wnzpp-worker-0:10394:11252 [0] NCCL INFO ncclCommInitRankConfig comm 0xd9dc180 rank 0 nranks 8 cudaDev 0 nvmlDev 0 busId 65040 commId 0x2b45641809baa172 - Init START
|
| 101 |
+
t-20260518193619-wnzpp-worker-0:10401:11256 [7] NCCL INFO ncclCommInitRankConfig comm 0xd4cf3e0 rank 7 nranks 8 cudaDev 7 nvmlDev 7 busId 75020 commId 0x2b45641809baa172 - Init START
|
| 102 |
+
t-20260518193619-wnzpp-worker-0:10401:11256 [7] NCCL INFO RAS client listening socket at ::1<28028>
|
| 103 |
+
t-20260518193619-wnzpp-worker-0:10398:11257 [4] NCCL INFO ncclCommInitRankConfig comm 0xeb6d1a0 rank 4 nranks 8 cudaDev 4 nvmlDev 4 busId 6f020 commId 0x2b45641809baa172 - Init START
|
| 104 |
+
t-20260518193619-wnzpp-worker-0:10397:11255 [3] NCCL INFO ncclCommInitRankConfig comm 0xec825c0 rank 3 nranks 8 cudaDev 3 nvmlDev 3 busId 6b020 commId 0x2b45641809baa172 - Init START
|
| 105 |
+
t-20260518193619-wnzpp-worker-0:10399:11254 [5] NCCL INFO ncclCommInitRankConfig comm 0xe49fe20 rank 5 nranks 8 cudaDev 5 nvmlDev 5 busId 71020 commId 0x2b45641809baa172 - Init START
|
| 106 |
+
t-20260518193619-wnzpp-worker-0:10399:11254 [5] NCCL INFO RAS client listening socket at ::1<28028>
|
| 107 |
+
t-20260518193619-wnzpp-worker-0:10398:11257 [4] NCCL INFO RAS client listening socket at ::1<28028>
|
| 108 |
+
t-20260518193619-wnzpp-worker-0:10400:11253 [6] NCCL INFO RAS client listening socket at ::1<28028>
|
| 109 |
+
t-20260518193619-wnzpp-worker-0:10396:11258 [2] NCCL INFO ncclCommInitRankConfig comm 0x722c5210 rank 2 nranks 8 cudaDev 2 nvmlDev 2 busId 69020 commId 0x2b45641809baa172 - Init START
|
| 110 |
+
t-20260518193619-wnzpp-worker-0:10395:10395 [1] NCCL INFO cudaDriverVersion 12080
|
| 111 |
+
t-20260518193619-wnzpp-worker-0:10395:10395 [1] NCCL INFO NCCL_SOCKET_IFNAME set by environment to eth1
|
| 112 |
+
t-20260518193619-wnzpp-worker-0:10395:10395 [1] NCCL INFO Bootstrap: Using eth1:10.82.72.17<0>
|
| 113 |
+
t-20260518193619-wnzpp-worker-0:10397:11255 [3] NCCL INFO RAS client listening socket at ::1<28028>
|
| 114 |
+
t-20260518193619-wnzpp-worker-0:10395:10395 [1] NCCL INFO NCCL version 2.25.1+cuda12.8
|
| 115 |
+
t-20260518193619-wnzpp-worker-0:10395:10395 [1] NCCL INFO Comm config Blocking set to 1
|
| 116 |
+
t-20260518193619-wnzpp-worker-0:10395:11329 [1] NCCL INFO NET/Plugin: Loaded net plugin NCCL RDMA Plugin v9 (v9)
|
| 117 |
+
t-20260518193619-wnzpp-worker-0:10395:11329 [1] NCCL INFO NET/Plugin: Loaded collnet plugin SHARP (v9)
|
| 118 |
+
t-20260518193619-wnzpp-worker-0:10395:11329 [1] NCCL INFO Plugin Path : /opt/hpcx/nccl_rdma_sharp_plugin/lib/libnccl-net.so
|
| 119 |
+
t-20260518193619-wnzpp-worker-0:10395:11329 [1] NCCL INFO P2P plugin v9 IBext_v9
|
| 120 |
+
t-20260518193619-wnzpp-worker-0:10395:11329 [1] NCCL INFO NCCL_SOCKET_IFNAME set by environment to eth1
|
| 121 |
+
t-20260518193619-wnzpp-worker-0:10395:11329 [1] NCCL INFO NCCL_IB_PCI_RELAXED_ORDERING set by environment to 1.
|
| 122 |
+
t-20260518193619-wnzpp-worker-0:10395:11329 [1] NCCL INFO NET/IB : Using [0]mlx5_1:1/RoCE [1]mlx5_4:1/RoCE [2]mlx5_5:1/RoCE [3]mlx5_6:1/RoCE [4]mlx5_7:1/RoCE [5]mlx5_8:1/RoCE [6]mlx5_9:1/RoCE [7]mlx5_10:1/RoCE [RO]; OOB eth1:10.82.72.17<0>
|
| 123 |
+
t-20260518193619-wnzpp-worker-0:10395:11329 [1] NCCL INFO PROFILER/Plugin: Could not find: libnccl-profiler.so.
|
| 124 |
+
t-20260518193619-wnzpp-worker-0:10395:11329 [1] NCCL INFO Using network IBext_v9
|
| 125 |
+
t-20260518193619-wnzpp-worker-0:10395:11329 [1] NCCL INFO ncclCommInitRankConfig comm 0xd3cb0a0 rank 1 nranks 8 cudaDev 1 nvmlDev 1 busId 67020 commId 0x2b45641809baa172 - Init START
|
| 126 |
+
t-20260518193619-wnzpp-worker-0:10395:11329 [1] NCCL INFO RAS client listening socket at ::1<28028>
|
| 127 |
+
t-20260518193619-wnzpp-worker-0:10396:11258 [2] NCCL INFO RAS client listening socket at ::1<28028>
|
| 128 |
+
t-20260518193619-wnzpp-worker-0:10394:11252 [0] NCCL INFO RAS client listening socket at ::1<28028>
|
| 129 |
+
t-20260518193619-wnzpp-worker-0:10396:11258 [2] NCCL INFO Bootstrap timings total 0.370200 (create 0.000020, send 0.000066, recv 0.000048, ring 0.000129, delay 0.000001)
|
| 130 |
+
t-20260518193619-wnzpp-worker-0:10395:11329 [1] NCCL INFO Bootstrap timings total 0.008137 (create 0.000021, send 0.000070, recv 0.000075, ring 0.000133, delay 0.000001)
|
| 131 |
+
t-20260518193619-wnzpp-worker-0:10394:11252 [0] NCCL INFO Bootstrap timings total 0.519792 (create 0.000024, send 0.000069, recv 0.511736, ring 0.000122, delay 0.000001)
|
| 132 |
+
t-20260518193619-wnzpp-worker-0:10399:11254 [5] NCCL INFO Bootstrap timings total 0.455770 (create 0.000021, send 0.000074, recv 0.000068, ring 0.455343, delay 0.000001)
|
| 133 |
+
t-20260518193619-wnzpp-worker-0:10401:11256 [7] NCCL INFO Bootstrap timings total 0.462973 (create 0.000019, send 0.000065, recv 0.000099, ring 0.462480, delay 0.000001)
|
| 134 |
+
t-20260518193619-wnzpp-worker-0:10398:11257 [4] NCCL INFO Bootstrap timings total 0.461395 (create 0.000020, send 0.000081, recv 0.005659, ring 0.455328, delay 0.000001)
|
| 135 |
+
t-20260518193619-wnzpp-worker-0:10397:11255 [3] NCCL INFO Bootstrap timings total 0.456248 (create 0.000019, send 0.000072, recv 0.000041, ring 0.369579, delay 0.000001)
|
| 136 |
+
t-20260518193619-wnzpp-worker-0:10400:11253 [6] NCCL INFO Bootstrap timings total 0.520885 (create 0.000024, send 0.000070, recv 0.057994, ring 0.455253, delay 0.000001)
|
| 137 |
+
t-20260518193619-wnzpp-worker-0:10396:11258 [2] NCCL INFO MNNVL busId 0x69020 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0
|
| 138 |
+
t-20260518193619-wnzpp-worker-0:10399:11254 [5] NCCL INFO MNNVL busId 0x71020 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0
|
| 139 |
+
t-20260518193619-wnzpp-worker-0:10401:11256 [7] NCCL INFO MNNVL busId 0x75020 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0
|
| 140 |
+
t-20260518193619-wnzpp-worker-0:10395:11329 [1] NCCL INFO MNNVL busId 0x67020 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0
|
| 141 |
+
t-20260518193619-wnzpp-worker-0:10398:11257 [4] NCCL INFO MNNVL busId 0x6f020 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0
|
| 142 |
+
t-20260518193619-wnzpp-worker-0:10394:11252 [0] NCCL INFO MNNVL busId 0x65040 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0
|
| 143 |
+
t-20260518193619-wnzpp-worker-0:10397:11255 [3] NCCL INFO MNNVL busId 0x6b020 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0
|
| 144 |
+
t-20260518193619-wnzpp-worker-0:10400:11253 [6] NCCL INFO MNNVL busId 0x73020 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0
|
| 145 |
+
t-20260518193619-wnzpp-worker-0:10397:11255 [3] NCCL INFO NCCL_TOPO_FILE set by environment to /var/run/nvidia-topologyd/virtualTopology.xml
|
| 146 |
+
t-20260518193619-wnzpp-worker-0:10400:11253 [6] NCCL INFO NCCL_TOPO_FILE set by environment to /var/run/nvidia-topologyd/virtualTopology.xml
|
| 147 |
+
t-20260518193619-wnzpp-worker-0:10401:11256 [7] NCCL INFO NCCL_TOPO_FILE set by environment to /var/run/nvidia-topologyd/virtualTopology.xml
|
| 148 |
+
t-20260518193619-wnzpp-worker-0:10396:11258 [2] NCCL INFO NCCL_TOPO_FILE set by environment to /var/run/nvidia-topologyd/virtualTopology.xml
|
| 149 |
+
t-20260518193619-wnzpp-worker-0:10399:11254 [5] NCCL INFO NCCL_TOPO_FILE set by environment to /var/run/nvidia-topologyd/virtualTopology.xml
|
| 150 |
+
t-20260518193619-wnzpp-worker-0:10398:11257 [4] NCCL INFO NCCL_TOPO_FILE set by environment to /var/run/nvidia-topologyd/virtualTopology.xml
|
| 151 |
+
t-20260518193619-wnzpp-worker-0:10395:11329 [1] NCCL INFO NCCL_TOPO_FILE set by environment to /var/run/nvidia-topologyd/virtualTopology.xml
|
| 152 |
+
t-20260518193619-wnzpp-worker-0:10394:11252 [0] NCCL INFO NCCL_TOPO_FILE set by environment to /var/run/nvidia-topologyd/virtualTopology.xml
|
| 153 |
+
t-20260518193619-wnzpp-worker-0:10401:11256 [7] NCCL INFO Setting affinity for GPU 7 to 0fffff,ffffffff,ffffffff,fc000000,00000000,00000000
|
| 154 |
+
t-20260518193619-wnzpp-worker-0:10401:11256 [7] NCCL INFO NVLS multicast support is available on dev 7
|
| 155 |
+
t-20260518193619-wnzpp-worker-0:10395:11329 [1] NCCL INFO Setting affinity for GPU 1 to 03ffffff,ffffffff,ffffffff
|
| 156 |
+
t-20260518193619-wnzpp-worker-0:10394:11252 [0] NCCL INFO Setting affinity for GPU 0 to 03ffffff,ffffffff,ffffffff
|
| 157 |
+
t-20260518193619-wnzpp-worker-0:10396:11258 [2] NCCL INFO Setting affinity for GPU 2 to 03ffffff,ffffffff,ffffffff
|
| 158 |
+
t-20260518193619-wnzpp-worker-0:10395:11329 [1] NCCL INFO NVLS multicast support is available on dev 1
|
| 159 |
+
t-20260518193619-wnzpp-worker-0:10394:11252 [0] NCCL INFO NVLS multicast support is available on dev 0
|
| 160 |
+
t-20260518193619-wnzpp-worker-0:10399:11254 [5] NCCL INFO Setting affinity for GPU 5 to 0fffff,ffffffff,ffffffff,fc000000,00000000,00000000
|
| 161 |
+
t-20260518193619-wnzpp-worker-0:10399:11254 [5] NCCL INFO NVLS multicast support is available on dev 5
|
| 162 |
+
t-20260518193619-wnzpp-worker-0:10400:11253 [6] NCCL INFO Setting affinity for GPU 6 to 0fffff,ffffffff,ffffffff,fc000000,00000000,00000000
|
| 163 |
+
t-20260518193619-wnzpp-worker-0:10400:11253 [6] NCCL INFO NVLS multicast support is available on dev 6
|
| 164 |
+
t-20260518193619-wnzpp-worker-0:10398:11257 [4] NCCL INFO Setting affinity for GPU 4 to 0fffff,ffffffff,ffffffff,fc000000,00000000,00000000
|
| 165 |
+
t-20260518193619-wnzpp-worker-0:10398:11257 [4] NCCL INFO NVLS multicast support is available on dev 4
|
| 166 |
+
t-20260518193619-wnzpp-worker-0:10397:11255 [3] NCCL INFO Setting affinity for GPU 3 to 03ffffff,ffffffff,ffffffff
|
| 167 |
+
t-20260518193619-wnzpp-worker-0:10397:11255 [3] NCCL INFO NVLS multicast support is available on dev 3
|
| 168 |
+
t-20260518193619-wnzpp-worker-0:10396:11258 [2] NCCL INFO NVLS multicast support is available on dev 2
|
| 169 |
+
t-20260518193619-wnzpp-worker-0:10400:11253 [6] NCCL INFO comm 0xd982790 rank 6 nRanks 8 nNodes 1 localRanks 8 localRank 6 MNNVL 0
|
| 170 |
+
t-20260518193619-wnzpp-worker-0:10396:11258 [2] NCCL INFO comm 0x722c5210 rank 2 nRanks 8 nNodes 1 localRanks 8 localRank 2 MNNVL 0
|
| 171 |
+
t-20260518193619-wnzpp-worker-0:10401:11256 [7] NCCL INFO comm 0xd4cf3e0 rank 7 nRanks 8 nNodes 1 localRanks 8 localRank 7 MNNVL 0
|
| 172 |
+
t-20260518193619-wnzpp-worker-0:10398:11257 [4] NCCL INFO comm 0xeb6d1a0 rank 4 nRanks 8 nNodes 1 localRanks 8 localRank 4 MNNVL 0
|
| 173 |
+
t-20260518193619-wnzpp-worker-0:10395:11329 [1] NCCL INFO comm 0xd3cb0a0 rank 1 nRanks 8 nNodes 1 localRanks 8 localRank 1 MNNVL 0
|
| 174 |
+
t-20260518193619-wnzpp-worker-0:10397:11255 [3] NCCL INFO comm 0xec825c0 rank 3 nRanks 8 nNodes 1 localRanks 8 localRank 3 MNNVL 0
|
| 175 |
+
t-20260518193619-wnzpp-worker-0:10394:11252 [0] NCCL INFO comm 0xd9dc180 rank 0 nRanks 8 nNodes 1 localRanks 8 localRank 0 MNNVL 0
|
| 176 |
+
t-20260518193619-wnzpp-worker-0:10399:11254 [5] NCCL INFO comm 0xe49fe20 rank 5 nRanks 8 nNodes 1 localRanks 8 localRank 5 MNNVL 0
|
| 177 |
+
t-20260518193619-wnzpp-worker-0:10394:11252 [0] NCCL INFO Channel 00/24 : 0 1 2 3 4 5 6 7
|
| 178 |
+
t-20260518193619-wnzpp-worker-0:10394:11252 [0] NCCL INFO Channel 01/24 : 0 1 2 3 4 5 6 7
|
| 179 |
+
t-20260518193619-wnzpp-worker-0:10400:11253 [6] NCCL INFO Trees [0] 7/-1/-1->6->5 [1] 7/-1/-1->6->5 [2] 7/-1/-1->6->5 [3] 7/-1/-1->6->5 [4] 7/-1/-1->6->5 [5] 7/-1/-1->6->5 [6] 7/-1/-1->6->5 [7] 7/-1/-1->6->5 [8] 7/-1/-1->6->5 [9] 7/-1/-1->6->5 [10] 7/-1/-1->6->5 [11] 7/-1/-1->6->5 [12] 7/-1/-1->6->5 [13] 7/-1/-1->6->5 [14] 7/-1/-1->6->5 [15] 7/-1/-1->6->5 [16] 7/-1/-1->6->5 [17] 7/-1/-1->6->5 [18] 7/-1/-1->6->5 [19] 7/-1/-1->6->5 [20] 7/-1/-1->6->5 [21] 7/-1/-1->6->5 [22] 7/-1/-1->6->5 [23] 7/-1/-1->6->5
|
| 180 |
+
t-20260518193619-wnzpp-worker-0:10394:11252 [0] NCCL INFO Channel 02/24 : 0 1 2 3 4 5 6 7
|
| 181 |
+
t-20260518193619-wnzpp-worker-0:10401:11256 [7] NCCL INFO Trees [0] -1/-1/-1->7->6 [1] -1/-1/-1->7->6 [2] -1/-1/-1->7->6 [3] -1/-1/-1->7->6 [4] -1/-1/-1->7->6 [5] -1/-1/-1->7->6 [6] -1/-1/-1->7->6 [7] -1/-1/-1->7->6 [8] -1/-1/-1->7->6 [9] -1/-1/-1->7->6 [10] -1/-1/-1->7->6 [11] -1/-1/-1->7->6 [12] -1/-1/-1->7->6 [13] -1/-1/-1->7->6 [14] -1/-1/-1->7->6 [15] -1/-1/-1->7->6 [16] -1/-1/-1->7->6 [17] -1/-1/-1->7->6 [18] -1/-1/-1->7->6 [19] -1/-1/-1->7->6 [20] -1/-1/-1->7->6 [21] -1/-1/-1->7->6 [22] -1/-1/-1->7->6 [23] -1/-1/-1->7->6
|
| 182 |
+
t-20260518193619-wnzpp-worker-0:10400:11253 [6] NCCL INFO P2P Chunksize set to 524288
|
| 183 |
+
t-20260518193619-wnzpp-worker-0:10398:11257 [4] NCCL INFO Trees [0] 5/-1/-1->4->3 [1] 5/-1/-1->4->3 [2] 5/-1/-1->4->3 [3] 5/-1/-1->4->3 [4] 5/-1/-1->4->3 [5] 5/-1/-1->4->3 [6] 5/-1/-1->4->3 [7] 5/-1/-1->4->3 [8] 5/-1/-1->4->3 [9] 5/-1/-1->4->3 [10] 5/-1/-1->4->3 [11] 5/-1/-1->4->3 [12] 5/-1/-1->4->3 [13] 5/-1/-1->4->3 [14] 5/-1/-1->4->3 [15] 5/-1/-1->4->3 [16] 5/-1/-1->4->3 [17] 5/-1/-1->4->3 [18] 5/-1/-1->4->3 [19] 5/-1/-1->4->3 [20] 5/-1/-1->4->3 [21] 5/-1/-1->4->3 [22] 5/-1/-1->4->3 [23] 5/-1/-1->4->3
|
| 184 |
+
t-20260518193619-wnzpp-worker-0:10394:11252 [0] NCCL INFO Channel 03/24 : 0 1 2 3 4 5 6 7
|
| 185 |
+
t-20260518193619-wnzpp-worker-0:10397:11255 [3] NCCL INFO Trees [0] 4/-1/-1->3->2 [1] 4/-1/-1->3->2 [2] 4/-1/-1->3->2 [3] 4/-1/-1->3->2 [4] 4/-1/-1->3->2 [5] 4/-1/-1->3->2 [6] 4/-1/-1->3->2 [7] 4/-1/-1->3->2 [8] 4/-1/-1->3->2 [9] 4/-1/-1->3->2 [10] 4/-1/-1->3->2 [11] 4/-1/-1->3->2 [12] 4/-1/-1->3->2 [13] 4/-1/-1->3->2 [14] 4/-1/-1->3->2 [15] 4/-1/-1->3->2 [16] 4/-1/-1->3->2 [17] 4/-1/-1->3->2 [18] 4/-1/-1->3->2 [19] 4/-1/-1->3->2 [20] 4/-1/-1->3->2 [21] 4/-1/-1->3->2 [22] 4/-1/-1->3->2 [23] 4/-1/-1->3->2
|
| 186 |
+
t-20260518193619-wnzpp-worker-0:10394:11252 [0] NCCL INFO Channel 04/24 : 0 1 2 3 4 5 6 7
|
| 187 |
+
t-20260518193619-wnzpp-worker-0:10401:11256 [7] NCCL INFO P2P Chunksize set to 524288
|
| 188 |
+
t-20260518193619-wnzpp-worker-0:10398:11257 [4] NCCL INFO P2P Chunksize set to 524288
|
| 189 |
+
t-20260518193619-wnzpp-worker-0:10394:11252 [0] NCCL INFO Channel 05/24 : 0 1 2 3 4 5 6 7
|
| 190 |
+
t-20260518193619-wnzpp-worker-0:10397:11255 [3] NCCL INFO P2P Chunksize set to 524288
|
| 191 |
+
t-20260518193619-wnzpp-worker-0:10394:11252 [0] NCCL INFO Channel 06/24 : 0 1 2 3 4 5 6 7
|
| 192 |
+
t-20260518193619-wnzpp-worker-0:10394:11252 [0] NCCL INFO Channel 07/24 : 0 1 2 3 4 5 6 7
|
| 193 |
+
t-20260518193619-wnzpp-worker-0:10394:11252 [0] NCCL INFO Channel 08/24 : 0 1 2 3 4 5 6 7
|
| 194 |
+
t-20260518193619-wnzpp-worker-0:10395:11329 [1] NCCL INFO Trees [0] 2/-1/-1->1->0 [1] 2/-1/-1->1->0 [2] 2/-1/-1->1->0 [3] 2/-1/-1->1->0 [4] 2/-1/-1->1->0 [5] 2/-1/-1->1->0 [6] 2/-1/-1->1->0 [7] 2/-1/-1->1->0 [8] 2/-1/-1->1->0 [9] 2/-1/-1->1->0 [10] 2/-1/-1->1->0 [11] 2/-1/-1->1->0 [12] 2/-1/-1->1->0 [13] 2/-1/-1->1->0 [14] 2/-1/-1->1->0 [15] 2/-1/-1->1->0 [16] 2/-1/-1->1->0 [17] 2/-1/-1->1->0 [18] 2/-1/-1->1->0 [19] 2/-1/-1->1->0 [20] 2/-1/-1->1->0 [21] 2/-1/-1->1->0 [22] 2/-1/-1->1->0 [23] 2/-1/-1->1->0
|
| 195 |
+
t-20260518193619-wnzpp-worker-0:10394:11252 [0] NCCL INFO Channel 09/24 : 0 1 2 3 4 5 6 7
|
| 196 |
+
t-20260518193619-wnzpp-worker-0:10394:11252 [0] NCCL INFO Channel 10/24 : 0 1 2 3 4 5 6 7
|
| 197 |
+
t-20260518193619-wnzpp-worker-0:10395:11329 [1] NCCL INFO P2P Chunksize set to 524288
|
| 198 |
+
t-20260518193619-wnzpp-worker-0:10396:11258 [2] NCCL INFO Trees [0] 3/-1/-1->2->1 [1] 3/-1/-1->2->1 [2] 3/-1/-1->2->1 [3] 3/-1/-1->2->1 [4] 3/-1/-1->2->1 [5] 3/-1/-1->2->1 [6] 3/-1/-1->2->1 [7] 3/-1/-1->2->1 [8] 3/-1/-1->2->1 [9] 3/-1/-1->2->1 [10] 3/-1/-1->2->1 [11] 3/-1/-1->2->1 [12] 3/-1/-1->2->1 [13] 3/-1/-1->2->1 [14] 3/-1/-1->2->1 [15] 3/-1/-1->2->1 [16] 3/-1/-1->2->1 [17] 3/-1/-1->2->1 [18] 3/-1/-1->2->1 [19] 3/-1/-1->2->1 [20] 3/-1/-1->2->1 [21] 3/-1/-1->2->1 [22] 3/-1/-1->2->1 [23] 3/-1/-1->2->1
|
| 199 |
+
t-20260518193619-wnzpp-worker-0:10394:11252 [0] NCCL INFO Channel 11/24 : 0 1 2 3 4 5 6 7
|
| 200 |
+
t-20260518193619-wnzpp-worker-0:10394:11252 [0] NCCL INFO Channel 12/24 : 0 1 2 3 4 5 6 7
|
| 201 |
+
t-20260518193619-wnzpp-worker-0:10399:11254 [5] NCCL INFO Trees [0] 6/-1/-1->5->4 [1] 6/-1/-1->5->4 [2] 6/-1/-1->5->4 [3] 6/-1/-1->5->4 [4] 6/-1/-1->5->4 [5] 6/-1/-1->5->4 [6] 6/-1/-1->5->4 [7] 6/-1/-1->5->4 [8] 6/-1/-1->5->4 [9] 6/-1/-1->5->4 [10] 6/-1/-1->5->4 [11] 6/-1/-1->5->4 [12] 6/-1/-1->5->4 [13] 6/-1/-1->5->4 [14] 6/-1/-1->5->4 [15] 6/-1/-1->5->4 [16] 6/-1/-1->5->4 [17] 6/-1/-1->5->4 [18] 6/-1/-1->5->4 [19] 6/-1/-1->5->4 [20] 6/-1/-1->5->4 [21] 6/-1/-1->5->4 [22] 6/-1/-1->5->4 [23] 6/-1/-1->5->4
|
| 202 |
+
t-20260518193619-wnzpp-worker-0:10396:11258 [2] NCCL INFO P2P Chunksize set to 524288
|
| 203 |
+
t-20260518193619-wnzpp-worker-0:10394:11252 [0] NCCL INFO Channel 13/24 : 0 1 2 3 4 5 6 7
|
| 204 |
+
t-20260518193619-wnzpp-worker-0:10394:11252 [0] NCCL INFO Channel 14/24 : 0 1 2 3 4 5 6 7
|
| 205 |
+
t-20260518193619-wnzpp-worker-0:10399:11254 [5] NCCL INFO P2P Chunksize set to 524288
|
| 206 |
+
t-20260518193619-wnzpp-worker-0:10394:11252 [0] NCCL INFO Channel 15/24 : 0 1 2 3 4 5 6 7
|
| 207 |
+
t-20260518193619-wnzpp-worker-0:10394:11252 [0] NCCL INFO Channel 16/24 : 0 1 2 3 4 5 6 7
|
| 208 |
+
t-20260518193619-wnzpp-worker-0:10394:11252 [0] NCCL INFO Channel 17/24 : 0 1 2 3 4 5 6 7
|
| 209 |
+
t-20260518193619-wnzpp-worker-0:10394:11252 [0] NCCL INFO Channel 18/24 : 0 1 2 3 4 5 6 7
|
| 210 |
+
t-20260518193619-wnzpp-worker-0:10394:11252 [0] NCCL INFO Channel 19/24 : 0 1 2 3 4 5 6 7
|
| 211 |
+
t-20260518193619-wnzpp-worker-0:10394:11252 [0] NCCL INFO Channel 20/24 : 0 1 2 3 4 5 6 7
|
| 212 |
+
t-20260518193619-wnzpp-worker-0:10394:11252 [0] NCCL INFO Channel 21/24 : 0 1 2 3 4 5 6 7
|
| 213 |
+
t-20260518193619-wnzpp-worker-0:10394:11252 [0] NCCL INFO Channel 22/24 : 0 1 2 3 4 5 6 7
|
| 214 |
+
t-20260518193619-wnzpp-worker-0:10394:11252 [0] NCCL INFO Channel 23/24 : 0 1 2 3 4 5 6 7
|
| 215 |
+
t-20260518193619-wnzpp-worker-0:10394:11252 [0] NCCL INFO Trees [0] 1/-1/-1->0->-1 [1] 1/-1/-1->0->-1 [2] 1/-1/-1->0->-1 [3] 1/-1/-1->0->-1 [4] 1/-1/-1->0->-1 [5] 1/-1/-1->0->-1 [6] 1/-1/-1->0->-1 [7] 1/-1/-1->0->-1 [8] 1/-1/-1->0->-1 [9] 1/-1/-1->0->-1 [10] 1/-1/-1->0->-1 [11] 1/-1/-1->0->-1 [12] 1/-1/-1->0->-1 [13] 1/-1/-1->0->-1 [14] 1/-1/-1->0->-1 [15] 1/-1/-1->0->-1 [16] 1/-1/-1->0->-1 [17] 1/-1/-1->0->-1 [18] 1/-1/-1->0->-1 [19] 1/-1/-1->0->-1 [20] 1/-1/-1->0->-1 [21] 1/-1/-1->0->-1 [22] 1/-1/-1->0->-1 [23] 1/-1/-1->0->-1
|
| 216 |
+
t-20260518193619-wnzpp-worker-0:10394:11252 [0] NCCL INFO P2P Chunksize set to 524288
|
| 217 |
+
t-20260518193619-wnzpp-worker-0:10398:11355 [4] NCCL INFO [Proxy Service] Device 4 CPU core 96
|
| 218 |
+
t-20260518193619-wnzpp-worker-0:10400:11357 [6] NCCL INFO [Proxy Service] Device 6 CPU core 112
|
| 219 |
+
t-20260518193619-wnzpp-worker-0:10401:11359 [7] NCCL INFO [Proxy Service] Device 7 CPU core 116
|
| 220 |
+
t-20260518193619-wnzpp-worker-0:10400:11358 [6] NCCL INFO [Proxy Service UDS] Device 6 CPU core 114
|
| 221 |
+
t-20260518193619-wnzpp-worker-0:10401:11360 [7] NCCL INFO [Proxy Service UDS] Device 7 CPU core 120
|
| 222 |
+
t-20260518193619-wnzpp-worker-0:10398:11356 [4] NCCL INFO [Proxy Service UDS] Device 4 CPU core 98
|
| 223 |
+
t-20260518193619-wnzpp-worker-0:10397:11361 [3] NCCL INFO [Proxy Service] Device 3 CPU core 64
|
| 224 |
+
t-20260518193619-wnzpp-worker-0:10397:11362 [3] NCCL INFO [Proxy Service UDS] Device 3 CPU core 66
|
| 225 |
+
t-20260518193619-wnzpp-worker-0:10396:11363 [2] NCCL INFO [Proxy Service] Device 2 CPU core 2
|
| 226 |
+
t-20260518193619-wnzpp-worker-0:10396:11364 [2] NCCL INFO [Proxy Service UDS] Device 2 CPU core 4
|
| 227 |
+
t-20260518193619-wnzpp-worker-0:10395:11365 [1] NCCL INFO [Proxy Service] Device 1 CPU core 2
|
| 228 |
+
t-20260518193619-wnzpp-worker-0:10399:11367 [5] NCCL INFO [Proxy Service] Device 5 CPU core 138
|
| 229 |
+
t-20260518193619-wnzpp-worker-0:10395:11366 [1] NCCL INFO [Proxy Service UDS] Device 1 CPU core 4
|
| 230 |
+
t-20260518193619-wnzpp-worker-0:10399:11368 [5] NCCL INFO [Proxy Service UDS] Device 5 CPU core 140
|
| 231 |
+
t-20260518193619-wnzpp-worker-0:10394:11252 [0] NCCL INFO Check P2P Type intraNodeP2pSupport 1 directMode 0
|
| 232 |
+
t-20260518193619-wnzpp-worker-0:10394:11369 [0] NCCL INFO [Proxy Service] Device 0 CPU core 89
|
| 233 |
+
t-20260518193619-wnzpp-worker-0:10394:11370 [0] NCCL INFO [Proxy Service UDS] Device 0 CPU core 2
|
| 234 |
+
t-20260518193619-wnzpp-worker-0:10395:11329 [1] NCCL INFO threadThresholds 8/8/64 | 64/8/64 | 512 | 512
|
| 235 |
+
t-20260518193619-wnzpp-worker-0:10395:11329 [1] NCCL INFO 24 coll channels, 24 collnet channels, 16 nvls channels, 32 p2p channels, 32 p2p channels per peer
|
| 236 |
+
t-20260518193619-wnzpp-worker-0:10396:11258 [2] NCCL INFO threadThresholds 8/8/64 | 64/8/64 | 512 | 512
|
| 237 |
+
t-20260518193619-wnzpp-worker-0:10396:11258 [2] NCCL INFO 24 coll channels, 24 collnet channels, 16 nvls channels, 32 p2p channels, 32 p2p channels per peer
|
| 238 |
+
t-20260518193619-wnzpp-worker-0:10397:11255 [3] NCCL INFO threadThresholds 8/8/64 | 64/8/64 | 512 | 512
|
| 239 |
+
t-20260518193619-wnzpp-worker-0:10397:11255 [3] NCCL INFO 24 coll channels, 24 collnet channels, 16 nvls channels, 32 p2p channels, 32 p2p channels per peer
|
| 240 |
+
t-20260518193619-wnzpp-worker-0:10398:11257 [4] NCCL INFO threadThresholds 8/8/64 | 64/8/64 | 512 | 512
|
| 241 |
+
t-20260518193619-wnzpp-worker-0:10398:11257 [4] NCCL INFO 24 coll channels, 24 collnet channels, 16 nvls channels, 32 p2p channels, 32 p2p channels per peer
|
| 242 |
+
t-20260518193619-wnzpp-worker-0:10399:11254 [5] NCCL INFO threadThresholds 8/8/64 | 64/8/64 | 512 | 512
|
| 243 |
+
t-20260518193619-wnzpp-worker-0:10399:11254 [5] NCCL INFO 24 coll channels, 24 collnet channels, 16 nvls channels, 32 p2p channels, 32 p2p channels per peer
|
| 244 |
+
t-20260518193619-wnzpp-worker-0:10400:11253 [6] NCCL INFO threadThresholds 8/8/64 | 64/8/64 | 512 | 512
|
| 245 |
+
t-20260518193619-wnzpp-worker-0:10400:11253 [6] NCCL INFO 24 coll channels, 24 collnet channels, 16 nvls channels, 32 p2p channels, 32 p2p channels per peer
|
| 246 |
+
t-20260518193619-wnzpp-worker-0:10401:11256 [7] NCCL INFO threadThresholds 8/8/64 | 64/8/64 | 512 | 512
|
| 247 |
+
t-20260518193619-wnzpp-worker-0:10401:11256 [7] NCCL INFO 24 coll channels, 24 collnet channels, 16 nvls channels, 32 p2p channels, 32 p2p channels per peer
|
| 248 |
+
t-20260518193619-wnzpp-worker-0:10394:11252 [0] NCCL INFO threadThresholds 8/8/64 | 64/8/64 | 512 | 512
|
| 249 |
+
t-20260518193619-wnzpp-worker-0:10394:11252 [0] NCCL INFO 24 coll channels, 24 collnet channels, 16 nvls channels, 32 p2p channels, 32 p2p channels per peer
|
| 250 |
+
t-20260518193619-wnzpp-worker-0:10394:11252 [0] NCCL INFO CC Off, workFifoBytes 1048576
|
| 251 |
+
t-20260518193619-wnzpp-worker-0:10399:11254 [5] NCCL INFO TUNER/Plugin: Failed to find ncclTunerPlugin_v4 symbol.
|
| 252 |
+
t-20260518193619-wnzpp-worker-0:10400:11253 [6] NCCL INFO TUNER/Plugin: Failed to find ncclTunerPlugin_v4 symbol.
|
| 253 |
+
t-20260518193619-wnzpp-worker-0:10401:11256 [7] NCCL INFO TUNER/Plugin: Failed to find ncclTunerPlugin_v4 symbol.
|
| 254 |
+
t-20260518193619-wnzpp-worker-0:10398:11257 [4] NCCL INFO TUNER/Plugin: Failed to find ncclTunerPlugin_v4 symbol.
|
| 255 |
+
t-20260518193619-wnzpp-worker-0:10399:11254 [5] NCCL INFO TUNER/Plugin: Failed to find ncclTunerPlugin_v3 symbol.
|
| 256 |
+
t-20260518193619-wnzpp-worker-0:10397:11255 [3] NCCL INFO TUNER/Plugin: Failed to find ncclTunerPlugin_v4 symbol.
|
| 257 |
+
t-20260518193619-wnzpp-worker-0:10400:11253 [6] NCCL INFO TUNER/Plugin: Failed to find ncclTunerPlugin_v3 symbol.
|
| 258 |
+
t-20260518193619-wnzpp-worker-0:10401:11256 [7] NCCL INFO TUNER/Plugin: Failed to find ncclTunerPlugin_v3 symbol.
|
| 259 |
+
t-20260518193619-wnzpp-worker-0:10398:11257 [4] NCCL INFO TUNER/Plugin: Failed to find ncclTunerPlugin_v3 symbol.
|
| 260 |
+
t-20260518193619-wnzpp-worker-0:10395:11329 [1] NCCL INFO TUNER/Plugin: Failed to find ncclTunerPlugin_v4 symbol.
|
| 261 |
+
t-20260518193619-wnzpp-worker-0:10394:11252 [0] NCCL INFO TUNER/Plugin: Failed to find ncclTunerPlugin_v4 symbol.
|
| 262 |
+
t-20260518193619-wnzpp-worker-0:10399:11254 [5] NCCL INFO TUNER/Plugin: Failed to find ncclTunerPlugin_v2 symbol, using internal tuner instead.
|
| 263 |
+
t-20260518193619-wnzpp-worker-0:10396:11258 [2] NCCL INFO TUNER/Plugin: Failed to find ncclTunerPlugin_v4 symbol.
|
| 264 |
+
t-20260518193619-wnzpp-worker-0:10397:11255 [3] NCCL INFO TUNER/Plugin: Failed to find ncclTunerPlugin_v3 symbol.
|
| 265 |
+
t-20260518193619-wnzpp-worker-0:10400:11253 [6] NCCL INFO TUNER/Plugin: Failed to find ncclTunerPlugin_v2 symbol, using internal tuner instead.
|
| 266 |
+
t-20260518193619-wnzpp-worker-0:10401:11256 [7] NCCL INFO TUNER/Plugin: Failed to find ncclTunerPlugin_v2 symbol, using internal tuner instead.
|
| 267 |
+
t-20260518193619-wnzpp-worker-0:10396:11258 [2] NCCL INFO TUNER/Plugin: Failed to find ncclTunerPlugin_v3 symbol.
|
| 268 |
+
t-20260518193619-wnzpp-worker-0:10397:11255 [3] NCCL INFO TUNER/Plugin: Failed to find ncclTunerPlugin_v2 symbol, using internal tuner instead.
|
| 269 |
+
t-20260518193619-wnzpp-worker-0:10398:11257 [4] NCCL INFO TUNER/Plugin: Failed to find ncclTunerPlugin_v2 symbol, using internal tuner instead.
|
| 270 |
+
t-20260518193619-wnzpp-worker-0:10400:11253 [6] NCCL INFO ncclCommInitRankConfig comm 0xd982790 rank 6 nranks 8 cudaDev 6 nvmlDev 6 busId 73020 commId 0x2b45641809baa172 - Init COMPLETE
|
| 271 |
+
t-20260518193619-wnzpp-worker-0:10401:11256 [7] NCCL INFO ncclCommInitRankConfig comm 0xd4cf3e0 rank 7 nranks 8 cudaDev 7 nvmlDev 7 busId 75020 commId 0x2b45641809baa172 - Init COMPLETE
|
| 272 |
+
t-20260518193619-wnzpp-worker-0:10397:11255 [3] NCCL INFO ncclCommInitRankConfig comm 0xec825c0 rank 3 nranks 8 cudaDev 3 nvmlDev 3 busId 6b020 commId 0x2b45641809baa172 - Init COMPLETE
|
| 273 |
+
t-20260518193619-wnzpp-worker-0:10395:11329 [1] NCCL INFO TUNER/Plugin: Failed to find ncclTunerPlugin_v3 symbol.
|
| 274 |
+
t-20260518193619-wnzpp-worker-0:10394:11252 [0] NCCL INFO TUNER/Plugin: Failed to find ncclTunerPlugin_v3 symbol.
|
| 275 |
+
t-20260518193619-wnzpp-worker-0:10399:11254 [5] NCCL INFO ncclCommInitRankConfig comm 0xe49fe20 rank 5 nranks 8 cudaDev 5 nvmlDev 5 busId 71020 commId 0x2b45641809baa172 - Init COMPLETE
|
| 276 |
+
t-20260518193619-wnzpp-worker-0:10396:11258 [2] NCCL INFO TUNER/Plugin: Failed to find ncclTunerPlugin_v2 symbol, using internal tuner instead.
|
| 277 |
+
t-20260518193619-wnzpp-worker-0:10397:11255 [3] NCCL INFO Init timings - ncclCommInitRankConfig: rank 3 nranks 8 total 2.36 (kernels 0.20, alloc 0.87, bootstrap 0.46, allgathers 0.01, topo 0.52, graphs 0.01, connections 0.28, rest 0.02)
|
| 278 |
+
t-20260518193619-wnzpp-worker-0:10398:11257 [4] NCCL INFO ncclCommInitRankConfig comm 0xeb6d1a0 rank 4 nranks 8 cudaDev 4 nvmlDev 4 busId 6f020 commId 0x2b45641809baa172 - Init COMPLETE
|
| 279 |
+
t-20260518193619-wnzpp-worker-0:10400:11253 [6] NCCL INFO Init timings - ncclCommInitRankConfig: rank 6 nranks 8 total 2.36 (kernels 0.20, alloc 0.81, bootstrap 0.52, allgathers 0.01, topo 0.52, graphs 0.01, connections 0.28, rest 0.02)
|
| 280 |
+
t-20260518193619-wnzpp-worker-0:10401:11256 [7] NCCL INFO Init timings - ncclCommInitRankConfig: rank 7 nranks 8 total 2.36 (kernels 0.20, alloc 0.86, bootstrap 0.46, allgathers 0.01, topo 0.52, graphs 0.01, connections 0.28, rest 0.02)
|
| 281 |
+
t-20260518193619-wnzpp-worker-0:10395:11329 [1] NCCL INFO TUNER/Plugin: Failed to find ncclTunerPlugin_v2 symbol, using internal tuner instead.
|
| 282 |
+
t-20260518193619-wnzpp-worker-0:10394:11252 [0] NCCL INFO TUNER/Plugin: Failed to find ncclTunerPlugin_v2 symbol, using internal tuner instead.
|
| 283 |
+
t-20260518193619-wnzpp-worker-0:10396:11258 [2] NCCL INFO ncclCommInitRankConfig comm 0x722c5210 rank 2 nranks 8 cudaDev 2 nvmlDev 2 busId 69020 commId 0x2b45641809baa172 - Init COMPLETE
|
| 284 |
+
t-20260518193619-wnzpp-worker-0:10399:11254 [5] NCCL INFO Init timings - ncclCommInitRankConfig: rank 5 nranks 8 total 2.36 (kernels 0.20, alloc 0.87, bootstrap 0.46, allgathers 0.01, topo 0.52, graphs 0.01, connections 0.27, rest 0.03)
|
| 285 |
+
t-20260518193619-wnzpp-worker-0:10395:11329 [1] NCCL INFO ncclCommInitRankConfig comm 0xd3cb0a0 rank 1 nranks 8 cudaDev 1 nvmlDev 1 busId 67020 commId 0x2b45641809baa172 - Init COMPLETE
|
| 286 |
+
t-20260518193619-wnzpp-worker-0:10398:11257 [4] NCCL INFO Init timings - ncclCommInitRankConfig: rank 4 nranks 8 total 2.36 (kernels 0.20, alloc 0.86, bootstrap 0.46, allgathers 0.01, topo 0.52, graphs 0.01, connections 0.28, rest 0.02)
|
| 287 |
+
t-20260518193619-wnzpp-worker-0:10394:11252 [0] NCCL INFO ncclCommInitRankConfig comm 0xd9dc180 rank 0 nranks 8 cudaDev 0 nvmlDev 0 busId 65040 commId 0x2b45641809baa172 - Init COMPLETE
|
| 288 |
+
t-20260518193619-wnzpp-worker-0:10396:11258 [2] NCCL INFO Init timings - ncclCommInitRankConfig: rank 2 nranks 8 total 2.28 (kernels 0.47, alloc 0.61, bootstrap 0.37, allgathers 0.00, topo 0.52, graphs 0.01, connections 0.27, rest 0.03)
|
| 289 |
+
t-20260518193619-wnzpp-worker-0:10395:11329 [1] NCCL INFO Init timings - ncclCommInitRankConfig: rank 1 nranks 8 total 1.20 (kernels 0.18, alloc 0.18, bootstrap 0.01, allgathers 0.00, topo 0.52, graphs 0.01, connections 0.27, rest 0.03)
|
| 290 |
+
t-20260518193619-wnzpp-worker-0:10394:11252 [0] NCCL INFO Init timings - ncclCommInitRankConfig: rank 0 nranks 8 total 2.37 (kernels 0.20, alloc 0.82, bootstrap 0.52, allgathers 0.01, topo 0.52, graphs 0.01, connections 0.27, rest 0.03)
|
| 291 |
+
t-20260518193619-wnzpp-worker-0:10401:11371 [7] NCCL INFO Channel 00/0 : 7[7] -> 0[0] via P2P/CUMEM
|
| 292 |
+
t-20260518193619-wnzpp-worker-0:10401:11371 [7] NCCL INFO Channel 01/0 : 7[7] -> 0[0] via P2P/CUMEM
|
| 293 |
+
t-20260518193619-wnzpp-worker-0:10400:11374 [6] NCCL INFO Channel 00/0 : 6[6] -> 7[7] via P2P/CUMEM
|
| 294 |
+
t-20260518193619-wnzpp-worker-0:10398:11372 [4] NCCL INFO Channel 00/0 : 4[4] -> 5[5] via P2P/CUMEM
|
| 295 |
+
t-20260518193619-wnzpp-worker-0:10401:11371 [7] NCCL INFO Channel 02/0 : 7[7] -> 0[0] via P2P/CUMEM
|
| 296 |
+
t-20260518193619-wnzpp-worker-0:10400:11374 [6] NCCL INFO Channel 01/0 : 6[6] -> 7[7] via P2P/CUMEM
|
| 297 |
+
t-20260518193619-wnzpp-worker-0:10398:11372 [4] NCCL INFO Channel 01/0 : 4[4] -> 5[5] via P2P/CUMEM
|
| 298 |
+
t-20260518193619-wnzpp-worker-0:10401:11371 [7] NCCL INFO Channel 03/0 : 7[7] -> 0[0] via P2P/CUMEM
|
| 299 |
+
t-20260518193619-wnzpp-worker-0:10400:11374 [6] NCCL INFO Channel 02/0 : 6[6] -> 7[7] via P2P/CUMEM
|
| 300 |
+
t-20260518193619-wnzpp-worker-0:10398:11372 [4] NCCL INFO Channel 02/0 : 4[4] -> 5[5] via P2P/CUMEM
|
| 301 |
+
t-20260518193619-wnzpp-worker-0:10401:11371 [7] NCCL INFO Channel 04/0 : 7[7] -> 0[0] via P2P/CUMEM
|
| 302 |
+
t-20260518193619-wnzpp-worker-0:10400:11374 [6] NCCL INFO Channel 03/0 : 6[6] -> 7[7] via P2P/CUMEM
|
| 303 |
+
t-20260518193619-wnzpp-worker-0:10398:11372 [4] NCCL INFO Channel 03/0 : 4[4] -> 5[5] via P2P/CUMEM
|
| 304 |
+
t-20260518193619-wnzpp-worker-0:10401:11371 [7] NCCL INFO Channel 05/0 : 7[7] -> 0[0] via P2P/CUMEM
|
| 305 |
+
t-20260518193619-wnzpp-worker-0:10400:11374 [6] NCCL INFO Channel 04/0 : 6[6] -> 7[7] via P2P/CUMEM
|
| 306 |
+
t-20260518193619-wnzpp-worker-0:10398:11372 [4] NCCL INFO Channel 04/0 : 4[4] -> 5[5] via P2P/CUMEM
|
| 307 |
+
t-20260518193619-wnzpp-worker-0:10401:11371 [7] NCCL INFO Channel 06/0 : 7[7] -> 0[0] via P2P/CUMEM
|
| 308 |
+
t-20260518193619-wnzpp-worker-0:10400:11374 [6] NCCL INFO Channel 05/0 : 6[6] -> 7[7] via P2P/CUMEM
|
| 309 |
+
t-20260518193619-wnzpp-worker-0:10398:11372 [4] NCCL INFO Channel 05/0 : 4[4] -> 5[5] via P2P/CUMEM
|
| 310 |
+
t-20260518193619-wnzpp-worker-0:10401:11371 [7] NCCL INFO Channel 07/0 : 7[7] -> 0[0] via P2P/CUMEM
|
| 311 |
+
t-20260518193619-wnzpp-worker-0:10400:11374 [6] NCCL INFO Channel 06/0 : 6[6] -> 7[7] via P2P/CUMEM
|
| 312 |
+
t-20260518193619-wnzpp-worker-0:10398:11372 [4] NCCL INFO Channel 06/0 : 4[4] -> 5[5] via P2P/CUMEM
|
| 313 |
+
t-20260518193619-wnzpp-worker-0:10401:11371 [7] NCCL INFO Channel 08/0 : 7[7] -> 0[0] via P2P/CUMEM
|
| 314 |
+
t-20260518193619-wnzpp-worker-0:10400:11374 [6] NCCL INFO Channel 07/0 : 6[6] -> 7[7] via P2P/CUMEM
|
| 315 |
+
t-20260518193619-wnzpp-worker-0:10398:11372 [4] NCCL INFO Channel 07/0 : 4[4] -> 5[5] via P2P/CUMEM
|
| 316 |
+
t-20260518193619-wnzpp-worker-0:10401:11371 [7] NCCL INFO Channel 09/0 : 7[7] -> 0[0] via P2P/CUMEM
|
| 317 |
+
t-20260518193619-wnzpp-worker-0:10400:11374 [6] NCCL INFO Channel 08/0 : 6[6] -> 7[7] via P2P/CUMEM
|
| 318 |
+
t-20260518193619-wnzpp-worker-0:10398:11372 [4] NCCL INFO Channel 08/0 : 4[4] -> 5[5] via P2P/CUMEM
|
| 319 |
+
t-20260518193619-wnzpp-worker-0:10396:11373 [2] NCCL INFO Channel 00/0 : 2[2] -> 3[3] via P2P/CUMEM
|
| 320 |
+
t-20260518193619-wnzpp-worker-0:10401:11371 [7] NCCL INFO Channel 10/0 : 7[7] -> 0[0] via P2P/CUMEM
|
| 321 |
+
t-20260518193619-wnzpp-worker-0:10400:11374 [6] NCCL INFO Channel 09/0 : 6[6] -> 7[7] via P2P/CUMEM
|
| 322 |
+
t-20260518193619-wnzpp-worker-0:10398:11372 [4] NCCL INFO Channel 09/0 : 4[4] -> 5[5] via P2P/CUMEM
|
| 323 |
+
t-20260518193619-wnzpp-worker-0:10396:11373 [2] NCCL INFO Channel 01/0 : 2[2] -> 3[3] via P2P/CUMEM
|
| 324 |
+
t-20260518193619-wnzpp-worker-0:10401:11371 [7] NCCL INFO Channel 11/0 : 7[7] -> 0[0] via P2P/CUMEM
|
| 325 |
+
t-20260518193619-wnzpp-worker-0:10400:11374 [6] NCCL INFO Channel 10/0 : 6[6] -> 7[7] via P2P/CUMEM
|
| 326 |
+
t-20260518193619-wnzpp-worker-0:10399:11375 [5] NCCL INFO Channel 00/0 : 5[5] -> 6[6] via P2P/CUMEM
|
| 327 |
+
t-20260518193619-wnzpp-worker-0:10398:11372 [4] NCCL INFO Channel 10/0 : 4[4] -> 5[5] via P2P/CUMEM
|
| 328 |
+
t-20260518193619-wnzpp-worker-0:10396:11373 [2] NCCL INFO Channel 02/0 : 2[2] -> 3[3] via P2P/CUMEM
|
| 329 |
+
t-20260518193619-wnzpp-worker-0:10401:11371 [7] NCCL INFO Channel 12/0 : 7[7] -> 0[0] via P2P/CUMEM
|
| 330 |
+
t-20260518193619-wnzpp-worker-0:10400:11374 [6] NCCL INFO Channel 11/0 : 6[6] -> 7[7] via P2P/CUMEM
|
| 331 |
+
t-20260518193619-wnzpp-worker-0:10399:11375 [5] NCCL INFO Channel 01/0 : 5[5] -> 6[6] via P2P/CUMEM
|
| 332 |
+
t-20260518193619-wnzpp-worker-0:10398:11372 [4] NCCL INFO Channel 11/0 : 4[4] -> 5[5] via P2P/CUMEM
|
| 333 |
+
t-20260518193619-wnzpp-worker-0:10396:11373 [2] NCCL INFO Channel 03/0 : 2[2] -> 3[3] via P2P/CUMEM
|
| 334 |
+
t-20260518193619-wnzpp-worker-0:10401:11371 [7] NCCL INFO Channel 13/0 : 7[7] -> 0[0] via P2P/CUMEM
|
| 335 |
+
t-20260518193619-wnzpp-worker-0:10400:11374 [6] NCCL INFO Channel 12/0 : 6[6] -> 7[7] via P2P/CUMEM
|
| 336 |
+
t-20260518193619-wnzpp-worker-0:10399:11375 [5] NCCL INFO Channel 02/0 : 5[5] -> 6[6] via P2P/CUMEM
|
| 337 |
+
t-20260518193619-wnzpp-worker-0:10398:11372 [4] NCCL INFO Channel 12/0 : 4[4] -> 5[5] via P2P/CUMEM
|
| 338 |
+
t-20260518193619-wnzpp-worker-0:10396:11373 [2] NCCL INFO Channel 04/0 : 2[2] -> 3[3] via P2P/CUMEM
|
| 339 |
+
t-20260518193619-wnzpp-worker-0:10401:11371 [7] NCCL INFO Channel 14/0 : 7[7] -> 0[0] via P2P/CUMEM
|
| 340 |
+
t-20260518193619-wnzpp-worker-0:10394:11378 [0] NCCL INFO Channel 00/0 : 0[0] -> 1[1] via P2P/CUMEM
|
| 341 |
+
t-20260518193619-wnzpp-worker-0:10400:11374 [6] NCCL INFO Channel 13/0 : 6[6] -> 7[7] via P2P/CUMEM
|
| 342 |
+
t-20260518193619-wnzpp-worker-0:10399:11375 [5] NCCL INFO Channel 03/0 : 5[5] -> 6[6] via P2P/CUMEM
|
| 343 |
+
t-20260518193619-wnzpp-worker-0:10398:11372 [4] NCCL INFO Channel 13/0 : 4[4] -> 5[5] via P2P/CUMEM
|
| 344 |
+
t-20260518193619-wnzpp-worker-0:10396:11373 [2] NCCL INFO Channel 05/0 : 2[2] -> 3[3] via P2P/CUMEM
|
| 345 |
+
t-20260518193619-wnzpp-worker-0:10401:11371 [7] NCCL INFO Channel 15/0 : 7[7] -> 0[0] via P2P/CUMEM
|
| 346 |
+
t-20260518193619-wnzpp-worker-0:10394:11378 [0] NCCL INFO Channel 01/0 : 0[0] -> 1[1] via P2P/CUMEM
|
| 347 |
+
t-20260518193619-wnzpp-worker-0:10400:11374 [6] NCCL INFO Channel 14/0 : 6[6] -> 7[7] via P2P/CUMEM
|
| 348 |
+
t-20260518193619-wnzpp-worker-0:10399:11375 [5] NCCL INFO Channel 04/0 : 5[5] -> 6[6] via P2P/CUMEM
|
| 349 |
+
t-20260518193619-wnzpp-worker-0:10398:11372 [4] NCCL INFO Channel 14/0 : 4[4] -> 5[5] via P2P/CUMEM
|
| 350 |
+
t-20260518193619-wnzpp-worker-0:10396:11373 [2] NCCL INFO Channel 06/0 : 2[2] -> 3[3] via P2P/CUMEM
|
| 351 |
+
t-20260518193619-wnzpp-worker-0:10395:11376 [1] NCCL INFO Channel 00/0 : 1[1] -> 2[2] via P2P/CUMEM
|
| 352 |
+
t-20260518193619-wnzpp-worker-0:10401:11371 [7] NCCL INFO Channel 16/0 : 7[7] -> 0[0] via P2P/CUMEM
|
| 353 |
+
t-20260518193619-wnzpp-worker-0:10394:11378 [0] NCCL INFO Channel 02/0 : 0[0] -> 1[1] via P2P/CUMEM
|
| 354 |
+
t-20260518193619-wnzpp-worker-0:10400:11374 [6] NCCL INFO Channel 15/0 : 6[6] -> 7[7] via P2P/CUMEM
|
| 355 |
+
t-20260518193619-wnzpp-worker-0:10399:11375 [5] NCCL INFO Channel 05/0 : 5[5] -> 6[6] via P2P/CUMEM
|
| 356 |
+
t-20260518193619-wnzpp-worker-0:10398:11372 [4] NCCL INFO Channel 15/0 : 4[4] -> 5[5] via P2P/CUMEM
|
| 357 |
+
t-20260518193619-wnzpp-worker-0:10396:11373 [2] NCCL INFO Channel 07/0 : 2[2] -> 3[3] via P2P/CUMEM
|
| 358 |
+
t-20260518193619-wnzpp-worker-0:10395:11376 [1] NCCL INFO Channel 01/0 : 1[1] -> 2[2] via P2P/CUMEM
|
| 359 |
+
t-20260518193619-wnzpp-worker-0:10401:11371 [7] NCCL INFO Channel 17/0 : 7[7] -> 0[0] via P2P/CUMEM
|
| 360 |
+
t-20260518193619-wnzpp-worker-0:10394:11378 [0] NCCL INFO Channel 03/0 : 0[0] -> 1[1] via P2P/CUMEM
|
| 361 |
+
t-20260518193619-wnzpp-worker-0:10400:11374 [6] NCCL INFO Channel 16/0 : 6[6] -> 7[7] via P2P/CUMEM
|
| 362 |
+
t-20260518193619-wnzpp-worker-0:10399:11375 [5] NCCL INFO Channel 06/0 : 5[5] -> 6[6] via P2P/CUMEM
|
| 363 |
+
t-20260518193619-wnzpp-worker-0:10398:11372 [4] NCCL INFO Channel 16/0 : 4[4] -> 5[5] via P2P/CUMEM
|
| 364 |
+
t-20260518193619-wnzpp-worker-0:10396:11373 [2] NCCL INFO Channel 08/0 : 2[2] -> 3[3] via P2P/CUMEM
|
| 365 |
+
t-20260518193619-wnzpp-worker-0:10395:11376 [1] NCCL INFO Channel 02/0 : 1[1] -> 2[2] via P2P/CUMEM
|
| 366 |
+
t-20260518193619-wnzpp-worker-0:10401:11371 [7] NCCL INFO Channel 18/0 : 7[7] -> 0[0] via P2P/CUMEM
|
| 367 |
+
t-20260518193619-wnzpp-worker-0:10397:11377 [3] NCCL INFO Channel 00/0 : 3[3] -> 4[4] via P2P/CUMEM
|
| 368 |
+
t-20260518193619-wnzpp-worker-0:10394:11378 [0] NCCL INFO Channel 04/0 : 0[0] -> 1[1] via P2P/CUMEM
|
| 369 |
+
t-20260518193619-wnzpp-worker-0:10400:11374 [6] NCCL INFO Channel 17/0 : 6[6] -> 7[7] via P2P/CUMEM
|
| 370 |
+
t-20260518193619-wnzpp-worker-0:10399:11375 [5] NCCL INFO Channel 07/0 : 5[5] -> 6[6] via P2P/CUMEM
|
| 371 |
+
t-20260518193619-wnzpp-worker-0:10398:11372 [4] NCCL INFO Channel 17/0 : 4[4] -> 5[5] via P2P/CUMEM
|
| 372 |
+
t-20260518193619-wnzpp-worker-0:10396:11373 [2] NCCL INFO Channel 09/0 : 2[2] -> 3[3] via P2P/CUMEM
|
| 373 |
+
t-20260518193619-wnzpp-worker-0:10395:11376 [1] NCCL INFO Channel 03/0 : 1[1] -> 2[2] via P2P/CUMEM
|
| 374 |
+
t-20260518193619-wnzpp-worker-0:10401:11371 [7] NCCL INFO Channel 19/0 : 7[7] -> 0[0] via P2P/CUMEM
|
| 375 |
+
t-20260518193619-wnzpp-worker-0:10397:11377 [3] NCCL INFO Channel 01/0 : 3[3] -> 4[4] via P2P/CUMEM
|
| 376 |
+
t-20260518193619-wnzpp-worker-0:10394:11378 [0] NCCL INFO Channel 05/0 : 0[0] -> 1[1] via P2P/CUMEM
|
| 377 |
+
t-20260518193619-wnzpp-worker-0:10400:11374 [6] NCCL INFO Channel 18/0 : 6[6] -> 7[7] via P2P/CUMEM
|
| 378 |
+
t-20260518193619-wnzpp-worker-0:10399:11375 [5] NCCL INFO Channel 08/0 : 5[5] -> 6[6] via P2P/CUMEM
|
| 379 |
+
t-20260518193619-wnzpp-worker-0:10398:11372 [4] NCCL INFO Channel 18/0 : 4[4] -> 5[5] via P2P/CUMEM
|
| 380 |
+
t-20260518193619-wnzpp-worker-0:10396:11373 [2] NCCL INFO Channel 10/0 : 2[2] -> 3[3] via P2P/CUMEM
|
| 381 |
+
t-20260518193619-wnzpp-worker-0:10395:11376 [1] NCCL INFO Channel 04/0 : 1[1] -> 2[2] via P2P/CUMEM
|
| 382 |
+
t-20260518193619-wnzpp-worker-0:10401:11371 [7] NCCL INFO Channel 20/0 : 7[7] -> 0[0] via P2P/CUMEM
|
| 383 |
+
t-20260518193619-wnzpp-worker-0:10397:11377 [3] NCCL INFO Channel 02/0 : 3[3] -> 4[4] via P2P/CUMEM
|
| 384 |
+
t-20260518193619-wnzpp-worker-0:10394:11378 [0] NCCL INFO Channel 06/0 : 0[0] -> 1[1] via P2P/CUMEM
|
| 385 |
+
t-20260518193619-wnzpp-worker-0:10400:11374 [6] NCCL INFO Channel 19/0 : 6[6] -> 7[7] via P2P/CUMEM
|
| 386 |
+
t-20260518193619-wnzpp-worker-0:10399:11375 [5] NCCL INFO Channel 09/0 : 5[5] -> 6[6] via P2P/CUMEM
|
| 387 |
+
t-20260518193619-wnzpp-worker-0:10398:11372 [4] NCCL INFO Channel 19/0 : 4[4] -> 5[5] via P2P/CUMEM
|
| 388 |
+
t-20260518193619-wnzpp-worker-0:10396:11373 [2] NCCL INFO Channel 11/0 : 2[2] -> 3[3] via P2P/CUMEM
|
| 389 |
+
t-20260518193619-wnzpp-worker-0:10395:11376 [1] NCCL INFO Channel 05/0 : 1[1] -> 2[2] via P2P/CUMEM
|
| 390 |
+
t-20260518193619-wnzpp-worker-0:10401:11371 [7] NCCL INFO Channel 21/0 : 7[7] -> 0[0] via P2P/CUMEM
|
| 391 |
+
t-20260518193619-wnzpp-worker-0:10397:11377 [3] NCCL INFO Channel 03/0 : 3[3] -> 4[4] via P2P/CUMEM
|
| 392 |
+
t-20260518193619-wnzpp-worker-0:10394:11378 [0] NCCL INFO Channel 07/0 : 0[0] -> 1[1] via P2P/CUMEM
|
| 393 |
+
t-20260518193619-wnzpp-worker-0:10400:11374 [6] NCCL INFO Channel 20/0 : 6[6] -> 7[7] via P2P/CUMEM
|
| 394 |
+
t-20260518193619-wnzpp-worker-0:10399:11375 [5] NCCL INFO Channel 10/0 : 5[5] -> 6[6] via P2P/CUMEM
|
| 395 |
+
t-20260518193619-wnzpp-worker-0:10398:11372 [4] NCCL INFO Channel 20/0 : 4[4] -> 5[5] via P2P/CUMEM
|
| 396 |
+
t-20260518193619-wnzpp-worker-0:10396:11373 [2] NCCL INFO Channel 12/0 : 2[2] -> 3[3] via P2P/CUMEM
|
| 397 |
+
t-20260518193619-wnzpp-worker-0:10395:11376 [1] NCCL INFO Channel 06/0 : 1[1] -> 2[2] via P2P/CUMEM
|
| 398 |
+
t-20260518193619-wnzpp-worker-0:10401:11371 [7] NCCL INFO Channel 22/0 : 7[7] -> 0[0] via P2P/CUMEM
|
| 399 |
+
t-20260518193619-wnzpp-worker-0:10397:11377 [3] NCCL INFO Channel 04/0 : 3[3] -> 4[4] via P2P/CUMEM
|
| 400 |
+
t-20260518193619-wnzpp-worker-0:10394:11378 [0] NCCL INFO Channel 08/0 : 0[0] -> 1[1] via P2P/CUMEM
|
| 401 |
+
t-20260518193619-wnzpp-worker-0:10400:11374 [6] NCCL INFO Channel 21/0 : 6[6] -> 7[7] via P2P/CUMEM
|
| 402 |
+
t-20260518193619-wnzpp-worker-0:10399:11375 [5] NCCL INFO Channel 11/0 : 5[5] -> 6[6] via P2P/CUMEM
|
| 403 |
+
t-20260518193619-wnzpp-worker-0:10398:11372 [4] NCCL INFO Channel 21/0 : 4[4] -> 5[5] via P2P/CUMEM
|
| 404 |
+
t-20260518193619-wnzpp-worker-0:10396:11373 [2] NCCL INFO Channel 13/0 : 2[2] -> 3[3] via P2P/CUMEM
|
| 405 |
+
t-20260518193619-wnzpp-worker-0:10395:11376 [1] NCCL INFO Channel 07/0 : 1[1] -> 2[2] via P2P/CUMEM
|
| 406 |
+
t-20260518193619-wnzpp-worker-0:10401:11371 [7] NCCL INFO Channel 23/0 : 7[7] -> 0[0] via P2P/CUMEM
|
| 407 |
+
t-20260518193619-wnzpp-worker-0:10397:11377 [3] NCCL INFO Channel 05/0 : 3[3] -> 4[4] via P2P/CUMEM
|
| 408 |
+
t-20260518193619-wnzpp-worker-0:10394:11378 [0] NCCL INFO Channel 09/0 : 0[0] -> 1[1] via P2P/CUMEM
|
| 409 |
+
t-20260518193619-wnzpp-worker-0:10400:11374 [6] NCCL INFO Channel 22/0 : 6[6] -> 7[7] via P2P/CUMEM
|
| 410 |
+
t-20260518193619-wnzpp-worker-0:10399:11375 [5] NCCL INFO Channel 12/0 : 5[5] -> 6[6] via P2P/CUMEM
|
| 411 |
+
t-20260518193619-wnzpp-worker-0:10398:11372 [4] NCCL INFO Channel 22/0 : 4[4] -> 5[5] via P2P/CUMEM
|
| 412 |
+
t-20260518193619-wnzpp-worker-0:10396:11373 [2] NCCL INFO Channel 14/0 : 2[2] -> 3[3] via P2P/CUMEM
|
| 413 |
+
t-20260518193619-wnzpp-worker-0:10395:11376 [1] NCCL INFO Channel 08/0 : 1[1] -> 2[2] via P2P/CUMEM
|
| 414 |
+
t-20260518193619-wnzpp-worker-0:10394:11378 [0] NCCL INFO Channel 10/0 : 0[0] -> 1[1] via P2P/CUMEM
|
| 415 |
+
t-20260518193619-wnzpp-worker-0:10397:11377 [3] NCCL INFO Channel 06/0 : 3[3] -> 4[4] via P2P/CUMEM
|
| 416 |
+
t-20260518193619-wnzpp-worker-0:10400:11374 [6] NCCL INFO Channel 23/0 : 6[6] -> 7[7] via P2P/CUMEM
|
| 417 |
+
t-20260518193619-wnzpp-worker-0:10399:11375 [5] NCCL INFO Channel 13/0 : 5[5] -> 6[6] via P2P/CUMEM
|
| 418 |
+
t-20260518193619-wnzpp-worker-0:10398:11372 [4] NCCL INFO Channel 23/0 : 4[4] -> 5[5] via P2P/CUMEM
|
| 419 |
+
t-20260518193619-wnzpp-worker-0:10396:11373 [2] NCCL INFO Channel 15/0 : 2[2] -> 3[3] via P2P/CUMEM
|
| 420 |
+
t-20260518193619-wnzpp-worker-0:10395:11376 [1] NCCL INFO Channel 09/0 : 1[1] -> 2[2] via P2P/CUMEM
|
| 421 |
+
t-20260518193619-wnzpp-worker-0:10394:11378 [0] NCCL INFO Channel 11/0 : 0[0] -> 1[1] via P2P/CUMEM
|
| 422 |
+
t-20260518193619-wnzpp-worker-0:10397:11377 [3] NCCL INFO Channel 07/0 : 3[3] -> 4[4] via P2P/CUMEM
|
| 423 |
+
t-20260518193619-wnzpp-worker-0:10399:11375 [5] NCCL INFO Channel 14/0 : 5[5] -> 6[6] via P2P/CUMEM
|
| 424 |
+
t-20260518193619-wnzpp-worker-0:10396:11373 [2] NCCL INFO Channel 16/0 : 2[2] -> 3[3] via P2P/CUMEM
|
| 425 |
+
t-20260518193619-wnzpp-worker-0:10395:11376 [1] NCCL INFO Channel 10/0 : 1[1] -> 2[2] via P2P/CUMEM
|
| 426 |
+
t-20260518193619-wnzpp-worker-0:10394:11378 [0] NCCL INFO Channel 12/0 : 0[0] -> 1[1] via P2P/CUMEM
|
| 427 |
+
t-20260518193619-wnzpp-worker-0:10397:11377 [3] NCCL INFO Channel 08/0 : 3[3] -> 4[4] via P2P/CUMEM
|
| 428 |
+
t-20260518193619-wnzpp-worker-0:10399:11375 [5] NCCL INFO Channel 15/0 : 5[5] -> 6[6] via P2P/CUMEM
|
| 429 |
+
t-20260518193619-wnzpp-worker-0:10396:11373 [2] NCCL INFO Channel 17/0 : 2[2] -> 3[3] via P2P/CUMEM
|
| 430 |
+
t-20260518193619-wnzpp-worker-0:10395:11376 [1] NCCL INFO Channel 11/0 : 1[1] -> 2[2] via P2P/CUMEM
|
| 431 |
+
t-20260518193619-wnzpp-worker-0:10394:11378 [0] NCCL INFO Channel 13/0 : 0[0] -> 1[1] via P2P/CUMEM
|
| 432 |
+
t-20260518193619-wnzpp-worker-0:10397:11377 [3] NCCL INFO Channel 09/0 : 3[3] -> 4[4] via P2P/CUMEM
|
| 433 |
+
t-20260518193619-wnzpp-worker-0:10399:11375 [5] NCCL INFO Channel 16/0 : 5[5] -> 6[6] via P2P/CUMEM
|
| 434 |
+
t-20260518193619-wnzpp-worker-0:10396:11373 [2] NCCL INFO Channel 18/0 : 2[2] -> 3[3] via P2P/CUMEM
|
| 435 |
+
t-20260518193619-wnzpp-worker-0:10395:11376 [1] NCCL INFO Channel 12/0 : 1[1] -> 2[2] via P2P/CUMEM
|
| 436 |
+
t-20260518193619-wnzpp-worker-0:10394:11378 [0] NCCL INFO Channel 14/0 : 0[0] -> 1[1] via P2P/CUMEM
|
| 437 |
+
t-20260518193619-wnzpp-worker-0:10397:11377 [3] NCCL INFO Channel 10/0 : 3[3] -> 4[4] via P2P/CUMEM
|
| 438 |
+
t-20260518193619-wnzpp-worker-0:10399:11375 [5] NCCL INFO Channel 17/0 : 5[5] -> 6[6] via P2P/CUMEM
|
| 439 |
+
t-20260518193619-wnzpp-worker-0:10396:11373 [2] NCCL INFO Channel 19/0 : 2[2] -> 3[3] via P2P/CUMEM
|
| 440 |
+
t-20260518193619-wnzpp-worker-0:10395:11376 [1] NCCL INFO Channel 13/0 : 1[1] -> 2[2] via P2P/CUMEM
|
| 441 |
+
t-20260518193619-wnzpp-worker-0:10394:11378 [0] NCCL INFO Channel 15/0 : 0[0] -> 1[1] via P2P/CUMEM
|
| 442 |
+
t-20260518193619-wnzpp-worker-0:10397:11377 [3] NCCL INFO Channel 11/0 : 3[3] -> 4[4] via P2P/CUMEM
|
| 443 |
+
t-20260518193619-wnzpp-worker-0:10399:11375 [5] NCCL INFO Channel 18/0 : 5[5] -> 6[6] via P2P/CUMEM
|
| 444 |
+
t-20260518193619-wnzpp-worker-0:10396:11373 [2] NCCL INFO Channel 20/0 : 2[2] -> 3[3] via P2P/CUMEM
|
| 445 |
+
t-20260518193619-wnzpp-worker-0:10395:11376 [1] NCCL INFO Channel 14/0 : 1[1] -> 2[2] via P2P/CUMEM
|
| 446 |
+
t-20260518193619-wnzpp-worker-0:10394:11378 [0] NCCL INFO Channel 16/0 : 0[0] -> 1[1] via P2P/CUMEM
|
| 447 |
+
t-20260518193619-wnzpp-worker-0:10397:11377 [3] NCCL INFO Channel 12/0 : 3[3] -> 4[4] via P2P/CUMEM
|
| 448 |
+
t-20260518193619-wnzpp-worker-0:10399:11375 [5] NCCL INFO Channel 19/0 : 5[5] -> 6[6] via P2P/CUMEM
|
| 449 |
+
t-20260518193619-wnzpp-worker-0:10396:11373 [2] NCCL INFO Channel 21/0 : 2[2] -> 3[3] via P2P/CUMEM
|
| 450 |
+
t-20260518193619-wnzpp-worker-0:10395:11376 [1] NCCL INFO Channel 15/0 : 1[1] -> 2[2] via P2P/CUMEM
|
| 451 |
+
t-20260518193619-wnzpp-worker-0:10394:11378 [0] NCCL INFO Channel 17/0 : 0[0] -> 1[1] via P2P/CUMEM
|
| 452 |
+
t-20260518193619-wnzpp-worker-0:10397:11377 [3] NCCL INFO Channel 13/0 : 3[3] -> 4[4] via P2P/CUMEM
|
| 453 |
+
t-20260518193619-wnzpp-worker-0:10399:11375 [5] NCCL INFO Channel 20/0 : 5[5] -> 6[6] via P2P/CUMEM
|
| 454 |
+
t-20260518193619-wnzpp-worker-0:10396:11373 [2] NCCL INFO Channel 22/0 : 2[2] -> 3[3] via P2P/CUMEM
|
| 455 |
+
t-20260518193619-wnzpp-worker-0:10395:11376 [1] NCCL INFO Channel 16/0 : 1[1] -> 2[2] via P2P/CUMEM
|
| 456 |
+
t-20260518193619-wnzpp-worker-0:10394:11378 [0] NCCL INFO Channel 18/0 : 0[0] -> 1[1] via P2P/CUMEM
|
| 457 |
+
t-20260518193619-wnzpp-worker-0:10397:11377 [3] NCCL INFO Channel 14/0 : 3[3] -> 4[4] via P2P/CUMEM
|
| 458 |
+
t-20260518193619-wnzpp-worker-0:10399:11375 [5] NCCL INFO Channel 21/0 : 5[5] -> 6[6] via P2P/CUMEM
|
| 459 |
+
t-20260518193619-wnzpp-worker-0:10396:11373 [2] NCCL INFO Channel 23/0 : 2[2] -> 3[3] via P2P/CUMEM
|
| 460 |
+
t-20260518193619-wnzpp-worker-0:10395:11376 [1] NCCL INFO Channel 17/0 : 1[1] -> 2[2] via P2P/CUMEM
|
| 461 |
+
t-20260518193619-wnzpp-worker-0:10394:11378 [0] NCCL INFO Channel 19/0 : 0[0] -> 1[1] via P2P/CUMEM
|
| 462 |
+
t-20260518193619-wnzpp-worker-0:10397:11377 [3] NCCL INFO Channel 15/0 : 3[3] -> 4[4] via P2P/CUMEM
|
| 463 |
+
t-20260518193619-wnzpp-worker-0:10399:11375 [5] NCCL INFO Channel 22/0 : 5[5] -> 6[6] via P2P/CUMEM
|
| 464 |
+
t-20260518193619-wnzpp-worker-0:10395:11376 [1] NCCL INFO Channel 18/0 : 1[1] -> 2[2] via P2P/CUMEM
|
| 465 |
+
t-20260518193619-wnzpp-worker-0:10394:11378 [0] NCCL INFO Channel 20/0 : 0[0] -> 1[1] via P2P/CUMEM
|
| 466 |
+
t-20260518193619-wnzpp-worker-0:10397:11377 [3] NCCL INFO Channel 16/0 : 3[3] -> 4[4] via P2P/CUMEM
|
| 467 |
+
t-20260518193619-wnzpp-worker-0:10399:11375 [5] NCCL INFO Channel 23/0 : 5[5] -> 6[6] via P2P/CUMEM
|
| 468 |
+
t-20260518193619-wnzpp-worker-0:10395:11376 [1] NCCL INFO Channel 19/0 : 1[1] -> 2[2] via P2P/CUMEM
|
| 469 |
+
t-20260518193619-wnzpp-worker-0:10397:11377 [3] NCCL INFO Channel 17/0 : 3[3] -> 4[4] via P2P/CUMEM
|
| 470 |
+
t-20260518193619-wnzpp-worker-0:10394:11378 [0] NCCL INFO Channel 21/0 : 0[0] -> 1[1] via P2P/CUMEM
|
| 471 |
+
t-20260518193619-wnzpp-worker-0:10395:11376 [1] NCCL INFO Channel 20/0 : 1[1] -> 2[2] via P2P/CUMEM
|
| 472 |
+
t-20260518193619-wnzpp-worker-0:10397:11377 [3] NCCL INFO Channel 18/0 : 3[3] -> 4[4] via P2P/CUMEM
|
| 473 |
+
t-20260518193619-wnzpp-worker-0:10394:11378 [0] NCCL INFO Channel 22/0 : 0[0] -> 1[1] via P2P/CUMEM
|
| 474 |
+
t-20260518193619-wnzpp-worker-0:10395:11376 [1] NCCL INFO Channel 21/0 : 1[1] -> 2[2] via P2P/CUMEM
|
| 475 |
+
t-20260518193619-wnzpp-worker-0:10394:11378 [0] NCCL INFO Channel 23/0 : 0[0] -> 1[1] via P2P/CUMEM
|
| 476 |
+
t-20260518193619-wnzpp-worker-0:10397:11377 [3] NCCL INFO Channel 19/0 : 3[3] -> 4[4] via P2P/CUMEM
|
| 477 |
+
t-20260518193619-wnzpp-worker-0:10397:11377 [3] NCCL INFO Channel 20/0 : 3[3] -> 4[4] via P2P/CUMEM
|
| 478 |
+
t-20260518193619-wnzpp-worker-0:10395:11376 [1] NCCL INFO Channel 22/0 : 1[1] -> 2[2] via P2P/CUMEM
|
| 479 |
+
t-20260518193619-wnzpp-worker-0:10397:11377 [3] NCCL INFO Channel 21/0 : 3[3] -> 4[4] via P2P/CUMEM
|
| 480 |
+
t-20260518193619-wnzpp-worker-0:10395:11376 [1] NCCL INFO Channel 23/0 : 1[1] -> 2[2] via P2P/CUMEM
|
| 481 |
+
t-20260518193619-wnzpp-worker-0:10397:11377 [3] NCCL INFO Channel 22/0 : 3[3] -> 4[4] via P2P/CUMEM
|
| 482 |
+
t-20260518193619-wnzpp-worker-0:10397:11377 [3] NCCL INFO Channel 23/0 : 3[3] -> 4[4] via P2P/CUMEM
|
| 483 |
+
t-20260518193619-wnzpp-worker-0:10401:11371 [7] NCCL INFO Connected all rings, use ring PXN 0 GDR 1
|
| 484 |
+
t-20260518193619-wnzpp-worker-0:10400:11374 [6] NCCL INFO Connected all rings, use ring PXN 0 GDR 1
|
| 485 |
+
t-20260518193619-wnzpp-worker-0:10399:11375 [5] NCCL INFO Connected all rings, use ring PXN 0 GDR 1
|
| 486 |
+
t-20260518193619-wnzpp-worker-0:10396:11373 [2] NCCL INFO Connected all rings, use ring PXN 0 GDR 1
|
| 487 |
+
t-20260518193619-wnzpp-worker-0:10395:11376 [1] NCCL INFO Connected all rings, use ring PXN 0 GDR 1
|
| 488 |
+
t-20260518193619-wnzpp-worker-0:10394:11378 [0] NCCL INFO Connected all rings, use ring PXN 0 GDR 1
|
| 489 |
+
t-20260518193619-wnzpp-worker-0:10397:11377 [3] NCCL INFO Connected all rings, use ring PXN 0 GDR 1
|
| 490 |
+
t-20260518193619-wnzpp-worker-0:10398:11372 [4] NCCL INFO Connected all rings, use ring PXN 0 GDR 1
|
| 491 |
+
{
|
| 492 |
+
"device": "cuda:0",
|
| 493 |
+
"rank": 0,
|
| 494 |
+
"world_size": 8,
|
| 495 |
+
"samples": "tokenized_hf:9737184:pad=0",
|
| 496 |
+
"vocab_size": 32100,
|
| 497 |
+
"tokenizer_vocab_size": 32100,
|
| 498 |
+
"save_dir": "runs/lta_owt_t5_rollin_p50_randk0_3_uniformt_temp1_synct_mask1_gbs512_8gpu_1m_20260518_125608",
|
| 499 |
+
"batch_size": 32,
|
| 500 |
+
"grad_accum": 2,
|
| 501 |
+
"effective_batch_size": 512,
|
| 502 |
+
"global_batch_size": 512,
|
| 503 |
+
"lr_schedule": "constant_warmup",
|
| 504 |
+
"optimizer": "muon",
|
| 505 |
+
"epochs": 5.0,
|
| 506 |
+
"steps_per_epoch": 19018,
|
| 507 |
+
"total_steps": 95090,
|
| 508 |
+
"warmup_steps": 9509,
|
| 509 |
+
"warmup_epochs": 0.5,
|
| 510 |
+
"min_lr": 0.0,
|
| 511 |
+
"weight_decay": 0.1,
|
| 512 |
+
"output_weight_decay": -1.0,
|
| 513 |
+
"adamw_param_groups": "nanogpt",
|
| 514 |
+
"adam_beta1": 0.9,
|
| 515 |
+
"adam_beta2": 0.999,
|
| 516 |
+
"adam_eps": 1e-08,
|
| 517 |
+
"muon_impl": "optax",
|
| 518 |
+
"muon_momentum": 0.95,
|
| 519 |
+
"muon_ns_steps": 5,
|
| 520 |
+
"muon_update_scale": 1.0,
|
| 521 |
+
"muon_nesterov": true,
|
| 522 |
+
"muon_width_scale": true,
|
| 523 |
+
"muon_grouping": "hidden_2d",
|
| 524 |
+
"muon_param_count": 84934656,
|
| 525 |
+
"muon_adam_param_count": 50212608,
|
| 526 |
+
"muon_param_names": [
|
| 527 |
+
"blocks.0.attn_qkv.weight",
|
| 528 |
+
"blocks.0.attn_out.weight",
|
| 529 |
+
"blocks.0.mlp.w12.weight",
|
| 530 |
+
"blocks.0.mlp.w3.weight",
|
| 531 |
+
"blocks.1.attn_qkv.weight",
|
| 532 |
+
"blocks.1.attn_out.weight",
|
| 533 |
+
"blocks.1.mlp.w12.weight",
|
| 534 |
+
"blocks.1.mlp.w3.weight",
|
| 535 |
+
"blocks.2.attn_qkv.weight",
|
| 536 |
+
"blocks.2.attn_out.weight",
|
| 537 |
+
"blocks.2.mlp.w12.weight",
|
| 538 |
+
"blocks.2.mlp.w3.weight",
|
| 539 |
+
"blocks.3.attn_qkv.weight",
|
| 540 |
+
"blocks.3.attn_out.weight",
|
| 541 |
+
"blocks.3.mlp.w12.weight",
|
| 542 |
+
"blocks.3.mlp.w3.weight",
|
| 543 |
+
"blocks.4.attn_qkv.weight",
|
| 544 |
+
"blocks.4.attn_out.weight",
|
| 545 |
+
"blocks.4.mlp.w12.weight",
|
| 546 |
+
"blocks.4.mlp.w3.weight",
|
| 547 |
+
"blocks.5.attn_qkv.weight",
|
| 548 |
+
"blocks.5.attn_out.weight",
|
| 549 |
+
"blocks.5.mlp.w12.weight",
|
| 550 |
+
"blocks.5.mlp.w3.weight",
|
| 551 |
+
"blocks.6.attn_qkv.weight",
|
| 552 |
+
"blocks.6.attn_out.weight",
|
| 553 |
+
"blocks.6.mlp.w12.weight",
|
| 554 |
+
"blocks.6.mlp.w3.weight",
|
| 555 |
+
"blocks.7.attn_qkv.weight",
|
| 556 |
+
"blocks.7.attn_out.weight",
|
| 557 |
+
"blocks.7.mlp.w12.weight",
|
| 558 |
+
"blocks.7.mlp.w3.weight",
|
| 559 |
+
"blocks.8.attn_qkv.weight",
|
| 560 |
+
"blocks.8.attn_out.weight",
|
| 561 |
+
"blocks.8.mlp.w12.weight",
|
| 562 |
+
"blocks.8.mlp.w3.weight",
|
| 563 |
+
"blocks.9.attn_qkv.weight",
|
| 564 |
+
"blocks.9.attn_out.weight",
|
| 565 |
+
"blocks.9.mlp.w12.weight",
|
| 566 |
+
"blocks.9.mlp.w3.weight",
|
| 567 |
+
"blocks.10.attn_qkv.weight",
|
| 568 |
+
"blocks.10.attn_out.weight",
|
| 569 |
+
"blocks.10.mlp.w12.weight",
|
| 570 |
+
"blocks.10.mlp.w3.weight",
|
| 571 |
+
"blocks.11.attn_qkv.weight",
|
| 572 |
+
"blocks.11.attn_out.weight",
|
| 573 |
+
"blocks.11.mlp.w12.weight",
|
| 574 |
+
"blocks.11.mlp.w3.weight"
|
| 575 |
+
],
|
| 576 |
+
"muon_adam_param_names": [
|
| 577 |
+
"time_tokens",
|
| 578 |
+
"vocab_embed.embedding",
|
| 579 |
+
"sigma_map.net.0.weight",
|
| 580 |
+
"sigma_map.net.0.bias",
|
| 581 |
+
"sigma_map.net.2.weight",
|
| 582 |
+
"sigma_map.net.2.bias",
|
| 583 |
+
"blocks.0.norm1.weight",
|
| 584 |
+
"blocks.0.attn_qkv.bias",
|
| 585 |
+
"blocks.0.attn_out.bias",
|
| 586 |
+
"blocks.0.q_norm.weight",
|
| 587 |
+
"blocks.0.k_norm.weight",
|
| 588 |
+
"blocks.0.norm2.weight",
|
| 589 |
+
"blocks.0.mlp.w12.bias",
|
| 590 |
+
"blocks.0.mlp.w3.bias",
|
| 591 |
+
"blocks.1.norm1.weight",
|
| 592 |
+
"blocks.1.attn_qkv.bias",
|
| 593 |
+
"blocks.1.attn_out.bias",
|
| 594 |
+
"blocks.1.q_norm.weight",
|
| 595 |
+
"blocks.1.k_norm.weight",
|
| 596 |
+
"blocks.1.norm2.weight",
|
| 597 |
+
"blocks.1.mlp.w12.bias",
|
| 598 |
+
"blocks.1.mlp.w3.bias",
|
| 599 |
+
"blocks.2.norm1.weight",
|
| 600 |
+
"blocks.2.attn_qkv.bias",
|
| 601 |
+
"blocks.2.attn_out.bias",
|
| 602 |
+
"blocks.2.q_norm.weight",
|
| 603 |
+
"blocks.2.k_norm.weight",
|
| 604 |
+
"blocks.2.norm2.weight",
|
| 605 |
+
"blocks.2.mlp.w12.bias",
|
| 606 |
+
"blocks.2.mlp.w3.bias",
|
| 607 |
+
"blocks.3.norm1.weight",
|
| 608 |
+
"blocks.3.attn_qkv.bias",
|
| 609 |
+
"blocks.3.attn_out.bias",
|
| 610 |
+
"blocks.3.q_norm.weight",
|
| 611 |
+
"blocks.3.k_norm.weight",
|
| 612 |
+
"blocks.3.norm2.weight",
|
| 613 |
+
"blocks.3.mlp.w12.bias",
|
| 614 |
+
"blocks.3.mlp.w3.bias",
|
| 615 |
+
"blocks.4.norm1.weight",
|
| 616 |
+
"blocks.4.attn_qkv.bias",
|
| 617 |
+
"blocks.4.attn_out.bias",
|
| 618 |
+
"blocks.4.q_norm.weight",
|
| 619 |
+
"blocks.4.k_norm.weight",
|
| 620 |
+
"blocks.4.norm2.weight",
|
| 621 |
+
"blocks.4.mlp.w12.bias",
|
| 622 |
+
"blocks.4.mlp.w3.bias",
|
| 623 |
+
"blocks.5.norm1.weight",
|
| 624 |
+
"blocks.5.attn_qkv.bias",
|
| 625 |
+
"blocks.5.attn_out.bias",
|
| 626 |
+
"blocks.5.q_norm.weight",
|
| 627 |
+
"blocks.5.k_norm.weight",
|
| 628 |
+
"blocks.5.norm2.weight",
|
| 629 |
+
"blocks.5.mlp.w12.bias",
|
| 630 |
+
"blocks.5.mlp.w3.bias",
|
| 631 |
+
"blocks.6.norm1.weight",
|
| 632 |
+
"blocks.6.attn_qkv.bias",
|
| 633 |
+
"blocks.6.attn_out.bias",
|
| 634 |
+
"blocks.6.q_norm.weight",
|
| 635 |
+
"blocks.6.k_norm.weight",
|
| 636 |
+
"blocks.6.norm2.weight",
|
| 637 |
+
"blocks.6.mlp.w12.bias",
|
| 638 |
+
"blocks.6.mlp.w3.bias",
|
| 639 |
+
"blocks.7.norm1.weight",
|
| 640 |
+
"blocks.7.attn_qkv.bias",
|
| 641 |
+
"blocks.7.attn_out.bias",
|
| 642 |
+
"blocks.7.q_norm.weight",
|
| 643 |
+
"blocks.7.k_norm.weight",
|
| 644 |
+
"blocks.7.norm2.weight",
|
| 645 |
+
"blocks.7.mlp.w12.bias",
|
| 646 |
+
"blocks.7.mlp.w3.bias",
|
| 647 |
+
"blocks.8.norm1.weight",
|
| 648 |
+
"blocks.8.attn_qkv.bias",
|
| 649 |
+
"blocks.8.attn_out.bias",
|
| 650 |
+
"blocks.8.q_norm.weight",
|
| 651 |
+
"blocks.8.k_norm.weight",
|
| 652 |
+
"blocks.8.norm2.weight",
|
| 653 |
+
"blocks.8.mlp.w12.bias",
|
| 654 |
+
"blocks.8.mlp.w3.bias",
|
| 655 |
+
"blocks.9.norm1.weight",
|
| 656 |
+
"blocks.9.attn_qkv.bias",
|
| 657 |
+
"blocks.9.attn_out.bias",
|
| 658 |
+
"blocks.9.q_norm.weight",
|
| 659 |
+
"blocks.9.k_norm.weight",
|
| 660 |
+
"blocks.9.norm2.weight",
|
| 661 |
+
"blocks.9.mlp.w12.bias",
|
| 662 |
+
"blocks.9.mlp.w3.bias",
|
| 663 |
+
"blocks.10.norm1.weight",
|
| 664 |
+
"blocks.10.attn_qkv.bias",
|
| 665 |
+
"blocks.10.attn_out.bias",
|
| 666 |
+
"blocks.10.q_norm.weight",
|
| 667 |
+
"blocks.10.k_norm.weight",
|
| 668 |
+
"blocks.10.norm2.weight",
|
| 669 |
+
"blocks.10.mlp.w12.bias",
|
| 670 |
+
"blocks.10.mlp.w3.bias",
|
| 671 |
+
"blocks.11.norm1.weight",
|
| 672 |
+
"blocks.11.attn_qkv.bias",
|
| 673 |
+
"blocks.11.attn_out.bias",
|
| 674 |
+
"blocks.11.q_norm.weight",
|
| 675 |
+
"blocks.11.k_norm.weight",
|
| 676 |
+
"blocks.11.norm2.weight",
|
| 677 |
+
"blocks.11.mlp.w12.bias",
|
| 678 |
+
"blocks.11.mlp.w3.bias",
|
| 679 |
+
"output_layer.norm_final.weight",
|
| 680 |
+
"output_layer.linear.weight"
|
| 681 |
+
],
|
| 682 |
+
"muon_effective_nesterov": true,
|
| 683 |
+
"muon_effective_width_scale": true,
|
| 684 |
+
"muon_effective_weight_decay": 0.0,
|
| 685 |
+
"muon_adam_fallback_nesterov": true,
|
| 686 |
+
"muon_adam_fallback_weight_decay": 0.1,
|
| 687 |
+
"ema_decay": 0.9999,
|
| 688 |
+
"ema_start_step": 0,
|
| 689 |
+
"model_type": "ddit_elf",
|
| 690 |
+
"ddit_mlp_type": "gelu",
|
| 691 |
+
"elf_num_time_tokens": 4,
|
| 692 |
+
"elf_num_model_mode_tokens": 0,
|
| 693 |
+
"qk_norm": true,
|
| 694 |
+
"output_bias": false,
|
| 695 |
+
"output_init_std": 0.0,
|
| 696 |
+
"norm_type": "rmsnorm",
|
| 697 |
+
"target_loss": "hard_ce",
|
| 698 |
+
"linear_soft_target_power": 1.0,
|
| 699 |
+
"linear_soft_target_min_conf": 0.0,
|
| 700 |
+
"linear_soft_target_max_conf": 1.0,
|
| 701 |
+
"t_sampling_mode": "uniform",
|
| 702 |
+
"t_sampling_power": 1.0,
|
| 703 |
+
"t_sampling_eps": 0.0001,
|
| 704 |
+
"t_sampling_logit_mean": -1.5,
|
| 705 |
+
"t_sampling_logit_std": 0.8,
|
| 706 |
+
"dual_t": true,
|
| 707 |
+
"corrupt_t_mode": "same",
|
| 708 |
+
"corrupt_min_t": 0.0,
|
| 709 |
+
"corrupt_max_t": 1.0,
|
| 710 |
+
"prefix_block_prob": 0.0,
|
| 711 |
+
"prefix_block_len": 128,
|
| 712 |
+
"mask_ratio_floor_schedule": "none",
|
| 713 |
+
"dirichlet_endpoint_mode": "categorical_dual_t",
|
| 714 |
+
"dirichlet_semantic_t_mode": "same",
|
| 715 |
+
"dirichlet_semantic_t_value": 0.0,
|
| 716 |
+
"dirichlet_semantic_t_curve": "linear",
|
| 717 |
+
"dirichlet_semantic_t_power": 1.0,
|
| 718 |
+
"endpoint_sequence_random_prob_alpha": 0.0,
|
| 719 |
+
"categorical_wrong_from_full_vocab": true,
|
| 720 |
+
"categorical_wrong_from_batch_valid_tokens": false,
|
| 721 |
+
"categorical_wrong_basin_token_ids": "",
|
| 722 |
+
"categorical_wrong_basin_prob": 0.0,
|
| 723 |
+
"categorical_wrong_unigram_prob": 0.0,
|
| 724 |
+
"categorical_wrong_uniform_prob": 0.0,
|
| 725 |
+
"categorical_wrong_prob_floor": 0.0,
|
| 726 |
+
"categorical_wrong_corpus_unigram_path": "",
|
| 727 |
+
"categorical_wrong_corpus_unigram_alpha": 1.0,
|
| 728 |
+
"categorical_wrong_basin_shared_prob": 0.0,
|
| 729 |
+
"categorical_wrong_unigram_shared_prob": 0.0,
|
| 730 |
+
"mask_mixture_original_prob": 0.0,
|
| 731 |
+
"mask_mixture_lowk_prob": 0.0,
|
| 732 |
+
"mask_mixture_lowcorrupt_prob": 0.0,
|
| 733 |
+
"mask_mixture_block_prob": 0.0,
|
| 734 |
+
"mask_mixture_all_prob": 1.0,
|
| 735 |
+
"mask_mixture_lowk_clean_tokens": "1,2,4,8,16,32,64",
|
| 736 |
+
"mask_mixture_lowcorrupt_tokens": "1,2,4,8,16,32,64",
|
| 737 |
+
"mask_mixture_block_tokens": "64,128",
|
| 738 |
+
"simplex_bridge_sampler": "dirichlet",
|
| 739 |
+
"logistic_normal_sigma_min": 0.18,
|
| 740 |
+
"logistic_normal_sigma_max": 2.2,
|
| 741 |
+
"logistic_normal_tau_min": 0.65,
|
| 742 |
+
"logistic_normal_tau_max": 1.15,
|
| 743 |
+
"torch_compile": false,
|
| 744 |
+
"compile_mode": "max-autotune",
|
| 745 |
+
"state_format": "prob",
|
| 746 |
+
"meanflow_weight": 0.0,
|
| 747 |
+
"rollout_train_prob": 0.5,
|
| 748 |
+
"rollout_train_steps": 3,
|
| 749 |
+
"rollout_train_steps_min": 0,
|
| 750 |
+
"rollout_train_infer_steps": 1,
|
| 751 |
+
"rollout_train_time_mode": "sampled_path",
|
| 752 |
+
"rollout_train_s_dist": "uniform",
|
| 753 |
+
"rollout_train_s_min_frac": 0.0,
|
| 754 |
+
"rollout_train_s_max_frac": 0.25,
|
| 755 |
+
"rollout_train_s_beta_alpha": 2.0,
|
| 756 |
+
"rollout_train_s_beta_beta": 6.0,
|
| 757 |
+
"rollout_train_temp": 1.0,
|
| 758 |
+
"rollout_train_max_gamma": 1.0,
|
| 759 |
+
"rollout_train_corrupt_only": true,
|
| 760 |
+
"rollout_train_samplewise": true,
|
| 761 |
+
"rollout_train_compute_always": false,
|
| 762 |
+
"rollout_train_sync_t": true,
|
| 763 |
+
"bridge_noise_init": "logistic_normal",
|
| 764 |
+
"noise_sigma": -1.0,
|
| 765 |
+
"allow_tf32": true,
|
| 766 |
+
"activation_checkpointing": true,
|
| 767 |
+
"activation_checkpoint_interval": 1,
|
| 768 |
+
"activation_checkpoint_scope": "mlp",
|
| 769 |
+
"ddp_static_graph": false,
|
| 770 |
+
"ddp_gradient_as_bucket_view": true,
|
| 771 |
+
"blocking_data_transfer": false,
|
| 772 |
+
"dataloader_prefetch_factor": 4,
|
| 773 |
+
"full_train_stats": false,
|
| 774 |
+
"tokenized_hf": true,
|
| 775 |
+
"tokenized_pad_token": "pad",
|
| 776 |
+
"elf_conditional_hf": false,
|
| 777 |
+
"record_pad_truncate": false,
|
| 778 |
+
"record_add_eos": false,
|
| 779 |
+
"record_add_special_tokens": false,
|
| 780 |
+
"record_pad_token": "pad",
|
| 781 |
+
"record_shuffle_buffer": 10000,
|
| 782 |
+
"wrap": false,
|
| 783 |
+
"wrap_mode": "stream",
|
| 784 |
+
"wrap_record_buffer_size": 200,
|
| 785 |
+
"owt_cached_chunks": false,
|
| 786 |
+
"owt_chunk_cache_dir": "",
|
| 787 |
+
"owt_chunk_cache_rebuild": false,
|
| 788 |
+
"owt_chunk_cache_write_batch": 4096,
|
| 789 |
+
"owt_exact_repeat_per_chunk": 0,
|
| 790 |
+
"online_chunk_shuffle": false,
|
| 791 |
+
"online_chunk_shuffle_buffer": 10000,
|
| 792 |
+
"openwebtext_split": "all",
|
| 793 |
+
"detokenizer": "auto",
|
| 794 |
+
"resolved_detokenizer": null,
|
| 795 |
+
"num_workers": 8,
|
| 796 |
+
"latest_every": 1000,
|
| 797 |
+
"resume_path": ""
|
| 798 |
+
}
|
| 799 |
+
t-20260518193619-wnzpp-worker-0:10394:11755 [0] NCCL INFO NVLS comm 0xd9dc180 headRank 0 nHeads 8 buffSize 1048576 nvlsPerRankSize 33554432 nvlsTotalSize 268435456
|
| 800 |
+
t-20260518193619-wnzpp-worker-0:10396:11757 [2] NCCL INFO NVLS comm 0x722c5210 headRank 2 nHeads 8 buffSize 1048576 nvlsPerRankSize 33554432 nvlsTotalSize 268435456
|
| 801 |
+
t-20260518193619-wnzpp-worker-0:10397:11758 [3] NCCL INFO NVLS comm 0xec825c0 headRank 3 nHeads 8 buffSize 1048576 nvlsPerRankSize 33554432 nvlsTotalSize 268435456
|
| 802 |
+
t-20260518193619-wnzpp-worker-0:10395:11759 [1] NCCL INFO NVLS comm 0xd3cb0a0 headRank 1 nHeads 8 buffSize 1048576 nvlsPerRankSize 33554432 nvlsTotalSize 268435456
|
| 803 |
+
t-20260518193619-wnzpp-worker-0:10398:11760 [4] NCCL INFO NVLS comm 0xeb6d1a0 headRank 4 nHeads 8 buffSize 1048576 nvlsPerRankSize 33554432 nvlsTotalSize 268435456
|
| 804 |
+
t-20260518193619-wnzpp-worker-0:10401:11783 [7] NCCL INFO NVLS comm 0xd4cf3e0 headRank 7 nHeads 8 buffSize 1048576 nvlsPerRankSize 33554432 nvlsTotalSize 268435456
|
| 805 |
+
t-20260518193619-wnzpp-worker-0:10400:11784 [6] NCCL INFO NVLS comm 0xd982790 headRank 6 nHeads 8 buffSize 1048576 nvlsPerRankSize 33554432 nvlsTotalSize 268435456
|
| 806 |
+
t-20260518193619-wnzpp-worker-0:10399:11785 [5] NCCL INFO NVLS comm 0xe49fe20 headRank 5 nHeads 8 buffSize 1048576 nvlsPerRankSize 33554432 nvlsTotalSize 268435456
|
| 807 |
+
step=100 epoch=1/5 epoch_step=100/19018 micro_steps=200 elapsed=140.6s lr=2.124303e-05 loss=10.1003 loss_recon=10.1003 loss_meanflow=0.0000 mean_model_t=0.5010 mean_corrupt_t=0.5010 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.5009 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.0203 corrupt_frac=1.0000 acc_corrupt=0.0203 loss_corrupt=10.1003 wrong_frac=0.4987 init_acc_corrupt=0.4678 acc_corrupt_t_0p0_0p2=0.0145 corrupt_frac_t_0p0_0p2=0.1988 acc_corrupt_t_0p2_0p4=0.0179 corrupt_frac_t_0p2_0p4=0.1957 acc_corrupt_t_0p4_0p6=0.0214 corrupt_frac_t_0p4_0p6=0.2070 acc_corrupt_t_0p6_0p8=0.0240 corrupt_frac_t_0p6_0p8=0.1999 acc_corrupt_t_0p8_1p0=0.0237 corrupt_frac_t_0p8_1p0=0.2007 out_w_norm=1.4216 out_g_norm=1.5149 loss_all=9.5077 init_gold_top10=0.4520 init_gold_top100=0.5699 rollout_applied_pos_frac=0.4999 init_acc_rollout_applied=0.5139 init_acc_rollout_kept=0.3459 logit_acc_rollout_applied=0.0332 logit_acc_rollout_kept=0.0351
|
| 808 |
+
step=200 epoch=1/5 epoch_step=200/19018 micro_steps=400 elapsed=139.5s lr=4.227574e-05 loss=8.5592 loss_recon=8.5592 loss_meanflow=0.0000 mean_model_t=0.5028 mean_corrupt_t=0.5028 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.5070 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.0355 corrupt_frac=1.0000 acc_corrupt=0.0355 loss_corrupt=8.5592 wrong_frac=0.4971 init_acc_corrupt=0.4692 acc_corrupt_t_0p0_0p2=0.0338 corrupt_frac_t_0p0_0p2=0.1981 acc_corrupt_t_0p2_0p4=0.0345 corrupt_frac_t_0p2_0p4=0.1973 acc_corrupt_t_0p4_0p6=0.0351 corrupt_frac_t_0p4_0p6=0.1975 acc_corrupt_t_0p6_0p8=0.0359 corrupt_frac_t_0p6_0p8=0.2050 acc_corrupt_t_0p8_1p0=0.0380 corrupt_frac_t_0p8_1p0=0.2031 out_w_norm=9.4462 out_g_norm=1.7789 loss_all=7.6793 init_gold_top10=0.5476 init_gold_top100=0.6694 rollout_applied_pos_frac=0.5905 init_acc_rollout_applied=0.5336 init_acc_rollout_kept=0.5080 logit_acc_rollout_applied=0.0425 logit_acc_rollout_kept=0.0428
|
| 809 |
+
step=300 epoch=1/5 epoch_step=300/19018 micro_steps=600 elapsed=139.2s lr=6.330844e-05 loss=7.3133 loss_recon=7.3133 loss_meanflow=0.0000 mean_model_t=0.5000 mean_corrupt_t=0.5000 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.5058 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.0441 corrupt_frac=1.0000 acc_corrupt=0.0441 loss_corrupt=7.3133 wrong_frac=0.5004 init_acc_corrupt=0.4653 acc_corrupt_t_0p0_0p2=0.0446 corrupt_frac_t_0p0_0p2=0.1989 acc_corrupt_t_0p2_0p4=0.0440 corrupt_frac_t_0p2_0p4=0.2006 acc_corrupt_t_0p4_0p6=0.0438 corrupt_frac_t_0p4_0p6=0.2021 acc_corrupt_t_0p6_0p8=0.0440 corrupt_frac_t_0p6_0p8=0.2005 acc_corrupt_t_0p8_1p0=0.0439 corrupt_frac_t_0p8_1p0=0.1989 out_w_norm=19.8990 out_g_norm=0.5973 loss_all=7.1080 init_gold_top10=0.4187 init_gold_top100=0.5664 rollout_applied_pos_frac=0.5393 init_acc_rollout_applied=0.3405 init_acc_rollout_kept=0.4090 logit_acc_rollout_applied=0.0493 logit_acc_rollout_kept=0.0447
|
| 810 |
+
step=400 epoch=1/5 epoch_step=400/19018 micro_steps=800 elapsed=139.6s lr=8.434115e-05 loss=7.0900 loss_recon=7.0900 loss_meanflow=0.0000 mean_model_t=0.5038 mean_corrupt_t=0.5038 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.5031 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.0445 corrupt_frac=1.0000 acc_corrupt=0.0445 loss_corrupt=7.0900 wrong_frac=0.4963 init_acc_corrupt=0.4702 acc_corrupt_t_0p0_0p2=0.0429 corrupt_frac_t_0p0_0p2=0.1947 acc_corrupt_t_0p2_0p4=0.0430 corrupt_frac_t_0p2_0p4=0.1999 acc_corrupt_t_0p4_0p6=0.0430 corrupt_frac_t_0p4_0p6=0.2013 acc_corrupt_t_0p6_0p8=0.0449 corrupt_frac_t_0p6_0p8=0.2029 acc_corrupt_t_0p8_1p0=0.0485 corrupt_frac_t_0p8_1p0=0.2012 out_w_norm=25.3746 out_g_norm=0.2534 loss_all=6.8650 init_gold_top10=0.4741 init_gold_top100=0.6303 rollout_applied_pos_frac=0.4657 init_acc_rollout_applied=0.3189 init_acc_rollout_kept=0.4833 logit_acc_rollout_applied=0.0494 logit_acc_rollout_kept=0.0565
|
| 811 |
+
step=500 epoch=1/5 epoch_step=500/19018 micro_steps=1000 elapsed=139.8s lr=1.053739e-04 loss=6.6772 loss_recon=6.6772 loss_meanflow=0.0000 mean_model_t=0.4995 mean_corrupt_t=0.4995 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.5047 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.0834 corrupt_frac=1.0000 acc_corrupt=0.0834 loss_corrupt=6.6772 wrong_frac=0.5003 init_acc_corrupt=0.4647 acc_corrupt_t_0p0_0p2=0.0449 corrupt_frac_t_0p0_0p2=0.1982 acc_corrupt_t_0p2_0p4=0.0582 corrupt_frac_t_0p2_0p4=0.2092 acc_corrupt_t_0p4_0p6=0.0774 corrupt_frac_t_0p4_0p6=0.1874 acc_corrupt_t_0p6_0p8=0.1042 corrupt_frac_t_0p6_0p8=0.2040 acc_corrupt_t_0p8_1p0=0.1321 corrupt_frac_t_0p8_1p0=0.2032 out_w_norm=30.4220 out_g_norm=0.2944 loss_all=6.3481 init_gold_top10=0.4677 init_gold_top100=0.6267 rollout_applied_pos_frac=0.4496 init_acc_rollout_applied=0.4524 init_acc_rollout_kept=0.4245 logit_acc_rollout_applied=0.1296 logit_acc_rollout_kept=0.1238
|
| 812 |
+
step=600 epoch=1/5 epoch_step=600/19018 micro_steps=1200 elapsed=139.2s lr=1.264066e-04 loss=5.8930 loss_recon=5.8930 loss_meanflow=0.0000 mean_model_t=0.5014 mean_corrupt_t=0.5014 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.4886 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.2510 corrupt_frac=1.0000 acc_corrupt=0.2510 loss_corrupt=5.8930 wrong_frac=0.4982 init_acc_corrupt=0.4684 acc_corrupt_t_0p0_0p2=0.0451 corrupt_frac_t_0p0_0p2=0.2012 acc_corrupt_t_0p2_0p4=0.1027 corrupt_frac_t_0p2_0p4=0.1954 acc_corrupt_t_0p4_0p6=0.2461 corrupt_frac_t_0p4_0p6=0.1994 acc_corrupt_t_0p6_0p8=0.3619 corrupt_frac_t_0p6_0p8=0.2062 acc_corrupt_t_0p8_1p0=0.4963 corrupt_frac_t_0p8_1p0=0.1977 out_w_norm=35.3894 out_g_norm=0.3174 loss_all=5.4253 init_gold_top10=0.5145 init_gold_top100=0.6336 rollout_applied_pos_frac=0.3817 init_acc_rollout_applied=0.4180 init_acc_rollout_kept=0.5100 logit_acc_rollout_applied=0.2918 logit_acc_rollout_kept=0.3632
|
| 813 |
+
step=700 epoch=1/5 epoch_step=700/19018 micro_steps=1400 elapsed=140.1s lr=1.474393e-04 loss=5.0278 loss_recon=5.0278 loss_meanflow=0.0000 mean_model_t=0.5075 mean_corrupt_t=0.5075 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.4975 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.3878 corrupt_frac=1.0000 acc_corrupt=0.3878 loss_corrupt=5.0278 wrong_frac=0.4922 init_acc_corrupt=0.4755 acc_corrupt_t_0p0_0p2=0.0499 corrupt_frac_t_0p0_0p2=0.1963 acc_corrupt_t_0p2_0p4=0.1899 corrupt_frac_t_0p2_0p4=0.1915 acc_corrupt_t_0p4_0p6=0.3917 corrupt_frac_t_0p4_0p6=0.1964 acc_corrupt_t_0p6_0p8=0.5546 corrupt_frac_t_0p6_0p8=0.2077 acc_corrupt_t_0p8_1p0=0.7182 corrupt_frac_t_0p8_1p0=0.2082 out_w_norm=42.7662 out_g_norm=0.2303 loss_all=4.6872 init_gold_top10=0.5493 init_gold_top100=0.6652 rollout_applied_pos_frac=0.4076 init_acc_rollout_applied=0.5359 init_acc_rollout_kept=0.4696 logit_acc_rollout_applied=0.4517 logit_acc_rollout_kept=0.4131
|
| 814 |
+
step=800 epoch=1/5 epoch_step=800/19018 micro_steps=1600 elapsed=140.2s lr=1.684720e-04 loss=4.5691 loss_recon=4.5691 loss_meanflow=0.0000 mean_model_t=0.5023 mean_corrupt_t=0.5023 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.5081 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.4334 corrupt_frac=1.0000 acc_corrupt=0.4334 loss_corrupt=4.5691 wrong_frac=0.4983 init_acc_corrupt=0.4691 acc_corrupt_t_0p0_0p2=0.0561 corrupt_frac_t_0p0_0p2=0.1966 acc_corrupt_t_0p2_0p4=0.2257 corrupt_frac_t_0p2_0p4=0.2016 acc_corrupt_t_0p4_0p6=0.4458 corrupt_frac_t_0p4_0p6=0.2038 acc_corrupt_t_0p6_0p8=0.6295 corrupt_frac_t_0p6_0p8=0.1993 acc_corrupt_t_0p8_1p0=0.8079 corrupt_frac_t_0p8_1p0=0.1987 out_w_norm=51.7482 out_g_norm=0.1240 loss_all=4.2315 init_gold_top10=0.5330 init_gold_top100=0.6430 rollout_applied_pos_frac=0.4373 init_acc_rollout_applied=0.4846 init_acc_rollout_kept=0.4966 logit_acc_rollout_applied=0.4485 logit_acc_rollout_kept=0.4882
|
LTA_openwebtext_dualt/logs/elfaligned_t5tokenized_8gpu/lta_owt_t5_rollin_p50_randk0_3_uniformt_temp1_synct_mask1_gbs512_8gpu_1m_20260518_125609.log
ADDED
|
@@ -0,0 +1,814 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
t-20260518193619-wnzpp-worker-1:10398:10398 [0] NCCL INFO NCCL_SOCKET_IFNAME set by environment to eth1
|
| 2 |
+
t-20260518193619-wnzpp-worker-1:10398:10398 [0] NCCL INFO Bootstrap: Using eth1:10.82.80.12<0>
|
| 3 |
+
t-20260518193619-wnzpp-worker-1:10398:10398 [0] NCCL INFO cudaDriverVersion 12080
|
| 4 |
+
t-20260518193619-wnzpp-worker-1:10398:10398 [0] NCCL INFO NCCL version 2.25.1+cuda12.8
|
| 5 |
+
t-20260518193619-wnzpp-worker-1:10398:10398 [0] NCCL INFO Comm config Blocking set to 1
|
| 6 |
+
t-20260518193619-wnzpp-worker-1:10400:10400 [2] NCCL INFO cudaDriverVersion 12080
|
| 7 |
+
t-20260518193619-wnzpp-worker-1:10400:10400 [2] NCCL INFO NCCL_SOCKET_IFNAME set by environment to eth1
|
| 8 |
+
t-20260518193619-wnzpp-worker-1:10401:10401 [3] NCCL INFO cudaDriverVersion 12080
|
| 9 |
+
t-20260518193619-wnzpp-worker-1:10401:10401 [3] NCCL INFO NCCL_SOCKET_IFNAME set by environment to eth1
|
| 10 |
+
t-20260518193619-wnzpp-worker-1:10400:10400 [2] NCCL INFO Bootstrap: Using eth1:10.82.80.12<0>
|
| 11 |
+
t-20260518193619-wnzpp-worker-1:10400:10400 [2] NCCL INFO NCCL version 2.25.1+cuda12.8
|
| 12 |
+
t-20260518193619-wnzpp-worker-1:10401:10401 [3] NCCL INFO Bootstrap: Using eth1:10.82.80.12<0>
|
| 13 |
+
t-20260518193619-wnzpp-worker-1:10401:10401 [3] NCCL INFO NCCL version 2.25.1+cuda12.8
|
| 14 |
+
t-20260518193619-wnzpp-worker-1:10405:10405 [7] NCCL INFO cudaDriverVersion 12080
|
| 15 |
+
t-20260518193619-wnzpp-worker-1:10405:10405 [7] NCCL INFO NCCL_SOCKET_IFNAME set by environment to eth1
|
| 16 |
+
t-20260518193619-wnzpp-worker-1:10405:10405 [7] NCCL INFO Bootstrap: Using eth1:10.82.80.12<0>
|
| 17 |
+
t-20260518193619-wnzpp-worker-1:10405:10405 [7] NCCL INFO NCCL version 2.25.1+cuda12.8
|
| 18 |
+
t-20260518193619-wnzpp-worker-1:10400:10400 [2] NCCL INFO Comm config Blocking set to 1
|
| 19 |
+
t-20260518193619-wnzpp-worker-1:10401:10401 [3] NCCL INFO Comm config Blocking set to 1
|
| 20 |
+
t-20260518193619-wnzpp-worker-1:10405:10405 [7] NCCL INFO Comm config Blocking set to 1
|
| 21 |
+
t-20260518193619-wnzpp-worker-1:10402:10402 [4] NCCL INFO cudaDriverVersion 12080
|
| 22 |
+
t-20260518193619-wnzpp-worker-1:10402:10402 [4] NCCL INFO NCCL_SOCKET_IFNAME set by environment to eth1
|
| 23 |
+
t-20260518193619-wnzpp-worker-1:10402:10402 [4] NCCL INFO Bootstrap: Using eth1:10.82.80.12<0>
|
| 24 |
+
t-20260518193619-wnzpp-worker-1:10402:10402 [4] NCCL INFO NCCL version 2.25.1+cuda12.8
|
| 25 |
+
t-20260518193619-wnzpp-worker-1:10402:10402 [4] NCCL INFO Comm config Blocking set to 1
|
| 26 |
+
t-20260518193619-wnzpp-worker-1:10405:11282 [7] NCCL INFO NET/Plugin: Loaded net plugin NCCL RDMA Plugin v9 (v9)
|
| 27 |
+
t-20260518193619-wnzpp-worker-1:10401:11281 [3] NCCL INFO NET/Plugin: Loaded net plugin NCCL RDMA Plugin v9 (v9)
|
| 28 |
+
t-20260518193619-wnzpp-worker-1:10405:11282 [7] NCCL INFO NET/Plugin: Loaded collnet plugin SHARP (v9)
|
| 29 |
+
t-20260518193619-wnzpp-worker-1:10401:11281 [3] NCCL INFO NET/Plugin: Loaded collnet plugin SHARP (v9)
|
| 30 |
+
t-20260518193619-wnzpp-worker-1:10405:11282 [7] NCCL INFO Plugin Path : /opt/hpcx/nccl_rdma_sharp_plugin/lib/libnccl-net.so
|
| 31 |
+
t-20260518193619-wnzpp-worker-1:10401:11281 [3] NCCL INFO Plugin Path : /opt/hpcx/nccl_rdma_sharp_plugin/lib/libnccl-net.so
|
| 32 |
+
t-20260518193619-wnzpp-worker-1:10405:11282 [7] NCCL INFO P2P plugin v9 IBext_v9
|
| 33 |
+
t-20260518193619-wnzpp-worker-1:10401:11281 [3] NCCL INFO P2P plugin v9 IBext_v9
|
| 34 |
+
t-20260518193619-wnzpp-worker-1:10405:11282 [7] NCCL INFO NCCL_SOCKET_IFNAME set by environment to eth1
|
| 35 |
+
t-20260518193619-wnzpp-worker-1:10401:11281 [3] NCCL INFO NCCL_SOCKET_IFNAME set by environment to eth1
|
| 36 |
+
t-20260518193619-wnzpp-worker-1:10398:11279 [0] NCCL INFO NET/Plugin: Loaded net plugin NCCL RDMA Plugin v9 (v9)
|
| 37 |
+
t-20260518193619-wnzpp-worker-1:10398:11279 [0] NCCL INFO NET/Plugin: Loaded collnet plugin SHARP (v9)
|
| 38 |
+
t-20260518193619-wnzpp-worker-1:10398:11279 [0] NCCL INFO Plugin Path : /opt/hpcx/nccl_rdma_sharp_plugin/lib/libnccl-net.so
|
| 39 |
+
t-20260518193619-wnzpp-worker-1:10398:11279 [0] NCCL INFO P2P plugin v9 IBext_v9
|
| 40 |
+
t-20260518193619-wnzpp-worker-1:10398:11279 [0] NCCL INFO NCCL_SOCKET_IFNAME set by environment to eth1
|
| 41 |
+
t-20260518193619-wnzpp-worker-1:10400:11280 [2] NCCL INFO NET/Plugin: Loaded net plugin NCCL RDMA Plugin v9 (v9)
|
| 42 |
+
t-20260518193619-wnzpp-worker-1:10400:11280 [2] NCCL INFO NET/Plugin: Loaded collnet plugin SHARP (v9)
|
| 43 |
+
t-20260518193619-wnzpp-worker-1:10400:11280 [2] NCCL INFO Plugin Path : /opt/hpcx/nccl_rdma_sharp_plugin/lib/libnccl-net.so
|
| 44 |
+
t-20260518193619-wnzpp-worker-1:10400:11280 [2] NCCL INFO P2P plugin v9 IBext_v9
|
| 45 |
+
t-20260518193619-wnzpp-worker-1:10400:11280 [2] NCCL INFO NCCL_SOCKET_IFNAME set by environment to eth1
|
| 46 |
+
t-20260518193619-wnzpp-worker-1:10398:11279 [0] NCCL INFO NCCL_IB_PCI_RELAXED_ORDERING set by environment to 1.
|
| 47 |
+
t-20260518193619-wnzpp-worker-1:10398:11279 [0] NCCL INFO NET/IB : Using [0]mlx5_1:1/RoCE [1]mlx5_4:1/RoCE [2]mlx5_5:1/RoCE [3]mlx5_6:1/RoCE [4]mlx5_7:1/RoCE [5]mlx5_8:1/RoCE [6]mlx5_9:1/RoCE [7]mlx5_10:1/RoCE [RO]; OOB eth1:10.82.80.12<0>
|
| 48 |
+
t-20260518193619-wnzpp-worker-1:10400:11280 [2] NCCL INFO NCCL_IB_PCI_RELAXED_ORDERING set by environment to 1.
|
| 49 |
+
t-20260518193619-wnzpp-worker-1:10400:11280 [2] NCCL INFO NET/IB : Using [0]mlx5_1:1/RoCE [1]mlx5_4:1/RoCE [2]mlx5_5:1/RoCE [3]mlx5_6:1/RoCE [4]mlx5_7:1/RoCE [5]mlx5_8:1/RoCE [6]mlx5_9:1/RoCE [7]mlx5_10:1/RoCE [RO]; OOB eth1:10.82.80.12<0>
|
| 50 |
+
t-20260518193619-wnzpp-worker-1:10405:11282 [7] NCCL INFO NCCL_IB_PCI_RELAXED_ORDERING set by environment to 1.
|
| 51 |
+
t-20260518193619-wnzpp-worker-1:10405:11282 [7] NCCL INFO NET/IB : Using [0]mlx5_1:1/RoCE [1]mlx5_4:1/RoCE [2]mlx5_5:1/RoCE [3]mlx5_6:1/RoCE [4]mlx5_7:1/RoCE [5]mlx5_8:1/RoCE [6]mlx5_9:1/RoCE [7]mlx5_10:1/RoCE [RO]; OOB eth1:10.82.80.12<0>
|
| 52 |
+
t-20260518193619-wnzpp-worker-1:10401:11281 [3] NCCL INFO NCCL_IB_PCI_RELAXED_ORDERING set by environment to 1.
|
| 53 |
+
t-20260518193619-wnzpp-worker-1:10401:11281 [3] NCCL INFO NET/IB : Using [0]mlx5_1:1/RoCE [1]mlx5_4:1/RoCE [2]mlx5_5:1/RoCE [3]mlx5_6:1/RoCE [4]mlx5_7:1/RoCE [5]mlx5_8:1/RoCE [6]mlx5_9:1/RoCE [7]mlx5_10:1/RoCE [RO]; OOB eth1:10.82.80.12<0>
|
| 54 |
+
t-20260518193619-wnzpp-worker-1:10400:11280 [2] NCCL INFO PROFILER/Plugin: Could not find: libnccl-profiler.so.
|
| 55 |
+
t-20260518193619-wnzpp-worker-1:10400:11280 [2] NCCL INFO Using network IBext_v9
|
| 56 |
+
t-20260518193619-wnzpp-worker-1:10405:11282 [7] NCCL INFO PROFILER/Plugin: Could not find: libnccl-profiler.so.
|
| 57 |
+
t-20260518193619-wnzpp-worker-1:10405:11282 [7] NCCL INFO Using network IBext_v9
|
| 58 |
+
t-20260518193619-wnzpp-worker-1:10398:11279 [0] NCCL INFO PROFILER/Plugin: Could not find: libnccl-profiler.so.
|
| 59 |
+
t-20260518193619-wnzpp-worker-1:10398:11279 [0] NCCL INFO Using network IBext_v9
|
| 60 |
+
t-20260518193619-wnzpp-worker-1:10401:11281 [3] NCCL INFO PROFILER/Plugin: Could not find: libnccl-profiler.so.
|
| 61 |
+
t-20260518193619-wnzpp-worker-1:10401:11281 [3] NCCL INFO Using network IBext_v9
|
| 62 |
+
t-20260518193619-wnzpp-worker-1:10402:11283 [4] NCCL INFO NET/Plugin: Loaded net plugin NCCL RDMA Plugin v9 (v9)
|
| 63 |
+
t-20260518193619-wnzpp-worker-1:10402:11283 [4] NCCL INFO NET/Plugin: Loaded collnet plugin SHARP (v9)
|
| 64 |
+
t-20260518193619-wnzpp-worker-1:10402:11283 [4] NCCL INFO Plugin Path : /opt/hpcx/nccl_rdma_sharp_plugin/lib/libnccl-net.so
|
| 65 |
+
t-20260518193619-wnzpp-worker-1:10402:11283 [4] NCCL INFO P2P plugin v9 IBext_v9
|
| 66 |
+
t-20260518193619-wnzpp-worker-1:10402:11283 [4] NCCL INFO NCCL_SOCKET_IFNAME set by environment to eth1
|
| 67 |
+
t-20260518193619-wnzpp-worker-1:10398:11279 [0] NCCL INFO ncclCommInitRankConfig comm 0xce77c80 rank 0 nranks 8 cudaDev 0 nvmlDev 0 busId 65040 commId 0x824180f3dd2d3478 - Init START
|
| 68 |
+
t-20260518193619-wnzpp-worker-1:10401:11281 [3] NCCL INFO ncclCommInitRankConfig comm 0xeb83a20 rank 3 nranks 8 cudaDev 3 nvmlDev 3 busId 6b020 commId 0x824180f3dd2d3478 - Init START
|
| 69 |
+
t-20260518193619-wnzpp-worker-1:10400:11280 [2] NCCL INFO ncclCommInitRankConfig comm 0x729d7ec0 rank 2 nranks 8 cudaDev 2 nvmlDev 2 busId 69020 commId 0x824180f3dd2d3478 - Init START
|
| 70 |
+
t-20260518193619-wnzpp-worker-1:10405:11282 [7] NCCL INFO ncclCommInitRankConfig comm 0xeeabba0 rank 7 nranks 8 cudaDev 7 nvmlDev 7 busId 75020 commId 0x824180f3dd2d3478 - Init START
|
| 71 |
+
t-20260518193619-wnzpp-worker-1:10403:10403 [5] NCCL INFO cudaDriverVersion 12080
|
| 72 |
+
t-20260518193619-wnzpp-worker-1:10403:10403 [5] NCCL INFO NCCL_SOCKET_IFNAME set by environment to eth1
|
| 73 |
+
t-20260518193619-wnzpp-worker-1:10403:10403 [5] NCCL INFO Bootstrap: Using eth1:10.82.80.12<0>
|
| 74 |
+
t-20260518193619-wnzpp-worker-1:10403:10403 [5] NCCL INFO NCCL version 2.25.1+cuda12.8
|
| 75 |
+
t-20260518193619-wnzpp-worker-1:10403:10403 [5] NCCL INFO Comm config Blocking set to 1
|
| 76 |
+
t-20260518193619-wnzpp-worker-1:10402:11283 [4] NCCL INFO NCCL_IB_PCI_RELAXED_ORDERING set by environment to 1.
|
| 77 |
+
t-20260518193619-wnzpp-worker-1:10402:11283 [4] NCCL INFO NET/IB : Using [0]mlx5_1:1/RoCE [1]mlx5_4:1/RoCE [2]mlx5_5:1/RoCE [3]mlx5_6:1/RoCE [4]mlx5_7:1/RoCE [5]mlx5_8:1/RoCE [6]mlx5_9:1/RoCE [7]mlx5_10:1/RoCE [RO]; OOB eth1:10.82.80.12<0>
|
| 78 |
+
t-20260518193619-wnzpp-worker-1:10402:11283 [4] NCCL INFO PROFILER/Plugin: Could not find: libnccl-profiler.so.
|
| 79 |
+
t-20260518193619-wnzpp-worker-1:10402:11283 [4] NCCL INFO Using network IBext_v9
|
| 80 |
+
t-20260518193619-wnzpp-worker-1:10404:10404 [6] NCCL INFO cudaDriverVersion 12080
|
| 81 |
+
t-20260518193619-wnzpp-worker-1:10404:10404 [6] NCCL INFO NCCL_SOCKET_IFNAME set by environment to eth1
|
| 82 |
+
t-20260518193619-wnzpp-worker-1:10402:11283 [4] NCCL INFO ncclCommInitRankConfig comm 0xe035080 rank 4 nranks 8 cudaDev 4 nvmlDev 4 busId 6f020 commId 0x824180f3dd2d3478 - Init START
|
| 83 |
+
t-20260518193619-wnzpp-worker-1:10404:10404 [6] NCCL INFO Bootstrap: Using eth1:10.82.80.12<0>
|
| 84 |
+
t-20260518193619-wnzpp-worker-1:10401:11281 [3] NCCL INFO RAS client listening socket at ::1<28028>
|
| 85 |
+
t-20260518193619-wnzpp-worker-1:10404:10404 [6] NCCL INFO NCCL version 2.25.1+cuda12.8
|
| 86 |
+
t-20260518193619-wnzpp-worker-1:10404:10404 [6] NCCL INFO Comm config Blocking set to 1
|
| 87 |
+
t-20260518193619-wnzpp-worker-1:10403:11323 [5] NCCL INFO NET/Plugin: Loaded net plugin NCCL RDMA Plugin v9 (v9)
|
| 88 |
+
t-20260518193619-wnzpp-worker-1:10403:11323 [5] NCCL INFO NET/Plugin: Loaded collnet plugin SHARP (v9)
|
| 89 |
+
t-20260518193619-wnzpp-worker-1:10403:11323 [5] NCCL INFO Plugin Path : /opt/hpcx/nccl_rdma_sharp_plugin/lib/libnccl-net.so
|
| 90 |
+
t-20260518193619-wnzpp-worker-1:10403:11323 [5] NCCL INFO P2P plugin v9 IBext_v9
|
| 91 |
+
t-20260518193619-wnzpp-worker-1:10403:11323 [5] NCCL INFO NCCL_SOCKET_IFNAME set by environment to eth1
|
| 92 |
+
t-20260518193619-wnzpp-worker-1:10403:11323 [5] NCCL INFO NCCL_IB_PCI_RELAXED_ORDERING set by environment to 1.
|
| 93 |
+
t-20260518193619-wnzpp-worker-1:10403:11323 [5] NCCL INFO NET/IB : Using [0]mlx5_1:1/RoCE [1]mlx5_4:1/RoCE [2]mlx5_5:1/RoCE [3]mlx5_6:1/RoCE [4]mlx5_7:1/RoCE [5]mlx5_8:1/RoCE [6]mlx5_9:1/RoCE [7]mlx5_10:1/RoCE [RO]; OOB eth1:10.82.80.12<0>
|
| 94 |
+
t-20260518193619-wnzpp-worker-1:10403:11323 [5] NCCL INFO PROFILER/Plugin: Could not find: libnccl-profiler.so.
|
| 95 |
+
t-20260518193619-wnzpp-worker-1:10403:11323 [5] NCCL INFO Using network IBext_v9
|
| 96 |
+
t-20260518193619-wnzpp-worker-1:10403:11323 [5] NCCL INFO ncclCommInitRankConfig comm 0xe2d8120 rank 5 nranks 8 cudaDev 5 nvmlDev 5 busId 71020 commId 0x824180f3dd2d3478 - Init START
|
| 97 |
+
t-20260518193619-wnzpp-worker-1:10402:11283 [4] NCCL INFO RAS client listening socket at ::1<28028>
|
| 98 |
+
t-20260518193619-wnzpp-worker-1:10404:11326 [6] NCCL INFO NET/Plugin: Loaded net plugin NCCL RDMA Plugin v9 (v9)
|
| 99 |
+
t-20260518193619-wnzpp-worker-1:10404:11326 [6] NCCL INFO NET/Plugin: Loaded collnet plugin SHARP (v9)
|
| 100 |
+
t-20260518193619-wnzpp-worker-1:10404:11326 [6] NCCL INFO Plugin Path : /opt/hpcx/nccl_rdma_sharp_plugin/lib/libnccl-net.so
|
| 101 |
+
t-20260518193619-wnzpp-worker-1:10404:11326 [6] NCCL INFO P2P plugin v9 IBext_v9
|
| 102 |
+
t-20260518193619-wnzpp-worker-1:10404:11326 [6] NCCL INFO NCCL_SOCKET_IFNAME set by environment to eth1
|
| 103 |
+
t-20260518193619-wnzpp-worker-1:10399:10399 [1] NCCL INFO cudaDriverVersion 12080
|
| 104 |
+
t-20260518193619-wnzpp-worker-1:10399:10399 [1] NCCL INFO NCCL_SOCKET_IFNAME set by environment to eth1
|
| 105 |
+
t-20260518193619-wnzpp-worker-1:10399:10399 [1] NCCL INFO Bootstrap: Using eth1:10.82.80.12<0>
|
| 106 |
+
t-20260518193619-wnzpp-worker-1:10399:10399 [1] NCCL INFO NCCL version 2.25.1+cuda12.8
|
| 107 |
+
t-20260518193619-wnzpp-worker-1:10399:10399 [1] NCCL INFO Comm config Blocking set to 1
|
| 108 |
+
t-20260518193619-wnzpp-worker-1:10404:11326 [6] NCCL INFO NCCL_IB_PCI_RELAXED_ORDERING set by environment to 1.
|
| 109 |
+
t-20260518193619-wnzpp-worker-1:10404:11326 [6] NCCL INFO NET/IB : Using [0]mlx5_1:1/RoCE [1]mlx5_4:1/RoCE [2]mlx5_5:1/RoCE [3]mlx5_6:1/RoCE [4]mlx5_7:1/RoCE [5]mlx5_8:1/RoCE [6]mlx5_9:1/RoCE [7]mlx5_10:1/RoCE [RO]; OOB eth1:10.82.80.12<0>
|
| 110 |
+
t-20260518193619-wnzpp-worker-1:10404:11326 [6] NCCL INFO PROFILER/Plugin: Could not find: libnccl-profiler.so.
|
| 111 |
+
t-20260518193619-wnzpp-worker-1:10404:11326 [6] NCCL INFO Using network IBext_v9
|
| 112 |
+
t-20260518193619-wnzpp-worker-1:10404:11326 [6] NCCL INFO ncclCommInitRankConfig comm 0xdf37090 rank 6 nranks 8 cudaDev 6 nvmlDev 6 busId 73020 commId 0x824180f3dd2d3478 - Init START
|
| 113 |
+
t-20260518193619-wnzpp-worker-1:10404:11326 [6] NCCL INFO RAS client listening socket at ::1<28028>
|
| 114 |
+
t-20260518193619-wnzpp-worker-1:10405:11282 [7] NCCL INFO RAS client listening socket at ::1<28028>
|
| 115 |
+
t-20260518193619-wnzpp-worker-1:10403:11323 [5] NCCL INFO RAS client listening socket at ::1<28028>
|
| 116 |
+
t-20260518193619-wnzpp-worker-1:10399:11342 [1] NCCL INFO NET/Plugin: Loaded net plugin NCCL RDMA Plugin v9 (v9)
|
| 117 |
+
t-20260518193619-wnzpp-worker-1:10399:11342 [1] NCCL INFO NET/Plugin: Loaded collnet plugin SHARP (v9)
|
| 118 |
+
t-20260518193619-wnzpp-worker-1:10399:11342 [1] NCCL INFO Plugin Path : /opt/hpcx/nccl_rdma_sharp_plugin/lib/libnccl-net.so
|
| 119 |
+
t-20260518193619-wnzpp-worker-1:10399:11342 [1] NCCL INFO P2P plugin v9 IBext_v9
|
| 120 |
+
t-20260518193619-wnzpp-worker-1:10399:11342 [1] NCCL INFO NCCL_SOCKET_IFNAME set by environment to eth1
|
| 121 |
+
t-20260518193619-wnzpp-worker-1:10399:11342 [1] NCCL INFO NCCL_IB_PCI_RELAXED_ORDERING set by environment to 1.
|
| 122 |
+
t-20260518193619-wnzpp-worker-1:10399:11342 [1] NCCL INFO NET/IB : Using [0]mlx5_1:1/RoCE [1]mlx5_4:1/RoCE [2]mlx5_5:1/RoCE [3]mlx5_6:1/RoCE [4]mlx5_7:1/RoCE [5]mlx5_8:1/RoCE [6]mlx5_9:1/RoCE [7]mlx5_10:1/RoCE [RO]; OOB eth1:10.82.80.12<0>
|
| 123 |
+
t-20260518193619-wnzpp-worker-1:10399:11342 [1] NCCL INFO PROFILER/Plugin: Could not find: libnccl-profiler.so.
|
| 124 |
+
t-20260518193619-wnzpp-worker-1:10399:11342 [1] NCCL INFO Using network IBext_v9
|
| 125 |
+
t-20260518193619-wnzpp-worker-1:10399:11342 [1] NCCL INFO ncclCommInitRankConfig comm 0xd6b7b10 rank 1 nranks 8 cudaDev 1 nvmlDev 1 busId 67020 commId 0x824180f3dd2d3478 - Init START
|
| 126 |
+
t-20260518193619-wnzpp-worker-1:10399:11342 [1] NCCL INFO RAS client listening socket at ::1<28028>
|
| 127 |
+
t-20260518193619-wnzpp-worker-1:10400:11280 [2] NCCL INFO RAS client listening socket at ::1<28028>
|
| 128 |
+
t-20260518193619-wnzpp-worker-1:10398:11279 [0] NCCL INFO RAS client listening socket at ::1<28028>
|
| 129 |
+
t-20260518193619-wnzpp-worker-1:10399:11342 [1] NCCL INFO Bootstrap timings total 0.026702 (create 0.000020, send 0.000077, recv 0.000089, ring 0.000340, delay 0.000001)
|
| 130 |
+
t-20260518193619-wnzpp-worker-1:10398:11279 [0] NCCL INFO Bootstrap timings total 0.863898 (create 0.000032, send 0.000100, recv 0.837254, ring 0.000255, delay 0.000001)
|
| 131 |
+
t-20260518193619-wnzpp-worker-1:10400:11280 [2] NCCL INFO Bootstrap timings total 0.857636 (create 0.000021, send 0.000078, recv 0.000046, ring 0.000260, delay 0.000001)
|
| 132 |
+
t-20260518193619-wnzpp-worker-1:10404:11326 [6] NCCL INFO Bootstrap timings total 0.262683 (create 0.000031, send 0.000087, recv 0.000146, ring 0.262040, delay 0.000001)
|
| 133 |
+
t-20260518193619-wnzpp-worker-1:10403:11323 [5] NCCL INFO Bootstrap timings total 0.482491 (create 0.000020, send 0.000069, recv 0.219969, ring 0.262044, delay 0.000001)
|
| 134 |
+
t-20260518193619-wnzpp-worker-1:10405:11282 [7] NCCL INFO Bootstrap timings total 0.856549 (create 0.000020, send 0.000084, recv 0.000027, ring 0.262031, delay 0.000001)
|
| 135 |
+
t-20260518193619-wnzpp-worker-1:10401:11281 [3] NCCL INFO Bootstrap timings total 0.859727 (create 0.000029, send 0.000070, recv 0.134877, ring 0.700013, delay 0.000001)
|
| 136 |
+
t-20260518193619-wnzpp-worker-1:10402:11283 [4] NCCL INFO Bootstrap timings total 0.724939 (create 0.000022, send 0.000068, recv 0.242543, ring 0.481955, delay 0.000001)
|
| 137 |
+
t-20260518193619-wnzpp-worker-1:10399:11342 [1] NCCL INFO MNNVL busId 0x67020 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0
|
| 138 |
+
t-20260518193619-wnzpp-worker-1:10400:11280 [2] NCCL INFO MNNVL busId 0x69020 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0
|
| 139 |
+
t-20260518193619-wnzpp-worker-1:10398:11279 [0] NCCL INFO MNNVL busId 0x65040 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0
|
| 140 |
+
t-20260518193619-wnzpp-worker-1:10403:11323 [5] NCCL INFO MNNVL busId 0x71020 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0
|
| 141 |
+
t-20260518193619-wnzpp-worker-1:10404:11326 [6] NCCL INFO MNNVL busId 0x73020 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0
|
| 142 |
+
t-20260518193619-wnzpp-worker-1:10405:11282 [7] NCCL INFO MNNVL busId 0x75020 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0
|
| 143 |
+
t-20260518193619-wnzpp-worker-1:10402:11283 [4] NCCL INFO MNNVL busId 0x6f020 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0
|
| 144 |
+
t-20260518193619-wnzpp-worker-1:10401:11281 [3] NCCL INFO MNNVL busId 0x6b020 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0
|
| 145 |
+
t-20260518193619-wnzpp-worker-1:10399:11342 [1] NCCL INFO NCCL_TOPO_FILE set by environment to /var/run/nvidia-topologyd/virtualTopology.xml
|
| 146 |
+
t-20260518193619-wnzpp-worker-1:10403:11323 [5] NCCL INFO NCCL_TOPO_FILE set by environment to /var/run/nvidia-topologyd/virtualTopology.xml
|
| 147 |
+
t-20260518193619-wnzpp-worker-1:10401:11281 [3] NCCL INFO NCCL_TOPO_FILE set by environment to /var/run/nvidia-topologyd/virtualTopology.xml
|
| 148 |
+
t-20260518193619-wnzpp-worker-1:10398:11279 [0] NCCL INFO NCCL_TOPO_FILE set by environment to /var/run/nvidia-topologyd/virtualTopology.xml
|
| 149 |
+
t-20260518193619-wnzpp-worker-1:10402:11283 [4] NCCL INFO NCCL_TOPO_FILE set by environment to /var/run/nvidia-topologyd/virtualTopology.xml
|
| 150 |
+
t-20260518193619-wnzpp-worker-1:10405:11282 [7] NCCL INFO NCCL_TOPO_FILE set by environment to /var/run/nvidia-topologyd/virtualTopology.xml
|
| 151 |
+
t-20260518193619-wnzpp-worker-1:10404:11326 [6] NCCL INFO NCCL_TOPO_FILE set by environment to /var/run/nvidia-topologyd/virtualTopology.xml
|
| 152 |
+
t-20260518193619-wnzpp-worker-1:10400:11280 [2] NCCL INFO NCCL_TOPO_FILE set by environment to /var/run/nvidia-topologyd/virtualTopology.xml
|
| 153 |
+
t-20260518193619-wnzpp-worker-1:10403:11323 [5] NCCL INFO Setting affinity for GPU 5 to 0fffff,ffffffff,ffffffff,fc000000,00000000,00000000
|
| 154 |
+
t-20260518193619-wnzpp-worker-1:10401:11281 [3] NCCL INFO Setting affinity for GPU 3 to 03ffffff,ffffffff,ffffffff
|
| 155 |
+
t-20260518193619-wnzpp-worker-1:10405:11282 [7] NCCL INFO Setting affinity for GPU 7 to 0fffff,ffffffff,ffffffff,fc000000,00000000,00000000
|
| 156 |
+
t-20260518193619-wnzpp-worker-1:10402:11283 [4] NCCL INFO Setting affinity for GPU 4 to 0fffff,ffffffff,ffffffff,fc000000,00000000,00000000
|
| 157 |
+
t-20260518193619-wnzpp-worker-1:10404:11326 [6] NCCL INFO Setting affinity for GPU 6 to 0fffff,ffffffff,ffffffff,fc000000,00000000,00000000
|
| 158 |
+
t-20260518193619-wnzpp-worker-1:10399:11342 [1] NCCL INFO Setting affinity for GPU 1 to 03ffffff,ffffffff,ffffffff
|
| 159 |
+
t-20260518193619-wnzpp-worker-1:10400:11280 [2] NCCL INFO Setting affinity for GPU 2 to 03ffffff,ffffffff,ffffffff
|
| 160 |
+
t-20260518193619-wnzpp-worker-1:10398:11279 [0] NCCL INFO Setting affinity for GPU 0 to 03ffffff,ffffffff,ffffffff
|
| 161 |
+
t-20260518193619-wnzpp-worker-1:10403:11323 [5] NCCL INFO NVLS multicast support is available on dev 5
|
| 162 |
+
t-20260518193619-wnzpp-worker-1:10405:11282 [7] NCCL INFO NVLS multicast support is available on dev 7
|
| 163 |
+
t-20260518193619-wnzpp-worker-1:10400:11280 [2] NCCL INFO NVLS multicast support is available on dev 2
|
| 164 |
+
t-20260518193619-wnzpp-worker-1:10401:11281 [3] NCCL INFO NVLS multicast support is available on dev 3
|
| 165 |
+
t-20260518193619-wnzpp-worker-1:10402:11283 [4] NCCL INFO NVLS multicast support is available on dev 4
|
| 166 |
+
t-20260518193619-wnzpp-worker-1:10399:11342 [1] NCCL INFO NVLS multicast support is available on dev 1
|
| 167 |
+
t-20260518193619-wnzpp-worker-1:10398:11279 [0] NCCL INFO NVLS multicast support is available on dev 0
|
| 168 |
+
t-20260518193619-wnzpp-worker-1:10404:11326 [6] NCCL INFO NVLS multicast support is available on dev 6
|
| 169 |
+
t-20260518193619-wnzpp-worker-1:10404:11326 [6] NCCL INFO comm 0xdf37090 rank 6 nRanks 8 nNodes 1 localRanks 8 localRank 6 MNNVL 0
|
| 170 |
+
t-20260518193619-wnzpp-worker-1:10403:11323 [5] NCCL INFO comm 0xe2d8120 rank 5 nRanks 8 nNodes 1 localRanks 8 localRank 5 MNNVL 0
|
| 171 |
+
t-20260518193619-wnzpp-worker-1:10400:11280 [2] NCCL INFO comm 0x729d7ec0 rank 2 nRanks 8 nNodes 1 localRanks 8 localRank 2 MNNVL 0
|
| 172 |
+
t-20260518193619-wnzpp-worker-1:10401:11281 [3] NCCL INFO comm 0xeb83a20 rank 3 nRanks 8 nNodes 1 localRanks 8 localRank 3 MNNVL 0
|
| 173 |
+
t-20260518193619-wnzpp-worker-1:10402:11283 [4] NCCL INFO comm 0xe035080 rank 4 nRanks 8 nNodes 1 localRanks 8 localRank 4 MNNVL 0
|
| 174 |
+
t-20260518193619-wnzpp-worker-1:10399:11342 [1] NCCL INFO comm 0xd6b7b10 rank 1 nRanks 8 nNodes 1 localRanks 8 localRank 1 MNNVL 0
|
| 175 |
+
t-20260518193619-wnzpp-worker-1:10398:11279 [0] NCCL INFO comm 0xce77c80 rank 0 nRanks 8 nNodes 1 localRanks 8 localRank 0 MNNVL 0
|
| 176 |
+
t-20260518193619-wnzpp-worker-1:10405:11282 [7] NCCL INFO comm 0xeeabba0 rank 7 nRanks 8 nNodes 1 localRanks 8 localRank 7 MNNVL 0
|
| 177 |
+
t-20260518193619-wnzpp-worker-1:10398:11279 [0] NCCL INFO Channel 00/24 : 0 1 2 3 4 5 6 7
|
| 178 |
+
t-20260518193619-wnzpp-worker-1:10398:11279 [0] NCCL INFO Channel 01/24 : 0 1 2 3 4 5 6 7
|
| 179 |
+
t-20260518193619-wnzpp-worker-1:10403:11323 [5] NCCL INFO Trees [0] 6/-1/-1->5->4 [1] 6/-1/-1->5->4 [2] 6/-1/-1->5->4 [3] 6/-1/-1->5->4 [4] 6/-1/-1->5->4 [5] 6/-1/-1->5->4 [6] 6/-1/-1->5->4 [7] 6/-1/-1->5->4 [8] 6/-1/-1->5->4 [9] 6/-1/-1->5->4 [10] 6/-1/-1->5->4 [11] 6/-1/-1->5->4 [12] 6/-1/-1->5->4 [13] 6/-1/-1->5->4 [14] 6/-1/-1->5->4 [15] 6/-1/-1->5->4 [16] 6/-1/-1->5->4 [17] 6/-1/-1->5->4 [18] 6/-1/-1->5->4 [19] 6/-1/-1->5->4 [20] 6/-1/-1->5->4 [21] 6/-1/-1->5->4 [22] 6/-1/-1->5->4 [23] 6/-1/-1->5->4
|
| 180 |
+
t-20260518193619-wnzpp-worker-1:10398:11279 [0] NCCL INFO Channel 02/24 : 0 1 2 3 4 5 6 7
|
| 181 |
+
t-20260518193619-wnzpp-worker-1:10398:11279 [0] NCCL INFO Channel 03/24 : 0 1 2 3 4 5 6 7
|
| 182 |
+
t-20260518193619-wnzpp-worker-1:10403:11323 [5] NCCL INFO P2P Chunksize set to 524288
|
| 183 |
+
t-20260518193619-wnzpp-worker-1:10400:11280 [2] NCCL INFO Trees [0] 3/-1/-1->2->1 [1] 3/-1/-1->2->1 [2] 3/-1/-1->2->1 [3] 3/-1/-1->2->1 [4] 3/-1/-1->2->1 [5] 3/-1/-1->2->1 [6] 3/-1/-1->2->1 [7] 3/-1/-1->2->1 [8] 3/-1/-1->2->1 [9] 3/-1/-1->2->1 [10] 3/-1/-1->2->1 [11] 3/-1/-1->2->1 [12] 3/-1/-1->2->1 [13] 3/-1/-1->2->1 [14] 3/-1/-1->2->1 [15] 3/-1/-1->2->1 [16] 3/-1/-1->2->1 [17] 3/-1/-1->2->1 [18] 3/-1/-1->2->1 [19] 3/-1/-1->2->1 [20] 3/-1/-1->2->1 [21] 3/-1/-1->2->1 [22] 3/-1/-1->2->1 [23] 3/-1/-1->2->1
|
| 184 |
+
t-20260518193619-wnzpp-worker-1:10398:11279 [0] NCCL INFO Channel 04/24 : 0 1 2 3 4 5 6 7
|
| 185 |
+
t-20260518193619-wnzpp-worker-1:10402:11283 [4] NCCL INFO Trees [0] 5/-1/-1->4->3 [1] 5/-1/-1->4->3 [2] 5/-1/-1->4->3 [3] 5/-1/-1->4->3 [4] 5/-1/-1->4->3 [5] 5/-1/-1->4->3 [6] 5/-1/-1->4->3 [7] 5/-1/-1->4->3 [8] 5/-1/-1->4->3 [9] 5/-1/-1->4->3 [10] 5/-1/-1->4->3 [11] 5/-1/-1->4->3 [12] 5/-1/-1->4->3 [13] 5/-1/-1->4->3 [14] 5/-1/-1->4->3 [15] 5/-1/-1->4->3 [16] 5/-1/-1->4->3 [17] 5/-1/-1->4->3 [18] 5/-1/-1->4->3 [19] 5/-1/-1->4->3 [20] 5/-1/-1->4->3 [21] 5/-1/-1->4->3 [22] 5/-1/-1->4->3 [23] 5/-1/-1->4->3
|
| 186 |
+
t-20260518193619-wnzpp-worker-1:10401:11281 [3] NCCL INFO Trees [0] 4/-1/-1->3->2 [1] 4/-1/-1->3->2 [2] 4/-1/-1->3->2 [3] 4/-1/-1->3->2 [4] 4/-1/-1->3->2 [5] 4/-1/-1->3->2 [6] 4/-1/-1->3->2 [7] 4/-1/-1->3->2 [8] 4/-1/-1->3->2 [9] 4/-1/-1->3->2 [10] 4/-1/-1->3->2 [11] 4/-1/-1->3->2 [12] 4/-1/-1->3->2 [13] 4/-1/-1->3->2 [14] 4/-1/-1->3->2 [15] 4/-1/-1->3->2 [16] 4/-1/-1->3->2 [17] 4/-1/-1->3->2 [18] 4/-1/-1->3->2 [19] 4/-1/-1->3->2 [20] 4/-1/-1->3->2 [21] 4/-1/-1->3->2 [22] 4/-1/-1->3->2 [23] 4/-1/-1->3->2
|
| 187 |
+
t-20260518193619-wnzpp-worker-1:10399:11342 [1] NCCL INFO Trees [0] 2/-1/-1->1->0 [1] 2/-1/-1->1->0 [2] 2/-1/-1->1->0 [3] 2/-1/-1->1->0 [4] 2/-1/-1->1->0 [5] 2/-1/-1->1->0 [6] 2/-1/-1->1->0 [7] 2/-1/-1->1->0 [8] 2/-1/-1->1->0 [9] 2/-1/-1->1->0 [10] 2/-1/-1->1->0 [11] 2/-1/-1->1->0 [12] 2/-1/-1->1->0 [13] 2/-1/-1->1->0 [14] 2/-1/-1->1->0 [15] 2/-1/-1->1->0 [16] 2/-1/-1->1->0 [17] 2/-1/-1->1->0 [18] 2/-1/-1->1->0 [19] 2/-1/-1->1->0 [20] 2/-1/-1->1->0 [21] 2/-1/-1->1->0 [22] 2/-1/-1->1->0 [23] 2/-1/-1->1->0
|
| 188 |
+
t-20260518193619-wnzpp-worker-1:10400:11280 [2] NCCL INFO P2P Chunksize set to 524288
|
| 189 |
+
t-20260518193619-wnzpp-worker-1:10398:11279 [0] NCCL INFO Channel 05/24 : 0 1 2 3 4 5 6 7
|
| 190 |
+
t-20260518193619-wnzpp-worker-1:10402:11283 [4] NCCL INFO P2P Chunksize set to 524288
|
| 191 |
+
t-20260518193619-wnzpp-worker-1:10401:11281 [3] NCCL INFO P2P Chunksize set to 524288
|
| 192 |
+
t-20260518193619-wnzpp-worker-1:10399:11342 [1] NCCL INFO P2P Chunksize set to 524288
|
| 193 |
+
t-20260518193619-wnzpp-worker-1:10405:11282 [7] NCCL INFO Trees [0] -1/-1/-1->7->6 [1] -1/-1/-1->7->6 [2] -1/-1/-1->7->6 [3] -1/-1/-1->7->6 [4] -1/-1/-1->7->6 [5] -1/-1/-1->7->6 [6] -1/-1/-1->7->6 [7] -1/-1/-1->7->6 [8] -1/-1/-1->7->6 [9] -1/-1/-1->7->6 [10] -1/-1/-1->7->6 [11] -1/-1/-1->7->6 [12] -1/-1/-1->7->6 [13] -1/-1/-1->7->6 [14] -1/-1/-1->7->6 [15] -1/-1/-1->7->6 [16] -1/-1/-1->7->6 [17] -1/-1/-1->7->6 [18] -1/-1/-1->7->6 [19] -1/-1/-1->7->6 [20] -1/-1/-1->7->6 [21] -1/-1/-1->7->6 [22] -1/-1/-1->7->6 [23] -1/-1/-1->7->6
|
| 194 |
+
t-20260518193619-wnzpp-worker-1:10398:11279 [0] NCCL INFO Channel 06/24 : 0 1 2 3 4 5 6 7
|
| 195 |
+
t-20260518193619-wnzpp-worker-1:10404:11326 [6] NCCL INFO Trees [0] 7/-1/-1->6->5 [1] 7/-1/-1->6->5 [2] 7/-1/-1->6->5 [3] 7/-1/-1->6->5 [4] 7/-1/-1->6->5 [5] 7/-1/-1->6->5 [6] 7/-1/-1->6->5 [7] 7/-1/-1->6->5 [8] 7/-1/-1->6->5 [9] 7/-1/-1->6->5 [10] 7/-1/-1->6->5 [11] 7/-1/-1->6->5 [12] 7/-1/-1->6->5 [13] 7/-1/-1->6->5 [14] 7/-1/-1->6->5 [15] 7/-1/-1->6->5 [16] 7/-1/-1->6->5 [17] 7/-1/-1->6->5 [18] 7/-1/-1->6->5 [19] 7/-1/-1->6->5 [20] 7/-1/-1->6->5 [21] 7/-1/-1->6->5 [22] 7/-1/-1->6->5 [23] 7/-1/-1->6->5
|
| 196 |
+
t-20260518193619-wnzpp-worker-1:10405:11282 [7] NCCL INFO P2P Chunksize set to 524288
|
| 197 |
+
t-20260518193619-wnzpp-worker-1:10398:11279 [0] NCCL INFO Channel 07/24 : 0 1 2 3 4 5 6 7
|
| 198 |
+
t-20260518193619-wnzpp-worker-1:10404:11326 [6] NCCL INFO P2P Chunksize set to 524288
|
| 199 |
+
t-20260518193619-wnzpp-worker-1:10398:11279 [0] NCCL INFO Channel 08/24 : 0 1 2 3 4 5 6 7
|
| 200 |
+
t-20260518193619-wnzpp-worker-1:10398:11279 [0] NCCL INFO Channel 09/24 : 0 1 2 3 4 5 6 7
|
| 201 |
+
t-20260518193619-wnzpp-worker-1:10398:11279 [0] NCCL INFO Channel 10/24 : 0 1 2 3 4 5 6 7
|
| 202 |
+
t-20260518193619-wnzpp-worker-1:10398:11279 [0] NCCL INFO Channel 11/24 : 0 1 2 3 4 5 6 7
|
| 203 |
+
t-20260518193619-wnzpp-worker-1:10398:11279 [0] NCCL INFO Channel 12/24 : 0 1 2 3 4 5 6 7
|
| 204 |
+
t-20260518193619-wnzpp-worker-1:10398:11279 [0] NCCL INFO Channel 13/24 : 0 1 2 3 4 5 6 7
|
| 205 |
+
t-20260518193619-wnzpp-worker-1:10398:11279 [0] NCCL INFO Channel 14/24 : 0 1 2 3 4 5 6 7
|
| 206 |
+
t-20260518193619-wnzpp-worker-1:10398:11279 [0] NCCL INFO Channel 15/24 : 0 1 2 3 4 5 6 7
|
| 207 |
+
t-20260518193619-wnzpp-worker-1:10398:11279 [0] NCCL INFO Channel 16/24 : 0 1 2 3 4 5 6 7
|
| 208 |
+
t-20260518193619-wnzpp-worker-1:10398:11279 [0] NCCL INFO Channel 17/24 : 0 1 2 3 4 5 6 7
|
| 209 |
+
t-20260518193619-wnzpp-worker-1:10398:11279 [0] NCCL INFO Channel 18/24 : 0 1 2 3 4 5 6 7
|
| 210 |
+
t-20260518193619-wnzpp-worker-1:10398:11279 [0] NCCL INFO Channel 19/24 : 0 1 2 3 4 5 6 7
|
| 211 |
+
t-20260518193619-wnzpp-worker-1:10398:11279 [0] NCCL INFO Channel 20/24 : 0 1 2 3 4 5 6 7
|
| 212 |
+
t-20260518193619-wnzpp-worker-1:10398:11279 [0] NCCL INFO Channel 21/24 : 0 1 2 3 4 5 6 7
|
| 213 |
+
t-20260518193619-wnzpp-worker-1:10398:11279 [0] NCCL INFO Channel 22/24 : 0 1 2 3 4 5 6 7
|
| 214 |
+
t-20260518193619-wnzpp-worker-1:10398:11279 [0] NCCL INFO Channel 23/24 : 0 1 2 3 4 5 6 7
|
| 215 |
+
t-20260518193619-wnzpp-worker-1:10398:11279 [0] NCCL INFO Trees [0] 1/-1/-1->0->-1 [1] 1/-1/-1->0->-1 [2] 1/-1/-1->0->-1 [3] 1/-1/-1->0->-1 [4] 1/-1/-1->0->-1 [5] 1/-1/-1->0->-1 [6] 1/-1/-1->0->-1 [7] 1/-1/-1->0->-1 [8] 1/-1/-1->0->-1 [9] 1/-1/-1->0->-1 [10] 1/-1/-1->0->-1 [11] 1/-1/-1->0->-1 [12] 1/-1/-1->0->-1 [13] 1/-1/-1->0->-1 [14] 1/-1/-1->0->-1 [15] 1/-1/-1->0->-1 [16] 1/-1/-1->0->-1 [17] 1/-1/-1->0->-1 [18] 1/-1/-1->0->-1 [19] 1/-1/-1->0->-1 [20] 1/-1/-1->0->-1 [21] 1/-1/-1->0->-1 [22] 1/-1/-1->0->-1 [23] 1/-1/-1->0->-1
|
| 216 |
+
t-20260518193619-wnzpp-worker-1:10398:11279 [0] NCCL INFO P2P Chunksize set to 524288
|
| 217 |
+
t-20260518193619-wnzpp-worker-1:10400:11359 [2] NCCL INFO [Proxy Service] Device 2 CPU core 12
|
| 218 |
+
t-20260518193619-wnzpp-worker-1:10402:11362 [4] NCCL INFO [Proxy Service] Device 4 CPU core 126
|
| 219 |
+
t-20260518193619-wnzpp-worker-1:10399:11361 [1] NCCL INFO [Proxy Service] Device 1 CPU core 9
|
| 220 |
+
t-20260518193619-wnzpp-worker-1:10401:11364 [3] NCCL INFO [Proxy Service] Device 3 CPU core 23
|
| 221 |
+
t-20260518193619-wnzpp-worker-1:10398:11279 [0] NCCL INFO Check P2P Type intraNodeP2pSupport 1 directMode 0
|
| 222 |
+
t-20260518193619-wnzpp-worker-1:10405:11367 [7] NCCL INFO [Proxy Service] Device 7 CPU core 94
|
| 223 |
+
t-20260518193619-wnzpp-worker-1:10404:11369 [6] NCCL INFO [Proxy Service] Device 6 CPU core 96
|
| 224 |
+
t-20260518193619-wnzpp-worker-1:10398:11368 [0] NCCL INFO [Proxy Service] Device 0 CPU core 34
|
| 225 |
+
t-20260518193619-wnzpp-worker-1:10401:11366 [3] NCCL INFO [Proxy Service UDS] Device 3 CPU core 86
|
| 226 |
+
t-20260518193619-wnzpp-worker-1:10399:11363 [1] NCCL INFO [Proxy Service UDS] Device 1 CPU core 21
|
| 227 |
+
t-20260518193619-wnzpp-worker-1:10400:11360 [2] NCCL INFO [Proxy Service UDS] Device 2 CPU core 20
|
| 228 |
+
t-20260518193619-wnzpp-worker-1:10402:11365 [4] NCCL INFO [Proxy Service UDS] Device 4 CPU core 128
|
| 229 |
+
t-20260518193619-wnzpp-worker-1:10404:11372 [6] NCCL INFO [Proxy Service UDS] Device 6 CPU core 98
|
| 230 |
+
t-20260518193619-wnzpp-worker-1:10405:11370 [7] NCCL INFO [Proxy Service UDS] Device 7 CPU core 99
|
| 231 |
+
t-20260518193619-wnzpp-worker-1:10398:11371 [0] NCCL INFO [Proxy Service UDS] Device 0 CPU core 32
|
| 232 |
+
t-20260518193619-wnzpp-worker-1:10403:11373 [5] NCCL INFO [Proxy Service] Device 5 CPU core 174
|
| 233 |
+
t-20260518193619-wnzpp-worker-1:10403:11374 [5] NCCL INFO [Proxy Service UDS] Device 5 CPU core 176
|
| 234 |
+
t-20260518193619-wnzpp-worker-1:10398:11279 [0] NCCL INFO threadThresholds 8/8/64 | 64/8/64 | 512 | 512
|
| 235 |
+
t-20260518193619-wnzpp-worker-1:10398:11279 [0] NCCL INFO 24 coll channels, 24 collnet channels, 16 nvls channels, 32 p2p channels, 32 p2p channels per peer
|
| 236 |
+
t-20260518193619-wnzpp-worker-1:10401:11281 [3] NCCL INFO threadThresholds 8/8/64 | 64/8/64 | 512 | 512
|
| 237 |
+
t-20260518193619-wnzpp-worker-1:10401:11281 [3] NCCL INFO 24 coll channels, 24 collnet channels, 16 nvls channels, 32 p2p channels, 32 p2p channels per peer
|
| 238 |
+
t-20260518193619-wnzpp-worker-1:10403:11323 [5] NCCL INFO threadThresholds 8/8/64 | 64/8/64 | 512 | 512
|
| 239 |
+
t-20260518193619-wnzpp-worker-1:10403:11323 [5] NCCL INFO 24 coll channels, 24 collnet channels, 16 nvls channels, 32 p2p channels, 32 p2p channels per peer
|
| 240 |
+
t-20260518193619-wnzpp-worker-1:10399:11342 [1] NCCL INFO threadThresholds 8/8/64 | 64/8/64 | 512 | 512
|
| 241 |
+
t-20260518193619-wnzpp-worker-1:10399:11342 [1] NCCL INFO 24 coll channels, 24 collnet channels, 16 nvls channels, 32 p2p channels, 32 p2p channels per peer
|
| 242 |
+
t-20260518193619-wnzpp-worker-1:10398:11279 [0] NCCL INFO CC Off, workFifoBytes 1048576
|
| 243 |
+
t-20260518193619-wnzpp-worker-1:10404:11326 [6] NCCL INFO threadThresholds 8/8/64 | 64/8/64 | 512 | 512
|
| 244 |
+
t-20260518193619-wnzpp-worker-1:10404:11326 [6] NCCL INFO 24 coll channels, 24 collnet channels, 16 nvls channels, 32 p2p channels, 32 p2p channels per peer
|
| 245 |
+
t-20260518193619-wnzpp-worker-1:10402:11283 [4] NCCL INFO threadThresholds 8/8/64 | 64/8/64 | 512 | 512
|
| 246 |
+
t-20260518193619-wnzpp-worker-1:10402:11283 [4] NCCL INFO 24 coll channels, 24 collnet channels, 16 nvls channels, 32 p2p channels, 32 p2p channels per peer
|
| 247 |
+
t-20260518193619-wnzpp-worker-1:10400:11280 [2] NCCL INFO threadThresholds 8/8/64 | 64/8/64 | 512 | 512
|
| 248 |
+
t-20260518193619-wnzpp-worker-1:10400:11280 [2] NCCL INFO 24 coll channels, 24 collnet channels, 16 nvls channels, 32 p2p channels, 32 p2p channels per peer
|
| 249 |
+
t-20260518193619-wnzpp-worker-1:10405:11282 [7] NCCL INFO threadThresholds 8/8/64 | 64/8/64 | 512 | 512
|
| 250 |
+
t-20260518193619-wnzpp-worker-1:10405:11282 [7] NCCL INFO 24 coll channels, 24 collnet channels, 16 nvls channels, 32 p2p channels, 32 p2p channels per peer
|
| 251 |
+
t-20260518193619-wnzpp-worker-1:10400:11280 [2] NCCL INFO TUNER/Plugin: Failed to find ncclTunerPlugin_v4 symbol.
|
| 252 |
+
t-20260518193619-wnzpp-worker-1:10401:11281 [3] NCCL INFO TUNER/Plugin: Failed to find ncclTunerPlugin_v4 symbol.
|
| 253 |
+
t-20260518193619-wnzpp-worker-1:10398:11279 [0] NCCL INFO TUNER/Plugin: Failed to find ncclTunerPlugin_v4 symbol.
|
| 254 |
+
t-20260518193619-wnzpp-worker-1:10399:11342 [1] NCCL INFO TUNER/Plugin: Failed to find ncclTunerPlugin_v4 symbol.
|
| 255 |
+
t-20260518193619-wnzpp-worker-1:10404:11326 [6] NCCL INFO TUNER/Plugin: Failed to find ncclTunerPlugin_v4 symbol.
|
| 256 |
+
t-20260518193619-wnzpp-worker-1:10402:11283 [4] NCCL INFO TUNER/Plugin: Failed to find ncclTunerPlugin_v4 symbol.
|
| 257 |
+
t-20260518193619-wnzpp-worker-1:10403:11323 [5] NCCL INFO TUNER/Plugin: Failed to find ncclTunerPlugin_v4 symbol.
|
| 258 |
+
t-20260518193619-wnzpp-worker-1:10401:11281 [3] NCCL INFO TUNER/Plugin: Failed to find ncclTunerPlugin_v3 symbol.
|
| 259 |
+
t-20260518193619-wnzpp-worker-1:10403:11323 [5] NCCL INFO TUNER/Plugin: Failed to find ncclTunerPlugin_v3 symbol.
|
| 260 |
+
t-20260518193619-wnzpp-worker-1:10400:11280 [2] NCCL INFO TUNER/Plugin: Failed to find ncclTunerPlugin_v3 symbol.
|
| 261 |
+
t-20260518193619-wnzpp-worker-1:10398:11279 [0] NCCL INFO TUNER/Plugin: Failed to find ncclTunerPlugin_v3 symbol.
|
| 262 |
+
t-20260518193619-wnzpp-worker-1:10401:11281 [3] NCCL INFO TUNER/Plugin: Failed to find ncclTunerPlugin_v2 symbol, using internal tuner instead.
|
| 263 |
+
t-20260518193619-wnzpp-worker-1:10399:11342 [1] NCCL INFO TUNER/Plugin: Failed to find ncclTunerPlugin_v3 symbol.
|
| 264 |
+
t-20260518193619-wnzpp-worker-1:10404:11326 [6] NCCL INFO TUNER/Plugin: Failed to find ncclTunerPlugin_v3 symbol.
|
| 265 |
+
t-20260518193619-wnzpp-worker-1:10402:11283 [4] NCCL INFO TUNER/Plugin: Failed to find ncclTunerPlugin_v3 symbol.
|
| 266 |
+
t-20260518193619-wnzpp-worker-1:10398:11279 [0] NCCL INFO TUNER/Plugin: Failed to find ncclTunerPlugin_v2 symbol, using internal tuner instead.
|
| 267 |
+
t-20260518193619-wnzpp-worker-1:10400:11280 [2] NCCL INFO TUNER/Plugin: Failed to find ncclTunerPlugin_v2 symbol, using internal tuner instead.
|
| 268 |
+
t-20260518193619-wnzpp-worker-1:10399:11342 [1] NCCL INFO TUNER/Plugin: Failed to find ncclTunerPlugin_v2 symbol, using internal tuner instead.
|
| 269 |
+
t-20260518193619-wnzpp-worker-1:10401:11281 [3] NCCL INFO ncclCommInitRankConfig comm 0xeb83a20 rank 3 nranks 8 cudaDev 3 nvmlDev 3 busId 6b020 commId 0x824180f3dd2d3478 - Init COMPLETE
|
| 270 |
+
t-20260518193619-wnzpp-worker-1:10398:11279 [0] NCCL INFO ncclCommInitRankConfig comm 0xce77c80 rank 0 nranks 8 cudaDev 0 nvmlDev 0 busId 65040 commId 0x824180f3dd2d3478 - Init COMPLETE
|
| 271 |
+
t-20260518193619-wnzpp-worker-1:10400:11280 [2] NCCL INFO ncclCommInitRankConfig comm 0x729d7ec0 rank 2 nranks 8 cudaDev 2 nvmlDev 2 busId 69020 commId 0x824180f3dd2d3478 - Init COMPLETE
|
| 272 |
+
t-20260518193619-wnzpp-worker-1:10405:11282 [7] NCCL INFO TUNER/Plugin: Failed to find ncclTunerPlugin_v4 symbol.
|
| 273 |
+
t-20260518193619-wnzpp-worker-1:10403:11323 [5] NCCL INFO TUNER/Plugin: Failed to find ncclTunerPlugin_v2 symbol, using internal tuner instead.
|
| 274 |
+
t-20260518193619-wnzpp-worker-1:10402:11283 [4] NCCL INFO TUNER/Plugin: Failed to find ncclTunerPlugin_v2 symbol, using internal tuner instead.
|
| 275 |
+
t-20260518193619-wnzpp-worker-1:10404:11326 [6] NCCL INFO TUNER/Plugin: Failed to find ncclTunerPlugin_v2 symbol, using internal tuner instead.
|
| 276 |
+
t-20260518193619-wnzpp-worker-1:10399:11342 [1] NCCL INFO ncclCommInitRankConfig comm 0xd6b7b10 rank 1 nranks 8 cudaDev 1 nvmlDev 1 busId 67020 commId 0x824180f3dd2d3478 - Init COMPLETE
|
| 277 |
+
t-20260518193619-wnzpp-worker-1:10401:11281 [3] NCCL INFO Init timings - ncclCommInitRankConfig: rank 3 nranks 8 total 2.50 (kernels 0.22, alloc 0.57, bootstrap 0.86, allgathers 0.01, topo 0.53, graphs 0.01, connections 0.28, rest 0.02)
|
| 278 |
+
t-20260518193619-wnzpp-worker-1:10398:11279 [0] NCCL INFO Init timings - ncclCommInitRankConfig: rank 0 nranks 8 total 2.50 (kernels 0.23, alloc 0.56, bootstrap 0.86, allgathers 0.01, topo 0.53, graphs 0.01, connections 0.28, rest 0.03)
|
| 279 |
+
t-20260518193619-wnzpp-worker-1:10400:11280 [2] NCCL INFO Init timings - ncclCommInitRankConfig: rank 2 nranks 8 total 2.50 (kernels 0.22, alloc 0.57, bootstrap 0.86, allgathers 0.01, topo 0.53, graphs 0.01, connections 0.29, rest 0.02)
|
| 280 |
+
t-20260518193619-wnzpp-worker-1:10402:11283 [4] NCCL INFO ncclCommInitRankConfig comm 0xe035080 rank 4 nranks 8 cudaDev 4 nvmlDev 4 busId 6f020 commId 0x824180f3dd2d3478 - Init COMPLETE
|
| 281 |
+
t-20260518193619-wnzpp-worker-1:10405:11282 [7] NCCL INFO TUNER/Plugin: Failed to find ncclTunerPlugin_v3 symbol.
|
| 282 |
+
t-20260518193619-wnzpp-worker-1:10404:11326 [6] NCCL INFO ncclCommInitRankConfig comm 0xdf37090 rank 6 nranks 8 cudaDev 6 nvmlDev 6 busId 73020 commId 0x824180f3dd2d3478 - Init COMPLETE
|
| 283 |
+
t-20260518193619-wnzpp-worker-1:10403:11323 [5] NCCL INFO ncclCommInitRankConfig comm 0xe2d8120 rank 5 nranks 8 cudaDev 5 nvmlDev 5 busId 71020 commId 0x824180f3dd2d3478 - Init COMPLETE
|
| 284 |
+
t-20260518193619-wnzpp-worker-1:10405:11282 [7] NCCL INFO TUNER/Plugin: Failed to find ncclTunerPlugin_v2 symbol, using internal tuner instead.
|
| 285 |
+
t-20260518193619-wnzpp-worker-1:10399:11342 [1] NCCL INFO Init timings - ncclCommInitRankConfig: rank 1 nranks 8 total 1.25 (kernels 0.20, alloc 0.17, bootstrap 0.03, allgathers 0.01, topo 0.53, graphs 0.01, connections 0.28, rest 0.02)
|
| 286 |
+
t-20260518193619-wnzpp-worker-1:10402:11283 [4] NCCL INFO Init timings - ncclCommInitRankConfig: rank 4 nranks 8 total 2.36 (kernels 0.51, alloc 0.27, bootstrap 0.72, allgathers 0.01, topo 0.53, graphs 0.01, connections 0.29, rest 0.02)
|
| 287 |
+
t-20260518193619-wnzpp-worker-1:10403:11323 [5] NCCL INFO Init timings - ncclCommInitRankConfig: rank 5 nranks 8 total 1.71 (kernels 0.19, alloc 0.18, bootstrap 0.48, allgathers 0.01, topo 0.53, graphs 0.01, connections 0.28, rest 0.03)
|
| 288 |
+
t-20260518193619-wnzpp-worker-1:10404:11326 [6] NCCL INFO Init timings - ncclCommInitRankConfig: rank 6 nranks 8 total 1.55 (kernels 0.26, alloc 0.18, bootstrap 0.26, allgathers 0.00, topo 0.53, graphs 0.01, connections 0.29, rest 0.02)
|
| 289 |
+
t-20260518193619-wnzpp-worker-1:10405:11282 [7] NCCL INFO ncclCommInitRankConfig comm 0xeeabba0 rank 7 nranks 8 cudaDev 7 nvmlDev 7 busId 75020 commId 0x824180f3dd2d3478 - Init COMPLETE
|
| 290 |
+
t-20260518193619-wnzpp-worker-1:10405:11282 [7] NCCL INFO Init timings - ncclCommInitRankConfig: rank 7 nranks 8 total 2.50 (kernels 0.22, alloc 0.57, bootstrap 0.86, allgathers 0.00, topo 0.53, graphs 0.01, connections 0.29, rest 0.02)
|
| 291 |
+
t-20260518193619-wnzpp-worker-1:10398:11377 [0] NCCL INFO Channel 00/0 : 0[0] -> 1[1] via P2P/CUMEM
|
| 292 |
+
t-20260518193619-wnzpp-worker-1:10398:11377 [0] NCCL INFO Channel 01/0 : 0[0] -> 1[1] via P2P/CUMEM
|
| 293 |
+
t-20260518193619-wnzpp-worker-1:10398:11377 [0] NCCL INFO Channel 02/0 : 0[0] -> 1[1] via P2P/CUMEM
|
| 294 |
+
t-20260518193619-wnzpp-worker-1:10398:11377 [0] NCCL INFO Channel 03/0 : 0[0] -> 1[1] via P2P/CUMEM
|
| 295 |
+
t-20260518193619-wnzpp-worker-1:10398:11377 [0] NCCL INFO Channel 04/0 : 0[0] -> 1[1] via P2P/CUMEM
|
| 296 |
+
t-20260518193619-wnzpp-worker-1:10398:11377 [0] NCCL INFO Channel 05/0 : 0[0] -> 1[1] via P2P/CUMEM
|
| 297 |
+
t-20260518193619-wnzpp-worker-1:10398:11377 [0] NCCL INFO Channel 06/0 : 0[0] -> 1[1] via P2P/CUMEM
|
| 298 |
+
t-20260518193619-wnzpp-worker-1:10402:11376 [4] NCCL INFO Channel 00/0 : 4[4] -> 5[5] via P2P/CUMEM
|
| 299 |
+
t-20260518193619-wnzpp-worker-1:10398:11377 [0] NCCL INFO Channel 07/0 : 0[0] -> 1[1] via P2P/CUMEM
|
| 300 |
+
t-20260518193619-wnzpp-worker-1:10402:11376 [4] NCCL INFO Channel 01/0 : 4[4] -> 5[5] via P2P/CUMEM
|
| 301 |
+
t-20260518193619-wnzpp-worker-1:10398:11377 [0] NCCL INFO Channel 08/0 : 0[0] -> 1[1] via P2P/CUMEM
|
| 302 |
+
t-20260518193619-wnzpp-worker-1:10400:11375 [2] NCCL INFO Channel 00/0 : 2[2] -> 3[3] via P2P/CUMEM
|
| 303 |
+
t-20260518193619-wnzpp-worker-1:10402:11376 [4] NCCL INFO Channel 02/0 : 4[4] -> 5[5] via P2P/CUMEM
|
| 304 |
+
t-20260518193619-wnzpp-worker-1:10403:11379 [5] NCCL INFO Channel 00/0 : 5[5] -> 6[6] via P2P/CUMEM
|
| 305 |
+
t-20260518193619-wnzpp-worker-1:10398:11377 [0] NCCL INFO Channel 09/0 : 0[0] -> 1[1] via P2P/CUMEM
|
| 306 |
+
t-20260518193619-wnzpp-worker-1:10400:11375 [2] NCCL INFO Channel 01/0 : 2[2] -> 3[3] via P2P/CUMEM
|
| 307 |
+
t-20260518193619-wnzpp-worker-1:10402:11376 [4] NCCL INFO Channel 03/0 : 4[4] -> 5[5] via P2P/CUMEM
|
| 308 |
+
t-20260518193619-wnzpp-worker-1:10403:11379 [5] NCCL INFO Channel 01/0 : 5[5] -> 6[6] via P2P/CUMEM
|
| 309 |
+
t-20260518193619-wnzpp-worker-1:10398:11377 [0] NCCL INFO Channel 10/0 : 0[0] -> 1[1] via P2P/CUMEM
|
| 310 |
+
t-20260518193619-wnzpp-worker-1:10400:11375 [2] NCCL INFO Channel 02/0 : 2[2] -> 3[3] via P2P/CUMEM
|
| 311 |
+
t-20260518193619-wnzpp-worker-1:10401:11380 [3] NCCL INFO Channel 00/0 : 3[3] -> 4[4] via P2P/CUMEM
|
| 312 |
+
t-20260518193619-wnzpp-worker-1:10402:11376 [4] NCCL INFO Channel 04/0 : 4[4] -> 5[5] via P2P/CUMEM
|
| 313 |
+
t-20260518193619-wnzpp-worker-1:10403:11379 [5] NCCL INFO Channel 02/0 : 5[5] -> 6[6] via P2P/CUMEM
|
| 314 |
+
t-20260518193619-wnzpp-worker-1:10398:11377 [0] NCCL INFO Channel 11/0 : 0[0] -> 1[1] via P2P/CUMEM
|
| 315 |
+
t-20260518193619-wnzpp-worker-1:10398:11377 [0] NCCL INFO Channel 12/0 : 0[0] -> 1[1] via P2P/CUMEM
|
| 316 |
+
t-20260518193619-wnzpp-worker-1:10398:11377 [0] NCCL INFO Channel 13/0 : 0[0] -> 1[1] via P2P/CUMEM
|
| 317 |
+
t-20260518193619-wnzpp-worker-1:10404:11381 [6] NCCL INFO Channel 00/0 : 6[6] -> 7[7] via P2P/CUMEM
|
| 318 |
+
t-20260518193619-wnzpp-worker-1:10398:11377 [0] NCCL INFO Channel 14/0 : 0[0] -> 1[1] via P2P/CUMEM
|
| 319 |
+
t-20260518193619-wnzpp-worker-1:10404:11381 [6] NCCL INFO Channel 01/0 : 6[6] -> 7[7] via P2P/CUMEM
|
| 320 |
+
t-20260518193619-wnzpp-worker-1:10398:11377 [0] NCCL INFO Channel 15/0 : 0[0] -> 1[1] via P2P/CUMEM
|
| 321 |
+
t-20260518193619-wnzpp-worker-1:10405:11378 [7] NCCL INFO Channel 00/0 : 7[7] -> 0[0] via P2P/CUMEM
|
| 322 |
+
t-20260518193619-wnzpp-worker-1:10404:11381 [6] NCCL INFO Channel 02/0 : 6[6] -> 7[7] via P2P/CUMEM
|
| 323 |
+
t-20260518193619-wnzpp-worker-1:10398:11377 [0] NCCL INFO Channel 16/0 : 0[0] -> 1[1] via P2P/CUMEM
|
| 324 |
+
t-20260518193619-wnzpp-worker-1:10405:11378 [7] NCCL INFO Channel 01/0 : 7[7] -> 0[0] via P2P/CUMEM
|
| 325 |
+
t-20260518193619-wnzpp-worker-1:10404:11381 [6] NCCL INFO Channel 03/0 : 6[6] -> 7[7] via P2P/CUMEM
|
| 326 |
+
t-20260518193619-wnzpp-worker-1:10398:11377 [0] NCCL INFO Channel 17/0 : 0[0] -> 1[1] via P2P/CUMEM
|
| 327 |
+
t-20260518193619-wnzpp-worker-1:10405:11378 [7] NCCL INFO Channel 02/0 : 7[7] -> 0[0] via P2P/CUMEM
|
| 328 |
+
t-20260518193619-wnzpp-worker-1:10404:11381 [6] NCCL INFO Channel 04/0 : 6[6] -> 7[7] via P2P/CUMEM
|
| 329 |
+
t-20260518193619-wnzpp-worker-1:10398:11377 [0] NCCL INFO Channel 18/0 : 0[0] -> 1[1] via P2P/CUMEM
|
| 330 |
+
t-20260518193619-wnzpp-worker-1:10405:11378 [7] NCCL INFO Channel 03/0 : 7[7] -> 0[0] via P2P/CUMEM
|
| 331 |
+
t-20260518193619-wnzpp-worker-1:10404:11381 [6] NCCL INFO Channel 05/0 : 6[6] -> 7[7] via P2P/CUMEM
|
| 332 |
+
t-20260518193619-wnzpp-worker-1:10398:11377 [0] NCCL INFO Channel 19/0 : 0[0] -> 1[1] via P2P/CUMEM
|
| 333 |
+
t-20260518193619-wnzpp-worker-1:10405:11378 [7] NCCL INFO Channel 04/0 : 7[7] -> 0[0] via P2P/CUMEM
|
| 334 |
+
t-20260518193619-wnzpp-worker-1:10404:11381 [6] NCCL INFO Channel 06/0 : 6[6] -> 7[7] via P2P/CUMEM
|
| 335 |
+
t-20260518193619-wnzpp-worker-1:10398:11377 [0] NCCL INFO Channel 20/0 : 0[0] -> 1[1] via P2P/CUMEM
|
| 336 |
+
t-20260518193619-wnzpp-worker-1:10405:11378 [7] NCCL INFO Channel 05/0 : 7[7] -> 0[0] via P2P/CUMEM
|
| 337 |
+
t-20260518193619-wnzpp-worker-1:10404:11381 [6] NCCL INFO Channel 07/0 : 6[6] -> 7[7] via P2P/CUMEM
|
| 338 |
+
t-20260518193619-wnzpp-worker-1:10398:11377 [0] NCCL INFO Channel 21/0 : 0[0] -> 1[1] via P2P/CUMEM
|
| 339 |
+
t-20260518193619-wnzpp-worker-1:10405:11378 [7] NCCL INFO Channel 06/0 : 7[7] -> 0[0] via P2P/CUMEM
|
| 340 |
+
t-20260518193619-wnzpp-worker-1:10404:11381 [6] NCCL INFO Channel 08/0 : 6[6] -> 7[7] via P2P/CUMEM
|
| 341 |
+
t-20260518193619-wnzpp-worker-1:10398:11377 [0] NCCL INFO Channel 22/0 : 0[0] -> 1[1] via P2P/CUMEM
|
| 342 |
+
t-20260518193619-wnzpp-worker-1:10405:11378 [7] NCCL INFO Channel 07/0 : 7[7] -> 0[0] via P2P/CUMEM
|
| 343 |
+
t-20260518193619-wnzpp-worker-1:10404:11381 [6] NCCL INFO Channel 09/0 : 6[6] -> 7[7] via P2P/CUMEM
|
| 344 |
+
t-20260518193619-wnzpp-worker-1:10400:11375 [2] NCCL INFO Channel 03/0 : 2[2] -> 3[3] via P2P/CUMEM
|
| 345 |
+
t-20260518193619-wnzpp-worker-1:10401:11380 [3] NCCL INFO Channel 01/0 : 3[3] -> 4[4] via P2P/CUMEM
|
| 346 |
+
t-20260518193619-wnzpp-worker-1:10402:11376 [4] NCCL INFO Channel 05/0 : 4[4] -> 5[5] via P2P/CUMEM
|
| 347 |
+
t-20260518193619-wnzpp-worker-1:10398:11377 [0] NCCL INFO Channel 23/0 : 0[0] -> 1[1] via P2P/CUMEM
|
| 348 |
+
t-20260518193619-wnzpp-worker-1:10403:11379 [5] NCCL INFO Channel 03/0 : 5[5] -> 6[6] via P2P/CUMEM
|
| 349 |
+
t-20260518193619-wnzpp-worker-1:10405:11378 [7] NCCL INFO Channel 08/0 : 7[7] -> 0[0] via P2P/CUMEM
|
| 350 |
+
t-20260518193619-wnzpp-worker-1:10404:11381 [6] NCCL INFO Channel 10/0 : 6[6] -> 7[7] via P2P/CUMEM
|
| 351 |
+
t-20260518193619-wnzpp-worker-1:10400:11375 [2] NCCL INFO Channel 04/0 : 2[2] -> 3[3] via P2P/CUMEM
|
| 352 |
+
t-20260518193619-wnzpp-worker-1:10401:11380 [3] NCCL INFO Channel 02/0 : 3[3] -> 4[4] via P2P/CUMEM
|
| 353 |
+
t-20260518193619-wnzpp-worker-1:10402:11376 [4] NCCL INFO Channel 06/0 : 4[4] -> 5[5] via P2P/CUMEM
|
| 354 |
+
t-20260518193619-wnzpp-worker-1:10403:11379 [5] NCCL INFO Channel 04/0 : 5[5] -> 6[6] via P2P/CUMEM
|
| 355 |
+
t-20260518193619-wnzpp-worker-1:10405:11378 [7] NCCL INFO Channel 09/0 : 7[7] -> 0[0] via P2P/CUMEM
|
| 356 |
+
t-20260518193619-wnzpp-worker-1:10404:11381 [6] NCCL INFO Channel 11/0 : 6[6] -> 7[7] via P2P/CUMEM
|
| 357 |
+
t-20260518193619-wnzpp-worker-1:10400:11375 [2] NCCL INFO Channel 05/0 : 2[2] -> 3[3] via P2P/CUMEM
|
| 358 |
+
t-20260518193619-wnzpp-worker-1:10401:11380 [3] NCCL INFO Channel 03/0 : 3[3] -> 4[4] via P2P/CUMEM
|
| 359 |
+
t-20260518193619-wnzpp-worker-1:10402:11376 [4] NCCL INFO Channel 07/0 : 4[4] -> 5[5] via P2P/CUMEM
|
| 360 |
+
t-20260518193619-wnzpp-worker-1:10403:11379 [5] NCCL INFO Channel 05/0 : 5[5] -> 6[6] via P2P/CUMEM
|
| 361 |
+
t-20260518193619-wnzpp-worker-1:10405:11378 [7] NCCL INFO Channel 10/0 : 7[7] -> 0[0] via P2P/CUMEM
|
| 362 |
+
t-20260518193619-wnzpp-worker-1:10404:11381 [6] NCCL INFO Channel 12/0 : 6[6] -> 7[7] via P2P/CUMEM
|
| 363 |
+
t-20260518193619-wnzpp-worker-1:10400:11375 [2] NCCL INFO Channel 06/0 : 2[2] -> 3[3] via P2P/CUMEM
|
| 364 |
+
t-20260518193619-wnzpp-worker-1:10401:11380 [3] NCCL INFO Channel 04/0 : 3[3] -> 4[4] via P2P/CUMEM
|
| 365 |
+
t-20260518193619-wnzpp-worker-1:10402:11376 [4] NCCL INFO Channel 08/0 : 4[4] -> 5[5] via P2P/CUMEM
|
| 366 |
+
t-20260518193619-wnzpp-worker-1:10403:11379 [5] NCCL INFO Channel 06/0 : 5[5] -> 6[6] via P2P/CUMEM
|
| 367 |
+
t-20260518193619-wnzpp-worker-1:10405:11378 [7] NCCL INFO Channel 11/0 : 7[7] -> 0[0] via P2P/CUMEM
|
| 368 |
+
t-20260518193619-wnzpp-worker-1:10404:11381 [6] NCCL INFO Channel 13/0 : 6[6] -> 7[7] via P2P/CUMEM
|
| 369 |
+
t-20260518193619-wnzpp-worker-1:10400:11375 [2] NCCL INFO Channel 07/0 : 2[2] -> 3[3] via P2P/CUMEM
|
| 370 |
+
t-20260518193619-wnzpp-worker-1:10401:11380 [3] NCCL INFO Channel 05/0 : 3[3] -> 4[4] via P2P/CUMEM
|
| 371 |
+
t-20260518193619-wnzpp-worker-1:10402:11376 [4] NCCL INFO Channel 09/0 : 4[4] -> 5[5] via P2P/CUMEM
|
| 372 |
+
t-20260518193619-wnzpp-worker-1:10403:11379 [5] NCCL INFO Channel 07/0 : 5[5] -> 6[6] via P2P/CUMEM
|
| 373 |
+
t-20260518193619-wnzpp-worker-1:10405:11378 [7] NCCL INFO Channel 12/0 : 7[7] -> 0[0] via P2P/CUMEM
|
| 374 |
+
t-20260518193619-wnzpp-worker-1:10399:11382 [1] NCCL INFO Channel 00/0 : 1[1] -> 2[2] via P2P/CUMEM
|
| 375 |
+
t-20260518193619-wnzpp-worker-1:10404:11381 [6] NCCL INFO Channel 14/0 : 6[6] -> 7[7] via P2P/CUMEM
|
| 376 |
+
t-20260518193619-wnzpp-worker-1:10400:11375 [2] NCCL INFO Channel 08/0 : 2[2] -> 3[3] via P2P/CUMEM
|
| 377 |
+
t-20260518193619-wnzpp-worker-1:10401:11380 [3] NCCL INFO Channel 06/0 : 3[3] -> 4[4] via P2P/CUMEM
|
| 378 |
+
t-20260518193619-wnzpp-worker-1:10402:11376 [4] NCCL INFO Channel 10/0 : 4[4] -> 5[5] via P2P/CUMEM
|
| 379 |
+
t-20260518193619-wnzpp-worker-1:10403:11379 [5] NCCL INFO Channel 08/0 : 5[5] -> 6[6] via P2P/CUMEM
|
| 380 |
+
t-20260518193619-wnzpp-worker-1:10405:11378 [7] NCCL INFO Channel 13/0 : 7[7] -> 0[0] via P2P/CUMEM
|
| 381 |
+
t-20260518193619-wnzpp-worker-1:10399:11382 [1] NCCL INFO Channel 01/0 : 1[1] -> 2[2] via P2P/CUMEM
|
| 382 |
+
t-20260518193619-wnzpp-worker-1:10404:11381 [6] NCCL INFO Channel 15/0 : 6[6] -> 7[7] via P2P/CUMEM
|
| 383 |
+
t-20260518193619-wnzpp-worker-1:10400:11375 [2] NCCL INFO Channel 09/0 : 2[2] -> 3[3] via P2P/CUMEM
|
| 384 |
+
t-20260518193619-wnzpp-worker-1:10401:11380 [3] NCCL INFO Channel 07/0 : 3[3] -> 4[4] via P2P/CUMEM
|
| 385 |
+
t-20260518193619-wnzpp-worker-1:10402:11376 [4] NCCL INFO Channel 11/0 : 4[4] -> 5[5] via P2P/CUMEM
|
| 386 |
+
t-20260518193619-wnzpp-worker-1:10403:11379 [5] NCCL INFO Channel 09/0 : 5[5] -> 6[6] via P2P/CUMEM
|
| 387 |
+
t-20260518193619-wnzpp-worker-1:10405:11378 [7] NCCL INFO Channel 14/0 : 7[7] -> 0[0] via P2P/CUMEM
|
| 388 |
+
t-20260518193619-wnzpp-worker-1:10399:11382 [1] NCCL INFO Channel 02/0 : 1[1] -> 2[2] via P2P/CUMEM
|
| 389 |
+
t-20260518193619-wnzpp-worker-1:10404:11381 [6] NCCL INFO Channel 16/0 : 6[6] -> 7[7] via P2P/CUMEM
|
| 390 |
+
t-20260518193619-wnzpp-worker-1:10400:11375 [2] NCCL INFO Channel 10/0 : 2[2] -> 3[3] via P2P/CUMEM
|
| 391 |
+
t-20260518193619-wnzpp-worker-1:10401:11380 [3] NCCL INFO Channel 08/0 : 3[3] -> 4[4] via P2P/CUMEM
|
| 392 |
+
t-20260518193619-wnzpp-worker-1:10402:11376 [4] NCCL INFO Channel 12/0 : 4[4] -> 5[5] via P2P/CUMEM
|
| 393 |
+
t-20260518193619-wnzpp-worker-1:10403:11379 [5] NCCL INFO Channel 10/0 : 5[5] -> 6[6] via P2P/CUMEM
|
| 394 |
+
t-20260518193619-wnzpp-worker-1:10405:11378 [7] NCCL INFO Channel 15/0 : 7[7] -> 0[0] via P2P/CUMEM
|
| 395 |
+
t-20260518193619-wnzpp-worker-1:10399:11382 [1] NCCL INFO Channel 03/0 : 1[1] -> 2[2] via P2P/CUMEM
|
| 396 |
+
t-20260518193619-wnzpp-worker-1:10404:11381 [6] NCCL INFO Channel 17/0 : 6[6] -> 7[7] via P2P/CUMEM
|
| 397 |
+
t-20260518193619-wnzpp-worker-1:10400:11375 [2] NCCL INFO Channel 11/0 : 2[2] -> 3[3] via P2P/CUMEM
|
| 398 |
+
t-20260518193619-wnzpp-worker-1:10401:11380 [3] NCCL INFO Channel 09/0 : 3[3] -> 4[4] via P2P/CUMEM
|
| 399 |
+
t-20260518193619-wnzpp-worker-1:10402:11376 [4] NCCL INFO Channel 13/0 : 4[4] -> 5[5] via P2P/CUMEM
|
| 400 |
+
t-20260518193619-wnzpp-worker-1:10403:11379 [5] NCCL INFO Channel 11/0 : 5[5] -> 6[6] via P2P/CUMEM
|
| 401 |
+
t-20260518193619-wnzpp-worker-1:10405:11378 [7] NCCL INFO Channel 16/0 : 7[7] -> 0[0] via P2P/CUMEM
|
| 402 |
+
t-20260518193619-wnzpp-worker-1:10399:11382 [1] NCCL INFO Channel 04/0 : 1[1] -> 2[2] via P2P/CUMEM
|
| 403 |
+
t-20260518193619-wnzpp-worker-1:10404:11381 [6] NCCL INFO Channel 18/0 : 6[6] -> 7[7] via P2P/CUMEM
|
| 404 |
+
t-20260518193619-wnzpp-worker-1:10400:11375 [2] NCCL INFO Channel 12/0 : 2[2] -> 3[3] via P2P/CUMEM
|
| 405 |
+
t-20260518193619-wnzpp-worker-1:10401:11380 [3] NCCL INFO Channel 10/0 : 3[3] -> 4[4] via P2P/CUMEM
|
| 406 |
+
t-20260518193619-wnzpp-worker-1:10402:11376 [4] NCCL INFO Channel 14/0 : 4[4] -> 5[5] via P2P/CUMEM
|
| 407 |
+
t-20260518193619-wnzpp-worker-1:10403:11379 [5] NCCL INFO Channel 12/0 : 5[5] -> 6[6] via P2P/CUMEM
|
| 408 |
+
t-20260518193619-wnzpp-worker-1:10405:11378 [7] NCCL INFO Channel 17/0 : 7[7] -> 0[0] via P2P/CUMEM
|
| 409 |
+
t-20260518193619-wnzpp-worker-1:10404:11381 [6] NCCL INFO Channel 19/0 : 6[6] -> 7[7] via P2P/CUMEM
|
| 410 |
+
t-20260518193619-wnzpp-worker-1:10399:11382 [1] NCCL INFO Channel 05/0 : 1[1] -> 2[2] via P2P/CUMEM
|
| 411 |
+
t-20260518193619-wnzpp-worker-1:10400:11375 [2] NCCL INFO Channel 13/0 : 2[2] -> 3[3] via P2P/CUMEM
|
| 412 |
+
t-20260518193619-wnzpp-worker-1:10401:11380 [3] NCCL INFO Channel 11/0 : 3[3] -> 4[4] via P2P/CUMEM
|
| 413 |
+
t-20260518193619-wnzpp-worker-1:10402:11376 [4] NCCL INFO Channel 15/0 : 4[4] -> 5[5] via P2P/CUMEM
|
| 414 |
+
t-20260518193619-wnzpp-worker-1:10403:11379 [5] NCCL INFO Channel 13/0 : 5[5] -> 6[6] via P2P/CUMEM
|
| 415 |
+
t-20260518193619-wnzpp-worker-1:10405:11378 [7] NCCL INFO Channel 18/0 : 7[7] -> 0[0] via P2P/CUMEM
|
| 416 |
+
t-20260518193619-wnzpp-worker-1:10404:11381 [6] NCCL INFO Channel 20/0 : 6[6] -> 7[7] via P2P/CUMEM
|
| 417 |
+
t-20260518193619-wnzpp-worker-1:10399:11382 [1] NCCL INFO Channel 06/0 : 1[1] -> 2[2] via P2P/CUMEM
|
| 418 |
+
t-20260518193619-wnzpp-worker-1:10400:11375 [2] NCCL INFO Channel 14/0 : 2[2] -> 3[3] via P2P/CUMEM
|
| 419 |
+
t-20260518193619-wnzpp-worker-1:10401:11380 [3] NCCL INFO Channel 12/0 : 3[3] -> 4[4] via P2P/CUMEM
|
| 420 |
+
t-20260518193619-wnzpp-worker-1:10402:11376 [4] NCCL INFO Channel 16/0 : 4[4] -> 5[5] via P2P/CUMEM
|
| 421 |
+
t-20260518193619-wnzpp-worker-1:10403:11379 [5] NCCL INFO Channel 14/0 : 5[5] -> 6[6] via P2P/CUMEM
|
| 422 |
+
t-20260518193619-wnzpp-worker-1:10405:11378 [7] NCCL INFO Channel 19/0 : 7[7] -> 0[0] via P2P/CUMEM
|
| 423 |
+
t-20260518193619-wnzpp-worker-1:10404:11381 [6] NCCL INFO Channel 21/0 : 6[6] -> 7[7] via P2P/CUMEM
|
| 424 |
+
t-20260518193619-wnzpp-worker-1:10399:11382 [1] NCCL INFO Channel 07/0 : 1[1] -> 2[2] via P2P/CUMEM
|
| 425 |
+
t-20260518193619-wnzpp-worker-1:10400:11375 [2] NCCL INFO Channel 15/0 : 2[2] -> 3[3] via P2P/CUMEM
|
| 426 |
+
t-20260518193619-wnzpp-worker-1:10401:11380 [3] NCCL INFO Channel 13/0 : 3[3] -> 4[4] via P2P/CUMEM
|
| 427 |
+
t-20260518193619-wnzpp-worker-1:10402:11376 [4] NCCL INFO Channel 17/0 : 4[4] -> 5[5] via P2P/CUMEM
|
| 428 |
+
t-20260518193619-wnzpp-worker-1:10403:11379 [5] NCCL INFO Channel 15/0 : 5[5] -> 6[6] via P2P/CUMEM
|
| 429 |
+
t-20260518193619-wnzpp-worker-1:10405:11378 [7] NCCL INFO Channel 20/0 : 7[7] -> 0[0] via P2P/CUMEM
|
| 430 |
+
t-20260518193619-wnzpp-worker-1:10404:11381 [6] NCCL INFO Channel 22/0 : 6[6] -> 7[7] via P2P/CUMEM
|
| 431 |
+
t-20260518193619-wnzpp-worker-1:10399:11382 [1] NCCL INFO Channel 08/0 : 1[1] -> 2[2] via P2P/CUMEM
|
| 432 |
+
t-20260518193619-wnzpp-worker-1:10400:11375 [2] NCCL INFO Channel 16/0 : 2[2] -> 3[3] via P2P/CUMEM
|
| 433 |
+
t-20260518193619-wnzpp-worker-1:10401:11380 [3] NCCL INFO Channel 14/0 : 3[3] -> 4[4] via P2P/CUMEM
|
| 434 |
+
t-20260518193619-wnzpp-worker-1:10402:11376 [4] NCCL INFO Channel 18/0 : 4[4] -> 5[5] via P2P/CUMEM
|
| 435 |
+
t-20260518193619-wnzpp-worker-1:10403:11379 [5] NCCL INFO Channel 16/0 : 5[5] -> 6[6] via P2P/CUMEM
|
| 436 |
+
t-20260518193619-wnzpp-worker-1:10405:11378 [7] NCCL INFO Channel 21/0 : 7[7] -> 0[0] via P2P/CUMEM
|
| 437 |
+
t-20260518193619-wnzpp-worker-1:10404:11381 [6] NCCL INFO Channel 23/0 : 6[6] -> 7[7] via P2P/CUMEM
|
| 438 |
+
t-20260518193619-wnzpp-worker-1:10399:11382 [1] NCCL INFO Channel 09/0 : 1[1] -> 2[2] via P2P/CUMEM
|
| 439 |
+
t-20260518193619-wnzpp-worker-1:10400:11375 [2] NCCL INFO Channel 17/0 : 2[2] -> 3[3] via P2P/CUMEM
|
| 440 |
+
t-20260518193619-wnzpp-worker-1:10401:11380 [3] NCCL INFO Channel 15/0 : 3[3] -> 4[4] via P2P/CUMEM
|
| 441 |
+
t-20260518193619-wnzpp-worker-1:10402:11376 [4] NCCL INFO Channel 19/0 : 4[4] -> 5[5] via P2P/CUMEM
|
| 442 |
+
t-20260518193619-wnzpp-worker-1:10403:11379 [5] NCCL INFO Channel 17/0 : 5[5] -> 6[6] via P2P/CUMEM
|
| 443 |
+
t-20260518193619-wnzpp-worker-1:10405:11378 [7] NCCL INFO Channel 22/0 : 7[7] -> 0[0] via P2P/CUMEM
|
| 444 |
+
t-20260518193619-wnzpp-worker-1:10399:11382 [1] NCCL INFO Channel 10/0 : 1[1] -> 2[2] via P2P/CUMEM
|
| 445 |
+
t-20260518193619-wnzpp-worker-1:10400:11375 [2] NCCL INFO Channel 18/0 : 2[2] -> 3[3] via P2P/CUMEM
|
| 446 |
+
t-20260518193619-wnzpp-worker-1:10401:11380 [3] NCCL INFO Channel 16/0 : 3[3] -> 4[4] via P2P/CUMEM
|
| 447 |
+
t-20260518193619-wnzpp-worker-1:10402:11376 [4] NCCL INFO Channel 20/0 : 4[4] -> 5[5] via P2P/CUMEM
|
| 448 |
+
t-20260518193619-wnzpp-worker-1:10403:11379 [5] NCCL INFO Channel 18/0 : 5[5] -> 6[6] via P2P/CUMEM
|
| 449 |
+
t-20260518193619-wnzpp-worker-1:10405:11378 [7] NCCL INFO Channel 23/0 : 7[7] -> 0[0] via P2P/CUMEM
|
| 450 |
+
t-20260518193619-wnzpp-worker-1:10399:11382 [1] NCCL INFO Channel 11/0 : 1[1] -> 2[2] via P2P/CUMEM
|
| 451 |
+
t-20260518193619-wnzpp-worker-1:10400:11375 [2] NCCL INFO Channel 19/0 : 2[2] -> 3[3] via P2P/CUMEM
|
| 452 |
+
t-20260518193619-wnzpp-worker-1:10401:11380 [3] NCCL INFO Channel 17/0 : 3[3] -> 4[4] via P2P/CUMEM
|
| 453 |
+
t-20260518193619-wnzpp-worker-1:10402:11376 [4] NCCL INFO Channel 21/0 : 4[4] -> 5[5] via P2P/CUMEM
|
| 454 |
+
t-20260518193619-wnzpp-worker-1:10403:11379 [5] NCCL INFO Channel 19/0 : 5[5] -> 6[6] via P2P/CUMEM
|
| 455 |
+
t-20260518193619-wnzpp-worker-1:10399:11382 [1] NCCL INFO Channel 12/0 : 1[1] -> 2[2] via P2P/CUMEM
|
| 456 |
+
t-20260518193619-wnzpp-worker-1:10400:11375 [2] NCCL INFO Channel 20/0 : 2[2] -> 3[3] via P2P/CUMEM
|
| 457 |
+
t-20260518193619-wnzpp-worker-1:10401:11380 [3] NCCL INFO Channel 18/0 : 3[3] -> 4[4] via P2P/CUMEM
|
| 458 |
+
t-20260518193619-wnzpp-worker-1:10402:11376 [4] NCCL INFO Channel 22/0 : 4[4] -> 5[5] via P2P/CUMEM
|
| 459 |
+
t-20260518193619-wnzpp-worker-1:10399:11382 [1] NCCL INFO Channel 13/0 : 1[1] -> 2[2] via P2P/CUMEM
|
| 460 |
+
t-20260518193619-wnzpp-worker-1:10402:11376 [4] NCCL INFO Channel 23/0 : 4[4] -> 5[5] via P2P/CUMEM
|
| 461 |
+
t-20260518193619-wnzpp-worker-1:10400:11375 [2] NCCL INFO Channel 21/0 : 2[2] -> 3[3] via P2P/CUMEM
|
| 462 |
+
t-20260518193619-wnzpp-worker-1:10401:11380 [3] NCCL INFO Channel 19/0 : 3[3] -> 4[4] via P2P/CUMEM
|
| 463 |
+
t-20260518193619-wnzpp-worker-1:10399:11382 [1] NCCL INFO Channel 14/0 : 1[1] -> 2[2] via P2P/CUMEM
|
| 464 |
+
t-20260518193619-wnzpp-worker-1:10400:11375 [2] NCCL INFO Channel 22/0 : 2[2] -> 3[3] via P2P/CUMEM
|
| 465 |
+
t-20260518193619-wnzpp-worker-1:10401:11380 [3] NCCL INFO Channel 20/0 : 3[3] -> 4[4] via P2P/CUMEM
|
| 466 |
+
t-20260518193619-wnzpp-worker-1:10399:11382 [1] NCCL INFO Channel 15/0 : 1[1] -> 2[2] via P2P/CUMEM
|
| 467 |
+
t-20260518193619-wnzpp-worker-1:10400:11375 [2] NCCL INFO Channel 23/0 : 2[2] -> 3[3] via P2P/CUMEM
|
| 468 |
+
t-20260518193619-wnzpp-worker-1:10401:11380 [3] NCCL INFO Channel 21/0 : 3[3] -> 4[4] via P2P/CUMEM
|
| 469 |
+
t-20260518193619-wnzpp-worker-1:10399:11382 [1] NCCL INFO Channel 16/0 : 1[1] -> 2[2] via P2P/CUMEM
|
| 470 |
+
t-20260518193619-wnzpp-worker-1:10401:11380 [3] NCCL INFO Channel 22/0 : 3[3] -> 4[4] via P2P/CUMEM
|
| 471 |
+
t-20260518193619-wnzpp-worker-1:10399:11382 [1] NCCL INFO Channel 17/0 : 1[1] -> 2[2] via P2P/CUMEM
|
| 472 |
+
t-20260518193619-wnzpp-worker-1:10403:11379 [5] NCCL INFO Channel 20/0 : 5[5] -> 6[6] via P2P/CUMEM
|
| 473 |
+
t-20260518193619-wnzpp-worker-1:10399:11382 [1] NCCL INFO Channel 18/0 : 1[1] -> 2[2] via P2P/CUMEM
|
| 474 |
+
t-20260518193619-wnzpp-worker-1:10403:11379 [5] NCCL INFO Channel 21/0 : 5[5] -> 6[6] via P2P/CUMEM
|
| 475 |
+
t-20260518193619-wnzpp-worker-1:10401:11380 [3] NCCL INFO Channel 23/0 : 3[3] -> 4[4] via P2P/CUMEM
|
| 476 |
+
t-20260518193619-wnzpp-worker-1:10399:11382 [1] NCCL INFO Channel 19/0 : 1[1] -> 2[2] via P2P/CUMEM
|
| 477 |
+
t-20260518193619-wnzpp-worker-1:10403:11379 [5] NCCL INFO Channel 22/0 : 5[5] -> 6[6] via P2P/CUMEM
|
| 478 |
+
t-20260518193619-wnzpp-worker-1:10399:11382 [1] NCCL INFO Channel 20/0 : 1[1] -> 2[2] via P2P/CUMEM
|
| 479 |
+
t-20260518193619-wnzpp-worker-1:10403:11379 [5] NCCL INFO Channel 23/0 : 5[5] -> 6[6] via P2P/CUMEM
|
| 480 |
+
t-20260518193619-wnzpp-worker-1:10399:11382 [1] NCCL INFO Channel 21/0 : 1[1] -> 2[2] via P2P/CUMEM
|
| 481 |
+
t-20260518193619-wnzpp-worker-1:10399:11382 [1] NCCL INFO Channel 22/0 : 1[1] -> 2[2] via P2P/CUMEM
|
| 482 |
+
t-20260518193619-wnzpp-worker-1:10399:11382 [1] NCCL INFO Channel 23/0 : 1[1] -> 2[2] via P2P/CUMEM
|
| 483 |
+
t-20260518193619-wnzpp-worker-1:10401:11380 [3] NCCL INFO Connected all rings, use ring PXN 0 GDR 1
|
| 484 |
+
t-20260518193619-wnzpp-worker-1:10400:11375 [2] NCCL INFO Connected all rings, use ring PXN 0 GDR 1
|
| 485 |
+
t-20260518193619-wnzpp-worker-1:10403:11379 [5] NCCL INFO Connected all rings, use ring PXN 0 GDR 1
|
| 486 |
+
t-20260518193619-wnzpp-worker-1:10404:11381 [6] NCCL INFO Connected all rings, use ring PXN 0 GDR 1
|
| 487 |
+
t-20260518193619-wnzpp-worker-1:10402:11376 [4] NCCL INFO Connected all rings, use ring PXN 0 GDR 1
|
| 488 |
+
t-20260518193619-wnzpp-worker-1:10399:11382 [1] NCCL INFO Connected all rings, use ring PXN 0 GDR 1
|
| 489 |
+
t-20260518193619-wnzpp-worker-1:10405:11378 [7] NCCL INFO Connected all rings, use ring PXN 0 GDR 1
|
| 490 |
+
t-20260518193619-wnzpp-worker-1:10398:11377 [0] NCCL INFO Connected all rings, use ring PXN 0 GDR 1
|
| 491 |
+
{
|
| 492 |
+
"device": "cuda:0",
|
| 493 |
+
"rank": 0,
|
| 494 |
+
"world_size": 8,
|
| 495 |
+
"samples": "tokenized_hf:9737184:pad=0",
|
| 496 |
+
"vocab_size": 32100,
|
| 497 |
+
"tokenizer_vocab_size": 32100,
|
| 498 |
+
"save_dir": "runs/lta_owt_t5_rollin_p50_randk0_3_uniformt_temp1_synct_mask1_gbs512_8gpu_1m_20260518_125609",
|
| 499 |
+
"batch_size": 32,
|
| 500 |
+
"grad_accum": 2,
|
| 501 |
+
"effective_batch_size": 512,
|
| 502 |
+
"global_batch_size": 512,
|
| 503 |
+
"lr_schedule": "constant_warmup",
|
| 504 |
+
"optimizer": "muon",
|
| 505 |
+
"epochs": 5.0,
|
| 506 |
+
"steps_per_epoch": 19018,
|
| 507 |
+
"total_steps": 95090,
|
| 508 |
+
"warmup_steps": 9509,
|
| 509 |
+
"warmup_epochs": 0.5,
|
| 510 |
+
"min_lr": 0.0,
|
| 511 |
+
"weight_decay": 0.1,
|
| 512 |
+
"output_weight_decay": -1.0,
|
| 513 |
+
"adamw_param_groups": "nanogpt",
|
| 514 |
+
"adam_beta1": 0.9,
|
| 515 |
+
"adam_beta2": 0.999,
|
| 516 |
+
"adam_eps": 1e-08,
|
| 517 |
+
"muon_impl": "optax",
|
| 518 |
+
"muon_momentum": 0.95,
|
| 519 |
+
"muon_ns_steps": 5,
|
| 520 |
+
"muon_update_scale": 1.0,
|
| 521 |
+
"muon_nesterov": true,
|
| 522 |
+
"muon_width_scale": true,
|
| 523 |
+
"muon_grouping": "hidden_2d",
|
| 524 |
+
"muon_param_count": 84934656,
|
| 525 |
+
"muon_adam_param_count": 50212608,
|
| 526 |
+
"muon_param_names": [
|
| 527 |
+
"blocks.0.attn_qkv.weight",
|
| 528 |
+
"blocks.0.attn_out.weight",
|
| 529 |
+
"blocks.0.mlp.w12.weight",
|
| 530 |
+
"blocks.0.mlp.w3.weight",
|
| 531 |
+
"blocks.1.attn_qkv.weight",
|
| 532 |
+
"blocks.1.attn_out.weight",
|
| 533 |
+
"blocks.1.mlp.w12.weight",
|
| 534 |
+
"blocks.1.mlp.w3.weight",
|
| 535 |
+
"blocks.2.attn_qkv.weight",
|
| 536 |
+
"blocks.2.attn_out.weight",
|
| 537 |
+
"blocks.2.mlp.w12.weight",
|
| 538 |
+
"blocks.2.mlp.w3.weight",
|
| 539 |
+
"blocks.3.attn_qkv.weight",
|
| 540 |
+
"blocks.3.attn_out.weight",
|
| 541 |
+
"blocks.3.mlp.w12.weight",
|
| 542 |
+
"blocks.3.mlp.w3.weight",
|
| 543 |
+
"blocks.4.attn_qkv.weight",
|
| 544 |
+
"blocks.4.attn_out.weight",
|
| 545 |
+
"blocks.4.mlp.w12.weight",
|
| 546 |
+
"blocks.4.mlp.w3.weight",
|
| 547 |
+
"blocks.5.attn_qkv.weight",
|
| 548 |
+
"blocks.5.attn_out.weight",
|
| 549 |
+
"blocks.5.mlp.w12.weight",
|
| 550 |
+
"blocks.5.mlp.w3.weight",
|
| 551 |
+
"blocks.6.attn_qkv.weight",
|
| 552 |
+
"blocks.6.attn_out.weight",
|
| 553 |
+
"blocks.6.mlp.w12.weight",
|
| 554 |
+
"blocks.6.mlp.w3.weight",
|
| 555 |
+
"blocks.7.attn_qkv.weight",
|
| 556 |
+
"blocks.7.attn_out.weight",
|
| 557 |
+
"blocks.7.mlp.w12.weight",
|
| 558 |
+
"blocks.7.mlp.w3.weight",
|
| 559 |
+
"blocks.8.attn_qkv.weight",
|
| 560 |
+
"blocks.8.attn_out.weight",
|
| 561 |
+
"blocks.8.mlp.w12.weight",
|
| 562 |
+
"blocks.8.mlp.w3.weight",
|
| 563 |
+
"blocks.9.attn_qkv.weight",
|
| 564 |
+
"blocks.9.attn_out.weight",
|
| 565 |
+
"blocks.9.mlp.w12.weight",
|
| 566 |
+
"blocks.9.mlp.w3.weight",
|
| 567 |
+
"blocks.10.attn_qkv.weight",
|
| 568 |
+
"blocks.10.attn_out.weight",
|
| 569 |
+
"blocks.10.mlp.w12.weight",
|
| 570 |
+
"blocks.10.mlp.w3.weight",
|
| 571 |
+
"blocks.11.attn_qkv.weight",
|
| 572 |
+
"blocks.11.attn_out.weight",
|
| 573 |
+
"blocks.11.mlp.w12.weight",
|
| 574 |
+
"blocks.11.mlp.w3.weight"
|
| 575 |
+
],
|
| 576 |
+
"muon_adam_param_names": [
|
| 577 |
+
"time_tokens",
|
| 578 |
+
"vocab_embed.embedding",
|
| 579 |
+
"sigma_map.net.0.weight",
|
| 580 |
+
"sigma_map.net.0.bias",
|
| 581 |
+
"sigma_map.net.2.weight",
|
| 582 |
+
"sigma_map.net.2.bias",
|
| 583 |
+
"blocks.0.norm1.weight",
|
| 584 |
+
"blocks.0.attn_qkv.bias",
|
| 585 |
+
"blocks.0.attn_out.bias",
|
| 586 |
+
"blocks.0.q_norm.weight",
|
| 587 |
+
"blocks.0.k_norm.weight",
|
| 588 |
+
"blocks.0.norm2.weight",
|
| 589 |
+
"blocks.0.mlp.w12.bias",
|
| 590 |
+
"blocks.0.mlp.w3.bias",
|
| 591 |
+
"blocks.1.norm1.weight",
|
| 592 |
+
"blocks.1.attn_qkv.bias",
|
| 593 |
+
"blocks.1.attn_out.bias",
|
| 594 |
+
"blocks.1.q_norm.weight",
|
| 595 |
+
"blocks.1.k_norm.weight",
|
| 596 |
+
"blocks.1.norm2.weight",
|
| 597 |
+
"blocks.1.mlp.w12.bias",
|
| 598 |
+
"blocks.1.mlp.w3.bias",
|
| 599 |
+
"blocks.2.norm1.weight",
|
| 600 |
+
"blocks.2.attn_qkv.bias",
|
| 601 |
+
"blocks.2.attn_out.bias",
|
| 602 |
+
"blocks.2.q_norm.weight",
|
| 603 |
+
"blocks.2.k_norm.weight",
|
| 604 |
+
"blocks.2.norm2.weight",
|
| 605 |
+
"blocks.2.mlp.w12.bias",
|
| 606 |
+
"blocks.2.mlp.w3.bias",
|
| 607 |
+
"blocks.3.norm1.weight",
|
| 608 |
+
"blocks.3.attn_qkv.bias",
|
| 609 |
+
"blocks.3.attn_out.bias",
|
| 610 |
+
"blocks.3.q_norm.weight",
|
| 611 |
+
"blocks.3.k_norm.weight",
|
| 612 |
+
"blocks.3.norm2.weight",
|
| 613 |
+
"blocks.3.mlp.w12.bias",
|
| 614 |
+
"blocks.3.mlp.w3.bias",
|
| 615 |
+
"blocks.4.norm1.weight",
|
| 616 |
+
"blocks.4.attn_qkv.bias",
|
| 617 |
+
"blocks.4.attn_out.bias",
|
| 618 |
+
"blocks.4.q_norm.weight",
|
| 619 |
+
"blocks.4.k_norm.weight",
|
| 620 |
+
"blocks.4.norm2.weight",
|
| 621 |
+
"blocks.4.mlp.w12.bias",
|
| 622 |
+
"blocks.4.mlp.w3.bias",
|
| 623 |
+
"blocks.5.norm1.weight",
|
| 624 |
+
"blocks.5.attn_qkv.bias",
|
| 625 |
+
"blocks.5.attn_out.bias",
|
| 626 |
+
"blocks.5.q_norm.weight",
|
| 627 |
+
"blocks.5.k_norm.weight",
|
| 628 |
+
"blocks.5.norm2.weight",
|
| 629 |
+
"blocks.5.mlp.w12.bias",
|
| 630 |
+
"blocks.5.mlp.w3.bias",
|
| 631 |
+
"blocks.6.norm1.weight",
|
| 632 |
+
"blocks.6.attn_qkv.bias",
|
| 633 |
+
"blocks.6.attn_out.bias",
|
| 634 |
+
"blocks.6.q_norm.weight",
|
| 635 |
+
"blocks.6.k_norm.weight",
|
| 636 |
+
"blocks.6.norm2.weight",
|
| 637 |
+
"blocks.6.mlp.w12.bias",
|
| 638 |
+
"blocks.6.mlp.w3.bias",
|
| 639 |
+
"blocks.7.norm1.weight",
|
| 640 |
+
"blocks.7.attn_qkv.bias",
|
| 641 |
+
"blocks.7.attn_out.bias",
|
| 642 |
+
"blocks.7.q_norm.weight",
|
| 643 |
+
"blocks.7.k_norm.weight",
|
| 644 |
+
"blocks.7.norm2.weight",
|
| 645 |
+
"blocks.7.mlp.w12.bias",
|
| 646 |
+
"blocks.7.mlp.w3.bias",
|
| 647 |
+
"blocks.8.norm1.weight",
|
| 648 |
+
"blocks.8.attn_qkv.bias",
|
| 649 |
+
"blocks.8.attn_out.bias",
|
| 650 |
+
"blocks.8.q_norm.weight",
|
| 651 |
+
"blocks.8.k_norm.weight",
|
| 652 |
+
"blocks.8.norm2.weight",
|
| 653 |
+
"blocks.8.mlp.w12.bias",
|
| 654 |
+
"blocks.8.mlp.w3.bias",
|
| 655 |
+
"blocks.9.norm1.weight",
|
| 656 |
+
"blocks.9.attn_qkv.bias",
|
| 657 |
+
"blocks.9.attn_out.bias",
|
| 658 |
+
"blocks.9.q_norm.weight",
|
| 659 |
+
"blocks.9.k_norm.weight",
|
| 660 |
+
"blocks.9.norm2.weight",
|
| 661 |
+
"blocks.9.mlp.w12.bias",
|
| 662 |
+
"blocks.9.mlp.w3.bias",
|
| 663 |
+
"blocks.10.norm1.weight",
|
| 664 |
+
"blocks.10.attn_qkv.bias",
|
| 665 |
+
"blocks.10.attn_out.bias",
|
| 666 |
+
"blocks.10.q_norm.weight",
|
| 667 |
+
"blocks.10.k_norm.weight",
|
| 668 |
+
"blocks.10.norm2.weight",
|
| 669 |
+
"blocks.10.mlp.w12.bias",
|
| 670 |
+
"blocks.10.mlp.w3.bias",
|
| 671 |
+
"blocks.11.norm1.weight",
|
| 672 |
+
"blocks.11.attn_qkv.bias",
|
| 673 |
+
"blocks.11.attn_out.bias",
|
| 674 |
+
"blocks.11.q_norm.weight",
|
| 675 |
+
"blocks.11.k_norm.weight",
|
| 676 |
+
"blocks.11.norm2.weight",
|
| 677 |
+
"blocks.11.mlp.w12.bias",
|
| 678 |
+
"blocks.11.mlp.w3.bias",
|
| 679 |
+
"output_layer.norm_final.weight",
|
| 680 |
+
"output_layer.linear.weight"
|
| 681 |
+
],
|
| 682 |
+
"muon_effective_nesterov": true,
|
| 683 |
+
"muon_effective_width_scale": true,
|
| 684 |
+
"muon_effective_weight_decay": 0.0,
|
| 685 |
+
"muon_adam_fallback_nesterov": true,
|
| 686 |
+
"muon_adam_fallback_weight_decay": 0.1,
|
| 687 |
+
"ema_decay": 0.9999,
|
| 688 |
+
"ema_start_step": 0,
|
| 689 |
+
"model_type": "ddit_elf",
|
| 690 |
+
"ddit_mlp_type": "gelu",
|
| 691 |
+
"elf_num_time_tokens": 4,
|
| 692 |
+
"elf_num_model_mode_tokens": 0,
|
| 693 |
+
"qk_norm": true,
|
| 694 |
+
"output_bias": false,
|
| 695 |
+
"output_init_std": 0.0,
|
| 696 |
+
"norm_type": "rmsnorm",
|
| 697 |
+
"target_loss": "hard_ce",
|
| 698 |
+
"linear_soft_target_power": 1.0,
|
| 699 |
+
"linear_soft_target_min_conf": 0.0,
|
| 700 |
+
"linear_soft_target_max_conf": 1.0,
|
| 701 |
+
"t_sampling_mode": "uniform",
|
| 702 |
+
"t_sampling_power": 1.0,
|
| 703 |
+
"t_sampling_eps": 0.0001,
|
| 704 |
+
"t_sampling_logit_mean": -1.5,
|
| 705 |
+
"t_sampling_logit_std": 0.8,
|
| 706 |
+
"dual_t": true,
|
| 707 |
+
"corrupt_t_mode": "same",
|
| 708 |
+
"corrupt_min_t": 0.0,
|
| 709 |
+
"corrupt_max_t": 1.0,
|
| 710 |
+
"prefix_block_prob": 0.0,
|
| 711 |
+
"prefix_block_len": 128,
|
| 712 |
+
"mask_ratio_floor_schedule": "none",
|
| 713 |
+
"dirichlet_endpoint_mode": "categorical_dual_t",
|
| 714 |
+
"dirichlet_semantic_t_mode": "same",
|
| 715 |
+
"dirichlet_semantic_t_value": 0.0,
|
| 716 |
+
"dirichlet_semantic_t_curve": "linear",
|
| 717 |
+
"dirichlet_semantic_t_power": 1.0,
|
| 718 |
+
"endpoint_sequence_random_prob_alpha": 0.0,
|
| 719 |
+
"categorical_wrong_from_full_vocab": true,
|
| 720 |
+
"categorical_wrong_from_batch_valid_tokens": false,
|
| 721 |
+
"categorical_wrong_basin_token_ids": "",
|
| 722 |
+
"categorical_wrong_basin_prob": 0.0,
|
| 723 |
+
"categorical_wrong_unigram_prob": 0.0,
|
| 724 |
+
"categorical_wrong_uniform_prob": 0.0,
|
| 725 |
+
"categorical_wrong_prob_floor": 0.0,
|
| 726 |
+
"categorical_wrong_corpus_unigram_path": "",
|
| 727 |
+
"categorical_wrong_corpus_unigram_alpha": 1.0,
|
| 728 |
+
"categorical_wrong_basin_shared_prob": 0.0,
|
| 729 |
+
"categorical_wrong_unigram_shared_prob": 0.0,
|
| 730 |
+
"mask_mixture_original_prob": 0.0,
|
| 731 |
+
"mask_mixture_lowk_prob": 0.0,
|
| 732 |
+
"mask_mixture_lowcorrupt_prob": 0.0,
|
| 733 |
+
"mask_mixture_block_prob": 0.0,
|
| 734 |
+
"mask_mixture_all_prob": 1.0,
|
| 735 |
+
"mask_mixture_lowk_clean_tokens": "1,2,4,8,16,32,64",
|
| 736 |
+
"mask_mixture_lowcorrupt_tokens": "1,2,4,8,16,32,64",
|
| 737 |
+
"mask_mixture_block_tokens": "64,128",
|
| 738 |
+
"simplex_bridge_sampler": "dirichlet",
|
| 739 |
+
"logistic_normal_sigma_min": 0.18,
|
| 740 |
+
"logistic_normal_sigma_max": 2.2,
|
| 741 |
+
"logistic_normal_tau_min": 0.65,
|
| 742 |
+
"logistic_normal_tau_max": 1.15,
|
| 743 |
+
"torch_compile": false,
|
| 744 |
+
"compile_mode": "max-autotune",
|
| 745 |
+
"state_format": "prob",
|
| 746 |
+
"meanflow_weight": 0.0,
|
| 747 |
+
"rollout_train_prob": 0.5,
|
| 748 |
+
"rollout_train_steps": 3,
|
| 749 |
+
"rollout_train_steps_min": 0,
|
| 750 |
+
"rollout_train_infer_steps": 1,
|
| 751 |
+
"rollout_train_time_mode": "sampled_path",
|
| 752 |
+
"rollout_train_s_dist": "uniform",
|
| 753 |
+
"rollout_train_s_min_frac": 0.0,
|
| 754 |
+
"rollout_train_s_max_frac": 0.25,
|
| 755 |
+
"rollout_train_s_beta_alpha": 2.0,
|
| 756 |
+
"rollout_train_s_beta_beta": 6.0,
|
| 757 |
+
"rollout_train_temp": 1.0,
|
| 758 |
+
"rollout_train_max_gamma": 1.0,
|
| 759 |
+
"rollout_train_corrupt_only": true,
|
| 760 |
+
"rollout_train_samplewise": true,
|
| 761 |
+
"rollout_train_compute_always": false,
|
| 762 |
+
"rollout_train_sync_t": true,
|
| 763 |
+
"bridge_noise_init": "logistic_normal",
|
| 764 |
+
"noise_sigma": -1.0,
|
| 765 |
+
"allow_tf32": true,
|
| 766 |
+
"activation_checkpointing": true,
|
| 767 |
+
"activation_checkpoint_interval": 1,
|
| 768 |
+
"activation_checkpoint_scope": "mlp",
|
| 769 |
+
"ddp_static_graph": false,
|
| 770 |
+
"ddp_gradient_as_bucket_view": true,
|
| 771 |
+
"blocking_data_transfer": false,
|
| 772 |
+
"dataloader_prefetch_factor": 4,
|
| 773 |
+
"full_train_stats": false,
|
| 774 |
+
"tokenized_hf": true,
|
| 775 |
+
"tokenized_pad_token": "pad",
|
| 776 |
+
"elf_conditional_hf": false,
|
| 777 |
+
"record_pad_truncate": false,
|
| 778 |
+
"record_add_eos": false,
|
| 779 |
+
"record_add_special_tokens": false,
|
| 780 |
+
"record_pad_token": "pad",
|
| 781 |
+
"record_shuffle_buffer": 10000,
|
| 782 |
+
"wrap": false,
|
| 783 |
+
"wrap_mode": "stream",
|
| 784 |
+
"wrap_record_buffer_size": 200,
|
| 785 |
+
"owt_cached_chunks": false,
|
| 786 |
+
"owt_chunk_cache_dir": "",
|
| 787 |
+
"owt_chunk_cache_rebuild": false,
|
| 788 |
+
"owt_chunk_cache_write_batch": 4096,
|
| 789 |
+
"owt_exact_repeat_per_chunk": 0,
|
| 790 |
+
"online_chunk_shuffle": false,
|
| 791 |
+
"online_chunk_shuffle_buffer": 10000,
|
| 792 |
+
"openwebtext_split": "all",
|
| 793 |
+
"detokenizer": "auto",
|
| 794 |
+
"resolved_detokenizer": null,
|
| 795 |
+
"num_workers": 8,
|
| 796 |
+
"latest_every": 1000,
|
| 797 |
+
"resume_path": ""
|
| 798 |
+
}
|
| 799 |
+
t-20260518193619-wnzpp-worker-1:10400:11782 [2] NCCL INFO NVLS comm 0x729d7ec0 headRank 2 nHeads 8 buffSize 1048576 nvlsPerRankSize 33554432 nvlsTotalSize 268435456
|
| 800 |
+
t-20260518193619-wnzpp-worker-1:10401:11783 [3] NCCL INFO NVLS comm 0xeb83a20 headRank 3 nHeads 8 buffSize 1048576 nvlsPerRankSize 33554432 nvlsTotalSize 268435456
|
| 801 |
+
t-20260518193619-wnzpp-worker-1:10399:11784 [1] NCCL INFO NVLS comm 0xd6b7b10 headRank 1 nHeads 8 buffSize 1048576 nvlsPerRankSize 33554432 nvlsTotalSize 268435456
|
| 802 |
+
t-20260518193619-wnzpp-worker-1:10402:11785 [4] NCCL INFO NVLS comm 0xe035080 headRank 4 nHeads 8 buffSize 1048576 nvlsPerRankSize 33554432 nvlsTotalSize 268435456
|
| 803 |
+
t-20260518193619-wnzpp-worker-1:10405:11786 [7] NCCL INFO NVLS comm 0xeeabba0 headRank 7 nHeads 8 buffSize 1048576 nvlsPerRankSize 33554432 nvlsTotalSize 268435456
|
| 804 |
+
t-20260518193619-wnzpp-worker-1:10404:11787 [6] NCCL INFO NVLS comm 0xdf37090 headRank 6 nHeads 8 buffSize 1048576 nvlsPerRankSize 33554432 nvlsTotalSize 268435456
|
| 805 |
+
t-20260518193619-wnzpp-worker-1:10398:11788 [0] NCCL INFO NVLS comm 0xce77c80 headRank 0 nHeads 8 buffSize 1048576 nvlsPerRankSize 33554432 nvlsTotalSize 268435456
|
| 806 |
+
t-20260518193619-wnzpp-worker-1:10403:11789 [5] NCCL INFO NVLS comm 0xe2d8120 headRank 5 nHeads 8 buffSize 1048576 nvlsPerRankSize 33554432 nvlsTotalSize 268435456
|
| 807 |
+
step=100 epoch=1/5 epoch_step=100/19018 micro_steps=200 elapsed=139.6s lr=2.124303e-05 loss=10.1003 loss_recon=10.1003 loss_meanflow=0.0000 mean_model_t=0.5010 mean_corrupt_t=0.5010 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.5009 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.0203 corrupt_frac=1.0000 acc_corrupt=0.0203 loss_corrupt=10.1003 wrong_frac=0.4987 init_acc_corrupt=0.4678 acc_corrupt_t_0p0_0p2=0.0145 corrupt_frac_t_0p0_0p2=0.1988 acc_corrupt_t_0p2_0p4=0.0179 corrupt_frac_t_0p2_0p4=0.1957 acc_corrupt_t_0p4_0p6=0.0214 corrupt_frac_t_0p4_0p6=0.2070 acc_corrupt_t_0p6_0p8=0.0240 corrupt_frac_t_0p6_0p8=0.1999 acc_corrupt_t_0p8_1p0=0.0237 corrupt_frac_t_0p8_1p0=0.2007 out_w_norm=1.4216 out_g_norm=1.5149 loss_all=9.5077 init_gold_top10=0.4520 init_gold_top100=0.5699 rollout_applied_pos_frac=0.4999 init_acc_rollout_applied=0.5139 init_acc_rollout_kept=0.3459 logit_acc_rollout_applied=0.0332 logit_acc_rollout_kept=0.0351
|
| 808 |
+
step=200 epoch=1/5 epoch_step=200/19018 micro_steps=400 elapsed=139.3s lr=4.227574e-05 loss=8.5592 loss_recon=8.5592 loss_meanflow=0.0000 mean_model_t=0.5028 mean_corrupt_t=0.5028 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.5070 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.0355 corrupt_frac=1.0000 acc_corrupt=0.0355 loss_corrupt=8.5592 wrong_frac=0.4971 init_acc_corrupt=0.4692 acc_corrupt_t_0p0_0p2=0.0338 corrupt_frac_t_0p0_0p2=0.1981 acc_corrupt_t_0p2_0p4=0.0345 corrupt_frac_t_0p2_0p4=0.1973 acc_corrupt_t_0p4_0p6=0.0351 corrupt_frac_t_0p4_0p6=0.1975 acc_corrupt_t_0p6_0p8=0.0360 corrupt_frac_t_0p6_0p8=0.2050 acc_corrupt_t_0p8_1p0=0.0380 corrupt_frac_t_0p8_1p0=0.2031 out_w_norm=9.4462 out_g_norm=1.7789 loss_all=7.6793 init_gold_top10=0.5476 init_gold_top100=0.6694 rollout_applied_pos_frac=0.5905 init_acc_rollout_applied=0.5336 init_acc_rollout_kept=0.5080 logit_acc_rollout_applied=0.0425 logit_acc_rollout_kept=0.0428
|
| 809 |
+
step=300 epoch=1/5 epoch_step=300/19018 micro_steps=600 elapsed=138.8s lr=6.330844e-05 loss=7.3133 loss_recon=7.3133 loss_meanflow=0.0000 mean_model_t=0.5000 mean_corrupt_t=0.5000 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.5058 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.0441 corrupt_frac=1.0000 acc_corrupt=0.0441 loss_corrupt=7.3133 wrong_frac=0.5004 init_acc_corrupt=0.4653 acc_corrupt_t_0p0_0p2=0.0446 corrupt_frac_t_0p0_0p2=0.1989 acc_corrupt_t_0p2_0p4=0.0440 corrupt_frac_t_0p2_0p4=0.2006 acc_corrupt_t_0p4_0p6=0.0438 corrupt_frac_t_0p4_0p6=0.2021 acc_corrupt_t_0p6_0p8=0.0440 corrupt_frac_t_0p6_0p8=0.2005 acc_corrupt_t_0p8_1p0=0.0439 corrupt_frac_t_0p8_1p0=0.1989 out_w_norm=19.8990 out_g_norm=0.5973 loss_all=7.1080 init_gold_top10=0.4187 init_gold_top100=0.5664 rollout_applied_pos_frac=0.5393 init_acc_rollout_applied=0.3405 init_acc_rollout_kept=0.4090 logit_acc_rollout_applied=0.0493 logit_acc_rollout_kept=0.0447
|
| 810 |
+
step=400 epoch=1/5 epoch_step=400/19018 micro_steps=800 elapsed=139.4s lr=8.434115e-05 loss=7.0895 loss_recon=7.0895 loss_meanflow=0.0000 mean_model_t=0.5038 mean_corrupt_t=0.5038 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.5031 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.0445 corrupt_frac=1.0000 acc_corrupt=0.0445 loss_corrupt=7.0895 wrong_frac=0.4963 init_acc_corrupt=0.4702 acc_corrupt_t_0p0_0p2=0.0429 corrupt_frac_t_0p0_0p2=0.1947 acc_corrupt_t_0p2_0p4=0.0430 corrupt_frac_t_0p2_0p4=0.1999 acc_corrupt_t_0p4_0p6=0.0430 corrupt_frac_t_0p4_0p6=0.2013 acc_corrupt_t_0p6_0p8=0.0450 corrupt_frac_t_0p6_0p8=0.2029 acc_corrupt_t_0p8_1p0=0.0487 corrupt_frac_t_0p8_1p0=0.2012 out_w_norm=25.3857 out_g_norm=0.2535 loss_all=6.8638 init_gold_top10=0.4743 init_gold_top100=0.6303 rollout_applied_pos_frac=0.4657 init_acc_rollout_applied=0.3189 init_acc_rollout_kept=0.4833 logit_acc_rollout_applied=0.0497 logit_acc_rollout_kept=0.0575
|
| 811 |
+
step=500 epoch=1/5 epoch_step=500/19018 micro_steps=1000 elapsed=139.7s lr=1.053739e-04 loss=6.6757 loss_recon=6.6757 loss_meanflow=0.0000 mean_model_t=0.4995 mean_corrupt_t=0.4995 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.5047 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.0836 corrupt_frac=1.0000 acc_corrupt=0.0836 loss_corrupt=6.6757 wrong_frac=0.5003 init_acc_corrupt=0.4647 acc_corrupt_t_0p0_0p2=0.0449 corrupt_frac_t_0p0_0p2=0.1982 acc_corrupt_t_0p2_0p4=0.0583 corrupt_frac_t_0p2_0p4=0.2092 acc_corrupt_t_0p4_0p6=0.0776 corrupt_frac_t_0p4_0p6=0.1874 acc_corrupt_t_0p6_0p8=0.1045 corrupt_frac_t_0p6_0p8=0.2040 acc_corrupt_t_0p8_1p0=0.1326 corrupt_frac_t_0p8_1p0=0.2032 out_w_norm=30.4488 out_g_norm=0.2943 loss_all=6.3466 init_gold_top10=0.4677 init_gold_top100=0.6267 rollout_applied_pos_frac=0.4496 init_acc_rollout_applied=0.4524 init_acc_rollout_kept=0.4245 logit_acc_rollout_applied=0.1300 logit_acc_rollout_kept=0.1240
|
| 812 |
+
step=600 epoch=1/5 epoch_step=600/19018 micro_steps=1200 elapsed=139.0s lr=1.264066e-04 loss=5.8922 loss_recon=5.8922 loss_meanflow=0.0000 mean_model_t=0.5014 mean_corrupt_t=0.5014 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.4886 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.2511 corrupt_frac=1.0000 acc_corrupt=0.2511 loss_corrupt=5.8922 wrong_frac=0.4982 init_acc_corrupt=0.4684 acc_corrupt_t_0p0_0p2=0.0451 corrupt_frac_t_0p0_0p2=0.2012 acc_corrupt_t_0p2_0p4=0.1028 corrupt_frac_t_0p2_0p4=0.1954 acc_corrupt_t_0p4_0p6=0.2463 corrupt_frac_t_0p4_0p6=0.1994 acc_corrupt_t_0p6_0p8=0.3622 corrupt_frac_t_0p6_0p8=0.2062 acc_corrupt_t_0p8_1p0=0.4966 corrupt_frac_t_0p8_1p0=0.1977 out_w_norm=35.4047 out_g_norm=0.3173 loss_all=5.4247 init_gold_top10=0.5145 init_gold_top100=0.6337 rollout_applied_pos_frac=0.3817 init_acc_rollout_applied=0.4180 init_acc_rollout_kept=0.5100 logit_acc_rollout_applied=0.2917 logit_acc_rollout_kept=0.3635
|
| 813 |
+
step=700 epoch=1/5 epoch_step=700/19018 micro_steps=1400 elapsed=139.9s lr=1.474393e-04 loss=5.0283 loss_recon=5.0283 loss_meanflow=0.0000 mean_model_t=0.5075 mean_corrupt_t=0.5075 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.4975 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.3878 corrupt_frac=1.0000 acc_corrupt=0.3878 loss_corrupt=5.0283 wrong_frac=0.4922 init_acc_corrupt=0.4755 acc_corrupt_t_0p0_0p2=0.0500 corrupt_frac_t_0p0_0p2=0.1963 acc_corrupt_t_0p2_0p4=0.1901 corrupt_frac_t_0p2_0p4=0.1915 acc_corrupt_t_0p4_0p6=0.3917 corrupt_frac_t_0p4_0p6=0.1964 acc_corrupt_t_0p6_0p8=0.5546 corrupt_frac_t_0p6_0p8=0.2077 acc_corrupt_t_0p8_1p0=0.7181 corrupt_frac_t_0p8_1p0=0.2082 out_w_norm=42.7650 out_g_norm=0.2315 loss_all=4.6847 init_gold_top10=0.5481 init_gold_top100=0.6645 rollout_applied_pos_frac=0.4076 init_acc_rollout_applied=0.5359 init_acc_rollout_kept=0.4696 logit_acc_rollout_applied=0.4566 logit_acc_rollout_kept=0.4168
|
| 814 |
+
step=800 epoch=1/5 epoch_step=800/19018 micro_steps=1600 elapsed=139.9s lr=1.684720e-04 loss=4.5670 loss_recon=4.5670 loss_meanflow=0.0000 mean_model_t=0.5023 mean_corrupt_t=0.5023 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.5081 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.4336 corrupt_frac=1.0000 acc_corrupt=0.4336 loss_corrupt=4.5670 wrong_frac=0.4983 init_acc_corrupt=0.4691 acc_corrupt_t_0p0_0p2=0.0561 corrupt_frac_t_0p0_0p2=0.1966 acc_corrupt_t_0p2_0p4=0.2258 corrupt_frac_t_0p2_0p4=0.2016 acc_corrupt_t_0p4_0p6=0.4460 corrupt_frac_t_0p4_0p6=0.2038 acc_corrupt_t_0p6_0p8=0.6298 corrupt_frac_t_0p6_0p8=0.1993 acc_corrupt_t_0p8_1p0=0.8084 corrupt_frac_t_0p8_1p0=0.1987 out_w_norm=51.8328 out_g_norm=0.1234 loss_all=4.2287 init_gold_top10=0.5328 init_gold_top100=0.6429 rollout_applied_pos_frac=0.4373 init_acc_rollout_applied=0.4846 init_acc_rollout_kept=0.4966 logit_acc_rollout_applied=0.4490 logit_acc_rollout_kept=0.4890
|
LTA_openwebtext_dualt/logs/elfaligned_t5tokenized_8gpu/lta_owt_t5_rollin_p50_randk0_4_uniformt_temp1_synct_20260518_013432.log
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
LTA_openwebtext_dualt/logs/elfaligned_t5tokenized_8gpu/lta_owt_t5_rollin_p50_randk0_4_uniformt_temp1_synct_20260518_101300.nohup
ADDED
|
@@ -0,0 +1,113 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[launch] method=owt_elfaligned_t5tokenized_dditelf host=di-20260411014000-djqhq time=2026-05-18T10:13:07+00:00
|
| 2 |
+
[launch] run_name=lta_owt_t5_rollin_p50_randk0_4_uniformt_temp1_synct_20260518_101300
|
| 3 |
+
[launch] save_dir=runs/lta_owt_t5_rollin_p50_randk0_4_uniformt_temp1_synct_20260518_101300
|
| 4 |
+
[launch] log_file=logs/elfaligned_t5tokenized_8gpu/lta_owt_t5_rollin_p50_randk0_4_uniformt_temp1_synct_20260518_101300.log
|
| 5 |
+
[launch] data_path=/e2e-data/evad-tech-vla/wanghan58/data/embedded-language-flows/openwebtext-t5
|
| 6 |
+
[launch] tokenizer=/e2e-data/evad-tech-vla/wanghan58/models/hf/t5-small/tokenizer.json
|
| 7 |
+
[launch] examples=9737184 epochs=5 world_size=8 grad_accum=2 loader_batches_per_rank=38035 steps_per_epoch=19018 save_every=19018
|
| 8 |
+
[launch] optimizer=muon_impl=optax grouping=hidden_2d lr=0.002 hidden_wd=0 adam_fallback_wd=0.1 momentum=0.95 ns=5 nesterov=true width_scale=true adam_fallback_b2=0.999 ema=0.9999
|
| 9 |
+
[launch] model=ddit_elf rmsnorm qk_norm=true swiglu no_adaln output_bias=false output_init_std=0.0 time_tokens=4 mode_tokens=0
|
| 10 |
+
[launch] data=tokenized_hf pad=pad add_special_tokens=false t5-small fp32=true bf16=false tf32=true
|
| 11 |
+
[launch] t_sampling=uniform mean=-1.5 std=0.8 target_loss=hard_ce linear_soft_power=1.0 linear_soft_conf=0.0->1.0 loss_t_weight=none loss_t_min_weight=0.0 warmup_epochs=0.5
|
| 12 |
+
[launch] mask=min1.0->max1.0 mixture original=0.0 lowk=0.0 lowcorrupt=0.0 block=0.0 all=1.0
|
| 13 |
+
[launch] rollout_train prob=0.50 mode=sampled_path steps=4 steps_min=0 infer_steps=1 s=uniform:0.0->0.25 temp=1.0 samplewise=1 selected_only=1 sync_t=1
|
| 14 |
+
Traceback (most recent call last):
|
| 15 |
+
File "/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/train.py", line 2823, in <module>
|
| 16 |
+
main()
|
| 17 |
+
File "/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/train.py", line 1930, in main
|
| 18 |
+
torch.cuda.set_device(local_rank)
|
| 19 |
+
File "/usr/local/lib/python3.12/dist-packages/torch/cuda/__init__.py", line 477, in set_device
|
| 20 |
+
torch._C._cuda_setDevice(device)
|
| 21 |
+
RuntimeError: CUDA error: invalid device ordinal
|
| 22 |
+
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
|
| 23 |
+
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
|
| 24 |
+
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.
|
| 25 |
+
|
| 26 |
+
Traceback (most recent call last):
|
| 27 |
+
File "/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/train.py", line 2823, in <module>
|
| 28 |
+
main()
|
| 29 |
+
File "/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/train.py", line 1930, in main
|
| 30 |
+
torch.cuda.set_device(local_rank)
|
| 31 |
+
File "/usr/local/lib/python3.12/dist-packages/torch/cuda/__init__.py", line 477, in set_device
|
| 32 |
+
torch._C._cuda_setDevice(device)
|
| 33 |
+
RuntimeError: CUDA error: invalid device ordinal
|
| 34 |
+
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
|
| 35 |
+
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
|
| 36 |
+
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.
|
| 37 |
+
|
| 38 |
+
Traceback (most recent call last):
|
| 39 |
+
File "/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/train.py", line 2823, in <module>
|
| 40 |
+
main()
|
| 41 |
+
File "/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/train.py", line 1930, in main
|
| 42 |
+
torch.cuda.set_device(local_rank)
|
| 43 |
+
File "/usr/local/lib/python3.12/dist-packages/torch/cuda/__init__.py", line 477, in set_device
|
| 44 |
+
torch._C._cuda_setDevice(device)
|
| 45 |
+
RuntimeError: CUDA error: invalid device ordinal
|
| 46 |
+
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
|
| 47 |
+
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
|
| 48 |
+
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.
|
| 49 |
+
|
| 50 |
+
Traceback (most recent call last):
|
| 51 |
+
File "/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/train.py", line 2823, in <module>
|
| 52 |
+
main()
|
| 53 |
+
File "/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/train.py", line 1930, in main
|
| 54 |
+
torch.cuda.set_device(local_rank)
|
| 55 |
+
File "/usr/local/lib/python3.12/dist-packages/torch/cuda/__init__.py", line 477, in set_device
|
| 56 |
+
torch._C._cuda_setDevice(device)
|
| 57 |
+
RuntimeError: CUDA error: invalid device ordinal
|
| 58 |
+
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
|
| 59 |
+
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
|
| 60 |
+
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.
|
| 61 |
+
|
| 62 |
+
W0518 10:13:11.784000 20150 torch/distributed/elastic/multiprocessing/api.py:898] Sending process 20154 closing signal SIGTERM
|
| 63 |
+
W0518 10:13:11.785000 20150 torch/distributed/elastic/multiprocessing/api.py:898] Sending process 20155 closing signal SIGTERM
|
| 64 |
+
W0518 10:13:11.785000 20150 torch/distributed/elastic/multiprocessing/api.py:898] Sending process 20156 closing signal SIGTERM
|
| 65 |
+
W0518 10:13:11.786000 20150 torch/distributed/elastic/multiprocessing/api.py:898] Sending process 20157 closing signal SIGTERM
|
| 66 |
+
W0518 10:13:11.786000 20150 torch/distributed/elastic/multiprocessing/api.py:898] Sending process 20159 closing signal SIGTERM
|
| 67 |
+
E0518 10:13:11.964000 20150 torch/distributed/elastic/multiprocessing/api.py:870] failed (exitcode: 1) local_rank: 4 (pid: 20158) of binary: /usr/bin/python
|
| 68 |
+
Traceback (most recent call last):
|
| 69 |
+
File "<frozen runpy>", line 198, in _run_module_as_main
|
| 70 |
+
File "<frozen runpy>", line 88, in _run_code
|
| 71 |
+
File "/usr/local/lib/python3.12/dist-packages/torch/distributed/run.py", line 922, in <module>
|
| 72 |
+
main()
|
| 73 |
+
File "/usr/local/lib/python3.12/dist-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 355, in wrapper
|
| 74 |
+
return f(*args, **kwargs)
|
| 75 |
+
^^^^^^^^^^^^^^^^^^
|
| 76 |
+
File "/usr/local/lib/python3.12/dist-packages/torch/distributed/run.py", line 918, in main
|
| 77 |
+
run(args)
|
| 78 |
+
File "/usr/local/lib/python3.12/dist-packages/torch/distributed/run.py", line 909, in run
|
| 79 |
+
elastic_launch(
|
| 80 |
+
File "/usr/local/lib/python3.12/dist-packages/torch/distributed/launcher/api.py", line 139, in __call__
|
| 81 |
+
return launch_agent(self._config, self._entrypoint, list(args))
|
| 82 |
+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 83 |
+
File "/usr/local/lib/python3.12/dist-packages/torch/distributed/launcher/api.py", line 270, in launch_agent
|
| 84 |
+
raise ChildFailedError(
|
| 85 |
+
torch.distributed.elastic.multiprocessing.errors.ChildFailedError:
|
| 86 |
+
============================================================
|
| 87 |
+
train.py FAILED
|
| 88 |
+
------------------------------------------------------------
|
| 89 |
+
Failures:
|
| 90 |
+
[1]:
|
| 91 |
+
time : 2026-05-18_10:13:11
|
| 92 |
+
host : localhost
|
| 93 |
+
rank : 6 (local_rank: 6)
|
| 94 |
+
exitcode : 1 (pid: 20160)
|
| 95 |
+
error_file: <N/A>
|
| 96 |
+
traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
|
| 97 |
+
[2]:
|
| 98 |
+
time : 2026-05-18_10:13:11
|
| 99 |
+
host : localhost
|
| 100 |
+
rank : 7 (local_rank: 7)
|
| 101 |
+
exitcode : 1 (pid: 20161)
|
| 102 |
+
error_file: <N/A>
|
| 103 |
+
traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
|
| 104 |
+
------------------------------------------------------------
|
| 105 |
+
Root Cause (first observed failure):
|
| 106 |
+
[0]:
|
| 107 |
+
time : 2026-05-18_10:13:11
|
| 108 |
+
host : localhost
|
| 109 |
+
rank : 4 (local_rank: 4)
|
| 110 |
+
exitcode : 1 (pid: 20158)
|
| 111 |
+
error_file: <N/A>
|
| 112 |
+
traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
|
| 113 |
+
============================================================
|
LTA_openwebtext_dualt/logs/elfaligned_t5tokenized_8gpu/lta_owt_t5_rollin_p50_randk0_4_uniformt_temp1_synct_20260518_101300.pid
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
19995
|
LTA_openwebtext_dualt/logs/infer_owt_t5_2node_latest_trainmatched_decode_temp1p5_n8.log
ADDED
|
@@ -0,0 +1,26 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[ckpt] eval_ckpts/lta_owt_t5_adaln_adamw_wd0p1_rollin_p50_randk0_3_uniformt_temp1_synct_gbs512_2node8gpu_1m_t-20260518224737-tftgw/latest_frozen_step101200_20260519_092441.pt step=101000
|
| 2 |
+
[decode-base] n=8 max_len=1024 steps=1024 model_t=post
|
| 3 |
+
[decode-time] schedule=linear s=[0.0,0.25] force_final=True t0=0.000000 t_mid=0.500000 t_end=1.000000 dt_mean=0.000977 dt_max=0.000977
|
| 4 |
+
[decode] temp=1.50 final=state rule=dirichlet_resample support=1 semantic=1 anchor=onehot cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=argmax/1/k64/p0.95 freq_penalty=0/0/0 start_t=0 start_init=noise time_path=0.0000->1.0000 generated 1/8
|
| 5 |
+
[decode] temp=1.50 final=state rule=dirichlet_resample support=1 semantic=1 anchor=onehot cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=argmax/1/k64/p0.95 freq_penalty=0/0/0 start_t=0 start_init=noise time_path=0.0000->1.0000 generated 2/8
|
| 6 |
+
[decode] temp=1.50 final=state rule=dirichlet_resample support=1 semantic=1 anchor=onehot cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=argmax/1/k64/p0.95 freq_penalty=0/0/0 start_t=0 start_init=noise time_path=0.0000->1.0000 generated 3/8
|
| 7 |
+
[decode] temp=1.50 final=state rule=dirichlet_resample support=1 semantic=1 anchor=onehot cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=argmax/1/k64/p0.95 freq_penalty=0/0/0 start_t=0 start_init=noise time_path=0.0000->1.0000 generated 4/8
|
| 8 |
+
[decode] temp=1.50 final=state rule=dirichlet_resample support=1 semantic=1 anchor=onehot cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=argmax/1/k64/p0.95 freq_penalty=0/0/0 start_t=0 start_init=noise time_path=0.0000->1.0000 generated 5/8
|
| 9 |
+
[decode] temp=1.50 final=state rule=dirichlet_resample support=1 semantic=1 anchor=onehot cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=argmax/1/k64/p0.95 freq_penalty=0/0/0 start_t=0 start_init=noise time_path=0.0000->1.0000 generated 6/8
|
| 10 |
+
[decode] temp=1.50 final=state rule=dirichlet_resample support=1 semantic=1 anchor=onehot cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=argmax/1/k64/p0.95 freq_penalty=0/0/0 start_t=0 start_init=noise time_path=0.0000->1.0000 generated 7/8
|
| 11 |
+
[decode] temp=1.50 final=state rule=dirichlet_resample support=1 semantic=1 anchor=onehot cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=argmax/1/k64/p0.95 freq_penalty=0/0/0 start_t=0 start_init=noise time_path=0.0000->1.0000 generated 8/8
|
| 12 |
+
[summary] {"type": "summary", "checkpoint": "eval_ckpts/lta_owt_t5_adaln_adamw_wd0p1_rollin_p50_randk0_3_uniformt_temp1_synct_gbs512_2node8gpu_1m_t-20260518224737-tftgw/latest_frozen_step101200_20260519_092441.pt", "step": 101000, "decode": {"steps": 1024, "model_t_mode": "post", "decode_time_schedule": "linear", "decode_s_min_frac": 0.0, "decode_s_max_frac": 0.25, "decode_force_final_t": true, "decode_time_grid": [0.0, 0.0009765625, 0.001953125, 0.0029296875, 0.00390625, 0.0048828125, 0.005859375, 0.0068359375, 0.0078125, 0.0087890625, 0.009765625, 0.0107421875, 0.01171875, 0.0126953125, 0.013671875, 0.0146484375, 0.015625, 0.0166015625, 0.017578125, 0.0185546875, 0.01953125, 0.0205078125, 0.021484375, 0.0224609375, 0.0234375, 0.0244140625, 0.025390625, 0.0263671875, 0.02734375, 0.0283203125, 0.029296875, 0.0302734375, 0.03125, 0.0322265625, 0.033203125, 0.0341796875, 0.03515625, 0.0361328125, 0.037109375, 0.0380859375, 0.0390625, 0.0400390625, 0.041015625, 0.0419921875, 0.04296875, 0.0439453125, 0.044921875, 0.0458984375, 0.046875, 0.0478515625, 0.048828125, 0.0498046875, 0.05078125, 0.0517578125, 0.052734375, 0.0537109375, 0.0546875, 0.0556640625, 0.056640625, 0.0576171875, 0.05859375, 0.0595703125, 0.060546875, 0.0615234375, 0.0625, 0.0634765625, 0.064453125, 0.0654296875, 0.06640625, 0.0673828125, 0.068359375, 0.0693359375, 0.0703125, 0.0712890625, 0.072265625, 0.0732421875, 0.07421875, 0.0751953125, 0.076171875, 0.0771484375, 0.078125, 0.0791015625, 0.080078125, 0.0810546875, 0.08203125, 0.0830078125, 0.083984375, 0.0849609375, 0.0859375, 0.0869140625, 0.087890625, 0.0888671875, 0.08984375, 0.0908203125, 0.091796875, 0.0927734375, 0.09375, 0.0947265625, 0.095703125, 0.0966796875, 0.09765625, 0.0986328125, 0.099609375, 0.1005859375, 0.1015625, 0.1025390625, 0.103515625, 0.1044921875, 0.10546875, 0.1064453125, 0.107421875, 0.1083984375, 0.109375, 0.1103515625, 0.111328125, 0.1123046875, 0.11328125, 0.1142578125, 0.115234375, 0.1162109375, 0.1171875, 0.1181640625, 0.119140625, 0.1201171875, 0.12109375, 0.1220703125, 0.123046875, 0.1240234375, 0.125, 0.1259765625, 0.126953125, 0.1279296875, 0.12890625, 0.1298828125, 0.130859375, 0.1318359375, 0.1328125, 0.1337890625, 0.134765625, 0.1357421875, 0.13671875, 0.1376953125, 0.138671875, 0.1396484375, 0.140625, 0.1416015625, 0.142578125, 0.1435546875, 0.14453125, 0.1455078125, 0.146484375, 0.1474609375, 0.1484375, 0.1494140625, 0.150390625, 0.1513671875, 0.15234375, 0.1533203125, 0.154296875, 0.1552734375, 0.15625, 0.1572265625, 0.158203125, 0.1591796875, 0.16015625, 0.1611328125, 0.162109375, 0.1630859375, 0.1640625, 0.1650390625, 0.166015625, 0.1669921875, 0.16796875, 0.1689453125, 0.169921875, 0.1708984375, 0.171875, 0.1728515625, 0.173828125, 0.1748046875, 0.17578125, 0.1767578125, 0.177734375, 0.1787109375, 0.1796875, 0.1806640625, 0.181640625, 0.1826171875, 0.18359375, 0.1845703125, 0.185546875, 0.1865234375, 0.1875, 0.1884765625, 0.189453125, 0.1904296875, 0.19140625, 0.1923828125, 0.193359375, 0.1943359375, 0.1953125, 0.1962890625, 0.197265625, 0.1982421875, 0.19921875, 0.2001953125, 0.201171875, 0.2021484375, 0.203125, 0.2041015625, 0.205078125, 0.2060546875, 0.20703125, 0.2080078125, 0.208984375, 0.2099609375, 0.2109375, 0.2119140625, 0.212890625, 0.2138671875, 0.21484375, 0.2158203125, 0.216796875, 0.2177734375, 0.21875, 0.2197265625, 0.220703125, 0.2216796875, 0.22265625, 0.2236328125, 0.224609375, 0.2255859375, 0.2265625, 0.2275390625, 0.228515625, 0.2294921875, 0.23046875, 0.2314453125, 0.232421875, 0.2333984375, 0.234375, 0.2353515625, 0.236328125, 0.2373046875, 0.23828125, 0.2392578125, 0.240234375, 0.2412109375, 0.2421875, 0.2431640625, 0.244140625, 0.2451171875, 0.24609375, 0.2470703125, 0.248046875, 0.2490234375, 0.25, 0.2509765625, 0.251953125, 0.2529296875, 0.25390625, 0.2548828125, 0.255859375, 0.2568359375, 0.2578125, 0.2587890625, 0.259765625, 0.2607421875, 0.26171875, 0.2626953125, 0.263671875, 0.2646484375, 0.265625, 0.2666015625, 0.267578125, 0.2685546875, 0.26953125, 0.2705078125, 0.271484375, 0.2724609375, 0.2734375, 0.2744140625, 0.275390625, 0.2763671875, 0.27734375, 0.2783203125, 0.279296875, 0.2802734375, 0.28125, 0.2822265625, 0.283203125, 0.2841796875, 0.28515625, 0.2861328125, 0.287109375, 0.2880859375, 0.2890625, 0.2900390625, 0.291015625, 0.2919921875, 0.29296875, 0.2939453125, 0.294921875, 0.2958984375, 0.296875, 0.2978515625, 0.298828125, 0.2998046875, 0.30078125, 0.3017578125, 0.302734375, 0.3037109375, 0.3046875, 0.3056640625, 0.306640625, 0.3076171875, 0.30859375, 0.3095703125, 0.310546875, 0.3115234375, 0.3125, 0.3134765625, 0.314453125, 0.3154296875, 0.31640625, 0.3173828125, 0.318359375, 0.3193359375, 0.3203125, 0.3212890625, 0.322265625, 0.3232421875, 0.32421875, 0.3251953125, 0.326171875, 0.3271484375, 0.328125, 0.3291015625, 0.330078125, 0.3310546875, 0.33203125, 0.3330078125, 0.333984375, 0.3349609375, 0.3359375, 0.3369140625, 0.337890625, 0.3388671875, 0.33984375, 0.3408203125, 0.341796875, 0.3427734375, 0.34375, 0.3447265625, 0.345703125, 0.3466796875, 0.34765625, 0.3486328125, 0.349609375, 0.3505859375, 0.3515625, 0.3525390625, 0.353515625, 0.3544921875, 0.35546875, 0.3564453125, 0.357421875, 0.3583984375, 0.359375, 0.3603515625, 0.361328125, 0.3623046875, 0.36328125, 0.3642578125, 0.365234375, 0.3662109375, 0.3671875, 0.3681640625, 0.369140625, 0.3701171875, 0.37109375, 0.3720703125, 0.373046875, 0.3740234375, 0.375, 0.3759765625, 0.376953125, 0.3779296875, 0.37890625, 0.3798828125, 0.380859375, 0.3818359375, 0.3828125, 0.3837890625, 0.384765625, 0.3857421875, 0.38671875, 0.3876953125, 0.388671875, 0.3896484375, 0.390625, 0.3916015625, 0.392578125, 0.3935546875, 0.39453125, 0.3955078125, 0.396484375, 0.3974609375, 0.3984375, 0.3994140625, 0.400390625, 0.4013671875, 0.40234375, 0.4033203125, 0.404296875, 0.4052734375, 0.40625, 0.4072265625, 0.408203125, 0.4091796875, 0.41015625, 0.4111328125, 0.412109375, 0.4130859375, 0.4140625, 0.4150390625, 0.416015625, 0.4169921875, 0.41796875, 0.4189453125, 0.419921875, 0.4208984375, 0.421875, 0.4228515625, 0.423828125, 0.4248046875, 0.42578125, 0.4267578125, 0.427734375, 0.4287109375, 0.4296875, 0.4306640625, 0.431640625, 0.4326171875, 0.43359375, 0.4345703125, 0.435546875, 0.4365234375, 0.4375, 0.4384765625, 0.439453125, 0.4404296875, 0.44140625, 0.4423828125, 0.443359375, 0.4443359375, 0.4453125, 0.4462890625, 0.447265625, 0.4482421875, 0.44921875, 0.4501953125, 0.451171875, 0.4521484375, 0.453125, 0.4541015625, 0.455078125, 0.4560546875, 0.45703125, 0.4580078125, 0.458984375, 0.4599609375, 0.4609375, 0.4619140625, 0.462890625, 0.4638671875, 0.46484375, 0.4658203125, 0.466796875, 0.4677734375, 0.46875, 0.4697265625, 0.470703125, 0.4716796875, 0.47265625, 0.4736328125, 0.474609375, 0.4755859375, 0.4765625, 0.4775390625, 0.478515625, 0.4794921875, 0.48046875, 0.4814453125, 0.482421875, 0.4833984375, 0.484375, 0.4853515625, 0.486328125, 0.4873046875, 0.48828125, 0.4892578125, 0.490234375, 0.4912109375, 0.4921875, 0.4931640625, 0.494140625, 0.4951171875, 0.49609375, 0.4970703125, 0.498046875, 0.4990234375, 0.5, 0.5009765625, 0.501953125, 0.5029296875, 0.50390625, 0.5048828125, 0.505859375, 0.5068359375, 0.5078125, 0.5087890625, 0.509765625, 0.5107421875, 0.51171875, 0.5126953125, 0.513671875, 0.5146484375, 0.515625, 0.5166015625, 0.517578125, 0.5185546875, 0.51953125, 0.5205078125, 0.521484375, 0.5224609375, 0.5234375, 0.5244140625, 0.525390625, 0.5263671875, 0.52734375, 0.5283203125, 0.529296875, 0.5302734375, 0.53125, 0.5322265625, 0.533203125, 0.5341796875, 0.53515625, 0.5361328125, 0.537109375, 0.5380859375, 0.5390625, 0.5400390625, 0.541015625, 0.5419921875, 0.54296875, 0.5439453125, 0.544921875, 0.5458984375, 0.546875, 0.5478515625, 0.548828125, 0.5498046875, 0.55078125, 0.5517578125, 0.552734375, 0.5537109375, 0.5546875, 0.5556640625, 0.556640625, 0.5576171875, 0.55859375, 0.5595703125, 0.560546875, 0.5615234375, 0.5625, 0.5634765625, 0.564453125, 0.5654296875, 0.56640625, 0.5673828125, 0.568359375, 0.5693359375, 0.5703125, 0.5712890625, 0.572265625, 0.5732421875, 0.57421875, 0.5751953125, 0.576171875, 0.5771484375, 0.578125, 0.5791015625, 0.580078125, 0.5810546875, 0.58203125, 0.5830078125, 0.583984375, 0.5849609375, 0.5859375, 0.5869140625, 0.587890625, 0.5888671875, 0.58984375, 0.5908203125, 0.591796875, 0.5927734375, 0.59375, 0.5947265625, 0.595703125, 0.5966796875, 0.59765625, 0.5986328125, 0.599609375, 0.6005859375, 0.6015625, 0.6025390625, 0.603515625, 0.6044921875, 0.60546875, 0.6064453125, 0.607421875, 0.6083984375, 0.609375, 0.6103515625, 0.611328125, 0.6123046875, 0.61328125, 0.6142578125, 0.615234375, 0.6162109375, 0.6171875, 0.6181640625, 0.619140625, 0.6201171875, 0.62109375, 0.6220703125, 0.623046875, 0.6240234375, 0.625, 0.6259765625, 0.626953125, 0.6279296875, 0.62890625, 0.6298828125, 0.630859375, 0.6318359375, 0.6328125, 0.6337890625, 0.634765625, 0.6357421875, 0.63671875, 0.6376953125, 0.638671875, 0.6396484375, 0.640625, 0.6416015625, 0.642578125, 0.6435546875, 0.64453125, 0.6455078125, 0.646484375, 0.6474609375, 0.6484375, 0.6494140625, 0.650390625, 0.6513671875, 0.65234375, 0.6533203125, 0.654296875, 0.6552734375, 0.65625, 0.6572265625, 0.658203125, 0.6591796875, 0.66015625, 0.6611328125, 0.662109375, 0.6630859375, 0.6640625, 0.6650390625, 0.666015625, 0.6669921875, 0.66796875, 0.6689453125, 0.669921875, 0.6708984375, 0.671875, 0.6728515625, 0.673828125, 0.6748046875, 0.67578125, 0.6767578125, 0.677734375, 0.6787109375, 0.6796875, 0.6806640625, 0.681640625, 0.6826171875, 0.68359375, 0.6845703125, 0.685546875, 0.6865234375, 0.6875, 0.6884765625, 0.689453125, 0.6904296875, 0.69140625, 0.6923828125, 0.693359375, 0.6943359375, 0.6953125, 0.6962890625, 0.697265625, 0.6982421875, 0.69921875, 0.7001953125, 0.701171875, 0.7021484375, 0.703125, 0.7041015625, 0.705078125, 0.7060546875, 0.70703125, 0.7080078125, 0.708984375, 0.7099609375, 0.7109375, 0.7119140625, 0.712890625, 0.7138671875, 0.71484375, 0.7158203125, 0.716796875, 0.7177734375, 0.71875, 0.7197265625, 0.720703125, 0.7216796875, 0.72265625, 0.7236328125, 0.724609375, 0.7255859375, 0.7265625, 0.7275390625, 0.728515625, 0.7294921875, 0.73046875, 0.7314453125, 0.732421875, 0.7333984375, 0.734375, 0.7353515625, 0.736328125, 0.7373046875, 0.73828125, 0.7392578125, 0.740234375, 0.7412109375, 0.7421875, 0.7431640625, 0.744140625, 0.7451171875, 0.74609375, 0.7470703125, 0.748046875, 0.7490234375, 0.75, 0.7509765625, 0.751953125, 0.7529296875, 0.75390625, 0.7548828125, 0.755859375, 0.7568359375, 0.7578125, 0.7587890625, 0.759765625, 0.7607421875, 0.76171875, 0.7626953125, 0.763671875, 0.7646484375, 0.765625, 0.7666015625, 0.767578125, 0.7685546875, 0.76953125, 0.7705078125, 0.771484375, 0.7724609375, 0.7734375, 0.7744140625, 0.775390625, 0.7763671875, 0.77734375, 0.7783203125, 0.779296875, 0.7802734375, 0.78125, 0.7822265625, 0.783203125, 0.7841796875, 0.78515625, 0.7861328125, 0.787109375, 0.7880859375, 0.7890625, 0.7900390625, 0.791015625, 0.7919921875, 0.79296875, 0.7939453125, 0.794921875, 0.7958984375, 0.796875, 0.7978515625, 0.798828125, 0.7998046875, 0.80078125, 0.8017578125, 0.802734375, 0.8037109375, 0.8046875, 0.8056640625, 0.806640625, 0.8076171875, 0.80859375, 0.8095703125, 0.810546875, 0.8115234375, 0.8125, 0.8134765625, 0.814453125, 0.8154296875, 0.81640625, 0.8173828125, 0.818359375, 0.8193359375, 0.8203125, 0.8212890625, 0.822265625, 0.8232421875, 0.82421875, 0.8251953125, 0.826171875, 0.8271484375, 0.828125, 0.8291015625, 0.830078125, 0.8310546875, 0.83203125, 0.8330078125, 0.833984375, 0.8349609375, 0.8359375, 0.8369140625, 0.837890625, 0.8388671875, 0.83984375, 0.8408203125, 0.841796875, 0.8427734375, 0.84375, 0.8447265625, 0.845703125, 0.8466796875, 0.84765625, 0.8486328125, 0.849609375, 0.8505859375, 0.8515625, 0.8525390625, 0.853515625, 0.8544921875, 0.85546875, 0.8564453125, 0.857421875, 0.8583984375, 0.859375, 0.8603515625, 0.861328125, 0.8623046875, 0.86328125, 0.8642578125, 0.865234375, 0.8662109375, 0.8671875, 0.8681640625, 0.869140625, 0.8701171875, 0.87109375, 0.8720703125, 0.873046875, 0.8740234375, 0.875, 0.8759765625, 0.876953125, 0.8779296875, 0.87890625, 0.8798828125, 0.880859375, 0.8818359375, 0.8828125, 0.8837890625, 0.884765625, 0.8857421875, 0.88671875, 0.8876953125, 0.888671875, 0.8896484375, 0.890625, 0.8916015625, 0.892578125, 0.8935546875, 0.89453125, 0.8955078125, 0.896484375, 0.8974609375, 0.8984375, 0.8994140625, 0.900390625, 0.9013671875, 0.90234375, 0.9033203125, 0.904296875, 0.9052734375, 0.90625, 0.9072265625, 0.908203125, 0.9091796875, 0.91015625, 0.9111328125, 0.912109375, 0.9130859375, 0.9140625, 0.9150390625, 0.916015625, 0.9169921875, 0.91796875, 0.9189453125, 0.919921875, 0.9208984375, 0.921875, 0.9228515625, 0.923828125, 0.9248046875, 0.92578125, 0.9267578125, 0.927734375, 0.9287109375, 0.9296875, 0.9306640625, 0.931640625, 0.9326171875, 0.93359375, 0.9345703125, 0.935546875, 0.9365234375, 0.9375, 0.9384765625, 0.939453125, 0.9404296875, 0.94140625, 0.9423828125, 0.943359375, 0.9443359375, 0.9453125, 0.9462890625, 0.947265625, 0.9482421875, 0.94921875, 0.9501953125, 0.951171875, 0.9521484375, 0.953125, 0.9541015625, 0.955078125, 0.9560546875, 0.95703125, 0.9580078125, 0.958984375, 0.9599609375, 0.9609375, 0.9619140625, 0.962890625, 0.9638671875, 0.96484375, 0.9658203125, 0.966796875, 0.9677734375, 0.96875, 0.9697265625, 0.970703125, 0.9716796875, 0.97265625, 0.9736328125, 0.974609375, 0.9755859375, 0.9765625, 0.9775390625, 0.978515625, 0.9794921875, 0.98046875, 0.9814453125, 0.982421875, 0.9833984375, 0.984375, 0.9853515625, 0.986328125, 0.9873046875, 0.98828125, 0.9892578125, 0.990234375, 0.9912109375, 0.9921875, 0.9931640625, 0.994140625, 0.9951171875, 0.99609375, 0.9970703125, 0.998046875, 0.9990234375, 1.0], "decode_rule": "dirichlet_resample", "support_power": 1.0, "semantic_power": 1.0, "anchor_mode": "onehot", "cfg_scale": 0.0, "cfg_power": 1.0, "cfg_start": 0.0, "cfg_prior": "uniform", "decode_freq_penalty_alpha": 0.0, "decode_freq_penalty_beta": 0.0, "decode_freq_penalty_floor": 0.0, "decode_freq_penalty_start": 0.0, "decode_freq_penalty_end": 1.0, "decode_freq_penalty_power": 1.0, "start_t": 0.0, "start_init": "noise", "noise_init": "dirichlet", "noise_sigma": -1.0, "dirichlet_concentration": 1.0, "concentration_min": 1.0, "concentration_max": 512.0, "target_prob": 1.0, "endpoint_temp": 1.5, "final_from": "state", "final_sample_mode": "argmax", "final_sample_temp": 1.0, "final_top_k": 64, "final_top_p": 0.95, "final_freq_penalty_alpha": 0.0, "final_freq_penalty_beta": 0.0, "final_freq_penalty_floor": 0.0, "lock_bos": false, "n_samples": 8, "seed": 20260519}, "raw_genppl": {"ppl": 2.137996627450479, "nll_per_token": 0.7598692351696538, "tokens": 2040, "kept_samples": 8, "total_samples": 8, "empty_rate": 0.0, "skipped_samples": 0}, "stripped_genppl": {"ppl": 2.137996627450479, "nll_per_token": 0.7598692351696538, "tokens": 2040, "kept_samples": 8, "total_samples": 8, "empty_rate": 0.0, "skipped_samples": 0}, "diversity": {"sample_entropy": 1.8910837194113463, "unique_tokens": 39, "token_count": 8192, "distinct_1": 0.0047607421875, "distinct_2": 0.011485826001955034, "top_token_mass": 0.148681640625}}
|
| 13 |
+
[done] docs/lta_samples/metrics_20260519/owt_t5_2node_latest_trainmatched_decode_temp1p5_n8/dirichlet_resample_cmax512.jsonl
|
| 14 |
+
[ckpt] eval_ckpts/lta_owt_t5_adaln_adamw_wd0p1_rollin_p50_randk0_3_uniformt_temp1_synct_gbs512_2node8gpu_1m_t-20260518224737-tftgw/latest_frozen_step101200_20260519_092441.pt step=101000
|
| 15 |
+
[decode-base] n=8 max_len=1024 steps=1024 model_t=post
|
| 16 |
+
[decode-time] schedule=linear s=[0.0,0.25] force_final=True t0=0.000000 t_mid=0.500000 t_end=1.000000 dt_mean=0.000977 dt_max=0.000977
|
| 17 |
+
[decode] temp=1.50 final=state rule=dirichlet_mean support=1 semantic=1 anchor=onehot cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=argmax/1/k64/p0.95 freq_penalty=0/0/0 start_t=0 start_init=noise time_path=0.0000->1.0000 generated 1/8
|
| 18 |
+
[decode] temp=1.50 final=state rule=dirichlet_mean support=1 semantic=1 anchor=onehot cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=argmax/1/k64/p0.95 freq_penalty=0/0/0 start_t=0 start_init=noise time_path=0.0000->1.0000 generated 2/8
|
| 19 |
+
[decode] temp=1.50 final=state rule=dirichlet_mean support=1 semantic=1 anchor=onehot cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=argmax/1/k64/p0.95 freq_penalty=0/0/0 start_t=0 start_init=noise time_path=0.0000->1.0000 generated 3/8
|
| 20 |
+
[decode] temp=1.50 final=state rule=dirichlet_mean support=1 semantic=1 anchor=onehot cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=argmax/1/k64/p0.95 freq_penalty=0/0/0 start_t=0 start_init=noise time_path=0.0000->1.0000 generated 4/8
|
| 21 |
+
[decode] temp=1.50 final=state rule=dirichlet_mean support=1 semantic=1 anchor=onehot cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=argmax/1/k64/p0.95 freq_penalty=0/0/0 start_t=0 start_init=noise time_path=0.0000->1.0000 generated 5/8
|
| 22 |
+
[decode] temp=1.50 final=state rule=dirichlet_mean support=1 semantic=1 anchor=onehot cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=argmax/1/k64/p0.95 freq_penalty=0/0/0 start_t=0 start_init=noise time_path=0.0000->1.0000 generated 6/8
|
| 23 |
+
[decode] temp=1.50 final=state rule=dirichlet_mean support=1 semantic=1 anchor=onehot cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=argmax/1/k64/p0.95 freq_penalty=0/0/0 start_t=0 start_init=noise time_path=0.0000->1.0000 generated 7/8
|
| 24 |
+
[decode] temp=1.50 final=state rule=dirichlet_mean support=1 semantic=1 anchor=onehot cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=argmax/1/k64/p0.95 freq_penalty=0/0/0 start_t=0 start_init=noise time_path=0.0000->1.0000 generated 8/8
|
| 25 |
+
[summary] {"type": "summary", "checkpoint": "eval_ckpts/lta_owt_t5_adaln_adamw_wd0p1_rollin_p50_randk0_3_uniformt_temp1_synct_gbs512_2node8gpu_1m_t-20260518224737-tftgw/latest_frozen_step101200_20260519_092441.pt", "step": 101000, "decode": {"steps": 1024, "model_t_mode": "post", "decode_time_schedule": "linear", "decode_s_min_frac": 0.0, "decode_s_max_frac": 0.25, "decode_force_final_t": true, "decode_time_grid": [0.0, 0.0009765625, 0.001953125, 0.0029296875, 0.00390625, 0.0048828125, 0.005859375, 0.0068359375, 0.0078125, 0.0087890625, 0.009765625, 0.0107421875, 0.01171875, 0.0126953125, 0.013671875, 0.0146484375, 0.015625, 0.0166015625, 0.017578125, 0.0185546875, 0.01953125, 0.0205078125, 0.021484375, 0.0224609375, 0.0234375, 0.0244140625, 0.025390625, 0.0263671875, 0.02734375, 0.0283203125, 0.029296875, 0.0302734375, 0.03125, 0.0322265625, 0.033203125, 0.0341796875, 0.03515625, 0.0361328125, 0.037109375, 0.0380859375, 0.0390625, 0.0400390625, 0.041015625, 0.0419921875, 0.04296875, 0.0439453125, 0.044921875, 0.0458984375, 0.046875, 0.0478515625, 0.048828125, 0.0498046875, 0.05078125, 0.0517578125, 0.052734375, 0.0537109375, 0.0546875, 0.0556640625, 0.056640625, 0.0576171875, 0.05859375, 0.0595703125, 0.060546875, 0.0615234375, 0.0625, 0.0634765625, 0.064453125, 0.0654296875, 0.06640625, 0.0673828125, 0.068359375, 0.0693359375, 0.0703125, 0.0712890625, 0.072265625, 0.0732421875, 0.07421875, 0.0751953125, 0.076171875, 0.0771484375, 0.078125, 0.0791015625, 0.080078125, 0.0810546875, 0.08203125, 0.0830078125, 0.083984375, 0.0849609375, 0.0859375, 0.0869140625, 0.087890625, 0.0888671875, 0.08984375, 0.0908203125, 0.091796875, 0.0927734375, 0.09375, 0.0947265625, 0.095703125, 0.0966796875, 0.09765625, 0.0986328125, 0.099609375, 0.1005859375, 0.1015625, 0.1025390625, 0.103515625, 0.1044921875, 0.10546875, 0.1064453125, 0.107421875, 0.1083984375, 0.109375, 0.1103515625, 0.111328125, 0.1123046875, 0.11328125, 0.1142578125, 0.115234375, 0.1162109375, 0.1171875, 0.1181640625, 0.119140625, 0.1201171875, 0.12109375, 0.1220703125, 0.123046875, 0.1240234375, 0.125, 0.1259765625, 0.126953125, 0.1279296875, 0.12890625, 0.1298828125, 0.130859375, 0.1318359375, 0.1328125, 0.1337890625, 0.134765625, 0.1357421875, 0.13671875, 0.1376953125, 0.138671875, 0.1396484375, 0.140625, 0.1416015625, 0.142578125, 0.1435546875, 0.14453125, 0.1455078125, 0.146484375, 0.1474609375, 0.1484375, 0.1494140625, 0.150390625, 0.1513671875, 0.15234375, 0.1533203125, 0.154296875, 0.1552734375, 0.15625, 0.1572265625, 0.158203125, 0.1591796875, 0.16015625, 0.1611328125, 0.162109375, 0.1630859375, 0.1640625, 0.1650390625, 0.166015625, 0.1669921875, 0.16796875, 0.1689453125, 0.169921875, 0.1708984375, 0.171875, 0.1728515625, 0.173828125, 0.1748046875, 0.17578125, 0.1767578125, 0.177734375, 0.1787109375, 0.1796875, 0.1806640625, 0.181640625, 0.1826171875, 0.18359375, 0.1845703125, 0.185546875, 0.1865234375, 0.1875, 0.1884765625, 0.189453125, 0.1904296875, 0.19140625, 0.1923828125, 0.193359375, 0.1943359375, 0.1953125, 0.1962890625, 0.197265625, 0.1982421875, 0.19921875, 0.2001953125, 0.201171875, 0.2021484375, 0.203125, 0.2041015625, 0.205078125, 0.2060546875, 0.20703125, 0.2080078125, 0.208984375, 0.2099609375, 0.2109375, 0.2119140625, 0.212890625, 0.2138671875, 0.21484375, 0.2158203125, 0.216796875, 0.2177734375, 0.21875, 0.2197265625, 0.220703125, 0.2216796875, 0.22265625, 0.2236328125, 0.224609375, 0.2255859375, 0.2265625, 0.2275390625, 0.228515625, 0.2294921875, 0.23046875, 0.2314453125, 0.232421875, 0.2333984375, 0.234375, 0.2353515625, 0.236328125, 0.2373046875, 0.23828125, 0.2392578125, 0.240234375, 0.2412109375, 0.2421875, 0.2431640625, 0.244140625, 0.2451171875, 0.24609375, 0.2470703125, 0.248046875, 0.2490234375, 0.25, 0.2509765625, 0.251953125, 0.2529296875, 0.25390625, 0.2548828125, 0.255859375, 0.2568359375, 0.2578125, 0.2587890625, 0.259765625, 0.2607421875, 0.26171875, 0.2626953125, 0.263671875, 0.2646484375, 0.265625, 0.2666015625, 0.267578125, 0.2685546875, 0.26953125, 0.2705078125, 0.271484375, 0.2724609375, 0.2734375, 0.2744140625, 0.275390625, 0.2763671875, 0.27734375, 0.2783203125, 0.279296875, 0.2802734375, 0.28125, 0.2822265625, 0.283203125, 0.2841796875, 0.28515625, 0.2861328125, 0.287109375, 0.2880859375, 0.2890625, 0.2900390625, 0.291015625, 0.2919921875, 0.29296875, 0.2939453125, 0.294921875, 0.2958984375, 0.296875, 0.2978515625, 0.298828125, 0.2998046875, 0.30078125, 0.3017578125, 0.302734375, 0.3037109375, 0.3046875, 0.3056640625, 0.306640625, 0.3076171875, 0.30859375, 0.3095703125, 0.310546875, 0.3115234375, 0.3125, 0.3134765625, 0.314453125, 0.3154296875, 0.31640625, 0.3173828125, 0.318359375, 0.3193359375, 0.3203125, 0.3212890625, 0.322265625, 0.3232421875, 0.32421875, 0.3251953125, 0.326171875, 0.3271484375, 0.328125, 0.3291015625, 0.330078125, 0.3310546875, 0.33203125, 0.3330078125, 0.333984375, 0.3349609375, 0.3359375, 0.3369140625, 0.337890625, 0.3388671875, 0.33984375, 0.3408203125, 0.341796875, 0.3427734375, 0.34375, 0.3447265625, 0.345703125, 0.3466796875, 0.34765625, 0.3486328125, 0.349609375, 0.3505859375, 0.3515625, 0.3525390625, 0.353515625, 0.3544921875, 0.35546875, 0.3564453125, 0.357421875, 0.3583984375, 0.359375, 0.3603515625, 0.361328125, 0.3623046875, 0.36328125, 0.3642578125, 0.365234375, 0.3662109375, 0.3671875, 0.3681640625, 0.369140625, 0.3701171875, 0.37109375, 0.3720703125, 0.373046875, 0.3740234375, 0.375, 0.3759765625, 0.376953125, 0.3779296875, 0.37890625, 0.3798828125, 0.380859375, 0.3818359375, 0.3828125, 0.3837890625, 0.384765625, 0.3857421875, 0.38671875, 0.3876953125, 0.388671875, 0.3896484375, 0.390625, 0.3916015625, 0.392578125, 0.3935546875, 0.39453125, 0.3955078125, 0.396484375, 0.3974609375, 0.3984375, 0.3994140625, 0.400390625, 0.4013671875, 0.40234375, 0.4033203125, 0.404296875, 0.4052734375, 0.40625, 0.4072265625, 0.408203125, 0.4091796875, 0.41015625, 0.4111328125, 0.412109375, 0.4130859375, 0.4140625, 0.4150390625, 0.416015625, 0.4169921875, 0.41796875, 0.4189453125, 0.419921875, 0.4208984375, 0.421875, 0.4228515625, 0.423828125, 0.4248046875, 0.42578125, 0.4267578125, 0.427734375, 0.4287109375, 0.4296875, 0.4306640625, 0.431640625, 0.4326171875, 0.43359375, 0.4345703125, 0.435546875, 0.4365234375, 0.4375, 0.4384765625, 0.439453125, 0.4404296875, 0.44140625, 0.4423828125, 0.443359375, 0.4443359375, 0.4453125, 0.4462890625, 0.447265625, 0.4482421875, 0.44921875, 0.4501953125, 0.451171875, 0.4521484375, 0.453125, 0.4541015625, 0.455078125, 0.4560546875, 0.45703125, 0.4580078125, 0.458984375, 0.4599609375, 0.4609375, 0.4619140625, 0.462890625, 0.4638671875, 0.46484375, 0.4658203125, 0.466796875, 0.4677734375, 0.46875, 0.4697265625, 0.470703125, 0.4716796875, 0.47265625, 0.4736328125, 0.474609375, 0.4755859375, 0.4765625, 0.4775390625, 0.478515625, 0.4794921875, 0.48046875, 0.4814453125, 0.482421875, 0.4833984375, 0.484375, 0.4853515625, 0.486328125, 0.4873046875, 0.48828125, 0.4892578125, 0.490234375, 0.4912109375, 0.4921875, 0.4931640625, 0.494140625, 0.4951171875, 0.49609375, 0.4970703125, 0.498046875, 0.4990234375, 0.5, 0.5009765625, 0.501953125, 0.5029296875, 0.50390625, 0.5048828125, 0.505859375, 0.5068359375, 0.5078125, 0.5087890625, 0.509765625, 0.5107421875, 0.51171875, 0.5126953125, 0.513671875, 0.5146484375, 0.515625, 0.5166015625, 0.517578125, 0.5185546875, 0.51953125, 0.5205078125, 0.521484375, 0.5224609375, 0.5234375, 0.5244140625, 0.525390625, 0.5263671875, 0.52734375, 0.5283203125, 0.529296875, 0.5302734375, 0.53125, 0.5322265625, 0.533203125, 0.5341796875, 0.53515625, 0.5361328125, 0.537109375, 0.5380859375, 0.5390625, 0.5400390625, 0.541015625, 0.5419921875, 0.54296875, 0.5439453125, 0.544921875, 0.5458984375, 0.546875, 0.5478515625, 0.548828125, 0.5498046875, 0.55078125, 0.5517578125, 0.552734375, 0.5537109375, 0.5546875, 0.5556640625, 0.556640625, 0.5576171875, 0.55859375, 0.5595703125, 0.560546875, 0.5615234375, 0.5625, 0.5634765625, 0.564453125, 0.5654296875, 0.56640625, 0.5673828125, 0.568359375, 0.5693359375, 0.5703125, 0.5712890625, 0.572265625, 0.5732421875, 0.57421875, 0.5751953125, 0.576171875, 0.5771484375, 0.578125, 0.5791015625, 0.580078125, 0.5810546875, 0.58203125, 0.5830078125, 0.583984375, 0.5849609375, 0.5859375, 0.5869140625, 0.587890625, 0.5888671875, 0.58984375, 0.5908203125, 0.591796875, 0.5927734375, 0.59375, 0.5947265625, 0.595703125, 0.5966796875, 0.59765625, 0.5986328125, 0.599609375, 0.6005859375, 0.6015625, 0.6025390625, 0.603515625, 0.6044921875, 0.60546875, 0.6064453125, 0.607421875, 0.6083984375, 0.609375, 0.6103515625, 0.611328125, 0.6123046875, 0.61328125, 0.6142578125, 0.615234375, 0.6162109375, 0.6171875, 0.6181640625, 0.619140625, 0.6201171875, 0.62109375, 0.6220703125, 0.623046875, 0.6240234375, 0.625, 0.6259765625, 0.626953125, 0.6279296875, 0.62890625, 0.6298828125, 0.630859375, 0.6318359375, 0.6328125, 0.6337890625, 0.634765625, 0.6357421875, 0.63671875, 0.6376953125, 0.638671875, 0.6396484375, 0.640625, 0.6416015625, 0.642578125, 0.6435546875, 0.64453125, 0.6455078125, 0.646484375, 0.6474609375, 0.6484375, 0.6494140625, 0.650390625, 0.6513671875, 0.65234375, 0.6533203125, 0.654296875, 0.6552734375, 0.65625, 0.6572265625, 0.658203125, 0.6591796875, 0.66015625, 0.6611328125, 0.662109375, 0.6630859375, 0.6640625, 0.6650390625, 0.666015625, 0.6669921875, 0.66796875, 0.6689453125, 0.669921875, 0.6708984375, 0.671875, 0.6728515625, 0.673828125, 0.6748046875, 0.67578125, 0.6767578125, 0.677734375, 0.6787109375, 0.6796875, 0.6806640625, 0.681640625, 0.6826171875, 0.68359375, 0.6845703125, 0.685546875, 0.6865234375, 0.6875, 0.6884765625, 0.689453125, 0.6904296875, 0.69140625, 0.6923828125, 0.693359375, 0.6943359375, 0.6953125, 0.6962890625, 0.697265625, 0.6982421875, 0.69921875, 0.7001953125, 0.701171875, 0.7021484375, 0.703125, 0.7041015625, 0.705078125, 0.7060546875, 0.70703125, 0.7080078125, 0.708984375, 0.7099609375, 0.7109375, 0.7119140625, 0.712890625, 0.7138671875, 0.71484375, 0.7158203125, 0.716796875, 0.7177734375, 0.71875, 0.7197265625, 0.720703125, 0.7216796875, 0.72265625, 0.7236328125, 0.724609375, 0.7255859375, 0.7265625, 0.7275390625, 0.728515625, 0.7294921875, 0.73046875, 0.7314453125, 0.732421875, 0.7333984375, 0.734375, 0.7353515625, 0.736328125, 0.7373046875, 0.73828125, 0.7392578125, 0.740234375, 0.7412109375, 0.7421875, 0.7431640625, 0.744140625, 0.7451171875, 0.74609375, 0.7470703125, 0.748046875, 0.7490234375, 0.75, 0.7509765625, 0.751953125, 0.7529296875, 0.75390625, 0.7548828125, 0.755859375, 0.7568359375, 0.7578125, 0.7587890625, 0.759765625, 0.7607421875, 0.76171875, 0.7626953125, 0.763671875, 0.7646484375, 0.765625, 0.7666015625, 0.767578125, 0.7685546875, 0.76953125, 0.7705078125, 0.771484375, 0.7724609375, 0.7734375, 0.7744140625, 0.775390625, 0.7763671875, 0.77734375, 0.7783203125, 0.779296875, 0.7802734375, 0.78125, 0.7822265625, 0.783203125, 0.7841796875, 0.78515625, 0.7861328125, 0.787109375, 0.7880859375, 0.7890625, 0.7900390625, 0.791015625, 0.7919921875, 0.79296875, 0.7939453125, 0.794921875, 0.7958984375, 0.796875, 0.7978515625, 0.798828125, 0.7998046875, 0.80078125, 0.8017578125, 0.802734375, 0.8037109375, 0.8046875, 0.8056640625, 0.806640625, 0.8076171875, 0.80859375, 0.8095703125, 0.810546875, 0.8115234375, 0.8125, 0.8134765625, 0.814453125, 0.8154296875, 0.81640625, 0.8173828125, 0.818359375, 0.8193359375, 0.8203125, 0.8212890625, 0.822265625, 0.8232421875, 0.82421875, 0.8251953125, 0.826171875, 0.8271484375, 0.828125, 0.8291015625, 0.830078125, 0.8310546875, 0.83203125, 0.8330078125, 0.833984375, 0.8349609375, 0.8359375, 0.8369140625, 0.837890625, 0.8388671875, 0.83984375, 0.8408203125, 0.841796875, 0.8427734375, 0.84375, 0.8447265625, 0.845703125, 0.8466796875, 0.84765625, 0.8486328125, 0.849609375, 0.8505859375, 0.8515625, 0.8525390625, 0.853515625, 0.8544921875, 0.85546875, 0.8564453125, 0.857421875, 0.8583984375, 0.859375, 0.8603515625, 0.861328125, 0.8623046875, 0.86328125, 0.8642578125, 0.865234375, 0.8662109375, 0.8671875, 0.8681640625, 0.869140625, 0.8701171875, 0.87109375, 0.8720703125, 0.873046875, 0.8740234375, 0.875, 0.8759765625, 0.876953125, 0.8779296875, 0.87890625, 0.8798828125, 0.880859375, 0.8818359375, 0.8828125, 0.8837890625, 0.884765625, 0.8857421875, 0.88671875, 0.8876953125, 0.888671875, 0.8896484375, 0.890625, 0.8916015625, 0.892578125, 0.8935546875, 0.89453125, 0.8955078125, 0.896484375, 0.8974609375, 0.8984375, 0.8994140625, 0.900390625, 0.9013671875, 0.90234375, 0.9033203125, 0.904296875, 0.9052734375, 0.90625, 0.9072265625, 0.908203125, 0.9091796875, 0.91015625, 0.9111328125, 0.912109375, 0.9130859375, 0.9140625, 0.9150390625, 0.916015625, 0.9169921875, 0.91796875, 0.9189453125, 0.919921875, 0.9208984375, 0.921875, 0.9228515625, 0.923828125, 0.9248046875, 0.92578125, 0.9267578125, 0.927734375, 0.9287109375, 0.9296875, 0.9306640625, 0.931640625, 0.9326171875, 0.93359375, 0.9345703125, 0.935546875, 0.9365234375, 0.9375, 0.9384765625, 0.939453125, 0.9404296875, 0.94140625, 0.9423828125, 0.943359375, 0.9443359375, 0.9453125, 0.9462890625, 0.947265625, 0.9482421875, 0.94921875, 0.9501953125, 0.951171875, 0.9521484375, 0.953125, 0.9541015625, 0.955078125, 0.9560546875, 0.95703125, 0.9580078125, 0.958984375, 0.9599609375, 0.9609375, 0.9619140625, 0.962890625, 0.9638671875, 0.96484375, 0.9658203125, 0.966796875, 0.9677734375, 0.96875, 0.9697265625, 0.970703125, 0.9716796875, 0.97265625, 0.9736328125, 0.974609375, 0.9755859375, 0.9765625, 0.9775390625, 0.978515625, 0.9794921875, 0.98046875, 0.9814453125, 0.982421875, 0.9833984375, 0.984375, 0.9853515625, 0.986328125, 0.9873046875, 0.98828125, 0.9892578125, 0.990234375, 0.9912109375, 0.9921875, 0.9931640625, 0.994140625, 0.9951171875, 0.99609375, 0.9970703125, 0.998046875, 0.9990234375, 1.0], "decode_rule": "dirichlet_mean", "support_power": 1.0, "semantic_power": 1.0, "anchor_mode": "onehot", "cfg_scale": 0.0, "cfg_power": 1.0, "cfg_start": 0.0, "cfg_prior": "uniform", "decode_freq_penalty_alpha": 0.0, "decode_freq_penalty_beta": 0.0, "decode_freq_penalty_floor": 0.0, "decode_freq_penalty_start": 0.0, "decode_freq_penalty_end": 1.0, "decode_freq_penalty_power": 1.0, "start_t": 0.0, "start_init": "noise", "noise_init": "dirichlet", "noise_sigma": -1.0, "dirichlet_concentration": 1.0, "concentration_min": 1.0, "concentration_max": 512.0, "target_prob": 1.0, "endpoint_temp": 1.5, "final_from": "state", "final_sample_mode": "argmax", "final_sample_temp": 1.0, "final_top_k": 64, "final_top_p": 0.95, "final_freq_penalty_alpha": 0.0, "final_freq_penalty_beta": 0.0, "final_freq_penalty_floor": 0.0, "lock_bos": false, "n_samples": 8, "seed": 20260519}, "raw_genppl": {"ppl": 1.0, "nll_per_token": 0.0, "tokens": 0, "kept_samples": 0, "total_samples": 0, "empty_rate": 0.0, "skipped_samples": 0}, "stripped_genppl": {"ppl": 1.0, "nll_per_token": 0.0, "tokens": 0, "kept_samples": 0, "total_samples": 0, "empty_rate": 0.0, "skipped_samples": 0}, "diversity": {"sample_entropy": 0.0, "unique_tokens": 1, "token_count": 8192, "distinct_1": 0.0001220703125, "distinct_2": 0.00012218963831867058, "top_token_mass": 1.0}}
|
| 26 |
+
[done] docs/lta_samples/metrics_20260519/owt_t5_2node_latest_trainmatched_decode_temp1p5_n8/dirichlet_mean_cmax512.jsonl
|
LTA_openwebtext_dualt/logs/lm1b_bos_ban_special_eval_newserver.log
ADDED
|
@@ -0,0 +1,58 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
ids {'pad': 0, 'unk': 100, 'bos': 101, 'eos': 102}
|
| 2 |
+
|
| 3 |
+
{
|
| 4 |
+
"tag": "boslock_const05_final_state_k256_s16",
|
| 5 |
+
"out": "runs/lta_lm1b_duo_aligned_dirichlet_dualt_onehot_hardce_b128_4xh20_1m_nw0_compile_ro/genppl_lm1b_latest_boslock_const05_final_state_k256_s16.jsonl",
|
| 6 |
+
"gen_ppl": 9.87111397613715,
|
| 7 |
+
"nll": 2.2896127119334855,
|
| 8 |
+
"tokens": 3067,
|
| 9 |
+
"entropy": 3.0039772758541794,
|
| 10 |
+
"distinct1": 0.1875,
|
| 11 |
+
"distinct2": 0.42322834645669294,
|
| 12 |
+
"top_token_mass": 0.1611328125,
|
| 13 |
+
"samples": [
|
| 14 |
+
"[CLS] ibm. [SEP] [SEP] [CLS] well, the economy is the worst in the world... [SEP] [SEP] [CLS] the lost money was passed to the in person, including the driver of the two buses, the report said. [SEP] [SEP] [CLS] this is the presidential race. [SEP] [SEP] [CLS] the fee is increased in addition to the current limit. [SEP] [SEP] [CLS] the day of the battle of the wall, in the morning. [SEP] [SEP] [CLS] the ashe. [SEP] [SEP] [CLS] but, the problem is, the harm that is, to the extent of the damage, is to the economy, he said. [SEP] [SEP] [CLS] the blazers, winners of the [SEP]",
|
| 15 |
+
"[CLS] earlier in the day, \" the statement said. [SEP] [SEP] [CLS] in the end, according to the status, the celebrations, the distractions, the costumes, the depth, and the subtlety. [SEP] [SEP] [CLS] in the............... [SEP] [SEP] [CLS].................. [SEP] [SEP] [CLS] oh, the knowledge and resources, the money, the resources, the....... [SEP] [SEP] [CLS] and, yes, elsewhere, and in the rest of the uk. [SEP] [SEP] [CLS] \" [SEP]",
|
| 16 |
+
"[CLS] in the cup. [SEP] [SEP] [CLS] politicians and the public, the government, students, and the public. [SEP] [SEP] [CLS] and, look, file... the closest in getting the two... [SEP] [SEP] [CLS] the leaders of the u. s., canada, canada, germany, russia, and, netherlands, are in the meeting. [SEP] [SEP] [CLS] the baby was found in a drawer. [SEP] [SEP] [CLS] the majority, according to nielsen, is in the united states. [SEP] [SEP] [CLS] in the end, most of the clubs. [SEP] [SEP] [CLS] it ' s psychology. [SEP] [SEP] [CLS] 16, 2008,. [SEP] [SEP]",
|
| 17 |
+
"[CLS]....... [SEP] [SEP] [CLS] this is the biggest country, in the world, right. [SEP] [SEP] [CLS] the pain, the pain of anger, the betrayals, the pain of the anger, the pain of death and the slow slows of the inevitable. [SEP] [SEP] [CLS] in addition, he said, it has adapted to the environment, to the age of the brain. [SEP] [SEP] [CLS] and to the players, the final and the final. [SEP] [SEP] [CLS] and, yes... [SEP] [SEP] [CLS] and the the circuits. [SEP] [SEP] [CLS] the iraqis, the canadians, and the canadians, and the [SEP]",
|
| 18 |
+
"[CLS]. [SEP] [SEP] [CLS] in the last year, the government, for example, intervened. [SEP] [SEP] [CLS] the man was wearing a motorcycle harness in the back of the horse, the report said. [SEP] [SEP] [CLS] the people in the world..., in the world, the country, the people in the world and the world.... [SEP] [SEP] [CLS] it is the law, and the spirit of the constitution. [SEP] [SEP] [CLS] the goal is to reduce the number of students and the seniors in the complete cycle. [SEP] [SEP] [CLS] the ceremony was the kickoff today in the history of the royal melbourne society. [SEP] [SEP] [CLS] the [SEP]"
|
| 19 |
+
]
|
| 20 |
+
}
|
| 21 |
+
|
| 22 |
+
{
|
| 23 |
+
"tag": "boslock_bancls_const05_final_state_k128_s16",
|
| 24 |
+
"out": "runs/lta_lm1b_duo_aligned_dirichlet_dualt_onehot_hardce_b128_4xh20_1m_nw0_compile_ro/genppl_lm1b_latest_boslock_bancls_const05_final_state_k128_s16.jsonl",
|
| 25 |
+
"gen_ppl": 36.469665232385864,
|
| 26 |
+
"nll": 3.596480825518406,
|
| 27 |
+
"tokens": 1073,
|
| 28 |
+
"entropy": 1.3724775055784635,
|
| 29 |
+
"distinct1": 0.08251953125,
|
| 30 |
+
"distinct2": 0.18405511811023623,
|
| 31 |
+
"top_token_mass": 0.63818359375,
|
| 32 |
+
"samples": [
|
| 33 |
+
"[CLS] similar to anywhere in the world......,.,.,.,.,.......,.,.................................................................. [SEP] [SEP]. the answer is... [SEP] [SEP]. \" we need to pay respects to the... [SEP] patients at the hospital, [SEP]",
|
| 34 |
+
"[CLS].............................................................................................................................. [SEP]",
|
| 35 |
+
"[CLS] the all - star game, the team said wednesday. [SEP] [SEP] and the doctor is. [SEP] [SEP], the sea, the, and the sun, the, the lake,, to the palm... the sun, the sea.., to the month, to, the sea..., the sun, the..., the...., the sun, sun,........, the sun, the, the sea, the sun, and the sun.... and the then and now......., in the name of the [SEP]",
|
| 36 |
+
"[CLS].............................................................................................................................. [SEP]",
|
| 37 |
+
"[CLS]..................................................................................................................... [SEP] [SEP] but the incident led to the arrest [SEP]"
|
| 38 |
+
]
|
| 39 |
+
}
|
| 40 |
+
|
| 41 |
+
{
|
| 42 |
+
"tag": "boslock_bancls_const05_final_state_k256_s16",
|
| 43 |
+
"out": "runs/lta_lm1b_duo_aligned_dirichlet_dualt_onehot_hardce_b128_4xh20_1m_nw0_compile_ro/genppl_lm1b_latest_boslock_bancls_const05_final_state_k256_s16.jsonl",
|
| 44 |
+
"gen_ppl": 17.299491599431878,
|
| 45 |
+
"nll": 2.8506771137558413,
|
| 46 |
+
"tokens": 2675,
|
| 47 |
+
"entropy": 2.93029293364398,
|
| 48 |
+
"distinct1": 0.20068359375,
|
| 49 |
+
"distinct2": 0.4512795275590551,
|
| 50 |
+
"top_token_mass": 0.16748046875,
|
| 51 |
+
"samples": [
|
| 52 |
+
"[CLS] - - and, increasingly, to shanghai - - of america. [SEP] [SEP] [PAD] \" i think the [ is ] perfect, \" he said. [SEP] [SEP] hilly so, as the adhering of the pen, that of our 20 carvings. [SEP] [SEP], the size of the abdomen, the size of the utes. [SEP] [SEP] damage in the end, it is a story, of the world, everything, everything,... [SEP] [SEP] the the study was published in the journal of the journal of the american. [SEP] [SEP] inspiring he never, however, was foot in the senate - - the 411. [SEP] [SEP] [SEP] \" [SEP]",
|
| 53 |
+
"[CLS]. [SEP] [SEP] [SEP] \" it is in the classroom - - the teacher is the pal, to the teacher. \" [SEP] [SEP] lying in the corner is the copy of the book. [SEP] [SEP] [SEP] 25, 2006,.... [SEP] [SEP] [SEP] on june 17, he traveled to the united states, and then to the i. s. a., to discuss the details of the prohibition. [SEP] [SEP]ang it was a good day for the government and the industry. [SEP] [SEP] townships the engineers said. [SEP] [SEP] [PAD] a lot of people. [SEP] [SEP] [SEP] the storms in the capital, santiago, to the northeast of the [SEP]",
|
| 54 |
+
"[CLS] the country. [SEP] [SEP] [SEP] \" then, according to the rules, the overwhelming. [SEP] [SEP], it ' s in the interest of the u. s. and the interest paid to yours by the government. [SEP] [SEP] [unused884] the better, the less eats, the less. [SEP] [SEP] bath the cases of the abuse... [SEP] [SEP] [SEP] \" in the standpoint, the international and the.... [SEP] [SEP] [SEP] \" the size of the maximum is between the consumer and the accessory, \" he said. [SEP] [SEP], but this, the coach, was in the game. [SEP] [SEP] [SEP] in the may, the [SEP]",
|
| 55 |
+
"[CLS] said. [SEP] [SEP] spotlight he was born in the bronx. [SEP] [SEP] [SEP] \" in the case of the players, they, the players and the fans, and the players. [SEP] [SEP] and the problem.... [SEP] [SEP] the the the sun, the the sun, the rain, the green of loss. [SEP] [SEP] spiritual they are the skiers, the climbers, the pe, the doctors, the hungarians, the swiss, the italians, the americans, the russians, and the americans. [SEP] [SEP] sabrina the leader of the party, the politician, the champion of the community, everything. [SEP] [SEP] [SEP] the study [SEP]",
|
| 56 |
+
"[CLS] place, \" he said. [SEP] [SEP] [SEP] the answer is, in the end, the future of the country, and to the people... [SEP] [SEP] [SEP] the problem, in the case of the, as big, in the... [SEP] [SEP] agent the key is the quality to the doing, and to the flow of the money, to the government, the size of the government, the size of the government, and the counter network. [SEP] [SEP]mas the problem is, in the end, in the path to the end. [SEP] [SEP] [unused297] inc., usa, and co., inc., ltd. [SEP]"
|
| 57 |
+
]
|
| 58 |
+
}
|
LTA_openwebtext_dualt/logs/lta_lm1b_classic_dirichlet_len128_4gpu_10k_driver.log
ADDED
|
@@ -0,0 +1,285 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[launch] method=categorical_fullvocab_c1024_fullycoupled host=di-20260411014000-djqhq time=2026-05-23T10:11:09+00:00
|
| 2 |
+
[launch] cwd=/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt
|
| 3 |
+
[launch] run_name=lta_lm1b_classic_dirichlet_len128_gbs512_4gpu_10k_save1k_20260523
|
| 4 |
+
[launch] save_dir=runs/lta_lm1b_classic_dirichlet_len128_gbs512_4gpu_10k_save1k_20260523
|
| 5 |
+
[launch] log_file=logs/lta_lm1b_classic_dirichlet_len128_gbs512_4gpu_10k_save1k_20260523.log
|
| 6 |
+
|
| 7 |
+
*****************************************
|
| 8 |
+
Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed.
|
| 9 |
+
*****************************************
|
| 10 |
+
NCCL version 2.25.1+cuda12.8
|
| 11 |
+
{
|
| 12 |
+
"device": "cuda:0",
|
| 13 |
+
"rank": 0,
|
| 14 |
+
"world_size": 4,
|
| 15 |
+
"samples": "wrapped_stream",
|
| 16 |
+
"vocab_size": 30522,
|
| 17 |
+
"tokenizer_vocab_size": 30522,
|
| 18 |
+
"save_dir": "runs/lta_lm1b_classic_dirichlet_len128_gbs512_4gpu_10k_save1k_20260523",
|
| 19 |
+
"max_len": 128,
|
| 20 |
+
"effective_model_max_len": 128,
|
| 21 |
+
"batch_size": 64,
|
| 22 |
+
"grad_accum": 2,
|
| 23 |
+
"effective_batch_size": 512,
|
| 24 |
+
"global_batch_size": 512,
|
| 25 |
+
"lr_schedule": "constant_warmup",
|
| 26 |
+
"optimizer": "adamw",
|
| 27 |
+
"epochs": 0.0,
|
| 28 |
+
"steps_per_epoch": 0,
|
| 29 |
+
"total_steps": 10000,
|
| 30 |
+
"warmup_steps": 2500,
|
| 31 |
+
"warmup_epochs": -1.0,
|
| 32 |
+
"min_lr": 6e-05,
|
| 33 |
+
"weight_decay": 0.0,
|
| 34 |
+
"output_weight_decay": -1.0,
|
| 35 |
+
"adamw_param_groups": "nanogpt",
|
| 36 |
+
"adam_beta1": 0.9,
|
| 37 |
+
"adam_beta2": 0.999,
|
| 38 |
+
"adam_eps": 1e-08,
|
| 39 |
+
"muon_impl": "legacy",
|
| 40 |
+
"muon_momentum": 0.95,
|
| 41 |
+
"muon_ns_steps": 5,
|
| 42 |
+
"muon_update_scale": 1.0,
|
| 43 |
+
"muon_nesterov": false,
|
| 44 |
+
"muon_width_scale": false,
|
| 45 |
+
"muon_grouping": "",
|
| 46 |
+
"muon_param_count": 0,
|
| 47 |
+
"muon_adam_param_count": 0,
|
| 48 |
+
"muon_param_names": [],
|
| 49 |
+
"muon_adam_param_names": [],
|
| 50 |
+
"muon_effective_nesterov": false,
|
| 51 |
+
"muon_effective_width_scale": false,
|
| 52 |
+
"muon_effective_weight_decay": 0.0,
|
| 53 |
+
"muon_adam_fallback_nesterov": false,
|
| 54 |
+
"muon_adam_fallback_weight_decay": 0.0,
|
| 55 |
+
"ema_decay": 0.0,
|
| 56 |
+
"ema_start_step": 0,
|
| 57 |
+
"model_type": "ddit",
|
| 58 |
+
"ddit_mlp_type": "gelu",
|
| 59 |
+
"block_anchor_every": 0,
|
| 60 |
+
"block_anchor_init_std": 0.02,
|
| 61 |
+
"bos_anchor_every": 0,
|
| 62 |
+
"bos_anchor_token_id": -1,
|
| 63 |
+
"bos_anchor_extra_len": 0,
|
| 64 |
+
"abs_pos_embed": false,
|
| 65 |
+
"abs_pos_init_std": 0.02,
|
| 66 |
+
"elf_num_time_tokens": 4,
|
| 67 |
+
"elf_num_model_mode_tokens": 0,
|
| 68 |
+
"qk_norm": true,
|
| 69 |
+
"output_bias": false,
|
| 70 |
+
"output_init_std": -1.0,
|
| 71 |
+
"norm_type": "rmsnorm",
|
| 72 |
+
"target_loss": "hard_ce",
|
| 73 |
+
"linear_soft_target_power": 1.0,
|
| 74 |
+
"linear_soft_target_min_conf": 0.0,
|
| 75 |
+
"linear_soft_target_max_conf": 1.0,
|
| 76 |
+
"t_sampling_mode": "uniform",
|
| 77 |
+
"t_sampling_power": 1.0,
|
| 78 |
+
"t_sampling_eps": 0.0001,
|
| 79 |
+
"t_sampling_logit_mean": -1.5,
|
| 80 |
+
"t_sampling_logit_std": 0.8,
|
| 81 |
+
"t_sampling_gumbel_loc": 2.2,
|
| 82 |
+
"t_sampling_gumbel_scale": 0.8,
|
| 83 |
+
"dual_t": true,
|
| 84 |
+
"corrupt_t_mode": "same",
|
| 85 |
+
"corrupt_min_t": 0.0,
|
| 86 |
+
"corrupt_max_t": 1.0,
|
| 87 |
+
"prefix_block_prob": 0.0,
|
| 88 |
+
"prefix_block_len": 128,
|
| 89 |
+
"block_ar_two_stream": false,
|
| 90 |
+
"block_ar_block_len": 128,
|
| 91 |
+
"mask_ratio_floor_schedule": "none",
|
| 92 |
+
"dirichlet_endpoint_mode": "categorical_dual_t",
|
| 93 |
+
"dirichlet_semantic_t_mode": "same",
|
| 94 |
+
"dirichlet_semantic_t_value": 0.0,
|
| 95 |
+
"dirichlet_semantic_t_curve": "linear",
|
| 96 |
+
"dirichlet_semantic_t_power": 1.0,
|
| 97 |
+
"dirichlet_support_t_curve": "linear",
|
| 98 |
+
"dirichlet_support_t_power": 1.0,
|
| 99 |
+
"endpoint_sequence_random_prob_alpha": 0.0,
|
| 100 |
+
"categorical_wrong_from_full_vocab": true,
|
| 101 |
+
"categorical_wrong_from_batch_valid_tokens": false,
|
| 102 |
+
"categorical_wrong_basin_token_ids": "",
|
| 103 |
+
"categorical_wrong_basin_prob": 0.0,
|
| 104 |
+
"categorical_wrong_unigram_prob": 0.0,
|
| 105 |
+
"categorical_wrong_uniform_prob": 0.0,
|
| 106 |
+
"categorical_wrong_prob_floor": 0.0,
|
| 107 |
+
"categorical_gold_prob_floor": 0.0,
|
| 108 |
+
"categorical_gold_prob_ceil": 1.0,
|
| 109 |
+
"categorical_wrong_corpus_unigram_path": "",
|
| 110 |
+
"categorical_wrong_corpus_unigram_alpha": 1.0,
|
| 111 |
+
"categorical_wrong_basin_shared_prob": 0.0,
|
| 112 |
+
"categorical_wrong_unigram_shared_prob": 0.0,
|
| 113 |
+
"mask_mixture_original_prob": 0.0,
|
| 114 |
+
"mask_mixture_lowk_prob": 0.0,
|
| 115 |
+
"mask_mixture_lowcorrupt_prob": 0.0,
|
| 116 |
+
"mask_mixture_block_prob": 0.0,
|
| 117 |
+
"mask_mixture_all_prob": 0.0,
|
| 118 |
+
"mask_mixture_lowk_clean_tokens": "1,2,4,8,16,32,64",
|
| 119 |
+
"mask_mixture_lowcorrupt_tokens": "1,2,4,8,16,32,64",
|
| 120 |
+
"mask_mixture_block_tokens": "64,128",
|
| 121 |
+
"simplex_bridge_sampler": "dirichlet",
|
| 122 |
+
"logistic_normal_sigma_min": 0.18,
|
| 123 |
+
"logistic_normal_sigma_max": 2.2,
|
| 124 |
+
"logistic_normal_tau_min": 0.65,
|
| 125 |
+
"logistic_normal_tau_max": 1.15,
|
| 126 |
+
"torch_compile": false,
|
| 127 |
+
"compile_mode": "max-autotune",
|
| 128 |
+
"state_format": "prob",
|
| 129 |
+
"meanflow_weight": 0.0,
|
| 130 |
+
"rollout_train_prob": 0.0,
|
| 131 |
+
"rollout_train_steps": 1,
|
| 132 |
+
"rollout_train_steps_min": -1,
|
| 133 |
+
"rollout_train_infer_steps": 64,
|
| 134 |
+
"rollout_train_time_mode": "fixed_steps",
|
| 135 |
+
"rollout_train_s_dist": "uniform",
|
| 136 |
+
"rollout_train_s_min_frac": 0.0,
|
| 137 |
+
"rollout_train_s_max_frac": 0.125,
|
| 138 |
+
"rollout_train_s_beta_alpha": 2.0,
|
| 139 |
+
"rollout_train_s_beta_beta": 6.0,
|
| 140 |
+
"rollout_train_temp": 1.0,
|
| 141 |
+
"rollout_train_max_gamma": 1.0,
|
| 142 |
+
"rollout_train_rule": "flowmap",
|
| 143 |
+
"rollout_train_corrupt_only": true,
|
| 144 |
+
"rollout_train_samplewise": false,
|
| 145 |
+
"rollout_train_compute_always": false,
|
| 146 |
+
"rollout_train_keep_grad": false,
|
| 147 |
+
"rollout_train_sync_t": false,
|
| 148 |
+
"rollout_train_state_mix_mode": "final",
|
| 149 |
+
"rollout_train_state_mix_alpha": 0.5,
|
| 150 |
+
"bridge_noise_init": "logistic_normal",
|
| 151 |
+
"noise_sigma": -1.0,
|
| 152 |
+
"allow_tf32": true,
|
| 153 |
+
"activation_checkpointing": false,
|
| 154 |
+
"activation_checkpoint_interval": 1,
|
| 155 |
+
"activation_checkpoint_scope": "block",
|
| 156 |
+
"ddp_static_graph": false,
|
| 157 |
+
"ddp_gradient_as_bucket_view": true,
|
| 158 |
+
"blocking_data_transfer": false,
|
| 159 |
+
"dataloader_prefetch_factor": 2,
|
| 160 |
+
"full_train_stats": false,
|
| 161 |
+
"tokenized_hf": false,
|
| 162 |
+
"tokenized_pad_token": "pad",
|
| 163 |
+
"elf_conditional_hf": false,
|
| 164 |
+
"record_pad_truncate": false,
|
| 165 |
+
"record_add_eos": false,
|
| 166 |
+
"record_add_special_tokens": false,
|
| 167 |
+
"record_pad_token": "pad",
|
| 168 |
+
"record_shuffle_buffer": 10000,
|
| 169 |
+
"wrap": true,
|
| 170 |
+
"wrap_mode": "stream",
|
| 171 |
+
"wrap_record_buffer_size": 200,
|
| 172 |
+
"owt_cached_chunks": false,
|
| 173 |
+
"owt_chunk_cache_dir": "",
|
| 174 |
+
"owt_chunk_cache_rebuild": false,
|
| 175 |
+
"owt_chunk_cache_write_batch": 4096,
|
| 176 |
+
"owt_exact_repeat_per_chunk": 0,
|
| 177 |
+
"online_chunk_shuffle": false,
|
| 178 |
+
"online_chunk_shuffle_buffer": 10000,
|
| 179 |
+
"openwebtext_split": "all",
|
| 180 |
+
"detokenizer": "auto",
|
| 181 |
+
"resolved_detokenizer": "lm1b",
|
| 182 |
+
"num_workers": 0,
|
| 183 |
+
"latest_every": 1000,
|
| 184 |
+
"resume_path": ""
|
| 185 |
+
}
|
| 186 |
+
step=100 micro_steps=200 elapsed=29.4s lr=1.212000e-05 loss=10.1817 loss_recon=10.1817 loss_meanflow=0.0000 mean_model_t=0.5027 mean_corrupt_t=0.5027 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.5306 corrupt_frac=0.5502 acc_corrupt=0.3490 loss_corrupt=10.1817 wrong_frac=0.4970 init_acc_corrupt=0.4696 acc_corrupt_t_0p0_0p2=0.0453 corrupt_frac_t_0p0_0p2=0.1946 acc_corrupt_t_0p2_0p4=0.1865 corrupt_frac_t_0p2_0p4=0.2020 acc_corrupt_t_0p4_0p6=0.3433 corrupt_frac_t_0p4_0p6=0.2000 acc_corrupt_t_0p6_0p8=0.4912 corrupt_frac_t_0p6_0p8=0.2013 acc_corrupt_t_0p8_1p0=0.6674 corrupt_frac_t_0p8_1p0=0.2022 out_w_norm=0.8080 out_g_norm=1.4395 loss_all=9.7757 init_gold_top10=0.4971 init_gold_top100=0.5009
|
| 187 |
+
step=200 micro_steps=400 elapsed=27.7s lr=2.412000e-05 loss=8.9731 loss_recon=8.9731 loss_meanflow=0.0000 mean_model_t=0.4997 mean_corrupt_t=0.4997 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.1206 corrupt_frac=0.5498 acc_corrupt=0.0859 loss_corrupt=8.9731 wrong_frac=0.5017 init_acc_corrupt=0.4632 acc_corrupt_t_0p0_0p2=0.0469 corrupt_frac_t_0p0_0p2=0.1972 acc_corrupt_t_0p2_0p4=0.0543 corrupt_frac_t_0p2_0p4=0.2062 acc_corrupt_t_0p4_0p6=0.0760 corrupt_frac_t_0p4_0p6=0.1989 acc_corrupt_t_0p6_0p8=0.1092 corrupt_frac_t_0p6_0p8=0.2031 acc_corrupt_t_0p8_1p0=0.1449 corrupt_frac_t_0p8_1p0=0.1946 out_w_norm=5.9697 out_g_norm=2.2414 loss_all=8.0255 init_gold_top10=0.4657 init_gold_top100=0.4721
|
| 188 |
+
step=300 micro_steps=600 elapsed=27.6s lr=3.612000e-05 loss=7.1162 loss_recon=7.1162 loss_meanflow=0.0000 mean_model_t=0.5001 mean_corrupt_t=0.5001 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.1478 corrupt_frac=0.5514 acc_corrupt=0.1129 loss_corrupt=7.1162 wrong_frac=0.5013 init_acc_corrupt=0.4641 acc_corrupt_t_0p0_0p2=0.0495 corrupt_frac_t_0p0_0p2=0.2019 acc_corrupt_t_0p2_0p4=0.0782 corrupt_frac_t_0p2_0p4=0.2027 acc_corrupt_t_0p4_0p6=0.1151 corrupt_frac_t_0p4_0p6=0.1970 acc_corrupt_t_0p6_0p8=0.1476 corrupt_frac_t_0p6_0p8=0.1973 acc_corrupt_t_0p8_1p0=0.1752 corrupt_frac_t_0p8_1p0=0.2011 out_w_norm=11.6050 out_g_norm=1.7523 loss_all=5.8356 init_gold_top10=0.4908 init_gold_top100=0.4955
|
| 189 |
+
step=400 micro_steps=800 elapsed=27.5s lr=4.812000e-05 loss=4.9208 loss_recon=4.9208 loss_meanflow=0.0000 mean_model_t=0.5018 mean_corrupt_t=0.5018 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.5232 corrupt_frac=0.5493 acc_corrupt=0.3689 loss_corrupt=4.9208 wrong_frac=0.4987 init_acc_corrupt=0.4666 acc_corrupt_t_0p0_0p2=0.0784 corrupt_frac_t_0p0_0p2=0.1986 acc_corrupt_t_0p2_0p4=0.2139 corrupt_frac_t_0p2_0p4=0.1984 acc_corrupt_t_0p4_0p6=0.3755 corrupt_frac_t_0p4_0p6=0.2021 acc_corrupt_t_0p6_0p8=0.5190 corrupt_frac_t_0p6_0p8=0.1987 acc_corrupt_t_0p8_1p0=0.6522 corrupt_frac_t_0p8_1p0=0.2022 out_w_norm=15.9419 out_g_norm=0.8517 loss_all=2.7239 init_gold_top10=0.4156 init_gold_top100=0.4237
|
| 190 |
+
step=500 micro_steps=1000 elapsed=27.5s lr=6.012000e-05 loss=3.9403 loss_recon=3.9403 loss_meanflow=0.0000 mean_model_t=0.4985 mean_corrupt_t=0.4985 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.6942 corrupt_frac=0.5479 acc_corrupt=0.4951 loss_corrupt=3.9403 wrong_frac=0.5030 init_acc_corrupt=0.4619 acc_corrupt_t_0p0_0p2=0.1136 corrupt_frac_t_0p0_0p2=0.2024 acc_corrupt_t_0p2_0p4=0.2967 corrupt_frac_t_0p2_0p4=0.2050 acc_corrupt_t_0p4_0p6=0.5165 corrupt_frac_t_0p4_0p6=0.1918 acc_corrupt_t_0p6_0p8=0.6943 corrupt_frac_t_0p6_0p8=0.2025 acc_corrupt_t_0p8_1p0=0.8656 corrupt_frac_t_0p8_1p0=0.1983 out_w_norm=18.8643 out_g_norm=0.8721 loss_all=2.4302 init_gold_top10=0.4616 init_gold_top100=0.4679
|
| 191 |
+
step=600 micro_steps=1200 elapsed=27.5s lr=7.212000e-05 loss=3.5629 loss_recon=3.5629 loss_meanflow=0.0000 mean_model_t=0.5040 mean_corrupt_t=0.5040 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.7281 corrupt_frac=0.5480 acc_corrupt=0.5314 loss_corrupt=3.5629 wrong_frac=0.4954 init_acc_corrupt=0.4705 acc_corrupt_t_0p0_0p2=0.1385 corrupt_frac_t_0p0_0p2=0.1969 acc_corrupt_t_0p2_0p4=0.3259 corrupt_frac_t_0p2_0p4=0.1962 acc_corrupt_t_0p4_0p6=0.5516 corrupt_frac_t_0p4_0p6=0.2033 acc_corrupt_t_0p6_0p8=0.7262 corrupt_frac_t_0p6_0p8=0.1974 acc_corrupt_t_0p8_1p0=0.8957 corrupt_frac_t_0p8_1p0=0.2063 out_w_norm=20.7596 out_g_norm=0.9740 loss_all=2.3820 init_gold_top10=0.4401 init_gold_top100=0.4465
|
| 192 |
+
step=700 micro_steps=1400 elapsed=27.5s lr=8.412000e-05 loss=3.4393 loss_recon=3.4393 loss_meanflow=0.0000 mean_model_t=0.4971 mean_corrupt_t=0.4971 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.7339 corrupt_frac=0.5493 acc_corrupt=0.5379 loss_corrupt=3.4393 wrong_frac=0.5021 init_acc_corrupt=0.4631 acc_corrupt_t_0p0_0p2=0.1486 corrupt_frac_t_0p0_0p2=0.2027 acc_corrupt_t_0p2_0p4=0.3383 corrupt_frac_t_0p2_0p4=0.2006 acc_corrupt_t_0p4_0p6=0.5666 corrupt_frac_t_0p4_0p6=0.1985 acc_corrupt_t_0p6_0p8=0.7378 corrupt_frac_t_0p6_0p8=0.2000 acc_corrupt_t_0p8_1p0=0.9077 corrupt_frac_t_0p8_1p0=0.1982 out_w_norm=22.0094 out_g_norm=1.0156 loss_all=1.8632 init_gold_top10=0.4842 init_gold_top100=0.4909
|
| 193 |
+
step=800 micro_steps=1600 elapsed=27.5s lr=9.612000e-05 loss=3.3384 loss_recon=3.3384 loss_meanflow=0.0000 mean_model_t=0.4964 mean_corrupt_t=0.4964 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.7381 corrupt_frac=0.5482 acc_corrupt=0.5441 loss_corrupt=3.3384 wrong_frac=0.5055 init_acc_corrupt=0.4593 acc_corrupt_t_0p0_0p2=0.1584 corrupt_frac_t_0p0_0p2=0.2099 acc_corrupt_t_0p2_0p4=0.3504 corrupt_frac_t_0p2_0p4=0.1992 acc_corrupt_t_0p4_0p6=0.5774 corrupt_frac_t_0p4_0p6=0.1915 acc_corrupt_t_0p6_0p8=0.7472 corrupt_frac_t_0p6_0p8=0.2040 acc_corrupt_t_0p8_1p0=0.9111 corrupt_frac_t_0p8_1p0=0.1954 out_w_norm=23.1593 out_g_norm=0.9833 loss_all=1.4867 init_gold_top10=0.5494 init_gold_top100=0.5534
|
| 194 |
+
step=900 micro_steps=1800 elapsed=27.4s lr=1.081200e-04 loss=3.2127 loss_recon=3.2127 loss_meanflow=0.0000 mean_model_t=0.4998 mean_corrupt_t=0.4998 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.7428 corrupt_frac=0.5508 acc_corrupt=0.5550 loss_corrupt=3.2127 wrong_frac=0.5019 init_acc_corrupt=0.4639 acc_corrupt_t_0p0_0p2=0.1658 corrupt_frac_t_0p0_0p2=0.2006 acc_corrupt_t_0p2_0p4=0.3617 corrupt_frac_t_0p2_0p4=0.1993 acc_corrupt_t_0p4_0p6=0.5859 corrupt_frac_t_0p4_0p6=0.2042 acc_corrupt_t_0p6_0p8=0.7552 corrupt_frac_t_0p6_0p8=0.2012 acc_corrupt_t_0p8_1p0=0.9143 corrupt_frac_t_0p8_1p0=0.1947 out_w_norm=24.2792 out_g_norm=0.9876 loss_all=1.7195 init_gold_top10=0.5360 init_gold_top100=0.5389
|
| 195 |
+
step=1000 micro_steps=2000 elapsed=27.4s lr=1.201200e-04 loss=3.1393 loss_recon=3.1393 loss_meanflow=0.0000 mean_model_t=0.5014 mean_corrupt_t=0.5014 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.7474 corrupt_frac=0.5471 acc_corrupt=0.5607 loss_corrupt=3.1393 wrong_frac=0.5012 init_acc_corrupt=0.4645 acc_corrupt_t_0p0_0p2=0.1740 corrupt_frac_t_0p0_0p2=0.1999 acc_corrupt_t_0p2_0p4=0.3693 corrupt_frac_t_0p2_0p4=0.2022 acc_corrupt_t_0p4_0p6=0.5906 corrupt_frac_t_0p4_0p6=0.2028 acc_corrupt_t_0p6_0p8=0.7592 corrupt_frac_t_0p6_0p8=0.1971 acc_corrupt_t_0p8_1p0=0.9182 corrupt_frac_t_0p8_1p0=0.1980 out_w_norm=25.3388 out_g_norm=1.0029 loss_all=1.8307 init_gold_top10=0.5103 init_gold_top100=0.5167
|
| 196 |
+
step=1100 micro_steps=2200 elapsed=59.0s lr=1.321200e-04 loss=3.0906 loss_recon=3.0906 loss_meanflow=0.0000 mean_model_t=0.4965 mean_corrupt_t=0.4965 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.7483 corrupt_frac=0.5484 acc_corrupt=0.5645 loss_corrupt=3.0906 wrong_frac=0.5011 init_acc_corrupt=0.4644 acc_corrupt_t_0p0_0p2=0.1784 corrupt_frac_t_0p0_0p2=0.2009 acc_corrupt_t_0p2_0p4=0.3713 corrupt_frac_t_0p2_0p4=0.1997 acc_corrupt_t_0p4_0p6=0.5942 corrupt_frac_t_0p4_0p6=0.2036 acc_corrupt_t_0p6_0p8=0.7613 corrupt_frac_t_0p6_0p8=0.1959 acc_corrupt_t_0p8_1p0=0.9223 corrupt_frac_t_0p8_1p0=0.1999 out_w_norm=26.4586 out_g_norm=0.9759 loss_all=1.6519 init_gold_top10=0.5149 init_gold_top100=0.5214
|
| 197 |
+
step=1200 micro_steps=2400 elapsed=48.0s lr=1.441200e-04 loss=3.0471 loss_recon=3.0471 loss_meanflow=0.0000 mean_model_t=0.4985 mean_corrupt_t=0.4985 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.7487 corrupt_frac=0.5498 acc_corrupt=0.5680 loss_corrupt=3.0471 wrong_frac=0.5014 init_acc_corrupt=0.4645 acc_corrupt_t_0p0_0p2=0.1834 corrupt_frac_t_0p0_0p2=0.1983 acc_corrupt_t_0p2_0p4=0.3747 corrupt_frac_t_0p2_0p4=0.2015 acc_corrupt_t_0p4_0p6=0.5968 corrupt_frac_t_0p4_0p6=0.2040 acc_corrupt_t_0p6_0p8=0.7658 corrupt_frac_t_0p6_0p8=0.1980 acc_corrupt_t_0p8_1p0=0.9221 corrupt_frac_t_0p8_1p0=0.1982 out_w_norm=27.6348 out_g_norm=0.9598 loss_all=1.7895 init_gold_top10=0.5429 init_gold_top100=0.5470
|
| 198 |
+
step=1300 micro_steps=2600 elapsed=27.3s lr=1.561200e-04 loss=2.9908 loss_recon=2.9908 loss_meanflow=0.0000 mean_model_t=0.4995 mean_corrupt_t=0.4995 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.7523 corrupt_frac=0.5477 acc_corrupt=0.5727 loss_corrupt=2.9908 wrong_frac=0.4991 init_acc_corrupt=0.4655 acc_corrupt_t_0p0_0p2=0.1911 corrupt_frac_t_0p0_0p2=0.1968 acc_corrupt_t_0p2_0p4=0.3794 corrupt_frac_t_0p2_0p4=0.2057 acc_corrupt_t_0p4_0p6=0.6009 corrupt_frac_t_0p4_0p6=0.1979 acc_corrupt_t_0p6_0p8=0.7691 corrupt_frac_t_0p6_0p8=0.1993 acc_corrupt_t_0p8_1p0=0.9231 corrupt_frac_t_0p8_1p0=0.2002 out_w_norm=28.8750 out_g_norm=0.9255 loss_all=1.5911 init_gold_top10=0.5060 init_gold_top100=0.5106
|
| 199 |
+
step=1400 micro_steps=2800 elapsed=27.4s lr=1.681200e-04 loss=2.9475 loss_recon=2.9475 loss_meanflow=0.0000 mean_model_t=0.5036 mean_corrupt_t=0.5036 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.7528 corrupt_frac=0.5531 acc_corrupt=0.5773 loss_corrupt=2.9475 wrong_frac=0.4962 init_acc_corrupt=0.4689 acc_corrupt_t_0p0_0p2=0.1885 corrupt_frac_t_0p0_0p2=0.2006 acc_corrupt_t_0p2_0p4=0.3810 corrupt_frac_t_0p2_0p4=0.1970 acc_corrupt_t_0p4_0p6=0.6058 corrupt_frac_t_0p4_0p6=0.1945 acc_corrupt_t_0p6_0p8=0.7708 corrupt_frac_t_0p6_0p8=0.1975 acc_corrupt_t_0p8_1p0=0.9237 corrupt_frac_t_0p8_1p0=0.2104 out_w_norm=30.2068 out_g_norm=0.8881 loss_all=2.0703 init_gold_top10=0.4394 init_gold_top100=0.4475
|
| 200 |
+
step=1500 micro_steps=3000 elapsed=27.4s lr=1.801200e-04 loss=2.9659 loss_recon=2.9659 loss_meanflow=0.0000 mean_model_t=0.4988 mean_corrupt_t=0.4988 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.7496 corrupt_frac=0.5546 acc_corrupt=0.5728 loss_corrupt=2.9659 wrong_frac=0.5035 init_acc_corrupt=0.4610 acc_corrupt_t_0p0_0p2=0.1908 corrupt_frac_t_0p0_0p2=0.1992 acc_corrupt_t_0p2_0p4=0.3858 corrupt_frac_t_0p2_0p4=0.2057 acc_corrupt_t_0p4_0p6=0.6063 corrupt_frac_t_0p4_0p6=0.2032 acc_corrupt_t_0p6_0p8=0.7720 corrupt_frac_t_0p6_0p8=0.1960 acc_corrupt_t_0p8_1p0=0.9237 corrupt_frac_t_0p8_1p0=0.1959 out_w_norm=31.6051 out_g_norm=0.8509 loss_all=1.5975 init_gold_top10=0.4906 init_gold_top100=0.4993
|
| 201 |
+
step=1600 micro_steps=3200 elapsed=27.4s lr=1.921200e-04 loss=2.9089 loss_recon=2.9089 loss_meanflow=0.0000 mean_model_t=0.4985 mean_corrupt_t=0.4985 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.7549 corrupt_frac=0.5496 acc_corrupt=0.5790 loss_corrupt=2.9089 wrong_frac=0.5008 init_acc_corrupt=0.4644 acc_corrupt_t_0p0_0p2=0.1956 corrupt_frac_t_0p0_0p2=0.2044 acc_corrupt_t_0p2_0p4=0.3906 corrupt_frac_t_0p2_0p4=0.1971 acc_corrupt_t_0p4_0p6=0.6097 corrupt_frac_t_0p4_0p6=0.1976 acc_corrupt_t_0p6_0p8=0.7752 corrupt_frac_t_0p6_0p8=0.1971 acc_corrupt_t_0p8_1p0=0.9264 corrupt_frac_t_0p8_1p0=0.2038 out_w_norm=33.1586 out_g_norm=0.8146 loss_all=1.6215 init_gold_top10=0.4407 init_gold_top100=0.4490
|
| 202 |
+
step=1700 micro_steps=3400 elapsed=27.4s lr=2.041200e-04 loss=2.8826 loss_recon=2.8826 loss_meanflow=0.0000 mean_model_t=0.4986 mean_corrupt_t=0.4986 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.7566 corrupt_frac=0.5486 acc_corrupt=0.5812 loss_corrupt=2.8826 wrong_frac=0.5012 init_acc_corrupt=0.4639 acc_corrupt_t_0p0_0p2=0.2018 corrupt_frac_t_0p0_0p2=0.1974 acc_corrupt_t_0p2_0p4=0.3943 corrupt_frac_t_0p2_0p4=0.2045 acc_corrupt_t_0p4_0p6=0.6099 corrupt_frac_t_0p4_0p6=0.2005 acc_corrupt_t_0p6_0p8=0.7768 corrupt_frac_t_0p6_0p8=0.2015 acc_corrupt_t_0p8_1p0=0.9276 corrupt_frac_t_0p8_1p0=0.1960 out_w_norm=34.8286 out_g_norm=0.7785 loss_all=1.7638 init_gold_top10=0.4600 init_gold_top100=0.4660
|
| 203 |
+
step=1800 micro_steps=3600 elapsed=27.4s lr=2.161200e-04 loss=2.8119 loss_recon=2.8119 loss_meanflow=0.0000 mean_model_t=0.5037 mean_corrupt_t=0.5037 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.7609 corrupt_frac=0.5490 acc_corrupt=0.5897 loss_corrupt=2.8119 wrong_frac=0.4951 init_acc_corrupt=0.4712 acc_corrupt_t_0p0_0p2=0.2043 corrupt_frac_t_0p0_0p2=0.1896 acc_corrupt_t_0p2_0p4=0.3958 corrupt_frac_t_0p2_0p4=0.2002 acc_corrupt_t_0p4_0p6=0.6147 corrupt_frac_t_0p4_0p6=0.2027 acc_corrupt_t_0p6_0p8=0.7783 corrupt_frac_t_0p6_0p8=0.2063 acc_corrupt_t_0p8_1p0=0.9274 corrupt_frac_t_0p8_1p0=0.2013 out_w_norm=36.5634 out_g_norm=0.7392 loss_all=1.4153 init_gold_top10=0.5215 init_gold_top100=0.5287
|
| 204 |
+
step=1900 micro_steps=3800 elapsed=27.4s lr=2.281200e-04 loss=2.8259 loss_recon=2.8259 loss_meanflow=0.0000 mean_model_t=0.5000 mean_corrupt_t=0.5000 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.7582 corrupt_frac=0.5502 acc_corrupt=0.5867 loss_corrupt=2.8259 wrong_frac=0.4995 init_acc_corrupt=0.4661 acc_corrupt_t_0p0_0p2=0.2040 corrupt_frac_t_0p0_0p2=0.2033 acc_corrupt_t_0p2_0p4=0.3999 corrupt_frac_t_0p2_0p4=0.1979 acc_corrupt_t_0p4_0p6=0.6204 corrupt_frac_t_0p4_0p6=0.1957 acc_corrupt_t_0p6_0p8=0.7816 corrupt_frac_t_0p6_0p8=0.2020 acc_corrupt_t_0p8_1p0=0.9284 corrupt_frac_t_0p8_1p0=0.2011 out_w_norm=38.4998 out_g_norm=0.6932 loss_all=1.6129 init_gold_top10=0.4898 init_gold_top100=0.4939
|
| 205 |
+
step=2000 micro_steps=4000 elapsed=27.4s lr=2.401200e-04 loss=2.8140 loss_recon=2.8140 loss_meanflow=0.0000 mean_model_t=0.5005 mean_corrupt_t=0.5005 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.7582 corrupt_frac=0.5511 acc_corrupt=0.5877 loss_corrupt=2.8140 wrong_frac=0.5001 init_acc_corrupt=0.4654 acc_corrupt_t_0p0_0p2=0.2080 corrupt_frac_t_0p0_0p2=0.2005 acc_corrupt_t_0p2_0p4=0.4041 corrupt_frac_t_0p2_0p4=0.1994 acc_corrupt_t_0p4_0p6=0.6206 corrupt_frac_t_0p4_0p6=0.2021 acc_corrupt_t_0p6_0p8=0.7811 corrupt_frac_t_0p6_0p8=0.1989 acc_corrupt_t_0p8_1p0=0.9272 corrupt_frac_t_0p8_1p0=0.1991 out_w_norm=40.6782 out_g_norm=0.6578 loss_all=1.9873 init_gold_top10=0.4348 init_gold_top100=0.4409
|
| 206 |
+
step=2100 micro_steps=4200 elapsed=51.6s lr=2.521200e-04 loss=2.8071 loss_recon=2.8071 loss_meanflow=0.0000 mean_model_t=0.4980 mean_corrupt_t=0.4980 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.7587 corrupt_frac=0.5491 acc_corrupt=0.5870 loss_corrupt=2.8071 wrong_frac=0.5032 init_acc_corrupt=0.4622 acc_corrupt_t_0p0_0p2=0.2077 corrupt_frac_t_0p0_0p2=0.2042 acc_corrupt_t_0p2_0p4=0.4054 corrupt_frac_t_0p2_0p4=0.1972 acc_corrupt_t_0p4_0p6=0.6244 corrupt_frac_t_0p4_0p6=0.2018 acc_corrupt_t_0p6_0p8=0.7822 corrupt_frac_t_0p6_0p8=0.2030 acc_corrupt_t_0p8_1p0=0.9282 corrupt_frac_t_0p8_1p0=0.1937 out_w_norm=42.7296 out_g_norm=0.6251 loss_all=1.6952 init_gold_top10=0.4779 init_gold_top100=0.4844
|
| 207 |
+
step=2200 micro_steps=4400 elapsed=54.9s lr=2.641200e-04 loss=2.8123 loss_recon=2.8123 loss_meanflow=0.0000 mean_model_t=0.4967 mean_corrupt_t=0.4967 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.7576 corrupt_frac=0.5505 acc_corrupt=0.5865 loss_corrupt=2.8123 wrong_frac=0.5043 init_acc_corrupt=0.4600 acc_corrupt_t_0p0_0p2=0.2099 corrupt_frac_t_0p0_0p2=0.2001 acc_corrupt_t_0p2_0p4=0.4065 corrupt_frac_t_0p2_0p4=0.2087 acc_corrupt_t_0p4_0p6=0.6221 corrupt_frac_t_0p4_0p6=0.1955 acc_corrupt_t_0p6_0p8=0.7843 corrupt_frac_t_0p6_0p8=0.2023 acc_corrupt_t_0p8_1p0=0.9274 corrupt_frac_t_0p8_1p0=0.1934 out_w_norm=44.9724 out_g_norm=0.5910 loss_all=1.6415 init_gold_top10=0.5465 init_gold_top100=0.5521
|
| 208 |
+
step=2300 micro_steps=4600 elapsed=27.4s lr=2.761200e-04 loss=2.7487 loss_recon=2.7487 loss_meanflow=0.0000 mean_model_t=0.5001 mean_corrupt_t=0.5001 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.7605 corrupt_frac=0.5526 acc_corrupt=0.5937 loss_corrupt=2.7487 wrong_frac=0.4991 init_acc_corrupt=0.4661 acc_corrupt_t_0p0_0p2=0.2143 corrupt_frac_t_0p0_0p2=0.2003 acc_corrupt_t_0p2_0p4=0.4138 corrupt_frac_t_0p2_0p4=0.1975 acc_corrupt_t_0p4_0p6=0.6247 corrupt_frac_t_0p4_0p6=0.2009 acc_corrupt_t_0p6_0p8=0.7836 corrupt_frac_t_0p6_0p8=0.2024 acc_corrupt_t_0p8_1p0=0.9300 corrupt_frac_t_0p8_1p0=0.1989 out_w_norm=47.2975 out_g_norm=0.5562 loss_all=1.6646 init_gold_top10=0.4992 init_gold_top100=0.5070
|
| 209 |
+
step=2400 micro_steps=4800 elapsed=27.4s lr=2.881200e-04 loss=2.7419 loss_recon=2.7419 loss_meanflow=0.0000 mean_model_t=0.5003 mean_corrupt_t=0.5003 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.7612 corrupt_frac=0.5516 acc_corrupt=0.5937 loss_corrupt=2.7419 wrong_frac=0.5006 init_acc_corrupt=0.4654 acc_corrupt_t_0p0_0p2=0.2155 corrupt_frac_t_0p0_0p2=0.1999 acc_corrupt_t_0p2_0p4=0.4150 corrupt_frac_t_0p2_0p4=0.2024 acc_corrupt_t_0p4_0p6=0.6256 corrupt_frac_t_0p4_0p6=0.2011 acc_corrupt_t_0p6_0p8=0.7865 corrupt_frac_t_0p6_0p8=0.1956 acc_corrupt_t_0p8_1p0=0.9304 corrupt_frac_t_0p8_1p0=0.2009 out_w_norm=49.6336 out_g_norm=0.5319 loss_all=1.4697 init_gold_top10=0.4734 init_gold_top100=0.4794
|
| 210 |
+
step=2500 micro_steps=5000 elapsed=27.4s lr=3.000000e-04 loss=2.7313 loss_recon=2.7313 loss_meanflow=0.0000 mean_model_t=0.4989 mean_corrupt_t=0.4989 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.7635 corrupt_frac=0.5470 acc_corrupt=0.5945 loss_corrupt=2.7313 wrong_frac=0.5022 init_acc_corrupt=0.4625 acc_corrupt_t_0p0_0p2=0.2195 corrupt_frac_t_0p0_0p2=0.1996 acc_corrupt_t_0p2_0p4=0.4162 corrupt_frac_t_0p2_0p4=0.2072 acc_corrupt_t_0p4_0p6=0.6291 corrupt_frac_t_0p4_0p6=0.1991 acc_corrupt_t_0p6_0p8=0.7903 corrupt_frac_t_0p6_0p8=0.1948 acc_corrupt_t_0p8_1p0=0.9293 corrupt_frac_t_0p8_1p0=0.1993 out_w_norm=51.8568 out_g_norm=0.5137 loss_all=1.6564 init_gold_top10=0.4988 init_gold_top100=0.5043
|
| 211 |
+
step=2600 micro_steps=5200 elapsed=27.4s lr=3.000000e-04 loss=2.6957 loss_recon=2.6957 loss_meanflow=0.0000 mean_model_t=0.4977 mean_corrupt_t=0.4977 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.7646 corrupt_frac=0.5495 acc_corrupt=0.5985 loss_corrupt=2.6957 wrong_frac=0.5005 init_acc_corrupt=0.4646 acc_corrupt_t_0p0_0p2=0.2200 corrupt_frac_t_0p0_0p2=0.1995 acc_corrupt_t_0p2_0p4=0.4249 corrupt_frac_t_0p2_0p4=0.2020 acc_corrupt_t_0p4_0p6=0.6317 corrupt_frac_t_0p4_0p6=0.1997 acc_corrupt_t_0p6_0p8=0.7893 corrupt_frac_t_0p6_0p8=0.2024 acc_corrupt_t_0p8_1p0=0.9311 corrupt_frac_t_0p8_1p0=0.1964 out_w_norm=54.0084 out_g_norm=0.4883 loss_all=1.5157 init_gold_top10=0.5138 init_gold_top100=0.5224
|
| 212 |
+
step=2700 micro_steps=5400 elapsed=27.4s lr=3.000000e-04 loss=2.7126 loss_recon=2.7126 loss_meanflow=0.0000 mean_model_t=0.4961 mean_corrupt_t=0.4961 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.7620 corrupt_frac=0.5511 acc_corrupt=0.5952 loss_corrupt=2.7126 wrong_frac=0.5044 init_acc_corrupt=0.4600 acc_corrupt_t_0p0_0p2=0.2213 corrupt_frac_t_0p0_0p2=0.2106 acc_corrupt_t_0p2_0p4=0.4216 corrupt_frac_t_0p2_0p4=0.1997 acc_corrupt_t_0p4_0p6=0.6382 corrupt_frac_t_0p4_0p6=0.1973 acc_corrupt_t_0p6_0p8=0.7931 corrupt_frac_t_0p6_0p8=0.1969 acc_corrupt_t_0p8_1p0=0.9332 corrupt_frac_t_0p8_1p0=0.1954 out_w_norm=56.0107 out_g_norm=0.4709 loss_all=1.5381 init_gold_top10=0.4981 init_gold_top100=0.5063
|
| 213 |
+
step=2800 micro_steps=5600 elapsed=27.4s lr=3.000000e-04 loss=2.6797 loss_recon=2.6797 loss_meanflow=0.0000 mean_model_t=0.4970 mean_corrupt_t=0.4970 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.7654 corrupt_frac=0.5487 acc_corrupt=0.5989 loss_corrupt=2.6797 wrong_frac=0.5039 init_acc_corrupt=0.4616 acc_corrupt_t_0p0_0p2=0.2241 corrupt_frac_t_0p0_0p2=0.2045 acc_corrupt_t_0p2_0p4=0.4216 corrupt_frac_t_0p2_0p4=0.1969 acc_corrupt_t_0p4_0p6=0.6358 corrupt_frac_t_0p4_0p6=0.2031 acc_corrupt_t_0p6_0p8=0.7945 corrupt_frac_t_0p6_0p8=0.2052 acc_corrupt_t_0p8_1p0=0.9345 corrupt_frac_t_0p8_1p0=0.1904 out_w_norm=57.8522 out_g_norm=0.4551 loss_all=1.3968 init_gold_top10=0.4840 init_gold_top100=0.4885
|
| 214 |
+
step=2900 micro_steps=5800 elapsed=27.4s lr=3.000000e-04 loss=2.6389 loss_recon=2.6389 loss_meanflow=0.0000 mean_model_t=0.4982 mean_corrupt_t=0.4982 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.7676 corrupt_frac=0.5498 acc_corrupt=0.6034 loss_corrupt=2.6389 wrong_frac=0.5004 init_acc_corrupt=0.4654 acc_corrupt_t_0p0_0p2=0.2271 corrupt_frac_t_0p0_0p2=0.1997 acc_corrupt_t_0p2_0p4=0.4225 corrupt_frac_t_0p2_0p4=0.1975 acc_corrupt_t_0p4_0p6=0.6363 corrupt_frac_t_0p4_0p6=0.2033 acc_corrupt_t_0p6_0p8=0.7942 corrupt_frac_t_0p6_0p8=0.1986 acc_corrupt_t_0p8_1p0=0.9334 corrupt_frac_t_0p8_1p0=0.2009 out_w_norm=59.5855 out_g_norm=0.4450 loss_all=1.4845 init_gold_top10=0.5540 init_gold_top100=0.5574
|
| 215 |
+
step=3000 micro_steps=6000 elapsed=27.4s lr=3.000000e-04 loss=2.6078 loss_recon=2.6078 loss_meanflow=0.0000 mean_model_t=0.5022 mean_corrupt_t=0.5022 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.7705 corrupt_frac=0.5490 acc_corrupt=0.6081 loss_corrupt=2.6078 wrong_frac=0.4958 init_acc_corrupt=0.4698 acc_corrupt_t_0p0_0p2=0.2275 corrupt_frac_t_0p0_0p2=0.1995 acc_corrupt_t_0p2_0p4=0.4249 corrupt_frac_t_0p2_0p4=0.1960 acc_corrupt_t_0p4_0p6=0.6445 corrupt_frac_t_0p4_0p6=0.2003 acc_corrupt_t_0p6_0p8=0.7950 corrupt_frac_t_0p6_0p8=0.1961 acc_corrupt_t_0p8_1p0=0.9341 corrupt_frac_t_0p8_1p0=0.2081 out_w_norm=61.2052 out_g_norm=0.4317 loss_all=1.4179 init_gold_top10=0.4590 init_gold_top100=0.4668
|
| 216 |
+
step=3100 micro_steps=6200 elapsed=45.8s lr=3.000000e-04 loss=2.6191 loss_recon=2.6191 loss_meanflow=0.0000 mean_model_t=0.5002 mean_corrupt_t=0.5002 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.7690 corrupt_frac=0.5499 acc_corrupt=0.6055 loss_corrupt=2.6191 wrong_frac=0.4998 init_acc_corrupt=0.4657 acc_corrupt_t_0p0_0p2=0.2271 corrupt_frac_t_0p0_0p2=0.2049 acc_corrupt_t_0p2_0p4=0.4279 corrupt_frac_t_0p2_0p4=0.1938 acc_corrupt_t_0p4_0p6=0.6401 corrupt_frac_t_0p4_0p6=0.1965 acc_corrupt_t_0p6_0p8=0.7975 corrupt_frac_t_0p6_0p8=0.2048 acc_corrupt_t_0p8_1p0=0.9349 corrupt_frac_t_0p8_1p0=0.2000 out_w_norm=62.7482 out_g_norm=0.4201 loss_all=1.3875 init_gold_top10=0.4932 init_gold_top100=0.4986
|
| 217 |
+
step=3200 micro_steps=6400 elapsed=61.2s lr=3.000000e-04 loss=2.6280 loss_recon=2.6280 loss_meanflow=0.0000 mean_model_t=0.4972 mean_corrupt_t=0.4972 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.7687 corrupt_frac=0.5497 acc_corrupt=0.6044 loss_corrupt=2.6280 wrong_frac=0.5027 init_acc_corrupt=0.4631 acc_corrupt_t_0p0_0p2=0.2271 corrupt_frac_t_0p0_0p2=0.2000 acc_corrupt_t_0p2_0p4=0.4318 corrupt_frac_t_0p2_0p4=0.2021 acc_corrupt_t_0p4_0p6=0.6396 corrupt_frac_t_0p4_0p6=0.1996 acc_corrupt_t_0p6_0p8=0.7968 corrupt_frac_t_0p6_0p8=0.2044 acc_corrupt_t_0p8_1p0=0.9342 corrupt_frac_t_0p8_1p0=0.1939 out_w_norm=64.2352 out_g_norm=0.4133 loss_all=1.7830 init_gold_top10=0.4025 init_gold_top100=0.4107
|
| 218 |
+
step=3300 micro_steps=6600 elapsed=27.4s lr=3.000000e-04 loss=2.6221 loss_recon=2.6221 loss_meanflow=0.0000 mean_model_t=0.4960 mean_corrupt_t=0.4960 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.7688 corrupt_frac=0.5503 acc_corrupt=0.6045 loss_corrupt=2.6221 wrong_frac=0.5043 init_acc_corrupt=0.4606 acc_corrupt_t_0p0_0p2=0.2315 corrupt_frac_t_0p0_0p2=0.2030 acc_corrupt_t_0p2_0p4=0.4320 corrupt_frac_t_0p2_0p4=0.1994 acc_corrupt_t_0p4_0p6=0.6417 corrupt_frac_t_0p4_0p6=0.2046 acc_corrupt_t_0p6_0p8=0.7984 corrupt_frac_t_0p6_0p8=0.2002 acc_corrupt_t_0p8_1p0=0.9346 corrupt_frac_t_0p8_1p0=0.1928 out_w_norm=65.6640 out_g_norm=0.4047 loss_all=1.6352 init_gold_top10=0.4992 init_gold_top100=0.5037
|
| 219 |
+
step=3400 micro_steps=6800 elapsed=27.4s lr=3.000000e-04 loss=2.5997 loss_recon=2.5997 loss_meanflow=0.0000 mean_model_t=0.4976 mean_corrupt_t=0.4976 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.7711 corrupt_frac=0.5496 acc_corrupt=0.6077 loss_corrupt=2.5997 wrong_frac=0.5011 init_acc_corrupt=0.4643 acc_corrupt_t_0p0_0p2=0.2318 corrupt_frac_t_0p0_0p2=0.2020 acc_corrupt_t_0p2_0p4=0.4276 corrupt_frac_t_0p2_0p4=0.1979 acc_corrupt_t_0p4_0p6=0.6435 corrupt_frac_t_0p4_0p6=0.2020 acc_corrupt_t_0p6_0p8=0.8009 corrupt_frac_t_0p6_0p8=0.1969 acc_corrupt_t_0p8_1p0=0.9372 corrupt_frac_t_0p8_1p0=0.2012 out_w_norm=67.0399 out_g_norm=0.3941 loss_all=1.6996 init_gold_top10=0.4428 init_gold_top100=0.4487
|
| 220 |
+
step=3500 micro_steps=7000 elapsed=27.4s lr=3.000000e-04 loss=2.5439 loss_recon=2.5439 loss_meanflow=0.0000 mean_model_t=0.5018 mean_corrupt_t=0.5018 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.7745 corrupt_frac=0.5515 acc_corrupt=0.6145 loss_corrupt=2.5439 wrong_frac=0.4950 init_acc_corrupt=0.4710 acc_corrupt_t_0p0_0p2=0.2358 corrupt_frac_t_0p0_0p2=0.1957 acc_corrupt_t_0p2_0p4=0.4381 corrupt_frac_t_0p2_0p4=0.1988 acc_corrupt_t_0p4_0p6=0.6432 corrupt_frac_t_0p4_0p6=0.1981 acc_corrupt_t_0p6_0p8=0.8005 corrupt_frac_t_0p6_0p8=0.2032 acc_corrupt_t_0p8_1p0=0.9364 corrupt_frac_t_0p8_1p0=0.2041 out_w_norm=68.3571 out_g_norm=0.3880 loss_all=1.2910 init_gold_top10=0.5389 init_gold_top100=0.5435
|
| 221 |
+
step=3600 micro_steps=7200 elapsed=27.4s lr=3.000000e-04 loss=2.5728 loss_recon=2.5728 loss_meanflow=0.0000 mean_model_t=0.5013 mean_corrupt_t=0.5013 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.7722 corrupt_frac=0.5518 acc_corrupt=0.6105 loss_corrupt=2.5728 wrong_frac=0.5004 init_acc_corrupt=0.4654 acc_corrupt_t_0p0_0p2=0.2342 corrupt_frac_t_0p0_0p2=0.2016 acc_corrupt_t_0p2_0p4=0.4340 corrupt_frac_t_0p2_0p4=0.1974 acc_corrupt_t_0p4_0p6=0.6467 corrupt_frac_t_0p4_0p6=0.1974 acc_corrupt_t_0p6_0p8=0.8007 corrupt_frac_t_0p6_0p8=0.2035 acc_corrupt_t_0p8_1p0=0.9350 corrupt_frac_t_0p8_1p0=0.2000 out_w_norm=69.6385 out_g_norm=0.3815 loss_all=1.2250 init_gold_top10=0.5361 init_gold_top100=0.5422
|
| 222 |
+
step=3700 micro_steps=7400 elapsed=27.4s lr=3.000000e-04 loss=2.5696 loss_recon=2.5696 loss_meanflow=0.0000 mean_model_t=0.4992 mean_corrupt_t=0.4992 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.7741 corrupt_frac=0.5473 acc_corrupt=0.6105 loss_corrupt=2.5696 wrong_frac=0.5016 init_acc_corrupt=0.4633 acc_corrupt_t_0p0_0p2=0.2326 corrupt_frac_t_0p0_0p2=0.2044 acc_corrupt_t_0p2_0p4=0.4390 corrupt_frac_t_0p2_0p4=0.2003 acc_corrupt_t_0p4_0p6=0.6485 corrupt_frac_t_0p4_0p6=0.1931 acc_corrupt_t_0p6_0p8=0.8016 corrupt_frac_t_0p6_0p8=0.2009 acc_corrupt_t_0p8_1p0=0.9373 corrupt_frac_t_0p8_1p0=0.2013 out_w_norm=70.8932 out_g_norm=0.3762 loss_all=0.9458 init_gold_top10=0.6213 init_gold_top100=0.6235
|
| 223 |
+
step=3800 micro_steps=7600 elapsed=27.4s lr=3.000000e-04 loss=2.5298 loss_recon=2.5298 loss_meanflow=0.0000 mean_model_t=0.5016 mean_corrupt_t=0.5016 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.7767 corrupt_frac=0.5472 acc_corrupt=0.6149 loss_corrupt=2.5298 wrong_frac=0.4983 init_acc_corrupt=0.4674 acc_corrupt_t_0p0_0p2=0.2370 corrupt_frac_t_0p0_0p2=0.1956 acc_corrupt_t_0p2_0p4=0.4426 corrupt_frac_t_0p2_0p4=0.1975 acc_corrupt_t_0p4_0p6=0.6451 corrupt_frac_t_0p4_0p6=0.2056 acc_corrupt_t_0p6_0p8=0.8021 corrupt_frac_t_0p6_0p8=0.2038 acc_corrupt_t_0p8_1p0=0.9365 corrupt_frac_t_0p8_1p0=0.1975 out_w_norm=72.1211 out_g_norm=0.3735 loss_all=1.1502 init_gold_top10=0.5565 init_gold_top100=0.5608
|
| 224 |
+
step=3900 micro_steps=7800 elapsed=27.4s lr=3.000000e-04 loss=2.5870 loss_recon=2.5870 loss_meanflow=0.0000 mean_model_t=0.4945 mean_corrupt_t=0.4945 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.7719 corrupt_frac=0.5488 acc_corrupt=0.6070 loss_corrupt=2.5870 wrong_frac=0.5053 init_acc_corrupt=0.4589 acc_corrupt_t_0p0_0p2=0.2344 corrupt_frac_t_0p0_0p2=0.2029 acc_corrupt_t_0p2_0p4=0.4380 corrupt_frac_t_0p2_0p4=0.2072 acc_corrupt_t_0p4_0p6=0.6463 corrupt_frac_t_0p4_0p6=0.2037 acc_corrupt_t_0p6_0p8=0.8044 corrupt_frac_t_0p6_0p8=0.1898 acc_corrupt_t_0p8_1p0=0.9386 corrupt_frac_t_0p8_1p0=0.1964 out_w_norm=73.3159 out_g_norm=0.3651 loss_all=1.5352 init_gold_top10=0.4662 init_gold_top100=0.4725
|
| 225 |
+
step=4000 micro_steps=8000 elapsed=27.3s lr=3.000000e-04 loss=2.5263 loss_recon=2.5263 loss_meanflow=0.0000 mean_model_t=0.4993 mean_corrupt_t=0.4993 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.7767 corrupt_frac=0.5478 acc_corrupt=0.6152 loss_corrupt=2.5263 wrong_frac=0.4989 init_acc_corrupt=0.4665 acc_corrupt_t_0p0_0p2=0.2370 corrupt_frac_t_0p0_0p2=0.2010 acc_corrupt_t_0p2_0p4=0.4434 corrupt_frac_t_0p2_0p4=0.1995 acc_corrupt_t_0p4_0p6=0.6514 corrupt_frac_t_0p4_0p6=0.1990 acc_corrupt_t_0p6_0p8=0.8072 corrupt_frac_t_0p6_0p8=0.1982 acc_corrupt_t_0p8_1p0=0.9368 corrupt_frac_t_0p8_1p0=0.2022 out_w_norm=74.4950 out_g_norm=0.3598 loss_all=1.5450 init_gold_top10=0.4738 init_gold_top100=0.4779
|
| 226 |
+
step=4100 micro_steps=8200 elapsed=38.9s lr=3.000000e-04 loss=2.5235 loss_recon=2.5235 loss_meanflow=0.0000 mean_model_t=0.5002 mean_corrupt_t=0.5002 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.7772 corrupt_frac=0.5463 acc_corrupt=0.6147 loss_corrupt=2.5235 wrong_frac=0.4997 init_acc_corrupt=0.4656 acc_corrupt_t_0p0_0p2=0.2392 corrupt_frac_t_0p0_0p2=0.1992 acc_corrupt_t_0p2_0p4=0.4408 corrupt_frac_t_0p2_0p4=0.1994 acc_corrupt_t_0p4_0p6=0.6482 corrupt_frac_t_0p4_0p6=0.2016 acc_corrupt_t_0p6_0p8=0.8042 corrupt_frac_t_0p6_0p8=0.1976 acc_corrupt_t_0p8_1p0=0.9375 corrupt_frac_t_0p8_1p0=0.2022 out_w_norm=75.6536 out_g_norm=0.3548 loss_all=1.4048 init_gold_top10=0.4710 init_gold_top100=0.4776
|
| 227 |
+
step=4200 micro_steps=8400 elapsed=66.0s lr=3.000000e-04 loss=2.5196 loss_recon=2.5196 loss_meanflow=0.0000 mean_model_t=0.4988 mean_corrupt_t=0.4988 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.7770 corrupt_frac=0.5484 acc_corrupt=0.6160 loss_corrupt=2.5196 wrong_frac=0.4995 init_acc_corrupt=0.4662 acc_corrupt_t_0p0_0p2=0.2408 corrupt_frac_t_0p0_0p2=0.2027 acc_corrupt_t_0p2_0p4=0.4417 corrupt_frac_t_0p2_0p4=0.1962 acc_corrupt_t_0p4_0p6=0.6517 corrupt_frac_t_0p4_0p6=0.1981 acc_corrupt_t_0p6_0p8=0.8060 corrupt_frac_t_0p6_0p8=0.1998 acc_corrupt_t_0p8_1p0=0.9374 corrupt_frac_t_0p8_1p0=0.2031 out_w_norm=76.7883 out_g_norm=0.3477 loss_all=1.3692 init_gold_top10=0.5098 init_gold_top100=0.5155
|
| 228 |
+
step=4300 micro_steps=8600 elapsed=29.3s lr=3.000000e-04 loss=2.5189 loss_recon=2.5189 loss_meanflow=0.0000 mean_model_t=0.4992 mean_corrupt_t=0.4992 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.7760 corrupt_frac=0.5522 acc_corrupt=0.6157 loss_corrupt=2.5189 wrong_frac=0.5000 init_acc_corrupt=0.4648 acc_corrupt_t_0p0_0p2=0.2409 corrupt_frac_t_0p0_0p2=0.1971 acc_corrupt_t_0p2_0p4=0.4412 corrupt_frac_t_0p2_0p4=0.2078 acc_corrupt_t_0p4_0p6=0.6538 corrupt_frac_t_0p4_0p6=0.1996 acc_corrupt_t_0p6_0p8=0.8073 corrupt_frac_t_0p6_0p8=0.1959 acc_corrupt_t_0p8_1p0=0.9412 corrupt_frac_t_0p8_1p0=0.1996 out_w_norm=77.9029 out_g_norm=0.3444 loss_all=1.5495 init_gold_top10=0.4525 init_gold_top100=0.4579
|
| 229 |
+
step=4400 micro_steps=8800 elapsed=27.4s lr=3.000000e-04 loss=2.5207 loss_recon=2.5207 loss_meanflow=0.0000 mean_model_t=0.4997 mean_corrupt_t=0.4997 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.7774 corrupt_frac=0.5490 acc_corrupt=0.6151 loss_corrupt=2.5207 wrong_frac=0.5012 init_acc_corrupt=0.4637 acc_corrupt_t_0p0_0p2=0.2396 corrupt_frac_t_0p0_0p2=0.2000 acc_corrupt_t_0p2_0p4=0.4405 corrupt_frac_t_0p2_0p4=0.2022 acc_corrupt_t_0p4_0p6=0.6546 corrupt_frac_t_0p4_0p6=0.2020 acc_corrupt_t_0p6_0p8=0.8080 corrupt_frac_t_0p6_0p8=0.1955 acc_corrupt_t_0p8_1p0=0.9385 corrupt_frac_t_0p8_1p0=0.2002 out_w_norm=78.9984 out_g_norm=0.3422 loss_all=1.3282 init_gold_top10=0.5288 init_gold_top100=0.5356
|
| 230 |
+
step=4500 micro_steps=9000 elapsed=27.4s lr=3.000000e-04 loss=2.4822 loss_recon=2.4822 loss_meanflow=0.0000 mean_model_t=0.5024 mean_corrupt_t=0.5024 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.7790 corrupt_frac=0.5514 acc_corrupt=0.6200 loss_corrupt=2.4822 wrong_frac=0.4975 init_acc_corrupt=0.4687 acc_corrupt_t_0p0_0p2=0.2425 corrupt_frac_t_0p0_0p2=0.1965 acc_corrupt_t_0p2_0p4=0.4475 corrupt_frac_t_0p2_0p4=0.1980 acc_corrupt_t_0p4_0p6=0.6505 corrupt_frac_t_0p4_0p6=0.2015 acc_corrupt_t_0p6_0p8=0.8068 corrupt_frac_t_0p6_0p8=0.2051 acc_corrupt_t_0p8_1p0=0.9413 corrupt_frac_t_0p8_1p0=0.1989 out_w_norm=80.0911 out_g_norm=0.3377 loss_all=1.4702 init_gold_top10=0.5369 init_gold_top100=0.5429
|
| 231 |
+
step=4600 micro_steps=9200 elapsed=27.5s lr=3.000000e-04 loss=2.4783 loss_recon=2.4783 loss_meanflow=0.0000 mean_model_t=0.5031 mean_corrupt_t=0.5031 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.7780 corrupt_frac=0.5562 acc_corrupt=0.6207 loss_corrupt=2.4783 wrong_frac=0.4960 init_acc_corrupt=0.4705 acc_corrupt_t_0p0_0p2=0.2400 corrupt_frac_t_0p0_0p2=0.1945 acc_corrupt_t_0p2_0p4=0.4409 corrupt_frac_t_0p2_0p4=0.1977 acc_corrupt_t_0p4_0p6=0.6550 corrupt_frac_t_0p4_0p6=0.2045 acc_corrupt_t_0p6_0p8=0.8114 corrupt_frac_t_0p6_0p8=0.2020 acc_corrupt_t_0p8_1p0=0.9390 corrupt_frac_t_0p8_1p0=0.2013 out_w_norm=81.1681 out_g_norm=0.3318 loss_all=1.4947 init_gold_top10=0.5378 init_gold_top100=0.5414
|
| 232 |
+
step=4700 micro_steps=9400 elapsed=27.6s lr=3.000000e-04 loss=2.4738 loss_recon=2.4738 loss_meanflow=0.0000 mean_model_t=0.5055 mean_corrupt_t=0.5055 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.7803 corrupt_frac=0.5514 acc_corrupt=0.6215 loss_corrupt=2.4738 wrong_frac=0.4958 init_acc_corrupt=0.4700 acc_corrupt_t_0p0_0p2=0.2395 corrupt_frac_t_0p0_0p2=0.1988 acc_corrupt_t_0p2_0p4=0.4477 corrupt_frac_t_0p2_0p4=0.2009 acc_corrupt_t_0p4_0p6=0.6567 corrupt_frac_t_0p4_0p6=0.1859 acc_corrupt_t_0p6_0p8=0.8084 corrupt_frac_t_0p6_0p8=0.2078 acc_corrupt_t_0p8_1p0=0.9383 corrupt_frac_t_0p8_1p0=0.2066 out_w_norm=82.2341 out_g_norm=0.3262 loss_all=1.9784 init_gold_top10=0.4249 init_gold_top100=0.4315
|
| 233 |
+
step=4800 micro_steps=9600 elapsed=27.5s lr=3.000000e-04 loss=2.4554 loss_recon=2.4554 loss_meanflow=0.0000 mean_model_t=0.5032 mean_corrupt_t=0.5032 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.7814 corrupt_frac=0.5507 acc_corrupt=0.6233 loss_corrupt=2.4554 wrong_frac=0.4950 init_acc_corrupt=0.4706 acc_corrupt_t_0p0_0p2=0.2415 corrupt_frac_t_0p0_0p2=0.1943 acc_corrupt_t_0p2_0p4=0.4464 corrupt_frac_t_0p2_0p4=0.2024 acc_corrupt_t_0p4_0p6=0.6589 corrupt_frac_t_0p4_0p6=0.1920 acc_corrupt_t_0p6_0p8=0.8073 corrupt_frac_t_0p6_0p8=0.2048 acc_corrupt_t_0p8_1p0=0.9403 corrupt_frac_t_0p8_1p0=0.2065 out_w_norm=83.2903 out_g_norm=0.3230 loss_all=1.4611 init_gold_top10=0.4988 init_gold_top100=0.5021
|
| 234 |
+
step=4900 micro_steps=9800 elapsed=27.5s lr=3.000000e-04 loss=2.4414 loss_recon=2.4414 loss_meanflow=0.0000 mean_model_t=0.5045 mean_corrupt_t=0.5045 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.7845 corrupt_frac=0.5455 acc_corrupt=0.6243 loss_corrupt=2.4414 wrong_frac=0.4954 init_acc_corrupt=0.4704 acc_corrupt_t_0p0_0p2=0.2448 corrupt_frac_t_0p0_0p2=0.1955 acc_corrupt_t_0p2_0p4=0.4494 corrupt_frac_t_0p2_0p4=0.1960 acc_corrupt_t_0p4_0p6=0.6570 corrupt_frac_t_0p4_0p6=0.2028 acc_corrupt_t_0p6_0p8=0.8113 corrupt_frac_t_0p6_0p8=0.2025 acc_corrupt_t_0p8_1p0=0.9391 corrupt_frac_t_0p8_1p0=0.2033 out_w_norm=84.3324 out_g_norm=0.3213 loss_all=1.3477 init_gold_top10=0.4961 init_gold_top100=0.5032
|
| 235 |
+
step=5000 micro_steps=10000 elapsed=27.6s lr=3.000000e-04 loss=2.4687 loss_recon=2.4687 loss_meanflow=0.0000 mean_model_t=0.5020 mean_corrupt_t=0.5020 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.7819 corrupt_frac=0.5474 acc_corrupt=0.6218 loss_corrupt=2.4687 wrong_frac=0.4979 init_acc_corrupt=0.4680 acc_corrupt_t_0p0_0p2=0.2422 corrupt_frac_t_0p0_0p2=0.1988 acc_corrupt_t_0p2_0p4=0.4463 corrupt_frac_t_0p2_0p4=0.1946 acc_corrupt_t_0p4_0p6=0.6577 corrupt_frac_t_0p4_0p6=0.1983 acc_corrupt_t_0p6_0p8=0.8090 corrupt_frac_t_0p6_0p8=0.2110 acc_corrupt_t_0p8_1p0=0.9412 corrupt_frac_t_0p8_1p0=0.1972 out_w_norm=85.3686 out_g_norm=0.3179 loss_all=1.4837 init_gold_top10=0.4503 init_gold_top100=0.4567
|
| 236 |
+
step=5100 micro_steps=10200 elapsed=34.6s lr=3.000000e-04 loss=2.4768 loss_recon=2.4768 loss_meanflow=0.0000 mean_model_t=0.5024 mean_corrupt_t=0.5024 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.7805 corrupt_frac=0.5505 acc_corrupt=0.6204 loss_corrupt=2.4768 wrong_frac=0.4981 init_acc_corrupt=0.4672 acc_corrupt_t_0p0_0p2=0.2394 corrupt_frac_t_0p0_0p2=0.2012 acc_corrupt_t_0p2_0p4=0.4460 corrupt_frac_t_0p2_0p4=0.1974 acc_corrupt_t_0p4_0p6=0.6596 corrupt_frac_t_0p4_0p6=0.1938 acc_corrupt_t_0p6_0p8=0.8076 corrupt_frac_t_0p6_0p8=0.2021 acc_corrupt_t_0p8_1p0=0.9399 corrupt_frac_t_0p8_1p0=0.2055 out_w_norm=86.3955 out_g_norm=0.3118 loss_all=1.6133 init_gold_top10=0.4497 init_gold_top100=0.4601
|
| 237 |
+
step=5200 micro_steps=10400 elapsed=67.4s lr=3.000000e-04 loss=2.4572 loss_recon=2.4572 loss_meanflow=0.0000 mean_model_t=0.4991 mean_corrupt_t=0.4991 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.7827 corrupt_frac=0.5477 acc_corrupt=0.6222 loss_corrupt=2.4572 wrong_frac=0.4994 init_acc_corrupt=0.4658 acc_corrupt_t_0p0_0p2=0.2484 corrupt_frac_t_0p0_0p2=0.1990 acc_corrupt_t_0p2_0p4=0.4527 corrupt_frac_t_0p2_0p4=0.2019 acc_corrupt_t_0p4_0p6=0.6562 corrupt_frac_t_0p4_0p6=0.1969 acc_corrupt_t_0p6_0p8=0.8121 corrupt_frac_t_0p6_0p8=0.2042 acc_corrupt_t_0p8_1p0=0.9411 corrupt_frac_t_0p8_1p0=0.1981 out_w_norm=87.4060 out_g_norm=0.3110 loss_all=1.4454 init_gold_top10=0.4930 init_gold_top100=0.4990
|
| 238 |
+
step=5300 micro_steps=10600 elapsed=32.6s lr=3.000000e-04 loss=2.4717 loss_recon=2.4717 loss_meanflow=0.0000 mean_model_t=0.4986 mean_corrupt_t=0.4986 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.7788 corrupt_frac=0.5540 acc_corrupt=0.6200 loss_corrupt=2.4717 wrong_frac=0.5008 init_acc_corrupt=0.4645 acc_corrupt_t_0p0_0p2=0.2456 corrupt_frac_t_0p0_0p2=0.2009 acc_corrupt_t_0p2_0p4=0.4465 corrupt_frac_t_0p2_0p4=0.2024 acc_corrupt_t_0p4_0p6=0.6588 corrupt_frac_t_0p4_0p6=0.1982 acc_corrupt_t_0p6_0p8=0.8120 corrupt_frac_t_0p6_0p8=0.1980 acc_corrupt_t_0p8_1p0=0.9424 corrupt_frac_t_0p8_1p0=0.2005 out_w_norm=88.4173 out_g_norm=0.3065 loss_all=1.4771 init_gold_top10=0.4999 init_gold_top100=0.5066
|
| 239 |
+
step=5400 micro_steps=10800 elapsed=27.6s lr=3.000000e-04 loss=2.4320 loss_recon=2.4320 loss_meanflow=0.0000 mean_model_t=0.5026 mean_corrupt_t=0.5026 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.7840 corrupt_frac=0.5492 acc_corrupt=0.6254 loss_corrupt=2.4320 wrong_frac=0.4958 init_acc_corrupt=0.4705 acc_corrupt_t_0p0_0p2=0.2447 corrupt_frac_t_0p0_0p2=0.1967 acc_corrupt_t_0p2_0p4=0.4588 corrupt_frac_t_0p2_0p4=0.1955 acc_corrupt_t_0p4_0p6=0.6545 corrupt_frac_t_0p4_0p6=0.2011 acc_corrupt_t_0p6_0p8=0.8105 corrupt_frac_t_0p6_0p8=0.2043 acc_corrupt_t_0p8_1p0=0.9408 corrupt_frac_t_0p8_1p0=0.2023 out_w_norm=89.4193 out_g_norm=0.3043 loss_all=1.3080 init_gold_top10=0.5265 init_gold_top100=0.5321
|
| 240 |
+
step=5500 micro_steps=11000 elapsed=27.6s lr=3.000000e-04 loss=2.4586 loss_recon=2.4586 loss_meanflow=0.0000 mean_model_t=0.5012 mean_corrupt_t=0.5012 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.7824 corrupt_frac=0.5477 acc_corrupt=0.6215 loss_corrupt=2.4586 wrong_frac=0.5004 init_acc_corrupt=0.4640 acc_corrupt_t_0p0_0p2=0.2464 corrupt_frac_t_0p0_0p2=0.2060 acc_corrupt_t_0p2_0p4=0.4511 corrupt_frac_t_0p2_0p4=0.1956 acc_corrupt_t_0p4_0p6=0.6623 corrupt_frac_t_0p4_0p6=0.1973 acc_corrupt_t_0p6_0p8=0.8109 corrupt_frac_t_0p6_0p8=0.1997 acc_corrupt_t_0p8_1p0=0.9428 corrupt_frac_t_0p8_1p0=0.2014 out_w_norm=90.4170 out_g_norm=0.3022 loss_all=1.3303 init_gold_top10=0.5047 init_gold_top100=0.5136
|
| 241 |
+
step=5600 micro_steps=11200 elapsed=27.5s lr=3.000000e-04 loss=2.4238 loss_recon=2.4238 loss_meanflow=0.0000 mean_model_t=0.5017 mean_corrupt_t=0.5017 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.7848 corrupt_frac=0.5480 acc_corrupt=0.6259 loss_corrupt=2.4238 wrong_frac=0.4979 init_acc_corrupt=0.4681 acc_corrupt_t_0p0_0p2=0.2540 corrupt_frac_t_0p0_0p2=0.1987 acc_corrupt_t_0p2_0p4=0.4497 corrupt_frac_t_0p2_0p4=0.1962 acc_corrupt_t_0p4_0p6=0.6613 corrupt_frac_t_0p4_0p6=0.1989 acc_corrupt_t_0p6_0p8=0.8121 corrupt_frac_t_0p6_0p8=0.2081 acc_corrupt_t_0p8_1p0=0.9419 corrupt_frac_t_0p8_1p0=0.1982 out_w_norm=91.4053 out_g_norm=0.3003 loss_all=1.3612 init_gold_top10=0.5112 init_gold_top100=0.5188
|
| 242 |
+
step=5700 micro_steps=11400 elapsed=27.5s lr=3.000000e-04 loss=2.4663 loss_recon=2.4663 loss_meanflow=0.0000 mean_model_t=0.4974 mean_corrupt_t=0.4974 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.7822 corrupt_frac=0.5472 acc_corrupt=0.6201 loss_corrupt=2.4663 wrong_frac=0.5029 init_acc_corrupt=0.4618 acc_corrupt_t_0p0_0p2=0.2467 corrupt_frac_t_0p0_0p2=0.2032 acc_corrupt_t_0p2_0p4=0.4501 corrupt_frac_t_0p2_0p4=0.2000 acc_corrupt_t_0p4_0p6=0.6606 corrupt_frac_t_0p4_0p6=0.2009 acc_corrupt_t_0p6_0p8=0.8143 corrupt_frac_t_0p6_0p8=0.1990 acc_corrupt_t_0p8_1p0=0.9407 corrupt_frac_t_0p8_1p0=0.1969 out_w_norm=92.3928 out_g_norm=0.2955 loss_all=1.6068 init_gold_top10=0.4619 init_gold_top100=0.4682
|
| 243 |
+
step=5800 micro_steps=11600 elapsed=27.6s lr=3.000000e-04 loss=2.4439 loss_recon=2.4439 loss_meanflow=0.0000 mean_model_t=0.5006 mean_corrupt_t=0.5006 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.7837 corrupt_frac=0.5473 acc_corrupt=0.6233 loss_corrupt=2.4439 wrong_frac=0.5004 init_acc_corrupt=0.4651 acc_corrupt_t_0p0_0p2=0.2494 corrupt_frac_t_0p0_0p2=0.1993 acc_corrupt_t_0p2_0p4=0.4511 corrupt_frac_t_0p2_0p4=0.2011 acc_corrupt_t_0p4_0p6=0.6639 corrupt_frac_t_0p4_0p6=0.1993 acc_corrupt_t_0p6_0p8=0.8113 corrupt_frac_t_0p6_0p8=0.2013 acc_corrupt_t_0p8_1p0=0.9408 corrupt_frac_t_0p8_1p0=0.1991 out_w_norm=93.3711 out_g_norm=0.2937 loss_all=1.1639 init_gold_top10=0.5576 init_gold_top100=0.5641
|
| 244 |
+
step=5900 micro_steps=11800 elapsed=27.5s lr=3.000000e-04 loss=2.4313 loss_recon=2.4313 loss_meanflow=0.0000 mean_model_t=0.4973 mean_corrupt_t=0.4973 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.7845 corrupt_frac=0.5469 acc_corrupt=0.6243 loss_corrupt=2.4313 wrong_frac=0.5017 init_acc_corrupt=0.4640 acc_corrupt_t_0p0_0p2=0.2515 corrupt_frac_t_0p0_0p2=0.2007 acc_corrupt_t_0p2_0p4=0.4600 corrupt_frac_t_0p2_0p4=0.2000 acc_corrupt_t_0p4_0p6=0.6610 corrupt_frac_t_0p4_0p6=0.2072 acc_corrupt_t_0p6_0p8=0.8146 corrupt_frac_t_0p6_0p8=0.1960 acc_corrupt_t_0p8_1p0=0.9440 corrupt_frac_t_0p8_1p0=0.1961 out_w_norm=94.3420 out_g_norm=0.2907 loss_all=1.5915 init_gold_top10=0.4815 init_gold_top100=0.4901
|
| 245 |
+
step=6000 micro_steps=12000 elapsed=27.5s lr=3.000000e-04 loss=2.4361 loss_recon=2.4361 loss_meanflow=0.0000 mean_model_t=0.4993 mean_corrupt_t=0.4993 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.7842 corrupt_frac=0.5476 acc_corrupt=0.6242 loss_corrupt=2.4361 wrong_frac=0.5002 init_acc_corrupt=0.4656 acc_corrupt_t_0p0_0p2=0.2483 corrupt_frac_t_0p0_0p2=0.2042 acc_corrupt_t_0p2_0p4=0.4551 corrupt_frac_t_0p2_0p4=0.1959 acc_corrupt_t_0p4_0p6=0.6649 corrupt_frac_t_0p4_0p6=0.1974 acc_corrupt_t_0p6_0p8=0.8126 corrupt_frac_t_0p6_0p8=0.1999 acc_corrupt_t_0p8_1p0=0.9413 corrupt_frac_t_0p8_1p0=0.2026 out_w_norm=95.3030 out_g_norm=0.2868 loss_all=1.2929 init_gold_top10=0.5166 init_gold_top100=0.5224
|
| 246 |
+
step=6100 micro_steps=12200 elapsed=30.6s lr=3.000000e-04 loss=2.4390 loss_recon=2.4390 loss_meanflow=0.0000 mean_model_t=0.4984 mean_corrupt_t=0.4984 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.7816 corrupt_frac=0.5546 acc_corrupt=0.6236 loss_corrupt=2.4390 wrong_frac=0.5007 init_acc_corrupt=0.4644 acc_corrupt_t_0p0_0p2=0.2498 corrupt_frac_t_0p0_0p2=0.1990 acc_corrupt_t_0p2_0p4=0.4544 corrupt_frac_t_0p2_0p4=0.2030 acc_corrupt_t_0p4_0p6=0.6623 corrupt_frac_t_0p4_0p6=0.2016 acc_corrupt_t_0p6_0p8=0.8146 corrupt_frac_t_0p6_0p8=0.1986 acc_corrupt_t_0p8_1p0=0.9418 corrupt_frac_t_0p8_1p0=0.1977 out_w_norm=96.2638 out_g_norm=0.2836 loss_all=1.4549 init_gold_top10=0.5356 init_gold_top100=0.5400
|
| 247 |
+
step=6200 micro_steps=12400 elapsed=70.1s lr=3.000000e-04 loss=2.4070 loss_recon=2.4070 loss_meanflow=0.0000 mean_model_t=0.5030 mean_corrupt_t=0.5030 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.7866 corrupt_frac=0.5483 acc_corrupt=0.6279 loss_corrupt=2.4070 wrong_frac=0.4967 init_acc_corrupt=0.4688 acc_corrupt_t_0p0_0p2=0.2511 corrupt_frac_t_0p0_0p2=0.1958 acc_corrupt_t_0p2_0p4=0.4548 corrupt_frac_t_0p2_0p4=0.2010 acc_corrupt_t_0p4_0p6=0.6659 corrupt_frac_t_0p4_0p6=0.1992 acc_corrupt_t_0p6_0p8=0.8136 corrupt_frac_t_0p6_0p8=0.2018 acc_corrupt_t_0p8_1p0=0.9420 corrupt_frac_t_0p8_1p0=0.2023 out_w_norm=97.2143 out_g_norm=0.2829 loss_all=1.7211 init_gold_top10=0.4437 init_gold_top100=0.4538
|
| 248 |
+
step=6300 micro_steps=12600 elapsed=33.9s lr=3.000000e-04 loss=2.4144 loss_recon=2.4144 loss_meanflow=0.0000 mean_model_t=0.5018 mean_corrupt_t=0.5018 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.7847 corrupt_frac=0.5512 acc_corrupt=0.6261 loss_corrupt=2.4144 wrong_frac=0.4980 init_acc_corrupt=0.4675 acc_corrupt_t_0p0_0p2=0.2508 corrupt_frac_t_0p0_0p2=0.1913 acc_corrupt_t_0p2_0p4=0.4533 corrupt_frac_t_0p2_0p4=0.2025 acc_corrupt_t_0p4_0p6=0.6564 corrupt_frac_t_0p4_0p6=0.2066 acc_corrupt_t_0p6_0p8=0.8143 corrupt_frac_t_0p6_0p8=0.2021 acc_corrupt_t_0p8_1p0=0.9423 corrupt_frac_t_0p8_1p0=0.1975 out_w_norm=98.1645 out_g_norm=0.2781 loss_all=1.4244 init_gold_top10=0.5068 init_gold_top100=0.5147
|
| 249 |
+
step=6400 micro_steps=12800 elapsed=27.5s lr=3.000000e-04 loss=2.3976 loss_recon=2.3976 loss_meanflow=0.0000 mean_model_t=0.5032 mean_corrupt_t=0.5032 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.7871 corrupt_frac=0.5488 acc_corrupt=0.6289 loss_corrupt=2.3976 wrong_frac=0.4974 init_acc_corrupt=0.4685 acc_corrupt_t_0p0_0p2=0.2502 corrupt_frac_t_0p0_0p2=0.1994 acc_corrupt_t_0p2_0p4=0.4601 corrupt_frac_t_0p2_0p4=0.1958 acc_corrupt_t_0p4_0p6=0.6665 corrupt_frac_t_0p4_0p6=0.2030 acc_corrupt_t_0p6_0p8=0.8168 corrupt_frac_t_0p6_0p8=0.2003 acc_corrupt_t_0p8_1p0=0.9429 corrupt_frac_t_0p8_1p0=0.2015 out_w_norm=99.1077 out_g_norm=0.2767 loss_all=1.1987 init_gold_top10=0.5096 init_gold_top100=0.5130
|
| 250 |
+
step=6500 micro_steps=13000 elapsed=27.5s lr=3.000000e-04 loss=2.4445 loss_recon=2.4445 loss_meanflow=0.0000 mean_model_t=0.4962 mean_corrupt_t=0.4962 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.7831 corrupt_frac=0.5491 acc_corrupt=0.6224 loss_corrupt=2.4445 wrong_frac=0.5052 init_acc_corrupt=0.4596 acc_corrupt_t_0p0_0p2=0.2536 corrupt_frac_t_0p0_0p2=0.2026 acc_corrupt_t_0p2_0p4=0.4519 corrupt_frac_t_0p2_0p4=0.2021 acc_corrupt_t_0p4_0p6=0.6647 corrupt_frac_t_0p4_0p6=0.2017 acc_corrupt_t_0p6_0p8=0.8165 corrupt_frac_t_0p6_0p8=0.2005 acc_corrupt_t_0p8_1p0=0.9419 corrupt_frac_t_0p8_1p0=0.1931 out_w_norm=100.0492 out_g_norm=0.2752 loss_all=1.7887 init_gold_top10=0.4066 init_gold_top100=0.4140
|
| 251 |
+
step=6600 micro_steps=13200 elapsed=27.5s lr=3.000000e-04 loss=2.3954 loss_recon=2.3954 loss_meanflow=0.0000 mean_model_t=0.5022 mean_corrupt_t=0.5022 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.7867 corrupt_frac=0.5500 acc_corrupt=0.6290 loss_corrupt=2.3954 wrong_frac=0.4979 init_acc_corrupt=0.4680 acc_corrupt_t_0p0_0p2=0.2531 corrupt_frac_t_0p0_0p2=0.1985 acc_corrupt_t_0p2_0p4=0.4582 corrupt_frac_t_0p2_0p4=0.1964 acc_corrupt_t_0p4_0p6=0.6658 corrupt_frac_t_0p4_0p6=0.2058 acc_corrupt_t_0p6_0p8=0.8166 corrupt_frac_t_0p6_0p8=0.1947 acc_corrupt_t_0p8_1p0=0.9421 corrupt_frac_t_0p8_1p0=0.2045 out_w_norm=100.9815 out_g_norm=0.2716 loss_all=1.4895 init_gold_top10=0.5394 init_gold_top100=0.5430
|
| 252 |
+
step=6700 micro_steps=13400 elapsed=27.5s lr=3.000000e-04 loss=2.4070 loss_recon=2.4070 loss_meanflow=0.0000 mean_model_t=0.4999 mean_corrupt_t=0.4999 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.7857 corrupt_frac=0.5492 acc_corrupt=0.6269 loss_corrupt=2.4070 wrong_frac=0.4995 init_acc_corrupt=0.4658 acc_corrupt_t_0p0_0p2=0.2548 corrupt_frac_t_0p0_0p2=0.2014 acc_corrupt_t_0p2_0p4=0.4505 corrupt_frac_t_0p2_0p4=0.1953 acc_corrupt_t_0p4_0p6=0.6650 corrupt_frac_t_0p4_0p6=0.1987 acc_corrupt_t_0p6_0p8=0.8160 corrupt_frac_t_0p6_0p8=0.2039 acc_corrupt_t_0p8_1p0=0.9421 corrupt_frac_t_0p8_1p0=0.2006 out_w_norm=101.9127 out_g_norm=0.2699 loss_all=1.2800 init_gold_top10=0.4892 init_gold_top100=0.4966
|
| 253 |
+
step=6800 micro_steps=13600 elapsed=27.4s lr=3.000000e-04 loss=2.4082 loss_recon=2.4082 loss_meanflow=0.0000 mean_model_t=0.4974 mean_corrupt_t=0.4974 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.7848 corrupt_frac=0.5532 acc_corrupt=0.6272 loss_corrupt=2.4082 wrong_frac=0.4999 init_acc_corrupt=0.4664 acc_corrupt_t_0p0_0p2=0.2550 corrupt_frac_t_0p0_0p2=0.2021 acc_corrupt_t_0p2_0p4=0.4571 corrupt_frac_t_0p2_0p4=0.1958 acc_corrupt_t_0p4_0p6=0.6637 corrupt_frac_t_0p4_0p6=0.2030 acc_corrupt_t_0p6_0p8=0.8164 corrupt_frac_t_0p6_0p8=0.1978 acc_corrupt_t_0p8_1p0=0.9438 corrupt_frac_t_0p8_1p0=0.2013 out_w_norm=102.8368 out_g_norm=0.2676 loss_all=1.7544 init_gold_top10=0.4595 init_gold_top100=0.4649
|
| 254 |
+
step=6900 micro_steps=13800 elapsed=27.5s lr=3.000000e-04 loss=2.3755 loss_recon=2.3755 loss_meanflow=0.0000 mean_model_t=0.5006 mean_corrupt_t=0.5006 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.7871 corrupt_frac=0.5524 acc_corrupt=0.6314 loss_corrupt=2.3755 wrong_frac=0.4964 init_acc_corrupt=0.4699 acc_corrupt_t_0p0_0p2=0.2551 corrupt_frac_t_0p0_0p2=0.2005 acc_corrupt_t_0p2_0p4=0.4604 corrupt_frac_t_0p2_0p4=0.1944 acc_corrupt_t_0p4_0p6=0.6715 corrupt_frac_t_0p4_0p6=0.1972 acc_corrupt_t_0p6_0p8=0.8154 corrupt_frac_t_0p6_0p8=0.2050 acc_corrupt_t_0p8_1p0=0.9424 corrupt_frac_t_0p8_1p0=0.2028 out_w_norm=103.7581 out_g_norm=0.2647 loss_all=1.5012 init_gold_top10=0.5066 init_gold_top100=0.5122
|
| 255 |
+
step=7000 micro_steps=14000 elapsed=27.5s lr=3.000000e-04 loss=2.4165 loss_recon=2.4165 loss_meanflow=0.0000 mean_model_t=0.5005 mean_corrupt_t=0.5005 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.7849 corrupt_frac=0.5522 acc_corrupt=0.6262 loss_corrupt=2.4165 wrong_frac=0.5008 init_acc_corrupt=0.4648 acc_corrupt_t_0p0_0p2=0.2508 corrupt_frac_t_0p0_0p2=0.2040 acc_corrupt_t_0p2_0p4=0.4583 corrupt_frac_t_0p2_0p4=0.2002 acc_corrupt_t_0p4_0p6=0.6692 corrupt_frac_t_0p4_0p6=0.1940 acc_corrupt_t_0p6_0p8=0.8159 corrupt_frac_t_0p6_0p8=0.1996 acc_corrupt_t_0p8_1p0=0.9425 corrupt_frac_t_0p8_1p0=0.2022 out_w_norm=104.6749 out_g_norm=0.2618 loss_all=1.1028 init_gold_top10=0.5613 init_gold_top100=0.5669
|
| 256 |
+
step=7100 micro_steps=14200 elapsed=30.6s lr=3.000000e-04 loss=2.3798 loss_recon=2.3798 loss_meanflow=0.0000 mean_model_t=0.5010 mean_corrupt_t=0.5010 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.7892 corrupt_frac=0.5476 acc_corrupt=0.6307 loss_corrupt=2.3798 wrong_frac=0.4977 init_acc_corrupt=0.4681 acc_corrupt_t_0p0_0p2=0.2545 corrupt_frac_t_0p0_0p2=0.1951 acc_corrupt_t_0p2_0p4=0.4606 corrupt_frac_t_0p2_0p4=0.2035 acc_corrupt_t_0p4_0p6=0.6675 corrupt_frac_t_0p4_0p6=0.1979 acc_corrupt_t_0p6_0p8=0.8200 corrupt_frac_t_0p6_0p8=0.2046 acc_corrupt_t_0p8_1p0=0.9422 corrupt_frac_t_0p8_1p0=0.1990 out_w_norm=105.5852 out_g_norm=0.2606 loss_all=1.4140 init_gold_top10=0.4581 init_gold_top100=0.4648
|
| 257 |
+
step=7200 micro_steps=14400 elapsed=80.1s lr=3.000000e-04 loss=2.3719 loss_recon=2.3719 loss_meanflow=0.0000 mean_model_t=0.5028 mean_corrupt_t=0.5028 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.7890 corrupt_frac=0.5504 acc_corrupt=0.6320 loss_corrupt=2.3719 wrong_frac=0.4952 init_acc_corrupt=0.4701 acc_corrupt_t_0p0_0p2=0.2563 corrupt_frac_t_0p0_0p2=0.1949 acc_corrupt_t_0p2_0p4=0.4587 corrupt_frac_t_0p2_0p4=0.2025 acc_corrupt_t_0p4_0p6=0.6688 corrupt_frac_t_0p4_0p6=0.1981 acc_corrupt_t_0p6_0p8=0.8160 corrupt_frac_t_0p6_0p8=0.1963 acc_corrupt_t_0p8_1p0=0.9437 corrupt_frac_t_0p8_1p0=0.2082 out_w_norm=106.4842 out_g_norm=0.2592 loss_all=1.3028 init_gold_top10=0.4801 init_gold_top100=0.4881
|
| 258 |
+
step=7300 micro_steps=14600 elapsed=89.7s lr=3.000000e-04 loss=2.4204 loss_recon=2.4204 loss_meanflow=0.0000 mean_model_t=0.4943 mean_corrupt_t=0.4943 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.7838 corrupt_frac=0.5532 acc_corrupt=0.6247 loss_corrupt=2.4204 wrong_frac=0.5056 init_acc_corrupt=0.4593 acc_corrupt_t_0p0_0p2=0.2588 corrupt_frac_t_0p0_0p2=0.2062 acc_corrupt_t_0p2_0p4=0.4609 corrupt_frac_t_0p2_0p4=0.2031 acc_corrupt_t_0p4_0p6=0.6677 corrupt_frac_t_0p4_0p6=0.1982 acc_corrupt_t_0p6_0p8=0.8186 corrupt_frac_t_0p6_0p8=0.2006 acc_corrupt_t_0p8_1p0=0.9439 corrupt_frac_t_0p8_1p0=0.1920 out_w_norm=107.3845 out_g_norm=0.2578 loss_all=1.3866 init_gold_top10=0.4858 init_gold_top100=0.4896
|
| 259 |
+
step=7400 micro_steps=14800 elapsed=89.3s lr=3.000000e-04 loss=2.3988 loss_recon=2.3988 loss_meanflow=0.0000 mean_model_t=0.4988 mean_corrupt_t=0.4988 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.7864 corrupt_frac=0.5520 acc_corrupt=0.6283 loss_corrupt=2.3988 wrong_frac=0.5005 init_acc_corrupt=0.4648 acc_corrupt_t_0p0_0p2=0.2548 corrupt_frac_t_0p0_0p2=0.1989 acc_corrupt_t_0p2_0p4=0.4597 corrupt_frac_t_0p2_0p4=0.2011 acc_corrupt_t_0p4_0p6=0.6662 corrupt_frac_t_0p4_0p6=0.2018 acc_corrupt_t_0p6_0p8=0.8164 corrupt_frac_t_0p6_0p8=0.1971 acc_corrupt_t_0p8_1p0=0.9440 corrupt_frac_t_0p8_1p0=0.2010 out_w_norm=108.2819 out_g_norm=0.2567 loss_all=1.1978 init_gold_top10=0.5472 init_gold_top100=0.5504
|
| 260 |
+
step=7500 micro_steps=15000 elapsed=90.0s lr=3.000000e-04 loss=2.3517 loss_recon=2.3517 loss_meanflow=0.0000 mean_model_t=0.5048 mean_corrupt_t=0.5048 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.7907 corrupt_frac=0.5509 acc_corrupt=0.6347 loss_corrupt=2.3517 wrong_frac=0.4950 init_acc_corrupt=0.4708 acc_corrupt_t_0p0_0p2=0.2572 corrupt_frac_t_0p0_0p2=0.1934 acc_corrupt_t_0p2_0p4=0.4574 corrupt_frac_t_0p2_0p4=0.2010 acc_corrupt_t_0p4_0p6=0.6738 corrupt_frac_t_0p4_0p6=0.1977 acc_corrupt_t_0p6_0p8=0.8211 corrupt_frac_t_0p6_0p8=0.2056 acc_corrupt_t_0p8_1p0=0.9440 corrupt_frac_t_0p8_1p0=0.2022 out_w_norm=109.1770 out_g_norm=0.2529 loss_all=1.4182 init_gold_top10=0.5018 init_gold_top100=0.5101
|
| 261 |
+
step=7600 micro_steps=15200 elapsed=62.2s lr=3.000000e-04 loss=2.3874 loss_recon=2.3874 loss_meanflow=0.0000 mean_model_t=0.4999 mean_corrupt_t=0.4999 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.7880 corrupt_frac=0.5483 acc_corrupt=0.6287 loss_corrupt=2.3874 wrong_frac=0.5012 init_acc_corrupt=0.4635 acc_corrupt_t_0p0_0p2=0.2595 corrupt_frac_t_0p0_0p2=0.1970 acc_corrupt_t_0p2_0p4=0.4605 corrupt_frac_t_0p2_0p4=0.2062 acc_corrupt_t_0p4_0p6=0.6691 corrupt_frac_t_0p4_0p6=0.1983 acc_corrupt_t_0p6_0p8=0.8175 corrupt_frac_t_0p6_0p8=0.2044 acc_corrupt_t_0p8_1p0=0.9416 corrupt_frac_t_0p8_1p0=0.1942 out_w_norm=110.0706 out_g_norm=0.2529 loss_all=1.6005 init_gold_top10=0.4195 init_gold_top100=0.4249
|
| 262 |
+
step=7700 micro_steps=15400 elapsed=27.5s lr=3.000000e-04 loss=2.4065 loss_recon=2.4065 loss_meanflow=0.0000 mean_model_t=0.4996 mean_corrupt_t=0.4996 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.7857 corrupt_frac=0.5518 acc_corrupt=0.6267 loss_corrupt=2.4065 wrong_frac=0.5025 init_acc_corrupt=0.4620 acc_corrupt_t_0p0_0p2=0.2563 corrupt_frac_t_0p0_0p2=0.2008 acc_corrupt_t_0p2_0p4=0.4557 corrupt_frac_t_0p2_0p4=0.2026 acc_corrupt_t_0p4_0p6=0.6686 corrupt_frac_t_0p4_0p6=0.1968 acc_corrupt_t_0p6_0p8=0.8195 corrupt_frac_t_0p6_0p8=0.2071 acc_corrupt_t_0p8_1p0=0.9425 corrupt_frac_t_0p8_1p0=0.1928 out_w_norm=110.9678 out_g_norm=0.2507 loss_all=1.3292 init_gold_top10=0.4979 init_gold_top100=0.5030
|
| 263 |
+
step=7800 micro_steps=15600 elapsed=27.5s lr=3.000000e-04 loss=2.4010 loss_recon=2.4010 loss_meanflow=0.0000 mean_model_t=0.5001 mean_corrupt_t=0.5001 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.7876 corrupt_frac=0.5481 acc_corrupt=0.6277 loss_corrupt=2.4010 wrong_frac=0.5018 init_acc_corrupt=0.4625 acc_corrupt_t_0p0_0p2=0.2539 corrupt_frac_t_0p0_0p2=0.2016 acc_corrupt_t_0p2_0p4=0.4605 corrupt_frac_t_0p2_0p4=0.2031 acc_corrupt_t_0p4_0p6=0.6677 corrupt_frac_t_0p4_0p6=0.1980 acc_corrupt_t_0p6_0p8=0.8198 corrupt_frac_t_0p6_0p8=0.1971 acc_corrupt_t_0p8_1p0=0.9448 corrupt_frac_t_0p8_1p0=0.2002 out_w_norm=111.8583 out_g_norm=0.2477 loss_all=1.0298 init_gold_top10=0.5431 init_gold_top100=0.5490
|
| 264 |
+
step=7900 micro_steps=15800 elapsed=27.6s lr=3.000000e-04 loss=2.3741 loss_recon=2.3741 loss_meanflow=0.0000 mean_model_t=0.5017 mean_corrupt_t=0.5017 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.7890 corrupt_frac=0.5486 acc_corrupt=0.6303 loss_corrupt=2.3741 wrong_frac=0.4989 init_acc_corrupt=0.4662 acc_corrupt_t_0p0_0p2=0.2550 corrupt_frac_t_0p0_0p2=0.1991 acc_corrupt_t_0p2_0p4=0.4589 corrupt_frac_t_0p2_0p4=0.2002 acc_corrupt_t_0p4_0p6=0.6701 corrupt_frac_t_0p4_0p6=0.2004 acc_corrupt_t_0p6_0p8=0.8187 corrupt_frac_t_0p6_0p8=0.1967 acc_corrupt_t_0p8_1p0=0.9445 corrupt_frac_t_0p8_1p0=0.2036 out_w_norm=112.7469 out_g_norm=0.2479 loss_all=1.6038 init_gold_top10=0.4413 init_gold_top100=0.4503
|
| 265 |
+
step=8000 micro_steps=16000 elapsed=27.5s lr=3.000000e-04 loss=2.3848 loss_recon=2.3848 loss_meanflow=0.0000 mean_model_t=0.5010 mean_corrupt_t=0.5010 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.7871 corrupt_frac=0.5532 acc_corrupt=0.6294 loss_corrupt=2.3848 wrong_frac=0.5013 init_acc_corrupt=0.4637 acc_corrupt_t_0p0_0p2=0.2578 corrupt_frac_t_0p0_0p2=0.1997 acc_corrupt_t_0p2_0p4=0.4606 corrupt_frac_t_0p2_0p4=0.2035 acc_corrupt_t_0p4_0p6=0.6707 corrupt_frac_t_0p4_0p6=0.2002 acc_corrupt_t_0p6_0p8=0.8190 corrupt_frac_t_0p6_0p8=0.1978 acc_corrupt_t_0p8_1p0=0.9452 corrupt_frac_t_0p8_1p0=0.1989 out_w_norm=113.6273 out_g_norm=0.2437 loss_all=1.3985 init_gold_top10=0.4858 init_gold_top100=0.4904
|
| 266 |
+
step=8100 micro_steps=16200 elapsed=36.2s lr=3.000000e-04 loss=2.3844 loss_recon=2.3844 loss_meanflow=0.0000 mean_model_t=0.5015 mean_corrupt_t=0.5015 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.7877 corrupt_frac=0.5516 acc_corrupt=0.6296 loss_corrupt=2.3844 wrong_frac=0.4996 init_acc_corrupt=0.4663 acc_corrupt_t_0p0_0p2=0.2523 corrupt_frac_t_0p0_0p2=0.1983 acc_corrupt_t_0p2_0p4=0.4605 corrupt_frac_t_0p2_0p4=0.2007 acc_corrupt_t_0p4_0p6=0.6690 corrupt_frac_t_0p4_0p6=0.2023 acc_corrupt_t_0p6_0p8=0.8199 corrupt_frac_t_0p6_0p8=0.1969 acc_corrupt_t_0p8_1p0=0.9433 corrupt_frac_t_0p8_1p0=0.2018 out_w_norm=114.5040 out_g_norm=0.2430 loss_all=1.6059 init_gold_top10=0.4645 init_gold_top100=0.4722
|
| 267 |
+
step=8200 micro_steps=16400 elapsed=66.8s lr=3.000000e-04 loss=2.3749 loss_recon=2.3749 loss_meanflow=0.0000 mean_model_t=0.5010 mean_corrupt_t=0.5010 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.7885 corrupt_frac=0.5511 acc_corrupt=0.6299 loss_corrupt=2.3749 wrong_frac=0.5010 init_acc_corrupt=0.4640 acc_corrupt_t_0p0_0p2=0.2570 corrupt_frac_t_0p0_0p2=0.2004 acc_corrupt_t_0p2_0p4=0.4655 corrupt_frac_t_0p2_0p4=0.2022 acc_corrupt_t_0p4_0p6=0.6693 corrupt_frac_t_0p4_0p6=0.1993 acc_corrupt_t_0p6_0p8=0.8184 corrupt_frac_t_0p6_0p8=0.1968 acc_corrupt_t_0p8_1p0=0.9433 corrupt_frac_t_0p8_1p0=0.2012 out_w_norm=115.3769 out_g_norm=0.2423 loss_all=1.2364 init_gold_top10=0.5169 init_gold_top100=0.5234
|
| 268 |
+
step=8300 micro_steps=16600 elapsed=31.6s lr=3.000000e-04 loss=2.3471 loss_recon=2.3471 loss_meanflow=0.0000 mean_model_t=0.5015 mean_corrupt_t=0.5015 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.7906 corrupt_frac=0.5513 acc_corrupt=0.6341 loss_corrupt=2.3471 wrong_frac=0.4966 init_acc_corrupt=0.4687 acc_corrupt_t_0p0_0p2=0.2576 corrupt_frac_t_0p0_0p2=0.1973 acc_corrupt_t_0p2_0p4=0.4638 corrupt_frac_t_0p2_0p4=0.1979 acc_corrupt_t_0p4_0p6=0.6706 corrupt_frac_t_0p4_0p6=0.1988 acc_corrupt_t_0p6_0p8=0.8196 corrupt_frac_t_0p6_0p8=0.2015 acc_corrupt_t_0p8_1p0=0.9438 corrupt_frac_t_0p8_1p0=0.2046 out_w_norm=116.2496 out_g_norm=0.2410 loss_all=1.2757 init_gold_top10=0.5234 init_gold_top100=0.5282
|
| 269 |
+
step=8400 micro_steps=16800 elapsed=27.5s lr=3.000000e-04 loss=2.3661 loss_recon=2.3661 loss_meanflow=0.0000 mean_model_t=0.4992 mean_corrupt_t=0.4992 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.7899 corrupt_frac=0.5484 acc_corrupt=0.6312 loss_corrupt=2.3661 wrong_frac=0.5015 init_acc_corrupt=0.4641 acc_corrupt_t_0p0_0p2=0.2597 corrupt_frac_t_0p0_0p2=0.2003 acc_corrupt_t_0p2_0p4=0.4667 corrupt_frac_t_0p2_0p4=0.1987 acc_corrupt_t_0p4_0p6=0.6673 corrupt_frac_t_0p4_0p6=0.2043 acc_corrupt_t_0p6_0p8=0.8202 corrupt_frac_t_0p6_0p8=0.1968 acc_corrupt_t_0p8_1p0=0.9444 corrupt_frac_t_0p8_1p0=0.1998 out_w_norm=117.1174 out_g_norm=0.2393 loss_all=1.4517 init_gold_top10=0.4539 init_gold_top100=0.4595
|
| 270 |
+
step=8500 micro_steps=17000 elapsed=27.4s lr=3.000000e-04 loss=2.3886 loss_recon=2.3886 loss_meanflow=0.0000 mean_model_t=0.4970 mean_corrupt_t=0.4970 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.7874 corrupt_frac=0.5516 acc_corrupt=0.6281 loss_corrupt=2.3886 wrong_frac=0.5051 init_acc_corrupt=0.4599 acc_corrupt_t_0p0_0p2=0.2573 corrupt_frac_t_0p0_0p2=0.2013 acc_corrupt_t_0p2_0p4=0.4632 corrupt_frac_t_0p2_0p4=0.2045 acc_corrupt_t_0p4_0p6=0.6707 corrupt_frac_t_0p4_0p6=0.1978 acc_corrupt_t_0p6_0p8=0.8191 corrupt_frac_t_0p6_0p8=0.2034 acc_corrupt_t_0p8_1p0=0.9444 corrupt_frac_t_0p8_1p0=0.1930 out_w_norm=117.9762 out_g_norm=0.2367 loss_all=1.5048 init_gold_top10=0.4661 init_gold_top100=0.4726
|
| 271 |
+
step=8600 micro_steps=17200 elapsed=27.5s lr=3.000000e-04 loss=2.3871 loss_recon=2.3871 loss_meanflow=0.0000 mean_model_t=0.4940 mean_corrupt_t=0.4940 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.7866 corrupt_frac=0.5534 acc_corrupt=0.6285 loss_corrupt=2.3871 wrong_frac=0.5046 init_acc_corrupt=0.4605 acc_corrupt_t_0p0_0p2=0.2610 corrupt_frac_t_0p0_0p2=0.2054 acc_corrupt_t_0p2_0p4=0.4614 corrupt_frac_t_0p2_0p4=0.1985 acc_corrupt_t_0p4_0p6=0.6714 corrupt_frac_t_0p4_0p6=0.2034 acc_corrupt_t_0p6_0p8=0.8222 corrupt_frac_t_0p6_0p8=0.1982 acc_corrupt_t_0p8_1p0=0.9452 corrupt_frac_t_0p8_1p0=0.1944 out_w_norm=118.8288 out_g_norm=0.2351 loss_all=1.2799 init_gold_top10=0.5116 init_gold_top100=0.5183
|
| 272 |
+
step=8700 micro_steps=17400 elapsed=27.4s lr=3.000000e-04 loss=2.3182 loss_recon=2.3182 loss_meanflow=0.0000 mean_model_t=0.5047 mean_corrupt_t=0.5047 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.7945 corrupt_frac=0.5476 acc_corrupt=0.6381 loss_corrupt=2.3182 wrong_frac=0.4960 init_acc_corrupt=0.4710 acc_corrupt_t_0p0_0p2=0.2628 corrupt_frac_t_0p0_0p2=0.1964 acc_corrupt_t_0p2_0p4=0.4698 corrupt_frac_t_0p2_0p4=0.1918 acc_corrupt_t_0p4_0p6=0.6723 corrupt_frac_t_0p4_0p6=0.2097 acc_corrupt_t_0p6_0p8=0.8221 corrupt_frac_t_0p6_0p8=0.1995 acc_corrupt_t_0p8_1p0=0.9449 corrupt_frac_t_0p8_1p0=0.2026 out_w_norm=119.6784 out_g_norm=0.2340 loss_all=1.2643 init_gold_top10=0.5153 init_gold_top100=0.5196
|
| 273 |
+
step=8800 micro_steps=17600 elapsed=27.4s lr=3.000000e-04 loss=2.3579 loss_recon=2.3579 loss_meanflow=0.0000 mean_model_t=0.5011 mean_corrupt_t=0.5011 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.7896 corrupt_frac=0.5535 acc_corrupt=0.6329 loss_corrupt=2.3579 wrong_frac=0.4989 init_acc_corrupt=0.4663 acc_corrupt_t_0p0_0p2=0.2572 corrupt_frac_t_0p0_0p2=0.1986 acc_corrupt_t_0p2_0p4=0.4639 corrupt_frac_t_0p2_0p4=0.2010 acc_corrupt_t_0p4_0p6=0.6738 corrupt_frac_t_0p4_0p6=0.2020 acc_corrupt_t_0p6_0p8=0.8242 corrupt_frac_t_0p6_0p8=0.1998 acc_corrupt_t_0p8_1p0=0.9452 corrupt_frac_t_0p8_1p0=0.1986 out_w_norm=120.5332 out_g_norm=0.2317 loss_all=1.3301 init_gold_top10=0.5567 init_gold_top100=0.5626
|
| 274 |
+
step=8900 micro_steps=17800 elapsed=27.4s lr=3.000000e-04 loss=2.3722 loss_recon=2.3722 loss_meanflow=0.0000 mean_model_t=0.4970 mean_corrupt_t=0.4970 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.7891 corrupt_frac=0.5500 acc_corrupt=0.6301 loss_corrupt=2.3722 wrong_frac=0.5037 init_acc_corrupt=0.4613 acc_corrupt_t_0p0_0p2=0.2612 corrupt_frac_t_0p0_0p2=0.1990 acc_corrupt_t_0p2_0p4=0.4631 corrupt_frac_t_0p2_0p4=0.2076 acc_corrupt_t_0p4_0p6=0.6717 corrupt_frac_t_0p4_0p6=0.2025 acc_corrupt_t_0p6_0p8=0.8223 corrupt_frac_t_0p6_0p8=0.1894 acc_corrupt_t_0p8_1p0=0.9440 corrupt_frac_t_0p8_1p0=0.2015 out_w_norm=121.3817 out_g_norm=0.2317 loss_all=1.3111 init_gold_top10=0.5047 init_gold_top100=0.5089
|
| 275 |
+
step=9000 micro_steps=18000 elapsed=27.4s lr=3.000000e-04 loss=2.3794 loss_recon=2.3794 loss_meanflow=0.0000 mean_model_t=0.4963 mean_corrupt_t=0.4963 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.7878 corrupt_frac=0.5521 acc_corrupt=0.6293 loss_corrupt=2.3794 wrong_frac=0.5035 init_acc_corrupt=0.4617 acc_corrupt_t_0p0_0p2=0.2583 corrupt_frac_t_0p0_0p2=0.1982 acc_corrupt_t_0p2_0p4=0.4604 corrupt_frac_t_0p2_0p4=0.2088 acc_corrupt_t_0p4_0p6=0.6727 corrupt_frac_t_0p4_0p6=0.2003 acc_corrupt_t_0p6_0p8=0.8221 corrupt_frac_t_0p6_0p8=0.1950 acc_corrupt_t_0p8_1p0=0.9454 corrupt_frac_t_0p8_1p0=0.1977 out_w_norm=122.2292 out_g_norm=0.2289 loss_all=1.3437 init_gold_top10=0.5058 init_gold_top100=0.5098
|
| 276 |
+
step=9100 micro_steps=18200 elapsed=30.8s lr=3.000000e-04 loss=2.4039 loss_recon=2.4039 loss_meanflow=0.0000 mean_model_t=0.4950 mean_corrupt_t=0.4950 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.7870 corrupt_frac=0.5490 acc_corrupt=0.6258 loss_corrupt=2.4039 wrong_frac=0.5069 init_acc_corrupt=0.4574 acc_corrupt_t_0p0_0p2=0.2565 corrupt_frac_t_0p0_0p2=0.2090 acc_corrupt_t_0p2_0p4=0.4596 corrupt_frac_t_0p2_0p4=0.2021 acc_corrupt_t_0p4_0p6=0.6767 corrupt_frac_t_0p4_0p6=0.2005 acc_corrupt_t_0p6_0p8=0.8248 corrupt_frac_t_0p6_0p8=0.1947 acc_corrupt_t_0p8_1p0=0.9451 corrupt_frac_t_0p8_1p0=0.1936 out_w_norm=123.0713 out_g_norm=0.2291 loss_all=1.3714 init_gold_top10=0.4809 init_gold_top100=0.4822
|
| 277 |
+
step=9200 micro_steps=18400 elapsed=70.2s lr=3.000000e-04 loss=2.3259 loss_recon=2.3259 loss_meanflow=0.0000 mean_model_t=0.5045 mean_corrupt_t=0.5045 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.7927 corrupt_frac=0.5511 acc_corrupt=0.6366 loss_corrupt=2.3259 wrong_frac=0.4965 init_acc_corrupt=0.4696 acc_corrupt_t_0p0_0p2=0.2594 corrupt_frac_t_0p0_0p2=0.1972 acc_corrupt_t_0p2_0p4=0.4692 corrupt_frac_t_0p2_0p4=0.1998 acc_corrupt_t_0p4_0p6=0.6739 corrupt_frac_t_0p4_0p6=0.1932 acc_corrupt_t_0p6_0p8=0.8199 corrupt_frac_t_0p6_0p8=0.2072 acc_corrupt_t_0p8_1p0=0.9454 corrupt_frac_t_0p8_1p0=0.2027 out_w_norm=123.9123 out_g_norm=0.2262 loss_all=1.5400 init_gold_top10=0.4510 init_gold_top100=0.4583
|
| 278 |
+
step=9300 micro_steps=18600 elapsed=34.1s lr=3.000000e-04 loss=2.3463 loss_recon=2.3463 loss_meanflow=0.0000 mean_model_t=0.5010 mean_corrupt_t=0.5010 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.7919 corrupt_frac=0.5482 acc_corrupt=0.6336 loss_corrupt=2.3463 wrong_frac=0.4998 init_acc_corrupt=0.4667 acc_corrupt_t_0p0_0p2=0.2590 corrupt_frac_t_0p0_0p2=0.1962 acc_corrupt_t_0p2_0p4=0.4643 corrupt_frac_t_0p2_0p4=0.1982 acc_corrupt_t_0p4_0p6=0.6674 corrupt_frac_t_0p4_0p6=0.2031 acc_corrupt_t_0p6_0p8=0.8221 corrupt_frac_t_0p6_0p8=0.2058 acc_corrupt_t_0p8_1p0=0.9455 corrupt_frac_t_0p8_1p0=0.1967 out_w_norm=124.7542 out_g_norm=0.2260 loss_all=1.4271 init_gold_top10=0.4819 init_gold_top100=0.4877
|
| 279 |
+
step=9400 micro_steps=18800 elapsed=27.4s lr=3.000000e-04 loss=2.3302 loss_recon=2.3302 loss_meanflow=0.0000 mean_model_t=0.5012 mean_corrupt_t=0.5012 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.7930 corrupt_frac=0.5492 acc_corrupt=0.6357 loss_corrupt=2.3302 wrong_frac=0.4982 init_acc_corrupt=0.4671 acc_corrupt_t_0p0_0p2=0.2623 corrupt_frac_t_0p0_0p2=0.1966 acc_corrupt_t_0p2_0p4=0.4667 corrupt_frac_t_0p2_0p4=0.2038 acc_corrupt_t_0p4_0p6=0.6748 corrupt_frac_t_0p4_0p6=0.1944 acc_corrupt_t_0p6_0p8=0.8216 corrupt_frac_t_0p6_0p8=0.2031 acc_corrupt_t_0p8_1p0=0.9449 corrupt_frac_t_0p8_1p0=0.2020 out_w_norm=125.5937 out_g_norm=0.2231 loss_all=1.3928 init_gold_top10=0.4629 init_gold_top100=0.4680
|
| 280 |
+
step=9500 micro_steps=19000 elapsed=27.4s lr=3.000000e-04 loss=2.3529 loss_recon=2.3529 loss_meanflow=0.0000 mean_model_t=0.4974 mean_corrupt_t=0.4974 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.7908 corrupt_frac=0.5505 acc_corrupt=0.6326 loss_corrupt=2.3529 wrong_frac=0.5016 init_acc_corrupt=0.4631 acc_corrupt_t_0p0_0p2=0.2617 corrupt_frac_t_0p0_0p2=0.2000 acc_corrupt_t_0p2_0p4=0.4665 corrupt_frac_t_0p2_0p4=0.2041 acc_corrupt_t_0p4_0p6=0.6752 corrupt_frac_t_0p4_0p6=0.1975 acc_corrupt_t_0p6_0p8=0.8242 corrupt_frac_t_0p6_0p8=0.2045 acc_corrupt_t_0p8_1p0=0.9448 corrupt_frac_t_0p8_1p0=0.1939 out_w_norm=126.4298 out_g_norm=0.2226 loss_all=0.9248 init_gold_top10=0.5973 init_gold_top100=0.6017
|
| 281 |
+
step=9600 micro_steps=19200 elapsed=27.4s lr=3.000000e-04 loss=2.3466 loss_recon=2.3466 loss_meanflow=0.0000 mean_model_t=0.4996 mean_corrupt_t=0.4996 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.7915 corrupt_frac=0.5494 acc_corrupt=0.6334 loss_corrupt=2.3466 wrong_frac=0.5013 init_acc_corrupt=0.4642 acc_corrupt_t_0p0_0p2=0.2616 corrupt_frac_t_0p0_0p2=0.2035 acc_corrupt_t_0p2_0p4=0.4715 corrupt_frac_t_0p2_0p4=0.1983 acc_corrupt_t_0p4_0p6=0.6750 corrupt_frac_t_0p4_0p6=0.2005 acc_corrupt_t_0p6_0p8=0.8212 corrupt_frac_t_0p6_0p8=0.1983 acc_corrupt_t_0p8_1p0=0.9452 corrupt_frac_t_0p8_1p0=0.1994 out_w_norm=127.2573 out_g_norm=0.2219 loss_all=1.2936 init_gold_top10=0.4978 init_gold_top100=0.5047
|
| 282 |
+
step=9700 micro_steps=19400 elapsed=27.4s lr=3.000000e-04 loss=2.3612 loss_recon=2.3612 loss_meanflow=0.0000 mean_model_t=0.5013 mean_corrupt_t=0.5013 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.7885 corrupt_frac=0.5549 acc_corrupt=0.6314 loss_corrupt=2.3612 wrong_frac=0.5011 init_acc_corrupt=0.4637 acc_corrupt_t_0p0_0p2=0.2584 corrupt_frac_t_0p0_0p2=0.2057 acc_corrupt_t_0p2_0p4=0.4643 corrupt_frac_t_0p2_0p4=0.1964 acc_corrupt_t_0p4_0p6=0.6723 corrupt_frac_t_0p4_0p6=0.2019 acc_corrupt_t_0p6_0p8=0.8243 corrupt_frac_t_0p6_0p8=0.1903 acc_corrupt_t_0p8_1p0=0.9455 corrupt_frac_t_0p8_1p0=0.2057 out_w_norm=128.0768 out_g_norm=0.2199 loss_all=1.3285 init_gold_top10=0.5194 init_gold_top100=0.5258
|
| 283 |
+
step=9800 micro_steps=19600 elapsed=27.4s lr=3.000000e-04 loss=2.3266 loss_recon=2.3266 loss_meanflow=0.0000 mean_model_t=0.5013 mean_corrupt_t=0.5013 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.7930 corrupt_frac=0.5506 acc_corrupt=0.6364 loss_corrupt=2.3266 wrong_frac=0.4983 init_acc_corrupt=0.4679 acc_corrupt_t_0p0_0p2=0.2599 corrupt_frac_t_0p0_0p2=0.1971 acc_corrupt_t_0p2_0p4=0.4695 corrupt_frac_t_0p2_0p4=0.1987 acc_corrupt_t_0p4_0p6=0.6735 corrupt_frac_t_0p4_0p6=0.2029 acc_corrupt_t_0p6_0p8=0.8234 corrupt_frac_t_0p6_0p8=0.2000 acc_corrupt_t_0p8_1p0=0.9465 corrupt_frac_t_0p8_1p0=0.2013 out_w_norm=128.8904 out_g_norm=0.2186 loss_all=1.3948 init_gold_top10=0.4819 init_gold_top100=0.4894
|
| 284 |
+
step=9900 micro_steps=19800 elapsed=27.4s lr=3.000000e-04 loss=2.3585 loss_recon=2.3585 loss_meanflow=0.0000 mean_model_t=0.5004 mean_corrupt_t=0.5004 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.7901 corrupt_frac=0.5512 acc_corrupt=0.6317 loss_corrupt=2.3585 wrong_frac=0.5024 init_acc_corrupt=0.4626 acc_corrupt_t_0p0_0p2=0.2562 corrupt_frac_t_0p0_0p2=0.1971 acc_corrupt_t_0p2_0p4=0.4647 corrupt_frac_t_0p2_0p4=0.2067 acc_corrupt_t_0p4_0p6=0.6758 corrupt_frac_t_0p4_0p6=0.1989 acc_corrupt_t_0p6_0p8=0.8211 corrupt_frac_t_0p6_0p8=0.1996 acc_corrupt_t_0p8_1p0=0.9447 corrupt_frac_t_0p8_1p0=0.1978 out_w_norm=129.7026 out_g_norm=0.2175 loss_all=1.5547 init_gold_top10=0.4674 init_gold_top100=0.4739
|
| 285 |
+
step=10000 micro_steps=20000 elapsed=27.4s lr=3.000000e-04 loss=2.3542 loss_recon=2.3542 loss_meanflow=0.0000 mean_model_t=0.4992 mean_corrupt_t=0.4992 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.7899 corrupt_frac=0.5529 acc_corrupt=0.6324 loss_corrupt=2.3542 wrong_frac=0.5019 init_acc_corrupt=0.4632 acc_corrupt_t_0p0_0p2=0.2579 corrupt_frac_t_0p0_0p2=0.2033 acc_corrupt_t_0p2_0p4=0.4706 corrupt_frac_t_0p2_0p4=0.1992 acc_corrupt_t_0p4_0p6=0.6733 corrupt_frac_t_0p4_0p6=0.2027 acc_corrupt_t_0p6_0p8=0.8253 corrupt_frac_t_0p6_0p8=0.1921 acc_corrupt_t_0p8_1p0=0.9435 corrupt_frac_t_0p8_1p0=0.2027 out_w_norm=130.5149 out_g_norm=0.2155 loss_all=1.3325 init_gold_top10=0.5370 init_gold_top100=0.5441
|
LTA_openwebtext_dualt/logs/lta_lm1b_classic_dirichlet_len512_gbs512_4gpu_20k_save1k_20260523.log
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
LTA_openwebtext_dualt/logs/lta_lm1b_dirichlet_categorical_fullvocab_c1024_fullycoupled_flmpack_onehot_hardce_ddit_small_len128_gbs512_4gpu_1m_nw0.resume_20260508.nohup.log
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
LTA_openwebtext_dualt/logs/lta_lm1b_dirichlet_categorical_fullvocab_c16p0_dualt_flmpack_onehot_hardce_ddit_small_len128_gbs512_4gpu_1m_nw0.log
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
LTA_openwebtext_dualt/logs/lta_lm1b_dirichlet_len1024_Cv_to_2v_nosep_gbs512_4gpu_20k_save1k_gumbelwatch_20260525_watcher.log
ADDED
|
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[watch-gumbel] run_dir=runs/lta_lm1b_dirichlet_len1024_Cv_to_2v_nosep_gbs512_4gpu_20k_save1k_gumbelwatch_20260525
|
| 2 |
+
[watch-gumbel] out_base=docs/lta_samples/metrics_20260525/lm1b_dirichlet_len1024_Cv_to_2v_nosep_every1k_sde_gumbel_topp0p95_tau1p0_to_0p2_blend_c30522_61044_n128/lta_lm1b_dirichlet_len1024_Cv_to_2v_nosep_gbs512_4gpu_20k_save1k_gumbelwatch_20260525
|
| 3 |
+
[watch-gumbel] interval=1000 max_len=1024 steps=128 c=30522->61044 temp=1.45 top_p=0.95 tau=1.0->0.2 n=128
|
| 4 |
+
[watch-gumbel] 2026-05-25_16:07:24 no ckpt yet
|
| 5 |
+
[watch-gumbel] 2026-05-25_16:07:54 no ckpt yet
|
LTA_openwebtext_dualt/logs/lta_owt_c1024_len1024_t0to1_lowk64plus_noall_buf1000_gbs128_4gpu_20k.log
ADDED
|
@@ -0,0 +1,109 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
*****************************************
|
| 3 |
+
Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed.
|
| 4 |
+
*****************************************
|
| 5 |
+
NCCL version 2.25.1+cuda12.8
|
| 6 |
+
{
|
| 7 |
+
"device": "cuda:0",
|
| 8 |
+
"rank": 0,
|
| 9 |
+
"world_size": 4,
|
| 10 |
+
"samples": "wrapped_stream_online_shuffle:1000",
|
| 11 |
+
"vocab_size": 50257,
|
| 12 |
+
"save_dir": "runs/lta_owt_c1024_len1024_t0to1_lowk64plus_noall_buf1000_gbs128_4gpu_20k",
|
| 13 |
+
"batch_size": 16,
|
| 14 |
+
"grad_accum": 2,
|
| 15 |
+
"effective_batch_size": 128,
|
| 16 |
+
"global_batch_size": 128,
|
| 17 |
+
"lr_schedule": "constant_warmup",
|
| 18 |
+
"warmup_steps": 250,
|
| 19 |
+
"adam_beta1": 0.9,
|
| 20 |
+
"adam_beta2": 0.999,
|
| 21 |
+
"adam_eps": 1e-08,
|
| 22 |
+
"model_type": "ddit",
|
| 23 |
+
"dual_t": true,
|
| 24 |
+
"corrupt_t_mode": "same",
|
| 25 |
+
"corrupt_min_t": 0.0,
|
| 26 |
+
"corrupt_max_t": 1.0,
|
| 27 |
+
"dirichlet_endpoint_mode": "categorical_dual_t",
|
| 28 |
+
"dirichlet_semantic_t_mode": "same",
|
| 29 |
+
"dirichlet_semantic_t_value": 0.0,
|
| 30 |
+
"categorical_wrong_from_full_vocab": true,
|
| 31 |
+
"simplex_bridge_sampler": "dirichlet",
|
| 32 |
+
"logistic_normal_sigma_min": 0.18,
|
| 33 |
+
"logistic_normal_sigma_max": 2.2,
|
| 34 |
+
"logistic_normal_tau_min": 0.65,
|
| 35 |
+
"logistic_normal_tau_max": 1.15,
|
| 36 |
+
"torch_compile": false,
|
| 37 |
+
"compile_mode": "max-autotune",
|
| 38 |
+
"state_format": "prob",
|
| 39 |
+
"target_loss": "hard_ce",
|
| 40 |
+
"meanflow_weight": 0.0,
|
| 41 |
+
"bridge_noise_init": "logistic_normal",
|
| 42 |
+
"noise_sigma": -1.0,
|
| 43 |
+
"wrap": true,
|
| 44 |
+
"wrap_mode": "stream",
|
| 45 |
+
"wrap_record_buffer_size": 200,
|
| 46 |
+
"owt_cached_chunks": false,
|
| 47 |
+
"owt_chunk_cache_dir": "",
|
| 48 |
+
"owt_chunk_cache_rebuild": false,
|
| 49 |
+
"owt_chunk_cache_write_batch": 4096,
|
| 50 |
+
"online_chunk_shuffle": true,
|
| 51 |
+
"online_chunk_shuffle_buffer": 1000,
|
| 52 |
+
"openwebtext_split": "train_minus_100k",
|
| 53 |
+
"detokenizer": "auto",
|
| 54 |
+
"resolved_detokenizer": null,
|
| 55 |
+
"num_workers": 0,
|
| 56 |
+
"latest_every": 500,
|
| 57 |
+
"resume_path": ""
|
| 58 |
+
}
|
| 59 |
+
step=100 micro_steps=200 elapsed=162.1s lr=1.212000e-04 loss_all=9.3275 acc_all=0.2237 loss_corrupt=9.3412 acc_corrupt=0.1930 corrupt_frac=0.8537 loss=9.3412 loss_recon=9.3412 loss_meanflow=0.0000 mean_model_t=0.4965 mean_corrupt_t=0.4965 wrong_frac=0.5041 init_acc_corrupt=0.4597 init_gold_top10=0.4902 init_gold_top100=0.5211
|
| 60 |
+
step=200 micro_steps=400 elapsed=158.8s lr=2.412000e-04 loss_all=4.5619 acc_all=0.4461 loss_corrupt=5.0229 acc_corrupt=0.3884 corrupt_frac=0.8532 loss=5.0229 loss_recon=5.0229 loss_meanflow=0.0000 mean_model_t=0.5034 mean_corrupt_t=0.5034 wrong_frac=0.4970 init_acc_corrupt=0.4681 init_gold_top10=0.4975 init_gold_top100=0.5268
|
| 61 |
+
step=300 micro_steps=600 elapsed=159.0s lr=3.000000e-04 loss_all=3.8212 acc_all=0.5336 loss_corrupt=4.3806 acc_corrupt=0.4632 corrupt_frac=0.8543 loss=4.3806 loss_recon=4.3806 loss_meanflow=0.0000 mean_model_t=0.4884 mean_corrupt_t=0.4884 wrong_frac=0.5112 init_acc_corrupt=0.4526 init_gold_top10=0.4828 init_gold_top100=0.5138
|
| 62 |
+
step=400 micro_steps=800 elapsed=158.9s lr=3.000000e-04 loss_all=3.6133 acc_all=0.5530 loss_corrupt=4.1389 acc_corrupt=0.4868 corrupt_frac=0.8538 loss=4.1389 loss_recon=4.1389 loss_meanflow=0.0000 mean_model_t=0.5021 mean_corrupt_t=0.5021 wrong_frac=0.4981 init_acc_corrupt=0.4670 init_gold_top10=0.4962 init_gold_top100=0.5259
|
| 63 |
+
step=500 micro_steps=1000 elapsed=158.8s lr=3.000000e-04 loss_all=3.4221 acc_all=0.5664 loss_corrupt=3.9294 acc_corrupt=0.5013 corrupt_frac=0.8544 loss=3.9294 loss_recon=3.9294 loss_meanflow=0.0000 mean_model_t=0.5030 mean_corrupt_t=0.5030 wrong_frac=0.4967 init_acc_corrupt=0.4694 init_gold_top10=0.4978 init_gold_top100=0.5273
|
| 64 |
+
step=600 micro_steps=1200 elapsed=168.6s lr=3.000000e-04 loss_all=3.3797 acc_all=0.5645 loss_corrupt=3.8939 acc_corrupt=0.4976 corrupt_frac=0.8520 loss=3.8939 loss_recon=3.8939 loss_meanflow=0.0000 mean_model_t=0.4915 mean_corrupt_t=0.4915 wrong_frac=0.5084 init_acc_corrupt=0.4563 init_gold_top10=0.4857 init_gold_top100=0.5166
|
| 65 |
+
step=700 micro_steps=1400 elapsed=189.3s lr=3.000000e-04 loss_all=3.2705 acc_all=0.5742 loss_corrupt=3.7647 acc_corrupt=0.5091 corrupt_frac=0.8531 loss=3.7647 loss_recon=3.7647 loss_meanflow=0.0000 mean_model_t=0.4981 mean_corrupt_t=0.4981 wrong_frac=0.5014 init_acc_corrupt=0.4640 init_gold_top10=0.4929 init_gold_top100=0.5226
|
| 66 |
+
step=800 micro_steps=1600 elapsed=179.9s lr=3.000000e-04 loss_all=3.1737 acc_all=0.5835 loss_corrupt=3.6518 acc_corrupt=0.5201 corrupt_frac=0.8531 loss=3.6518 loss_recon=3.6518 loss_meanflow=0.0000 mean_model_t=0.5055 mean_corrupt_t=0.5055 wrong_frac=0.4946 init_acc_corrupt=0.4715 init_gold_top10=0.4996 init_gold_top100=0.5290
|
| 67 |
+
step=900 micro_steps=1800 elapsed=214.4s lr=3.000000e-04 loss_all=3.2227 acc_all=0.5736 loss_corrupt=3.7010 acc_corrupt=0.5098 corrupt_frac=0.8542 loss=3.7010 loss_recon=3.7010 loss_meanflow=0.0000 mean_model_t=0.4934 mean_corrupt_t=0.4934 wrong_frac=0.5070 init_acc_corrupt=0.4557 init_gold_top10=0.4871 init_gold_top100=0.5179
|
| 68 |
+
step=1000 micro_steps=2000 elapsed=95.4s lr=3.000000e-04 loss_all=3.1676 acc_all=0.5790 loss_corrupt=3.6268 acc_corrupt=0.5176 corrupt_frac=0.8562 loss=3.6268 loss_recon=3.6268 loss_meanflow=0.0000 mean_model_t=0.4973 mean_corrupt_t=0.4973 wrong_frac=0.5026 init_acc_corrupt=0.4617 init_gold_top10=0.4916 init_gold_top100=0.5216
|
| 69 |
+
step=1100 micro_steps=2200 elapsed=178.3s lr=3.000000e-04 loss_all=3.1324 acc_all=0.5816 loss_corrupt=3.5934 acc_corrupt=0.5198 corrupt_frac=0.8536 loss=3.5934 loss_recon=3.5934 loss_meanflow=0.0000 mean_model_t=0.4962 mean_corrupt_t=0.4962 wrong_frac=0.5036 init_acc_corrupt=0.4611 init_gold_top10=0.4906 init_gold_top100=0.5200
|
| 70 |
+
step=1200 micro_steps=2400 elapsed=118.6s lr=3.000000e-04 loss_all=3.1075 acc_all=0.5830 loss_corrupt=3.5610 acc_corrupt=0.5219 corrupt_frac=0.8547 loss=3.5610 loss_recon=3.5610 loss_meanflow=0.0000 mean_model_t=0.4980 mean_corrupt_t=0.4980 wrong_frac=0.5023 init_acc_corrupt=0.4631 init_gold_top10=0.4916 init_gold_top100=0.5237
|
| 71 |
+
step=1300 micro_steps=2600 elapsed=90.6s lr=3.000000e-04 loss_all=3.0150 acc_all=0.5936 loss_corrupt=3.4552 acc_corrupt=0.5338 corrupt_frac=0.8556 loss=3.4552 loss_recon=3.4552 loss_meanflow=0.0000 mean_model_t=0.5084 mean_corrupt_t=0.5084 wrong_frac=0.4914 init_acc_corrupt=0.4744 init_gold_top10=0.5032 init_gold_top100=0.5323
|
| 72 |
+
step=1400 micro_steps=2800 elapsed=90.7s lr=3.000000e-04 loss_all=3.0326 acc_all=0.5915 loss_corrupt=3.4759 acc_corrupt=0.5313 corrupt_frac=0.8557 loss=3.4759 loss_recon=3.4759 loss_meanflow=0.0000 mean_model_t=0.5049 mean_corrupt_t=0.5049 wrong_frac=0.4949 init_acc_corrupt=0.4712 init_gold_top10=0.4996 init_gold_top100=0.5288
|
| 73 |
+
step=1500 micro_steps=3000 elapsed=90.5s lr=3.000000e-04 loss_all=3.0691 acc_all=0.5842 loss_corrupt=3.5178 acc_corrupt=0.5230 corrupt_frac=0.8549 loss=3.5178 loss_recon=3.5178 loss_meanflow=0.0000 mean_model_t=0.4958 mean_corrupt_t=0.4958 wrong_frac=0.5043 init_acc_corrupt=0.4610 init_gold_top10=0.4897 init_gold_top100=0.5209
|
| 74 |
+
step=1600 micro_steps=3200 elapsed=108.0s lr=3.000000e-04 loss_all=3.0519 acc_all=0.5855 loss_corrupt=3.4965 acc_corrupt=0.5246 corrupt_frac=0.8562 loss=3.4965 loss_recon=3.4965 loss_meanflow=0.0000 mean_model_t=0.4953 mean_corrupt_t=0.4953 wrong_frac=0.5044 init_acc_corrupt=0.4601 init_gold_top10=0.4897 init_gold_top100=0.5203
|
| 75 |
+
step=1700 micro_steps=3400 elapsed=90.8s lr=3.000000e-04 loss_all=2.9948 acc_all=0.5920 loss_corrupt=3.4342 acc_corrupt=0.5316 corrupt_frac=0.8556 loss=3.4342 loss_recon=3.4342 loss_meanflow=0.0000 mean_model_t=0.5022 mean_corrupt_t=0.5022 wrong_frac=0.4978 init_acc_corrupt=0.4671 init_gold_top10=0.4967 init_gold_top100=0.5262
|
| 76 |
+
W0512 00:36:09.894000 862137 torch/distributed/elastic/agent/server/api.py:719] Received 15 death signal, shutting down workers
|
| 77 |
+
W0512 00:36:09.896000 862137 torch/distributed/elastic/multiprocessing/api.py:898] Sending process 862206 closing signal SIGTERM
|
| 78 |
+
W0512 00:36:09.897000 862137 torch/distributed/elastic/multiprocessing/api.py:898] Sending process 862207 closing signal SIGTERM
|
| 79 |
+
W0512 00:36:09.897000 862137 torch/distributed/elastic/multiprocessing/api.py:898] Sending process 862208 closing signal SIGTERM
|
| 80 |
+
W0512 00:36:09.898000 862137 torch/distributed/elastic/multiprocessing/api.py:898] Sending process 862209 closing signal SIGTERM
|
| 81 |
+
Traceback (most recent call last):
|
| 82 |
+
File "<frozen runpy>", line 198, in _run_module_as_main
|
| 83 |
+
File "<frozen runpy>", line 88, in _run_code
|
| 84 |
+
File "/usr/local/lib/python3.12/dist-packages/torch/distributed/run.py", line 922, in <module>
|
| 85 |
+
main()
|
| 86 |
+
File "/usr/local/lib/python3.12/dist-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 355, in wrapper
|
| 87 |
+
return f(*args, **kwargs)
|
| 88 |
+
^^^^^^^^^^^^^^^^^^
|
| 89 |
+
File "/usr/local/lib/python3.12/dist-packages/torch/distributed/run.py", line 918, in main
|
| 90 |
+
run(args)
|
| 91 |
+
File "/usr/local/lib/python3.12/dist-packages/torch/distributed/run.py", line 909, in run
|
| 92 |
+
elastic_launch(
|
| 93 |
+
File "/usr/local/lib/python3.12/dist-packages/torch/distributed/launcher/api.py", line 139, in __call__
|
| 94 |
+
return launch_agent(self._config, self._entrypoint, list(args))
|
| 95 |
+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 96 |
+
File "/usr/local/lib/python3.12/dist-packages/torch/distributed/launcher/api.py", line 261, in launch_agent
|
| 97 |
+
result = agent.run()
|
| 98 |
+
^^^^^^^^^^^
|
| 99 |
+
File "/usr/local/lib/python3.12/dist-packages/torch/distributed/elastic/metrics/api.py", line 137, in wrapper
|
| 100 |
+
result = f(*args, **kwargs)
|
| 101 |
+
^^^^^^^^^^^^^^^^^^
|
| 102 |
+
File "/usr/local/lib/python3.12/dist-packages/torch/distributed/elastic/agent/server/api.py", line 711, in run
|
| 103 |
+
result = self._invoke_run(role)
|
| 104 |
+
^^^^^^^^^^^^^^^^^^^^^^
|
| 105 |
+
File "/usr/local/lib/python3.12/dist-packages/torch/distributed/elastic/agent/server/api.py", line 870, in _invoke_run
|
| 106 |
+
time.sleep(monitor_interval)
|
| 107 |
+
File "/usr/local/lib/python3.12/dist-packages/torch/distributed/elastic/multiprocessing/api.py", line 84, in _terminate_process_handler
|
| 108 |
+
raise SignalException(f"Process {os.getpid()} got signal: {sigval}", sigval=sigval)
|
| 109 |
+
torch.distributed.elastic.multiprocessing.api.SignalException: Process 862137 got signal: 15
|
LTA_openwebtext_dualt/logs/lta_owt_classic_fullvocab_bert_c1024_len128_gbs512_4gpu_1m_save1k_20260521_210848.log
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
LTA_openwebtext_dualt/logs/lta_owt_dirichlet_categorical_fullvocab_c1024_fullycoupled_flmpack_onehot_hardce_ddit_small_len1024_gbs512_8gpu_1m_nw4.log
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
LTA_openwebtext_dualt/logs/lta_owt_dirichlet_categorical_fullvocab_c1024_fullycoupled_online_shuffle_len128_gbs512_4gpu_1m_nw2_buf20k.log
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|