Add files using upload-large-folder tool
Browse filesThis view is limited to 50 files because it contains too many changes. See raw diff
- LTA_openwebtext_dualt/logs/ar_lm1b_flmpack_bert_small_len128_gbs512_4gpu_1m_20260504_195806.log +305 -0
- LTA_openwebtext_dualt/logs/lta_lm1b_classic_dirichlet_len256_gbs512_4gpu_10k_save1k_20260523.train.pid +1 -0
- LTA_openwebtext_dualt/logs/lta_owt_dirichlet_categorical_fullvocab_c1024_fullycoupled_shufchunks_len128_gbs512_8gpu_1m.log +0 -0
- LTA_openwebtext_dualt/logs/lta_owt_gpt2cached_len1024_rollout1_p1_bench4gpu_20260513_152806.log +103 -0
- LTA_openwebtext_dualt/logs/lta_owt_t5_absrope_adaln_dirichlet_len1024_Cv_to_2v_mask0p1_1p0_sameT_gbs512_b32_8gpu_1m_save10k_20260525_watcher.log +458 -0
- LTA_openwebtext_dualt/logs/owt_candidate_catdualt_step246k_64_c1024_t1p2_blend_n64.log +3 -0
- LTA_openwebtext_dualt/logs/softendpoint_mn_pilot_4gpu/smoke_gpt2_softendpoint_mn_n128_onehot.log +94 -0
- LTA_openwebtext_dualt/logs/softendpoint_mn_pilot_4gpu/train8_combo_len256_dirichlet_unigram_shared_highC_20260517_170456.log +197 -0
- LTA_openwebtext_dualt/logs/softendpoint_mn_pilot_4gpu/train8_combo_len256_logistic_unigram_shared_highC_seqrand_20260517_170456.log +395 -0
- LTA_openwebtext_dualt/logs/softendpoint_mn_pilot_4gpu/train8_ctx1024_p50_path4_unif0_0p125_outwdm1_ctx1024_sampledpath_20260517_223933.log +229 -0
- LTA_openwebtext_dualt/logs/softendpoint_mn_pilot_4gpu/train8_ctx1024_p50_path4_unif0_0p125_outwdm1_ctx1024_sampledpath_true_20260517_224139.log +609 -0
- LTA_openwebtext_dualt/logs/softendpoint_mn_pilot_4gpu/train8_ctx1024_p75_s4_i32_outwdm1_ctx1024_sweep_selected_20260517_210705.log +0 -0
- LTA_openwebtext_dualt/logs/softendpoint_mn_pilot_4gpu/train8_ctx1024_t5tok_p35_rand0_3_unif0_0p25_outwdm1_t5tok_ctx1024_k03_20260518_022728.log +1034 -0
- LTA_openwebtext_dualt/logs/softendpoint_mn_pilot_4gpu/train8_ctx1024_t5tok_p50_rand0_4_unif0_0p25_outwdm1_t5tok_ctx1024_randk_20260518_014620.log +196 -0
- LTA_openwebtext_dualt/logs/softendpoint_mn_pilot_4gpu/train8_ctx1024_tradeoff_p35_unif0_0p25_outwdm1_ctx1024_tradeoff_dual_20260517_225705.log +1024 -0
- LTA_openwebtext_dualt/logs/softendpoint_mn_pilot_4gpu/train8_n1024_compactv2664_3l_bs512_hard_ce_allcorrupt.log +0 -0
- LTA_openwebtext_dualt/logs/softendpoint_mn_pilot_4gpu/train8_n1024_compactv2664_3l_bs512_hard_ce_onehot.log +0 -0
- LTA_openwebtext_dualt/logs/softendpoint_mn_pilot_4gpu/train8_n512_compactv1635_3l_bs512_hard_ce_allcorrupt.log +0 -0
- LTA_openwebtext_dualt/logs/softendpoint_mn_pilot_4gpu/train8_n64_compactv335_3l_bs512_hard_ce_allcorrupt.log +0 -0
- LTA_openwebtext_dualt/logs/softendpoint_mn_pilot_4gpu/train8_n8_compactv47_3l_hard_ce_onehot.log +0 -0
- LTA_openwebtext_dualt/logs/softendpoint_mn_pilot_4gpu/train8_n8_linear_soft_kl_onehot_20260517_train8ctx8_overfit.log +326 -0
- LTA_openwebtext_dualt/logs/softendpoint_mn_pilot_4gpu/train8_noisegeo_len256_allcorrupt_highC64_4096_20260517_163805.log +987 -0
- LTA_openwebtext_dualt/logs/softendpoint_mn_pilot_4gpu/train8_noisegeo_len256_allcorrupt_logistic_sig0p05_0p5_20260517_163805.log +791 -0
- LTA_openwebtext_dualt/logs/softendpoint_mn_pilot_4gpu/train8_noisegeo_len256_allcorrupt_unigram_shared0p5_20260517_163805.log +634 -0
- LTA_openwebtext_dualt/logs/softendpoint_mn_pilot_4gpu/train8_rollin_focused_len256_rollin_p100_s4_i32_20260517_1733focused.log +193 -0
- LTA_openwebtext_dualt/logs/softendpoint_mn_pilot_4gpu/train8_rollin_len1024_rollin_p50_s4_i32_20260517_1840ctx1024.log +397 -0
- LTA_openwebtext_dualt/logs/softendpoint_mn_pilot_4gpu/train8_rollin_len1024_rollin_p50_s4_i32_20260517_1855ctx1024bs128.log +0 -0
- LTA_openwebtext_dualt/logs/softendpoint_mn_pilot_4gpu/train8_rollin_synct_len256_synct_p50_s8_i64_20260517_1800synct.log +224 -0
- LTA_openwebtext_dualt/logs/softendpoint_mn_pilot_4gpu/train8_wrongfloor_len256_wrongfloor0p3_20260517_1815wrongfloor.log +199 -0
- LTA_openwebtext_dualt/mini_owt_logdirichlet/.venv_qwen35_uv/lib/python3.12/site-packages/hf_xet-1.5.0.dist-info/RECORD +9 -0
- LTA_openwebtext_dualt/mini_owt_logdirichlet/.venv_qwen35_uv/lib/python3.12/site-packages/hf_xet-1.5.0.dist-info/WHEEL +5 -0
- LTA_openwebtext_dualt/mini_owt_logdirichlet/.venv_qwen35_uv/lib/python3.12/site-packages/httpcore/_backends/__init__.py +0 -0
- LTA_openwebtext_dualt/mini_owt_logdirichlet/.venv_qwen35_uv/lib/python3.12/site-packages/httpcore/_backends/anyio.py +146 -0
- LTA_openwebtext_dualt/mini_owt_logdirichlet/.venv_qwen35_uv/lib/python3.12/site-packages/httpcore/_backends/auto.py +52 -0
- LTA_openwebtext_dualt/mini_owt_logdirichlet/.venv_qwen35_uv/lib/python3.12/site-packages/httpcore/_backends/base.py +101 -0
- LTA_openwebtext_dualt/mini_owt_logdirichlet/.venv_qwen35_uv/lib/python3.12/site-packages/httpcore/_backends/mock.py +143 -0
- LTA_openwebtext_dualt/mini_owt_logdirichlet/.venv_qwen35_uv/lib/python3.12/site-packages/httpcore/_backends/sync.py +241 -0
- LTA_openwebtext_dualt/mini_owt_logdirichlet/.venv_qwen35_uv/lib/python3.12/site-packages/httpcore/_backends/trio.py +159 -0
- LTA_openwebtext_dualt/mini_owt_logdirichlet/.venv_qwen35_uv/lib/python3.12/site-packages/mdurl/__init__.py +18 -0
- LTA_openwebtext_dualt/mini_owt_logdirichlet/.venv_qwen35_uv/lib/python3.12/site-packages/mdurl/_decode.py +104 -0
- LTA_openwebtext_dualt/mini_owt_logdirichlet/.venv_qwen35_uv/lib/python3.12/site-packages/mdurl/_encode.py +85 -0
- LTA_openwebtext_dualt/mini_owt_logdirichlet/.venv_qwen35_uv/lib/python3.12/site-packages/mdurl/_format.py +27 -0
- LTA_openwebtext_dualt/mini_owt_logdirichlet/.venv_qwen35_uv/lib/python3.12/site-packages/mdurl/_parse.py +304 -0
- LTA_openwebtext_dualt/mini_owt_logdirichlet/.venv_qwen35_uv/lib/python3.12/site-packages/mdurl/_url.py +14 -0
- LTA_openwebtext_dualt/mini_owt_logdirichlet/.venv_qwen35_uv/lib/python3.12/site-packages/mdurl/py.typed +1 -0
- LTA_openwebtext_dualt/mini_owt_logdirichlet/.venv_qwen35_uv/lib/python3.12/site-packages/numpy-1.26.4.dist-info/INSTALLER +1 -0
- LTA_openwebtext_dualt/mini_owt_logdirichlet/.venv_qwen35_uv/lib/python3.12/site-packages/numpy-1.26.4.dist-info/LICENSE.txt +971 -0
- LTA_openwebtext_dualt/mini_owt_logdirichlet/.venv_qwen35_uv/lib/python3.12/site-packages/numpy-1.26.4.dist-info/METADATA +1092 -0
- LTA_openwebtext_dualt/mini_owt_logdirichlet/.venv_qwen35_uv/lib/python3.12/site-packages/numpy-1.26.4.dist-info/RECORD +792 -0
- LTA_openwebtext_dualt/mini_owt_logdirichlet/.venv_qwen35_uv/lib/python3.12/site-packages/numpy-1.26.4.dist-info/REQUESTED +0 -0
LTA_openwebtext_dualt/logs/ar_lm1b_flmpack_bert_small_len128_gbs512_4gpu_1m_20260504_195806.log
ADDED
|
@@ -0,0 +1,305 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
*****************************************
|
| 3 |
+
Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed.
|
| 4 |
+
*****************************************
|
| 5 |
+
NCCL version 2.25.1+cuda12.8
|
| 6 |
+
{
|
| 7 |
+
"task": "ar_lm",
|
| 8 |
+
"device": "cuda:0",
|
| 9 |
+
"rank": 0,
|
| 10 |
+
"world_size": 4,
|
| 11 |
+
"samples": "wrapped_streaming",
|
| 12 |
+
"vocab_size": 30522,
|
| 13 |
+
"bos_id": 101,
|
| 14 |
+
"eos_id": 102,
|
| 15 |
+
"save_dir": "runs/ar_lm1b_flmpack_bert_small_len128_gbs512_4gpu_1m_20260504_195806",
|
| 16 |
+
"params": 108440832,
|
| 17 |
+
"batch_size": 64,
|
| 18 |
+
"grad_accum": 2,
|
| 19 |
+
"effective_batch_size": 512,
|
| 20 |
+
"global_batch_size": 512,
|
| 21 |
+
"max_len": 128,
|
| 22 |
+
"wrap": true,
|
| 23 |
+
"text_detokenizer": "lm1b",
|
| 24 |
+
"openwebtext_split": "all",
|
| 25 |
+
"torch_compile": false
|
| 26 |
+
}
|
| 27 |
+
step=20 micro_steps=40 elapsed=4.3s lr=2.520000e-06 loss=10.3798 ppl=32478.6943 acc=0.0027 tokens=8128.0000
|
| 28 |
+
step=40 micro_steps=80 elapsed=3.6s lr=4.920000e-06 loss=9.6982 ppl=16677.9630 acc=0.0420 tokens=8128.0000
|
| 29 |
+
step=60 micro_steps=120 elapsed=3.6s lr=7.320000e-06 loss=9.1128 ppl=9132.2660 acc=0.0511 tokens=8128.0000
|
| 30 |
+
step=80 micro_steps=160 elapsed=3.6s lr=9.720000e-06 loss=8.8143 ppl=6750.1859 acc=0.0650 tokens=8128.0000
|
| 31 |
+
step=100 micro_steps=200 elapsed=3.6s lr=1.212000e-05 loss=8.5221 ppl=5041.4905 acc=0.0799 tokens=8128.0000
|
| 32 |
+
step=120 micro_steps=240 elapsed=3.6s lr=1.452000e-05 loss=8.2581 ppl=3870.1153 acc=0.0867 tokens=8128.0000
|
| 33 |
+
step=140 micro_steps=280 elapsed=3.6s lr=1.692000e-05 loss=7.9724 ppl=2911.8598 acc=0.0919 tokens=8128.0000
|
| 34 |
+
step=160 micro_steps=320 elapsed=3.6s lr=1.932000e-05 loss=7.6568 ppl=2123.6313 acc=0.1067 tokens=8128.0000
|
| 35 |
+
step=180 micro_steps=360 elapsed=3.6s lr=2.172000e-05 loss=7.3502 ppl=1563.0248 acc=0.1163 tokens=8128.0000
|
| 36 |
+
step=200 micro_steps=400 elapsed=3.6s lr=2.412000e-05 loss=7.0837 ppl=1195.9628 acc=0.1251 tokens=8128.0000
|
| 37 |
+
step=220 micro_steps=440 elapsed=3.6s lr=2.652000e-05 loss=6.8824 ppl=976.4170 acc=0.1333 tokens=8128.0000
|
| 38 |
+
step=240 micro_steps=480 elapsed=3.6s lr=2.892000e-05 loss=6.7489 ppl=854.0937 acc=0.1375 tokens=8128.0000
|
| 39 |
+
step=260 micro_steps=520 elapsed=3.6s lr=3.132000e-05 loss=6.6435 ppl=768.6732 acc=0.1441 tokens=8128.0000
|
| 40 |
+
step=280 micro_steps=560 elapsed=3.6s lr=3.372000e-05 loss=6.5504 ppl=700.1528 acc=0.1519 tokens=8128.0000
|
| 41 |
+
step=300 micro_steps=600 elapsed=3.6s lr=3.612000e-05 loss=6.4669 ppl=644.4497 acc=0.1587 tokens=8128.0000
|
| 42 |
+
step=320 micro_steps=640 elapsed=3.6s lr=3.852000e-05 loss=6.3869 ppl=594.5753 acc=0.1655 tokens=8128.0000
|
| 43 |
+
step=340 micro_steps=680 elapsed=3.6s lr=4.092000e-05 loss=6.2992 ppl=544.6934 acc=0.1715 tokens=8128.0000
|
| 44 |
+
step=360 micro_steps=720 elapsed=3.6s lr=4.332000e-05 loss=6.2257 ppl=506.2021 acc=0.1749 tokens=8128.0000
|
| 45 |
+
step=380 micro_steps=760 elapsed=3.6s lr=4.572000e-05 loss=6.1530 ppl=470.6410 acc=0.1788 tokens=8128.0000
|
| 46 |
+
step=400 micro_steps=800 elapsed=3.6s lr=4.812000e-05 loss=6.0747 ppl=435.1815 acc=0.1824 tokens=8128.0000
|
| 47 |
+
step=420 micro_steps=840 elapsed=3.6s lr=5.052000e-05 loss=5.9999 ppl=403.7083 acc=0.1866 tokens=8128.0000
|
| 48 |
+
step=440 micro_steps=880 elapsed=3.6s lr=5.292000e-05 loss=5.9622 ppl=388.8123 acc=0.1894 tokens=8128.0000
|
| 49 |
+
step=460 micro_steps=920 elapsed=3.6s lr=5.532000e-05 loss=5.9080 ppl=368.1609 acc=0.1923 tokens=8128.0000
|
| 50 |
+
step=480 micro_steps=960 elapsed=3.6s lr=5.772000e-05 loss=5.8534 ppl=348.7774 acc=0.1953 tokens=8128.0000
|
| 51 |
+
step=500 micro_steps=1000 elapsed=3.6s lr=6.012000e-05 loss=5.8124 ppl=334.7449 acc=0.1972 tokens=8128.0000
|
| 52 |
+
step=520 micro_steps=1040 elapsed=3.6s lr=6.252000e-05 loss=5.7628 ppl=318.4989 acc=0.1996 tokens=8128.0000
|
| 53 |
+
step=540 micro_steps=1080 elapsed=3.6s lr=6.492000e-05 loss=5.7351 ppl=309.8943 acc=0.2010 tokens=8128.0000
|
| 54 |
+
step=560 micro_steps=1120 elapsed=3.6s lr=6.732000e-05 loss=5.6905 ppl=296.3728 acc=0.2039 tokens=8128.0000
|
| 55 |
+
step=580 micro_steps=1160 elapsed=3.6s lr=6.972000e-05 loss=5.6509 ppl=284.9865 acc=0.2069 tokens=8128.0000
|
| 56 |
+
step=600 micro_steps=1200 elapsed=3.6s lr=7.212000e-05 loss=5.6142 ppl=274.5673 acc=0.2090 tokens=8128.0000
|
| 57 |
+
step=620 micro_steps=1240 elapsed=3.6s lr=7.452000e-05 loss=5.5644 ppl=261.2549 acc=0.2113 tokens=8128.0000
|
| 58 |
+
step=640 micro_steps=1280 elapsed=3.6s lr=7.692000e-05 loss=5.5398 ppl=254.9341 acc=0.2136 tokens=8128.0000
|
| 59 |
+
step=660 micro_steps=1320 elapsed=3.6s lr=7.932000e-05 loss=5.5199 ppl=249.8425 acc=0.2146 tokens=8128.0000
|
| 60 |
+
step=680 micro_steps=1360 elapsed=3.6s lr=8.172000e-05 loss=5.4786 ppl=239.7250 acc=0.2176 tokens=8128.0000
|
| 61 |
+
step=700 micro_steps=1400 elapsed=3.6s lr=8.412000e-05 loss=5.4594 ppl=235.2477 acc=0.2177 tokens=8128.0000
|
| 62 |
+
step=720 micro_steps=1440 elapsed=3.6s lr=8.652000e-05 loss=5.4230 ppl=226.7282 acc=0.2195 tokens=8128.0000
|
| 63 |
+
step=740 micro_steps=1480 elapsed=3.6s lr=8.892000e-05 loss=5.3881 ppl=219.0196 acc=0.2224 tokens=8128.0000
|
| 64 |
+
step=760 micro_steps=1520 elapsed=3.6s lr=9.132000e-05 loss=5.3681 ppl=214.6676 acc=0.2242 tokens=8128.0000
|
| 65 |
+
step=780 micro_steps=1560 elapsed=3.6s lr=9.372000e-05 loss=5.3561 ppl=212.1429 acc=0.2236 tokens=8128.0000
|
| 66 |
+
step=800 micro_steps=1600 elapsed=3.6s lr=9.612000e-05 loss=5.3154 ppl=203.6174 acc=0.2266 tokens=8128.0000
|
| 67 |
+
step=820 micro_steps=1640 elapsed=3.6s lr=9.852000e-05 loss=5.2852 ppl=197.5881 acc=0.2281 tokens=8128.0000
|
| 68 |
+
step=840 micro_steps=1680 elapsed=3.6s lr=1.009200e-04 loss=5.2622 ppl=193.1013 acc=0.2307 tokens=8128.0000
|
| 69 |
+
step=860 micro_steps=1720 elapsed=3.6s lr=1.033200e-04 loss=5.2473 ppl=190.2556 acc=0.2309 tokens=8128.0000
|
| 70 |
+
step=880 micro_steps=1760 elapsed=3.6s lr=1.057200e-04 loss=5.2202 ppl=185.1611 acc=0.2324 tokens=8128.0000
|
| 71 |
+
step=900 micro_steps=1800 elapsed=3.6s lr=1.081200e-04 loss=5.1907 ppl=179.7384 acc=0.2351 tokens=8128.0000
|
| 72 |
+
step=920 micro_steps=1840 elapsed=3.6s lr=1.105200e-04 loss=5.1668 ppl=175.4561 acc=0.2370 tokens=8128.0000
|
| 73 |
+
step=940 micro_steps=1880 elapsed=3.6s lr=1.129200e-04 loss=5.1512 ppl=172.8847 acc=0.2380 tokens=8128.0000
|
| 74 |
+
step=960 micro_steps=1920 elapsed=3.6s lr=1.153200e-04 loss=5.1224 ppl=167.9127 acc=0.2385 tokens=8128.0000
|
| 75 |
+
step=980 micro_steps=1960 elapsed=3.6s lr=1.177200e-04 loss=5.1084 ppl=165.6339 acc=0.2393 tokens=8128.0000
|
| 76 |
+
step=1000 micro_steps=2000 elapsed=3.6s lr=1.201200e-04 loss=5.0897 ppl=162.5289 acc=0.2409 tokens=8128.0000
|
| 77 |
+
[sample step=1000] [CLS] bash turned on and had five - day splash design. [SEP] the rails of proficiency were eastern, with the drum - smashed races of tif hoc sabbath. [SEP] good could be in liechtenstein, but not 314 of the same floor is imminent. [SEP] kristin fox expired 24 - 15 on beth gregor marsh's celebration and he was volleyed with his being sacked as top - completing practice in oakland. [SEP] passion for poverty and cuts: chief executive the company's office reported that markedly of last year'slight and the administration's potentially derivatives force indicates to become careful friday. [SEP] overall, then the economic compensation program has acquired [SEP]
|
| 78 |
+
step=1020 micro_steps=2040 elapsed=6.4s lr=1.225200e-04 loss=5.0651 ppl=158.5900 acc=0.2426 tokens=8128.0000
|
| 79 |
+
step=1040 micro_steps=2080 elapsed=3.6s lr=1.249200e-04 loss=5.0462 ppl=155.6245 acc=0.2436 tokens=8128.0000
|
| 80 |
+
step=1060 micro_steps=2120 elapsed=3.6s lr=1.273200e-04 loss=5.0489 ppl=156.0155 acc=0.2436 tokens=8128.0000
|
| 81 |
+
step=1080 micro_steps=2160 elapsed=3.6s lr=1.297200e-04 loss=5.0148 ppl=150.8239 acc=0.2466 tokens=8128.0000
|
| 82 |
+
step=1100 micro_steps=2200 elapsed=3.6s lr=1.321200e-04 loss=4.9821 ppl=145.9810 acc=0.2484 tokens=8128.0000
|
| 83 |
+
step=1120 micro_steps=2240 elapsed=3.6s lr=1.345200e-04 loss=4.9698 ppl=144.2456 acc=0.2497 tokens=8128.0000
|
| 84 |
+
step=1140 micro_steps=2280 elapsed=3.6s lr=1.369200e-04 loss=4.9559 ppl=142.2155 acc=0.2496 tokens=8128.0000
|
| 85 |
+
step=1160 micro_steps=2320 elapsed=3.6s lr=1.393200e-04 loss=4.9270 ppl=138.0649 acc=0.2513 tokens=8128.0000
|
| 86 |
+
step=1180 micro_steps=2360 elapsed=3.6s lr=1.417200e-04 loss=4.9200 ppl=137.1738 acc=0.2521 tokens=8128.0000
|
| 87 |
+
step=1200 micro_steps=2400 elapsed=3.6s lr=1.441200e-04 loss=4.9104 ppl=135.8119 acc=0.2530 tokens=8128.0000
|
| 88 |
+
step=1220 micro_steps=2440 elapsed=3.6s lr=1.465200e-04 loss=4.8784 ppl=131.6055 acc=0.2556 tokens=8128.0000
|
| 89 |
+
step=1240 micro_steps=2480 elapsed=3.6s lr=1.489200e-04 loss=4.8727 ppl=130.8100 acc=0.2568 tokens=8128.0000
|
| 90 |
+
step=1260 micro_steps=2520 elapsed=3.6s lr=1.513200e-04 loss=4.8468 ppl=127.4586 acc=0.2564 tokens=8128.0000
|
| 91 |
+
step=1280 micro_steps=2560 elapsed=3.6s lr=1.537200e-04 loss=4.8371 ppl=126.2536 acc=0.2580 tokens=8128.0000
|
| 92 |
+
step=1300 micro_steps=2600 elapsed=3.6s lr=1.561200e-04 loss=4.8055 ppl=122.3074 acc=0.2601 tokens=8128.0000
|
| 93 |
+
step=1320 micro_steps=2640 elapsed=3.6s lr=1.585200e-04 loss=4.8018 ppl=121.8846 acc=0.2605 tokens=8128.0000
|
| 94 |
+
step=1340 micro_steps=2680 elapsed=3.6s lr=1.609200e-04 loss=4.7835 ppl=119.6142 acc=0.2619 tokens=8128.0000
|
| 95 |
+
step=1360 micro_steps=2720 elapsed=3.6s lr=1.633200e-04 loss=4.7571 ppl=116.5595 acc=0.2649 tokens=8128.0000
|
| 96 |
+
step=1380 micro_steps=2760 elapsed=3.6s lr=1.657200e-04 loss=4.7526 ppl=116.0234 acc=0.2641 tokens=8128.0000
|
| 97 |
+
step=1400 micro_steps=2800 elapsed=3.6s lr=1.681200e-04 loss=4.7498 ppl=115.6697 acc=0.2641 tokens=8128.0000
|
| 98 |
+
step=1420 micro_steps=2840 elapsed=3.6s lr=1.705200e-04 loss=4.7416 ppl=114.7103 acc=0.2650 tokens=8128.0000
|
| 99 |
+
step=1440 micro_steps=2880 elapsed=3.6s lr=1.729200e-04 loss=4.7126 ppl=111.4812 acc=0.2663 tokens=8128.0000
|
| 100 |
+
step=1460 micro_steps=2920 elapsed=3.6s lr=1.753200e-04 loss=4.7078 ppl=110.9196 acc=0.2674 tokens=8128.0000
|
| 101 |
+
step=1480 micro_steps=2960 elapsed=3.6s lr=1.777200e-04 loss=4.6839 ppl=108.3107 acc=0.2680 tokens=8128.0000
|
| 102 |
+
step=1500 micro_steps=3000 elapsed=3.6s lr=1.801200e-04 loss=4.6728 ppl=107.1491 acc=0.2698 tokens=8128.0000
|
| 103 |
+
step=1520 micro_steps=3040 elapsed=3.6s lr=1.825200e-04 loss=4.6493 ppl=104.6331 acc=0.2710 tokens=8128.0000
|
| 104 |
+
step=1540 micro_steps=3080 elapsed=3.6s lr=1.849200e-04 loss=4.6501 ppl=104.7323 acc=0.2707 tokens=8128.0000
|
| 105 |
+
step=1560 micro_steps=3120 elapsed=3.6s lr=1.873200e-04 loss=4.6381 ppl=103.4611 acc=0.2718 tokens=8128.0000
|
| 106 |
+
step=1580 micro_steps=3160 elapsed=3.6s lr=1.897200e-04 loss=4.6088 ppl=100.5290 acc=0.2751 tokens=8128.0000
|
| 107 |
+
step=1600 micro_steps=3200 elapsed=3.6s lr=1.921200e-04 loss=4.6027 ppl=99.8427 acc=0.2747 tokens=8128.0000
|
| 108 |
+
step=1620 micro_steps=3240 elapsed=3.6s lr=1.945200e-04 loss=4.5819 ppl=97.7759 acc=0.2764 tokens=8128.0000
|
| 109 |
+
step=1640 micro_steps=3280 elapsed=3.6s lr=1.969200e-04 loss=4.5748 ppl=97.1006 acc=0.2762 tokens=8128.0000
|
| 110 |
+
step=1660 micro_steps=3320 elapsed=3.6s lr=1.993200e-04 loss=4.5683 ppl=96.5347 acc=0.2770 tokens=8128.0000
|
| 111 |
+
step=1680 micro_steps=3360 elapsed=3.6s lr=2.017200e-04 loss=4.5531 ppl=95.0791 acc=0.2777 tokens=8128.0000
|
| 112 |
+
step=1700 micro_steps=3400 elapsed=3.6s lr=2.041200e-04 loss=4.5314 ppl=92.9927 acc=0.2800 tokens=8128.0000
|
| 113 |
+
step=1720 micro_steps=3440 elapsed=3.6s lr=2.065200e-04 loss=4.5209 ppl=92.0741 acc=0.2796 tokens=8128.0000
|
| 114 |
+
step=1740 micro_steps=3480 elapsed=3.6s lr=2.089200e-04 loss=4.5257 ppl=92.4572 acc=0.2809 tokens=8128.0000
|
| 115 |
+
step=1760 micro_steps=3520 elapsed=3.6s lr=2.113200e-04 loss=4.5192 ppl=91.9197 acc=0.2803 tokens=8128.0000
|
| 116 |
+
step=1780 micro_steps=3560 elapsed=3.6s lr=2.137200e-04 loss=4.4929 ppl=89.4739 acc=0.2837 tokens=8128.0000
|
| 117 |
+
step=1800 micro_steps=3600 elapsed=3.6s lr=2.161200e-04 loss=4.4789 ppl=88.3597 acc=0.2846 tokens=8128.0000
|
| 118 |
+
step=1820 micro_steps=3640 elapsed=3.6s lr=2.185200e-04 loss=4.4777 ppl=88.1819 acc=0.2846 tokens=8128.0000
|
| 119 |
+
step=1840 micro_steps=3680 elapsed=3.6s lr=2.209200e-04 loss=4.4645 ppl=86.9802 acc=0.2859 tokens=8128.0000
|
| 120 |
+
step=1860 micro_steps=3720 elapsed=3.6s lr=2.233200e-04 loss=4.4604 ppl=86.6318 acc=0.2861 tokens=8128.0000
|
| 121 |
+
step=1880 micro_steps=3760 elapsed=3.6s lr=2.257200e-04 loss=4.4447 ppl=85.2599 acc=0.2866 tokens=8128.0000
|
| 122 |
+
step=1900 micro_steps=3800 elapsed=3.6s lr=2.281200e-04 loss=4.4276 ppl=83.8217 acc=0.2876 tokens=8128.0000
|
| 123 |
+
step=1920 micro_steps=3840 elapsed=3.6s lr=2.305200e-04 loss=4.4316 ppl=84.1305 acc=0.2883 tokens=8128.0000
|
| 124 |
+
step=1940 micro_steps=3880 elapsed=3.6s lr=2.329200e-04 loss=4.4196 ppl=83.1516 acc=0.2876 tokens=8128.0000
|
| 125 |
+
step=1960 micro_steps=3920 elapsed=3.6s lr=2.353200e-04 loss=4.4023 ppl=81.7102 acc=0.2905 tokens=8128.0000
|
| 126 |
+
step=1980 micro_steps=3960 elapsed=3.6s lr=2.377200e-04 loss=4.4063 ppl=82.0973 acc=0.2905 tokens=8128.0000
|
| 127 |
+
step=2000 micro_steps=4000 elapsed=3.6s lr=2.401200e-04 loss=4.3846 ppl=80.3016 acc=0.2917 tokens=8128.0000
|
| 128 |
+
[sample step=2000] [CLS]t ), and at the moment - - you really don't know the same the best proportions for getting things attention this is going to get hurt. [SEP] mr justice debashev had a positive interest in most of the country. [SEP] it wants to be a useful change. [SEP] the principal names of veteran officers, groups of fans committees and members of congress, because they while not horse - goers will play an important role in obtaining substantial invoking over his federal support, there are no help for the united states to preserve iranian community capacity. [SEP] the pendulum of dieting out is underscore the size of gas [SEP]
|
| 129 |
+
step=2020 micro_steps=4040 elapsed=5.8s lr=2.425200e-04 loss=4.3824 ppl=80.1633 acc=0.2902 tokens=8128.0000
|
| 130 |
+
step=2040 micro_steps=4080 elapsed=3.6s lr=2.449200e-04 loss=4.3680 ppl=78.9561 acc=0.2932 tokens=8128.0000
|
| 131 |
+
step=2060 micro_steps=4120 elapsed=3.6s lr=2.473200e-04 loss=4.3571 ppl=78.1244 acc=0.2932 tokens=8128.0000
|
| 132 |
+
step=2080 micro_steps=4160 elapsed=3.6s lr=2.497200e-04 loss=4.3752 ppl=79.5609 acc=0.2914 tokens=8128.0000
|
| 133 |
+
step=2100 micro_steps=4200 elapsed=3.6s lr=2.521200e-04 loss=4.3523 ppl=77.7967 acc=0.2944 tokens=8128.0000
|
| 134 |
+
step=2120 micro_steps=4240 elapsed=3.6s lr=2.545200e-04 loss=4.3403 ppl=76.8172 acc=0.2947 tokens=8128.0000
|
| 135 |
+
step=2140 micro_steps=4280 elapsed=3.6s lr=2.569200e-04 loss=4.3307 ppl=76.0678 acc=0.2953 tokens=8128.0000
|
| 136 |
+
step=2160 micro_steps=4320 elapsed=3.6s lr=2.593200e-04 loss=4.3392 ppl=76.7835 acc=0.2948 tokens=8128.0000
|
| 137 |
+
step=2180 micro_steps=4360 elapsed=3.6s lr=2.617200e-04 loss=4.3338 ppl=76.3271 acc=0.2947 tokens=8128.0000
|
| 138 |
+
step=2200 micro_steps=4400 elapsed=3.6s lr=2.641200e-04 loss=4.3230 ppl=75.4672 acc=0.2964 tokens=8128.0000
|
| 139 |
+
step=2220 micro_steps=4440 elapsed=3.6s lr=2.665200e-04 loss=4.3048 ppl=74.1440 acc=0.2987 tokens=8128.0000
|
| 140 |
+
step=2240 micro_steps=4480 elapsed=3.6s lr=2.689200e-04 loss=4.2885 ppl=72.9240 acc=0.2989 tokens=8128.0000
|
| 141 |
+
step=2260 micro_steps=4520 elapsed=3.6s lr=2.713200e-04 loss=4.3005 ppl=73.8235 acc=0.2969 tokens=8128.0000
|
| 142 |
+
step=2280 micro_steps=4560 elapsed=3.6s lr=2.737200e-04 loss=4.2877 ppl=72.8882 acc=0.2994 tokens=8128.0000
|
| 143 |
+
step=2300 micro_steps=4600 elapsed=3.6s lr=2.761200e-04 loss=4.2833 ppl=72.5995 acc=0.2992 tokens=8128.0000
|
| 144 |
+
step=2320 micro_steps=4640 elapsed=3.6s lr=2.785200e-04 loss=4.2780 ppl=72.1578 acc=0.2993 tokens=8128.0000
|
| 145 |
+
step=2340 micro_steps=4680 elapsed=3.6s lr=2.809200e-04 loss=4.2667 ppl=71.3834 acc=0.2998 tokens=8128.0000
|
| 146 |
+
step=2360 micro_steps=4720 elapsed=3.6s lr=2.833200e-04 loss=4.2594 ppl=70.8318 acc=0.3021 tokens=8128.0000
|
| 147 |
+
step=2380 micro_steps=4760 elapsed=3.6s lr=2.857200e-04 loss=4.2395 ppl=69.4661 acc=0.3038 tokens=8128.0000
|
| 148 |
+
step=2400 micro_steps=4800 elapsed=3.6s lr=2.881200e-04 loss=4.2583 ppl=70.7645 acc=0.3008 tokens=8128.0000
|
| 149 |
+
step=2420 micro_steps=4840 elapsed=3.6s lr=2.905200e-04 loss=4.2262 ppl=68.5519 acc=0.3055 tokens=8128.0000
|
| 150 |
+
step=2440 micro_steps=4880 elapsed=3.6s lr=2.929200e-04 loss=4.2411 ppl=69.5565 acc=0.3025 tokens=8128.0000
|
| 151 |
+
step=2460 micro_steps=4920 elapsed=3.6s lr=2.953200e-04 loss=4.2159 ppl=67.8834 acc=0.3051 tokens=8128.0000
|
| 152 |
+
step=2480 micro_steps=4960 elapsed=3.6s lr=2.977200e-04 loss=4.2350 ppl=69.1677 acc=0.3034 tokens=8128.0000
|
| 153 |
+
step=2500 micro_steps=5000 elapsed=3.6s lr=3.000000e-04 loss=4.2098 ppl=67.4287 acc=0.3069 tokens=8128.0000
|
| 154 |
+
step=2520 micro_steps=5040 elapsed=3.6s lr=3.000000e-04 loss=4.1991 ppl=66.6870 acc=0.3070 tokens=8128.0000
|
| 155 |
+
step=2540 micro_steps=5080 elapsed=3.6s lr=3.000000e-04 loss=4.2085 ppl=67.3566 acc=0.3055 tokens=8128.0000
|
| 156 |
+
step=2560 micro_steps=5120 elapsed=3.6s lr=3.000000e-04 loss=4.1952 ppl=66.4172 acc=0.3073 tokens=8128.0000
|
| 157 |
+
step=2580 micro_steps=5160 elapsed=3.6s lr=3.000000e-04 loss=4.1903 ppl=66.1340 acc=0.3071 tokens=8128.0000
|
| 158 |
+
step=2600 micro_steps=5200 elapsed=3.6s lr=3.000000e-04 loss=4.1832 ppl=65.6817 acc=0.3084 tokens=8128.0000
|
| 159 |
+
step=2620 micro_steps=5240 elapsed=3.6s lr=3.000000e-04 loss=4.1843 ppl=65.7183 acc=0.3079 tokens=8128.0000
|
| 160 |
+
step=2640 micro_steps=5280 elapsed=3.6s lr=3.000000e-04 loss=4.1928 ppl=66.2759 acc=0.3070 tokens=8128.0000
|
| 161 |
+
step=2660 micro_steps=5320 elapsed=3.6s lr=3.000000e-04 loss=4.1723 ppl=64.9454 acc=0.3093 tokens=8128.0000
|
| 162 |
+
step=2680 micro_steps=5360 elapsed=3.6s lr=3.000000e-04 loss=4.1549 ppl=63.8198 acc=0.3101 tokens=8128.0000
|
| 163 |
+
step=2700 micro_steps=5400 elapsed=3.6s lr=3.000000e-04 loss=4.1537 ppl=63.7657 acc=0.3095 tokens=8128.0000
|
| 164 |
+
step=2720 micro_steps=5440 elapsed=3.6s lr=3.000000e-04 loss=4.1471 ppl=63.3684 acc=0.3103 tokens=8128.0000
|
| 165 |
+
step=2740 micro_steps=5480 elapsed=3.6s lr=3.000000e-04 loss=4.1499 ppl=63.5364 acc=0.3112 tokens=8128.0000
|
| 166 |
+
step=2760 micro_steps=5520 elapsed=3.6s lr=3.000000e-04 loss=4.1403 ppl=62.9134 acc=0.3120 tokens=8128.0000
|
| 167 |
+
step=2780 micro_steps=5560 elapsed=3.6s lr=3.000000e-04 loss=4.1390 ppl=62.8143 acc=0.3123 tokens=8128.0000
|
| 168 |
+
step=2800 micro_steps=5600 elapsed=3.6s lr=3.000000e-04 loss=4.1324 ppl=62.4175 acc=0.3124 tokens=8128.0000
|
| 169 |
+
step=2820 micro_steps=5640 elapsed=3.6s lr=3.000000e-04 loss=4.1207 ppl=61.6718 acc=0.3127 tokens=8128.0000
|
| 170 |
+
step=2840 micro_steps=5680 elapsed=3.6s lr=3.000000e-04 loss=4.1225 ppl=61.7597 acc=0.3127 tokens=8128.0000
|
| 171 |
+
step=2860 micro_steps=5720 elapsed=3.6s lr=3.000000e-04 loss=4.1187 ppl=61.5916 acc=0.3140 tokens=8128.0000
|
| 172 |
+
step=2880 micro_steps=5760 elapsed=3.6s lr=3.000000e-04 loss=4.1156 ppl=61.3453 acc=0.3132 tokens=8128.0000
|
| 173 |
+
step=2900 micro_steps=5800 elapsed=3.6s lr=3.000000e-04 loss=4.0986 ppl=60.3072 acc=0.3161 tokens=8128.0000
|
| 174 |
+
step=2920 micro_steps=5840 elapsed=3.6s lr=3.000000e-04 loss=4.1038 ppl=60.6632 acc=0.3147 tokens=8128.0000
|
| 175 |
+
step=2940 micro_steps=5880 elapsed=3.6s lr=3.000000e-04 loss=4.1071 ppl=60.8467 acc=0.3137 tokens=8128.0000
|
| 176 |
+
step=2960 micro_steps=5920 elapsed=3.6s lr=3.000000e-04 loss=4.0904 ppl=59.8346 acc=0.3163 tokens=8128.0000
|
| 177 |
+
step=2980 micro_steps=5960 elapsed=3.6s lr=3.000000e-04 loss=4.1033 ppl=60.6203 acc=0.3137 tokens=8128.0000
|
| 178 |
+
step=3000 micro_steps=6000 elapsed=3.6s lr=3.000000e-04 loss=4.0854 ppl=59.5390 acc=0.3165 tokens=8128.0000
|
| 179 |
+
[sample step=3000] [CLS] hapless crude, trading from $38. 26 a barrel on reports on immeas roderick lee - - who had seen the big 12 experience when people arrived to check the stock. [SEP] attempted asylum ringleader s. b. son - in - law also linked col. umar with gen. rauci radan, who has until september 11 to attend the celebre jury hijacker in mount wootton, north yorkshire, and recently flown to new york, cuba with an apology from 18 self - proclaimed owners of a home where 13 - year - old gareth culbert abused his job as one of the first [SEP]
|
| 180 |
+
step=3020 micro_steps=6040 elapsed=5.8s lr=3.000000e-04 loss=4.0828 ppl=59.3799 acc=0.3161 tokens=8128.0000
|
| 181 |
+
step=3040 micro_steps=6080 elapsed=3.6s lr=3.000000e-04 loss=4.0786 ppl=59.1482 acc=0.3171 tokens=8128.0000
|
| 182 |
+
step=3060 micro_steps=6120 elapsed=3.6s lr=3.000000e-04 loss=4.0716 ppl=58.7107 acc=0.3180 tokens=8128.0000
|
| 183 |
+
step=3080 micro_steps=6160 elapsed=3.6s lr=3.000000e-04 loss=4.0767 ppl=59.0115 acc=0.3177 tokens=8128.0000
|
| 184 |
+
step=3100 micro_steps=6200 elapsed=3.6s lr=3.000000e-04 loss=4.0632 ppl=58.2278 acc=0.3177 tokens=8128.0000
|
| 185 |
+
step=3120 micro_steps=6240 elapsed=3.6s lr=3.000000e-04 loss=4.0738 ppl=58.8580 acc=0.3168 tokens=8128.0000
|
| 186 |
+
step=3140 micro_steps=6280 elapsed=3.6s lr=3.000000e-04 loss=4.0672 ppl=58.4934 acc=0.3178 tokens=8128.0000
|
| 187 |
+
step=3160 micro_steps=6320 elapsed=3.6s lr=3.000000e-04 loss=4.0560 ppl=57.8210 acc=0.3183 tokens=8128.0000
|
| 188 |
+
step=3180 micro_steps=6360 elapsed=3.6s lr=3.000000e-04 loss=4.0523 ppl=57.6100 acc=0.3202 tokens=8128.0000
|
| 189 |
+
step=3200 micro_steps=6400 elapsed=3.6s lr=3.000000e-04 loss=4.0588 ppl=57.9823 acc=0.3185 tokens=8128.0000
|
| 190 |
+
step=3220 micro_steps=6440 elapsed=3.6s lr=3.000000e-04 loss=4.0476 ppl=57.3425 acc=0.3196 tokens=8128.0000
|
| 191 |
+
step=3240 micro_steps=6480 elapsed=3.6s lr=3.000000e-04 loss=4.0504 ppl=57.5045 acc=0.3200 tokens=8128.0000
|
| 192 |
+
step=3260 micro_steps=6520 elapsed=3.6s lr=3.000000e-04 loss=4.0352 ppl=56.6460 acc=0.3207 tokens=8128.0000
|
| 193 |
+
step=3280 micro_steps=6560 elapsed=3.6s lr=3.000000e-04 loss=4.0359 ppl=56.6621 acc=0.3211 tokens=8128.0000
|
| 194 |
+
step=3300 micro_steps=6600 elapsed=3.6s lr=3.000000e-04 loss=4.0452 ppl=57.1848 acc=0.3194 tokens=8128.0000
|
| 195 |
+
step=3320 micro_steps=6640 elapsed=3.6s lr=3.000000e-04 loss=4.0308 ppl=56.3376 acc=0.3218 tokens=8128.0000
|
| 196 |
+
step=3340 micro_steps=6680 elapsed=3.6s lr=3.000000e-04 loss=4.0391 ppl=56.8609 acc=0.3209 tokens=8128.0000
|
| 197 |
+
step=3360 micro_steps=6720 elapsed=3.6s lr=3.000000e-04 loss=4.0293 ppl=56.2732 acc=0.3222 tokens=8128.0000
|
| 198 |
+
step=3380 micro_steps=6760 elapsed=3.6s lr=3.000000e-04 loss=4.0174 ppl=55.5969 acc=0.3221 tokens=8128.0000
|
| 199 |
+
step=3400 micro_steps=6800 elapsed=3.6s lr=3.000000e-04 loss=4.0322 ppl=56.4517 acc=0.3218 tokens=8128.0000
|
| 200 |
+
step=3420 micro_steps=6840 elapsed=3.6s lr=3.000000e-04 loss=4.0103 ppl=55.2042 acc=0.3222 tokens=8128.0000
|
| 201 |
+
step=3440 micro_steps=6880 elapsed=3.6s lr=3.000000e-04 loss=4.0159 ppl=55.5478 acc=0.3234 tokens=8128.0000
|
| 202 |
+
step=3460 micro_steps=6920 elapsed=3.6s lr=3.000000e-04 loss=4.0053 ppl=54.9606 acc=0.3242 tokens=8128.0000
|
| 203 |
+
step=3480 micro_steps=6960 elapsed=3.6s lr=3.000000e-04 loss=4.0200 ppl=55.7635 acc=0.3221 tokens=8128.0000
|
| 204 |
+
step=3500 micro_steps=7000 elapsed=3.6s lr=3.000000e-04 loss=3.9826 ppl=53.7106 acc=0.3250 tokens=8128.0000
|
| 205 |
+
step=3520 micro_steps=7040 elapsed=3.6s lr=3.000000e-04 loss=4.0046 ppl=54.9312 acc=0.3248 tokens=8128.0000
|
| 206 |
+
step=3540 micro_steps=7080 elapsed=3.6s lr=3.000000e-04 loss=3.9995 ppl=54.6437 acc=0.3252 tokens=8128.0000
|
| 207 |
+
step=3560 micro_steps=7120 elapsed=3.6s lr=3.000000e-04 loss=3.9930 ppl=54.2816 acc=0.3244 tokens=8128.0000
|
| 208 |
+
step=3580 micro_steps=7160 elapsed=3.6s lr=3.000000e-04 loss=3.9871 ppl=53.9546 acc=0.3248 tokens=8128.0000
|
| 209 |
+
step=3600 micro_steps=7200 elapsed=3.6s lr=3.000000e-04 loss=3.9826 ppl=53.7258 acc=0.3260 tokens=8128.0000
|
| 210 |
+
step=3620 micro_steps=7240 elapsed=3.6s lr=3.000000e-04 loss=3.9888 ppl=54.0497 acc=0.3256 tokens=8128.0000
|
| 211 |
+
step=3640 micro_steps=7280 elapsed=3.6s lr=3.000000e-04 loss=3.9819 ppl=53.6996 acc=0.3260 tokens=8128.0000
|
| 212 |
+
step=3660 micro_steps=7320 elapsed=3.6s lr=3.000000e-04 loss=3.9797 ppl=53.5662 acc=0.3270 tokens=8128.0000
|
| 213 |
+
step=3680 micro_steps=7360 elapsed=3.6s lr=3.000000e-04 loss=3.9860 ppl=53.8878 acc=0.3255 tokens=8128.0000
|
| 214 |
+
step=3700 micro_steps=7400 elapsed=3.6s lr=3.000000e-04 loss=3.9782 ppl=53.4761 acc=0.3250 tokens=8128.0000
|
| 215 |
+
step=3720 micro_steps=7440 elapsed=3.6s lr=3.000000e-04 loss=3.9659 ppl=52.8435 acc=0.3281 tokens=8128.0000
|
| 216 |
+
step=3740 micro_steps=7480 elapsed=3.6s lr=3.000000e-04 loss=3.9718 ppl=53.1304 acc=0.3275 tokens=8128.0000
|
| 217 |
+
step=3760 micro_steps=7520 elapsed=3.6s lr=3.000000e-04 loss=3.9885 ppl=54.0326 acc=0.3251 tokens=8128.0000
|
| 218 |
+
step=3780 micro_steps=7560 elapsed=3.6s lr=3.000000e-04 loss=3.9540 ppl=52.2093 acc=0.3283 tokens=8128.0000
|
| 219 |
+
step=3800 micro_steps=7600 elapsed=3.6s lr=3.000000e-04 loss=3.9595 ppl=52.4764 acc=0.3275 tokens=8128.0000
|
| 220 |
+
step=3820 micro_steps=7640 elapsed=3.6s lr=3.000000e-04 loss=3.9700 ppl=53.0521 acc=0.3263 tokens=8128.0000
|
| 221 |
+
step=3840 micro_steps=7680 elapsed=3.6s lr=3.000000e-04 loss=3.9764 ppl=53.4071 acc=0.3256 tokens=8128.0000
|
| 222 |
+
step=3860 micro_steps=7720 elapsed=3.6s lr=3.000000e-04 loss=3.9489 ppl=51.9420 acc=0.3287 tokens=8128.0000
|
| 223 |
+
step=3880 micro_steps=7760 elapsed=3.6s lr=3.000000e-04 loss=3.9492 ppl=51.9533 acc=0.3290 tokens=8128.0000
|
| 224 |
+
step=3900 micro_steps=7800 elapsed=3.6s lr=3.000000e-04 loss=3.9630 ppl=52.6557 acc=0.3273 tokens=8128.0000
|
| 225 |
+
step=3920 micro_steps=7840 elapsed=3.6s lr=3.000000e-04 loss=3.9416 ppl=51.5544 acc=0.3302 tokens=8128.0000
|
| 226 |
+
step=3940 micro_steps=7880 elapsed=3.6s lr=3.000000e-04 loss=3.9534 ppl=52.1683 acc=0.3286 tokens=8128.0000
|
| 227 |
+
step=3960 micro_steps=7920 elapsed=3.6s lr=3.000000e-04 loss=3.9490 ppl=51.9456 acc=0.3284 tokens=8128.0000
|
| 228 |
+
step=3980 micro_steps=7960 elapsed=3.6s lr=3.000000e-04 loss=3.9503 ppl=52.0025 acc=0.3275 tokens=8128.0000
|
| 229 |
+
step=4000 micro_steps=8000 elapsed=3.6s lr=3.000000e-04 loss=3.9315 ppl=51.0298 acc=0.3307 tokens=8128.0000
|
| 230 |
+
[sample step=4000] [CLS] such a showdown, which ended some years later with a trip to the united states. [SEP] worsening the pain in americans - - and hopefully inapturating it - - is hard on those people. [SEP] at one point, it would rise near the corset airport in post - communist north africa, drawing viewers who struggle to cope with tourist hikes. [SEP] he had sheltered some $40, 000 for a farmer from working his way to a nearby park instead of labeling a panda or watching the wild boar (30 feet) instead of the man. [SEP] is affluensky's demand definitely not worth? [SEP] [SEP]
|
| 231 |
+
step=4020 micro_steps=8040 elapsed=5.8s lr=3.000000e-04 loss=3.9312 ppl=51.0517 acc=0.3308 tokens=8128.0000
|
| 232 |
+
step=4040 micro_steps=8080 elapsed=3.6s lr=3.000000e-04 loss=3.9361 ppl=51.2798 acc=0.3303 tokens=8128.0000
|
| 233 |
+
step=4060 micro_steps=8120 elapsed=3.6s lr=3.000000e-04 loss=3.9501 ppl=52.0138 acc=0.3285 tokens=8128.0000
|
| 234 |
+
step=4080 micro_steps=8160 elapsed=3.6s lr=3.000000e-04 loss=3.9394 ppl=51.4709 acc=0.3286 tokens=8128.0000
|
| 235 |
+
step=4100 micro_steps=8200 elapsed=3.6s lr=3.000000e-04 loss=3.9253 ppl=50.7103 acc=0.3307 tokens=8128.0000
|
| 236 |
+
step=4120 micro_steps=8240 elapsed=3.6s lr=3.000000e-04 loss=3.9292 ppl=50.9211 acc=0.3316 tokens=8128.0000
|
| 237 |
+
step=4140 micro_steps=8280 elapsed=3.6s lr=3.000000e-04 loss=3.9309 ppl=51.0085 acc=0.3309 tokens=8128.0000
|
| 238 |
+
step=4160 micro_steps=8320 elapsed=3.6s lr=3.000000e-04 loss=3.9245 ppl=50.7222 acc=0.3310 tokens=8128.0000
|
| 239 |
+
step=4180 micro_steps=8360 elapsed=3.6s lr=3.000000e-04 loss=3.9219 ppl=50.5655 acc=0.3307 tokens=8128.0000
|
| 240 |
+
step=4200 micro_steps=8400 elapsed=3.6s lr=3.000000e-04 loss=3.9287 ppl=50.9071 acc=0.3310 tokens=8128.0000
|
| 241 |
+
step=4220 micro_steps=8440 elapsed=3.6s lr=3.000000e-04 loss=3.9173 ppl=50.3085 acc=0.3305 tokens=8128.0000
|
| 242 |
+
step=4240 micro_steps=8480 elapsed=3.6s lr=3.000000e-04 loss=3.9112 ppl=50.0233 acc=0.3331 tokens=8128.0000
|
| 243 |
+
step=4260 micro_steps=8520 elapsed=3.6s lr=3.000000e-04 loss=3.9256 ppl=50.7239 acc=0.3311 tokens=8128.0000
|
| 244 |
+
step=4280 micro_steps=8560 elapsed=3.6s lr=3.000000e-04 loss=3.9058 ppl=49.7594 acc=0.3336 tokens=8128.0000
|
| 245 |
+
step=4300 micro_steps=8600 elapsed=3.6s lr=3.000000e-04 loss=3.8825 ppl=48.6190 acc=0.3352 tokens=8128.0000
|
| 246 |
+
step=4320 micro_steps=8640 elapsed=3.6s lr=3.000000e-04 loss=3.9073 ppl=49.8234 acc=0.3320 tokens=8128.0000
|
| 247 |
+
step=4340 micro_steps=8680 elapsed=3.6s lr=3.000000e-04 loss=3.9016 ppl=49.5402 acc=0.3324 tokens=8128.0000
|
| 248 |
+
step=4360 micro_steps=8720 elapsed=3.6s lr=3.000000e-04 loss=3.9019 ppl=49.5451 acc=0.3337 tokens=8128.0000
|
| 249 |
+
step=4380 micro_steps=8760 elapsed=3.6s lr=3.000000e-04 loss=3.8976 ppl=49.3407 acc=0.3327 tokens=8128.0000
|
| 250 |
+
step=4400 micro_steps=8800 elapsed=3.6s lr=3.000000e-04 loss=3.8919 ppl=49.0610 acc=0.3336 tokens=8128.0000
|
| 251 |
+
step=4420 micro_steps=8840 elapsed=3.6s lr=3.000000e-04 loss=3.8915 ppl=49.0252 acc=0.3344 tokens=8128.0000
|
| 252 |
+
step=4440 micro_steps=8880 elapsed=3.6s lr=3.000000e-04 loss=3.8883 ppl=48.8853 acc=0.3333 tokens=8128.0000
|
| 253 |
+
step=4460 micro_steps=8920 elapsed=3.6s lr=3.000000e-04 loss=3.9000 ppl=49.4839 acc=0.3336 tokens=8128.0000
|
| 254 |
+
step=4480 micro_steps=8960 elapsed=3.6s lr=3.000000e-04 loss=3.8926 ppl=49.0913 acc=0.3341 tokens=8128.0000
|
| 255 |
+
step=4500 micro_steps=9000 elapsed=3.6s lr=3.000000e-04 loss=3.8436 ppl=47.3114 acc=0.3418 tokens=8128.0000
|
| 256 |
+
step=4520 micro_steps=9040 elapsed=3.6s lr=3.000000e-04 loss=3.8916 ppl=49.0472 acc=0.3343 tokens=8128.0000
|
| 257 |
+
step=4540 micro_steps=9080 elapsed=3.6s lr=3.000000e-04 loss=3.8798 ppl=48.4476 acc=0.3358 tokens=8128.0000
|
| 258 |
+
step=4560 micro_steps=9120 elapsed=3.6s lr=3.000000e-04 loss=3.8811 ppl=48.5576 acc=0.3355 tokens=8128.0000
|
| 259 |
+
step=4580 micro_steps=9160 elapsed=3.6s lr=3.000000e-04 loss=3.8873 ppl=48.8260 acc=0.3343 tokens=8128.0000
|
| 260 |
+
step=4600 micro_steps=9200 elapsed=3.6s lr=3.000000e-04 loss=3.8902 ppl=48.9647 acc=0.3335 tokens=8128.0000
|
| 261 |
+
step=4620 micro_steps=9240 elapsed=3.6s lr=3.000000e-04 loss=3.8757 ppl=48.2724 acc=0.3353 tokens=8128.0000
|
| 262 |
+
step=4640 micro_steps=9280 elapsed=3.6s lr=3.000000e-04 loss=3.8801 ppl=48.5090 acc=0.3355 tokens=8128.0000
|
| 263 |
+
step=4660 micro_steps=9320 elapsed=3.6s lr=3.000000e-04 loss=3.8660 ppl=47.8119 acc=0.3357 tokens=8128.0000
|
| 264 |
+
step=4680 micro_steps=9360 elapsed=3.6s lr=3.000000e-04 loss=3.8870 ppl=48.8011 acc=0.3352 tokens=8128.0000
|
| 265 |
+
step=4700 micro_steps=9400 elapsed=3.6s lr=3.000000e-04 loss=3.8677 ppl=47.9033 acc=0.3370 tokens=8128.0000
|
| 266 |
+
step=4720 micro_steps=9440 elapsed=3.6s lr=3.000000e-04 loss=3.8797 ppl=48.4573 acc=0.3350 tokens=8128.0000
|
| 267 |
+
step=4740 micro_steps=9480 elapsed=3.6s lr=3.000000e-04 loss=3.8766 ppl=48.3031 acc=0.3353 tokens=8128.0000
|
| 268 |
+
step=4760 micro_steps=9520 elapsed=3.6s lr=3.000000e-04 loss=3.8714 ppl=48.0545 acc=0.3365 tokens=8128.0000
|
| 269 |
+
step=4780 micro_steps=9560 elapsed=3.6s lr=3.000000e-04 loss=3.8526 ppl=47.1621 acc=0.3381 tokens=8128.0000
|
| 270 |
+
step=4800 micro_steps=9600 elapsed=3.6s lr=3.000000e-04 loss=3.8639 ppl=47.6841 acc=0.3374 tokens=8128.0000
|
| 271 |
+
step=4820 micro_steps=9640 elapsed=3.6s lr=3.000000e-04 loss=3.8566 ppl=47.3693 acc=0.3369 tokens=8128.0000
|
| 272 |
+
step=4840 micro_steps=9680 elapsed=3.6s lr=3.000000e-04 loss=3.8527 ppl=47.1919 acc=0.3392 tokens=8128.0000
|
| 273 |
+
step=4860 micro_steps=9720 elapsed=3.6s lr=3.000000e-04 loss=3.8581 ppl=47.4503 acc=0.3370 tokens=8128.0000
|
| 274 |
+
step=4880 micro_steps=9760 elapsed=3.6s lr=3.000000e-04 loss=3.8667 ppl=47.8499 acc=0.3356 tokens=8128.0000
|
| 275 |
+
step=4900 micro_steps=9800 elapsed=3.6s lr=3.000000e-04 loss=3.8576 ppl=47.4053 acc=0.3376 tokens=8128.0000
|
| 276 |
+
step=4920 micro_steps=9840 elapsed=3.6s lr=3.000000e-04 loss=3.8391 ppl=46.5728 acc=0.3389 tokens=8128.0000
|
| 277 |
+
step=4940 micro_steps=9880 elapsed=3.6s lr=3.000000e-04 loss=3.8671 ppl=47.8644 acc=0.3354 tokens=8128.0000
|
| 278 |
+
step=4960 micro_steps=9920 elapsed=3.6s lr=3.000000e-04 loss=3.8590 ppl=47.4656 acc=0.3372 tokens=8128.0000
|
| 279 |
+
step=4980 micro_steps=9960 elapsed=3.6s lr=3.000000e-04 loss=3.8433 ppl=46.7323 acc=0.3388 tokens=8128.0000
|
| 280 |
+
step=5000 micro_steps=10000 elapsed=3.6s lr=3.000000e-04 loss=3.8529 ppl=47.1858 acc=0.3383 tokens=8128.0000
|
| 281 |
+
[sample step=5000] [CLS] siddiqui, said he felt that mr. haidl's decision had been influenced at best by popular experiences, his long - time opponent in power and friendship among iraqis, his family and friends. [SEP] the oft found the us consumer credit agency in a weak showing in the early 1990s as well as a resurgence by the htc of the u. s. - based marketing firm roche. [SEP] tidal energy's developers will also have to acquire time warner network for up to $1 billion. [SEP] 0428: will clorox and epert 8 sell it in jars? [SEP] until now, ebay ' [SEP]
|
| 282 |
+
step=5020 micro_steps=10040 elapsed=5.8s lr=3.000000e-04 loss=3.8548 ppl=47.2749 acc=0.3385 tokens=8128.0000
|
| 283 |
+
step=5040 micro_steps=10080 elapsed=3.6s lr=3.000000e-04 loss=3.8402 ppl=46.5920 acc=0.3385 tokens=8128.0000
|
| 284 |
+
step=5060 micro_steps=10120 elapsed=3.6s lr=3.000000e-04 loss=3.8436 ppl=46.7635 acc=0.3386 tokens=8128.0000
|
| 285 |
+
step=5080 micro_steps=10160 elapsed=3.6s lr=3.000000e-04 loss=3.8557 ppl=47.3276 acc=0.3375 tokens=8128.0000
|
| 286 |
+
step=5100 micro_steps=10200 elapsed=3.6s lr=3.000000e-04 loss=3.8403 ppl=46.5893 acc=0.3387 tokens=8128.0000
|
| 287 |
+
step=5120 micro_steps=10240 elapsed=3.6s lr=3.000000e-04 loss=3.8308 ppl=46.1426 acc=0.3396 tokens=8128.0000
|
| 288 |
+
step=5140 micro_steps=10280 elapsed=3.6s lr=3.000000e-04 loss=3.8295 ppl=46.0875 acc=0.3423 tokens=8128.0000
|
| 289 |
+
step=5160 micro_steps=10320 elapsed=3.6s lr=3.000000e-04 loss=3.8385 ppl=46.4952 acc=0.3402 tokens=8128.0000
|
| 290 |
+
step=5180 micro_steps=10360 elapsed=3.6s lr=3.000000e-04 loss=3.8380 ppl=46.4819 acc=0.3401 tokens=8128.0000
|
| 291 |
+
step=5200 micro_steps=10400 elapsed=3.6s lr=3.000000e-04 loss=3.8301 ppl=46.1601 acc=0.3395 tokens=8128.0000
|
| 292 |
+
step=5220 micro_steps=10440 elapsed=3.6s lr=3.000000e-04 loss=3.8380 ppl=46.5034 acc=0.3395 tokens=8128.0000
|
| 293 |
+
step=5240 micro_steps=10480 elapsed=3.6s lr=3.000000e-04 loss=3.8326 ppl=46.2529 acc=0.3408 tokens=8128.0000
|
| 294 |
+
step=5260 micro_steps=10520 elapsed=3.6s lr=3.000000e-04 loss=3.8225 ppl=45.7632 acc=0.3410 tokens=8128.0000
|
| 295 |
+
step=5280 micro_steps=10560 elapsed=3.6s lr=3.000000e-04 loss=3.8151 ppl=45.4480 acc=0.3414 tokens=8128.0000
|
| 296 |
+
step=5300 micro_steps=10600 elapsed=3.6s lr=3.000000e-04 loss=3.8284 ppl=46.0495 acc=0.3399 tokens=8128.0000
|
| 297 |
+
step=5320 micro_steps=10640 elapsed=3.6s lr=3.000000e-04 loss=3.8339 ppl=46.2933 acc=0.3391 tokens=8128.0000
|
| 298 |
+
step=5340 micro_steps=10680 elapsed=3.6s lr=3.000000e-04 loss=3.8308 ppl=46.1532 acc=0.3395 tokens=8128.0000
|
| 299 |
+
step=5360 micro_steps=10720 elapsed=3.6s lr=3.000000e-04 loss=3.8233 ppl=45.8144 acc=0.3416 tokens=8128.0000
|
| 300 |
+
step=5380 micro_steps=10760 elapsed=3.6s lr=3.000000e-04 loss=3.8281 ppl=46.0230 acc=0.3399 tokens=8128.0000
|
| 301 |
+
step=5400 micro_steps=10800 elapsed=3.6s lr=3.000000e-04 loss=3.8238 ppl=45.8492 acc=0.3411 tokens=8128.0000
|
| 302 |
+
step=5420 micro_steps=10840 elapsed=3.6s lr=3.000000e-04 loss=3.8261 ppl=45.9348 acc=0.3401 tokens=8128.0000
|
| 303 |
+
step=5440 micro_steps=10880 elapsed=3.6s lr=3.000000e-04 loss=3.8114 ppl=45.2710 acc=0.3412 tokens=8128.0000
|
| 304 |
+
step=5460 micro_steps=10920 elapsed=3.6s lr=3.000000e-04 loss=3.8170 ppl=45.5219 acc=0.3415 tokens=8128.0000
|
| 305 |
+
step=5480 micro_steps=10960 elapsed=3.6s lr=3.000000e-04 loss=3.8110 ppl=45.2390 acc=0.3422 tokens=8128.0000
|
LTA_openwebtext_dualt/logs/lta_lm1b_classic_dirichlet_len256_gbs512_4gpu_10k_save1k_20260523.train.pid
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
994417
|
LTA_openwebtext_dualt/logs/lta_owt_dirichlet_categorical_fullvocab_c1024_fullycoupled_shufchunks_len128_gbs512_8gpu_1m.log
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
LTA_openwebtext_dualt/logs/lta_owt_gpt2cached_len1024_rollout1_p1_bench4gpu_20260513_152806.log
ADDED
|
@@ -0,0 +1,103 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
*****************************************
|
| 3 |
+
Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed.
|
| 4 |
+
*****************************************
|
| 5 |
+
NCCL version 2.25.1+cuda12.8
|
| 6 |
+
{
|
| 7 |
+
"device": "cuda:0",
|
| 8 |
+
"rank": 0,
|
| 9 |
+
"world_size": 4,
|
| 10 |
+
"samples": "owt_cached_chunks:8734897",
|
| 11 |
+
"vocab_size": 50257,
|
| 12 |
+
"tokenizer_vocab_size": 50257,
|
| 13 |
+
"save_dir": "runs/lta_owt_gpt2cached_len1024_rollout1_p1_bench4gpu_20260513_152806",
|
| 14 |
+
"batch_size": 32,
|
| 15 |
+
"grad_accum": 4,
|
| 16 |
+
"effective_batch_size": 512,
|
| 17 |
+
"global_batch_size": 512,
|
| 18 |
+
"lr_schedule": "cosine",
|
| 19 |
+
"optimizer": "adamw",
|
| 20 |
+
"warmup_steps": 5,
|
| 21 |
+
"min_lr": 6e-05,
|
| 22 |
+
"weight_decay": 0.1,
|
| 23 |
+
"adamw_param_groups": "nanogpt",
|
| 24 |
+
"adam_beta1": 0.9,
|
| 25 |
+
"adam_beta2": 0.95,
|
| 26 |
+
"adam_eps": 1e-08,
|
| 27 |
+
"muon_momentum": 0.95,
|
| 28 |
+
"muon_ns_steps": 5,
|
| 29 |
+
"muon_update_scale": 1.0,
|
| 30 |
+
"ema_decay": 0.0,
|
| 31 |
+
"ema_start_step": 0,
|
| 32 |
+
"model_type": "ddit",
|
| 33 |
+
"dual_t": true,
|
| 34 |
+
"corrupt_t_mode": "same",
|
| 35 |
+
"corrupt_min_t": 0.0,
|
| 36 |
+
"corrupt_max_t": 1.0,
|
| 37 |
+
"prefix_block_prob": 0.0,
|
| 38 |
+
"prefix_block_len": 128,
|
| 39 |
+
"dirichlet_endpoint_mode": "categorical_dual_t",
|
| 40 |
+
"dirichlet_semantic_t_mode": "same",
|
| 41 |
+
"dirichlet_semantic_t_value": 0.0,
|
| 42 |
+
"categorical_wrong_from_full_vocab": true,
|
| 43 |
+
"categorical_wrong_from_batch_valid_tokens": false,
|
| 44 |
+
"mask_mixture_original_prob": 0.0,
|
| 45 |
+
"mask_mixture_lowk_prob": 0.0,
|
| 46 |
+
"mask_mixture_lowcorrupt_prob": 0.0,
|
| 47 |
+
"mask_mixture_block_prob": 0.0,
|
| 48 |
+
"mask_mixture_all_prob": 0.0,
|
| 49 |
+
"mask_mixture_lowk_clean_tokens": "1,2,4,8,16,32,64",
|
| 50 |
+
"mask_mixture_lowcorrupt_tokens": "1,2,4,8,16,32,64",
|
| 51 |
+
"mask_mixture_block_tokens": "64,128",
|
| 52 |
+
"simplex_bridge_sampler": "dirichlet",
|
| 53 |
+
"logistic_normal_sigma_min": 0.18,
|
| 54 |
+
"logistic_normal_sigma_max": 2.2,
|
| 55 |
+
"logistic_normal_tau_min": 0.65,
|
| 56 |
+
"logistic_normal_tau_max": 1.15,
|
| 57 |
+
"torch_compile": false,
|
| 58 |
+
"compile_mode": "max-autotune",
|
| 59 |
+
"state_format": "prob",
|
| 60 |
+
"target_loss": "hard_ce",
|
| 61 |
+
"meanflow_weight": 0.0,
|
| 62 |
+
"rollout_train_prob": 1.0,
|
| 63 |
+
"rollout_train_steps": 1,
|
| 64 |
+
"rollout_train_infer_steps": 64,
|
| 65 |
+
"rollout_train_temp": 1.45,
|
| 66 |
+
"rollout_train_max_gamma": 1.0,
|
| 67 |
+
"rollout_train_corrupt_only": true,
|
| 68 |
+
"rollout_train_samplewise": false,
|
| 69 |
+
"rollout_train_compute_always": false,
|
| 70 |
+
"bridge_noise_init": "logistic_normal",
|
| 71 |
+
"noise_sigma": -1.0,
|
| 72 |
+
"allow_tf32": true,
|
| 73 |
+
"activation_checkpointing": false,
|
| 74 |
+
"activation_checkpoint_interval": 1,
|
| 75 |
+
"ddp_static_graph": false,
|
| 76 |
+
"ddp_gradient_as_bucket_view": true,
|
| 77 |
+
"blocking_data_transfer": false,
|
| 78 |
+
"dataloader_prefetch_factor": 4,
|
| 79 |
+
"full_train_stats": false,
|
| 80 |
+
"record_pad_truncate": false,
|
| 81 |
+
"record_add_eos": false,
|
| 82 |
+
"record_add_special_tokens": false,
|
| 83 |
+
"record_pad_token": "pad",
|
| 84 |
+
"record_shuffle_buffer": 10000,
|
| 85 |
+
"wrap": true,
|
| 86 |
+
"wrap_mode": "stream",
|
| 87 |
+
"wrap_record_buffer_size": 200,
|
| 88 |
+
"owt_cached_chunks": true,
|
| 89 |
+
"owt_chunk_cache_dir": "/e2e-data/evad-tech-vla/wanghan58/data/small_benchmarks/langflow_2604_11748/openwebtext_lta_cached_chunks/gpt2_len1024_train_minus_100k",
|
| 90 |
+
"owt_chunk_cache_rebuild": false,
|
| 91 |
+
"owt_chunk_cache_write_batch": 4096,
|
| 92 |
+
"owt_exact_repeat_per_chunk": 0,
|
| 93 |
+
"online_chunk_shuffle": false,
|
| 94 |
+
"online_chunk_shuffle_buffer": 10000,
|
| 95 |
+
"openwebtext_split": "train_minus_100k",
|
| 96 |
+
"detokenizer": "auto",
|
| 97 |
+
"resolved_detokenizer": null,
|
| 98 |
+
"num_workers": 8,
|
| 99 |
+
"latest_every": 100000,
|
| 100 |
+
"resume_path": ""
|
| 101 |
+
}
|
| 102 |
+
step=5 micro_steps=20 elapsed=17.2s lr=6.000000e-04 loss=10.8125 loss_recon=10.8125 loss_meanflow=0.0000 mean_model_t=0.5068 mean_corrupt_t=0.5068 mean_loss_t_weight=1.0000 prior_center_loss_beta=0.0000 rollout_train_applied=1.0000 acc_all=0.0005 acc_corrupt=0.0005 corrupt_frac=0.5206 loss_all=10.8125 loss_corrupt=10.8125 acc_corrupt_t_0p0_0p2=0.0011 corrupt_frac_t_0p0_0p2=0.1067 acc_corrupt_t_0p2_0p4=0.0004 corrupt_frac_t_0p2_0p4=0.1344 acc_corrupt_t_0p4_0p6=0.0000 corrupt_frac_t_0p4_0p6=0.0925 acc_corrupt_t_0p6_0p8=0.0007 corrupt_frac_t_0p6_0p8=0.4272 acc_corrupt_t_0p8_1p0=0.0000 corrupt_frac_t_0p8_1p0=0.2392 wrong_frac=0.3916 init_acc_corrupt=0.5891 init_gold_top10=0.6052 init_gold_top100=0.6225
|
| 103 |
+
step=10 micro_steps=40 elapsed=19.8s lr=6.000000e-05 loss=10.8125 loss_recon=10.8125 loss_meanflow=0.0000 mean_model_t=0.4822 mean_corrupt_t=0.4822 mean_loss_t_weight=1.0000 prior_center_loss_beta=0.0000 rollout_train_applied=1.0000 acc_all=0.0007 acc_corrupt=0.0006 corrupt_frac=0.6584 loss_all=10.8125 loss_corrupt=10.8125 acc_corrupt_t_0p0_0p2=0.0010 corrupt_frac_t_0p0_0p2=0.3402 acc_corrupt_t_0p2_0p4=0.0003 corrupt_frac_t_0p2_0p4=0.1348 acc_corrupt_t_0p4_0p6=0.0005 corrupt_frac_t_0p4_0p6=0.1821 acc_corrupt_t_0p6_0p8=0.0000 corrupt_frac_t_0p6_0p8=0.1046 acc_corrupt_t_0p8_1p0=0.0004 corrupt_frac_t_0p8_1p0=0.2383 wrong_frac=0.5652 init_acc_corrupt=0.4016 init_gold_top10=0.4274 init_gold_top100=0.4691
|
LTA_openwebtext_dualt/logs/lta_owt_t5_absrope_adaln_dirichlet_len1024_Cv_to_2v_mask0p1_1p0_sameT_gbs512_b32_8gpu_1m_save10k_20260525_watcher.log
ADDED
|
@@ -0,0 +1,458 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[watch-gumbel] run_dir=runs/lta_owt_t5_absrope_adaln_dirichlet_len1024_Cv_to_2v_mask0p1_1p0_sameT_gbs512_b32_8gpu_1m_save10k_20260525
|
| 2 |
+
[watch-gumbel] out_base=docs/lta_samples/metrics_20260525/owt_t5_absrope_adaln_Cv_to_2v_mask0p1_1p0_sameT_sde_gumbel_topp0.95_tau1.0_to_0.2_blend_c32100_64200_n128/lta_owt_t5_absrope_adaln_dirichlet_len1024_Cv_to_2v_mask0p1_1p0_sameT_gbs512_b32_8gpu_1m_save10k_20260525
|
| 3 |
+
[watch-gumbel] interval=10000 max_len=1024 steps=128 c=32100->64200 temp=1.45 top_p=0.95 tau=1.0->0.2 n=128
|
| 4 |
+
[watch-gumbel] 2026-05-25_17:58:57 no ckpt yet
|
| 5 |
+
[watch-gumbel] 2026-05-25_17:59:27 no ckpt yet
|
| 6 |
+
[watch-gumbel] 2026-05-25_17:59:57 no ckpt yet
|
| 7 |
+
[watch-gumbel] 2026-05-25_18:00:27 no ckpt yet
|
| 8 |
+
[watch-gumbel] 2026-05-25_18:00:57 no ckpt yet
|
| 9 |
+
[watch-gumbel] 2026-05-25_18:01:27 no ckpt yet
|
| 10 |
+
[watch-gumbel] 2026-05-25_18:01:57 no ckpt yet
|
| 11 |
+
[watch-gumbel] 2026-05-25_18:02:27 no ckpt yet
|
| 12 |
+
[watch-gumbel] 2026-05-25_18:02:57 no ckpt yet
|
| 13 |
+
[watch-gumbel] 2026-05-25_18:03:27 no ckpt yet
|
| 14 |
+
[watch-gumbel] 2026-05-25_18:03:57 no ckpt yet
|
| 15 |
+
[watch-gumbel] 2026-05-25_18:04:27 no ckpt yet
|
| 16 |
+
[watch-gumbel] 2026-05-25_18:04:57 no ckpt yet
|
| 17 |
+
[watch-gumbel] 2026-05-25_18:05:27 no ckpt yet
|
| 18 |
+
[watch-gumbel] 2026-05-25_18:05:57 no ckpt yet
|
| 19 |
+
[watch-gumbel] 2026-05-25_18:06:27 no ckpt yet
|
| 20 |
+
[watch-gumbel] 2026-05-25_18:06:57 no ckpt yet
|
| 21 |
+
[watch-gumbel] 2026-05-25_18:07:27 no ckpt yet
|
| 22 |
+
[watch-gumbel] 2026-05-25_18:07:57 no ckpt yet
|
| 23 |
+
[watch-gumbel] 2026-05-25_18:08:27 no ckpt yet
|
| 24 |
+
[watch-gumbel] 2026-05-25_18:08:57 no ckpt yet
|
| 25 |
+
[watch-gumbel] 2026-05-25_18:09:27 no ckpt yet
|
| 26 |
+
[watch-gumbel] 2026-05-25_18:09:57 no ckpt yet
|
| 27 |
+
[watch-gumbel] 2026-05-25_18:10:27 no ckpt yet
|
| 28 |
+
[watch-gumbel] 2026-05-25_18:10:57 no ckpt yet
|
| 29 |
+
[watch-gumbel] 2026-05-25_18:11:27 no ckpt yet
|
| 30 |
+
[watch-gumbel] 2026-05-25_18:11:57 no ckpt yet
|
| 31 |
+
[watch-gumbel] 2026-05-25_18:12:27 no ckpt yet
|
| 32 |
+
[watch-gumbel] 2026-05-25_18:12:57 no ckpt yet
|
| 33 |
+
[watch-gumbel] 2026-05-25_18:13:27 no ckpt yet
|
| 34 |
+
[watch-gumbel] 2026-05-25_18:13:57 no ckpt yet
|
| 35 |
+
[watch-gumbel] 2026-05-25_18:14:27 no ckpt yet
|
| 36 |
+
[watch-gumbel] 2026-05-25_18:14:57 no ckpt yet
|
| 37 |
+
[watch-gumbel] 2026-05-25_18:15:27 no ckpt yet
|
| 38 |
+
[watch-gumbel] 2026-05-25_18:15:57 no ckpt yet
|
| 39 |
+
[watch-gumbel] 2026-05-25_18:16:27 no ckpt yet
|
| 40 |
+
[watch-gumbel] 2026-05-25_18:16:57 no ckpt yet
|
| 41 |
+
[watch-gumbel] 2026-05-25_18:17:27 no ckpt yet
|
| 42 |
+
[watch-gumbel] 2026-05-25_18:17:57 no ckpt yet
|
| 43 |
+
[watch-gumbel] 2026-05-25_18:18:27 no ckpt yet
|
| 44 |
+
[watch-gumbel] 2026-05-25_18:18:57 no ckpt yet
|
| 45 |
+
[watch-gumbel] 2026-05-25_18:19:27 no ckpt yet
|
| 46 |
+
[watch-gumbel] 2026-05-25_18:19:57 no ckpt yet
|
| 47 |
+
[watch-gumbel] 2026-05-25_18:20:27 no ckpt yet
|
| 48 |
+
[watch-gumbel] 2026-05-25_18:20:57 no ckpt yet
|
| 49 |
+
[watch-gumbel] 2026-05-25_18:21:27 no ckpt yet
|
| 50 |
+
[watch-gumbel] 2026-05-25_18:21:57 no ckpt yet
|
| 51 |
+
[watch-gumbel] 2026-05-25_18:22:27 no ckpt yet
|
| 52 |
+
[watch-gumbel] 2026-05-25_18:22:57 no ckpt yet
|
| 53 |
+
[watch-gumbel] 2026-05-25_18:23:27 no ckpt yet
|
| 54 |
+
[watch-gumbel] 2026-05-25_18:23:57 no ckpt yet
|
| 55 |
+
[watch-gumbel] 2026-05-25_18:24:27 no ckpt yet
|
| 56 |
+
[watch-gumbel] 2026-05-25_18:24:57 no ckpt yet
|
| 57 |
+
[watch-gumbel] 2026-05-25_18:25:27 no ckpt yet
|
| 58 |
+
[watch-gumbel] 2026-05-25_18:25:57 no ckpt yet
|
| 59 |
+
[watch-gumbel] 2026-05-25_18:26:27 no ckpt yet
|
| 60 |
+
[watch-gumbel] 2026-05-25_18:26:57 no ckpt yet
|
| 61 |
+
[watch-gumbel] 2026-05-25_18:27:27 no ckpt yet
|
| 62 |
+
[watch-gumbel] 2026-05-25_18:27:57 no ckpt yet
|
| 63 |
+
[watch-gumbel] 2026-05-25_18:28:27 no ckpt yet
|
| 64 |
+
[watch-gumbel] 2026-05-25_18:28:57 no ckpt yet
|
| 65 |
+
[watch-gumbel] 2026-05-25_18:29:27 no ckpt yet
|
| 66 |
+
[watch-gumbel] 2026-05-25_18:29:57 no ckpt yet
|
| 67 |
+
[watch-gumbel] 2026-05-25_18:30:27 no ckpt yet
|
| 68 |
+
[watch-gumbel] 2026-05-25_18:30:57 no ckpt yet
|
| 69 |
+
[watch-gumbel] 2026-05-25_18:31:27 no ckpt yet
|
| 70 |
+
[watch-gumbel] 2026-05-25_18:31:57 no ckpt yet
|
| 71 |
+
[watch-gumbel] 2026-05-25_18:32:27 no ckpt yet
|
| 72 |
+
[watch-gumbel] 2026-05-25_18:32:57 no ckpt yet
|
| 73 |
+
[watch-gumbel] 2026-05-25_18:33:27 no ckpt yet
|
| 74 |
+
[watch-gumbel] 2026-05-25_18:33:57 no ckpt yet
|
| 75 |
+
[watch-gumbel] 2026-05-25_18:34:27 no ckpt yet
|
| 76 |
+
[watch-gumbel] 2026-05-25_18:34:57 no ckpt yet
|
| 77 |
+
[watch-gumbel] 2026-05-25_18:35:27 no ckpt yet
|
| 78 |
+
[watch-gumbel] 2026-05-25_18:35:57 no ckpt yet
|
| 79 |
+
[watch-gumbel] 2026-05-25_18:36:27 no ckpt yet
|
| 80 |
+
[watch-gumbel] 2026-05-25_18:36:57 no ckpt yet
|
| 81 |
+
[watch-gumbel] 2026-05-25_18:37:27 no ckpt yet
|
| 82 |
+
[watch-gumbel] 2026-05-25_18:37:57 no ckpt yet
|
| 83 |
+
[watch-gumbel] 2026-05-25_18:38:27 no ckpt yet
|
| 84 |
+
[watch-gumbel] 2026-05-25_18:38:57 no ckpt yet
|
| 85 |
+
[watch-gumbel] 2026-05-25_18:39:27 no ckpt yet
|
| 86 |
+
[watch-gumbel] 2026-05-25_18:39:57 no ckpt yet
|
| 87 |
+
[watch-gumbel] 2026-05-25_18:40:27 no ckpt yet
|
| 88 |
+
[watch-gumbel] 2026-05-25_18:40:57 no ckpt yet
|
| 89 |
+
[watch-gumbel] 2026-05-25_18:41:27 no ckpt yet
|
| 90 |
+
[watch-gumbel] 2026-05-25_18:41:57 no ckpt yet
|
| 91 |
+
[watch-gumbel] 2026-05-25_18:42:27 no ckpt yet
|
| 92 |
+
[watch-gumbel] 2026-05-25_18:42:57 no ckpt yet
|
| 93 |
+
[watch-gumbel] 2026-05-25_18:43:27 no ckpt yet
|
| 94 |
+
[watch-gumbel] 2026-05-25_18:43:57 no ckpt yet
|
| 95 |
+
[watch-gumbel] 2026-05-25_18:44:27 no ckpt yet
|
| 96 |
+
[watch-gumbel] 2026-05-25_18:44:57 no ckpt yet
|
| 97 |
+
[watch-gumbel] 2026-05-25_18:45:27 no ckpt yet
|
| 98 |
+
[watch-gumbel] 2026-05-25_18:45:57 no ckpt yet
|
| 99 |
+
[watch-gumbel] 2026-05-25_18:46:27 no ckpt yet
|
| 100 |
+
[watch-gumbel] 2026-05-25_18:46:57 no ckpt yet
|
| 101 |
+
[watch-gumbel] 2026-05-25_18:47:27 no ckpt yet
|
| 102 |
+
[watch-gumbel] 2026-05-25_18:47:57 no ckpt yet
|
| 103 |
+
[watch-gumbel] 2026-05-25_18:48:27 no ckpt yet
|
| 104 |
+
[watch-gumbel] 2026-05-25_18:48:57 no ckpt yet
|
| 105 |
+
[watch-gumbel] 2026-05-25_18:49:27 no ckpt yet
|
| 106 |
+
[watch-gumbel] 2026-05-25_18:49:57 no ckpt yet
|
| 107 |
+
[watch-gumbel] 2026-05-25_18:50:27 no ckpt yet
|
| 108 |
+
[watch-gumbel] 2026-05-25_18:50:57 no ckpt yet
|
| 109 |
+
[watch-gumbel] 2026-05-25_18:51:27 no ckpt yet
|
| 110 |
+
[watch-gumbel] 2026-05-25_18:51:57 no ckpt yet
|
| 111 |
+
[watch-gumbel] 2026-05-25_18:52:27 no ckpt yet
|
| 112 |
+
[watch-gumbel] 2026-05-25_18:52:57 no ckpt yet
|
| 113 |
+
[watch-gumbel] 2026-05-25_18:53:27 no ckpt yet
|
| 114 |
+
[watch-gumbel] 2026-05-25_18:53:57 no ckpt yet
|
| 115 |
+
[watch-gumbel] 2026-05-25_18:54:27 no ckpt yet
|
| 116 |
+
[watch-gumbel] 2026-05-25_18:54:57 no ckpt yet
|
| 117 |
+
[watch-gumbel] 2026-05-25_18:55:27 no ckpt yet
|
| 118 |
+
[watch-gumbel] 2026-05-25_18:55:57 no ckpt yet
|
| 119 |
+
[watch-gumbel] 2026-05-25_18:56:27 no ckpt yet
|
| 120 |
+
[watch-gumbel] 2026-05-25_18:56:57 no ckpt yet
|
| 121 |
+
[watch-gumbel] 2026-05-25_18:57:27 no ckpt yet
|
| 122 |
+
[watch-gumbel] 2026-05-25_18:57:57 no ckpt yet
|
| 123 |
+
[watch-gumbel] 2026-05-25_18:58:27 no ckpt yet
|
| 124 |
+
[watch-gumbel] 2026-05-25_18:58:57 no ckpt yet
|
| 125 |
+
[watch-gumbel] 2026-05-25_18:59:27 no ckpt yet
|
| 126 |
+
[watch-gumbel] 2026-05-25_18:59:57 no ckpt yet
|
| 127 |
+
[watch-gumbel] 2026-05-25_19:00:27 no ckpt yet
|
| 128 |
+
[watch-gumbel] 2026-05-25_19:00:57 no ckpt yet
|
| 129 |
+
[watch-gumbel] 2026-05-25_19:01:27 no ckpt yet
|
| 130 |
+
[watch-gumbel] 2026-05-25_19:01:57 no ckpt yet
|
| 131 |
+
[watch-gumbel] 2026-05-25_19:02:27 no ckpt yet
|
| 132 |
+
[watch-gumbel] 2026-05-25_19:02:57 no ckpt yet
|
| 133 |
+
[watch-gumbel] 2026-05-25_19:03:27 no ckpt yet
|
| 134 |
+
[watch-gumbel] 2026-05-25_19:03:57 no ckpt yet
|
| 135 |
+
[watch-gumbel] 2026-05-25_19:04:27 no ckpt yet
|
| 136 |
+
[watch-gumbel] 2026-05-25_19:04:57 no ckpt yet
|
| 137 |
+
[watch-gumbel] 2026-05-25_19:05:27 no ckpt yet
|
| 138 |
+
[watch-gumbel] 2026-05-25_19:05:57 no ckpt yet
|
| 139 |
+
[watch-gumbel] 2026-05-25_19:06:27 no ckpt yet
|
| 140 |
+
[watch-gumbel] 2026-05-25_19:06:57 no ckpt yet
|
| 141 |
+
[watch-gumbel] 2026-05-25_19:07:27 no ckpt yet
|
| 142 |
+
[watch-gumbel] 2026-05-25_19:07:57 no ckpt yet
|
| 143 |
+
[watch-gumbel] 2026-05-25_19:08:27 no ckpt yet
|
| 144 |
+
[watch-gumbel] 2026-05-25_19:08:57 no ckpt yet
|
| 145 |
+
[watch-gumbel] 2026-05-25_19:09:27 no ckpt yet
|
| 146 |
+
[watch-gumbel] 2026-05-25_19:09:57 no ckpt yet
|
| 147 |
+
[watch-gumbel] 2026-05-25_19:10:27 no ckpt yet
|
| 148 |
+
[watch-gumbel] 2026-05-25_19:10:57 no ckpt yet
|
| 149 |
+
[watch-gumbel] 2026-05-25_19:11:27 no ckpt yet
|
| 150 |
+
[watch-gumbel] 2026-05-25_19:11:57 no ckpt yet
|
| 151 |
+
[watch-gumbel] 2026-05-25_19:12:27 no ckpt yet
|
| 152 |
+
[watch-gumbel] 2026-05-25_19:12:57 no ckpt yet
|
| 153 |
+
[watch-gumbel] 2026-05-25_19:13:27 no ckpt yet
|
| 154 |
+
[watch-gumbel] 2026-05-25_19:13:57 no ckpt yet
|
| 155 |
+
[watch-gumbel] 2026-05-25_19:14:27 no ckpt yet
|
| 156 |
+
[watch-gumbel] 2026-05-25_19:14:57 no ckpt yet
|
| 157 |
+
[watch-gumbel] 2026-05-25_19:15:27 no ckpt yet
|
| 158 |
+
[watch-gumbel] 2026-05-25_19:15:57 no ckpt yet
|
| 159 |
+
[watch-gumbel] 2026-05-25_19:16:27 no ckpt yet
|
| 160 |
+
[watch-gumbel] 2026-05-25_19:16:57 no ckpt yet
|
| 161 |
+
[watch-gumbel] 2026-05-25_19:17:27 no ckpt yet
|
| 162 |
+
[watch-gumbel] 2026-05-25_19:17:57 no ckpt yet
|
| 163 |
+
[watch-gumbel] 2026-05-25_19:18:27 no ckpt yet
|
| 164 |
+
[watch-gumbel] 2026-05-25_19:18:57 no ckpt yet
|
| 165 |
+
[watch-gumbel] 2026-05-25_19:19:27 no ckpt yet
|
| 166 |
+
[watch-gumbel] 2026-05-25_19:19:57 no ckpt yet
|
| 167 |
+
[watch-gumbel] 2026-05-25_19:20:27 no ckpt yet
|
| 168 |
+
[watch-gumbel] 2026-05-25_19:20:57 no ckpt yet
|
| 169 |
+
[watch-gumbel] 2026-05-25_19:21:27 no ckpt yet
|
| 170 |
+
[watch-gumbel] 2026-05-25_19:21:57 no ckpt yet
|
| 171 |
+
[watch-gumbel] 2026-05-25_19:22:27 no ckpt yet
|
| 172 |
+
[watch-gumbel] 2026-05-25_19:22:57 no ckpt yet
|
| 173 |
+
[watch-gumbel] 2026-05-25_19:23:27 no ckpt yet
|
| 174 |
+
[watch-gumbel] 2026-05-25_19:23:57 no ckpt yet
|
| 175 |
+
[watch-gumbel] 2026-05-25_19:24:27 no ckpt yet
|
| 176 |
+
[watch-gumbel] 2026-05-25_19:24:57 no ckpt yet
|
| 177 |
+
[watch-gumbel] 2026-05-25_19:25:27 no ckpt yet
|
| 178 |
+
[watch-gumbel] 2026-05-25_19:25:57 no ckpt yet
|
| 179 |
+
[watch-gumbel] 2026-05-25_19:26:27 no ckpt yet
|
| 180 |
+
[watch-gumbel] 2026-05-25_19:26:57 no ckpt yet
|
| 181 |
+
[watch-gumbel] 2026-05-25_19:27:27 no ckpt yet
|
| 182 |
+
[watch-gumbel] 2026-05-25_19:27:57 no ckpt yet
|
| 183 |
+
[watch-gumbel] 2026-05-25_19:28:27 no ckpt yet
|
| 184 |
+
[watch-gumbel] 2026-05-25_19:28:57 no ckpt yet
|
| 185 |
+
[watch-gumbel] 2026-05-25_19:29:27 no ckpt yet
|
| 186 |
+
[watch-gumbel] 2026-05-25_19:29:57 no ckpt yet
|
| 187 |
+
[watch-gumbel] 2026-05-25_19:30:27 infer runs/lta_owt_t5_absrope_adaln_dirichlet_len1024_Cv_to_2v_mask0p1_1p0_sameT_gbs512_b32_8gpu_1m_save10k_20260525/step_0010000.pt -> docs/lta_samples/metrics_20260525/owt_t5_absrope_adaln_Cv_to_2v_mask0p1_1p0_sameT_sde_gumbel_topp0.95_tau1.0_to_0.2_blend_c32100_64200_n128/lta_owt_t5_absrope_adaln_dirichlet_len1024_Cv_to_2v_mask0p1_1p0_sameT_gbs512_b32_8gpu_1m_save10k_20260525/step_0010000
|
| 188 |
+
[load] runs/lta_owt_t5_absrope_adaln_dirichlet_len1024_Cv_to_2v_mask0p1_1p0_sameT_gbs512_b32_8gpu_1m_save10k_20260525/step_0010000.pt
|
| 189 |
+
[ckpt] step=10000
|
| 190 |
+
[sde] generated 2/128
|
| 191 |
+
[sde] generated 4/128
|
| 192 |
+
[sde] generated 6/128
|
| 193 |
+
[sde] generated 8/128
|
| 194 |
+
[sde] generated 10/128
|
| 195 |
+
[sde] generated 12/128
|
| 196 |
+
[sde] generated 14/128
|
| 197 |
+
[sde] generated 16/128
|
| 198 |
+
[sde] generated 18/128
|
| 199 |
+
[sde] generated 20/128
|
| 200 |
+
[sde] generated 22/128
|
| 201 |
+
[sde] generated 24/128
|
| 202 |
+
[sde] generated 26/128
|
| 203 |
+
[sde] generated 28/128
|
| 204 |
+
[sde] generated 30/128
|
| 205 |
+
[sde] generated 32/128
|
| 206 |
+
[sde] generated 34/128
|
| 207 |
+
[sde] generated 36/128
|
| 208 |
+
[sde] generated 38/128
|
| 209 |
+
[sde] generated 40/128
|
| 210 |
+
[sde] generated 42/128
|
| 211 |
+
[sde] generated 44/128
|
| 212 |
+
[sde] generated 46/128
|
| 213 |
+
[sde] generated 48/128
|
| 214 |
+
[sde] generated 50/128
|
| 215 |
+
[sde] generated 52/128
|
| 216 |
+
[sde] generated 54/128
|
| 217 |
+
[sde] generated 56/128
|
| 218 |
+
[sde] generated 58/128
|
| 219 |
+
[sde] generated 60/128
|
| 220 |
+
[sde] generated 62/128
|
| 221 |
+
[sde] generated 64/128
|
| 222 |
+
[sde] generated 66/128
|
| 223 |
+
[sde] generated 68/128
|
| 224 |
+
[sde] generated 70/128
|
| 225 |
+
[sde] generated 72/128
|
| 226 |
+
[sde] generated 74/128
|
| 227 |
+
[sde] generated 76/128
|
| 228 |
+
[sde] generated 78/128
|
| 229 |
+
[sde] generated 80/128
|
| 230 |
+
[sde] generated 82/128
|
| 231 |
+
[sde] generated 84/128
|
| 232 |
+
[sde] generated 86/128
|
| 233 |
+
[sde] generated 88/128
|
| 234 |
+
[sde] generated 90/128
|
| 235 |
+
[sde] generated 92/128
|
| 236 |
+
[sde] generated 94/128
|
| 237 |
+
[sde] generated 96/128
|
| 238 |
+
[sde] generated 98/128
|
| 239 |
+
[sde] generated 100/128
|
| 240 |
+
[sde] generated 102/128
|
| 241 |
+
[sde] generated 104/128
|
| 242 |
+
[sde] generated 106/128
|
| 243 |
+
[sde] generated 108/128
|
| 244 |
+
[sde] generated 110/128
|
| 245 |
+
[sde] generated 112/128
|
| 246 |
+
[sde] generated 114/128
|
| 247 |
+
[sde] generated 116/128
|
| 248 |
+
[sde] generated 118/128
|
| 249 |
+
[sde] generated 120/128
|
| 250 |
+
[sde] generated 122/128
|
| 251 |
+
[sde] generated 124/128
|
| 252 |
+
[sde] generated 126/128
|
| 253 |
+
[sde] generated 128/128
|
| 254 |
+
[score] loading scorer: /e2e-data/evad-tech-vla/wanghan58/models/flowtext_scorers/gpt2-large-standard
|
| 255 |
+
[summary] {
|
| 256 |
+
"type": "summary",
|
| 257 |
+
"checkpoint": "runs/lta_owt_t5_absrope_adaln_dirichlet_len1024_Cv_to_2v_mask0p1_1p0_sameT_gbs512_b32_8gpu_1m_save10k_20260525/step_0010000.pt",
|
| 258 |
+
"step": 10000,
|
| 259 |
+
"decode": {
|
| 260 |
+
"decode_rule": "dirichlet_resample_sde",
|
| 261 |
+
"steps": 128,
|
| 262 |
+
"model_t_mode": "support_t",
|
| 263 |
+
"mean_mode": "endpoint_only",
|
| 264 |
+
"anchor_gamma": 1.0,
|
| 265 |
+
"endpoint_floor": 0.0,
|
| 266 |
+
"concentration_min": 32100.0,
|
| 267 |
+
"concentration_max": 64200.0,
|
| 268 |
+
"endpoint_temp": 1.45,
|
| 269 |
+
"endpoint_temp_start": null,
|
| 270 |
+
"endpoint_temp_end": null,
|
| 271 |
+
"endpoint_projection": "gumbel_softmax",
|
| 272 |
+
"endpoint_top_k": 0,
|
| 273 |
+
"endpoint_top_p": 0.95,
|
| 274 |
+
"gumbel_tau_start": 1.0,
|
| 275 |
+
"gumbel_tau_end": 0.2,
|
| 276 |
+
"gumbel_noise_scale_start": 1.0,
|
| 277 |
+
"gumbel_noise_scale_end": 1.0,
|
| 278 |
+
"ban_special_tokens": false,
|
| 279 |
+
"banned_endpoint_ids": [],
|
| 280 |
+
"support_power": 1.0,
|
| 281 |
+
"semantic_power": 1.0,
|
| 282 |
+
"noise_init": "dirichlet",
|
| 283 |
+
"noise_sigma": -1.0,
|
| 284 |
+
"noise_dirichlet_concentration": 32100.0,
|
| 285 |
+
"sde_resample": "dirichlet",
|
| 286 |
+
"logistic_normal_sigma_min": 0.18,
|
| 287 |
+
"logistic_normal_sigma_max": 3.0,
|
| 288 |
+
"logistic_normal_tau_min": 0.65,
|
| 289 |
+
"logistic_normal_tau_max": 1.0,
|
| 290 |
+
"final_from": "blend_0.5",
|
| 291 |
+
"n_samples": 128,
|
| 292 |
+
"seed": 20260524
|
| 293 |
+
},
|
| 294 |
+
"raw_genppl": {
|
| 295 |
+
"ppl": 2.098817389847081,
|
| 296 |
+
"nll_per_token": 0.7413740384102933,
|
| 297 |
+
"tokens": 129915,
|
| 298 |
+
"kept_samples": 128,
|
| 299 |
+
"total_samples": 128,
|
| 300 |
+
"empty_rate": 0.0,
|
| 301 |
+
"skipped_samples": 0
|
| 302 |
+
},
|
| 303 |
+
"stripped_genppl": {
|
| 304 |
+
"ppl": 2.091704636784785,
|
| 305 |
+
"nll_per_token": 0.737979349229492,
|
| 306 |
+
"tokens": 129874,
|
| 307 |
+
"kept_samples": 128,
|
| 308 |
+
"total_samples": 128,
|
| 309 |
+
"empty_rate": 0.0,
|
| 310 |
+
"skipped_samples": 0
|
| 311 |
+
},
|
| 312 |
+
"diversity": {
|
| 313 |
+
"sample_entropy": 1.1927971824809422,
|
| 314 |
+
"unique_tokens": 450,
|
| 315 |
+
"token_count": 131072,
|
| 316 |
+
"distinct_1": 0.0034332275390625,
|
| 317 |
+
"distinct_2": 0.02113880742913001,
|
| 318 |
+
"top_token_mass": 0.6524658203125
|
| 319 |
+
}
|
| 320 |
+
}
|
| 321 |
+
[done] docs/lta_samples/metrics_20260525/owt_t5_absrope_adaln_Cv_to_2v_mask0p1_1p0_sameT_sde_gumbel_topp0.95_tau1.0_to_0.2_blend_c32100_64200_n128/lta_owt_t5_absrope_adaln_dirichlet_len1024_Cv_to_2v_mask0p1_1p0_sameT_gbs512_b32_8gpu_1m_save10k_20260525/step_0010000/sde_steps128_samples128_scored.jsonl
|
| 322 |
+
[watch-gumbel] 2026-05-25_19:37:00 done step_0010000
|
| 323 |
+
[watch-gumbel] 2026-05-25_21:02:31 infer runs/lta_owt_t5_absrope_adaln_dirichlet_len1024_Cv_to_2v_mask0p1_1p0_sameT_gbs512_b32_8gpu_1m_save10k_20260525/step_0020000.pt -> docs/lta_samples/metrics_20260525/owt_t5_absrope_adaln_Cv_to_2v_mask0p1_1p0_sameT_sde_gumbel_topp0.95_tau1.0_to_0.2_blend_c32100_64200_n128/lta_owt_t5_absrope_adaln_dirichlet_len1024_Cv_to_2v_mask0p1_1p0_sameT_gbs512_b32_8gpu_1m_save10k_20260525/step_0020000
|
| 324 |
+
[load] runs/lta_owt_t5_absrope_adaln_dirichlet_len1024_Cv_to_2v_mask0p1_1p0_sameT_gbs512_b32_8gpu_1m_save10k_20260525/step_0020000.pt
|
| 325 |
+
[ckpt] step=20000
|
| 326 |
+
[sde] generated 2/128
|
| 327 |
+
[sde] generated 4/128
|
| 328 |
+
[sde] generated 6/128
|
| 329 |
+
[sde] generated 8/128
|
| 330 |
+
[sde] generated 10/128
|
| 331 |
+
[sde] generated 12/128
|
| 332 |
+
[sde] generated 14/128
|
| 333 |
+
[sde] generated 16/128
|
| 334 |
+
[sde] generated 18/128
|
| 335 |
+
[sde] generated 20/128
|
| 336 |
+
[sde] generated 22/128
|
| 337 |
+
[sde] generated 24/128
|
| 338 |
+
[sde] generated 26/128
|
| 339 |
+
[sde] generated 28/128
|
| 340 |
+
[sde] generated 30/128
|
| 341 |
+
[sde] generated 32/128
|
| 342 |
+
[sde] generated 34/128
|
| 343 |
+
[sde] generated 36/128
|
| 344 |
+
[sde] generated 38/128
|
| 345 |
+
[sde] generated 40/128
|
| 346 |
+
[sde] generated 42/128
|
| 347 |
+
[sde] generated 44/128
|
| 348 |
+
[sde] generated 46/128
|
| 349 |
+
[sde] generated 48/128
|
| 350 |
+
[sde] generated 50/128
|
| 351 |
+
[sde] generated 52/128
|
| 352 |
+
[sde] generated 54/128
|
| 353 |
+
[sde] generated 56/128
|
| 354 |
+
[sde] generated 58/128
|
| 355 |
+
[sde] generated 60/128
|
| 356 |
+
[sde] generated 62/128
|
| 357 |
+
[sde] generated 64/128
|
| 358 |
+
[sde] generated 66/128
|
| 359 |
+
[sde] generated 68/128
|
| 360 |
+
[sde] generated 70/128
|
| 361 |
+
[sde] generated 72/128
|
| 362 |
+
[sde] generated 74/128
|
| 363 |
+
[sde] generated 76/128
|
| 364 |
+
[sde] generated 78/128
|
| 365 |
+
[sde] generated 80/128
|
| 366 |
+
[sde] generated 82/128
|
| 367 |
+
[sde] generated 84/128
|
| 368 |
+
[sde] generated 86/128
|
| 369 |
+
[sde] generated 88/128
|
| 370 |
+
[sde] generated 90/128
|
| 371 |
+
[sde] generated 92/128
|
| 372 |
+
[sde] generated 94/128
|
| 373 |
+
[sde] generated 96/128
|
| 374 |
+
[sde] generated 98/128
|
| 375 |
+
[sde] generated 100/128
|
| 376 |
+
[sde] generated 102/128
|
| 377 |
+
[sde] generated 104/128
|
| 378 |
+
[sde] generated 106/128
|
| 379 |
+
[sde] generated 108/128
|
| 380 |
+
[sde] generated 110/128
|
| 381 |
+
[sde] generated 112/128
|
| 382 |
+
[sde] generated 114/128
|
| 383 |
+
[sde] generated 116/128
|
| 384 |
+
[sde] generated 118/128
|
| 385 |
+
[sde] generated 120/128
|
| 386 |
+
[sde] generated 122/128
|
| 387 |
+
[sde] generated 124/128
|
| 388 |
+
[sde] generated 126/128
|
| 389 |
+
[sde] generated 128/128
|
| 390 |
+
[score] loading scorer: /e2e-data/evad-tech-vla/wanghan58/models/flowtext_scorers/gpt2-large-standard
|
| 391 |
+
[summary] {
|
| 392 |
+
"type": "summary",
|
| 393 |
+
"checkpoint": "runs/lta_owt_t5_absrope_adaln_dirichlet_len1024_Cv_to_2v_mask0p1_1p0_sameT_gbs512_b32_8gpu_1m_save10k_20260525/step_0020000.pt",
|
| 394 |
+
"step": 20000,
|
| 395 |
+
"decode": {
|
| 396 |
+
"decode_rule": "dirichlet_resample_sde",
|
| 397 |
+
"steps": 128,
|
| 398 |
+
"model_t_mode": "support_t",
|
| 399 |
+
"mean_mode": "endpoint_only",
|
| 400 |
+
"anchor_gamma": 1.0,
|
| 401 |
+
"endpoint_floor": 0.0,
|
| 402 |
+
"concentration_min": 32100.0,
|
| 403 |
+
"concentration_max": 64200.0,
|
| 404 |
+
"endpoint_temp": 1.45,
|
| 405 |
+
"endpoint_temp_start": null,
|
| 406 |
+
"endpoint_temp_end": null,
|
| 407 |
+
"endpoint_projection": "gumbel_softmax",
|
| 408 |
+
"endpoint_top_k": 0,
|
| 409 |
+
"endpoint_top_p": 0.95,
|
| 410 |
+
"gumbel_tau_start": 1.0,
|
| 411 |
+
"gumbel_tau_end": 0.2,
|
| 412 |
+
"gumbel_noise_scale_start": 1.0,
|
| 413 |
+
"gumbel_noise_scale_end": 1.0,
|
| 414 |
+
"ban_special_tokens": false,
|
| 415 |
+
"banned_endpoint_ids": [],
|
| 416 |
+
"support_power": 1.0,
|
| 417 |
+
"semantic_power": 1.0,
|
| 418 |
+
"noise_init": "dirichlet",
|
| 419 |
+
"noise_sigma": -1.0,
|
| 420 |
+
"noise_dirichlet_concentration": 32100.0,
|
| 421 |
+
"sde_resample": "dirichlet",
|
| 422 |
+
"logistic_normal_sigma_min": 0.18,
|
| 423 |
+
"logistic_normal_sigma_max": 3.0,
|
| 424 |
+
"logistic_normal_tau_min": 0.65,
|
| 425 |
+
"logistic_normal_tau_max": 1.0,
|
| 426 |
+
"final_from": "blend_0.5",
|
| 427 |
+
"n_samples": 128,
|
| 428 |
+
"seed": 20260524
|
| 429 |
+
},
|
| 430 |
+
"raw_genppl": {
|
| 431 |
+
"ppl": 3.4360435319768396,
|
| 432 |
+
"nll_per_token": 1.234320673418016,
|
| 433 |
+
"tokens": 60733,
|
| 434 |
+
"kept_samples": 128,
|
| 435 |
+
"total_samples": 128,
|
| 436 |
+
"empty_rate": 0.0,
|
| 437 |
+
"skipped_samples": 0
|
| 438 |
+
},
|
| 439 |
+
"stripped_genppl": {
|
| 440 |
+
"ppl": 3.399305871374786,
|
| 441 |
+
"nll_per_token": 1.2235712553015452,
|
| 442 |
+
"tokens": 60637,
|
| 443 |
+
"kept_samples": 128,
|
| 444 |
+
"total_samples": 128,
|
| 445 |
+
"empty_rate": 0.0,
|
| 446 |
+
"skipped_samples": 0
|
| 447 |
+
},
|
| 448 |
+
"diversity": {
|
| 449 |
+
"sample_entropy": 0.8467485464533029,
|
| 450 |
+
"unique_tokens": 298,
|
| 451 |
+
"token_count": 131072,
|
| 452 |
+
"distinct_1": 0.0022735595703125,
|
| 453 |
+
"distinct_2": 0.017037817693059627,
|
| 454 |
+
"top_token_mass": 0.5230484008789062
|
| 455 |
+
}
|
| 456 |
+
}
|
| 457 |
+
[done] docs/lta_samples/metrics_20260525/owt_t5_absrope_adaln_Cv_to_2v_mask0p1_1p0_sameT_sde_gumbel_topp0.95_tau1.0_to_0.2_blend_c32100_64200_n128/lta_owt_t5_absrope_adaln_dirichlet_len1024_Cv_to_2v_mask0p1_1p0_sameT_gbs512_b32_8gpu_1m_save10k_20260525/step_0020000/sde_steps128_samples128_scored.jsonl
|
| 458 |
+
[watch-gumbel] 2026-05-25_21:08:53 done step_0020000
|
LTA_openwebtext_dualt/logs/owt_candidate_catdualt_step246k_64_c1024_t1p2_blend_n64.log
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[forbid_endpoint_ids] n=352 first=[94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125]
|
| 2 |
+
[decode] steps64_c1024_mtpost_t1p2_tpow1p0_noise0_blend_anchored
|
| 3 |
+
[summary] {"name": "steps64_c1024_mtpost_t1p2_tpow1p0_noise0_blend_anchored", "step": 246000, "n_samples": 64, "steps": 64, "concentration_max": 1024.0, "temp_start": 1.2, "temp_end": 1.2, "temp_schedule": "const", "t_power": 1.0, "eta0": 0.0, "eta_schedule": "none", "noise_conc": 1.0, "final_from": "blend", "final_decode": "argmax", "final_temp": 1.0, "final_top_k": 0, "update_rule": "anchored", "model_t_mode": "post", "lock_bos": true, "lock_final_eos": false, "detok_genppl": 153.70967053904752, "sample_entropy": 4.642808948434547, "distinct_1": 0.1154937744140625, "distinct_2": 0.5061858504398827, "top_token_mass": 0.1103973388671875, "tokens_scored": 59240, "readability_score": 4.950578398852453, "mean_chars": 3752.34375, "replacement_chars": 0.0}
|
LTA_openwebtext_dualt/logs/softendpoint_mn_pilot_4gpu/smoke_gpt2_softendpoint_mn_n128_onehot.log
ADDED
|
@@ -0,0 +1,94 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[rank0]: Traceback (most recent call last):
|
| 2 |
+
[rank0]: File "/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/train.py", line 2600, in <module>
|
| 3 |
+
[rank0]: main()
|
| 4 |
+
[rank0]: File "/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/train.py", line 1866, in main
|
| 5 |
+
[rank0]: dataset = CachedWrappedTextSequenceDataset(
|
| 6 |
+
[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 7 |
+
[rank0]: File "/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/flowtext_lab/data.py", line 704, in __init__
|
| 8 |
+
[rank0]: raise ValueError(f"cache max_len={cache_max_len} does not match requested max_len={self.max_len}")
|
| 9 |
+
[rank0]: ValueError: cache max_len=1024 does not match requested max_len=128
|
| 10 |
+
[rank0]:[W516 22:06:53.301173242 ProcessGroupNCCL.cpp:1487] Warning: WARNING: destroy_process_group() was not called before program exit, which can leak resources. For more info, please see https://pytorch.org/docs/stable/distributed.html#shutdown (function operator())
|
| 11 |
+
W0516 22:06:53.771000 470531 torch/distributed/elastic/multiprocessing/api.py:898] Sending process 470536 closing signal SIGTERM
|
| 12 |
+
W0516 22:06:53.772000 470531 torch/distributed/elastic/multiprocessing/api.py:898] Sending process 470537 closing signal SIGTERM
|
| 13 |
+
W0516 22:06:53.773000 470531 torch/distributed/elastic/multiprocessing/api.py:898] Sending process 470538 closing signal SIGTERM
|
| 14 |
+
E0516 22:06:53.950000 470531 torch/distributed/elastic/multiprocessing/api.py:870] failed (exitcode: 1) local_rank: 0 (pid: 470535) of binary: /usr/bin/python
|
| 15 |
+
Traceback (most recent call last):
|
| 16 |
+
File "<frozen runpy>", line 198, in _run_module_as_main
|
| 17 |
+
File "<frozen runpy>", line 88, in _run_code
|
| 18 |
+
File "/usr/local/lib/python3.12/dist-packages/torch/distributed/run.py", line 922, in <module>
|
| 19 |
+
main()
|
| 20 |
+
File "/usr/local/lib/python3.12/dist-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 355, in wrapper
|
| 21 |
+
return f(*args, **kwargs)
|
| 22 |
+
^^^^^^^^^^^^^^^^^^
|
| 23 |
+
File "/usr/local/lib/python3.12/dist-packages/torch/distributed/run.py", line 918, in main
|
| 24 |
+
run(args)
|
| 25 |
+
File "/usr/local/lib/python3.12/dist-packages/torch/distributed/run.py", line 909, in run
|
| 26 |
+
elastic_launch(
|
| 27 |
+
File "/usr/local/lib/python3.12/dist-packages/torch/distributed/launcher/api.py", line 139, in __call__
|
| 28 |
+
return launch_agent(self._config, self._entrypoint, list(args))
|
| 29 |
+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 30 |
+
File "/usr/local/lib/python3.12/dist-packages/torch/distributed/launcher/api.py", line 270, in launch_agent
|
| 31 |
+
raise ChildFailedError(
|
| 32 |
+
torch.distributed.elastic.multiprocessing.errors.ChildFailedError:
|
| 33 |
+
============================================================
|
| 34 |
+
train.py FAILED
|
| 35 |
+
------------------------------------------------------------
|
| 36 |
+
Failures:
|
| 37 |
+
<NO_OTHER_FAILURES>
|
| 38 |
+
------------------------------------------------------------
|
| 39 |
+
Root Cause (first observed failure):
|
| 40 |
+
[0]:
|
| 41 |
+
time : 2026-05-16_22:06:53
|
| 42 |
+
host : localhost
|
| 43 |
+
rank : 0 (local_rank: 0)
|
| 44 |
+
exitcode : 1 (pid: 470535)
|
| 45 |
+
error_file: <N/A>
|
| 46 |
+
traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
|
| 47 |
+
============================================================
|
| 48 |
+
[rank0]: Traceback (most recent call last):
|
| 49 |
+
[rank0]: File "/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/train.py", line 2600, in <module>
|
| 50 |
+
[rank0]: main()
|
| 51 |
+
[rank0]: File "/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/train.py", line 1866, in main
|
| 52 |
+
[rank0]: dataset = CachedWrappedTextSequenceDataset(
|
| 53 |
+
[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 54 |
+
[rank0]: File "/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/flowtext_lab/data.py", line 695, in __init__
|
| 55 |
+
[rank0]: raise RuntimeError(
|
| 56 |
+
[rank0]: RuntimeError: cached OWT chunks not found under /e2e-data/evad-tech-vla/wanghan58/data/small_benchmarks/langflow_2604_11748/openwebtext_lta_cached_chunks/gpt2_len128_train_minus_100k; build them first or set --owt_chunk_cache_rebuild on rank 0
|
| 57 |
+
[rank0]:[W516 22:07:24.000487031 ProcessGroupNCCL.cpp:1487] Warning: WARNING: destroy_process_group() was not called before program exit, which can leak resources. For more info, please see https://pytorch.org/docs/stable/distributed.html#shutdown (function operator())
|
| 58 |
+
W0516 22:07:24.408000 470659 torch/distributed/elastic/multiprocessing/api.py:898] Sending process 470664 closing signal SIGTERM
|
| 59 |
+
W0516 22:07:24.409000 470659 torch/distributed/elastic/multiprocessing/api.py:898] Sending process 470665 closing signal SIGTERM
|
| 60 |
+
W0516 22:07:24.409000 470659 torch/distributed/elastic/multiprocessing/api.py:898] Sending process 470666 closing signal SIGTERM
|
| 61 |
+
E0516 22:07:24.587000 470659 torch/distributed/elastic/multiprocessing/api.py:870] failed (exitcode: 1) local_rank: 0 (pid: 470663) of binary: /usr/bin/python
|
| 62 |
+
Traceback (most recent call last):
|
| 63 |
+
File "<frozen runpy>", line 198, in _run_module_as_main
|
| 64 |
+
File "<frozen runpy>", line 88, in _run_code
|
| 65 |
+
File "/usr/local/lib/python3.12/dist-packages/torch/distributed/run.py", line 922, in <module>
|
| 66 |
+
main()
|
| 67 |
+
File "/usr/local/lib/python3.12/dist-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 355, in wrapper
|
| 68 |
+
return f(*args, **kwargs)
|
| 69 |
+
^^^^^^^^^^^^^^^^^^
|
| 70 |
+
File "/usr/local/lib/python3.12/dist-packages/torch/distributed/run.py", line 918, in main
|
| 71 |
+
run(args)
|
| 72 |
+
File "/usr/local/lib/python3.12/dist-packages/torch/distributed/run.py", line 909, in run
|
| 73 |
+
elastic_launch(
|
| 74 |
+
File "/usr/local/lib/python3.12/dist-packages/torch/distributed/launcher/api.py", line 139, in __call__
|
| 75 |
+
return launch_agent(self._config, self._entrypoint, list(args))
|
| 76 |
+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 77 |
+
File "/usr/local/lib/python3.12/dist-packages/torch/distributed/launcher/api.py", line 270, in launch_agent
|
| 78 |
+
raise ChildFailedError(
|
| 79 |
+
torch.distributed.elastic.multiprocessing.errors.ChildFailedError:
|
| 80 |
+
============================================================
|
| 81 |
+
train.py FAILED
|
| 82 |
+
------------------------------------------------------------
|
| 83 |
+
Failures:
|
| 84 |
+
<NO_OTHER_FAILURES>
|
| 85 |
+
------------------------------------------------------------
|
| 86 |
+
Root Cause (first observed failure):
|
| 87 |
+
[0]:
|
| 88 |
+
time : 2026-05-16_22:07:24
|
| 89 |
+
host : localhost
|
| 90 |
+
rank : 0 (local_rank: 0)
|
| 91 |
+
exitcode : 1 (pid: 470663)
|
| 92 |
+
error_file: <N/A>
|
| 93 |
+
traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
|
| 94 |
+
============================================================
|
LTA_openwebtext_dualt/logs/softendpoint_mn_pilot_4gpu/train8_combo_len256_dirichlet_unigram_shared_highC_20260517_170456.log
ADDED
|
@@ -0,0 +1,197 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
NCCL version 2.25.1+cuda12.8
|
| 2 |
+
{
|
| 3 |
+
"device": "cuda:0",
|
| 4 |
+
"rank": 0,
|
| 5 |
+
"world_size": 4,
|
| 6 |
+
"samples": "owt_cached_chunks:8",
|
| 7 |
+
"vocab_size": 969,
|
| 8 |
+
"tokenizer_vocab_size": 50257,
|
| 9 |
+
"save_dir": "runs/train8_combo_len256_dirichlet_unigram_shared_highC_20260517_170456",
|
| 10 |
+
"batch_size": 128,
|
| 11 |
+
"grad_accum": 1,
|
| 12 |
+
"effective_batch_size": 512,
|
| 13 |
+
"global_batch_size": 512,
|
| 14 |
+
"lr_schedule": "constant_warmup",
|
| 15 |
+
"optimizer": "muon",
|
| 16 |
+
"epochs": 0.0,
|
| 17 |
+
"steps_per_epoch": 1,
|
| 18 |
+
"total_steps": 1000,
|
| 19 |
+
"warmup_steps": 10,
|
| 20 |
+
"warmup_epochs": -1.0,
|
| 21 |
+
"min_lr": 0.0,
|
| 22 |
+
"weight_decay": 0.1,
|
| 23 |
+
"output_weight_decay": -1.0,
|
| 24 |
+
"adamw_param_groups": "nanogpt",
|
| 25 |
+
"adam_beta1": 0.9,
|
| 26 |
+
"adam_beta2": 0.95,
|
| 27 |
+
"adam_eps": 1e-08,
|
| 28 |
+
"muon_impl": "legacy",
|
| 29 |
+
"muon_momentum": 0.95,
|
| 30 |
+
"muon_ns_steps": 5,
|
| 31 |
+
"muon_update_scale": 1.0,
|
| 32 |
+
"muon_nesterov": false,
|
| 33 |
+
"muon_width_scale": false,
|
| 34 |
+
"muon_grouping": "legacy_dim_ge_2",
|
| 35 |
+
"muon_param_count": 1965440,
|
| 36 |
+
"muon_adam_param_count": 8192,
|
| 37 |
+
"muon_param_names": [
|
| 38 |
+
"vocab_embed.embedding",
|
| 39 |
+
"sigma_map.net.0.weight",
|
| 40 |
+
"sigma_map.net.2.weight",
|
| 41 |
+
"blocks.0.attn_qkv.weight",
|
| 42 |
+
"blocks.0.attn_out.weight",
|
| 43 |
+
"blocks.0.mlp.0.weight",
|
| 44 |
+
"blocks.0.mlp.2.weight",
|
| 45 |
+
"blocks.0.adaLN_modulation.weight",
|
| 46 |
+
"blocks.1.attn_qkv.weight",
|
| 47 |
+
"blocks.1.attn_out.weight",
|
| 48 |
+
"blocks.1.mlp.0.weight",
|
| 49 |
+
"blocks.1.mlp.2.weight",
|
| 50 |
+
"blocks.1.adaLN_modulation.weight",
|
| 51 |
+
"blocks.2.attn_qkv.weight",
|
| 52 |
+
"blocks.2.attn_out.weight",
|
| 53 |
+
"blocks.2.mlp.0.weight",
|
| 54 |
+
"blocks.2.mlp.2.weight",
|
| 55 |
+
"blocks.2.adaLN_modulation.weight",
|
| 56 |
+
"output_layer.linear.weight",
|
| 57 |
+
"output_layer.adaLN_modulation.weight"
|
| 58 |
+
],
|
| 59 |
+
"muon_adam_param_names": [
|
| 60 |
+
"sigma_map.net.0.bias",
|
| 61 |
+
"sigma_map.net.2.bias",
|
| 62 |
+
"blocks.0.norm1.weight",
|
| 63 |
+
"blocks.0.norm2.weight",
|
| 64 |
+
"blocks.0.mlp.0.bias",
|
| 65 |
+
"blocks.0.mlp.2.bias",
|
| 66 |
+
"blocks.0.adaLN_modulation.bias",
|
| 67 |
+
"blocks.1.norm1.weight",
|
| 68 |
+
"blocks.1.norm2.weight",
|
| 69 |
+
"blocks.1.mlp.0.bias",
|
| 70 |
+
"blocks.1.mlp.2.bias",
|
| 71 |
+
"blocks.1.adaLN_modulation.bias",
|
| 72 |
+
"blocks.2.norm1.weight",
|
| 73 |
+
"blocks.2.norm2.weight",
|
| 74 |
+
"blocks.2.mlp.0.bias",
|
| 75 |
+
"blocks.2.mlp.2.bias",
|
| 76 |
+
"blocks.2.adaLN_modulation.bias",
|
| 77 |
+
"output_layer.norm_final.weight",
|
| 78 |
+
"output_layer.adaLN_modulation.bias"
|
| 79 |
+
],
|
| 80 |
+
"muon_effective_nesterov": false,
|
| 81 |
+
"muon_effective_width_scale": false,
|
| 82 |
+
"muon_effective_weight_decay": 0.1,
|
| 83 |
+
"muon_adam_fallback_nesterov": false,
|
| 84 |
+
"muon_adam_fallback_weight_decay": 0.1,
|
| 85 |
+
"ema_decay": 0.9999,
|
| 86 |
+
"ema_start_step": 0,
|
| 87 |
+
"model_type": "ddit",
|
| 88 |
+
"ddit_mlp_type": "gelu",
|
| 89 |
+
"elf_num_time_tokens": 4,
|
| 90 |
+
"elf_num_model_mode_tokens": 0,
|
| 91 |
+
"qk_norm": true,
|
| 92 |
+
"output_bias": false,
|
| 93 |
+
"output_init_std": -1.0,
|
| 94 |
+
"norm_type": "rmsnorm",
|
| 95 |
+
"target_loss": "hard_ce",
|
| 96 |
+
"linear_soft_target_power": 1.0,
|
| 97 |
+
"linear_soft_target_min_conf": 0.0,
|
| 98 |
+
"linear_soft_target_max_conf": 1.0,
|
| 99 |
+
"t_sampling_mode": "logit_normal",
|
| 100 |
+
"t_sampling_power": 1.0,
|
| 101 |
+
"t_sampling_eps": 0.0001,
|
| 102 |
+
"t_sampling_logit_mean": -1.5,
|
| 103 |
+
"t_sampling_logit_std": 0.8,
|
| 104 |
+
"dual_t": true,
|
| 105 |
+
"corrupt_t_mode": "same",
|
| 106 |
+
"corrupt_min_t": 0.0,
|
| 107 |
+
"corrupt_max_t": 1.0,
|
| 108 |
+
"prefix_block_prob": 0.0,
|
| 109 |
+
"prefix_block_len": 128,
|
| 110 |
+
"mask_ratio_floor_schedule": "none",
|
| 111 |
+
"dirichlet_endpoint_mode": "categorical_dual_t",
|
| 112 |
+
"dirichlet_semantic_t_mode": "same",
|
| 113 |
+
"dirichlet_semantic_t_value": 0.0,
|
| 114 |
+
"dirichlet_semantic_t_curve": "linear",
|
| 115 |
+
"dirichlet_semantic_t_power": 1.0,
|
| 116 |
+
"endpoint_sequence_random_prob_alpha": 0.0,
|
| 117 |
+
"categorical_wrong_from_full_vocab": true,
|
| 118 |
+
"categorical_wrong_from_batch_valid_tokens": false,
|
| 119 |
+
"categorical_wrong_basin_token_ids": "",
|
| 120 |
+
"categorical_wrong_basin_prob": 0.0,
|
| 121 |
+
"categorical_wrong_unigram_prob": 1.0,
|
| 122 |
+
"categorical_wrong_uniform_prob": 0.0,
|
| 123 |
+
"categorical_wrong_corpus_unigram_path": "",
|
| 124 |
+
"categorical_wrong_corpus_unigram_alpha": 1.0,
|
| 125 |
+
"categorical_wrong_basin_shared_prob": 0.0,
|
| 126 |
+
"categorical_wrong_unigram_shared_prob": 0.5,
|
| 127 |
+
"mask_mixture_original_prob": 0.0,
|
| 128 |
+
"mask_mixture_lowk_prob": 0.0,
|
| 129 |
+
"mask_mixture_lowcorrupt_prob": 0.0,
|
| 130 |
+
"mask_mixture_block_prob": 0.0,
|
| 131 |
+
"mask_mixture_all_prob": 1.0,
|
| 132 |
+
"mask_mixture_lowk_clean_tokens": "0",
|
| 133 |
+
"mask_mixture_lowcorrupt_tokens": "1,2,4,8,16,32,64",
|
| 134 |
+
"mask_mixture_block_tokens": "64,128",
|
| 135 |
+
"simplex_bridge_sampler": "dirichlet",
|
| 136 |
+
"logistic_normal_sigma_min": 0.1,
|
| 137 |
+
"logistic_normal_sigma_max": 1.0,
|
| 138 |
+
"logistic_normal_tau_min": 1.0,
|
| 139 |
+
"logistic_normal_tau_max": 1.0,
|
| 140 |
+
"torch_compile": false,
|
| 141 |
+
"compile_mode": "max-autotune",
|
| 142 |
+
"state_format": "prob",
|
| 143 |
+
"meanflow_weight": 0.0,
|
| 144 |
+
"rollout_train_prob": 0.0,
|
| 145 |
+
"rollout_train_steps": 1,
|
| 146 |
+
"rollout_train_infer_steps": 64,
|
| 147 |
+
"rollout_train_temp": 1.45,
|
| 148 |
+
"rollout_train_max_gamma": 1.0,
|
| 149 |
+
"rollout_train_corrupt_only": true,
|
| 150 |
+
"rollout_train_samplewise": false,
|
| 151 |
+
"rollout_train_compute_always": false,
|
| 152 |
+
"bridge_noise_init": "logistic_normal",
|
| 153 |
+
"noise_sigma": -1.0,
|
| 154 |
+
"allow_tf32": true,
|
| 155 |
+
"activation_checkpointing": false,
|
| 156 |
+
"activation_checkpoint_interval": 1,
|
| 157 |
+
"activation_checkpoint_scope": "block",
|
| 158 |
+
"ddp_static_graph": false,
|
| 159 |
+
"ddp_gradient_as_bucket_view": true,
|
| 160 |
+
"blocking_data_transfer": false,
|
| 161 |
+
"dataloader_prefetch_factor": 4,
|
| 162 |
+
"full_train_stats": false,
|
| 163 |
+
"tokenized_hf": false,
|
| 164 |
+
"tokenized_pad_token": "pad",
|
| 165 |
+
"elf_conditional_hf": false,
|
| 166 |
+
"record_pad_truncate": false,
|
| 167 |
+
"record_add_eos": false,
|
| 168 |
+
"record_add_special_tokens": false,
|
| 169 |
+
"record_pad_token": "pad",
|
| 170 |
+
"record_shuffle_buffer": 10000,
|
| 171 |
+
"wrap": true,
|
| 172 |
+
"wrap_mode": "stream",
|
| 173 |
+
"wrap_record_buffer_size": 200,
|
| 174 |
+
"owt_cached_chunks": true,
|
| 175 |
+
"owt_chunk_cache_dir": "/e2e-data/evad-tech-vla/wanghan58/data/small_benchmarks/langflow_2604_11748/openwebtext_lta_cached_chunks/gpt2_len256_train8_compact_overfit",
|
| 176 |
+
"owt_chunk_cache_rebuild": false,
|
| 177 |
+
"owt_chunk_cache_write_batch": 4096,
|
| 178 |
+
"owt_exact_repeat_per_chunk": 64,
|
| 179 |
+
"online_chunk_shuffle": false,
|
| 180 |
+
"online_chunk_shuffle_buffer": 10000,
|
| 181 |
+
"openwebtext_split": "train_minus_100k",
|
| 182 |
+
"detokenizer": "auto",
|
| 183 |
+
"resolved_detokenizer": null,
|
| 184 |
+
"num_workers": 0,
|
| 185 |
+
"latest_every": 1000,
|
| 186 |
+
"resume_path": ""
|
| 187 |
+
}
|
| 188 |
+
step=100 epoch=100/1000 epoch_step=1/1 micro_steps=100 elapsed=4.7s lr=2.000000e-03 loss=6.6629 loss_recon=6.6629 loss_meanflow=0.0000 mean_model_t=0.2078 mean_corrupt_t=0.2078 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.1701 corrupt_frac=1.0000 acc_corrupt=0.1701 loss_corrupt=6.6629 wrong_frac=0.7922 init_acc_corrupt=0.2054 acc_corrupt_t_0p0_0p2=0.0802 corrupt_frac_t_0p0_0p2=0.5620 acc_corrupt_t_0p2_0p4=0.2399 corrupt_frac_t_0p2_0p4=0.3533 acc_corrupt_t_0p4_0p6=0.4568 corrupt_frac_t_0p4_0p6=0.0755 acc_corrupt_t_0p6_0p8=0.6274 corrupt_frac_t_0p6_0p8=0.0117 out_w_norm=1.2395 out_g_norm=0.9648 acc_corrupt_t_0p8_1p0=0.9141 corrupt_frac_t_0p8_1p0=0.0078 loss_all=6.3374 init_gold_top10=0.2275 init_gold_top100=0.2997
|
| 189 |
+
step=200 epoch=200/1000 epoch_step=1/1 micro_steps=200 elapsed=4.0s lr=2.000000e-03 loss=5.9494 loss_recon=5.9494 loss_meanflow=0.0000 mean_model_t=0.2081 mean_corrupt_t=0.2081 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.1795 corrupt_frac=1.0000 acc_corrupt=0.1795 loss_corrupt=5.9494 wrong_frac=0.7916 init_acc_corrupt=0.2060 acc_corrupt_t_0p0_0p2=0.0927 corrupt_frac_t_0p0_0p2=0.5577 acc_corrupt_t_0p2_0p4=0.2447 corrupt_frac_t_0p2_0p4=0.3599 acc_corrupt_t_0p4_0p6=0.4631 corrupt_frac_t_0p4_0p6=0.0744 acc_corrupt_t_0p6_0p8=0.6568 corrupt_frac_t_0p6_0p8=0.0126 acc_corrupt_t_0p8_1p0=0.8105 corrupt_frac_t_0p8_1p0=0.0078 out_w_norm=4.1195 out_g_norm=1.3169 loss_all=5.5816 init_gold_top10=0.2191 init_gold_top100=0.2940
|
| 190 |
+
step=300 epoch=300/1000 epoch_step=1/1 micro_steps=300 elapsed=4.0s lr=2.000000e-03 loss=5.3088 loss_recon=5.3088 loss_meanflow=0.0000 mean_model_t=0.2105 mean_corrupt_t=0.2105 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.2092 corrupt_frac=1.0000 acc_corrupt=0.2092 loss_corrupt=5.3088 wrong_frac=0.7894 init_acc_corrupt=0.2082 acc_corrupt_t_0p0_0p2=0.1143 corrupt_frac_t_0p0_0p2=0.5542 acc_corrupt_t_0p2_0p4=0.2854 corrupt_frac_t_0p2_0p4=0.3559 acc_corrupt_t_0p4_0p6=0.4737 corrupt_frac_t_0p4_0p6=0.0813 acc_corrupt_t_0p6_0p8=0.6627 corrupt_frac_t_0p6_0p8=0.0125 out_w_norm=6.7925 out_g_norm=0.5991 acc_corrupt_t_0p8_1p0=0.8750 corrupt_frac_t_0p8_1p0=0.0078 loss_all=5.0754 init_gold_top10=0.2283 init_gold_top100=0.3011
|
| 191 |
+
step=400 epoch=400/1000 epoch_step=1/1 micro_steps=400 elapsed=4.0s lr=2.000000e-03 loss=5.0142 loss_recon=5.0142 loss_meanflow=0.0000 mean_model_t=0.2085 mean_corrupt_t=0.2085 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.2180 corrupt_frac=1.0000 acc_corrupt=0.2180 loss_corrupt=5.0142 wrong_frac=0.7915 init_acc_corrupt=0.2061 acc_corrupt_t_0p0_0p2=0.1264 corrupt_frac_t_0p0_0p2=0.5597 acc_corrupt_t_0p2_0p4=0.2952 corrupt_frac_t_0p2_0p4=0.3559 acc_corrupt_t_0p4_0p6=0.4799 corrupt_frac_t_0p4_0p6=0.0752 acc_corrupt_t_0p6_0p8=0.6613 corrupt_frac_t_0p6_0p8=0.0131 out_w_norm=8.4766 out_g_norm=0.3326 acc_corrupt_t_0p8_1p0=0.7930 corrupt_frac_t_0p8_1p0=0.0078 loss_all=4.7706 init_gold_top10=0.2298 init_gold_top100=0.3036
|
| 192 |
+
step=500 epoch=500/1000 epoch_step=1/1 micro_steps=500 elapsed=4.0s lr=2.000000e-03 loss=4.5487 loss_recon=4.5487 loss_meanflow=0.0000 mean_model_t=0.2084 mean_corrupt_t=0.2084 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.2257 corrupt_frac=1.0000 acc_corrupt=0.2257 loss_corrupt=4.5487 wrong_frac=0.7914 init_acc_corrupt=0.2062 acc_corrupt_t_0p0_0p2=0.1340 corrupt_frac_t_0p0_0p2=0.5616 acc_corrupt_t_0p2_0p4=0.3049 corrupt_frac_t_0p2_0p4=0.3550 acc_corrupt_t_0p4_0p6=0.4866 corrupt_frac_t_0p4_0p6=0.0741 acc_corrupt_t_0p6_0p8=0.6614 corrupt_frac_t_0p6_0p8=0.0143 out_w_norm=9.7624 out_g_norm=0.4423 loss_all=4.2801 init_gold_top10=0.2068 init_gold_top100=0.2830
|
| 193 |
+
step=600 epoch=600/1000 epoch_step=1/1 micro_steps=600 elapsed=4.0s lr=2.000000e-03 loss=3.8447 loss_recon=3.8447 loss_meanflow=0.0000 mean_model_t=0.2084 mean_corrupt_t=0.2084 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.2370 corrupt_frac=1.0000 acc_corrupt=0.2370 loss_corrupt=3.8447 wrong_frac=0.7917 init_acc_corrupt=0.2059 acc_corrupt_t_0p0_0p2=0.1444 corrupt_frac_t_0p0_0p2=0.5568 acc_corrupt_t_0p2_0p4=0.3165 corrupt_frac_t_0p2_0p4=0.3606 acc_corrupt_t_0p4_0p6=0.4950 corrupt_frac_t_0p4_0p6=0.0742 acc_corrupt_t_0p6_0p8=0.6750 corrupt_frac_t_0p6_0p8=0.0123 out_w_norm=10.1909 out_g_norm=0.4588 loss_all=3.5868 init_gold_top10=0.1960 init_gold_top100=0.2722
|
| 194 |
+
step=700 epoch=700/1000 epoch_step=1/1 micro_steps=700 elapsed=4.0s lr=2.000000e-03 loss=3.1110 loss_recon=3.1110 loss_meanflow=0.0000 mean_model_t=0.2079 mean_corrupt_t=0.2079 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.2652 corrupt_frac=1.0000 acc_corrupt=0.2652 loss_corrupt=3.1110 wrong_frac=0.7925 init_acc_corrupt=0.2051 acc_corrupt_t_0p0_0p2=0.1631 corrupt_frac_t_0p0_0p2=0.5607 acc_corrupt_t_0p2_0p4=0.3575 corrupt_frac_t_0p2_0p4=0.3568 acc_corrupt_t_0p4_0p6=0.5431 corrupt_frac_t_0p4_0p6=0.0739 acc_corrupt_t_0p6_0p8=0.7116 corrupt_frac_t_0p6_0p8=0.0132 out_w_norm=10.4454 out_g_norm=0.5395 loss_all=2.6425 init_gold_top10=0.2289 init_gold_top100=0.3008
|
| 195 |
+
step=800 epoch=800/1000 epoch_step=1/1 micro_steps=800 elapsed=4.0s lr=2.000000e-03 loss=2.2335 loss_recon=2.2335 loss_meanflow=0.0000 mean_model_t=0.2099 mean_corrupt_t=0.2099 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.3921 corrupt_frac=1.0000 acc_corrupt=0.3921 loss_corrupt=2.2335 wrong_frac=0.7904 init_acc_corrupt=0.2072 acc_corrupt_t_0p0_0p2=0.2394 corrupt_frac_t_0p0_0p2=0.5505 acc_corrupt_t_0p2_0p4=0.5444 corrupt_frac_t_0p2_0p4=0.3649 acc_corrupt_t_0p4_0p6=0.7185 corrupt_frac_t_0p4_0p6=0.0765 out_w_norm=10.7231 out_g_norm=0.6774 acc_corrupt_t_0p6_0p8=0.8264 corrupt_frac_t_0p6_0p8=0.0123 acc_corrupt_t_0p8_1p0=0.9297 corrupt_frac_t_0p8_1p0=0.0078 loss_all=1.8154 init_gold_top10=0.2095 init_gold_top100=0.2845
|
| 196 |
+
step=900 epoch=900/1000 epoch_step=1/1 micro_steps=900 elapsed=4.0s lr=2.000000e-03 loss=1.4273 loss_recon=1.4273 loss_meanflow=0.0000 mean_model_t=0.2102 mean_corrupt_t=0.2102 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.6031 corrupt_frac=1.0000 acc_corrupt=0.6031 loss_corrupt=1.4273 wrong_frac=0.7901 init_acc_corrupt=0.2076 acc_corrupt_t_0p0_0p2=0.4064 corrupt_frac_t_0p0_0p2=0.5513 acc_corrupt_t_0p2_0p4=0.8248 corrupt_frac_t_0p2_0p4=0.3645 acc_corrupt_t_0p4_0p6=0.9279 corrupt_frac_t_0p4_0p6=0.0755 out_w_norm=10.9722 out_g_norm=0.6506 acc_corrupt_t_0p6_0p8=0.9541 corrupt_frac_t_0p6_0p8=0.0127 acc_corrupt_t_0p8_1p0=0.9629 corrupt_frac_t_0p8_1p0=0.0078 loss_all=1.1917 init_gold_top10=0.2077 init_gold_top100=0.2817
|
| 197 |
+
step=1000 epoch=1000/1000 epoch_step=1/1 micro_steps=1000 elapsed=4.0s lr=2.000000e-03 loss=0.9388 loss_recon=0.9388 loss_meanflow=0.0000 mean_model_t=0.2101 mean_corrupt_t=0.2101 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.7497 corrupt_frac=1.0000 acc_corrupt=0.7497 loss_corrupt=0.9388 wrong_frac=0.7897 init_acc_corrupt=0.2080 acc_corrupt_t_0p0_0p2=0.5762 corrupt_frac_t_0p0_0p2=0.5591 acc_corrupt_t_0p2_0p4=0.9639 corrupt_frac_t_0p2_0p4=0.3526 acc_corrupt_t_0p4_0p6=0.9922 corrupt_frac_t_0p4_0p6=0.0791 acc_corrupt_t_0p8_1p0=0.9941 corrupt_frac_t_0p8_1p0=0.0078 out_w_norm=11.2383 out_g_norm=0.6494 acc_corrupt_t_0p6_0p8=0.9947 corrupt_frac_t_0p6_0p8=0.0133 loss_all=0.7538 init_gold_top10=0.2115 init_gold_top100=0.2852
|
LTA_openwebtext_dualt/logs/softendpoint_mn_pilot_4gpu/train8_combo_len256_logistic_unigram_shared_highC_seqrand_20260517_170456.log
ADDED
|
@@ -0,0 +1,395 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
NCCL version 2.25.1+cuda12.8
|
| 2 |
+
{
|
| 3 |
+
"device": "cuda:0",
|
| 4 |
+
"rank": 0,
|
| 5 |
+
"world_size": 4,
|
| 6 |
+
"samples": "owt_cached_chunks:8",
|
| 7 |
+
"vocab_size": 969,
|
| 8 |
+
"tokenizer_vocab_size": 50257,
|
| 9 |
+
"save_dir": "runs/train8_combo_len256_logistic_unigram_shared_highC_seqrand_20260517_170456",
|
| 10 |
+
"batch_size": 128,
|
| 11 |
+
"grad_accum": 1,
|
| 12 |
+
"effective_batch_size": 512,
|
| 13 |
+
"global_batch_size": 512,
|
| 14 |
+
"lr_schedule": "constant_warmup",
|
| 15 |
+
"optimizer": "muon",
|
| 16 |
+
"epochs": 0.0,
|
| 17 |
+
"steps_per_epoch": 1,
|
| 18 |
+
"total_steps": 1000,
|
| 19 |
+
"warmup_steps": 10,
|
| 20 |
+
"warmup_epochs": -1.0,
|
| 21 |
+
"min_lr": 0.0,
|
| 22 |
+
"weight_decay": 0.1,
|
| 23 |
+
"output_weight_decay": -1.0,
|
| 24 |
+
"adamw_param_groups": "nanogpt",
|
| 25 |
+
"adam_beta1": 0.9,
|
| 26 |
+
"adam_beta2": 0.95,
|
| 27 |
+
"adam_eps": 1e-08,
|
| 28 |
+
"muon_impl": "legacy",
|
| 29 |
+
"muon_momentum": 0.95,
|
| 30 |
+
"muon_ns_steps": 5,
|
| 31 |
+
"muon_update_scale": 1.0,
|
| 32 |
+
"muon_nesterov": false,
|
| 33 |
+
"muon_width_scale": false,
|
| 34 |
+
"muon_grouping": "legacy_dim_ge_2",
|
| 35 |
+
"muon_param_count": 1965440,
|
| 36 |
+
"muon_adam_param_count": 8192,
|
| 37 |
+
"muon_param_names": [
|
| 38 |
+
"vocab_embed.embedding",
|
| 39 |
+
"sigma_map.net.0.weight",
|
| 40 |
+
"sigma_map.net.2.weight",
|
| 41 |
+
"blocks.0.attn_qkv.weight",
|
| 42 |
+
"blocks.0.attn_out.weight",
|
| 43 |
+
"blocks.0.mlp.0.weight",
|
| 44 |
+
"blocks.0.mlp.2.weight",
|
| 45 |
+
"blocks.0.adaLN_modulation.weight",
|
| 46 |
+
"blocks.1.attn_qkv.weight",
|
| 47 |
+
"blocks.1.attn_out.weight",
|
| 48 |
+
"blocks.1.mlp.0.weight",
|
| 49 |
+
"blocks.1.mlp.2.weight",
|
| 50 |
+
"blocks.1.adaLN_modulation.weight",
|
| 51 |
+
"blocks.2.attn_qkv.weight",
|
| 52 |
+
"blocks.2.attn_out.weight",
|
| 53 |
+
"blocks.2.mlp.0.weight",
|
| 54 |
+
"blocks.2.mlp.2.weight",
|
| 55 |
+
"blocks.2.adaLN_modulation.weight",
|
| 56 |
+
"output_layer.linear.weight",
|
| 57 |
+
"output_layer.adaLN_modulation.weight"
|
| 58 |
+
],
|
| 59 |
+
"muon_adam_param_names": [
|
| 60 |
+
"sigma_map.net.0.bias",
|
| 61 |
+
"sigma_map.net.2.bias",
|
| 62 |
+
"blocks.0.norm1.weight",
|
| 63 |
+
"blocks.0.norm2.weight",
|
| 64 |
+
"blocks.0.mlp.0.bias",
|
| 65 |
+
"blocks.0.mlp.2.bias",
|
| 66 |
+
"blocks.0.adaLN_modulation.bias",
|
| 67 |
+
"blocks.1.norm1.weight",
|
| 68 |
+
"blocks.1.norm2.weight",
|
| 69 |
+
"blocks.1.mlp.0.bias",
|
| 70 |
+
"blocks.1.mlp.2.bias",
|
| 71 |
+
"blocks.1.adaLN_modulation.bias",
|
| 72 |
+
"blocks.2.norm1.weight",
|
| 73 |
+
"blocks.2.norm2.weight",
|
| 74 |
+
"blocks.2.mlp.0.bias",
|
| 75 |
+
"blocks.2.mlp.2.bias",
|
| 76 |
+
"blocks.2.adaLN_modulation.bias",
|
| 77 |
+
"output_layer.norm_final.weight",
|
| 78 |
+
"output_layer.adaLN_modulation.bias"
|
| 79 |
+
],
|
| 80 |
+
"muon_effective_nesterov": false,
|
| 81 |
+
"muon_effective_width_scale": false,
|
| 82 |
+
"muon_effective_weight_decay": 0.1,
|
| 83 |
+
"muon_adam_fallback_nesterov": false,
|
| 84 |
+
"muon_adam_fallback_weight_decay": 0.1,
|
| 85 |
+
"ema_decay": 0.9999,
|
| 86 |
+
"ema_start_step": 0,
|
| 87 |
+
"model_type": "ddit",
|
| 88 |
+
"ddit_mlp_type": "gelu",
|
| 89 |
+
"elf_num_time_tokens": 4,
|
| 90 |
+
"elf_num_model_mode_tokens": 0,
|
| 91 |
+
"qk_norm": true,
|
| 92 |
+
"output_bias": false,
|
| 93 |
+
"output_init_std": -1.0,
|
| 94 |
+
"norm_type": "rmsnorm",
|
| 95 |
+
"target_loss": "hard_ce",
|
| 96 |
+
"linear_soft_target_power": 1.0,
|
| 97 |
+
"linear_soft_target_min_conf": 0.0,
|
| 98 |
+
"linear_soft_target_max_conf": 1.0,
|
| 99 |
+
"t_sampling_mode": "logit_normal",
|
| 100 |
+
"t_sampling_power": 1.0,
|
| 101 |
+
"t_sampling_eps": 0.0001,
|
| 102 |
+
"t_sampling_logit_mean": -1.5,
|
| 103 |
+
"t_sampling_logit_std": 0.8,
|
| 104 |
+
"dual_t": true,
|
| 105 |
+
"corrupt_t_mode": "same",
|
| 106 |
+
"corrupt_min_t": 0.0,
|
| 107 |
+
"corrupt_max_t": 1.0,
|
| 108 |
+
"prefix_block_prob": 0.0,
|
| 109 |
+
"prefix_block_len": 128,
|
| 110 |
+
"mask_ratio_floor_schedule": "none",
|
| 111 |
+
"dirichlet_endpoint_mode": "categorical_dual_t",
|
| 112 |
+
"dirichlet_semantic_t_mode": "same",
|
| 113 |
+
"dirichlet_semantic_t_value": 0.0,
|
| 114 |
+
"dirichlet_semantic_t_curve": "linear",
|
| 115 |
+
"dirichlet_semantic_t_power": 1.0,
|
| 116 |
+
"endpoint_sequence_random_prob_alpha": 0.5,
|
| 117 |
+
"categorical_wrong_from_full_vocab": true,
|
| 118 |
+
"categorical_wrong_from_batch_valid_tokens": false,
|
| 119 |
+
"categorical_wrong_basin_token_ids": "",
|
| 120 |
+
"categorical_wrong_basin_prob": 0.0,
|
| 121 |
+
"categorical_wrong_unigram_prob": 1.0,
|
| 122 |
+
"categorical_wrong_uniform_prob": 0.0,
|
| 123 |
+
"categorical_wrong_corpus_unigram_path": "",
|
| 124 |
+
"categorical_wrong_corpus_unigram_alpha": 1.0,
|
| 125 |
+
"categorical_wrong_basin_shared_prob": 0.0,
|
| 126 |
+
"categorical_wrong_unigram_shared_prob": 0.5,
|
| 127 |
+
"mask_mixture_original_prob": 0.0,
|
| 128 |
+
"mask_mixture_lowk_prob": 0.0,
|
| 129 |
+
"mask_mixture_lowcorrupt_prob": 0.0,
|
| 130 |
+
"mask_mixture_block_prob": 0.0,
|
| 131 |
+
"mask_mixture_all_prob": 1.0,
|
| 132 |
+
"mask_mixture_lowk_clean_tokens": "0",
|
| 133 |
+
"mask_mixture_lowcorrupt_tokens": "1,2,4,8,16,32,64",
|
| 134 |
+
"mask_mixture_block_tokens": "64,128",
|
| 135 |
+
"simplex_bridge_sampler": "logistic_normal_linear_mean",
|
| 136 |
+
"logistic_normal_sigma_min": 0.03,
|
| 137 |
+
"logistic_normal_sigma_max": 0.4,
|
| 138 |
+
"logistic_normal_tau_min": 1.0,
|
| 139 |
+
"logistic_normal_tau_max": 1.0,
|
| 140 |
+
"torch_compile": false,
|
| 141 |
+
"compile_mode": "max-autotune",
|
| 142 |
+
"state_format": "prob",
|
| 143 |
+
"meanflow_weight": 0.0,
|
| 144 |
+
"rollout_train_prob": 0.0,
|
| 145 |
+
"rollout_train_steps": 1,
|
| 146 |
+
"rollout_train_infer_steps": 64,
|
| 147 |
+
"rollout_train_temp": 1.45,
|
| 148 |
+
"rollout_train_max_gamma": 1.0,
|
| 149 |
+
"rollout_train_corrupt_only": true,
|
| 150 |
+
"rollout_train_samplewise": false,
|
| 151 |
+
"rollout_train_compute_always": false,
|
| 152 |
+
"bridge_noise_init": "logistic_normal",
|
| 153 |
+
"noise_sigma": -1.0,
|
| 154 |
+
"allow_tf32": true,
|
| 155 |
+
"activation_checkpointing": false,
|
| 156 |
+
"activation_checkpoint_interval": 1,
|
| 157 |
+
"activation_checkpoint_scope": "block",
|
| 158 |
+
"ddp_static_graph": false,
|
| 159 |
+
"ddp_gradient_as_bucket_view": true,
|
| 160 |
+
"blocking_data_transfer": false,
|
| 161 |
+
"dataloader_prefetch_factor": 4,
|
| 162 |
+
"full_train_stats": false,
|
| 163 |
+
"tokenized_hf": false,
|
| 164 |
+
"tokenized_pad_token": "pad",
|
| 165 |
+
"elf_conditional_hf": false,
|
| 166 |
+
"record_pad_truncate": false,
|
| 167 |
+
"record_add_eos": false,
|
| 168 |
+
"record_add_special_tokens": false,
|
| 169 |
+
"record_pad_token": "pad",
|
| 170 |
+
"record_shuffle_buffer": 10000,
|
| 171 |
+
"wrap": true,
|
| 172 |
+
"wrap_mode": "stream",
|
| 173 |
+
"wrap_record_buffer_size": 200,
|
| 174 |
+
"owt_cached_chunks": true,
|
| 175 |
+
"owt_chunk_cache_dir": "/e2e-data/evad-tech-vla/wanghan58/data/small_benchmarks/langflow_2604_11748/openwebtext_lta_cached_chunks/gpt2_len256_train8_compact_overfit",
|
| 176 |
+
"owt_chunk_cache_rebuild": false,
|
| 177 |
+
"owt_chunk_cache_write_batch": 4096,
|
| 178 |
+
"owt_exact_repeat_per_chunk": 64,
|
| 179 |
+
"online_chunk_shuffle": false,
|
| 180 |
+
"online_chunk_shuffle_buffer": 10000,
|
| 181 |
+
"openwebtext_split": "train_minus_100k",
|
| 182 |
+
"detokenizer": "auto",
|
| 183 |
+
"resolved_detokenizer": null,
|
| 184 |
+
"num_workers": 0,
|
| 185 |
+
"latest_every": 1000,
|
| 186 |
+
"resume_path": ""
|
| 187 |
+
}
|
| 188 |
+
step=100 epoch=100/1000 epoch_step=1/1 micro_steps=100 elapsed=4.5s lr=2.000000e-03 loss=6.7097 loss_recon=6.7097 loss_meanflow=0.0000 mean_model_t=0.2081 mean_corrupt_t=0.2081 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.1179 corrupt_frac=1.0000 acc_corrupt=0.1179 loss_corrupt=6.7097 wrong_frac=0.8656 init_acc_corrupt=0.1344 acc_corrupt_t_0p0_0p2=0.0569 corrupt_frac_t_0p0_0p2=0.5599 acc_corrupt_t_0p2_0p4=0.1555 corrupt_frac_t_0p2_0p4=0.3564 acc_corrupt_t_0p4_0p6=0.3428 corrupt_frac_t_0p4_0p6=0.0738 acc_corrupt_t_0p6_0p8=0.5361 corrupt_frac_t_0p6_0p8=0.0138 out_w_norm=1.2263 out_g_norm=1.0547 acc_corrupt_t_0p8_1p0=0.8340 corrupt_frac_t_0p8_1p0=0.0104 loss_all=6.4790 init_gold_top10=0.1401 init_gold_top100=0.2195
|
| 189 |
+
step=200 epoch=200/1000 epoch_step=1/1 micro_steps=200 elapsed=3.8s lr=2.000000e-03 loss=6.1757 loss_recon=6.1757 loss_meanflow=0.0000 mean_model_t=0.2069 mean_corrupt_t=0.2069 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.1118 corrupt_frac=1.0000 acc_corrupt=0.1118 loss_corrupt=6.1757 wrong_frac=0.8695 init_acc_corrupt=0.1305 acc_corrupt_t_0p0_0p2=0.0654 corrupt_frac_t_0p0_0p2=0.5616 acc_corrupt_t_0p2_0p4=0.1396 corrupt_frac_t_0p2_0p4=0.3552 acc_corrupt_t_0p4_0p6=0.2837 corrupt_frac_t_0p4_0p6=0.0749 out_w_norm=3.8017 out_g_norm=1.5027 acc_corrupt_t_0p6_0p8=0.5003 corrupt_frac_t_0p6_0p8=0.0126 acc_corrupt_t_0p8_1p0=0.7578 corrupt_frac_t_0p8_1p0=0.0078 loss_all=5.9357 init_gold_top10=0.1209 init_gold_top100=0.2034
|
| 190 |
+
step=300 epoch=300/1000 epoch_step=1/1 micro_steps=300 elapsed=3.8s lr=2.000000e-03 loss=5.6465 loss_recon=5.6465 loss_meanflow=0.0000 mean_model_t=0.2097 mean_corrupt_t=0.2097 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.1395 corrupt_frac=1.0000 acc_corrupt=0.1395 loss_corrupt=5.6465 wrong_frac=0.8662 init_acc_corrupt=0.1338 acc_corrupt_t_0p0_0p2=0.0788 corrupt_frac_t_0p0_0p2=0.5585 acc_corrupt_t_0p2_0p4=0.1795 corrupt_frac_t_0p2_0p4=0.3541 acc_corrupt_t_0p4_0p6=0.3465 corrupt_frac_t_0p4_0p6=0.0783 acc_corrupt_t_0p6_0p8=0.5308 corrupt_frac_t_0p6_0p8=0.0124 out_w_norm=6.0405 out_g_norm=0.6592 acc_corrupt_t_0p8_1p0=0.4082 corrupt_frac_t_0p8_1p0=0.0078 loss_all=5.4633 init_gold_top10=0.1470 init_gold_top100=0.2274
|
| 191 |
+
step=400 epoch=400/1000 epoch_step=1/1 micro_steps=400 elapsed=3.9s lr=2.000000e-03 loss=5.4440 loss_recon=5.4440 loss_meanflow=0.0000 mean_model_t=0.2082 mean_corrupt_t=0.2082 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.1499 corrupt_frac=1.0000 acc_corrupt=0.1499 loss_corrupt=5.4440 wrong_frac=0.8670 init_acc_corrupt=0.1330 acc_corrupt_t_0p0_0p2=0.0880 corrupt_frac_t_0p0_0p2=0.5540 acc_corrupt_t_0p2_0p4=0.1937 corrupt_frac_t_0p2_0p4=0.3638 acc_corrupt_t_0p4_0p6=0.3606 corrupt_frac_t_0p4_0p6=0.0743 acc_corrupt_t_0p6_0p8=0.4963 corrupt_frac_t_0p6_0p8=0.0125 out_w_norm=7.7267 out_g_norm=0.3063 acc_corrupt_t_0p8_1p0=0.7578 corrupt_frac_t_0p8_1p0=0.0078 loss_all=5.2757 init_gold_top10=0.1559 init_gold_top100=0.2377
|
| 192 |
+
step=500 epoch=500/1000 epoch_step=1/1 micro_steps=500 elapsed=3.8s lr=2.000000e-03 loss=5.1483 loss_recon=5.1483 loss_meanflow=0.0000 mean_model_t=0.2103 mean_corrupt_t=0.2103 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.1601 corrupt_frac=1.0000 acc_corrupt=0.1601 loss_corrupt=5.1483 wrong_frac=0.8648 init_acc_corrupt=0.1352 acc_corrupt_t_0p0_0p2=0.0953 corrupt_frac_t_0p0_0p2=0.5508 acc_corrupt_t_0p2_0p4=0.2020 corrupt_frac_t_0p2_0p4=0.3598 acc_corrupt_t_0p4_0p6=0.3747 corrupt_frac_t_0p4_0p6=0.0818 out_w_norm=8.9678 out_g_norm=0.4350 acc_corrupt_t_0p6_0p8=0.5439 corrupt_frac_t_0p6_0p8=0.0136 acc_corrupt_t_0p8_1p0=0.8184 corrupt_frac_t_0p8_1p0=0.0078 loss_all=5.0288 init_gold_top10=0.1219 init_gold_top100=0.2024
|
| 193 |
+
step=600 epoch=600/1000 epoch_step=1/1 micro_steps=600 elapsed=3.8s lr=2.000000e-03 loss=4.6281 loss_recon=4.6281 loss_meanflow=0.0000 mean_model_t=0.2080 mean_corrupt_t=0.2080 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.1673 corrupt_frac=1.0000 acc_corrupt=0.1673 loss_corrupt=4.6281 wrong_frac=0.8662 init_acc_corrupt=0.1338 acc_corrupt_t_0p0_0p2=0.1028 corrupt_frac_t_0p0_0p2=0.5616 acc_corrupt_t_0p2_0p4=0.2130 corrupt_frac_t_0p2_0p4=0.3509 acc_corrupt_t_0p4_0p6=0.3788 corrupt_frac_t_0p4_0p6=0.0791 acc_corrupt_t_0p6_0p8=0.5652 corrupt_frac_t_0p6_0p8=0.0131 out_w_norm=9.3953 out_g_norm=0.4892 acc_corrupt_t_0p8_1p0=0.8623 corrupt_frac_t_0p8_1p0=0.0078 loss_all=4.4779 init_gold_top10=0.1371 init_gold_top100=0.2186
|
| 194 |
+
step=700 epoch=700/1000 epoch_step=1/1 micro_steps=700 elapsed=3.8s lr=2.000000e-03 loss=4.0407 loss_recon=4.0407 loss_meanflow=0.0000 mean_model_t=0.2106 mean_corrupt_t=0.2106 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.1789 corrupt_frac=1.0000 acc_corrupt=0.1789 loss_corrupt=4.0407 wrong_frac=0.8641 init_acc_corrupt=0.1359 acc_corrupt_t_0p0_0p2=0.1138 corrupt_frac_t_0p0_0p2=0.5534 acc_corrupt_t_0p2_0p4=0.2271 corrupt_frac_t_0p2_0p4=0.3573 acc_corrupt_t_0p4_0p6=0.3743 corrupt_frac_t_0p4_0p6=0.0812 out_w_norm=9.6182 out_g_norm=0.5365 acc_corrupt_t_0p6_0p8=0.5301 corrupt_frac_t_0p6_0p8=0.0129 acc_corrupt_t_0p8_1p0=0.8301 corrupt_frac_t_0p8_1p0=0.0078 loss_all=3.7657 init_gold_top10=0.1297 init_gold_top100=0.2126
|
| 195 |
+
step=800 epoch=800/1000 epoch_step=1/1 micro_steps=800 elapsed=3.8s lr=2.000000e-03 loss=3.5950 loss_recon=3.5950 loss_meanflow=0.0000 mean_model_t=0.2079 mean_corrupt_t=0.2079 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.1916 corrupt_frac=1.0000 acc_corrupt=0.1916 loss_corrupt=3.5950 wrong_frac=0.8684 init_acc_corrupt=0.1316 acc_corrupt_t_0p0_0p2=0.1264 corrupt_frac_t_0p0_0p2=0.5564 acc_corrupt_t_0p2_0p4=0.2418 corrupt_frac_t_0p2_0p4=0.3614 acc_corrupt_t_0p4_0p6=0.3942 corrupt_frac_t_0p4_0p6=0.0745 acc_corrupt_t_0p6_0p8=0.5762 corrupt_frac_t_0p6_0p8=0.0126 out_w_norm=9.8701 out_g_norm=0.7488 acc_corrupt_t_0p8_1p0=0.8203 corrupt_frac_t_0p8_1p0=0.0078 loss_all=3.4492 init_gold_top10=0.1366 init_gold_top100=0.2184
|
| 196 |
+
step=900 epoch=900/1000 epoch_step=1/1 micro_steps=900 elapsed=3.8s lr=2.000000e-03 loss=3.0683 loss_recon=3.0683 loss_meanflow=0.0000 mean_model_t=0.2077 mean_corrupt_t=0.2077 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.2626 corrupt_frac=1.0000 acc_corrupt=0.2626 loss_corrupt=3.0683 wrong_frac=0.8669 init_acc_corrupt=0.1331 acc_corrupt_t_0p0_0p2=0.1752 corrupt_frac_t_0p0_0p2=0.5614 acc_corrupt_t_0p2_0p4=0.3404 corrupt_frac_t_0p2_0p4=0.3548 acc_corrupt_t_0p4_0p6=0.5038 corrupt_frac_t_0p4_0p6=0.0757 out_w_norm=10.0897 out_g_norm=1.0269 acc_corrupt_t_0p6_0p8=0.6538 corrupt_frac_t_0p6_0p8=0.0121 acc_corrupt_t_0p8_1p0=0.9082 corrupt_frac_t_0p8_1p0=0.0078 loss_all=2.4332 init_gold_top10=0.1616 init_gold_top100=0.2392
|
| 197 |
+
step=1000 epoch=1000/1000 epoch_step=1/1 micro_steps=1000 elapsed=3.8s lr=2.000000e-03 loss=2.4691 loss_recon=2.4691 loss_meanflow=0.0000 mean_model_t=0.2092 mean_corrupt_t=0.2092 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.4023 corrupt_frac=1.0000 acc_corrupt=0.4023 loss_corrupt=2.4691 wrong_frac=0.8652 init_acc_corrupt=0.1348 acc_corrupt_t_0p0_0p2=0.2939 corrupt_frac_t_0p0_0p2=0.5561 acc_corrupt_t_0p2_0p4=0.5031 corrupt_frac_t_0p2_0p4=0.3610 acc_corrupt_t_0p4_0p6=0.6783 corrupt_frac_t_0p4_0p6=0.0730 out_w_norm=10.3143 out_g_norm=1.0213 acc_corrupt_t_0p6_0p8=0.7848 corrupt_frac_t_0p6_0p8=0.0140 acc_corrupt_t_0p8_1p0=0.4844 corrupt_frac_t_0p8_1p0=0.0078 loss_all=2.1232 init_gold_top10=0.1450 init_gold_top100=0.2272
|
| 198 |
+
NCCL version 2.25.1+cuda12.8
|
| 199 |
+
resumed_from=runs/train8_combo_len256_logistic_unigram_shared_highC_seqrand_20260517_170456/latest.pt start_step=1001
|
| 200 |
+
{
|
| 201 |
+
"device": "cuda:0",
|
| 202 |
+
"rank": 0,
|
| 203 |
+
"world_size": 4,
|
| 204 |
+
"samples": "owt_cached_chunks:8",
|
| 205 |
+
"vocab_size": 969,
|
| 206 |
+
"tokenizer_vocab_size": 50257,
|
| 207 |
+
"save_dir": "runs/train8_combo_len256_logistic_unigram_shared_highC_seqrand_20260517_170456",
|
| 208 |
+
"batch_size": 128,
|
| 209 |
+
"grad_accum": 1,
|
| 210 |
+
"effective_batch_size": 512,
|
| 211 |
+
"global_batch_size": 512,
|
| 212 |
+
"lr_schedule": "constant_warmup",
|
| 213 |
+
"optimizer": "muon",
|
| 214 |
+
"epochs": 0.0,
|
| 215 |
+
"steps_per_epoch": 1,
|
| 216 |
+
"total_steps": 2000,
|
| 217 |
+
"warmup_steps": 10,
|
| 218 |
+
"warmup_epochs": -1.0,
|
| 219 |
+
"min_lr": 0.0,
|
| 220 |
+
"weight_decay": 0.1,
|
| 221 |
+
"output_weight_decay": -1.0,
|
| 222 |
+
"adamw_param_groups": "nanogpt",
|
| 223 |
+
"adam_beta1": 0.9,
|
| 224 |
+
"adam_beta2": 0.95,
|
| 225 |
+
"adam_eps": 1e-08,
|
| 226 |
+
"muon_impl": "legacy",
|
| 227 |
+
"muon_momentum": 0.95,
|
| 228 |
+
"muon_ns_steps": 5,
|
| 229 |
+
"muon_update_scale": 1.0,
|
| 230 |
+
"muon_nesterov": false,
|
| 231 |
+
"muon_width_scale": false,
|
| 232 |
+
"muon_grouping": "legacy_dim_ge_2",
|
| 233 |
+
"muon_param_count": 1965440,
|
| 234 |
+
"muon_adam_param_count": 8192,
|
| 235 |
+
"muon_param_names": [
|
| 236 |
+
"vocab_embed.embedding",
|
| 237 |
+
"sigma_map.net.0.weight",
|
| 238 |
+
"sigma_map.net.2.weight",
|
| 239 |
+
"blocks.0.attn_qkv.weight",
|
| 240 |
+
"blocks.0.attn_out.weight",
|
| 241 |
+
"blocks.0.mlp.0.weight",
|
| 242 |
+
"blocks.0.mlp.2.weight",
|
| 243 |
+
"blocks.0.adaLN_modulation.weight",
|
| 244 |
+
"blocks.1.attn_qkv.weight",
|
| 245 |
+
"blocks.1.attn_out.weight",
|
| 246 |
+
"blocks.1.mlp.0.weight",
|
| 247 |
+
"blocks.1.mlp.2.weight",
|
| 248 |
+
"blocks.1.adaLN_modulation.weight",
|
| 249 |
+
"blocks.2.attn_qkv.weight",
|
| 250 |
+
"blocks.2.attn_out.weight",
|
| 251 |
+
"blocks.2.mlp.0.weight",
|
| 252 |
+
"blocks.2.mlp.2.weight",
|
| 253 |
+
"blocks.2.adaLN_modulation.weight",
|
| 254 |
+
"output_layer.linear.weight",
|
| 255 |
+
"output_layer.adaLN_modulation.weight"
|
| 256 |
+
],
|
| 257 |
+
"muon_adam_param_names": [
|
| 258 |
+
"sigma_map.net.0.bias",
|
| 259 |
+
"sigma_map.net.2.bias",
|
| 260 |
+
"blocks.0.norm1.weight",
|
| 261 |
+
"blocks.0.norm2.weight",
|
| 262 |
+
"blocks.0.mlp.0.bias",
|
| 263 |
+
"blocks.0.mlp.2.bias",
|
| 264 |
+
"blocks.0.adaLN_modulation.bias",
|
| 265 |
+
"blocks.1.norm1.weight",
|
| 266 |
+
"blocks.1.norm2.weight",
|
| 267 |
+
"blocks.1.mlp.0.bias",
|
| 268 |
+
"blocks.1.mlp.2.bias",
|
| 269 |
+
"blocks.1.adaLN_modulation.bias",
|
| 270 |
+
"blocks.2.norm1.weight",
|
| 271 |
+
"blocks.2.norm2.weight",
|
| 272 |
+
"blocks.2.mlp.0.bias",
|
| 273 |
+
"blocks.2.mlp.2.bias",
|
| 274 |
+
"blocks.2.adaLN_modulation.bias",
|
| 275 |
+
"output_layer.norm_final.weight",
|
| 276 |
+
"output_layer.adaLN_modulation.bias"
|
| 277 |
+
],
|
| 278 |
+
"muon_effective_nesterov": false,
|
| 279 |
+
"muon_effective_width_scale": false,
|
| 280 |
+
"muon_effective_weight_decay": 0.1,
|
| 281 |
+
"muon_adam_fallback_nesterov": false,
|
| 282 |
+
"muon_adam_fallback_weight_decay": 0.1,
|
| 283 |
+
"ema_decay": 0.9999,
|
| 284 |
+
"ema_start_step": 0,
|
| 285 |
+
"model_type": "ddit",
|
| 286 |
+
"ddit_mlp_type": "gelu",
|
| 287 |
+
"elf_num_time_tokens": 4,
|
| 288 |
+
"elf_num_model_mode_tokens": 0,
|
| 289 |
+
"qk_norm": true,
|
| 290 |
+
"output_bias": false,
|
| 291 |
+
"output_init_std": -1.0,
|
| 292 |
+
"norm_type": "rmsnorm",
|
| 293 |
+
"target_loss": "hard_ce",
|
| 294 |
+
"linear_soft_target_power": 1.0,
|
| 295 |
+
"linear_soft_target_min_conf": 0.0,
|
| 296 |
+
"linear_soft_target_max_conf": 1.0,
|
| 297 |
+
"t_sampling_mode": "logit_normal",
|
| 298 |
+
"t_sampling_power": 1.0,
|
| 299 |
+
"t_sampling_eps": 0.0001,
|
| 300 |
+
"t_sampling_logit_mean": -1.5,
|
| 301 |
+
"t_sampling_logit_std": 0.8,
|
| 302 |
+
"dual_t": true,
|
| 303 |
+
"corrupt_t_mode": "same",
|
| 304 |
+
"corrupt_min_t": 0.0,
|
| 305 |
+
"corrupt_max_t": 1.0,
|
| 306 |
+
"prefix_block_prob": 0.0,
|
| 307 |
+
"prefix_block_len": 128,
|
| 308 |
+
"mask_ratio_floor_schedule": "none",
|
| 309 |
+
"dirichlet_endpoint_mode": "categorical_dual_t",
|
| 310 |
+
"dirichlet_semantic_t_mode": "same",
|
| 311 |
+
"dirichlet_semantic_t_value": 0.0,
|
| 312 |
+
"dirichlet_semantic_t_curve": "linear",
|
| 313 |
+
"dirichlet_semantic_t_power": 1.0,
|
| 314 |
+
"endpoint_sequence_random_prob_alpha": 0.5,
|
| 315 |
+
"categorical_wrong_from_full_vocab": true,
|
| 316 |
+
"categorical_wrong_from_batch_valid_tokens": false,
|
| 317 |
+
"categorical_wrong_basin_token_ids": "",
|
| 318 |
+
"categorical_wrong_basin_prob": 0.0,
|
| 319 |
+
"categorical_wrong_unigram_prob": 1.0,
|
| 320 |
+
"categorical_wrong_uniform_prob": 0.0,
|
| 321 |
+
"categorical_wrong_corpus_unigram_path": "",
|
| 322 |
+
"categorical_wrong_corpus_unigram_alpha": 1.0,
|
| 323 |
+
"categorical_wrong_basin_shared_prob": 0.0,
|
| 324 |
+
"categorical_wrong_unigram_shared_prob": 0.5,
|
| 325 |
+
"mask_mixture_original_prob": 0.0,
|
| 326 |
+
"mask_mixture_lowk_prob": 0.0,
|
| 327 |
+
"mask_mixture_lowcorrupt_prob": 0.0,
|
| 328 |
+
"mask_mixture_block_prob": 0.0,
|
| 329 |
+
"mask_mixture_all_prob": 1.0,
|
| 330 |
+
"mask_mixture_lowk_clean_tokens": "0",
|
| 331 |
+
"mask_mixture_lowcorrupt_tokens": "1,2,4,8,16,32,64",
|
| 332 |
+
"mask_mixture_block_tokens": "64,128",
|
| 333 |
+
"simplex_bridge_sampler": "logistic_normal_linear_mean",
|
| 334 |
+
"logistic_normal_sigma_min": 0.03,
|
| 335 |
+
"logistic_normal_sigma_max": 0.4,
|
| 336 |
+
"logistic_normal_tau_min": 1.0,
|
| 337 |
+
"logistic_normal_tau_max": 1.0,
|
| 338 |
+
"torch_compile": false,
|
| 339 |
+
"compile_mode": "max-autotune",
|
| 340 |
+
"state_format": "prob",
|
| 341 |
+
"meanflow_weight": 0.0,
|
| 342 |
+
"rollout_train_prob": 0.0,
|
| 343 |
+
"rollout_train_steps": 1,
|
| 344 |
+
"rollout_train_infer_steps": 64,
|
| 345 |
+
"rollout_train_temp": 1.45,
|
| 346 |
+
"rollout_train_max_gamma": 1.0,
|
| 347 |
+
"rollout_train_corrupt_only": true,
|
| 348 |
+
"rollout_train_samplewise": false,
|
| 349 |
+
"rollout_train_compute_always": false,
|
| 350 |
+
"bridge_noise_init": "logistic_normal",
|
| 351 |
+
"noise_sigma": -1.0,
|
| 352 |
+
"allow_tf32": true,
|
| 353 |
+
"activation_checkpointing": false,
|
| 354 |
+
"activation_checkpoint_interval": 1,
|
| 355 |
+
"activation_checkpoint_scope": "block",
|
| 356 |
+
"ddp_static_graph": false,
|
| 357 |
+
"ddp_gradient_as_bucket_view": true,
|
| 358 |
+
"blocking_data_transfer": false,
|
| 359 |
+
"dataloader_prefetch_factor": 4,
|
| 360 |
+
"full_train_stats": false,
|
| 361 |
+
"tokenized_hf": false,
|
| 362 |
+
"tokenized_pad_token": "pad",
|
| 363 |
+
"elf_conditional_hf": false,
|
| 364 |
+
"record_pad_truncate": false,
|
| 365 |
+
"record_add_eos": false,
|
| 366 |
+
"record_add_special_tokens": false,
|
| 367 |
+
"record_pad_token": "pad",
|
| 368 |
+
"record_shuffle_buffer": 10000,
|
| 369 |
+
"wrap": true,
|
| 370 |
+
"wrap_mode": "stream",
|
| 371 |
+
"wrap_record_buffer_size": 200,
|
| 372 |
+
"owt_cached_chunks": true,
|
| 373 |
+
"owt_chunk_cache_dir": "/e2e-data/evad-tech-vla/wanghan58/data/small_benchmarks/langflow_2604_11748/openwebtext_lta_cached_chunks/gpt2_len256_train8_compact_overfit",
|
| 374 |
+
"owt_chunk_cache_rebuild": false,
|
| 375 |
+
"owt_chunk_cache_write_batch": 4096,
|
| 376 |
+
"owt_exact_repeat_per_chunk": 64,
|
| 377 |
+
"online_chunk_shuffle": false,
|
| 378 |
+
"online_chunk_shuffle_buffer": 10000,
|
| 379 |
+
"openwebtext_split": "train_minus_100k",
|
| 380 |
+
"detokenizer": "auto",
|
| 381 |
+
"resolved_detokenizer": null,
|
| 382 |
+
"num_workers": 0,
|
| 383 |
+
"latest_every": 1000,
|
| 384 |
+
"resume_path": "runs/train8_combo_len256_logistic_unigram_shared_highC_seqrand_20260517_170456/latest.pt"
|
| 385 |
+
}
|
| 386 |
+
step=1100 epoch=1100/2000 epoch_step=1/1 micro_steps=1100 elapsed=4.7s lr=2.000000e-03 loss=2.0475 loss_recon=2.0475 loss_meanflow=0.0000 mean_model_t=0.2081 mean_corrupt_t=0.2081 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.5133 corrupt_frac=1.0000 acc_corrupt=0.5133 loss_corrupt=2.0475 wrong_frac=0.8656 init_acc_corrupt=0.1344 acc_corrupt_t_0p0_0p2=0.4118 corrupt_frac_t_0p0_0p2=0.5599 acc_corrupt_t_0p2_0p4=0.6173 corrupt_frac_t_0p2_0p4=0.3564 acc_corrupt_t_0p4_0p6=0.7417 corrupt_frac_t_0p4_0p6=0.0738 acc_corrupt_t_0p6_0p8=0.8054 corrupt_frac_t_0p6_0p8=0.0138 out_w_norm=10.5718 out_g_norm=1.1653 acc_corrupt_t_0p8_1p0=0.9883 corrupt_frac_t_0p8_1p0=0.0104 loss_all=1.8965 init_gold_top10=0.1401 init_gold_top100=0.2195
|
| 387 |
+
step=1200 epoch=1200/2000 epoch_step=1/1 micro_steps=1200 elapsed=4.0s lr=2.000000e-03 loss=1.8251 loss_recon=1.8251 loss_meanflow=0.0000 mean_model_t=0.2069 mean_corrupt_t=0.2069 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.5569 corrupt_frac=1.0000 acc_corrupt=0.5569 loss_corrupt=1.8251 wrong_frac=0.8695 init_acc_corrupt=0.1305 acc_corrupt_t_0p0_0p2=0.4760 corrupt_frac_t_0p0_0p2=0.5616 acc_corrupt_t_0p2_0p4=0.6412 corrupt_frac_t_0p2_0p4=0.3552 acc_corrupt_t_0p4_0p6=0.7336 corrupt_frac_t_0p4_0p6=0.0749 out_w_norm=10.7986 out_g_norm=1.1972 acc_corrupt_t_0p6_0p8=0.8191 corrupt_frac_t_0p6_0p8=0.0126 acc_corrupt_t_0p8_1p0=0.9961 corrupt_frac_t_0p8_1p0=0.0078 loss_all=1.8619 init_gold_top10=0.1209 init_gold_top100=0.2034
|
| 388 |
+
step=1300 epoch=1300/2000 epoch_step=1/1 micro_steps=1300 elapsed=4.0s lr=2.000000e-03 loss=1.6495 loss_recon=1.6495 loss_meanflow=0.0000 mean_model_t=0.2097 mean_corrupt_t=0.2097 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.5851 corrupt_frac=1.0000 acc_corrupt=0.5851 loss_corrupt=1.6495 wrong_frac=0.8662 init_acc_corrupt=0.1338 acc_corrupt_t_0p0_0p2=0.5137 corrupt_frac_t_0p0_0p2=0.5585 acc_corrupt_t_0p2_0p4=0.6553 corrupt_frac_t_0p2_0p4=0.3541 acc_corrupt_t_0p4_0p6=0.7488 corrupt_frac_t_0p4_0p6=0.0783 acc_corrupt_t_0p6_0p8=0.8325 corrupt_frac_t_0p6_0p8=0.0124 out_w_norm=10.9370 out_g_norm=1.2443 acc_corrupt_t_0p8_1p0=0.5098 corrupt_frac_t_0p8_1p0=0.0078 loss_all=1.5011 init_gold_top10=0.1470 init_gold_top100=0.2274
|
| 389 |
+
step=1400 epoch=1400/2000 epoch_step=1/1 micro_steps=1400 elapsed=4.0s lr=2.000000e-03 loss=1.4723 loss_recon=1.4723 loss_meanflow=0.0000 mean_model_t=0.2082 mean_corrupt_t=0.2082 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.6098 corrupt_frac=1.0000 acc_corrupt=0.6098 loss_corrupt=1.4723 wrong_frac=0.8670 init_acc_corrupt=0.1330 acc_corrupt_t_0p0_0p2=0.5484 corrupt_frac_t_0p0_0p2=0.5540 acc_corrupt_t_0p2_0p4=0.6697 corrupt_frac_t_0p2_0p4=0.3638 acc_corrupt_t_0p4_0p6=0.7569 corrupt_frac_t_0p4_0p6=0.0743 acc_corrupt_t_0p6_0p8=0.7629 corrupt_frac_t_0p6_0p8=0.0125 out_w_norm=11.0332 out_g_norm=1.2546 acc_corrupt_t_0p8_1p0=0.9980 corrupt_frac_t_0p8_1p0=0.0078 loss_all=1.0527 init_gold_top10=0.1559 init_gold_top100=0.2377
|
| 390 |
+
step=1500 epoch=1500/2000 epoch_step=1/1 micro_steps=1500 elapsed=4.0s lr=2.000000e-03 loss=1.3097 loss_recon=1.3097 loss_meanflow=0.0000 mean_model_t=0.2103 mean_corrupt_t=0.2103 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.6239 corrupt_frac=1.0000 acc_corrupt=0.6239 loss_corrupt=1.3097 wrong_frac=0.8648 init_acc_corrupt=0.1352 acc_corrupt_t_0p0_0p2=0.5615 corrupt_frac_t_0p0_0p2=0.5508 acc_corrupt_t_0p2_0p4=0.6788 corrupt_frac_t_0p2_0p4=0.3598 acc_corrupt_t_0p4_0p6=0.7833 corrupt_frac_t_0p4_0p6=0.0818 out_w_norm=11.1027 out_g_norm=1.2651 acc_corrupt_t_0p6_0p8=0.8208 corrupt_frac_t_0p6_0p8=0.0136 acc_corrupt_t_0p8_1p0=0.9941 corrupt_frac_t_0p8_1p0=0.0078 loss_all=1.4609 init_gold_top10=0.1219 init_gold_top100=0.2024
|
| 391 |
+
step=1600 epoch=1600/2000 epoch_step=1/1 micro_steps=1600 elapsed=4.0s lr=2.000000e-03 loss=1.1768 loss_recon=1.1768 loss_meanflow=0.0000 mean_model_t=0.2080 mean_corrupt_t=0.2080 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.6383 corrupt_frac=1.0000 acc_corrupt=0.6383 loss_corrupt=1.1768 wrong_frac=0.8662 init_acc_corrupt=0.1338 acc_corrupt_t_0p0_0p2=0.5824 corrupt_frac_t_0p0_0p2=0.5616 acc_corrupt_t_0p2_0p4=0.6902 corrupt_frac_t_0p2_0p4=0.3509 acc_corrupt_t_0p4_0p6=0.7812 corrupt_frac_t_0p4_0p6=0.0791 acc_corrupt_t_0p6_0p8=0.8484 corrupt_frac_t_0p6_0p8=0.0131 out_w_norm=11.1630 out_g_norm=1.2914 acc_corrupt_t_0p8_1p0=0.9971 corrupt_frac_t_0p8_1p0=0.0078 loss_all=1.3553 init_gold_top10=0.1371 init_gold_top100=0.2186
|
| 392 |
+
step=1700 epoch=1700/2000 epoch_step=1/1 micro_steps=1700 elapsed=4.0s lr=2.000000e-03 loss=1.0551 loss_recon=1.0551 loss_meanflow=0.0000 mean_model_t=0.2106 mean_corrupt_t=0.2106 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.6547 corrupt_frac=1.0000 acc_corrupt=0.6547 loss_corrupt=1.0551 wrong_frac=0.8641 init_acc_corrupt=0.1359 acc_corrupt_t_0p0_0p2=0.6084 corrupt_frac_t_0p0_0p2=0.5534 acc_corrupt_t_0p2_0p4=0.6984 corrupt_frac_t_0p2_0p4=0.3573 acc_corrupt_t_0p4_0p6=0.7629 corrupt_frac_t_0p4_0p6=0.0812 out_w_norm=11.2082 out_g_norm=1.2603 acc_corrupt_t_0p6_0p8=0.7996 corrupt_frac_t_0p6_0p8=0.0129 acc_corrupt_t_0p8_1p0=0.9980 corrupt_frac_t_0p8_1p0=0.0078 loss_all=1.0073 init_gold_top10=0.1297 init_gold_top100=0.2126
|
| 393 |
+
step=1800 epoch=1800/2000 epoch_step=1/1 micro_steps=1800 elapsed=4.0s lr=2.000000e-03 loss=0.9843 loss_recon=0.9843 loss_meanflow=0.0000 mean_model_t=0.2079 mean_corrupt_t=0.2079 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.6577 corrupt_frac=1.0000 acc_corrupt=0.6577 loss_corrupt=0.9843 wrong_frac=0.8684 init_acc_corrupt=0.1316 acc_corrupt_t_0p0_0p2=0.6154 corrupt_frac_t_0p0_0p2=0.5564 acc_corrupt_t_0p2_0p4=0.6959 corrupt_frac_t_0p2_0p4=0.3614 acc_corrupt_t_0p4_0p6=0.7686 corrupt_frac_t_0p4_0p6=0.0745 acc_corrupt_t_0p6_0p8=0.8488 corrupt_frac_t_0p6_0p8=0.0126 out_w_norm=11.2253 out_g_norm=1.1781 acc_corrupt_t_0p8_1p0=1.0000 corrupt_frac_t_0p8_1p0=0.0078 loss_all=1.0908 init_gold_top10=0.1366 init_gold_top100=0.2184
|
| 394 |
+
step=1900 epoch=1900/2000 epoch_step=1/1 micro_steps=1900 elapsed=4.0s lr=2.000000e-03 loss=0.8950 loss_recon=0.8950 loss_meanflow=0.0000 mean_model_t=0.2077 mean_corrupt_t=0.2077 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.6766 corrupt_frac=1.0000 acc_corrupt=0.6766 loss_corrupt=0.8950 wrong_frac=0.8669 init_acc_corrupt=0.1331 acc_corrupt_t_0p0_0p2=0.6318 corrupt_frac_t_0p0_0p2=0.5614 acc_corrupt_t_0p2_0p4=0.7194 corrupt_frac_t_0p2_0p4=0.3548 acc_corrupt_t_0p4_0p6=0.7877 corrupt_frac_t_0p4_0p6=0.0757 out_w_norm=11.2363 out_g_norm=1.1321 acc_corrupt_t_0p6_0p8=0.8710 corrupt_frac_t_0p6_0p8=0.0121 acc_corrupt_t_0p8_1p0=1.0000 corrupt_frac_t_0p8_1p0=0.0078 loss_all=0.6929 init_gold_top10=0.1616 init_gold_top100=0.2392
|
| 395 |
+
step=2000 epoch=2000/2000 epoch_step=1/1 micro_steps=2000 elapsed=4.0s lr=2.000000e-03 loss=0.8544 loss_recon=0.8544 loss_meanflow=0.0000 mean_model_t=0.2092 mean_corrupt_t=0.2092 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.6830 corrupt_frac=1.0000 acc_corrupt=0.6830 loss_corrupt=0.8544 wrong_frac=0.8652 init_acc_corrupt=0.1348 acc_corrupt_t_0p0_0p2=0.6393 corrupt_frac_t_0p0_0p2=0.5561 acc_corrupt_t_0p2_0p4=0.7180 corrupt_frac_t_0p2_0p4=0.3610 acc_corrupt_t_0p4_0p6=0.8173 corrupt_frac_t_0p4_0p6=0.0730 out_w_norm=11.2470 out_g_norm=1.0160 acc_corrupt_t_0p6_0p8=0.8743 corrupt_frac_t_0p6_0p8=0.0140 acc_corrupt_t_0p8_1p0=0.5039 corrupt_frac_t_0p8_1p0=0.0078 loss_all=0.7242 init_gold_top10=0.1450 init_gold_top100=0.2272
|
LTA_openwebtext_dualt/logs/softendpoint_mn_pilot_4gpu/train8_ctx1024_p50_path4_unif0_0p125_outwdm1_ctx1024_sampledpath_20260517_223933.log
ADDED
|
@@ -0,0 +1,229 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
NCCL version 2.25.1+cuda12.8
|
| 2 |
+
{
|
| 3 |
+
"device": "cuda:0",
|
| 4 |
+
"rank": 0,
|
| 5 |
+
"world_size": 4,
|
| 6 |
+
"samples": "owt_cached_chunks:8",
|
| 7 |
+
"vocab_size": 2664,
|
| 8 |
+
"tokenizer_vocab_size": 50257,
|
| 9 |
+
"save_dir": "runs/train8_ctx1024_p50_path4_unif0_0p125_outwdm1_ctx1024_sampledpath_20260517_223933",
|
| 10 |
+
"batch_size": 128,
|
| 11 |
+
"grad_accum": 1,
|
| 12 |
+
"effective_batch_size": 512,
|
| 13 |
+
"global_batch_size": 512,
|
| 14 |
+
"lr_schedule": "constant_warmup",
|
| 15 |
+
"optimizer": "muon",
|
| 16 |
+
"epochs": 0.0,
|
| 17 |
+
"steps_per_epoch": 1,
|
| 18 |
+
"total_steps": 1000,
|
| 19 |
+
"warmup_steps": 10,
|
| 20 |
+
"warmup_epochs": -1.0,
|
| 21 |
+
"min_lr": 0.0,
|
| 22 |
+
"weight_decay": 0.1,
|
| 23 |
+
"output_weight_decay": -1.0,
|
| 24 |
+
"adamw_param_groups": "nanogpt",
|
| 25 |
+
"adam_beta1": 0.9,
|
| 26 |
+
"adam_beta2": 0.95,
|
| 27 |
+
"adam_eps": 1e-08,
|
| 28 |
+
"muon_impl": "legacy",
|
| 29 |
+
"muon_momentum": 0.95,
|
| 30 |
+
"muon_ns_steps": 5,
|
| 31 |
+
"muon_update_scale": 1.0,
|
| 32 |
+
"muon_nesterov": false,
|
| 33 |
+
"muon_width_scale": false,
|
| 34 |
+
"muon_grouping": "legacy_dim_ge_2",
|
| 35 |
+
"muon_param_count": 2616320,
|
| 36 |
+
"muon_adam_param_count": 8192,
|
| 37 |
+
"muon_param_names": [
|
| 38 |
+
"vocab_embed.embedding",
|
| 39 |
+
"sigma_map.net.0.weight",
|
| 40 |
+
"sigma_map.net.2.weight",
|
| 41 |
+
"blocks.0.attn_qkv.weight",
|
| 42 |
+
"blocks.0.attn_out.weight",
|
| 43 |
+
"blocks.0.mlp.0.weight",
|
| 44 |
+
"blocks.0.mlp.2.weight",
|
| 45 |
+
"blocks.0.adaLN_modulation.weight",
|
| 46 |
+
"blocks.1.attn_qkv.weight",
|
| 47 |
+
"blocks.1.attn_out.weight",
|
| 48 |
+
"blocks.1.mlp.0.weight",
|
| 49 |
+
"blocks.1.mlp.2.weight",
|
| 50 |
+
"blocks.1.adaLN_modulation.weight",
|
| 51 |
+
"blocks.2.attn_qkv.weight",
|
| 52 |
+
"blocks.2.attn_out.weight",
|
| 53 |
+
"blocks.2.mlp.0.weight",
|
| 54 |
+
"blocks.2.mlp.2.weight",
|
| 55 |
+
"blocks.2.adaLN_modulation.weight",
|
| 56 |
+
"output_layer.linear.weight",
|
| 57 |
+
"output_layer.adaLN_modulation.weight"
|
| 58 |
+
],
|
| 59 |
+
"muon_adam_param_names": [
|
| 60 |
+
"sigma_map.net.0.bias",
|
| 61 |
+
"sigma_map.net.2.bias",
|
| 62 |
+
"blocks.0.norm1.weight",
|
| 63 |
+
"blocks.0.norm2.weight",
|
| 64 |
+
"blocks.0.mlp.0.bias",
|
| 65 |
+
"blocks.0.mlp.2.bias",
|
| 66 |
+
"blocks.0.adaLN_modulation.bias",
|
| 67 |
+
"blocks.1.norm1.weight",
|
| 68 |
+
"blocks.1.norm2.weight",
|
| 69 |
+
"blocks.1.mlp.0.bias",
|
| 70 |
+
"blocks.1.mlp.2.bias",
|
| 71 |
+
"blocks.1.adaLN_modulation.bias",
|
| 72 |
+
"blocks.2.norm1.weight",
|
| 73 |
+
"blocks.2.norm2.weight",
|
| 74 |
+
"blocks.2.mlp.0.bias",
|
| 75 |
+
"blocks.2.mlp.2.bias",
|
| 76 |
+
"blocks.2.adaLN_modulation.bias",
|
| 77 |
+
"output_layer.norm_final.weight",
|
| 78 |
+
"output_layer.adaLN_modulation.bias"
|
| 79 |
+
],
|
| 80 |
+
"muon_effective_nesterov": false,
|
| 81 |
+
"muon_effective_width_scale": false,
|
| 82 |
+
"muon_effective_weight_decay": 0.1,
|
| 83 |
+
"muon_adam_fallback_nesterov": false,
|
| 84 |
+
"muon_adam_fallback_weight_decay": 0.1,
|
| 85 |
+
"ema_decay": 0.9999,
|
| 86 |
+
"ema_start_step": 0,
|
| 87 |
+
"model_type": "ddit",
|
| 88 |
+
"ddit_mlp_type": "gelu",
|
| 89 |
+
"elf_num_time_tokens": 4,
|
| 90 |
+
"elf_num_model_mode_tokens": 0,
|
| 91 |
+
"qk_norm": true,
|
| 92 |
+
"output_bias": false,
|
| 93 |
+
"output_init_std": -1.0,
|
| 94 |
+
"norm_type": "rmsnorm",
|
| 95 |
+
"target_loss": "hard_ce",
|
| 96 |
+
"linear_soft_target_power": 1.0,
|
| 97 |
+
"linear_soft_target_min_conf": 0.0,
|
| 98 |
+
"linear_soft_target_max_conf": 1.0,
|
| 99 |
+
"t_sampling_mode": "logit_normal",
|
| 100 |
+
"t_sampling_power": 1.0,
|
| 101 |
+
"t_sampling_eps": 0.0001,
|
| 102 |
+
"t_sampling_logit_mean": -1.5,
|
| 103 |
+
"t_sampling_logit_std": 0.8,
|
| 104 |
+
"dual_t": true,
|
| 105 |
+
"corrupt_t_mode": "same",
|
| 106 |
+
"corrupt_min_t": 0.0,
|
| 107 |
+
"corrupt_max_t": 1.0,
|
| 108 |
+
"prefix_block_prob": 0.0,
|
| 109 |
+
"prefix_block_len": 128,
|
| 110 |
+
"mask_ratio_floor_schedule": "none",
|
| 111 |
+
"dirichlet_endpoint_mode": "categorical_dual_t",
|
| 112 |
+
"dirichlet_semantic_t_mode": "same",
|
| 113 |
+
"dirichlet_semantic_t_value": 0.0,
|
| 114 |
+
"dirichlet_semantic_t_curve": "linear",
|
| 115 |
+
"dirichlet_semantic_t_power": 1.0,
|
| 116 |
+
"endpoint_sequence_random_prob_alpha": 0.0,
|
| 117 |
+
"categorical_wrong_from_full_vocab": true,
|
| 118 |
+
"categorical_wrong_from_batch_valid_tokens": false,
|
| 119 |
+
"categorical_wrong_basin_token_ids": "",
|
| 120 |
+
"categorical_wrong_basin_prob": 0.0,
|
| 121 |
+
"categorical_wrong_unigram_prob": 0.0,
|
| 122 |
+
"categorical_wrong_uniform_prob": 0.0,
|
| 123 |
+
"categorical_wrong_prob_floor": 0.0,
|
| 124 |
+
"categorical_wrong_corpus_unigram_path": "",
|
| 125 |
+
"categorical_wrong_corpus_unigram_alpha": 1.0,
|
| 126 |
+
"categorical_wrong_basin_shared_prob": 0.0,
|
| 127 |
+
"categorical_wrong_unigram_shared_prob": 0.0,
|
| 128 |
+
"mask_mixture_original_prob": 0.0,
|
| 129 |
+
"mask_mixture_lowk_prob": 0.0,
|
| 130 |
+
"mask_mixture_lowcorrupt_prob": 0.0,
|
| 131 |
+
"mask_mixture_block_prob": 0.0,
|
| 132 |
+
"mask_mixture_all_prob": 1.0,
|
| 133 |
+
"mask_mixture_lowk_clean_tokens": "0",
|
| 134 |
+
"mask_mixture_lowcorrupt_tokens": "1,2,4,8,16,32,64",
|
| 135 |
+
"mask_mixture_block_tokens": "64,128",
|
| 136 |
+
"simplex_bridge_sampler": "dirichlet",
|
| 137 |
+
"logistic_normal_sigma_min": 0.1,
|
| 138 |
+
"logistic_normal_sigma_max": 1.0,
|
| 139 |
+
"logistic_normal_tau_min": 1.0,
|
| 140 |
+
"logistic_normal_tau_max": 1.0,
|
| 141 |
+
"torch_compile": false,
|
| 142 |
+
"compile_mode": "max-autotune",
|
| 143 |
+
"state_format": "prob",
|
| 144 |
+
"meanflow_weight": 0.0,
|
| 145 |
+
"rollout_train_prob": 0.5,
|
| 146 |
+
"rollout_train_steps": 1,
|
| 147 |
+
"rollout_train_infer_steps": 1,
|
| 148 |
+
"rollout_train_time_mode": "sampled_s",
|
| 149 |
+
"rollout_train_s_dist": "uniform",
|
| 150 |
+
"rollout_train_s_min_frac": 0.0,
|
| 151 |
+
"rollout_train_s_max_frac": 0.125,
|
| 152 |
+
"rollout_train_s_beta_alpha": 2.0,
|
| 153 |
+
"rollout_train_s_beta_beta": 6.0,
|
| 154 |
+
"rollout_train_temp": 1.45,
|
| 155 |
+
"rollout_train_max_gamma": 1.0,
|
| 156 |
+
"rollout_train_corrupt_only": true,
|
| 157 |
+
"rollout_train_samplewise": true,
|
| 158 |
+
"rollout_train_compute_always": false,
|
| 159 |
+
"rollout_train_sync_t": true,
|
| 160 |
+
"bridge_noise_init": "logistic_normal",
|
| 161 |
+
"noise_sigma": -1.0,
|
| 162 |
+
"allow_tf32": true,
|
| 163 |
+
"activation_checkpointing": false,
|
| 164 |
+
"activation_checkpoint_interval": 1,
|
| 165 |
+
"activation_checkpoint_scope": "block",
|
| 166 |
+
"ddp_static_graph": false,
|
| 167 |
+
"ddp_gradient_as_bucket_view": true,
|
| 168 |
+
"blocking_data_transfer": false,
|
| 169 |
+
"dataloader_prefetch_factor": 4,
|
| 170 |
+
"full_train_stats": false,
|
| 171 |
+
"tokenized_hf": false,
|
| 172 |
+
"tokenized_pad_token": "pad",
|
| 173 |
+
"elf_conditional_hf": false,
|
| 174 |
+
"record_pad_truncate": false,
|
| 175 |
+
"record_add_eos": false,
|
| 176 |
+
"record_add_special_tokens": false,
|
| 177 |
+
"record_pad_token": "pad",
|
| 178 |
+
"record_shuffle_buffer": 10000,
|
| 179 |
+
"wrap": true,
|
| 180 |
+
"wrap_mode": "stream",
|
| 181 |
+
"wrap_record_buffer_size": 200,
|
| 182 |
+
"owt_cached_chunks": true,
|
| 183 |
+
"owt_chunk_cache_dir": "/e2e-data/evad-tech-vla/wanghan58/data/small_benchmarks/langflow_2604_11748/openwebtext_lta_cached_chunks/gpt2_len1024_train8_compact_overfit",
|
| 184 |
+
"owt_chunk_cache_rebuild": false,
|
| 185 |
+
"owt_chunk_cache_write_batch": 4096,
|
| 186 |
+
"owt_exact_repeat_per_chunk": 64,
|
| 187 |
+
"online_chunk_shuffle": false,
|
| 188 |
+
"online_chunk_shuffle_buffer": 10000,
|
| 189 |
+
"openwebtext_split": "train_minus_100k",
|
| 190 |
+
"detokenizer": "auto",
|
| 191 |
+
"resolved_detokenizer": null,
|
| 192 |
+
"num_workers": 0,
|
| 193 |
+
"latest_every": 1000,
|
| 194 |
+
"resume_path": ""
|
| 195 |
+
}
|
| 196 |
+
W0517 22:40:01.897000 386925 torch/distributed/elastic/agent/server/api.py:719] Received 15 death signal, shutting down workers
|
| 197 |
+
W0517 22:40:01.899000 386925 torch/distributed/elastic/multiprocessing/api.py:898] Sending process 386929 closing signal SIGTERM
|
| 198 |
+
W0517 22:40:01.900000 386925 torch/distributed/elastic/multiprocessing/api.py:898] Sending process 386930 closing signal SIGTERM
|
| 199 |
+
W0517 22:40:01.900000 386925 torch/distributed/elastic/multiprocessing/api.py:898] Sending process 386931 closing signal SIGTERM
|
| 200 |
+
W0517 22:40:01.901000 386925 torch/distributed/elastic/multiprocessing/api.py:898] Sending process 386932 closing signal SIGTERM
|
| 201 |
+
Traceback (most recent call last):
|
| 202 |
+
File "<frozen runpy>", line 198, in _run_module_as_main
|
| 203 |
+
File "<frozen runpy>", line 88, in _run_code
|
| 204 |
+
File "/usr/local/lib/python3.12/dist-packages/torch/distributed/run.py", line 922, in <module>
|
| 205 |
+
main()
|
| 206 |
+
File "/usr/local/lib/python3.12/dist-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 355, in wrapper
|
| 207 |
+
return f(*args, **kwargs)
|
| 208 |
+
^^^^^^^^^^^^^^^^^^
|
| 209 |
+
File "/usr/local/lib/python3.12/dist-packages/torch/distributed/run.py", line 918, in main
|
| 210 |
+
run(args)
|
| 211 |
+
File "/usr/local/lib/python3.12/dist-packages/torch/distributed/run.py", line 909, in run
|
| 212 |
+
elastic_launch(
|
| 213 |
+
File "/usr/local/lib/python3.12/dist-packages/torch/distributed/launcher/api.py", line 139, in __call__
|
| 214 |
+
return launch_agent(self._config, self._entrypoint, list(args))
|
| 215 |
+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 216 |
+
File "/usr/local/lib/python3.12/dist-packages/torch/distributed/launcher/api.py", line 261, in launch_agent
|
| 217 |
+
result = agent.run()
|
| 218 |
+
^^^^^^^^^^^
|
| 219 |
+
File "/usr/local/lib/python3.12/dist-packages/torch/distributed/elastic/metrics/api.py", line 137, in wrapper
|
| 220 |
+
result = f(*args, **kwargs)
|
| 221 |
+
^^^^^^^^^^^^^^^^^^
|
| 222 |
+
File "/usr/local/lib/python3.12/dist-packages/torch/distributed/elastic/agent/server/api.py", line 711, in run
|
| 223 |
+
result = self._invoke_run(role)
|
| 224 |
+
^^^^^^^^^^^^^^^^^^^^^^
|
| 225 |
+
File "/usr/local/lib/python3.12/dist-packages/torch/distributed/elastic/agent/server/api.py", line 870, in _invoke_run
|
| 226 |
+
time.sleep(monitor_interval)
|
| 227 |
+
File "/usr/local/lib/python3.12/dist-packages/torch/distributed/elastic/multiprocessing/api.py", line 84, in _terminate_process_handler
|
| 228 |
+
raise SignalException(f"Process {os.getpid()} got signal: {sigval}", sigval=sigval)
|
| 229 |
+
torch.distributed.elastic.multiprocessing.api.SignalException: Process 386925 got signal: 15
|
LTA_openwebtext_dualt/logs/softendpoint_mn_pilot_4gpu/train8_ctx1024_p50_path4_unif0_0p125_outwdm1_ctx1024_sampledpath_true_20260517_224139.log
ADDED
|
@@ -0,0 +1,609 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
NCCL version 2.25.1+cuda12.8
|
| 2 |
+
{
|
| 3 |
+
"device": "cuda:0",
|
| 4 |
+
"rank": 0,
|
| 5 |
+
"world_size": 4,
|
| 6 |
+
"samples": "owt_cached_chunks:8",
|
| 7 |
+
"vocab_size": 2664,
|
| 8 |
+
"tokenizer_vocab_size": 50257,
|
| 9 |
+
"save_dir": "runs/train8_ctx1024_p50_path4_unif0_0p125_outwdm1_ctx1024_sampledpath_true_20260517_224139",
|
| 10 |
+
"batch_size": 128,
|
| 11 |
+
"grad_accum": 1,
|
| 12 |
+
"effective_batch_size": 512,
|
| 13 |
+
"global_batch_size": 512,
|
| 14 |
+
"lr_schedule": "constant_warmup",
|
| 15 |
+
"optimizer": "muon",
|
| 16 |
+
"epochs": 0.0,
|
| 17 |
+
"steps_per_epoch": 1,
|
| 18 |
+
"total_steps": 1000,
|
| 19 |
+
"warmup_steps": 10,
|
| 20 |
+
"warmup_epochs": -1.0,
|
| 21 |
+
"min_lr": 0.0,
|
| 22 |
+
"weight_decay": 0.1,
|
| 23 |
+
"output_weight_decay": -1.0,
|
| 24 |
+
"adamw_param_groups": "nanogpt",
|
| 25 |
+
"adam_beta1": 0.9,
|
| 26 |
+
"adam_beta2": 0.95,
|
| 27 |
+
"adam_eps": 1e-08,
|
| 28 |
+
"muon_impl": "legacy",
|
| 29 |
+
"muon_momentum": 0.95,
|
| 30 |
+
"muon_ns_steps": 5,
|
| 31 |
+
"muon_update_scale": 1.0,
|
| 32 |
+
"muon_nesterov": false,
|
| 33 |
+
"muon_width_scale": false,
|
| 34 |
+
"muon_grouping": "legacy_dim_ge_2",
|
| 35 |
+
"muon_param_count": 2616320,
|
| 36 |
+
"muon_adam_param_count": 8192,
|
| 37 |
+
"muon_param_names": [
|
| 38 |
+
"vocab_embed.embedding",
|
| 39 |
+
"sigma_map.net.0.weight",
|
| 40 |
+
"sigma_map.net.2.weight",
|
| 41 |
+
"blocks.0.attn_qkv.weight",
|
| 42 |
+
"blocks.0.attn_out.weight",
|
| 43 |
+
"blocks.0.mlp.0.weight",
|
| 44 |
+
"blocks.0.mlp.2.weight",
|
| 45 |
+
"blocks.0.adaLN_modulation.weight",
|
| 46 |
+
"blocks.1.attn_qkv.weight",
|
| 47 |
+
"blocks.1.attn_out.weight",
|
| 48 |
+
"blocks.1.mlp.0.weight",
|
| 49 |
+
"blocks.1.mlp.2.weight",
|
| 50 |
+
"blocks.1.adaLN_modulation.weight",
|
| 51 |
+
"blocks.2.attn_qkv.weight",
|
| 52 |
+
"blocks.2.attn_out.weight",
|
| 53 |
+
"blocks.2.mlp.0.weight",
|
| 54 |
+
"blocks.2.mlp.2.weight",
|
| 55 |
+
"blocks.2.adaLN_modulation.weight",
|
| 56 |
+
"output_layer.linear.weight",
|
| 57 |
+
"output_layer.adaLN_modulation.weight"
|
| 58 |
+
],
|
| 59 |
+
"muon_adam_param_names": [
|
| 60 |
+
"sigma_map.net.0.bias",
|
| 61 |
+
"sigma_map.net.2.bias",
|
| 62 |
+
"blocks.0.norm1.weight",
|
| 63 |
+
"blocks.0.norm2.weight",
|
| 64 |
+
"blocks.0.mlp.0.bias",
|
| 65 |
+
"blocks.0.mlp.2.bias",
|
| 66 |
+
"blocks.0.adaLN_modulation.bias",
|
| 67 |
+
"blocks.1.norm1.weight",
|
| 68 |
+
"blocks.1.norm2.weight",
|
| 69 |
+
"blocks.1.mlp.0.bias",
|
| 70 |
+
"blocks.1.mlp.2.bias",
|
| 71 |
+
"blocks.1.adaLN_modulation.bias",
|
| 72 |
+
"blocks.2.norm1.weight",
|
| 73 |
+
"blocks.2.norm2.weight",
|
| 74 |
+
"blocks.2.mlp.0.bias",
|
| 75 |
+
"blocks.2.mlp.2.bias",
|
| 76 |
+
"blocks.2.adaLN_modulation.bias",
|
| 77 |
+
"output_layer.norm_final.weight",
|
| 78 |
+
"output_layer.adaLN_modulation.bias"
|
| 79 |
+
],
|
| 80 |
+
"muon_effective_nesterov": false,
|
| 81 |
+
"muon_effective_width_scale": false,
|
| 82 |
+
"muon_effective_weight_decay": 0.1,
|
| 83 |
+
"muon_adam_fallback_nesterov": false,
|
| 84 |
+
"muon_adam_fallback_weight_decay": 0.1,
|
| 85 |
+
"ema_decay": 0.9999,
|
| 86 |
+
"ema_start_step": 0,
|
| 87 |
+
"model_type": "ddit",
|
| 88 |
+
"ddit_mlp_type": "gelu",
|
| 89 |
+
"elf_num_time_tokens": 4,
|
| 90 |
+
"elf_num_model_mode_tokens": 0,
|
| 91 |
+
"qk_norm": true,
|
| 92 |
+
"output_bias": false,
|
| 93 |
+
"output_init_std": -1.0,
|
| 94 |
+
"norm_type": "rmsnorm",
|
| 95 |
+
"target_loss": "hard_ce",
|
| 96 |
+
"linear_soft_target_power": 1.0,
|
| 97 |
+
"linear_soft_target_min_conf": 0.0,
|
| 98 |
+
"linear_soft_target_max_conf": 1.0,
|
| 99 |
+
"t_sampling_mode": "logit_normal",
|
| 100 |
+
"t_sampling_power": 1.0,
|
| 101 |
+
"t_sampling_eps": 0.0001,
|
| 102 |
+
"t_sampling_logit_mean": -1.5,
|
| 103 |
+
"t_sampling_logit_std": 0.8,
|
| 104 |
+
"dual_t": true,
|
| 105 |
+
"corrupt_t_mode": "same",
|
| 106 |
+
"corrupt_min_t": 0.0,
|
| 107 |
+
"corrupt_max_t": 1.0,
|
| 108 |
+
"prefix_block_prob": 0.0,
|
| 109 |
+
"prefix_block_len": 128,
|
| 110 |
+
"mask_ratio_floor_schedule": "none",
|
| 111 |
+
"dirichlet_endpoint_mode": "categorical_dual_t",
|
| 112 |
+
"dirichlet_semantic_t_mode": "same",
|
| 113 |
+
"dirichlet_semantic_t_value": 0.0,
|
| 114 |
+
"dirichlet_semantic_t_curve": "linear",
|
| 115 |
+
"dirichlet_semantic_t_power": 1.0,
|
| 116 |
+
"endpoint_sequence_random_prob_alpha": 0.0,
|
| 117 |
+
"categorical_wrong_from_full_vocab": true,
|
| 118 |
+
"categorical_wrong_from_batch_valid_tokens": false,
|
| 119 |
+
"categorical_wrong_basin_token_ids": "",
|
| 120 |
+
"categorical_wrong_basin_prob": 0.0,
|
| 121 |
+
"categorical_wrong_unigram_prob": 0.0,
|
| 122 |
+
"categorical_wrong_uniform_prob": 0.0,
|
| 123 |
+
"categorical_wrong_prob_floor": 0.0,
|
| 124 |
+
"categorical_wrong_corpus_unigram_path": "",
|
| 125 |
+
"categorical_wrong_corpus_unigram_alpha": 1.0,
|
| 126 |
+
"categorical_wrong_basin_shared_prob": 0.0,
|
| 127 |
+
"categorical_wrong_unigram_shared_prob": 0.0,
|
| 128 |
+
"mask_mixture_original_prob": 0.0,
|
| 129 |
+
"mask_mixture_lowk_prob": 0.0,
|
| 130 |
+
"mask_mixture_lowcorrupt_prob": 0.0,
|
| 131 |
+
"mask_mixture_block_prob": 0.0,
|
| 132 |
+
"mask_mixture_all_prob": 1.0,
|
| 133 |
+
"mask_mixture_lowk_clean_tokens": "0",
|
| 134 |
+
"mask_mixture_lowcorrupt_tokens": "1,2,4,8,16,32,64",
|
| 135 |
+
"mask_mixture_block_tokens": "64,128",
|
| 136 |
+
"simplex_bridge_sampler": "dirichlet",
|
| 137 |
+
"logistic_normal_sigma_min": 0.1,
|
| 138 |
+
"logistic_normal_sigma_max": 1.0,
|
| 139 |
+
"logistic_normal_tau_min": 1.0,
|
| 140 |
+
"logistic_normal_tau_max": 1.0,
|
| 141 |
+
"torch_compile": false,
|
| 142 |
+
"compile_mode": "max-autotune",
|
| 143 |
+
"state_format": "prob",
|
| 144 |
+
"meanflow_weight": 0.0,
|
| 145 |
+
"rollout_train_prob": 0.5,
|
| 146 |
+
"rollout_train_steps": 4,
|
| 147 |
+
"rollout_train_infer_steps": 1,
|
| 148 |
+
"rollout_train_time_mode": "sampled_path",
|
| 149 |
+
"rollout_train_s_dist": "uniform",
|
| 150 |
+
"rollout_train_s_min_frac": 0.0,
|
| 151 |
+
"rollout_train_s_max_frac": 0.125,
|
| 152 |
+
"rollout_train_s_beta_alpha": 2.0,
|
| 153 |
+
"rollout_train_s_beta_beta": 6.0,
|
| 154 |
+
"rollout_train_temp": 1.45,
|
| 155 |
+
"rollout_train_max_gamma": 1.0,
|
| 156 |
+
"rollout_train_corrupt_only": true,
|
| 157 |
+
"rollout_train_samplewise": true,
|
| 158 |
+
"rollout_train_compute_always": false,
|
| 159 |
+
"rollout_train_sync_t": true,
|
| 160 |
+
"bridge_noise_init": "logistic_normal",
|
| 161 |
+
"noise_sigma": -1.0,
|
| 162 |
+
"allow_tf32": true,
|
| 163 |
+
"activation_checkpointing": false,
|
| 164 |
+
"activation_checkpoint_interval": 1,
|
| 165 |
+
"activation_checkpoint_scope": "block",
|
| 166 |
+
"ddp_static_graph": false,
|
| 167 |
+
"ddp_gradient_as_bucket_view": true,
|
| 168 |
+
"blocking_data_transfer": false,
|
| 169 |
+
"dataloader_prefetch_factor": 4,
|
| 170 |
+
"full_train_stats": false,
|
| 171 |
+
"tokenized_hf": false,
|
| 172 |
+
"tokenized_pad_token": "pad",
|
| 173 |
+
"elf_conditional_hf": false,
|
| 174 |
+
"record_pad_truncate": false,
|
| 175 |
+
"record_add_eos": false,
|
| 176 |
+
"record_add_special_tokens": false,
|
| 177 |
+
"record_pad_token": "pad",
|
| 178 |
+
"record_shuffle_buffer": 10000,
|
| 179 |
+
"wrap": true,
|
| 180 |
+
"wrap_mode": "stream",
|
| 181 |
+
"wrap_record_buffer_size": 200,
|
| 182 |
+
"owt_cached_chunks": true,
|
| 183 |
+
"owt_chunk_cache_dir": "/e2e-data/evad-tech-vla/wanghan58/data/small_benchmarks/langflow_2604_11748/openwebtext_lta_cached_chunks/gpt2_len1024_train8_compact_overfit",
|
| 184 |
+
"owt_chunk_cache_rebuild": false,
|
| 185 |
+
"owt_chunk_cache_write_batch": 4096,
|
| 186 |
+
"owt_exact_repeat_per_chunk": 64,
|
| 187 |
+
"online_chunk_shuffle": false,
|
| 188 |
+
"online_chunk_shuffle_buffer": 10000,
|
| 189 |
+
"openwebtext_split": "train_minus_100k",
|
| 190 |
+
"detokenizer": "auto",
|
| 191 |
+
"resolved_detokenizer": null,
|
| 192 |
+
"num_workers": 0,
|
| 193 |
+
"latest_every": 1000,
|
| 194 |
+
"resume_path": ""
|
| 195 |
+
}
|
| 196 |
+
step=100 epoch=100/1000 epoch_step=1/1 micro_steps=100 elapsed=24.8s lr=2.000000e-03 loss=7.7206 loss_recon=7.7206 loss_meanflow=0.0000 mean_model_t=0.2085 mean_corrupt_t=0.2085 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.5077 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.0958 corrupt_frac=1.0000 acc_corrupt=0.0958 loss_corrupt=7.7206 wrong_frac=0.7915 init_acc_corrupt=0.1164 acc_corrupt_t_0p0_0p2=0.0500 corrupt_frac_t_0p0_0p2=0.5640 acc_corrupt_t_0p2_0p4=0.1270 corrupt_frac_t_0p2_0p4=0.3466 acc_corrupt_t_0p4_0p6=0.2493 corrupt_frac_t_0p4_0p6=0.0791 acc_corrupt_t_0p6_0p8=0.3719 corrupt_frac_t_0p6_0p8=0.0136 out_w_norm=1.0047 out_g_norm=1.0928 acc_corrupt_t_0p8_1p0=0.4936 corrupt_frac_t_0p8_1p0=0.0078 loss_all=7.4724 init_gold_top10=0.2003 init_gold_top100=0.4085 rollout_applied_pos_frac=0.4453 init_acc_rollout_applied=0.1056 init_acc_rollout_kept=0.1192 logit_acc_rollout_applied=0.0969 logit_acc_rollout_kept=0.0996
|
| 197 |
+
step=200 epoch=200/1000 epoch_step=1/1 micro_steps=200 elapsed=23.9s lr=2.000000e-03 loss=7.0874 loss_recon=7.0874 loss_meanflow=0.0000 mean_model_t=0.2096 mean_corrupt_t=0.2096 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.4995 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.1036 corrupt_frac=1.0000 acc_corrupt=0.1036 loss_corrupt=7.0874 wrong_frac=0.7905 init_acc_corrupt=0.1172 acc_corrupt_t_0p0_0p2=0.0560 corrupt_frac_t_0p0_0p2=0.5557 acc_corrupt_t_0p2_0p4=0.1392 corrupt_frac_t_0p2_0p4=0.3595 acc_corrupt_t_0p4_0p6=0.2552 corrupt_frac_t_0p4_0p6=0.0762 acc_corrupt_t_0p6_0p8=0.3485 corrupt_frac_t_0p6_0p8=0.0129 out_w_norm=2.8612 out_g_norm=1.7761 acc_corrupt_t_0p8_1p0=0.4243 corrupt_frac_t_0p8_1p0=0.0078 loss_all=6.6891 init_gold_top10=0.2090 init_gold_top100=0.4276 rollout_applied_pos_frac=0.4688 init_acc_rollout_applied=0.1378 init_acc_rollout_kept=0.1215 logit_acc_rollout_applied=0.1143 logit_acc_rollout_kept=0.1146
|
| 198 |
+
step=300 epoch=300/1000 epoch_step=1/1 micro_steps=300 elapsed=24.0s lr=2.000000e-03 loss=6.4546 loss_recon=6.4546 loss_meanflow=0.0000 mean_model_t=0.2098 mean_corrupt_t=0.2098 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.5023 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.1137 corrupt_frac=1.0000 acc_corrupt=0.1137 loss_corrupt=6.4546 wrong_frac=0.7902 init_acc_corrupt=0.1187 acc_corrupt_t_0p0_0p2=0.0592 corrupt_frac_t_0p0_0p2=0.5544 acc_corrupt_t_0p2_0p4=0.1548 corrupt_frac_t_0p2_0p4=0.3617 acc_corrupt_t_0p4_0p6=0.2839 corrupt_frac_t_0p4_0p6=0.0743 out_w_norm=4.3408 out_g_norm=1.3199 acc_corrupt_t_0p6_0p8=0.3901 corrupt_frac_t_0p6_0p8=0.0139 acc_corrupt_t_0p8_1p0=0.5415 corrupt_frac_t_0p8_1p0=0.0078 loss_all=6.2279 init_gold_top10=0.2020 init_gold_top100=0.4348 rollout_applied_pos_frac=0.4375 init_acc_rollout_applied=0.0878 init_acc_rollout_kept=0.1243 logit_acc_rollout_applied=0.1032 logit_acc_rollout_kept=0.1241
|
| 199 |
+
step=400 epoch=400/1000 epoch_step=1/1 micro_steps=400 elapsed=23.9s lr=2.000000e-03 loss=5.9837 loss_recon=5.9837 loss_meanflow=0.0000 mean_model_t=0.2072 mean_corrupt_t=0.2072 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.5030 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.1233 corrupt_frac=1.0000 acc_corrupt=0.1233 loss_corrupt=5.9837 wrong_frac=0.7929 init_acc_corrupt=0.1160 acc_corrupt_t_0p0_0p2=0.0639 corrupt_frac_t_0p0_0p2=0.5638 acc_corrupt_t_0p2_0p4=0.1708 corrupt_frac_t_0p2_0p4=0.3526 acc_corrupt_t_0p4_0p6=0.3106 corrupt_frac_t_0p4_0p6=0.0753 out_w_norm=5.4789 out_g_norm=0.5031 acc_corrupt_t_0p6_0p8=0.4367 corrupt_frac_t_0p6_0p8=0.0128 acc_corrupt_t_0p8_1p0=0.5306 corrupt_frac_t_0p8_1p0=0.0117 loss_all=5.7599 init_gold_top10=0.2017 init_gold_top100=0.4699 rollout_applied_pos_frac=0.4844 init_acc_rollout_applied=0.1182 init_acc_rollout_kept=0.1042 logit_acc_rollout_applied=0.1310 logit_acc_rollout_kept=0.1227
|
| 200 |
+
step=500 epoch=500/1000 epoch_step=1/1 micro_steps=500 elapsed=23.9s lr=2.000000e-03 loss=5.4774 loss_recon=5.4774 loss_meanflow=0.0000 mean_model_t=0.2101 mean_corrupt_t=0.2101 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.4994 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.1378 corrupt_frac=1.0000 acc_corrupt=0.1378 loss_corrupt=5.4774 wrong_frac=0.7898 init_acc_corrupt=0.1197 acc_corrupt_t_0p0_0p2=0.0677 corrupt_frac_t_0p0_0p2=0.5506 acc_corrupt_t_0p2_0p4=0.1918 corrupt_frac_t_0p2_0p4=0.3660 acc_corrupt_t_0p4_0p6=0.3501 corrupt_frac_t_0p4_0p6=0.0752 acc_corrupt_t_0p6_0p8=0.4913 corrupt_frac_t_0p6_0p8=0.0128 out_w_norm=6.7180 out_g_norm=0.4070 acc_corrupt_t_0p8_1p0=0.6104 corrupt_frac_t_0p8_1p0=0.0078 loss_all=5.1003 init_gold_top10=0.2273 init_gold_top100=0.5161 rollout_applied_pos_frac=0.4922 init_acc_rollout_applied=0.1306 init_acc_rollout_kept=0.1486 logit_acc_rollout_applied=0.1487 logit_acc_rollout_kept=0.1648
|
| 201 |
+
step=600 epoch=600/1000 epoch_step=1/1 micro_steps=600 elapsed=23.9s lr=2.000000e-03 loss=4.8712 loss_recon=4.8712 loss_meanflow=0.0000 mean_model_t=0.2082 mean_corrupt_t=0.2082 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.5037 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.1521 corrupt_frac=1.0000 acc_corrupt=0.1521 loss_corrupt=4.8712 wrong_frac=0.7918 init_acc_corrupt=0.1187 acc_corrupt_t_0p0_0p2=0.0719 corrupt_frac_t_0p0_0p2=0.5629 acc_corrupt_t_0p2_0p4=0.2147 corrupt_frac_t_0p2_0p4=0.3502 acc_corrupt_t_0p4_0p6=0.4001 corrupt_frac_t_0p4_0p6=0.0768 out_w_norm=7.9804 out_g_norm=0.4280 acc_corrupt_t_0p6_0p8=0.5625 corrupt_frac_t_0p6_0p8=0.0133 loss_all=4.5656 init_gold_top10=0.2029 init_gold_top100=0.5925 rollout_applied_pos_frac=0.4922 init_acc_rollout_applied=0.1124 init_acc_rollout_kept=0.1078 logit_acc_rollout_applied=0.1600 logit_acc_rollout_kept=0.1533
|
| 202 |
+
step=700 epoch=700/1000 epoch_step=1/1 micro_steps=700 elapsed=24.1s lr=2.000000e-03 loss=4.2343 loss_recon=4.2343 loss_meanflow=0.0000 mean_model_t=0.2085 mean_corrupt_t=0.2085 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.5123 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.1770 corrupt_frac=1.0000 acc_corrupt=0.1770 loss_corrupt=4.2343 wrong_frac=0.7915 init_acc_corrupt=0.1192 acc_corrupt_t_0p0_0p2=0.0787 corrupt_frac_t_0p0_0p2=0.5591 acc_corrupt_t_0p2_0p4=0.2531 corrupt_frac_t_0p2_0p4=0.3563 acc_corrupt_t_0p4_0p6=0.4880 corrupt_frac_t_0p4_0p6=0.0764 out_w_norm=9.1564 out_g_norm=0.4726 acc_corrupt_t_0p6_0p8=0.6622 corrupt_frac_t_0p6_0p8=0.0125 acc_corrupt_t_0p8_1p0=0.8376 corrupt_frac_t_0p8_1p0=0.0078 loss_all=3.9427 init_gold_top10=0.2206 init_gold_top100=0.6230 rollout_applied_pos_frac=0.5156 init_acc_rollout_applied=0.1283 init_acc_rollout_kept=0.1214 logit_acc_rollout_applied=0.1920 logit_acc_rollout_kept=0.1924
|
| 203 |
+
step=800 epoch=800/1000 epoch_step=1/1 micro_steps=800 elapsed=23.9s lr=2.000000e-03 loss=3.7296 loss_recon=3.7296 loss_meanflow=0.0000 mean_model_t=0.2092 mean_corrupt_t=0.2092 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.5020 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.2008 corrupt_frac=1.0000 acc_corrupt=0.2008 loss_corrupt=3.7296 wrong_frac=0.7911 init_acc_corrupt=0.1208 acc_corrupt_t_0p0_0p2=0.0884 corrupt_frac_t_0p0_0p2=0.5509 acc_corrupt_t_0p2_0p4=0.2944 corrupt_frac_t_0p2_0p4=0.3674 acc_corrupt_t_0p4_0p6=0.5237 corrupt_frac_t_0p4_0p6=0.0748 acc_corrupt_t_0p6_0p8=0.6942 corrupt_frac_t_0p6_0p8=0.0126 out_w_norm=9.9997 out_g_norm=0.5989 acc_corrupt_t_0p8_1p0=0.8555 corrupt_frac_t_0p8_1p0=0.0078 loss_all=3.4499 init_gold_top10=0.2440 init_gold_top100=0.5899 rollout_applied_pos_frac=0.4531 init_acc_rollout_applied=0.1209 init_acc_rollout_kept=0.1210 logit_acc_rollout_applied=0.2120 logit_acc_rollout_kept=0.2282
|
| 204 |
+
step=900 epoch=900/1000 epoch_step=1/1 micro_steps=900 elapsed=24.0s lr=2.000000e-03 loss=3.2983 loss_recon=3.2983 loss_meanflow=0.0000 mean_model_t=0.2095 mean_corrupt_t=0.2095 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.5052 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.2379 corrupt_frac=1.0000 acc_corrupt=0.2379 loss_corrupt=3.2983 wrong_frac=0.7905 init_acc_corrupt=0.1230 acc_corrupt_t_0p0_0p2=0.1028 corrupt_frac_t_0p0_0p2=0.5516 acc_corrupt_t_0p2_0p4=0.3571 corrupt_frac_t_0p2_0p4=0.3645 acc_corrupt_t_0p4_0p6=0.5909 corrupt_frac_t_0p4_0p6=0.0748 acc_corrupt_t_0p6_0p8=0.7411 corrupt_frac_t_0p6_0p8=0.0123 out_w_norm=10.4993 out_g_norm=0.8729 acc_corrupt_t_0p8_1p0=0.8477 corrupt_frac_t_0p8_1p0=0.0078 loss_all=3.0155 init_gold_top10=0.2915 init_gold_top100=0.6315 rollout_applied_pos_frac=0.4766 init_acc_rollout_applied=0.1272 init_acc_rollout_kept=0.1170 logit_acc_rollout_applied=0.2742 logit_acc_rollout_kept=0.2656
|
| 205 |
+
step=1000 epoch=1000/1000 epoch_step=1/1 micro_steps=1000 elapsed=23.8s lr=2.000000e-03 loss=2.8778 loss_recon=2.8778 loss_meanflow=0.0000 mean_model_t=0.2077 mean_corrupt_t=0.2077 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.4952 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.2973 corrupt_frac=1.0000 acc_corrupt=0.2973 loss_corrupt=2.8778 wrong_frac=0.7924 init_acc_corrupt=0.1228 acc_corrupt_t_0p0_0p2=0.1297 corrupt_frac_t_0p0_0p2=0.5567 acc_corrupt_t_0p2_0p4=0.4607 corrupt_frac_t_0p2_0p4=0.3598 acc_corrupt_t_0p4_0p6=0.6989 corrupt_frac_t_0p4_0p6=0.0751 acc_corrupt_t_0p6_0p8=0.8142 corrupt_frac_t_0p6_0p8=0.0128 out_w_norm=10.8390 out_g_norm=1.0522 loss_all=2.7461 init_gold_top10=0.3628 init_gold_top100=0.6665 rollout_applied_pos_frac=0.5234 init_acc_rollout_applied=0.1368 init_acc_rollout_kept=0.0965 logit_acc_rollout_applied=0.3638 logit_acc_rollout_kept=0.2733
|
| 206 |
+
NCCL version 2.25.1+cuda12.8
|
| 207 |
+
resumed_from=runs/train8_ctx1024_p50_path4_unif0_0p125_outwdm1_ctx1024_sampledpath_true_20260517_224139/latest.pt start_step=1001
|
| 208 |
+
{
|
| 209 |
+
"device": "cuda:0",
|
| 210 |
+
"rank": 0,
|
| 211 |
+
"world_size": 4,
|
| 212 |
+
"samples": "owt_cached_chunks:8",
|
| 213 |
+
"vocab_size": 2664,
|
| 214 |
+
"tokenizer_vocab_size": 50257,
|
| 215 |
+
"save_dir": "runs/train8_ctx1024_p50_path4_unif0_0p125_outwdm1_ctx1024_sampledpath_true_20260517_224139",
|
| 216 |
+
"batch_size": 128,
|
| 217 |
+
"grad_accum": 1,
|
| 218 |
+
"effective_batch_size": 512,
|
| 219 |
+
"global_batch_size": 512,
|
| 220 |
+
"lr_schedule": "constant_warmup",
|
| 221 |
+
"optimizer": "muon",
|
| 222 |
+
"epochs": 0.0,
|
| 223 |
+
"steps_per_epoch": 1,
|
| 224 |
+
"total_steps": 2000,
|
| 225 |
+
"warmup_steps": 10,
|
| 226 |
+
"warmup_epochs": -1.0,
|
| 227 |
+
"min_lr": 0.0,
|
| 228 |
+
"weight_decay": 0.1,
|
| 229 |
+
"output_weight_decay": -1.0,
|
| 230 |
+
"adamw_param_groups": "nanogpt",
|
| 231 |
+
"adam_beta1": 0.9,
|
| 232 |
+
"adam_beta2": 0.95,
|
| 233 |
+
"adam_eps": 1e-08,
|
| 234 |
+
"muon_impl": "legacy",
|
| 235 |
+
"muon_momentum": 0.95,
|
| 236 |
+
"muon_ns_steps": 5,
|
| 237 |
+
"muon_update_scale": 1.0,
|
| 238 |
+
"muon_nesterov": false,
|
| 239 |
+
"muon_width_scale": false,
|
| 240 |
+
"muon_grouping": "legacy_dim_ge_2",
|
| 241 |
+
"muon_param_count": 2616320,
|
| 242 |
+
"muon_adam_param_count": 8192,
|
| 243 |
+
"muon_param_names": [
|
| 244 |
+
"vocab_embed.embedding",
|
| 245 |
+
"sigma_map.net.0.weight",
|
| 246 |
+
"sigma_map.net.2.weight",
|
| 247 |
+
"blocks.0.attn_qkv.weight",
|
| 248 |
+
"blocks.0.attn_out.weight",
|
| 249 |
+
"blocks.0.mlp.0.weight",
|
| 250 |
+
"blocks.0.mlp.2.weight",
|
| 251 |
+
"blocks.0.adaLN_modulation.weight",
|
| 252 |
+
"blocks.1.attn_qkv.weight",
|
| 253 |
+
"blocks.1.attn_out.weight",
|
| 254 |
+
"blocks.1.mlp.0.weight",
|
| 255 |
+
"blocks.1.mlp.2.weight",
|
| 256 |
+
"blocks.1.adaLN_modulation.weight",
|
| 257 |
+
"blocks.2.attn_qkv.weight",
|
| 258 |
+
"blocks.2.attn_out.weight",
|
| 259 |
+
"blocks.2.mlp.0.weight",
|
| 260 |
+
"blocks.2.mlp.2.weight",
|
| 261 |
+
"blocks.2.adaLN_modulation.weight",
|
| 262 |
+
"output_layer.linear.weight",
|
| 263 |
+
"output_layer.adaLN_modulation.weight"
|
| 264 |
+
],
|
| 265 |
+
"muon_adam_param_names": [
|
| 266 |
+
"sigma_map.net.0.bias",
|
| 267 |
+
"sigma_map.net.2.bias",
|
| 268 |
+
"blocks.0.norm1.weight",
|
| 269 |
+
"blocks.0.norm2.weight",
|
| 270 |
+
"blocks.0.mlp.0.bias",
|
| 271 |
+
"blocks.0.mlp.2.bias",
|
| 272 |
+
"blocks.0.adaLN_modulation.bias",
|
| 273 |
+
"blocks.1.norm1.weight",
|
| 274 |
+
"blocks.1.norm2.weight",
|
| 275 |
+
"blocks.1.mlp.0.bias",
|
| 276 |
+
"blocks.1.mlp.2.bias",
|
| 277 |
+
"blocks.1.adaLN_modulation.bias",
|
| 278 |
+
"blocks.2.norm1.weight",
|
| 279 |
+
"blocks.2.norm2.weight",
|
| 280 |
+
"blocks.2.mlp.0.bias",
|
| 281 |
+
"blocks.2.mlp.2.bias",
|
| 282 |
+
"blocks.2.adaLN_modulation.bias",
|
| 283 |
+
"output_layer.norm_final.weight",
|
| 284 |
+
"output_layer.adaLN_modulation.bias"
|
| 285 |
+
],
|
| 286 |
+
"muon_effective_nesterov": false,
|
| 287 |
+
"muon_effective_width_scale": false,
|
| 288 |
+
"muon_effective_weight_decay": 0.1,
|
| 289 |
+
"muon_adam_fallback_nesterov": false,
|
| 290 |
+
"muon_adam_fallback_weight_decay": 0.1,
|
| 291 |
+
"ema_decay": 0.9999,
|
| 292 |
+
"ema_start_step": 0,
|
| 293 |
+
"model_type": "ddit",
|
| 294 |
+
"ddit_mlp_type": "gelu",
|
| 295 |
+
"elf_num_time_tokens": 4,
|
| 296 |
+
"elf_num_model_mode_tokens": 0,
|
| 297 |
+
"qk_norm": true,
|
| 298 |
+
"output_bias": false,
|
| 299 |
+
"output_init_std": -1.0,
|
| 300 |
+
"norm_type": "rmsnorm",
|
| 301 |
+
"target_loss": "hard_ce",
|
| 302 |
+
"linear_soft_target_power": 1.0,
|
| 303 |
+
"linear_soft_target_min_conf": 0.0,
|
| 304 |
+
"linear_soft_target_max_conf": 1.0,
|
| 305 |
+
"t_sampling_mode": "logit_normal",
|
| 306 |
+
"t_sampling_power": 1.0,
|
| 307 |
+
"t_sampling_eps": 0.0001,
|
| 308 |
+
"t_sampling_logit_mean": -1.5,
|
| 309 |
+
"t_sampling_logit_std": 0.8,
|
| 310 |
+
"dual_t": true,
|
| 311 |
+
"corrupt_t_mode": "same",
|
| 312 |
+
"corrupt_min_t": 0.0,
|
| 313 |
+
"corrupt_max_t": 1.0,
|
| 314 |
+
"prefix_block_prob": 0.0,
|
| 315 |
+
"prefix_block_len": 128,
|
| 316 |
+
"mask_ratio_floor_schedule": "none",
|
| 317 |
+
"dirichlet_endpoint_mode": "categorical_dual_t",
|
| 318 |
+
"dirichlet_semantic_t_mode": "same",
|
| 319 |
+
"dirichlet_semantic_t_value": 0.0,
|
| 320 |
+
"dirichlet_semantic_t_curve": "linear",
|
| 321 |
+
"dirichlet_semantic_t_power": 1.0,
|
| 322 |
+
"endpoint_sequence_random_prob_alpha": 0.0,
|
| 323 |
+
"categorical_wrong_from_full_vocab": true,
|
| 324 |
+
"categorical_wrong_from_batch_valid_tokens": false,
|
| 325 |
+
"categorical_wrong_basin_token_ids": "",
|
| 326 |
+
"categorical_wrong_basin_prob": 0.0,
|
| 327 |
+
"categorical_wrong_unigram_prob": 0.0,
|
| 328 |
+
"categorical_wrong_uniform_prob": 0.0,
|
| 329 |
+
"categorical_wrong_prob_floor": 0.0,
|
| 330 |
+
"categorical_wrong_corpus_unigram_path": "",
|
| 331 |
+
"categorical_wrong_corpus_unigram_alpha": 1.0,
|
| 332 |
+
"categorical_wrong_basin_shared_prob": 0.0,
|
| 333 |
+
"categorical_wrong_unigram_shared_prob": 0.0,
|
| 334 |
+
"mask_mixture_original_prob": 0.0,
|
| 335 |
+
"mask_mixture_lowk_prob": 0.0,
|
| 336 |
+
"mask_mixture_lowcorrupt_prob": 0.0,
|
| 337 |
+
"mask_mixture_block_prob": 0.0,
|
| 338 |
+
"mask_mixture_all_prob": 1.0,
|
| 339 |
+
"mask_mixture_lowk_clean_tokens": "0",
|
| 340 |
+
"mask_mixture_lowcorrupt_tokens": "1,2,4,8,16,32,64",
|
| 341 |
+
"mask_mixture_block_tokens": "64,128",
|
| 342 |
+
"simplex_bridge_sampler": "dirichlet",
|
| 343 |
+
"logistic_normal_sigma_min": 0.1,
|
| 344 |
+
"logistic_normal_sigma_max": 1.0,
|
| 345 |
+
"logistic_normal_tau_min": 1.0,
|
| 346 |
+
"logistic_normal_tau_max": 1.0,
|
| 347 |
+
"torch_compile": false,
|
| 348 |
+
"compile_mode": "max-autotune",
|
| 349 |
+
"state_format": "prob",
|
| 350 |
+
"meanflow_weight": 0.0,
|
| 351 |
+
"rollout_train_prob": 0.5,
|
| 352 |
+
"rollout_train_steps": 4,
|
| 353 |
+
"rollout_train_infer_steps": 1,
|
| 354 |
+
"rollout_train_time_mode": "sampled_path",
|
| 355 |
+
"rollout_train_s_dist": "uniform",
|
| 356 |
+
"rollout_train_s_min_frac": 0.0,
|
| 357 |
+
"rollout_train_s_max_frac": 0.125,
|
| 358 |
+
"rollout_train_s_beta_alpha": 2.0,
|
| 359 |
+
"rollout_train_s_beta_beta": 6.0,
|
| 360 |
+
"rollout_train_temp": 1.45,
|
| 361 |
+
"rollout_train_max_gamma": 1.0,
|
| 362 |
+
"rollout_train_corrupt_only": true,
|
| 363 |
+
"rollout_train_samplewise": true,
|
| 364 |
+
"rollout_train_compute_always": false,
|
| 365 |
+
"rollout_train_sync_t": true,
|
| 366 |
+
"bridge_noise_init": "logistic_normal",
|
| 367 |
+
"noise_sigma": -1.0,
|
| 368 |
+
"allow_tf32": true,
|
| 369 |
+
"activation_checkpointing": false,
|
| 370 |
+
"activation_checkpoint_interval": 1,
|
| 371 |
+
"activation_checkpoint_scope": "block",
|
| 372 |
+
"ddp_static_graph": false,
|
| 373 |
+
"ddp_gradient_as_bucket_view": true,
|
| 374 |
+
"blocking_data_transfer": false,
|
| 375 |
+
"dataloader_prefetch_factor": 4,
|
| 376 |
+
"full_train_stats": false,
|
| 377 |
+
"tokenized_hf": false,
|
| 378 |
+
"tokenized_pad_token": "pad",
|
| 379 |
+
"elf_conditional_hf": false,
|
| 380 |
+
"record_pad_truncate": false,
|
| 381 |
+
"record_add_eos": false,
|
| 382 |
+
"record_add_special_tokens": false,
|
| 383 |
+
"record_pad_token": "pad",
|
| 384 |
+
"record_shuffle_buffer": 10000,
|
| 385 |
+
"wrap": true,
|
| 386 |
+
"wrap_mode": "stream",
|
| 387 |
+
"wrap_record_buffer_size": 200,
|
| 388 |
+
"owt_cached_chunks": true,
|
| 389 |
+
"owt_chunk_cache_dir": "/e2e-data/evad-tech-vla/wanghan58/data/small_benchmarks/langflow_2604_11748/openwebtext_lta_cached_chunks/gpt2_len1024_train8_compact_overfit",
|
| 390 |
+
"owt_chunk_cache_rebuild": false,
|
| 391 |
+
"owt_chunk_cache_write_batch": 4096,
|
| 392 |
+
"owt_exact_repeat_per_chunk": 64,
|
| 393 |
+
"online_chunk_shuffle": false,
|
| 394 |
+
"online_chunk_shuffle_buffer": 10000,
|
| 395 |
+
"openwebtext_split": "train_minus_100k",
|
| 396 |
+
"detokenizer": "auto",
|
| 397 |
+
"resolved_detokenizer": null,
|
| 398 |
+
"num_workers": 0,
|
| 399 |
+
"latest_every": 1000,
|
| 400 |
+
"resume_path": "runs/train8_ctx1024_p50_path4_unif0_0p125_outwdm1_ctx1024_sampledpath_true_20260517_224139/latest.pt"
|
| 401 |
+
}
|
| 402 |
+
step=1100 epoch=1100/2000 epoch_step=1/1 micro_steps=1100 elapsed=24.6s lr=2.000000e-03 loss=2.4561 loss_recon=2.4561 loss_meanflow=0.0000 mean_model_t=0.2085 mean_corrupt_t=0.2085 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.5077 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.3788 corrupt_frac=1.0000 acc_corrupt=0.3788 loss_corrupt=2.4561 wrong_frac=0.7915 init_acc_corrupt=0.1281 acc_corrupt_t_0p0_0p2=0.1728 corrupt_frac_t_0p0_0p2=0.5640 acc_corrupt_t_0p2_0p4=0.6004 corrupt_frac_t_0p2_0p4=0.3466 acc_corrupt_t_0p4_0p6=0.8111 corrupt_frac_t_0p4_0p6=0.0791 acc_corrupt_t_0p6_0p8=0.8828 corrupt_frac_t_0p6_0p8=0.0136 out_w_norm=11.1283 out_g_norm=1.2622 acc_corrupt_t_0p8_1p0=0.9307 corrupt_frac_t_0p8_1p0=0.0078 loss_all=2.1052 init_gold_top10=0.4033 init_gold_top100=0.6222 rollout_applied_pos_frac=0.4453 init_acc_rollout_applied=0.1418 init_acc_rollout_kept=0.1192 logit_acc_rollout_applied=0.4907 logit_acc_rollout_kept=0.4183
|
| 403 |
+
step=1200 epoch=1200/2000 epoch_step=1/1 micro_steps=1200 elapsed=23.7s lr=2.000000e-03 loss=2.0694 loss_recon=2.0694 loss_meanflow=0.0000 mean_model_t=0.2096 mean_corrupt_t=0.2096 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.4995 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.4665 corrupt_frac=1.0000 acc_corrupt=0.4665 loss_corrupt=2.0694 wrong_frac=0.7905 init_acc_corrupt=0.1362 acc_corrupt_t_0p0_0p2=0.2250 corrupt_frac_t_0p0_0p2=0.5557 acc_corrupt_t_0p2_0p4=0.7365 corrupt_frac_t_0p2_0p4=0.3595 acc_corrupt_t_0p4_0p6=0.9014 corrupt_frac_t_0p4_0p6=0.0762 acc_corrupt_t_0p6_0p8=0.9335 corrupt_frac_t_0p6_0p8=0.0129 out_w_norm=11.3914 out_g_norm=1.4567 acc_corrupt_t_0p8_1p0=0.9663 corrupt_frac_t_0p8_1p0=0.0078 loss_all=1.9323 init_gold_top10=0.4233 init_gold_top100=0.6392 rollout_applied_pos_frac=0.4688 init_acc_rollout_applied=0.1810 init_acc_rollout_kept=0.1215 logit_acc_rollout_applied=0.5332 logit_acc_rollout_kept=0.4872
|
| 404 |
+
step=1300 epoch=1300/2000 epoch_step=1/1 micro_steps=1300 elapsed=23.7s lr=2.000000e-03 loss=1.7590 loss_recon=1.7590 loss_meanflow=0.0000 mean_model_t=0.2098 mean_corrupt_t=0.2098 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.5023 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.5421 corrupt_frac=1.0000 acc_corrupt=0.5421 loss_corrupt=1.7590 wrong_frac=0.7902 init_acc_corrupt=0.1490 acc_corrupt_t_0p0_0p2=0.2864 corrupt_frac_t_0p0_0p2=0.5544 acc_corrupt_t_0p2_0p4=0.8375 corrupt_frac_t_0p2_0p4=0.3617 acc_corrupt_t_0p4_0p6=0.9568 corrupt_frac_t_0p4_0p6=0.0743 out_w_norm=11.5756 out_g_norm=1.5131 acc_corrupt_t_0p6_0p8=0.9688 corrupt_frac_t_0p6_0p8=0.0139 acc_corrupt_t_0p8_1p0=0.9624 corrupt_frac_t_0p8_1p0=0.0078 loss_all=1.4657 init_gold_top10=0.4631 init_gold_top100=0.6198 rollout_applied_pos_frac=0.4375 init_acc_rollout_applied=0.1669 init_acc_rollout_kept=0.1243 logit_acc_rollout_applied=0.6838 logit_acc_rollout_kept=0.5458
|
| 405 |
+
step=1400 epoch=1400/2000 epoch_step=1/1 micro_steps=1400 elapsed=23.7s lr=2.000000e-03 loss=1.5170 loss_recon=1.5170 loss_meanflow=0.0000 mean_model_t=0.2072 mean_corrupt_t=0.2072 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.5030 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.6018 corrupt_frac=1.0000 acc_corrupt=0.6018 loss_corrupt=1.5170 wrong_frac=0.7929 init_acc_corrupt=0.1570 acc_corrupt_t_0p0_0p2=0.3584 corrupt_frac_t_0p0_0p2=0.5638 acc_corrupt_t_0p2_0p4=0.9009 corrupt_frac_t_0p2_0p4=0.3526 acc_corrupt_t_0p4_0p6=0.9811 corrupt_frac_t_0p4_0p6=0.0753 out_w_norm=11.6982 out_g_norm=1.5191 acc_corrupt_t_0p6_0p8=0.9840 corrupt_frac_t_0p6_0p8=0.0128 acc_corrupt_t_0p8_1p0=0.9805 corrupt_frac_t_0p8_1p0=0.0117 loss_all=1.3672 init_gold_top10=0.4951 init_gold_top100=0.6454 rollout_applied_pos_frac=0.4844 init_acc_rollout_applied=0.2290 init_acc_rollout_kept=0.1042 logit_acc_rollout_applied=0.7189 logit_acc_rollout_kept=0.5401
|
| 406 |
+
step=1500 epoch=1500/2000 epoch_step=1/1 micro_steps=1500 elapsed=23.7s lr=2.000000e-03 loss=1.2915 loss_recon=1.2915 loss_meanflow=0.0000 mean_model_t=0.2101 mean_corrupt_t=0.2101 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.4994 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.6540 corrupt_frac=1.0000 acc_corrupt=0.6540 loss_corrupt=1.2915 wrong_frac=0.7898 init_acc_corrupt=0.1734 acc_corrupt_t_0p0_0p2=0.4134 corrupt_frac_t_0p0_0p2=0.5506 acc_corrupt_t_0p2_0p4=0.9392 corrupt_frac_t_0p2_0p4=0.3660 acc_corrupt_t_0p4_0p6=0.9911 corrupt_frac_t_0p4_0p6=0.0752 acc_corrupt_t_0p6_0p8=0.9905 corrupt_frac_t_0p6_0p8=0.0128 out_w_norm=11.7810 out_g_norm=1.4652 acc_corrupt_t_0p8_1p0=0.9917 corrupt_frac_t_0p8_1p0=0.0078 loss_all=1.0025 init_gold_top10=0.5435 init_gold_top100=0.6617 rollout_applied_pos_frac=0.4922 init_acc_rollout_applied=0.2334 init_acc_rollout_kept=0.1486 logit_acc_rollout_applied=0.7939 logit_acc_rollout_kept=0.6810
|
| 407 |
+
step=1600 epoch=1600/2000 epoch_step=1/1 micro_steps=1600 elapsed=23.7s lr=2.000000e-03 loss=1.1774 loss_recon=1.1774 loss_meanflow=0.0000 mean_model_t=0.2082 mean_corrupt_t=0.2082 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.5037 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.6826 corrupt_frac=1.0000 acc_corrupt=0.6826 loss_corrupt=1.1774 wrong_frac=0.7918 init_acc_corrupt=0.1812 acc_corrupt_t_0p0_0p2=0.4607 corrupt_frac_t_0p0_0p2=0.5629 acc_corrupt_t_0p2_0p4=0.9616 corrupt_frac_t_0p2_0p4=0.3502 acc_corrupt_t_0p4_0p6=0.9956 corrupt_frac_t_0p4_0p6=0.0768 out_w_norm=11.8241 out_g_norm=1.3619 acc_corrupt_t_0p6_0p8=0.9946 corrupt_frac_t_0p6_0p8=0.0133 loss_all=1.1961 init_gold_top10=0.5235 init_gold_top100=0.6535 rollout_applied_pos_frac=0.4922 init_acc_rollout_applied=0.2526 init_acc_rollout_kept=0.1078 logit_acc_rollout_applied=0.8093 logit_acc_rollout_kept=0.5727
|
| 408 |
+
step=1700 epoch=1700/2000 epoch_step=1/1 micro_steps=1700 elapsed=23.8s lr=2.000000e-03 loss=1.0426 loss_recon=1.0426 loss_meanflow=0.0000 mean_model_t=0.2085 mean_corrupt_t=0.2085 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.5123 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.7140 corrupt_frac=1.0000 acc_corrupt=0.7140 loss_corrupt=1.0426 wrong_frac=0.7915 init_acc_corrupt=0.1897 acc_corrupt_t_0p0_0p2=0.5042 corrupt_frac_t_0p0_0p2=0.5591 acc_corrupt_t_0p2_0p4=0.9759 corrupt_frac_t_0p2_0p4=0.3563 acc_corrupt_t_0p4_0p6=0.9978 corrupt_frac_t_0p4_0p6=0.0764 out_w_norm=11.8438 out_g_norm=1.2704 acc_corrupt_t_0p6_0p8=0.9969 corrupt_frac_t_0p6_0p8=0.0125 acc_corrupt_t_0p8_1p0=0.9912 corrupt_frac_t_0p8_1p0=0.0078 loss_all=1.0589 init_gold_top10=0.5409 init_gold_top100=0.6740 rollout_applied_pos_frac=0.5156 init_acc_rollout_applied=0.2758 init_acc_rollout_kept=0.1214 logit_acc_rollout_applied=0.7361 logit_acc_rollout_kept=0.6505
|
| 409 |
+
step=1800 epoch=1800/2000 epoch_step=1/1 micro_steps=1800 elapsed=23.7s lr=2.000000e-03 loss=0.9326 loss_recon=0.9326 loss_meanflow=0.0000 mean_model_t=0.2092 mean_corrupt_t=0.2092 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.5020 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.7394 corrupt_frac=1.0000 acc_corrupt=0.7394 loss_corrupt=0.9326 wrong_frac=0.7911 init_acc_corrupt=0.1967 acc_corrupt_t_0p0_0p2=0.5387 corrupt_frac_t_0p0_0p2=0.5509 acc_corrupt_t_0p2_0p4=0.9829 corrupt_frac_t_0p2_0p4=0.3674 acc_corrupt_t_0p4_0p6=0.9988 corrupt_frac_t_0p4_0p6=0.0748 acc_corrupt_t_0p6_0p8=0.9967 corrupt_frac_t_0p6_0p8=0.0126 out_w_norm=11.8643 out_g_norm=1.1264 acc_corrupt_t_0p8_1p0=0.9866 corrupt_frac_t_0p8_1p0=0.0078 loss_all=0.7271 init_gold_top10=0.5306 init_gold_top100=0.6286 rollout_applied_pos_frac=0.4531 init_acc_rollout_applied=0.2711 init_acc_rollout_kept=0.1210 logit_acc_rollout_applied=0.8536 logit_acc_rollout_kept=0.7328
|
| 410 |
+
step=1900 epoch=1900/2000 epoch_step=1/1 micro_steps=1900 elapsed=23.8s lr=2.000000e-03 loss=0.8706 loss_recon=0.8706 loss_meanflow=0.0000 mean_model_t=0.2095 mean_corrupt_t=0.2095 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.5052 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.7491 corrupt_frac=1.0000 acc_corrupt=0.7491 loss_corrupt=0.8706 wrong_frac=0.7905 init_acc_corrupt=0.2009 acc_corrupt_t_0p0_0p2=0.5529 corrupt_frac_t_0p0_0p2=0.5516 acc_corrupt_t_0p2_0p4=0.9885 corrupt_frac_t_0p2_0p4=0.3645 acc_corrupt_t_0p4_0p6=0.9988 corrupt_frac_t_0p4_0p6=0.0748 acc_corrupt_t_0p6_0p8=0.9971 corrupt_frac_t_0p6_0p8=0.0123 out_w_norm=11.8558 out_g_norm=1.0997 acc_corrupt_t_0p8_1p0=0.9964 corrupt_frac_t_0p8_1p0=0.0078 loss_all=0.7654 init_gold_top10=0.5507 init_gold_top100=0.6459 rollout_applied_pos_frac=0.4766 init_acc_rollout_applied=0.2787 init_acc_rollout_kept=0.1170 logit_acc_rollout_applied=0.8564 logit_acc_rollout_kept=0.7075
|
| 411 |
+
step=2000 epoch=2000/2000 epoch_step=1/1 micro_steps=2000 elapsed=23.7s lr=2.000000e-03 loss=0.7684 loss_recon=0.7684 loss_meanflow=0.0000 mean_model_t=0.2077 mean_corrupt_t=0.2077 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.4952 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.7720 corrupt_frac=1.0000 acc_corrupt=0.7720 loss_corrupt=0.7684 wrong_frac=0.7924 init_acc_corrupt=0.1993 acc_corrupt_t_0p0_0p2=0.5971 corrupt_frac_t_0p0_0p2=0.5567 acc_corrupt_t_0p2_0p4=0.9900 corrupt_frac_t_0p2_0p4=0.3598 acc_corrupt_t_0p4_0p6=0.9990 corrupt_frac_t_0p4_0p6=0.0751 acc_corrupt_t_0p6_0p8=0.9969 corrupt_frac_t_0p6_0p8=0.0128 out_w_norm=11.8523 out_g_norm=1.1152 loss_all=0.7562 init_gold_top10=0.5871 init_gold_top100=0.6723 rollout_applied_pos_frac=0.5234 init_acc_rollout_applied=0.3057 init_acc_rollout_kept=0.0965 logit_acc_rollout_applied=0.8930 logit_acc_rollout_kept=0.6352
|
| 412 |
+
NCCL version 2.25.1+cuda12.8
|
| 413 |
+
resumed_from=runs/train8_ctx1024_p50_path4_unif0_0p125_outwdm1_ctx1024_sampledpath_true_20260517_224139/latest.pt start_step=2001
|
| 414 |
+
{
|
| 415 |
+
"device": "cuda:0",
|
| 416 |
+
"rank": 0,
|
| 417 |
+
"world_size": 4,
|
| 418 |
+
"samples": "owt_cached_chunks:8",
|
| 419 |
+
"vocab_size": 2664,
|
| 420 |
+
"tokenizer_vocab_size": 50257,
|
| 421 |
+
"save_dir": "runs/train8_ctx1024_p50_path4_unif0_0p125_outwdm1_ctx1024_sampledpath_true_20260517_224139",
|
| 422 |
+
"batch_size": 128,
|
| 423 |
+
"grad_accum": 1,
|
| 424 |
+
"effective_batch_size": 512,
|
| 425 |
+
"global_batch_size": 512,
|
| 426 |
+
"lr_schedule": "constant_warmup",
|
| 427 |
+
"optimizer": "muon",
|
| 428 |
+
"epochs": 0.0,
|
| 429 |
+
"steps_per_epoch": 1,
|
| 430 |
+
"total_steps": 3000,
|
| 431 |
+
"warmup_steps": 10,
|
| 432 |
+
"warmup_epochs": -1.0,
|
| 433 |
+
"min_lr": 0.0,
|
| 434 |
+
"weight_decay": 0.1,
|
| 435 |
+
"output_weight_decay": -1.0,
|
| 436 |
+
"adamw_param_groups": "nanogpt",
|
| 437 |
+
"adam_beta1": 0.9,
|
| 438 |
+
"adam_beta2": 0.95,
|
| 439 |
+
"adam_eps": 1e-08,
|
| 440 |
+
"muon_impl": "legacy",
|
| 441 |
+
"muon_momentum": 0.95,
|
| 442 |
+
"muon_ns_steps": 5,
|
| 443 |
+
"muon_update_scale": 1.0,
|
| 444 |
+
"muon_nesterov": false,
|
| 445 |
+
"muon_width_scale": false,
|
| 446 |
+
"muon_grouping": "legacy_dim_ge_2",
|
| 447 |
+
"muon_param_count": 2616320,
|
| 448 |
+
"muon_adam_param_count": 8192,
|
| 449 |
+
"muon_param_names": [
|
| 450 |
+
"vocab_embed.embedding",
|
| 451 |
+
"sigma_map.net.0.weight",
|
| 452 |
+
"sigma_map.net.2.weight",
|
| 453 |
+
"blocks.0.attn_qkv.weight",
|
| 454 |
+
"blocks.0.attn_out.weight",
|
| 455 |
+
"blocks.0.mlp.0.weight",
|
| 456 |
+
"blocks.0.mlp.2.weight",
|
| 457 |
+
"blocks.0.adaLN_modulation.weight",
|
| 458 |
+
"blocks.1.attn_qkv.weight",
|
| 459 |
+
"blocks.1.attn_out.weight",
|
| 460 |
+
"blocks.1.mlp.0.weight",
|
| 461 |
+
"blocks.1.mlp.2.weight",
|
| 462 |
+
"blocks.1.adaLN_modulation.weight",
|
| 463 |
+
"blocks.2.attn_qkv.weight",
|
| 464 |
+
"blocks.2.attn_out.weight",
|
| 465 |
+
"blocks.2.mlp.0.weight",
|
| 466 |
+
"blocks.2.mlp.2.weight",
|
| 467 |
+
"blocks.2.adaLN_modulation.weight",
|
| 468 |
+
"output_layer.linear.weight",
|
| 469 |
+
"output_layer.adaLN_modulation.weight"
|
| 470 |
+
],
|
| 471 |
+
"muon_adam_param_names": [
|
| 472 |
+
"sigma_map.net.0.bias",
|
| 473 |
+
"sigma_map.net.2.bias",
|
| 474 |
+
"blocks.0.norm1.weight",
|
| 475 |
+
"blocks.0.norm2.weight",
|
| 476 |
+
"blocks.0.mlp.0.bias",
|
| 477 |
+
"blocks.0.mlp.2.bias",
|
| 478 |
+
"blocks.0.adaLN_modulation.bias",
|
| 479 |
+
"blocks.1.norm1.weight",
|
| 480 |
+
"blocks.1.norm2.weight",
|
| 481 |
+
"blocks.1.mlp.0.bias",
|
| 482 |
+
"blocks.1.mlp.2.bias",
|
| 483 |
+
"blocks.1.adaLN_modulation.bias",
|
| 484 |
+
"blocks.2.norm1.weight",
|
| 485 |
+
"blocks.2.norm2.weight",
|
| 486 |
+
"blocks.2.mlp.0.bias",
|
| 487 |
+
"blocks.2.mlp.2.bias",
|
| 488 |
+
"blocks.2.adaLN_modulation.bias",
|
| 489 |
+
"output_layer.norm_final.weight",
|
| 490 |
+
"output_layer.adaLN_modulation.bias"
|
| 491 |
+
],
|
| 492 |
+
"muon_effective_nesterov": false,
|
| 493 |
+
"muon_effective_width_scale": false,
|
| 494 |
+
"muon_effective_weight_decay": 0.1,
|
| 495 |
+
"muon_adam_fallback_nesterov": false,
|
| 496 |
+
"muon_adam_fallback_weight_decay": 0.1,
|
| 497 |
+
"ema_decay": 0.9999,
|
| 498 |
+
"ema_start_step": 0,
|
| 499 |
+
"model_type": "ddit",
|
| 500 |
+
"ddit_mlp_type": "gelu",
|
| 501 |
+
"elf_num_time_tokens": 4,
|
| 502 |
+
"elf_num_model_mode_tokens": 0,
|
| 503 |
+
"qk_norm": true,
|
| 504 |
+
"output_bias": false,
|
| 505 |
+
"output_init_std": -1.0,
|
| 506 |
+
"norm_type": "rmsnorm",
|
| 507 |
+
"target_loss": "hard_ce",
|
| 508 |
+
"linear_soft_target_power": 1.0,
|
| 509 |
+
"linear_soft_target_min_conf": 0.0,
|
| 510 |
+
"linear_soft_target_max_conf": 1.0,
|
| 511 |
+
"t_sampling_mode": "logit_normal",
|
| 512 |
+
"t_sampling_power": 1.0,
|
| 513 |
+
"t_sampling_eps": 0.0001,
|
| 514 |
+
"t_sampling_logit_mean": -1.5,
|
| 515 |
+
"t_sampling_logit_std": 0.8,
|
| 516 |
+
"dual_t": true,
|
| 517 |
+
"corrupt_t_mode": "same",
|
| 518 |
+
"corrupt_min_t": 0.0,
|
| 519 |
+
"corrupt_max_t": 1.0,
|
| 520 |
+
"prefix_block_prob": 0.0,
|
| 521 |
+
"prefix_block_len": 128,
|
| 522 |
+
"mask_ratio_floor_schedule": "none",
|
| 523 |
+
"dirichlet_endpoint_mode": "categorical_dual_t",
|
| 524 |
+
"dirichlet_semantic_t_mode": "same",
|
| 525 |
+
"dirichlet_semantic_t_value": 0.0,
|
| 526 |
+
"dirichlet_semantic_t_curve": "linear",
|
| 527 |
+
"dirichlet_semantic_t_power": 1.0,
|
| 528 |
+
"endpoint_sequence_random_prob_alpha": 0.0,
|
| 529 |
+
"categorical_wrong_from_full_vocab": true,
|
| 530 |
+
"categorical_wrong_from_batch_valid_tokens": false,
|
| 531 |
+
"categorical_wrong_basin_token_ids": "",
|
| 532 |
+
"categorical_wrong_basin_prob": 0.0,
|
| 533 |
+
"categorical_wrong_unigram_prob": 0.0,
|
| 534 |
+
"categorical_wrong_uniform_prob": 0.0,
|
| 535 |
+
"categorical_wrong_prob_floor": 0.0,
|
| 536 |
+
"categorical_wrong_corpus_unigram_path": "",
|
| 537 |
+
"categorical_wrong_corpus_unigram_alpha": 1.0,
|
| 538 |
+
"categorical_wrong_basin_shared_prob": 0.0,
|
| 539 |
+
"categorical_wrong_unigram_shared_prob": 0.0,
|
| 540 |
+
"mask_mixture_original_prob": 0.0,
|
| 541 |
+
"mask_mixture_lowk_prob": 0.0,
|
| 542 |
+
"mask_mixture_lowcorrupt_prob": 0.0,
|
| 543 |
+
"mask_mixture_block_prob": 0.0,
|
| 544 |
+
"mask_mixture_all_prob": 1.0,
|
| 545 |
+
"mask_mixture_lowk_clean_tokens": "0",
|
| 546 |
+
"mask_mixture_lowcorrupt_tokens": "1,2,4,8,16,32,64",
|
| 547 |
+
"mask_mixture_block_tokens": "64,128",
|
| 548 |
+
"simplex_bridge_sampler": "dirichlet",
|
| 549 |
+
"logistic_normal_sigma_min": 0.1,
|
| 550 |
+
"logistic_normal_sigma_max": 1.0,
|
| 551 |
+
"logistic_normal_tau_min": 1.0,
|
| 552 |
+
"logistic_normal_tau_max": 1.0,
|
| 553 |
+
"torch_compile": false,
|
| 554 |
+
"compile_mode": "max-autotune",
|
| 555 |
+
"state_format": "prob",
|
| 556 |
+
"meanflow_weight": 0.0,
|
| 557 |
+
"rollout_train_prob": 0.5,
|
| 558 |
+
"rollout_train_steps": 4,
|
| 559 |
+
"rollout_train_infer_steps": 1,
|
| 560 |
+
"rollout_train_time_mode": "sampled_path",
|
| 561 |
+
"rollout_train_s_dist": "uniform",
|
| 562 |
+
"rollout_train_s_min_frac": 0.0,
|
| 563 |
+
"rollout_train_s_max_frac": 0.125,
|
| 564 |
+
"rollout_train_s_beta_alpha": 2.0,
|
| 565 |
+
"rollout_train_s_beta_beta": 6.0,
|
| 566 |
+
"rollout_train_temp": 1.45,
|
| 567 |
+
"rollout_train_max_gamma": 1.0,
|
| 568 |
+
"rollout_train_corrupt_only": true,
|
| 569 |
+
"rollout_train_samplewise": true,
|
| 570 |
+
"rollout_train_compute_always": false,
|
| 571 |
+
"rollout_train_sync_t": true,
|
| 572 |
+
"bridge_noise_init": "logistic_normal",
|
| 573 |
+
"noise_sigma": -1.0,
|
| 574 |
+
"allow_tf32": true,
|
| 575 |
+
"activation_checkpointing": false,
|
| 576 |
+
"activation_checkpoint_interval": 1,
|
| 577 |
+
"activation_checkpoint_scope": "block",
|
| 578 |
+
"ddp_static_graph": false,
|
| 579 |
+
"ddp_gradient_as_bucket_view": true,
|
| 580 |
+
"blocking_data_transfer": false,
|
| 581 |
+
"dataloader_prefetch_factor": 4,
|
| 582 |
+
"full_train_stats": false,
|
| 583 |
+
"tokenized_hf": false,
|
| 584 |
+
"tokenized_pad_token": "pad",
|
| 585 |
+
"elf_conditional_hf": false,
|
| 586 |
+
"record_pad_truncate": false,
|
| 587 |
+
"record_add_eos": false,
|
| 588 |
+
"record_add_special_tokens": false,
|
| 589 |
+
"record_pad_token": "pad",
|
| 590 |
+
"record_shuffle_buffer": 10000,
|
| 591 |
+
"wrap": true,
|
| 592 |
+
"wrap_mode": "stream",
|
| 593 |
+
"wrap_record_buffer_size": 200,
|
| 594 |
+
"owt_cached_chunks": true,
|
| 595 |
+
"owt_chunk_cache_dir": "/e2e-data/evad-tech-vla/wanghan58/data/small_benchmarks/langflow_2604_11748/openwebtext_lta_cached_chunks/gpt2_len1024_train8_compact_overfit",
|
| 596 |
+
"owt_chunk_cache_rebuild": false,
|
| 597 |
+
"owt_chunk_cache_write_batch": 4096,
|
| 598 |
+
"owt_exact_repeat_per_chunk": 64,
|
| 599 |
+
"online_chunk_shuffle": false,
|
| 600 |
+
"online_chunk_shuffle_buffer": 10000,
|
| 601 |
+
"openwebtext_split": "train_minus_100k",
|
| 602 |
+
"detokenizer": "auto",
|
| 603 |
+
"resolved_detokenizer": null,
|
| 604 |
+
"num_workers": 0,
|
| 605 |
+
"latest_every": 1000,
|
| 606 |
+
"resume_path": "runs/train8_ctx1024_p50_path4_unif0_0p125_outwdm1_ctx1024_sampledpath_true_20260517_224139/latest.pt"
|
| 607 |
+
}
|
| 608 |
+
step=2100 epoch=2100/3000 epoch_step=1/1 micro_steps=2100 elapsed=24.6s lr=2.000000e-03 loss=0.6727 loss_recon=0.6727 loss_meanflow=0.0000 mean_model_t=0.2085 mean_corrupt_t=0.2085 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.5077 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.7942 corrupt_frac=1.0000 acc_corrupt=0.7942 loss_corrupt=0.6727 wrong_frac=0.7915 init_acc_corrupt=0.2065 acc_corrupt_t_0p0_0p2=0.6391 corrupt_frac_t_0p0_0p2=0.5640 acc_corrupt_t_0p2_0p4=0.9938 corrupt_frac_t_0p2_0p4=0.3466 acc_corrupt_t_0p4_0p6=0.9993 corrupt_frac_t_0p4_0p6=0.0791 acc_corrupt_t_0p6_0p8=0.9977 corrupt_frac_t_0p6_0p8=0.0136 out_w_norm=11.8640 out_g_norm=1.0068 acc_corrupt_t_0p8_1p0=0.9895 corrupt_frac_t_0p8_1p0=0.0078 loss_all=0.5473 init_gold_top10=0.5368 init_gold_top100=0.6265 rollout_applied_pos_frac=0.4453 init_acc_rollout_applied=0.3061 init_acc_rollout_kept=0.1192 logit_acc_rollout_applied=0.8278 logit_acc_rollout_kept=0.8275
|
| 609 |
+
step=2200 epoch=2200/3000 epoch_step=1/1 micro_steps=2200 elapsed=23.6s lr=2.000000e-03 loss=0.5787 loss_recon=0.5787 loss_meanflow=0.0000 mean_model_t=0.2096 mean_corrupt_t=0.2096 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.4995 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.8214 corrupt_frac=1.0000 acc_corrupt=0.8214 loss_corrupt=0.5787 wrong_frac=0.7905 init_acc_corrupt=0.2095 acc_corrupt_t_0p0_0p2=0.6810 corrupt_frac_t_0p0_0p2=0.5557 acc_corrupt_t_0p2_0p4=0.9966 corrupt_frac_t_0p2_0p4=0.3595 acc_corrupt_t_0p4_0p6=0.9994 corrupt_frac_t_0p4_0p6=0.0762 acc_corrupt_t_0p6_0p8=0.9984 corrupt_frac_t_0p6_0p8=0.0129 out_w_norm=11.8582 out_g_norm=0.9122 acc_corrupt_t_0p8_1p0=0.9941 corrupt_frac_t_0p8_1p0=0.0078 loss_all=0.5341 init_gold_top10=0.5514 init_gold_top100=0.6421 rollout_applied_pos_frac=0.4688 init_acc_rollout_applied=0.2945 init_acc_rollout_kept=0.1215 logit_acc_rollout_applied=0.8562 logit_acc_rollout_kept=0.8293
|
LTA_openwebtext_dualt/logs/softendpoint_mn_pilot_4gpu/train8_ctx1024_p75_s4_i32_outwdm1_ctx1024_sweep_selected_20260517_210705.log
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
LTA_openwebtext_dualt/logs/softendpoint_mn_pilot_4gpu/train8_ctx1024_t5tok_p35_rand0_3_unif0_0p25_outwdm1_t5tok_ctx1024_k03_20260518_022728.log
ADDED
|
@@ -0,0 +1,1034 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
NCCL version 2.25.1+cuda12.8
|
| 2 |
+
{
|
| 3 |
+
"device": "cuda:0",
|
| 4 |
+
"rank": 0,
|
| 5 |
+
"world_size": 4,
|
| 6 |
+
"samples": "owt_cached_chunks:8",
|
| 7 |
+
"vocab_size": 2423,
|
| 8 |
+
"tokenizer_vocab_size": 32100,
|
| 9 |
+
"save_dir": "runs/train8_ctx1024_t5tok_p35_rand0_3_unif0_0p25_outwdm1_t5tok_ctx1024_k03_20260518_022728",
|
| 10 |
+
"batch_size": 128,
|
| 11 |
+
"grad_accum": 1,
|
| 12 |
+
"effective_batch_size": 512,
|
| 13 |
+
"global_batch_size": 512,
|
| 14 |
+
"lr_schedule": "constant_warmup",
|
| 15 |
+
"optimizer": "muon",
|
| 16 |
+
"epochs": 0.0,
|
| 17 |
+
"steps_per_epoch": 1,
|
| 18 |
+
"total_steps": 1000,
|
| 19 |
+
"warmup_steps": 10,
|
| 20 |
+
"warmup_epochs": -1.0,
|
| 21 |
+
"min_lr": 0.0,
|
| 22 |
+
"weight_decay": 0.1,
|
| 23 |
+
"output_weight_decay": -1.0,
|
| 24 |
+
"adamw_param_groups": "nanogpt",
|
| 25 |
+
"adam_beta1": 0.9,
|
| 26 |
+
"adam_beta2": 0.95,
|
| 27 |
+
"adam_eps": 1e-08,
|
| 28 |
+
"muon_impl": "legacy",
|
| 29 |
+
"muon_momentum": 0.95,
|
| 30 |
+
"muon_ns_steps": 5,
|
| 31 |
+
"muon_update_scale": 1.0,
|
| 32 |
+
"muon_nesterov": false,
|
| 33 |
+
"muon_width_scale": false,
|
| 34 |
+
"muon_grouping": "legacy_dim_ge_2",
|
| 35 |
+
"muon_param_count": 2523776,
|
| 36 |
+
"muon_adam_param_count": 8192,
|
| 37 |
+
"muon_param_names": [
|
| 38 |
+
"vocab_embed.embedding",
|
| 39 |
+
"sigma_map.net.0.weight",
|
| 40 |
+
"sigma_map.net.2.weight",
|
| 41 |
+
"blocks.0.attn_qkv.weight",
|
| 42 |
+
"blocks.0.attn_out.weight",
|
| 43 |
+
"blocks.0.mlp.0.weight",
|
| 44 |
+
"blocks.0.mlp.2.weight",
|
| 45 |
+
"blocks.0.adaLN_modulation.weight",
|
| 46 |
+
"blocks.1.attn_qkv.weight",
|
| 47 |
+
"blocks.1.attn_out.weight",
|
| 48 |
+
"blocks.1.mlp.0.weight",
|
| 49 |
+
"blocks.1.mlp.2.weight",
|
| 50 |
+
"blocks.1.adaLN_modulation.weight",
|
| 51 |
+
"blocks.2.attn_qkv.weight",
|
| 52 |
+
"blocks.2.attn_out.weight",
|
| 53 |
+
"blocks.2.mlp.0.weight",
|
| 54 |
+
"blocks.2.mlp.2.weight",
|
| 55 |
+
"blocks.2.adaLN_modulation.weight",
|
| 56 |
+
"output_layer.linear.weight",
|
| 57 |
+
"output_layer.adaLN_modulation.weight"
|
| 58 |
+
],
|
| 59 |
+
"muon_adam_param_names": [
|
| 60 |
+
"sigma_map.net.0.bias",
|
| 61 |
+
"sigma_map.net.2.bias",
|
| 62 |
+
"blocks.0.norm1.weight",
|
| 63 |
+
"blocks.0.norm2.weight",
|
| 64 |
+
"blocks.0.mlp.0.bias",
|
| 65 |
+
"blocks.0.mlp.2.bias",
|
| 66 |
+
"blocks.0.adaLN_modulation.bias",
|
| 67 |
+
"blocks.1.norm1.weight",
|
| 68 |
+
"blocks.1.norm2.weight",
|
| 69 |
+
"blocks.1.mlp.0.bias",
|
| 70 |
+
"blocks.1.mlp.2.bias",
|
| 71 |
+
"blocks.1.adaLN_modulation.bias",
|
| 72 |
+
"blocks.2.norm1.weight",
|
| 73 |
+
"blocks.2.norm2.weight",
|
| 74 |
+
"blocks.2.mlp.0.bias",
|
| 75 |
+
"blocks.2.mlp.2.bias",
|
| 76 |
+
"blocks.2.adaLN_modulation.bias",
|
| 77 |
+
"output_layer.norm_final.weight",
|
| 78 |
+
"output_layer.adaLN_modulation.bias"
|
| 79 |
+
],
|
| 80 |
+
"muon_effective_nesterov": false,
|
| 81 |
+
"muon_effective_width_scale": false,
|
| 82 |
+
"muon_effective_weight_decay": 0.1,
|
| 83 |
+
"muon_adam_fallback_nesterov": false,
|
| 84 |
+
"muon_adam_fallback_weight_decay": 0.1,
|
| 85 |
+
"ema_decay": 0.9999,
|
| 86 |
+
"ema_start_step": 0,
|
| 87 |
+
"model_type": "ddit",
|
| 88 |
+
"ddit_mlp_type": "gelu",
|
| 89 |
+
"elf_num_time_tokens": 4,
|
| 90 |
+
"elf_num_model_mode_tokens": 0,
|
| 91 |
+
"qk_norm": true,
|
| 92 |
+
"output_bias": false,
|
| 93 |
+
"output_init_std": -1.0,
|
| 94 |
+
"norm_type": "rmsnorm",
|
| 95 |
+
"target_loss": "hard_ce",
|
| 96 |
+
"linear_soft_target_power": 1.0,
|
| 97 |
+
"linear_soft_target_min_conf": 0.0,
|
| 98 |
+
"linear_soft_target_max_conf": 1.0,
|
| 99 |
+
"t_sampling_mode": "uniform",
|
| 100 |
+
"t_sampling_power": 1.0,
|
| 101 |
+
"t_sampling_eps": 0.0001,
|
| 102 |
+
"t_sampling_logit_mean": -1.5,
|
| 103 |
+
"t_sampling_logit_std": 0.8,
|
| 104 |
+
"dual_t": true,
|
| 105 |
+
"corrupt_t_mode": "same",
|
| 106 |
+
"corrupt_min_t": 0.0,
|
| 107 |
+
"corrupt_max_t": 1.0,
|
| 108 |
+
"prefix_block_prob": 0.0,
|
| 109 |
+
"prefix_block_len": 128,
|
| 110 |
+
"mask_ratio_floor_schedule": "none",
|
| 111 |
+
"dirichlet_endpoint_mode": "categorical_dual_t",
|
| 112 |
+
"dirichlet_semantic_t_mode": "same",
|
| 113 |
+
"dirichlet_semantic_t_value": 0.0,
|
| 114 |
+
"dirichlet_semantic_t_curve": "linear",
|
| 115 |
+
"dirichlet_semantic_t_power": 1.0,
|
| 116 |
+
"endpoint_sequence_random_prob_alpha": 0.0,
|
| 117 |
+
"categorical_wrong_from_full_vocab": true,
|
| 118 |
+
"categorical_wrong_from_batch_valid_tokens": false,
|
| 119 |
+
"categorical_wrong_basin_token_ids": "",
|
| 120 |
+
"categorical_wrong_basin_prob": 0.0,
|
| 121 |
+
"categorical_wrong_unigram_prob": 0.0,
|
| 122 |
+
"categorical_wrong_uniform_prob": 0.0,
|
| 123 |
+
"categorical_wrong_prob_floor": 0.0,
|
| 124 |
+
"categorical_wrong_corpus_unigram_path": "",
|
| 125 |
+
"categorical_wrong_corpus_unigram_alpha": 1.0,
|
| 126 |
+
"categorical_wrong_basin_shared_prob": 0.0,
|
| 127 |
+
"categorical_wrong_unigram_shared_prob": 0.0,
|
| 128 |
+
"mask_mixture_original_prob": 0.0,
|
| 129 |
+
"mask_mixture_lowk_prob": 0.0,
|
| 130 |
+
"mask_mixture_lowcorrupt_prob": 0.0,
|
| 131 |
+
"mask_mixture_block_prob": 0.0,
|
| 132 |
+
"mask_mixture_all_prob": 1.0,
|
| 133 |
+
"mask_mixture_lowk_clean_tokens": "0",
|
| 134 |
+
"mask_mixture_lowcorrupt_tokens": "1,2,4,8,16,32,64",
|
| 135 |
+
"mask_mixture_block_tokens": "64,128",
|
| 136 |
+
"simplex_bridge_sampler": "dirichlet",
|
| 137 |
+
"logistic_normal_sigma_min": 0.1,
|
| 138 |
+
"logistic_normal_sigma_max": 1.0,
|
| 139 |
+
"logistic_normal_tau_min": 1.0,
|
| 140 |
+
"logistic_normal_tau_max": 1.0,
|
| 141 |
+
"torch_compile": false,
|
| 142 |
+
"compile_mode": "max-autotune",
|
| 143 |
+
"state_format": "prob",
|
| 144 |
+
"meanflow_weight": 0.0,
|
| 145 |
+
"rollout_train_prob": 0.35,
|
| 146 |
+
"rollout_train_steps": 3,
|
| 147 |
+
"rollout_train_steps_min": 0,
|
| 148 |
+
"rollout_train_infer_steps": 1,
|
| 149 |
+
"rollout_train_time_mode": "sampled_path",
|
| 150 |
+
"rollout_train_s_dist": "uniform",
|
| 151 |
+
"rollout_train_s_min_frac": 0.0,
|
| 152 |
+
"rollout_train_s_max_frac": 0.25,
|
| 153 |
+
"rollout_train_s_beta_alpha": 2.0,
|
| 154 |
+
"rollout_train_s_beta_beta": 6.0,
|
| 155 |
+
"rollout_train_temp": 1.0,
|
| 156 |
+
"rollout_train_max_gamma": 1.0,
|
| 157 |
+
"rollout_train_corrupt_only": true,
|
| 158 |
+
"rollout_train_samplewise": true,
|
| 159 |
+
"rollout_train_compute_always": false,
|
| 160 |
+
"rollout_train_sync_t": true,
|
| 161 |
+
"bridge_noise_init": "logistic_normal",
|
| 162 |
+
"noise_sigma": -1.0,
|
| 163 |
+
"allow_tf32": true,
|
| 164 |
+
"activation_checkpointing": false,
|
| 165 |
+
"activation_checkpoint_interval": 1,
|
| 166 |
+
"activation_checkpoint_scope": "block",
|
| 167 |
+
"ddp_static_graph": false,
|
| 168 |
+
"ddp_gradient_as_bucket_view": true,
|
| 169 |
+
"blocking_data_transfer": false,
|
| 170 |
+
"dataloader_prefetch_factor": 4,
|
| 171 |
+
"full_train_stats": false,
|
| 172 |
+
"tokenized_hf": false,
|
| 173 |
+
"tokenized_pad_token": "pad",
|
| 174 |
+
"elf_conditional_hf": false,
|
| 175 |
+
"record_pad_truncate": false,
|
| 176 |
+
"record_add_eos": false,
|
| 177 |
+
"record_add_special_tokens": false,
|
| 178 |
+
"record_pad_token": "pad",
|
| 179 |
+
"record_shuffle_buffer": 10000,
|
| 180 |
+
"wrap": true,
|
| 181 |
+
"wrap_mode": "stream",
|
| 182 |
+
"wrap_record_buffer_size": 200,
|
| 183 |
+
"owt_cached_chunks": true,
|
| 184 |
+
"owt_chunk_cache_dir": "/e2e-data/evad-tech-vla/wanghan58/data/small_benchmarks/langflow_2604_11748/openwebtext_lta_cached_chunks/t5_len1024_train8_compact_overfit",
|
| 185 |
+
"owt_chunk_cache_rebuild": false,
|
| 186 |
+
"owt_chunk_cache_write_batch": 4096,
|
| 187 |
+
"owt_exact_repeat_per_chunk": 64,
|
| 188 |
+
"online_chunk_shuffle": false,
|
| 189 |
+
"online_chunk_shuffle_buffer": 10000,
|
| 190 |
+
"openwebtext_split": "train_minus_100k",
|
| 191 |
+
"detokenizer": "auto",
|
| 192 |
+
"resolved_detokenizer": null,
|
| 193 |
+
"num_workers": 0,
|
| 194 |
+
"latest_every": 1000,
|
| 195 |
+
"resume_path": ""
|
| 196 |
+
}
|
| 197 |
+
step=100 epoch=100/1000 epoch_step=1/1 micro_steps=100 elapsed=19.8s lr=2.000000e-03 loss=7.3417 loss_recon=7.3417 loss_meanflow=0.0000 mean_model_t=0.4971 mean_corrupt_t=0.4971 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.3513 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.3284 corrupt_frac=1.0000 acc_corrupt=0.3284 loss_corrupt=7.3417 wrong_frac=0.5028 init_acc_corrupt=0.4627 acc_corrupt_t_0p0_0p2=0.0466 corrupt_frac_t_0p0_0p2=0.2071 acc_corrupt_t_0p2_0p4=0.1614 corrupt_frac_t_0p2_0p4=0.1988 acc_corrupt_t_0p4_0p6=0.3279 corrupt_frac_t_0p4_0p6=0.1976 acc_corrupt_t_0p6_0p8=0.4813 corrupt_frac_t_0p6_0p8=0.1951 acc_corrupt_t_0p8_1p0=0.6352 corrupt_frac_t_0p8_1p0=0.2015 out_w_norm=1.0922 out_g_norm=1.0102 loss_all=6.7794 init_gold_top10=0.4656 init_gold_top100=0.6040 rollout_applied_pos_frac=0.3516 init_acc_rollout_applied=0.3928 init_acc_rollout_kept=0.4393 logit_acc_rollout_applied=0.2685 logit_acc_rollout_kept=0.2938
|
| 198 |
+
step=200 epoch=200/1000 epoch_step=1/1 micro_steps=200 elapsed=19.0s lr=2.000000e-03 loss=5.8034 loss_recon=5.8034 loss_meanflow=0.0000 mean_model_t=0.5015 mean_corrupt_t=0.5015 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.3505 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.3299 corrupt_frac=1.0000 acc_corrupt=0.3299 loss_corrupt=5.8034 wrong_frac=0.4984 init_acc_corrupt=0.4676 acc_corrupt_t_0p0_0p2=0.0524 corrupt_frac_t_0p0_0p2=0.2005 acc_corrupt_t_0p2_0p4=0.1630 corrupt_frac_t_0p2_0p4=0.1979 acc_corrupt_t_0p4_0p6=0.3295 corrupt_frac_t_0p4_0p6=0.1994 acc_corrupt_t_0p6_0p8=0.4754 corrupt_frac_t_0p6_0p8=0.2007 acc_corrupt_t_0p8_1p0=0.6253 corrupt_frac_t_0p8_1p0=0.2015 out_w_norm=3.4952 out_g_norm=1.3311 loss_all=5.0183 init_gold_top10=0.5079 init_gold_top100=0.6301 rollout_applied_pos_frac=0.3594 init_acc_rollout_applied=0.4755 init_acc_rollout_kept=0.4824 logit_acc_rollout_applied=0.3469 logit_acc_rollout_kept=0.3570
|
| 199 |
+
step=300 epoch=300/1000 epoch_step=1/1 micro_steps=300 elapsed=19.0s lr=2.000000e-03 loss=4.7182 loss_recon=4.7182 loss_meanflow=0.0000 mean_model_t=0.5015 mean_corrupt_t=0.5015 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.3535 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.3676 corrupt_frac=1.0000 acc_corrupt=0.3676 loss_corrupt=4.7182 wrong_frac=0.4985 init_acc_corrupt=0.4685 acc_corrupt_t_0p0_0p2=0.0557 corrupt_frac_t_0p0_0p2=0.1947 acc_corrupt_t_0p2_0p4=0.1898 corrupt_frac_t_0p2_0p4=0.2030 acc_corrupt_t_0p4_0p6=0.3639 corrupt_frac_t_0p4_0p6=0.1991 acc_corrupt_t_0p6_0p8=0.5242 corrupt_frac_t_0p6_0p8=0.2045 acc_corrupt_t_0p8_1p0=0.6974 corrupt_frac_t_0p8_1p0=0.1988 out_w_norm=5.5797 out_g_norm=0.5500 loss_all=4.3883 init_gold_top10=0.5102 init_gold_top100=0.6393 rollout_applied_pos_frac=0.4062 init_acc_rollout_applied=0.5089 init_acc_rollout_kept=0.4471 logit_acc_rollout_applied=0.4133 logit_acc_rollout_kept=0.3762
|
| 200 |
+
step=400 epoch=400/1000 epoch_step=1/1 micro_steps=400 elapsed=19.0s lr=2.000000e-03 loss=4.1225 loss_recon=4.1225 loss_meanflow=0.0000 mean_model_t=0.4986 mean_corrupt_t=0.4986 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.3515 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.4225 corrupt_frac=1.0000 acc_corrupt=0.4225 loss_corrupt=4.1225 wrong_frac=0.5016 init_acc_corrupt=0.4649 acc_corrupt_t_0p0_0p2=0.0583 corrupt_frac_t_0p0_0p2=0.2005 acc_corrupt_t_0p2_0p4=0.2095 corrupt_frac_t_0p2_0p4=0.2001 acc_corrupt_t_0p4_0p6=0.4200 corrupt_frac_t_0p4_0p6=0.2023 acc_corrupt_t_0p6_0p8=0.6151 corrupt_frac_t_0p6_0p8=0.1992 acc_corrupt_t_0p8_1p0=0.8157 corrupt_frac_t_0p8_1p0=0.1979 out_w_norm=7.1167 out_g_norm=0.2749 loss_all=3.9532 init_gold_top10=0.4850 init_gold_top100=0.6328 rollout_applied_pos_frac=0.3281 init_acc_rollout_applied=0.4657 init_acc_rollout_kept=0.4320 logit_acc_rollout_applied=0.4583 logit_acc_rollout_kept=0.4408
|
| 201 |
+
step=500 epoch=500/1000 epoch_step=1/1 micro_steps=500 elapsed=19.0s lr=2.000000e-03 loss=3.5466 loss_recon=3.5466 loss_meanflow=0.0000 mean_model_t=0.4977 mean_corrupt_t=0.4977 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.3470 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.4823 corrupt_frac=1.0000 acc_corrupt=0.4823 loss_corrupt=3.5466 wrong_frac=0.5023 init_acc_corrupt=0.4642 acc_corrupt_t_0p0_0p2=0.0594 corrupt_frac_t_0p0_0p2=0.2059 acc_corrupt_t_0p2_0p4=0.2412 corrupt_frac_t_0p2_0p4=0.1985 acc_corrupt_t_0p4_0p6=0.5048 corrupt_frac_t_0p4_0p6=0.1969 acc_corrupt_t_0p6_0p8=0.7094 corrupt_frac_t_0p6_0p8=0.1927 acc_corrupt_t_0p8_1p0=0.9033 corrupt_frac_t_0p8_1p0=0.2059 out_w_norm=8.4596 out_g_norm=0.2351 loss_all=3.3523 init_gold_top10=0.4858 init_gold_top100=0.6220 rollout_applied_pos_frac=0.3438 init_acc_rollout_applied=0.4617 init_acc_rollout_kept=0.4433 logit_acc_rollout_applied=0.4844 logit_acc_rollout_kept=0.4697
|
| 202 |
+
step=600 epoch=600/1000 epoch_step=1/1 micro_steps=600 elapsed=19.1s lr=2.000000e-03 loss=3.0821 loss_recon=3.0821 loss_meanflow=0.0000 mean_model_t=0.5012 mean_corrupt_t=0.5012 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.3562 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.4963 corrupt_frac=1.0000 acc_corrupt=0.4963 loss_corrupt=3.0821 wrong_frac=0.4987 init_acc_corrupt=0.4682 acc_corrupt_t_0p0_0p2=0.0622 corrupt_frac_t_0p0_0p2=0.1972 acc_corrupt_t_0p2_0p4=0.2700 corrupt_frac_t_0p2_0p4=0.2024 acc_corrupt_t_0p4_0p6=0.5248 corrupt_frac_t_0p4_0p6=0.1990 acc_corrupt_t_0p6_0p8=0.7152 corrupt_frac_t_0p6_0p8=0.2017 acc_corrupt_t_0p8_1p0=0.9049 corrupt_frac_t_0p8_1p0=0.1997 out_w_norm=9.7303 out_g_norm=0.2534 loss_all=2.4394 init_gold_top10=0.5858 init_gold_top100=0.7010 rollout_applied_pos_frac=0.3438 init_acc_rollout_applied=0.5050 init_acc_rollout_kept=0.5727 logit_acc_rollout_applied=0.5326 logit_acc_rollout_kept=0.6010
|
| 203 |
+
step=700 epoch=700/1000 epoch_step=1/1 micro_steps=700 elapsed=18.9s lr=2.000000e-03 loss=2.7681 loss_recon=2.7681 loss_meanflow=0.0000 mean_model_t=0.5008 mean_corrupt_t=0.5008 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.3412 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.5034 corrupt_frac=1.0000 acc_corrupt=0.5034 loss_corrupt=2.7681 wrong_frac=0.4994 init_acc_corrupt=0.4679 acc_corrupt_t_0p0_0p2=0.0635 corrupt_frac_t_0p0_0p2=0.2030 acc_corrupt_t_0p2_0p4=0.2807 corrupt_frac_t_0p2_0p4=0.1915 acc_corrupt_t_0p4_0p6=0.5331 corrupt_frac_t_0p4_0p6=0.2023 acc_corrupt_t_0p6_0p8=0.7234 corrupt_frac_t_0p6_0p8=0.2001 acc_corrupt_t_0p8_1p0=0.9066 corrupt_frac_t_0p8_1p0=0.2032 out_w_norm=10.7428 out_g_norm=0.2922 loss_all=2.5966 init_gold_top10=0.5189 init_gold_top100=0.6915 rollout_applied_pos_frac=0.3828 init_acc_rollout_applied=0.4203 init_acc_rollout_kept=0.5121 logit_acc_rollout_applied=0.4614 logit_acc_rollout_kept=0.5496
|
| 204 |
+
step=800 epoch=800/1000 epoch_step=1/1 micro_steps=800 elapsed=19.0s lr=2.000000e-03 loss=2.3453 loss_recon=2.3453 loss_meanflow=0.0000 mean_model_t=0.4962 mean_corrupt_t=0.4962 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.3430 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.5308 corrupt_frac=1.0000 acc_corrupt=0.5308 loss_corrupt=2.3453 wrong_frac=0.5037 init_acc_corrupt=0.4633 acc_corrupt_t_0p0_0p2=0.0631 corrupt_frac_t_0p0_0p2=0.2026 acc_corrupt_t_0p2_0p4=0.3065 corrupt_frac_t_0p2_0p4=0.2020 acc_corrupt_t_0p4_0p6=0.5891 corrupt_frac_t_0p4_0p6=0.2005 acc_corrupt_t_0p6_0p8=0.7800 corrupt_frac_t_0p6_0p8=0.1965 acc_corrupt_t_0p8_1p0=0.9311 corrupt_frac_t_0p8_1p0=0.1984 out_w_norm=11.2908 out_g_norm=0.3703 loss_all=2.2007 init_gold_top10=0.5154 init_gold_top100=0.6697 rollout_applied_pos_frac=0.3047 init_acc_rollout_applied=0.4141 init_acc_rollout_kept=0.4622 logit_acc_rollout_applied=0.5173 logit_acc_rollout_kept=0.5575
|
| 205 |
+
step=900 epoch=900/1000 epoch_step=1/1 micro_steps=900 elapsed=19.1s lr=2.000000e-03 loss=1.8132 loss_recon=1.8132 loss_meanflow=0.0000 mean_model_t=0.5040 mean_corrupt_t=0.5040 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.3546 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.6146 corrupt_frac=1.0000 acc_corrupt=0.6146 loss_corrupt=1.8132 wrong_frac=0.4960 init_acc_corrupt=0.4740 acc_corrupt_t_0p0_0p2=0.0648 corrupt_frac_t_0p0_0p2=0.1931 acc_corrupt_t_0p2_0p4=0.3780 corrupt_frac_t_0p2_0p4=0.2013 acc_corrupt_t_0p4_0p6=0.7364 corrupt_frac_t_0p4_0p6=0.2019 acc_corrupt_t_0p6_0p8=0.8940 corrupt_frac_t_0p6_0p8=0.1981 acc_corrupt_t_0p8_1p0=0.9738 corrupt_frac_t_0p8_1p0=0.2056 out_w_norm=11.7769 out_g_norm=0.4581 loss_all=1.5519 init_gold_top10=0.5837 init_gold_top100=0.7187 rollout_applied_pos_frac=0.3125 init_acc_rollout_applied=0.4619 init_acc_rollout_kept=0.5155 logit_acc_rollout_applied=0.6272 logit_acc_rollout_kept=0.6871
|
| 206 |
+
step=1000 epoch=1000/1000 epoch_step=1/1 micro_steps=1000 elapsed=19.0s lr=2.000000e-03 loss=1.5254 loss_recon=1.5254 loss_meanflow=0.0000 mean_model_t=0.5008 mean_corrupt_t=0.5008 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.3534 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.6756 corrupt_frac=1.0000 acc_corrupt=0.6756 loss_corrupt=1.5254 wrong_frac=0.4993 init_acc_corrupt=0.4741 acc_corrupt_t_0p0_0p2=0.0694 corrupt_frac_t_0p0_0p2=0.2034 acc_corrupt_t_0p2_0p4=0.4889 corrupt_frac_t_0p2_0p4=0.1967 acc_corrupt_t_0p4_0p6=0.8656 corrupt_frac_t_0p4_0p6=0.2010 acc_corrupt_t_0p6_0p8=0.9668 corrupt_frac_t_0p6_0p8=0.1933 acc_corrupt_t_0p8_1p0=0.9947 corrupt_frac_t_0p8_1p0=0.2055 out_w_norm=12.1609 out_g_norm=0.5459 loss_all=1.5778 init_gold_top10=0.5685 init_gold_top100=0.7045 rollout_applied_pos_frac=0.4141 init_acc_rollout_applied=0.5418 init_acc_rollout_kept=0.4381 logit_acc_rollout_applied=0.7332 logit_acc_rollout_kept=0.6367
|
| 207 |
+
NCCL version 2.25.1+cuda12.8
|
| 208 |
+
resumed_from=runs/train8_ctx1024_t5tok_p35_rand0_3_unif0_0p25_outwdm1_t5tok_ctx1024_k03_20260518_022728/latest.pt start_step=1001
|
| 209 |
+
{
|
| 210 |
+
"device": "cuda:0",
|
| 211 |
+
"rank": 0,
|
| 212 |
+
"world_size": 4,
|
| 213 |
+
"samples": "owt_cached_chunks:8",
|
| 214 |
+
"vocab_size": 2423,
|
| 215 |
+
"tokenizer_vocab_size": 32100,
|
| 216 |
+
"save_dir": "runs/train8_ctx1024_t5tok_p35_rand0_3_unif0_0p25_outwdm1_t5tok_ctx1024_k03_20260518_022728",
|
| 217 |
+
"batch_size": 128,
|
| 218 |
+
"grad_accum": 1,
|
| 219 |
+
"effective_batch_size": 512,
|
| 220 |
+
"global_batch_size": 512,
|
| 221 |
+
"lr_schedule": "constant_warmup",
|
| 222 |
+
"optimizer": "muon",
|
| 223 |
+
"epochs": 0.0,
|
| 224 |
+
"steps_per_epoch": 1,
|
| 225 |
+
"total_steps": 2000,
|
| 226 |
+
"warmup_steps": 10,
|
| 227 |
+
"warmup_epochs": -1.0,
|
| 228 |
+
"min_lr": 0.0,
|
| 229 |
+
"weight_decay": 0.1,
|
| 230 |
+
"output_weight_decay": -1.0,
|
| 231 |
+
"adamw_param_groups": "nanogpt",
|
| 232 |
+
"adam_beta1": 0.9,
|
| 233 |
+
"adam_beta2": 0.95,
|
| 234 |
+
"adam_eps": 1e-08,
|
| 235 |
+
"muon_impl": "legacy",
|
| 236 |
+
"muon_momentum": 0.95,
|
| 237 |
+
"muon_ns_steps": 5,
|
| 238 |
+
"muon_update_scale": 1.0,
|
| 239 |
+
"muon_nesterov": false,
|
| 240 |
+
"muon_width_scale": false,
|
| 241 |
+
"muon_grouping": "legacy_dim_ge_2",
|
| 242 |
+
"muon_param_count": 2523776,
|
| 243 |
+
"muon_adam_param_count": 8192,
|
| 244 |
+
"muon_param_names": [
|
| 245 |
+
"vocab_embed.embedding",
|
| 246 |
+
"sigma_map.net.0.weight",
|
| 247 |
+
"sigma_map.net.2.weight",
|
| 248 |
+
"blocks.0.attn_qkv.weight",
|
| 249 |
+
"blocks.0.attn_out.weight",
|
| 250 |
+
"blocks.0.mlp.0.weight",
|
| 251 |
+
"blocks.0.mlp.2.weight",
|
| 252 |
+
"blocks.0.adaLN_modulation.weight",
|
| 253 |
+
"blocks.1.attn_qkv.weight",
|
| 254 |
+
"blocks.1.attn_out.weight",
|
| 255 |
+
"blocks.1.mlp.0.weight",
|
| 256 |
+
"blocks.1.mlp.2.weight",
|
| 257 |
+
"blocks.1.adaLN_modulation.weight",
|
| 258 |
+
"blocks.2.attn_qkv.weight",
|
| 259 |
+
"blocks.2.attn_out.weight",
|
| 260 |
+
"blocks.2.mlp.0.weight",
|
| 261 |
+
"blocks.2.mlp.2.weight",
|
| 262 |
+
"blocks.2.adaLN_modulation.weight",
|
| 263 |
+
"output_layer.linear.weight",
|
| 264 |
+
"output_layer.adaLN_modulation.weight"
|
| 265 |
+
],
|
| 266 |
+
"muon_adam_param_names": [
|
| 267 |
+
"sigma_map.net.0.bias",
|
| 268 |
+
"sigma_map.net.2.bias",
|
| 269 |
+
"blocks.0.norm1.weight",
|
| 270 |
+
"blocks.0.norm2.weight",
|
| 271 |
+
"blocks.0.mlp.0.bias",
|
| 272 |
+
"blocks.0.mlp.2.bias",
|
| 273 |
+
"blocks.0.adaLN_modulation.bias",
|
| 274 |
+
"blocks.1.norm1.weight",
|
| 275 |
+
"blocks.1.norm2.weight",
|
| 276 |
+
"blocks.1.mlp.0.bias",
|
| 277 |
+
"blocks.1.mlp.2.bias",
|
| 278 |
+
"blocks.1.adaLN_modulation.bias",
|
| 279 |
+
"blocks.2.norm1.weight",
|
| 280 |
+
"blocks.2.norm2.weight",
|
| 281 |
+
"blocks.2.mlp.0.bias",
|
| 282 |
+
"blocks.2.mlp.2.bias",
|
| 283 |
+
"blocks.2.adaLN_modulation.bias",
|
| 284 |
+
"output_layer.norm_final.weight",
|
| 285 |
+
"output_layer.adaLN_modulation.bias"
|
| 286 |
+
],
|
| 287 |
+
"muon_effective_nesterov": false,
|
| 288 |
+
"muon_effective_width_scale": false,
|
| 289 |
+
"muon_effective_weight_decay": 0.1,
|
| 290 |
+
"muon_adam_fallback_nesterov": false,
|
| 291 |
+
"muon_adam_fallback_weight_decay": 0.1,
|
| 292 |
+
"ema_decay": 0.9999,
|
| 293 |
+
"ema_start_step": 0,
|
| 294 |
+
"model_type": "ddit",
|
| 295 |
+
"ddit_mlp_type": "gelu",
|
| 296 |
+
"elf_num_time_tokens": 4,
|
| 297 |
+
"elf_num_model_mode_tokens": 0,
|
| 298 |
+
"qk_norm": true,
|
| 299 |
+
"output_bias": false,
|
| 300 |
+
"output_init_std": -1.0,
|
| 301 |
+
"norm_type": "rmsnorm",
|
| 302 |
+
"target_loss": "hard_ce",
|
| 303 |
+
"linear_soft_target_power": 1.0,
|
| 304 |
+
"linear_soft_target_min_conf": 0.0,
|
| 305 |
+
"linear_soft_target_max_conf": 1.0,
|
| 306 |
+
"t_sampling_mode": "uniform",
|
| 307 |
+
"t_sampling_power": 1.0,
|
| 308 |
+
"t_sampling_eps": 0.0001,
|
| 309 |
+
"t_sampling_logit_mean": -1.5,
|
| 310 |
+
"t_sampling_logit_std": 0.8,
|
| 311 |
+
"dual_t": true,
|
| 312 |
+
"corrupt_t_mode": "same",
|
| 313 |
+
"corrupt_min_t": 0.0,
|
| 314 |
+
"corrupt_max_t": 1.0,
|
| 315 |
+
"prefix_block_prob": 0.0,
|
| 316 |
+
"prefix_block_len": 128,
|
| 317 |
+
"mask_ratio_floor_schedule": "none",
|
| 318 |
+
"dirichlet_endpoint_mode": "categorical_dual_t",
|
| 319 |
+
"dirichlet_semantic_t_mode": "same",
|
| 320 |
+
"dirichlet_semantic_t_value": 0.0,
|
| 321 |
+
"dirichlet_semantic_t_curve": "linear",
|
| 322 |
+
"dirichlet_semantic_t_power": 1.0,
|
| 323 |
+
"endpoint_sequence_random_prob_alpha": 0.0,
|
| 324 |
+
"categorical_wrong_from_full_vocab": true,
|
| 325 |
+
"categorical_wrong_from_batch_valid_tokens": false,
|
| 326 |
+
"categorical_wrong_basin_token_ids": "",
|
| 327 |
+
"categorical_wrong_basin_prob": 0.0,
|
| 328 |
+
"categorical_wrong_unigram_prob": 0.0,
|
| 329 |
+
"categorical_wrong_uniform_prob": 0.0,
|
| 330 |
+
"categorical_wrong_prob_floor": 0.0,
|
| 331 |
+
"categorical_wrong_corpus_unigram_path": "",
|
| 332 |
+
"categorical_wrong_corpus_unigram_alpha": 1.0,
|
| 333 |
+
"categorical_wrong_basin_shared_prob": 0.0,
|
| 334 |
+
"categorical_wrong_unigram_shared_prob": 0.0,
|
| 335 |
+
"mask_mixture_original_prob": 0.0,
|
| 336 |
+
"mask_mixture_lowk_prob": 0.0,
|
| 337 |
+
"mask_mixture_lowcorrupt_prob": 0.0,
|
| 338 |
+
"mask_mixture_block_prob": 0.0,
|
| 339 |
+
"mask_mixture_all_prob": 1.0,
|
| 340 |
+
"mask_mixture_lowk_clean_tokens": "0",
|
| 341 |
+
"mask_mixture_lowcorrupt_tokens": "1,2,4,8,16,32,64",
|
| 342 |
+
"mask_mixture_block_tokens": "64,128",
|
| 343 |
+
"simplex_bridge_sampler": "dirichlet",
|
| 344 |
+
"logistic_normal_sigma_min": 0.1,
|
| 345 |
+
"logistic_normal_sigma_max": 1.0,
|
| 346 |
+
"logistic_normal_tau_min": 1.0,
|
| 347 |
+
"logistic_normal_tau_max": 1.0,
|
| 348 |
+
"torch_compile": false,
|
| 349 |
+
"compile_mode": "max-autotune",
|
| 350 |
+
"state_format": "prob",
|
| 351 |
+
"meanflow_weight": 0.0,
|
| 352 |
+
"rollout_train_prob": 0.35,
|
| 353 |
+
"rollout_train_steps": 3,
|
| 354 |
+
"rollout_train_steps_min": 0,
|
| 355 |
+
"rollout_train_infer_steps": 1,
|
| 356 |
+
"rollout_train_time_mode": "sampled_path",
|
| 357 |
+
"rollout_train_s_dist": "uniform",
|
| 358 |
+
"rollout_train_s_min_frac": 0.0,
|
| 359 |
+
"rollout_train_s_max_frac": 0.25,
|
| 360 |
+
"rollout_train_s_beta_alpha": 2.0,
|
| 361 |
+
"rollout_train_s_beta_beta": 6.0,
|
| 362 |
+
"rollout_train_temp": 1.0,
|
| 363 |
+
"rollout_train_max_gamma": 1.0,
|
| 364 |
+
"rollout_train_corrupt_only": true,
|
| 365 |
+
"rollout_train_samplewise": true,
|
| 366 |
+
"rollout_train_compute_always": false,
|
| 367 |
+
"rollout_train_sync_t": true,
|
| 368 |
+
"bridge_noise_init": "logistic_normal",
|
| 369 |
+
"noise_sigma": -1.0,
|
| 370 |
+
"allow_tf32": true,
|
| 371 |
+
"activation_checkpointing": false,
|
| 372 |
+
"activation_checkpoint_interval": 1,
|
| 373 |
+
"activation_checkpoint_scope": "block",
|
| 374 |
+
"ddp_static_graph": false,
|
| 375 |
+
"ddp_gradient_as_bucket_view": true,
|
| 376 |
+
"blocking_data_transfer": false,
|
| 377 |
+
"dataloader_prefetch_factor": 4,
|
| 378 |
+
"full_train_stats": false,
|
| 379 |
+
"tokenized_hf": false,
|
| 380 |
+
"tokenized_pad_token": "pad",
|
| 381 |
+
"elf_conditional_hf": false,
|
| 382 |
+
"record_pad_truncate": false,
|
| 383 |
+
"record_add_eos": false,
|
| 384 |
+
"record_add_special_tokens": false,
|
| 385 |
+
"record_pad_token": "pad",
|
| 386 |
+
"record_shuffle_buffer": 10000,
|
| 387 |
+
"wrap": true,
|
| 388 |
+
"wrap_mode": "stream",
|
| 389 |
+
"wrap_record_buffer_size": 200,
|
| 390 |
+
"owt_cached_chunks": true,
|
| 391 |
+
"owt_chunk_cache_dir": "/e2e-data/evad-tech-vla/wanghan58/data/small_benchmarks/langflow_2604_11748/openwebtext_lta_cached_chunks/t5_len1024_train8_compact_overfit",
|
| 392 |
+
"owt_chunk_cache_rebuild": false,
|
| 393 |
+
"owt_chunk_cache_write_batch": 4096,
|
| 394 |
+
"owt_exact_repeat_per_chunk": 64,
|
| 395 |
+
"online_chunk_shuffle": false,
|
| 396 |
+
"online_chunk_shuffle_buffer": 10000,
|
| 397 |
+
"openwebtext_split": "train_minus_100k",
|
| 398 |
+
"detokenizer": "auto",
|
| 399 |
+
"resolved_detokenizer": null,
|
| 400 |
+
"num_workers": 0,
|
| 401 |
+
"latest_every": 1000,
|
| 402 |
+
"resume_path": "runs/train8_ctx1024_t5tok_p35_rand0_3_unif0_0p25_outwdm1_t5tok_ctx1024_k03_20260518_022728/latest.pt"
|
| 403 |
+
}
|
| 404 |
+
step=1100 epoch=1100/2000 epoch_step=1/1 micro_steps=1100 elapsed=20.2s lr=2.000000e-03 loss=1.3296 loss_recon=1.3296 loss_meanflow=0.0000 mean_model_t=0.4971 mean_corrupt_t=0.4971 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.3513 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.7155 corrupt_frac=1.0000 acc_corrupt=0.7155 loss_corrupt=1.3296 wrong_frac=0.5028 init_acc_corrupt=0.4747 acc_corrupt_t_0p0_0p2=0.0794 corrupt_frac_t_0p0_0p2=0.2071 acc_corrupt_t_0p2_0p4=0.6046 corrupt_frac_t_0p2_0p4=0.1988 acc_corrupt_t_0p4_0p6=0.9346 corrupt_frac_t_0p4_0p6=0.1976 acc_corrupt_t_0p6_0p8=0.9896 corrupt_frac_t_0p6_0p8=0.1951 acc_corrupt_t_0p8_1p0=0.9987 corrupt_frac_t_0p8_1p0=0.2015 out_w_norm=12.4307 out_g_norm=0.5976 loss_all=1.1705 init_gold_top10=0.5527 init_gold_top100=0.6892 rollout_applied_pos_frac=0.3516 init_acc_rollout_applied=0.4722 init_acc_rollout_kept=0.4393 logit_acc_rollout_applied=0.7318 logit_acc_rollout_kept=0.7314
|
| 405 |
+
step=1200 epoch=1200/2000 epoch_step=1/1 micro_steps=1200 elapsed=19.0s lr=2.000000e-03 loss=1.1354 loss_recon=1.1354 loss_meanflow=0.0000 mean_model_t=0.5015 mean_corrupt_t=0.5015 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.3505 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.7552 corrupt_frac=1.0000 acc_corrupt=0.7552 loss_corrupt=1.1354 wrong_frac=0.4984 init_acc_corrupt=0.4838 acc_corrupt_t_0p0_0p2=0.0990 corrupt_frac_t_0p0_0p2=0.2005 acc_corrupt_t_0p2_0p4=0.7128 corrupt_frac_t_0p2_0p4=0.1979 acc_corrupt_t_0p4_0p6=0.9679 corrupt_frac_t_0p4_0p6=0.1994 acc_corrupt_t_0p6_0p8=0.9960 corrupt_frac_t_0p6_0p8=0.2007 acc_corrupt_t_0p8_1p0=0.9995 corrupt_frac_t_0p8_1p0=0.2015 out_w_norm=12.6239 out_g_norm=0.6265 loss_all=1.0650 init_gold_top10=0.5828 init_gold_top100=0.7012 rollout_applied_pos_frac=0.3594 init_acc_rollout_applied=0.5340 init_acc_rollout_kept=0.4824 logit_acc_rollout_applied=0.7880 logit_acc_rollout_kept=0.7683
|
| 406 |
+
step=1300 epoch=1300/2000 epoch_step=1/1 micro_steps=1300 elapsed=19.0s lr=2.000000e-03 loss=0.9741 loss_recon=0.9741 loss_meanflow=0.0000 mean_model_t=0.5015 mean_corrupt_t=0.5015 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.3535 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.7882 corrupt_frac=1.0000 acc_corrupt=0.7882 loss_corrupt=0.9741 wrong_frac=0.4985 init_acc_corrupt=0.4873 acc_corrupt_t_0p0_0p2=0.1305 corrupt_frac_t_0p0_0p2=0.1947 acc_corrupt_t_0p2_0p4=0.8078 corrupt_frac_t_0p2_0p4=0.2030 acc_corrupt_t_0p4_0p6=0.9844 corrupt_frac_t_0p4_0p6=0.1991 acc_corrupt_t_0p6_0p8=0.9982 corrupt_frac_t_0p6_0p8=0.2045 acc_corrupt_t_0p8_1p0=0.9998 corrupt_frac_t_0p8_1p0=0.1988 out_w_norm=12.7840 out_g_norm=0.6228 loss_all=0.8968 init_gold_top10=0.5980 init_gold_top100=0.7052 rollout_applied_pos_frac=0.4062 init_acc_rollout_applied=0.5557 init_acc_rollout_kept=0.4471 logit_acc_rollout_applied=0.8387 logit_acc_rollout_kept=0.7854
|
| 407 |
+
step=1400 epoch=1400/2000 epoch_step=1/1 micro_steps=1400 elapsed=19.0s lr=2.000000e-03 loss=0.8769 loss_recon=0.8769 loss_meanflow=0.0000 mean_model_t=0.4986 mean_corrupt_t=0.4986 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.3515 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.8068 corrupt_frac=1.0000 acc_corrupt=0.8068 loss_corrupt=0.8769 wrong_frac=0.5016 init_acc_corrupt=0.4879 acc_corrupt_t_0p0_0p2=0.1718 corrupt_frac_t_0p0_0p2=0.2005 acc_corrupt_t_0p2_0p4=0.8725 corrupt_frac_t_0p2_0p4=0.2001 acc_corrupt_t_0p4_0p6=0.9927 corrupt_frac_t_0p4_0p6=0.2023 acc_corrupt_t_0p6_0p8=0.9991 corrupt_frac_t_0p6_0p8=0.1992 acc_corrupt_t_0p8_1p0=0.9998 corrupt_frac_t_0p8_1p0=0.1979 out_w_norm=12.9245 out_g_norm=0.6431 loss_all=0.7940 init_gold_top10=0.5796 init_gold_top100=0.7022 rollout_applied_pos_frac=0.3281 init_acc_rollout_applied=0.5728 init_acc_rollout_kept=0.4320 logit_acc_rollout_applied=0.8450 logit_acc_rollout_kept=0.8035
|
| 408 |
+
step=1500 epoch=1500/2000 epoch_step=1/1 micro_steps=1500 elapsed=19.0s lr=2.000000e-03 loss=0.8124 loss_recon=0.8124 loss_meanflow=0.0000 mean_model_t=0.4977 mean_corrupt_t=0.4977 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.3470 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.8199 corrupt_frac=1.0000 acc_corrupt=0.8199 loss_corrupt=0.8124 wrong_frac=0.5023 init_acc_corrupt=0.4897 acc_corrupt_t_0p0_0p2=0.2024 corrupt_frac_t_0p0_0p2=0.2059 acc_corrupt_t_0p2_0p4=0.9242 corrupt_frac_t_0p2_0p4=0.1985 acc_corrupt_t_0p4_0p6=0.9964 corrupt_frac_t_0p4_0p6=0.1969 acc_corrupt_t_0p6_0p8=0.9994 corrupt_frac_t_0p6_0p8=0.1927 acc_corrupt_t_0p8_1p0=0.9999 corrupt_frac_t_0p8_1p0=0.2059 out_w_norm=13.0274 out_g_norm=0.6095 loss_all=0.7331 init_gold_top10=0.5547 init_gold_top100=0.6604 rollout_applied_pos_frac=0.3438 init_acc_rollout_applied=0.5269 init_acc_rollout_kept=0.4433 logit_acc_rollout_applied=0.8746 logit_acc_rollout_kept=0.8114
|
| 409 |
+
step=1600 epoch=1600/2000 epoch_step=1/1 micro_steps=1600 elapsed=19.1s lr=2.000000e-03 loss=0.6964 loss_recon=0.6964 loss_meanflow=0.0000 mean_model_t=0.5012 mean_corrupt_t=0.5012 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.3562 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.8418 corrupt_frac=1.0000 acc_corrupt=0.8418 loss_corrupt=0.6964 wrong_frac=0.4987 init_acc_corrupt=0.4972 acc_corrupt_t_0p0_0p2=0.2464 corrupt_frac_t_0p0_0p2=0.1972 acc_corrupt_t_0p2_0p4=0.9545 corrupt_frac_t_0p2_0p4=0.2024 acc_corrupt_t_0p4_0p6=0.9982 corrupt_frac_t_0p4_0p6=0.1990 acc_corrupt_t_0p6_0p8=0.9997 corrupt_frac_t_0p6_0p8=0.2017 acc_corrupt_t_0p8_1p0=0.9999 corrupt_frac_t_0p8_1p0=0.1997 out_w_norm=13.1072 out_g_norm=0.5888 loss_all=0.5103 init_gold_top10=0.6593 init_gold_top100=0.7448 rollout_applied_pos_frac=0.3438 init_acc_rollout_applied=0.5605 init_acc_rollout_kept=0.5727 logit_acc_rollout_applied=0.8679 logit_acc_rollout_kept=0.8962
|
| 410 |
+
step=1700 epoch=1700/2000 epoch_step=1/1 micro_steps=1700 elapsed=18.9s lr=2.000000e-03 loss=0.6638 loss_recon=0.6638 loss_meanflow=0.0000 mean_model_t=0.5008 mean_corrupt_t=0.5008 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.3412 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.8463 corrupt_frac=1.0000 acc_corrupt=0.8463 loss_corrupt=0.6638 wrong_frac=0.4994 init_acc_corrupt=0.4951 acc_corrupt_t_0p0_0p2=0.2739 corrupt_frac_t_0p0_0p2=0.2030 acc_corrupt_t_0p2_0p4=0.9684 corrupt_frac_t_0p2_0p4=0.1915 acc_corrupt_t_0p4_0p6=0.9989 corrupt_frac_t_0p4_0p6=0.2023 acc_corrupt_t_0p6_0p8=0.9998 corrupt_frac_t_0p6_0p8=0.2001 acc_corrupt_t_0p8_1p0=0.9999 corrupt_frac_t_0p8_1p0=0.2032 out_w_norm=13.1660 out_g_norm=0.6050 loss_all=0.8292 init_gold_top10=0.5939 init_gold_top100=0.7235 rollout_applied_pos_frac=0.3828 init_acc_rollout_applied=0.4727 init_acc_rollout_kept=0.5121 logit_acc_rollout_applied=0.7681 logit_acc_rollout_kept=0.8222
|
| 411 |
+
step=1800 epoch=1800/2000 epoch_step=1/1 micro_steps=1800 elapsed=19.0s lr=2.000000e-03 loss=0.6150 loss_recon=0.6150 loss_meanflow=0.0000 mean_model_t=0.4962 mean_corrupt_t=0.4962 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.3430 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.8536 corrupt_frac=1.0000 acc_corrupt=0.8536 loss_corrupt=0.6150 wrong_frac=0.5037 init_acc_corrupt=0.4900 acc_corrupt_t_0p0_0p2=0.2985 corrupt_frac_t_0p0_0p2=0.2026 acc_corrupt_t_0p2_0p4=0.9798 corrupt_frac_t_0p2_0p4=0.2020 acc_corrupt_t_0p4_0p6=0.9994 corrupt_frac_t_0p4_0p6=0.2005 acc_corrupt_t_0p6_0p8=0.9998 corrupt_frac_t_0p6_0p8=0.1965 acc_corrupt_t_0p8_1p0=0.9999 corrupt_frac_t_0p8_1p0=0.1984 out_w_norm=13.2069 out_g_norm=0.5504 loss_all=0.6526 init_gold_top10=0.5530 init_gold_top100=0.6933 rollout_applied_pos_frac=0.3047 init_acc_rollout_applied=0.4390 init_acc_rollout_kept=0.4622 logit_acc_rollout_applied=0.8218 logit_acc_rollout_kept=0.8440
|
| 412 |
+
step=1900 epoch=1900/2000 epoch_step=1/1 micro_steps=1900 elapsed=19.1s lr=2.000000e-03 loss=0.5333 loss_recon=0.5333 loss_meanflow=0.0000 mean_model_t=0.5040 mean_corrupt_t=0.5040 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.3546 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.8671 corrupt_frac=1.0000 acc_corrupt=0.8671 loss_corrupt=0.5333 wrong_frac=0.4960 init_acc_corrupt=0.5035 acc_corrupt_t_0p0_0p2=0.3275 corrupt_frac_t_0p0_0p2=0.1931 acc_corrupt_t_0p2_0p4=0.9857 corrupt_frac_t_0p2_0p4=0.2013 acc_corrupt_t_0p4_0p6=0.9995 corrupt_frac_t_0p4_0p6=0.2019 acc_corrupt_t_0p6_0p8=0.9999 corrupt_frac_t_0p6_0p8=0.1981 acc_corrupt_t_0p8_1p0=0.9999 corrupt_frac_t_0p8_1p0=0.2056 out_w_norm=13.2327 out_g_norm=0.5815 loss_all=0.3862 init_gold_top10=0.6463 init_gold_top100=0.7374 rollout_applied_pos_frac=0.3125 init_acc_rollout_applied=0.5274 init_acc_rollout_kept=0.5155 logit_acc_rollout_applied=0.8912 logit_acc_rollout_kept=0.8996
|
| 413 |
+
step=2000 epoch=2000/2000 epoch_step=1/1 micro_steps=2000 elapsed=19.0s lr=2.000000e-03 loss=0.4955 loss_recon=0.4955 loss_meanflow=0.0000 mean_model_t=0.5008 mean_corrupt_t=0.5008 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.3534 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.8700 corrupt_frac=1.0000 acc_corrupt=0.8700 loss_corrupt=0.4955 wrong_frac=0.4993 init_acc_corrupt=0.4977 acc_corrupt_t_0p0_0p2=0.3724 corrupt_frac_t_0p0_0p2=0.2034 acc_corrupt_t_0p2_0p4=0.9888 corrupt_frac_t_0p2_0p4=0.1967 acc_corrupt_t_0p4_0p6=0.9996 corrupt_frac_t_0p4_0p6=0.2010 acc_corrupt_t_0p6_0p8=0.9999 corrupt_frac_t_0p6_0p8=0.1933 acc_corrupt_t_0p8_1p0=1.0000 corrupt_frac_t_0p8_1p0=0.2055 out_w_norm=13.2466 out_g_norm=0.5578 loss_all=0.6899 init_gold_top10=0.6007 init_gold_top100=0.7234 rollout_applied_pos_frac=0.4141 init_acc_rollout_applied=0.6060 init_acc_rollout_kept=0.4381 logit_acc_rollout_applied=0.8636 logit_acc_rollout_kept=0.7849
|
| 414 |
+
NCCL version 2.25.1+cuda12.8
|
| 415 |
+
resumed_from=runs/train8_ctx1024_t5tok_p35_rand0_3_unif0_0p25_outwdm1_t5tok_ctx1024_k03_20260518_022728/latest.pt start_step=2001
|
| 416 |
+
{
|
| 417 |
+
"device": "cuda:0",
|
| 418 |
+
"rank": 0,
|
| 419 |
+
"world_size": 4,
|
| 420 |
+
"samples": "owt_cached_chunks:8",
|
| 421 |
+
"vocab_size": 2423,
|
| 422 |
+
"tokenizer_vocab_size": 32100,
|
| 423 |
+
"save_dir": "runs/train8_ctx1024_t5tok_p35_rand0_3_unif0_0p25_outwdm1_t5tok_ctx1024_k03_20260518_022728",
|
| 424 |
+
"batch_size": 128,
|
| 425 |
+
"grad_accum": 1,
|
| 426 |
+
"effective_batch_size": 512,
|
| 427 |
+
"global_batch_size": 512,
|
| 428 |
+
"lr_schedule": "constant_warmup",
|
| 429 |
+
"optimizer": "muon",
|
| 430 |
+
"epochs": 0.0,
|
| 431 |
+
"steps_per_epoch": 1,
|
| 432 |
+
"total_steps": 3000,
|
| 433 |
+
"warmup_steps": 10,
|
| 434 |
+
"warmup_epochs": -1.0,
|
| 435 |
+
"min_lr": 0.0,
|
| 436 |
+
"weight_decay": 0.1,
|
| 437 |
+
"output_weight_decay": -1.0,
|
| 438 |
+
"adamw_param_groups": "nanogpt",
|
| 439 |
+
"adam_beta1": 0.9,
|
| 440 |
+
"adam_beta2": 0.95,
|
| 441 |
+
"adam_eps": 1e-08,
|
| 442 |
+
"muon_impl": "legacy",
|
| 443 |
+
"muon_momentum": 0.95,
|
| 444 |
+
"muon_ns_steps": 5,
|
| 445 |
+
"muon_update_scale": 1.0,
|
| 446 |
+
"muon_nesterov": false,
|
| 447 |
+
"muon_width_scale": false,
|
| 448 |
+
"muon_grouping": "legacy_dim_ge_2",
|
| 449 |
+
"muon_param_count": 2523776,
|
| 450 |
+
"muon_adam_param_count": 8192,
|
| 451 |
+
"muon_param_names": [
|
| 452 |
+
"vocab_embed.embedding",
|
| 453 |
+
"sigma_map.net.0.weight",
|
| 454 |
+
"sigma_map.net.2.weight",
|
| 455 |
+
"blocks.0.attn_qkv.weight",
|
| 456 |
+
"blocks.0.attn_out.weight",
|
| 457 |
+
"blocks.0.mlp.0.weight",
|
| 458 |
+
"blocks.0.mlp.2.weight",
|
| 459 |
+
"blocks.0.adaLN_modulation.weight",
|
| 460 |
+
"blocks.1.attn_qkv.weight",
|
| 461 |
+
"blocks.1.attn_out.weight",
|
| 462 |
+
"blocks.1.mlp.0.weight",
|
| 463 |
+
"blocks.1.mlp.2.weight",
|
| 464 |
+
"blocks.1.adaLN_modulation.weight",
|
| 465 |
+
"blocks.2.attn_qkv.weight",
|
| 466 |
+
"blocks.2.attn_out.weight",
|
| 467 |
+
"blocks.2.mlp.0.weight",
|
| 468 |
+
"blocks.2.mlp.2.weight",
|
| 469 |
+
"blocks.2.adaLN_modulation.weight",
|
| 470 |
+
"output_layer.linear.weight",
|
| 471 |
+
"output_layer.adaLN_modulation.weight"
|
| 472 |
+
],
|
| 473 |
+
"muon_adam_param_names": [
|
| 474 |
+
"sigma_map.net.0.bias",
|
| 475 |
+
"sigma_map.net.2.bias",
|
| 476 |
+
"blocks.0.norm1.weight",
|
| 477 |
+
"blocks.0.norm2.weight",
|
| 478 |
+
"blocks.0.mlp.0.bias",
|
| 479 |
+
"blocks.0.mlp.2.bias",
|
| 480 |
+
"blocks.0.adaLN_modulation.bias",
|
| 481 |
+
"blocks.1.norm1.weight",
|
| 482 |
+
"blocks.1.norm2.weight",
|
| 483 |
+
"blocks.1.mlp.0.bias",
|
| 484 |
+
"blocks.1.mlp.2.bias",
|
| 485 |
+
"blocks.1.adaLN_modulation.bias",
|
| 486 |
+
"blocks.2.norm1.weight",
|
| 487 |
+
"blocks.2.norm2.weight",
|
| 488 |
+
"blocks.2.mlp.0.bias",
|
| 489 |
+
"blocks.2.mlp.2.bias",
|
| 490 |
+
"blocks.2.adaLN_modulation.bias",
|
| 491 |
+
"output_layer.norm_final.weight",
|
| 492 |
+
"output_layer.adaLN_modulation.bias"
|
| 493 |
+
],
|
| 494 |
+
"muon_effective_nesterov": false,
|
| 495 |
+
"muon_effective_width_scale": false,
|
| 496 |
+
"muon_effective_weight_decay": 0.1,
|
| 497 |
+
"muon_adam_fallback_nesterov": false,
|
| 498 |
+
"muon_adam_fallback_weight_decay": 0.1,
|
| 499 |
+
"ema_decay": 0.9999,
|
| 500 |
+
"ema_start_step": 0,
|
| 501 |
+
"model_type": "ddit",
|
| 502 |
+
"ddit_mlp_type": "gelu",
|
| 503 |
+
"elf_num_time_tokens": 4,
|
| 504 |
+
"elf_num_model_mode_tokens": 0,
|
| 505 |
+
"qk_norm": true,
|
| 506 |
+
"output_bias": false,
|
| 507 |
+
"output_init_std": -1.0,
|
| 508 |
+
"norm_type": "rmsnorm",
|
| 509 |
+
"target_loss": "hard_ce",
|
| 510 |
+
"linear_soft_target_power": 1.0,
|
| 511 |
+
"linear_soft_target_min_conf": 0.0,
|
| 512 |
+
"linear_soft_target_max_conf": 1.0,
|
| 513 |
+
"t_sampling_mode": "uniform",
|
| 514 |
+
"t_sampling_power": 1.0,
|
| 515 |
+
"t_sampling_eps": 0.0001,
|
| 516 |
+
"t_sampling_logit_mean": -1.5,
|
| 517 |
+
"t_sampling_logit_std": 0.8,
|
| 518 |
+
"dual_t": true,
|
| 519 |
+
"corrupt_t_mode": "same",
|
| 520 |
+
"corrupt_min_t": 0.0,
|
| 521 |
+
"corrupt_max_t": 1.0,
|
| 522 |
+
"prefix_block_prob": 0.0,
|
| 523 |
+
"prefix_block_len": 128,
|
| 524 |
+
"mask_ratio_floor_schedule": "none",
|
| 525 |
+
"dirichlet_endpoint_mode": "categorical_dual_t",
|
| 526 |
+
"dirichlet_semantic_t_mode": "same",
|
| 527 |
+
"dirichlet_semantic_t_value": 0.0,
|
| 528 |
+
"dirichlet_semantic_t_curve": "linear",
|
| 529 |
+
"dirichlet_semantic_t_power": 1.0,
|
| 530 |
+
"endpoint_sequence_random_prob_alpha": 0.0,
|
| 531 |
+
"categorical_wrong_from_full_vocab": true,
|
| 532 |
+
"categorical_wrong_from_batch_valid_tokens": false,
|
| 533 |
+
"categorical_wrong_basin_token_ids": "",
|
| 534 |
+
"categorical_wrong_basin_prob": 0.0,
|
| 535 |
+
"categorical_wrong_unigram_prob": 0.0,
|
| 536 |
+
"categorical_wrong_uniform_prob": 0.0,
|
| 537 |
+
"categorical_wrong_prob_floor": 0.0,
|
| 538 |
+
"categorical_wrong_corpus_unigram_path": "",
|
| 539 |
+
"categorical_wrong_corpus_unigram_alpha": 1.0,
|
| 540 |
+
"categorical_wrong_basin_shared_prob": 0.0,
|
| 541 |
+
"categorical_wrong_unigram_shared_prob": 0.0,
|
| 542 |
+
"mask_mixture_original_prob": 0.0,
|
| 543 |
+
"mask_mixture_lowk_prob": 0.0,
|
| 544 |
+
"mask_mixture_lowcorrupt_prob": 0.0,
|
| 545 |
+
"mask_mixture_block_prob": 0.0,
|
| 546 |
+
"mask_mixture_all_prob": 1.0,
|
| 547 |
+
"mask_mixture_lowk_clean_tokens": "0",
|
| 548 |
+
"mask_mixture_lowcorrupt_tokens": "1,2,4,8,16,32,64",
|
| 549 |
+
"mask_mixture_block_tokens": "64,128",
|
| 550 |
+
"simplex_bridge_sampler": "dirichlet",
|
| 551 |
+
"logistic_normal_sigma_min": 0.1,
|
| 552 |
+
"logistic_normal_sigma_max": 1.0,
|
| 553 |
+
"logistic_normal_tau_min": 1.0,
|
| 554 |
+
"logistic_normal_tau_max": 1.0,
|
| 555 |
+
"torch_compile": false,
|
| 556 |
+
"compile_mode": "max-autotune",
|
| 557 |
+
"state_format": "prob",
|
| 558 |
+
"meanflow_weight": 0.0,
|
| 559 |
+
"rollout_train_prob": 0.35,
|
| 560 |
+
"rollout_train_steps": 3,
|
| 561 |
+
"rollout_train_steps_min": 0,
|
| 562 |
+
"rollout_train_infer_steps": 1,
|
| 563 |
+
"rollout_train_time_mode": "sampled_path",
|
| 564 |
+
"rollout_train_s_dist": "uniform",
|
| 565 |
+
"rollout_train_s_min_frac": 0.0,
|
| 566 |
+
"rollout_train_s_max_frac": 0.25,
|
| 567 |
+
"rollout_train_s_beta_alpha": 2.0,
|
| 568 |
+
"rollout_train_s_beta_beta": 6.0,
|
| 569 |
+
"rollout_train_temp": 1.0,
|
| 570 |
+
"rollout_train_max_gamma": 1.0,
|
| 571 |
+
"rollout_train_corrupt_only": true,
|
| 572 |
+
"rollout_train_samplewise": true,
|
| 573 |
+
"rollout_train_compute_always": false,
|
| 574 |
+
"rollout_train_sync_t": true,
|
| 575 |
+
"bridge_noise_init": "logistic_normal",
|
| 576 |
+
"noise_sigma": -1.0,
|
| 577 |
+
"allow_tf32": true,
|
| 578 |
+
"activation_checkpointing": false,
|
| 579 |
+
"activation_checkpoint_interval": 1,
|
| 580 |
+
"activation_checkpoint_scope": "block",
|
| 581 |
+
"ddp_static_graph": false,
|
| 582 |
+
"ddp_gradient_as_bucket_view": true,
|
| 583 |
+
"blocking_data_transfer": false,
|
| 584 |
+
"dataloader_prefetch_factor": 4,
|
| 585 |
+
"full_train_stats": false,
|
| 586 |
+
"tokenized_hf": false,
|
| 587 |
+
"tokenized_pad_token": "pad",
|
| 588 |
+
"elf_conditional_hf": false,
|
| 589 |
+
"record_pad_truncate": false,
|
| 590 |
+
"record_add_eos": false,
|
| 591 |
+
"record_add_special_tokens": false,
|
| 592 |
+
"record_pad_token": "pad",
|
| 593 |
+
"record_shuffle_buffer": 10000,
|
| 594 |
+
"wrap": true,
|
| 595 |
+
"wrap_mode": "stream",
|
| 596 |
+
"wrap_record_buffer_size": 200,
|
| 597 |
+
"owt_cached_chunks": true,
|
| 598 |
+
"owt_chunk_cache_dir": "/e2e-data/evad-tech-vla/wanghan58/data/small_benchmarks/langflow_2604_11748/openwebtext_lta_cached_chunks/t5_len1024_train8_compact_overfit",
|
| 599 |
+
"owt_chunk_cache_rebuild": false,
|
| 600 |
+
"owt_chunk_cache_write_batch": 4096,
|
| 601 |
+
"owt_exact_repeat_per_chunk": 64,
|
| 602 |
+
"online_chunk_shuffle": false,
|
| 603 |
+
"online_chunk_shuffle_buffer": 10000,
|
| 604 |
+
"openwebtext_split": "train_minus_100k",
|
| 605 |
+
"detokenizer": "auto",
|
| 606 |
+
"resolved_detokenizer": null,
|
| 607 |
+
"num_workers": 0,
|
| 608 |
+
"latest_every": 1000,
|
| 609 |
+
"resume_path": "runs/train8_ctx1024_t5tok_p35_rand0_3_unif0_0p25_outwdm1_t5tok_ctx1024_k03_20260518_022728/latest.pt"
|
| 610 |
+
}
|
| 611 |
+
step=2100 epoch=2100/3000 epoch_step=1/1 micro_steps=2100 elapsed=19.8s lr=2.000000e-03 loss=0.4583 loss_recon=0.4583 loss_meanflow=0.0000 mean_model_t=0.4971 mean_corrupt_t=0.4971 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.3513 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.8735 corrupt_frac=1.0000 acc_corrupt=0.8735 loss_corrupt=0.4583 wrong_frac=0.5028 init_acc_corrupt=0.4935 acc_corrupt_t_0p0_0p2=0.3975 corrupt_frac_t_0p0_0p2=0.2071 acc_corrupt_t_0p2_0p4=0.9918 corrupt_frac_t_0p2_0p4=0.1988 acc_corrupt_t_0p4_0p6=0.9997 corrupt_frac_t_0p4_0p6=0.1976 acc_corrupt_t_0p6_0p8=0.9999 corrupt_frac_t_0p6_0p8=0.1951 acc_corrupt_t_0p8_1p0=1.0000 corrupt_frac_t_0p8_1p0=0.2015 out_w_norm=13.2591 out_g_norm=0.5308 loss_all=0.2927 init_gold_top10=0.5962 init_gold_top100=0.6969 rollout_applied_pos_frac=0.3516 init_acc_rollout_applied=0.5366 init_acc_rollout_kept=0.4393 logit_acc_rollout_applied=0.9168 logit_acc_rollout_kept=0.9161
|
| 612 |
+
step=2200 epoch=2200/3000 epoch_step=1/1 micro_steps=2200 elapsed=19.0s lr=2.000000e-03 loss=0.4055 loss_recon=0.4055 loss_meanflow=0.0000 mean_model_t=0.5015 mean_corrupt_t=0.5015 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.3505 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.8848 corrupt_frac=1.0000 acc_corrupt=0.8848 loss_corrupt=0.4055 wrong_frac=0.4984 init_acc_corrupt=0.5000 acc_corrupt_t_0p0_0p2=0.4320 corrupt_frac_t_0p0_0p2=0.2005 acc_corrupt_t_0p2_0p4=0.9939 corrupt_frac_t_0p2_0p4=0.1979 acc_corrupt_t_0p4_0p6=0.9997 corrupt_frac_t_0p4_0p6=0.1994 acc_corrupt_t_0p6_0p8=0.9999 corrupt_frac_t_0p6_0p8=0.2007 acc_corrupt_t_0p8_1p0=1.0000 corrupt_frac_t_0p8_1p0=0.2015 out_w_norm=13.2548 out_g_norm=0.5596 loss_all=0.4308 init_gold_top10=0.6072 init_gold_top100=0.7113 rollout_applied_pos_frac=0.3594 init_acc_rollout_applied=0.5487 init_acc_rollout_kept=0.4824 logit_acc_rollout_applied=0.8832 logit_acc_rollout_kept=0.8671
|
| 613 |
+
step=2300 epoch=2300/3000 epoch_step=1/1 micro_steps=2300 elapsed=19.1s lr=2.000000e-03 loss=0.3642 loss_recon=0.3642 loss_meanflow=0.0000 mean_model_t=0.5015 mean_corrupt_t=0.5015 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.3535 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.8920 corrupt_frac=1.0000 acc_corrupt=0.8920 loss_corrupt=0.3642 wrong_frac=0.4985 init_acc_corrupt=0.4995 acc_corrupt_t_0p0_0p2=0.4506 corrupt_frac_t_0p0_0p2=0.1947 acc_corrupt_t_0p2_0p4=0.9954 corrupt_frac_t_0p2_0p4=0.2030 acc_corrupt_t_0p4_0p6=0.9998 corrupt_frac_t_0p4_0p6=0.1991 acc_corrupt_t_0p6_0p8=0.9999 corrupt_frac_t_0p6_0p8=0.2045 acc_corrupt_t_0p8_1p0=1.0000 corrupt_frac_t_0p8_1p0=0.1988 out_w_norm=13.2547 out_g_norm=0.5233 loss_all=0.3508 init_gold_top10=0.6184 init_gold_top100=0.7122 rollout_applied_pos_frac=0.4062 init_acc_rollout_applied=0.5912 init_acc_rollout_kept=0.4471 logit_acc_rollout_applied=0.9022 logit_acc_rollout_kept=0.8909
|
| 614 |
+
step=2400 epoch=2400/3000 epoch_step=1/1 micro_steps=2400 elapsed=19.0s lr=2.000000e-03 loss=0.3392 loss_recon=0.3392 loss_meanflow=0.0000 mean_model_t=0.4986 mean_corrupt_t=0.4986 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.3515 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.8964 corrupt_frac=1.0000 acc_corrupt=0.8964 loss_corrupt=0.3392 wrong_frac=0.5016 init_acc_corrupt=0.4981 acc_corrupt_t_0p0_0p2=0.4873 corrupt_frac_t_0p0_0p2=0.2005 acc_corrupt_t_0p2_0p4=0.9962 corrupt_frac_t_0p2_0p4=0.2001 acc_corrupt_t_0p4_0p6=0.9998 corrupt_frac_t_0p4_0p6=0.2023 acc_corrupt_t_0p6_0p8=0.9999 corrupt_frac_t_0p6_0p8=0.1992 acc_corrupt_t_0p8_1p0=1.0000 corrupt_frac_t_0p8_1p0=0.1979 out_w_norm=13.2416 out_g_norm=0.5315 loss_all=0.3230 init_gold_top10=0.6008 init_gold_top100=0.7086 rollout_applied_pos_frac=0.3281 init_acc_rollout_applied=0.6073 init_acc_rollout_kept=0.4320 logit_acc_rollout_applied=0.8929 logit_acc_rollout_kept=0.9091
|
| 615 |
+
step=2500 epoch=2500/3000 epoch_step=1/1 micro_steps=2500 elapsed=19.0s lr=2.000000e-03 loss=0.3342 loss_recon=0.3342 loss_meanflow=0.0000 mean_model_t=0.4977 mean_corrupt_t=0.4977 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.3470 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.8955 corrupt_frac=1.0000 acc_corrupt=0.8955 loss_corrupt=0.3342 wrong_frac=0.5023 init_acc_corrupt=0.4972 acc_corrupt_t_0p0_0p2=0.4951 corrupt_frac_t_0p0_0p2=0.2059 acc_corrupt_t_0p2_0p4=0.9976 corrupt_frac_t_0p2_0p4=0.1985 acc_corrupt_t_0p4_0p6=0.9998 corrupt_frac_t_0p4_0p6=0.1969 acc_corrupt_t_0p6_0p8=1.0000 corrupt_frac_t_0p6_0p8=0.1927 acc_corrupt_t_0p8_1p0=1.0000 corrupt_frac_t_0p8_1p0=0.2059 out_w_norm=13.2247 out_g_norm=0.4966 loss_all=0.2946 init_gold_top10=0.5647 init_gold_top100=0.6631 rollout_applied_pos_frac=0.3438 init_acc_rollout_applied=0.5336 init_acc_rollout_kept=0.4433 logit_acc_rollout_applied=0.9112 logit_acc_rollout_kept=0.9019
|
| 616 |
+
step=2600 epoch=2600/3000 epoch_step=1/1 micro_steps=2600 elapsed=19.1s lr=2.000000e-03 loss=0.2927 loss_recon=0.2927 loss_meanflow=0.0000 mean_model_t=0.5012 mean_corrupt_t=0.5012 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.3562 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.9071 corrupt_frac=1.0000 acc_corrupt=0.9071 loss_corrupt=0.2927 wrong_frac=0.4987 init_acc_corrupt=0.5030 acc_corrupt_t_0p0_0p2=0.5311 corrupt_frac_t_0p0_0p2=0.1972 acc_corrupt_t_0p2_0p4=0.9981 corrupt_frac_t_0p2_0p4=0.2024 acc_corrupt_t_0p4_0p6=0.9999 corrupt_frac_t_0p4_0p6=0.1990 acc_corrupt_t_0p6_0p8=1.0000 corrupt_frac_t_0p6_0p8=0.2017 acc_corrupt_t_0p8_1p0=1.0000 corrupt_frac_t_0p8_1p0=0.1997 out_w_norm=13.1977 out_g_norm=0.4920 loss_all=0.2278 init_gold_top10=0.6797 init_gold_top100=0.7468 rollout_applied_pos_frac=0.3438 init_acc_rollout_applied=0.5799 init_acc_rollout_kept=0.5727 logit_acc_rollout_applied=0.8991 logit_acc_rollout_kept=0.9380
|
| 617 |
+
step=2700 epoch=2700/3000 epoch_step=1/1 micro_steps=2700 elapsed=18.9s lr=2.000000e-03 loss=0.2925 loss_recon=0.2925 loss_meanflow=0.0000 mean_model_t=0.5008 mean_corrupt_t=0.5008 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.3412 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.9053 corrupt_frac=1.0000 acc_corrupt=0.9053 loss_corrupt=0.2925 wrong_frac=0.4994 init_acc_corrupt=0.5003 acc_corrupt_t_0p0_0p2=0.5350 corrupt_frac_t_0p0_0p2=0.2030 acc_corrupt_t_0p2_0p4=0.9984 corrupt_frac_t_0p2_0p4=0.1915 acc_corrupt_t_0p4_0p6=0.9999 corrupt_frac_t_0p4_0p6=0.2023 acc_corrupt_t_0p6_0p8=1.0000 corrupt_frac_t_0p6_0p8=0.2001 acc_corrupt_t_0p8_1p0=1.0000 corrupt_frac_t_0p8_1p0=0.2032 out_w_norm=13.1540 out_g_norm=0.4638 loss_all=0.4191 init_gold_top10=0.6214 init_gold_top100=0.7245 rollout_applied_pos_frac=0.3828 init_acc_rollout_applied=0.5020 init_acc_rollout_kept=0.5121 logit_acc_rollout_applied=0.8600 logit_acc_rollout_kept=0.8647
|
| 618 |
+
step=2800 epoch=2800/3000 epoch_step=1/1 micro_steps=2800 elapsed=19.0s lr=2.000000e-03 loss=0.2855 loss_recon=0.2855 loss_meanflow=0.0000 mean_model_t=0.4962 mean_corrupt_t=0.4962 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.3430 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.9054 corrupt_frac=1.0000 acc_corrupt=0.9054 loss_corrupt=0.2855 wrong_frac=0.5037 init_acc_corrupt=0.4940 acc_corrupt_t_0p0_0p2=0.5346 corrupt_frac_t_0p0_0p2=0.2026 acc_corrupt_t_0p2_0p4=0.9986 corrupt_frac_t_0p2_0p4=0.2020 acc_corrupt_t_0p4_0p6=0.9999 corrupt_frac_t_0p4_0p6=0.2005 acc_corrupt_t_0p6_0p8=1.0000 corrupt_frac_t_0p6_0p8=0.1965 acc_corrupt_t_0p8_1p0=1.0000 corrupt_frac_t_0p8_1p0=0.1984 out_w_norm=13.1064 out_g_norm=0.4428 loss_all=0.3016 init_gold_top10=0.5783 init_gold_top100=0.6934 rollout_applied_pos_frac=0.3047 init_acc_rollout_applied=0.4441 init_acc_rollout_kept=0.4622 logit_acc_rollout_applied=0.8869 logit_acc_rollout_kept=0.9013
|
| 619 |
+
step=2900 epoch=2900/3000 epoch_step=1/1 micro_steps=2900 elapsed=19.1s lr=2.000000e-03 loss=0.2654 loss_recon=0.2654 loss_meanflow=0.0000 mean_model_t=0.5040 mean_corrupt_t=0.5040 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.3546 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.9131 corrupt_frac=1.0000 acc_corrupt=0.9131 loss_corrupt=0.2654 wrong_frac=0.4960 init_acc_corrupt=0.5068 acc_corrupt_t_0p0_0p2=0.5513 corrupt_frac_t_0p0_0p2=0.1931 acc_corrupt_t_0p2_0p4=0.9990 corrupt_frac_t_0p2_0p4=0.2013 acc_corrupt_t_0p4_0p6=0.9999 corrupt_frac_t_0p4_0p6=0.2019 acc_corrupt_t_0p6_0p8=1.0000 corrupt_frac_t_0p6_0p8=0.1981 acc_corrupt_t_0p8_1p0=1.0000 corrupt_frac_t_0p8_1p0=0.2056 out_w_norm=13.0543 out_g_norm=0.4271 loss_all=0.1481 init_gold_top10=0.6606 init_gold_top100=0.7375 rollout_applied_pos_frac=0.3125 init_acc_rollout_applied=0.5364 init_acc_rollout_kept=0.5155 logit_acc_rollout_applied=0.9541 logit_acc_rollout_kept=0.9501
|
| 620 |
+
step=3000 epoch=3000/3000 epoch_step=1/1 micro_steps=3000 elapsed=19.0s lr=2.000000e-03 loss=0.2693 loss_recon=0.2693 loss_meanflow=0.0000 mean_model_t=0.5008 mean_corrupt_t=0.5008 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.3534 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.9110 corrupt_frac=1.0000 acc_corrupt=0.9110 loss_corrupt=0.2693 wrong_frac=0.4993 init_acc_corrupt=0.5007 acc_corrupt_t_0p0_0p2=0.5637 corrupt_frac_t_0p0_0p2=0.2034 acc_corrupt_t_0p2_0p4=0.9990 corrupt_frac_t_0p2_0p4=0.1967 acc_corrupt_t_0p4_0p6=0.9999 corrupt_frac_t_0p4_0p6=0.2010 acc_corrupt_t_0p6_0p8=1.0000 corrupt_frac_t_0p6_0p8=0.1933 acc_corrupt_t_0p8_1p0=1.0000 corrupt_frac_t_0p8_1p0=0.2055 out_w_norm=12.9986 out_g_norm=0.4138 loss_all=0.4644 init_gold_top10=0.6166 init_gold_top100=0.7234 rollout_applied_pos_frac=0.4141 init_acc_rollout_applied=0.6088 init_acc_rollout_kept=0.4381 logit_acc_rollout_applied=0.8838 logit_acc_rollout_kept=0.8375
|
| 621 |
+
NCCL version 2.25.1+cuda12.8
|
| 622 |
+
resumed_from=runs/train8_ctx1024_t5tok_p35_rand0_3_unif0_0p25_outwdm1_t5tok_ctx1024_k03_20260518_022728/latest.pt start_step=3001
|
| 623 |
+
{
|
| 624 |
+
"device": "cuda:0",
|
| 625 |
+
"rank": 0,
|
| 626 |
+
"world_size": 4,
|
| 627 |
+
"samples": "owt_cached_chunks:8",
|
| 628 |
+
"vocab_size": 2423,
|
| 629 |
+
"tokenizer_vocab_size": 32100,
|
| 630 |
+
"save_dir": "runs/train8_ctx1024_t5tok_p35_rand0_3_unif0_0p25_outwdm1_t5tok_ctx1024_k03_20260518_022728",
|
| 631 |
+
"batch_size": 128,
|
| 632 |
+
"grad_accum": 1,
|
| 633 |
+
"effective_batch_size": 512,
|
| 634 |
+
"global_batch_size": 512,
|
| 635 |
+
"lr_schedule": "constant_warmup",
|
| 636 |
+
"optimizer": "muon",
|
| 637 |
+
"epochs": 0.0,
|
| 638 |
+
"steps_per_epoch": 1,
|
| 639 |
+
"total_steps": 4000,
|
| 640 |
+
"warmup_steps": 10,
|
| 641 |
+
"warmup_epochs": -1.0,
|
| 642 |
+
"min_lr": 0.0,
|
| 643 |
+
"weight_decay": 0.1,
|
| 644 |
+
"output_weight_decay": -1.0,
|
| 645 |
+
"adamw_param_groups": "nanogpt",
|
| 646 |
+
"adam_beta1": 0.9,
|
| 647 |
+
"adam_beta2": 0.95,
|
| 648 |
+
"adam_eps": 1e-08,
|
| 649 |
+
"muon_impl": "legacy",
|
| 650 |
+
"muon_momentum": 0.95,
|
| 651 |
+
"muon_ns_steps": 5,
|
| 652 |
+
"muon_update_scale": 1.0,
|
| 653 |
+
"muon_nesterov": false,
|
| 654 |
+
"muon_width_scale": false,
|
| 655 |
+
"muon_grouping": "legacy_dim_ge_2",
|
| 656 |
+
"muon_param_count": 2523776,
|
| 657 |
+
"muon_adam_param_count": 8192,
|
| 658 |
+
"muon_param_names": [
|
| 659 |
+
"vocab_embed.embedding",
|
| 660 |
+
"sigma_map.net.0.weight",
|
| 661 |
+
"sigma_map.net.2.weight",
|
| 662 |
+
"blocks.0.attn_qkv.weight",
|
| 663 |
+
"blocks.0.attn_out.weight",
|
| 664 |
+
"blocks.0.mlp.0.weight",
|
| 665 |
+
"blocks.0.mlp.2.weight",
|
| 666 |
+
"blocks.0.adaLN_modulation.weight",
|
| 667 |
+
"blocks.1.attn_qkv.weight",
|
| 668 |
+
"blocks.1.attn_out.weight",
|
| 669 |
+
"blocks.1.mlp.0.weight",
|
| 670 |
+
"blocks.1.mlp.2.weight",
|
| 671 |
+
"blocks.1.adaLN_modulation.weight",
|
| 672 |
+
"blocks.2.attn_qkv.weight",
|
| 673 |
+
"blocks.2.attn_out.weight",
|
| 674 |
+
"blocks.2.mlp.0.weight",
|
| 675 |
+
"blocks.2.mlp.2.weight",
|
| 676 |
+
"blocks.2.adaLN_modulation.weight",
|
| 677 |
+
"output_layer.linear.weight",
|
| 678 |
+
"output_layer.adaLN_modulation.weight"
|
| 679 |
+
],
|
| 680 |
+
"muon_adam_param_names": [
|
| 681 |
+
"sigma_map.net.0.bias",
|
| 682 |
+
"sigma_map.net.2.bias",
|
| 683 |
+
"blocks.0.norm1.weight",
|
| 684 |
+
"blocks.0.norm2.weight",
|
| 685 |
+
"blocks.0.mlp.0.bias",
|
| 686 |
+
"blocks.0.mlp.2.bias",
|
| 687 |
+
"blocks.0.adaLN_modulation.bias",
|
| 688 |
+
"blocks.1.norm1.weight",
|
| 689 |
+
"blocks.1.norm2.weight",
|
| 690 |
+
"blocks.1.mlp.0.bias",
|
| 691 |
+
"blocks.1.mlp.2.bias",
|
| 692 |
+
"blocks.1.adaLN_modulation.bias",
|
| 693 |
+
"blocks.2.norm1.weight",
|
| 694 |
+
"blocks.2.norm2.weight",
|
| 695 |
+
"blocks.2.mlp.0.bias",
|
| 696 |
+
"blocks.2.mlp.2.bias",
|
| 697 |
+
"blocks.2.adaLN_modulation.bias",
|
| 698 |
+
"output_layer.norm_final.weight",
|
| 699 |
+
"output_layer.adaLN_modulation.bias"
|
| 700 |
+
],
|
| 701 |
+
"muon_effective_nesterov": false,
|
| 702 |
+
"muon_effective_width_scale": false,
|
| 703 |
+
"muon_effective_weight_decay": 0.1,
|
| 704 |
+
"muon_adam_fallback_nesterov": false,
|
| 705 |
+
"muon_adam_fallback_weight_decay": 0.1,
|
| 706 |
+
"ema_decay": 0.9999,
|
| 707 |
+
"ema_start_step": 0,
|
| 708 |
+
"model_type": "ddit",
|
| 709 |
+
"ddit_mlp_type": "gelu",
|
| 710 |
+
"elf_num_time_tokens": 4,
|
| 711 |
+
"elf_num_model_mode_tokens": 0,
|
| 712 |
+
"qk_norm": true,
|
| 713 |
+
"output_bias": false,
|
| 714 |
+
"output_init_std": -1.0,
|
| 715 |
+
"norm_type": "rmsnorm",
|
| 716 |
+
"target_loss": "hard_ce",
|
| 717 |
+
"linear_soft_target_power": 1.0,
|
| 718 |
+
"linear_soft_target_min_conf": 0.0,
|
| 719 |
+
"linear_soft_target_max_conf": 1.0,
|
| 720 |
+
"t_sampling_mode": "uniform",
|
| 721 |
+
"t_sampling_power": 1.0,
|
| 722 |
+
"t_sampling_eps": 0.0001,
|
| 723 |
+
"t_sampling_logit_mean": -1.5,
|
| 724 |
+
"t_sampling_logit_std": 0.8,
|
| 725 |
+
"dual_t": true,
|
| 726 |
+
"corrupt_t_mode": "same",
|
| 727 |
+
"corrupt_min_t": 0.0,
|
| 728 |
+
"corrupt_max_t": 1.0,
|
| 729 |
+
"prefix_block_prob": 0.0,
|
| 730 |
+
"prefix_block_len": 128,
|
| 731 |
+
"mask_ratio_floor_schedule": "none",
|
| 732 |
+
"dirichlet_endpoint_mode": "categorical_dual_t",
|
| 733 |
+
"dirichlet_semantic_t_mode": "same",
|
| 734 |
+
"dirichlet_semantic_t_value": 0.0,
|
| 735 |
+
"dirichlet_semantic_t_curve": "linear",
|
| 736 |
+
"dirichlet_semantic_t_power": 1.0,
|
| 737 |
+
"endpoint_sequence_random_prob_alpha": 0.0,
|
| 738 |
+
"categorical_wrong_from_full_vocab": true,
|
| 739 |
+
"categorical_wrong_from_batch_valid_tokens": false,
|
| 740 |
+
"categorical_wrong_basin_token_ids": "",
|
| 741 |
+
"categorical_wrong_basin_prob": 0.0,
|
| 742 |
+
"categorical_wrong_unigram_prob": 0.0,
|
| 743 |
+
"categorical_wrong_uniform_prob": 0.0,
|
| 744 |
+
"categorical_wrong_prob_floor": 0.0,
|
| 745 |
+
"categorical_wrong_corpus_unigram_path": "",
|
| 746 |
+
"categorical_wrong_corpus_unigram_alpha": 1.0,
|
| 747 |
+
"categorical_wrong_basin_shared_prob": 0.0,
|
| 748 |
+
"categorical_wrong_unigram_shared_prob": 0.0,
|
| 749 |
+
"mask_mixture_original_prob": 0.0,
|
| 750 |
+
"mask_mixture_lowk_prob": 0.0,
|
| 751 |
+
"mask_mixture_lowcorrupt_prob": 0.0,
|
| 752 |
+
"mask_mixture_block_prob": 0.0,
|
| 753 |
+
"mask_mixture_all_prob": 1.0,
|
| 754 |
+
"mask_mixture_lowk_clean_tokens": "0",
|
| 755 |
+
"mask_mixture_lowcorrupt_tokens": "1,2,4,8,16,32,64",
|
| 756 |
+
"mask_mixture_block_tokens": "64,128",
|
| 757 |
+
"simplex_bridge_sampler": "dirichlet",
|
| 758 |
+
"logistic_normal_sigma_min": 0.1,
|
| 759 |
+
"logistic_normal_sigma_max": 1.0,
|
| 760 |
+
"logistic_normal_tau_min": 1.0,
|
| 761 |
+
"logistic_normal_tau_max": 1.0,
|
| 762 |
+
"torch_compile": false,
|
| 763 |
+
"compile_mode": "max-autotune",
|
| 764 |
+
"state_format": "prob",
|
| 765 |
+
"meanflow_weight": 0.0,
|
| 766 |
+
"rollout_train_prob": 0.35,
|
| 767 |
+
"rollout_train_steps": 3,
|
| 768 |
+
"rollout_train_steps_min": 0,
|
| 769 |
+
"rollout_train_infer_steps": 1,
|
| 770 |
+
"rollout_train_time_mode": "sampled_path",
|
| 771 |
+
"rollout_train_s_dist": "uniform",
|
| 772 |
+
"rollout_train_s_min_frac": 0.0,
|
| 773 |
+
"rollout_train_s_max_frac": 0.25,
|
| 774 |
+
"rollout_train_s_beta_alpha": 2.0,
|
| 775 |
+
"rollout_train_s_beta_beta": 6.0,
|
| 776 |
+
"rollout_train_temp": 1.0,
|
| 777 |
+
"rollout_train_max_gamma": 1.0,
|
| 778 |
+
"rollout_train_corrupt_only": true,
|
| 779 |
+
"rollout_train_samplewise": true,
|
| 780 |
+
"rollout_train_compute_always": false,
|
| 781 |
+
"rollout_train_sync_t": true,
|
| 782 |
+
"bridge_noise_init": "logistic_normal",
|
| 783 |
+
"noise_sigma": -1.0,
|
| 784 |
+
"allow_tf32": true,
|
| 785 |
+
"activation_checkpointing": false,
|
| 786 |
+
"activation_checkpoint_interval": 1,
|
| 787 |
+
"activation_checkpoint_scope": "block",
|
| 788 |
+
"ddp_static_graph": false,
|
| 789 |
+
"ddp_gradient_as_bucket_view": true,
|
| 790 |
+
"blocking_data_transfer": false,
|
| 791 |
+
"dataloader_prefetch_factor": 4,
|
| 792 |
+
"full_train_stats": false,
|
| 793 |
+
"tokenized_hf": false,
|
| 794 |
+
"tokenized_pad_token": "pad",
|
| 795 |
+
"elf_conditional_hf": false,
|
| 796 |
+
"record_pad_truncate": false,
|
| 797 |
+
"record_add_eos": false,
|
| 798 |
+
"record_add_special_tokens": false,
|
| 799 |
+
"record_pad_token": "pad",
|
| 800 |
+
"record_shuffle_buffer": 10000,
|
| 801 |
+
"wrap": true,
|
| 802 |
+
"wrap_mode": "stream",
|
| 803 |
+
"wrap_record_buffer_size": 200,
|
| 804 |
+
"owt_cached_chunks": true,
|
| 805 |
+
"owt_chunk_cache_dir": "/e2e-data/evad-tech-vla/wanghan58/data/small_benchmarks/langflow_2604_11748/openwebtext_lta_cached_chunks/t5_len1024_train8_compact_overfit",
|
| 806 |
+
"owt_chunk_cache_rebuild": false,
|
| 807 |
+
"owt_chunk_cache_write_batch": 4096,
|
| 808 |
+
"owt_exact_repeat_per_chunk": 64,
|
| 809 |
+
"online_chunk_shuffle": false,
|
| 810 |
+
"online_chunk_shuffle_buffer": 10000,
|
| 811 |
+
"openwebtext_split": "train_minus_100k",
|
| 812 |
+
"detokenizer": "auto",
|
| 813 |
+
"resolved_detokenizer": null,
|
| 814 |
+
"num_workers": 0,
|
| 815 |
+
"latest_every": 1000,
|
| 816 |
+
"resume_path": "runs/train8_ctx1024_t5tok_p35_rand0_3_unif0_0p25_outwdm1_t5tok_ctx1024_k03_20260518_022728/latest.pt"
|
| 817 |
+
}
|
| 818 |
+
step=3100 epoch=3100/4000 epoch_step=1/1 micro_steps=3100 elapsed=19.8s lr=2.000000e-03 loss=0.2771 loss_recon=0.2771 loss_meanflow=0.0000 mean_model_t=0.4971 mean_corrupt_t=0.4971 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.3513 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.9082 corrupt_frac=1.0000 acc_corrupt=0.9082 loss_corrupt=0.2771 wrong_frac=0.5028 init_acc_corrupt=0.4959 acc_corrupt_t_0p0_0p2=0.5578 corrupt_frac_t_0p0_0p2=0.2071 acc_corrupt_t_0p2_0p4=0.9991 corrupt_frac_t_0p2_0p4=0.1988 acc_corrupt_t_0p4_0p6=0.9999 corrupt_frac_t_0p4_0p6=0.1976 acc_corrupt_t_0p6_0p8=1.0000 corrupt_frac_t_0p6_0p8=0.1951 acc_corrupt_t_0p8_1p0=1.0000 corrupt_frac_t_0p8_1p0=0.2015 out_w_norm=12.9438 out_g_norm=0.4133 loss_all=0.1770 init_gold_top10=0.6054 init_gold_top100=0.6969 rollout_applied_pos_frac=0.3516 init_acc_rollout_applied=0.5426 init_acc_rollout_kept=0.4393 logit_acc_rollout_applied=0.9321 logit_acc_rollout_kept=0.9395
|
| 819 |
+
step=3200 epoch=3200/4000 epoch_step=1/1 micro_steps=3200 elapsed=19.0s lr=2.000000e-03 loss=0.2579 loss_recon=0.2579 loss_meanflow=0.0000 mean_model_t=0.5015 mean_corrupt_t=0.5015 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.3505 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.9130 corrupt_frac=1.0000 acc_corrupt=0.9130 loss_corrupt=0.2579 wrong_frac=0.4984 init_acc_corrupt=0.5023 acc_corrupt_t_0p0_0p2=0.5668 corrupt_frac_t_0p0_0p2=0.2005 acc_corrupt_t_0p2_0p4=0.9993 corrupt_frac_t_0p2_0p4=0.1979 acc_corrupt_t_0p4_0p6=0.9999 corrupt_frac_t_0p4_0p6=0.1994 acc_corrupt_t_0p6_0p8=1.0000 corrupt_frac_t_0p6_0p8=0.2007 acc_corrupt_t_0p8_1p0=1.0000 corrupt_frac_t_0p8_1p0=0.2015 out_w_norm=12.8972 out_g_norm=0.4063 loss_all=0.3037 init_gold_top10=0.6150 init_gold_top100=0.7113 rollout_applied_pos_frac=0.3594 init_acc_rollout_applied=0.5494 init_acc_rollout_kept=0.4824 logit_acc_rollout_applied=0.8931 logit_acc_rollout_kept=0.8928
|
| 820 |
+
step=3300 epoch=3300/4000 epoch_step=1/1 micro_steps=3300 elapsed=19.0s lr=2.000000e-03 loss=0.2556 loss_recon=0.2556 loss_meanflow=0.0000 mean_model_t=0.5015 mean_corrupt_t=0.5015 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.3535 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.9136 corrupt_frac=1.0000 acc_corrupt=0.9136 loss_corrupt=0.2556 wrong_frac=0.4985 init_acc_corrupt=0.5011 acc_corrupt_t_0p0_0p2=0.5569 corrupt_frac_t_0p0_0p2=0.1947 acc_corrupt_t_0p2_0p4=0.9994 corrupt_frac_t_0p2_0p4=0.2030 acc_corrupt_t_0p4_0p6=0.9999 corrupt_frac_t_0p4_0p6=0.1991 acc_corrupt_t_0p6_0p8=1.0000 corrupt_frac_t_0p6_0p8=0.2045 acc_corrupt_t_0p8_1p0=1.0000 corrupt_frac_t_0p8_1p0=0.1988 out_w_norm=12.8626 out_g_norm=0.3628 loss_all=0.2428 init_gold_top10=0.6262 init_gold_top100=0.7122 rollout_applied_pos_frac=0.4062 init_acc_rollout_applied=0.5953 init_acc_rollout_kept=0.4471 logit_acc_rollout_applied=0.8962 logit_acc_rollout_kept=0.9286
|
| 821 |
+
step=3400 epoch=3400/4000 epoch_step=1/1 micro_steps=3400 elapsed=19.0s lr=2.000000e-03 loss=0.2476 loss_recon=0.2476 loss_meanflow=0.0000 mean_model_t=0.4986 mean_corrupt_t=0.4986 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.3515 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.9170 corrupt_frac=1.0000 acc_corrupt=0.9170 loss_corrupt=0.2476 wrong_frac=0.5016 init_acc_corrupt=0.4994 acc_corrupt_t_0p0_0p2=0.5868 corrupt_frac_t_0p0_0p2=0.2005 acc_corrupt_t_0p2_0p4=0.9994 corrupt_frac_t_0p2_0p4=0.2001 acc_corrupt_t_0p4_0p6=0.9999 corrupt_frac_t_0p4_0p6=0.2023 acc_corrupt_t_0p6_0p8=1.0000 corrupt_frac_t_0p6_0p8=0.1992 acc_corrupt_t_0p8_1p0=1.0000 corrupt_frac_t_0p8_1p0=0.1979 out_w_norm=12.8240 out_g_norm=0.3630 loss_all=0.2311 init_gold_top10=0.6073 init_gold_top100=0.7086 rollout_applied_pos_frac=0.3281 init_acc_rollout_applied=0.6076 init_acc_rollout_kept=0.4320 logit_acc_rollout_applied=0.8999 logit_acc_rollout_kept=0.9337
|
| 822 |
+
step=3500 epoch=3500/4000 epoch_step=1/1 micro_steps=3500 elapsed=18.9s lr=2.000000e-03 loss=0.2571 loss_recon=0.2571 loss_meanflow=0.0000 mean_model_t=0.4977 mean_corrupt_t=0.4977 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.3470 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.9129 corrupt_frac=1.0000 acc_corrupt=0.9129 loss_corrupt=0.2571 wrong_frac=0.5023 init_acc_corrupt=0.4981 acc_corrupt_t_0p0_0p2=0.5775 corrupt_frac_t_0p0_0p2=0.2059 acc_corrupt_t_0p2_0p4=0.9995 corrupt_frac_t_0p2_0p4=0.1985 acc_corrupt_t_0p4_0p6=0.9999 corrupt_frac_t_0p4_0p6=0.1969 acc_corrupt_t_0p6_0p8=1.0000 corrupt_frac_t_0p6_0p8=0.1927 acc_corrupt_t_0p8_1p0=1.0000 corrupt_frac_t_0p8_1p0=0.2059 out_w_norm=12.7945 out_g_norm=0.3526 loss_all=0.2127 init_gold_top10=0.5645 init_gold_top100=0.6631 rollout_applied_pos_frac=0.3438 init_acc_rollout_applied=0.5340 init_acc_rollout_kept=0.4433 logit_acc_rollout_applied=0.9209 logit_acc_rollout_kept=0.9339
|
| 823 |
+
step=3600 epoch=3600/4000 epoch_step=1/1 micro_steps=3600 elapsed=19.1s lr=2.000000e-03 loss=0.2311 loss_recon=0.2311 loss_meanflow=0.0000 mean_model_t=0.5012 mean_corrupt_t=0.5012 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.3562 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.9220 corrupt_frac=1.0000 acc_corrupt=0.9220 loss_corrupt=0.2311 wrong_frac=0.4987 init_acc_corrupt=0.5040 acc_corrupt_t_0p0_0p2=0.6049 corrupt_frac_t_0p0_0p2=0.1972 acc_corrupt_t_0p2_0p4=0.9995 corrupt_frac_t_0p2_0p4=0.2024 acc_corrupt_t_0p4_0p6=0.9999 corrupt_frac_t_0p4_0p6=0.1990 acc_corrupt_t_0p6_0p8=1.0000 corrupt_frac_t_0p6_0p8=0.2017 acc_corrupt_t_0p8_1p0=1.0000 corrupt_frac_t_0p8_1p0=0.1997 out_w_norm=12.7752 out_g_norm=0.3081 loss_all=0.1592 init_gold_top10=0.6825 init_gold_top100=0.7468 rollout_applied_pos_frac=0.3438 init_acc_rollout_applied=0.5806 init_acc_rollout_kept=0.5727 logit_acc_rollout_applied=0.9003 logit_acc_rollout_kept=0.9709
|
| 824 |
+
step=3700 epoch=3700/4000 epoch_step=1/1 micro_steps=3700 elapsed=18.9s lr=2.000000e-03 loss=0.2345 loss_recon=0.2345 loss_meanflow=0.0000 mean_model_t=0.5008 mean_corrupt_t=0.5008 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.3412 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.9207 corrupt_frac=1.0000 acc_corrupt=0.9207 loss_corrupt=0.2345 wrong_frac=0.4994 init_acc_corrupt=0.5010 acc_corrupt_t_0p0_0p2=0.6098 corrupt_frac_t_0p0_0p2=0.2030 acc_corrupt_t_0p2_0p4=0.9995 corrupt_frac_t_0p2_0p4=0.1915 acc_corrupt_t_0p4_0p6=0.9999 corrupt_frac_t_0p4_0p6=0.2023 acc_corrupt_t_0p6_0p8=1.0000 corrupt_frac_t_0p6_0p8=0.2001 acc_corrupt_t_0p8_1p0=1.0000 corrupt_frac_t_0p8_1p0=0.2032 out_w_norm=12.7645 out_g_norm=0.2989 loss_all=0.3539 init_gold_top10=0.6224 init_gold_top100=0.7245 rollout_applied_pos_frac=0.3828 init_acc_rollout_applied=0.5012 init_acc_rollout_kept=0.5121 logit_acc_rollout_applied=0.8579 logit_acc_rollout_kept=0.8977
|
| 825 |
+
step=3800 epoch=3800/4000 epoch_step=1/1 micro_steps=3800 elapsed=19.0s lr=2.000000e-03 loss=0.2347 loss_recon=0.2347 loss_meanflow=0.0000 mean_model_t=0.4962 mean_corrupt_t=0.4962 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.3430 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.9189 corrupt_frac=1.0000 acc_corrupt=0.9189 loss_corrupt=0.2347 wrong_frac=0.5037 init_acc_corrupt=0.4948 acc_corrupt_t_0p0_0p2=0.6001 corrupt_frac_t_0p0_0p2=0.2026 acc_corrupt_t_0p2_0p4=0.9995 corrupt_frac_t_0p2_0p4=0.2020 acc_corrupt_t_0p4_0p6=0.9999 corrupt_frac_t_0p4_0p6=0.2005 acc_corrupt_t_0p6_0p8=1.0000 corrupt_frac_t_0p6_0p8=0.1965 acc_corrupt_t_0p8_1p0=1.0000 corrupt_frac_t_0p8_1p0=0.1984 out_w_norm=12.7526 out_g_norm=0.3031 loss_all=0.2269 init_gold_top10=0.5784 init_gold_top100=0.6934 rollout_applied_pos_frac=0.3047 init_acc_rollout_applied=0.4589 init_acc_rollout_kept=0.4622 logit_acc_rollout_applied=0.9234 logit_acc_rollout_kept=0.9226
|
| 826 |
+
step=3900 epoch=3900/4000 epoch_step=1/1 micro_steps=3900 elapsed=19.1s lr=2.000000e-03 loss=0.2202 loss_recon=0.2202 loss_meanflow=0.0000 mean_model_t=0.5040 mean_corrupt_t=0.5040 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.3546 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.9242 corrupt_frac=1.0000 acc_corrupt=0.9242 loss_corrupt=0.2202 wrong_frac=0.4960 init_acc_corrupt=0.5074 acc_corrupt_t_0p0_0p2=0.6082 corrupt_frac_t_0p0_0p2=0.1931 acc_corrupt_t_0p2_0p4=0.9993 corrupt_frac_t_0p2_0p4=0.2013 acc_corrupt_t_0p4_0p6=0.9999 corrupt_frac_t_0p4_0p6=0.2019 acc_corrupt_t_0p6_0p8=1.0000 corrupt_frac_t_0p6_0p8=0.1981 acc_corrupt_t_0p8_1p0=1.0000 corrupt_frac_t_0p8_1p0=0.2056 out_w_norm=12.7499 out_g_norm=0.3059 loss_all=0.1200 init_gold_top10=0.6612 init_gold_top100=0.7375 rollout_applied_pos_frac=0.3125 init_acc_rollout_applied=0.5376 init_acc_rollout_kept=0.5155 logit_acc_rollout_applied=0.9567 logit_acc_rollout_kept=0.9620
|
| 827 |
+
step=4000 epoch=4000/4000 epoch_step=1/1 micro_steps=4000 elapsed=19.0s lr=2.000000e-03 loss=0.2281 loss_recon=0.2281 loss_meanflow=0.0000 mean_model_t=0.5008 mean_corrupt_t=0.5008 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.3534 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.9216 corrupt_frac=1.0000 acc_corrupt=0.9216 loss_corrupt=0.2281 wrong_frac=0.4993 init_acc_corrupt=0.5014 acc_corrupt_t_0p0_0p2=0.6154 corrupt_frac_t_0p0_0p2=0.2034 acc_corrupt_t_0p2_0p4=0.9995 corrupt_frac_t_0p2_0p4=0.1967 acc_corrupt_t_0p4_0p6=0.9999 corrupt_frac_t_0p4_0p6=0.2010 acc_corrupt_t_0p6_0p8=1.0000 corrupt_frac_t_0p6_0p8=0.1933 acc_corrupt_t_0p8_1p0=1.0000 corrupt_frac_t_0p8_1p0=0.2055 out_w_norm=12.7635 out_g_norm=0.2545 loss_all=0.3671 init_gold_top10=0.6215 init_gold_top100=0.7234 rollout_applied_pos_frac=0.4141 init_acc_rollout_applied=0.6094 init_acc_rollout_kept=0.4381 logit_acc_rollout_applied=0.8936 logit_acc_rollout_kept=0.8557
|
| 828 |
+
NCCL version 2.25.1+cuda12.8
|
| 829 |
+
resumed_from=runs/train8_ctx1024_t5tok_p35_rand0_3_unif0_0p25_outwdm1_t5tok_ctx1024_k03_20260518_022728/latest.pt start_step=4001
|
| 830 |
+
{
|
| 831 |
+
"device": "cuda:0",
|
| 832 |
+
"rank": 0,
|
| 833 |
+
"world_size": 4,
|
| 834 |
+
"samples": "owt_cached_chunks:8",
|
| 835 |
+
"vocab_size": 2423,
|
| 836 |
+
"tokenizer_vocab_size": 32100,
|
| 837 |
+
"save_dir": "runs/train8_ctx1024_t5tok_p35_rand0_3_unif0_0p25_outwdm1_t5tok_ctx1024_k03_20260518_022728",
|
| 838 |
+
"batch_size": 128,
|
| 839 |
+
"grad_accum": 1,
|
| 840 |
+
"effective_batch_size": 512,
|
| 841 |
+
"global_batch_size": 512,
|
| 842 |
+
"lr_schedule": "constant_warmup",
|
| 843 |
+
"optimizer": "muon",
|
| 844 |
+
"epochs": 0.0,
|
| 845 |
+
"steps_per_epoch": 1,
|
| 846 |
+
"total_steps": 5000,
|
| 847 |
+
"warmup_steps": 10,
|
| 848 |
+
"warmup_epochs": -1.0,
|
| 849 |
+
"min_lr": 0.0,
|
| 850 |
+
"weight_decay": 0.1,
|
| 851 |
+
"output_weight_decay": -1.0,
|
| 852 |
+
"adamw_param_groups": "nanogpt",
|
| 853 |
+
"adam_beta1": 0.9,
|
| 854 |
+
"adam_beta2": 0.95,
|
| 855 |
+
"adam_eps": 1e-08,
|
| 856 |
+
"muon_impl": "legacy",
|
| 857 |
+
"muon_momentum": 0.95,
|
| 858 |
+
"muon_ns_steps": 5,
|
| 859 |
+
"muon_update_scale": 1.0,
|
| 860 |
+
"muon_nesterov": false,
|
| 861 |
+
"muon_width_scale": false,
|
| 862 |
+
"muon_grouping": "legacy_dim_ge_2",
|
| 863 |
+
"muon_param_count": 2523776,
|
| 864 |
+
"muon_adam_param_count": 8192,
|
| 865 |
+
"muon_param_names": [
|
| 866 |
+
"vocab_embed.embedding",
|
| 867 |
+
"sigma_map.net.0.weight",
|
| 868 |
+
"sigma_map.net.2.weight",
|
| 869 |
+
"blocks.0.attn_qkv.weight",
|
| 870 |
+
"blocks.0.attn_out.weight",
|
| 871 |
+
"blocks.0.mlp.0.weight",
|
| 872 |
+
"blocks.0.mlp.2.weight",
|
| 873 |
+
"blocks.0.adaLN_modulation.weight",
|
| 874 |
+
"blocks.1.attn_qkv.weight",
|
| 875 |
+
"blocks.1.attn_out.weight",
|
| 876 |
+
"blocks.1.mlp.0.weight",
|
| 877 |
+
"blocks.1.mlp.2.weight",
|
| 878 |
+
"blocks.1.adaLN_modulation.weight",
|
| 879 |
+
"blocks.2.attn_qkv.weight",
|
| 880 |
+
"blocks.2.attn_out.weight",
|
| 881 |
+
"blocks.2.mlp.0.weight",
|
| 882 |
+
"blocks.2.mlp.2.weight",
|
| 883 |
+
"blocks.2.adaLN_modulation.weight",
|
| 884 |
+
"output_layer.linear.weight",
|
| 885 |
+
"output_layer.adaLN_modulation.weight"
|
| 886 |
+
],
|
| 887 |
+
"muon_adam_param_names": [
|
| 888 |
+
"sigma_map.net.0.bias",
|
| 889 |
+
"sigma_map.net.2.bias",
|
| 890 |
+
"blocks.0.norm1.weight",
|
| 891 |
+
"blocks.0.norm2.weight",
|
| 892 |
+
"blocks.0.mlp.0.bias",
|
| 893 |
+
"blocks.0.mlp.2.bias",
|
| 894 |
+
"blocks.0.adaLN_modulation.bias",
|
| 895 |
+
"blocks.1.norm1.weight",
|
| 896 |
+
"blocks.1.norm2.weight",
|
| 897 |
+
"blocks.1.mlp.0.bias",
|
| 898 |
+
"blocks.1.mlp.2.bias",
|
| 899 |
+
"blocks.1.adaLN_modulation.bias",
|
| 900 |
+
"blocks.2.norm1.weight",
|
| 901 |
+
"blocks.2.norm2.weight",
|
| 902 |
+
"blocks.2.mlp.0.bias",
|
| 903 |
+
"blocks.2.mlp.2.bias",
|
| 904 |
+
"blocks.2.adaLN_modulation.bias",
|
| 905 |
+
"output_layer.norm_final.weight",
|
| 906 |
+
"output_layer.adaLN_modulation.bias"
|
| 907 |
+
],
|
| 908 |
+
"muon_effective_nesterov": false,
|
| 909 |
+
"muon_effective_width_scale": false,
|
| 910 |
+
"muon_effective_weight_decay": 0.1,
|
| 911 |
+
"muon_adam_fallback_nesterov": false,
|
| 912 |
+
"muon_adam_fallback_weight_decay": 0.1,
|
| 913 |
+
"ema_decay": 0.9999,
|
| 914 |
+
"ema_start_step": 0,
|
| 915 |
+
"model_type": "ddit",
|
| 916 |
+
"ddit_mlp_type": "gelu",
|
| 917 |
+
"elf_num_time_tokens": 4,
|
| 918 |
+
"elf_num_model_mode_tokens": 0,
|
| 919 |
+
"qk_norm": true,
|
| 920 |
+
"output_bias": false,
|
| 921 |
+
"output_init_std": -1.0,
|
| 922 |
+
"norm_type": "rmsnorm",
|
| 923 |
+
"target_loss": "hard_ce",
|
| 924 |
+
"linear_soft_target_power": 1.0,
|
| 925 |
+
"linear_soft_target_min_conf": 0.0,
|
| 926 |
+
"linear_soft_target_max_conf": 1.0,
|
| 927 |
+
"t_sampling_mode": "uniform",
|
| 928 |
+
"t_sampling_power": 1.0,
|
| 929 |
+
"t_sampling_eps": 0.0001,
|
| 930 |
+
"t_sampling_logit_mean": -1.5,
|
| 931 |
+
"t_sampling_logit_std": 0.8,
|
| 932 |
+
"dual_t": true,
|
| 933 |
+
"corrupt_t_mode": "same",
|
| 934 |
+
"corrupt_min_t": 0.0,
|
| 935 |
+
"corrupt_max_t": 1.0,
|
| 936 |
+
"prefix_block_prob": 0.0,
|
| 937 |
+
"prefix_block_len": 128,
|
| 938 |
+
"mask_ratio_floor_schedule": "none",
|
| 939 |
+
"dirichlet_endpoint_mode": "categorical_dual_t",
|
| 940 |
+
"dirichlet_semantic_t_mode": "same",
|
| 941 |
+
"dirichlet_semantic_t_value": 0.0,
|
| 942 |
+
"dirichlet_semantic_t_curve": "linear",
|
| 943 |
+
"dirichlet_semantic_t_power": 1.0,
|
| 944 |
+
"endpoint_sequence_random_prob_alpha": 0.0,
|
| 945 |
+
"categorical_wrong_from_full_vocab": true,
|
| 946 |
+
"categorical_wrong_from_batch_valid_tokens": false,
|
| 947 |
+
"categorical_wrong_basin_token_ids": "",
|
| 948 |
+
"categorical_wrong_basin_prob": 0.0,
|
| 949 |
+
"categorical_wrong_unigram_prob": 0.0,
|
| 950 |
+
"categorical_wrong_uniform_prob": 0.0,
|
| 951 |
+
"categorical_wrong_prob_floor": 0.0,
|
| 952 |
+
"categorical_wrong_corpus_unigram_path": "",
|
| 953 |
+
"categorical_wrong_corpus_unigram_alpha": 1.0,
|
| 954 |
+
"categorical_wrong_basin_shared_prob": 0.0,
|
| 955 |
+
"categorical_wrong_unigram_shared_prob": 0.0,
|
| 956 |
+
"mask_mixture_original_prob": 0.0,
|
| 957 |
+
"mask_mixture_lowk_prob": 0.0,
|
| 958 |
+
"mask_mixture_lowcorrupt_prob": 0.0,
|
| 959 |
+
"mask_mixture_block_prob": 0.0,
|
| 960 |
+
"mask_mixture_all_prob": 1.0,
|
| 961 |
+
"mask_mixture_lowk_clean_tokens": "0",
|
| 962 |
+
"mask_mixture_lowcorrupt_tokens": "1,2,4,8,16,32,64",
|
| 963 |
+
"mask_mixture_block_tokens": "64,128",
|
| 964 |
+
"simplex_bridge_sampler": "dirichlet",
|
| 965 |
+
"logistic_normal_sigma_min": 0.1,
|
| 966 |
+
"logistic_normal_sigma_max": 1.0,
|
| 967 |
+
"logistic_normal_tau_min": 1.0,
|
| 968 |
+
"logistic_normal_tau_max": 1.0,
|
| 969 |
+
"torch_compile": false,
|
| 970 |
+
"compile_mode": "max-autotune",
|
| 971 |
+
"state_format": "prob",
|
| 972 |
+
"meanflow_weight": 0.0,
|
| 973 |
+
"rollout_train_prob": 0.35,
|
| 974 |
+
"rollout_train_steps": 3,
|
| 975 |
+
"rollout_train_steps_min": 0,
|
| 976 |
+
"rollout_train_infer_steps": 1,
|
| 977 |
+
"rollout_train_time_mode": "sampled_path",
|
| 978 |
+
"rollout_train_s_dist": "uniform",
|
| 979 |
+
"rollout_train_s_min_frac": 0.0,
|
| 980 |
+
"rollout_train_s_max_frac": 0.25,
|
| 981 |
+
"rollout_train_s_beta_alpha": 2.0,
|
| 982 |
+
"rollout_train_s_beta_beta": 6.0,
|
| 983 |
+
"rollout_train_temp": 1.0,
|
| 984 |
+
"rollout_train_max_gamma": 1.0,
|
| 985 |
+
"rollout_train_corrupt_only": true,
|
| 986 |
+
"rollout_train_samplewise": true,
|
| 987 |
+
"rollout_train_compute_always": false,
|
| 988 |
+
"rollout_train_sync_t": true,
|
| 989 |
+
"bridge_noise_init": "logistic_normal",
|
| 990 |
+
"noise_sigma": -1.0,
|
| 991 |
+
"allow_tf32": true,
|
| 992 |
+
"activation_checkpointing": false,
|
| 993 |
+
"activation_checkpoint_interval": 1,
|
| 994 |
+
"activation_checkpoint_scope": "block",
|
| 995 |
+
"ddp_static_graph": false,
|
| 996 |
+
"ddp_gradient_as_bucket_view": true,
|
| 997 |
+
"blocking_data_transfer": false,
|
| 998 |
+
"dataloader_prefetch_factor": 4,
|
| 999 |
+
"full_train_stats": false,
|
| 1000 |
+
"tokenized_hf": false,
|
| 1001 |
+
"tokenized_pad_token": "pad",
|
| 1002 |
+
"elf_conditional_hf": false,
|
| 1003 |
+
"record_pad_truncate": false,
|
| 1004 |
+
"record_add_eos": false,
|
| 1005 |
+
"record_add_special_tokens": false,
|
| 1006 |
+
"record_pad_token": "pad",
|
| 1007 |
+
"record_shuffle_buffer": 10000,
|
| 1008 |
+
"wrap": true,
|
| 1009 |
+
"wrap_mode": "stream",
|
| 1010 |
+
"wrap_record_buffer_size": 200,
|
| 1011 |
+
"owt_cached_chunks": true,
|
| 1012 |
+
"owt_chunk_cache_dir": "/e2e-data/evad-tech-vla/wanghan58/data/small_benchmarks/langflow_2604_11748/openwebtext_lta_cached_chunks/t5_len1024_train8_compact_overfit",
|
| 1013 |
+
"owt_chunk_cache_rebuild": false,
|
| 1014 |
+
"owt_chunk_cache_write_batch": 4096,
|
| 1015 |
+
"owt_exact_repeat_per_chunk": 64,
|
| 1016 |
+
"online_chunk_shuffle": false,
|
| 1017 |
+
"online_chunk_shuffle_buffer": 10000,
|
| 1018 |
+
"openwebtext_split": "train_minus_100k",
|
| 1019 |
+
"detokenizer": "auto",
|
| 1020 |
+
"resolved_detokenizer": null,
|
| 1021 |
+
"num_workers": 0,
|
| 1022 |
+
"latest_every": 1000,
|
| 1023 |
+
"resume_path": "runs/train8_ctx1024_t5tok_p35_rand0_3_unif0_0p25_outwdm1_t5tok_ctx1024_k03_20260518_022728/latest.pt"
|
| 1024 |
+
}
|
| 1025 |
+
step=4100 epoch=4100/5000 epoch_step=1/1 micro_steps=4100 elapsed=19.8s lr=2.000000e-03 loss=0.2388 loss_recon=0.2388 loss_meanflow=0.0000 mean_model_t=0.4971 mean_corrupt_t=0.4971 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.3513 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.9192 corrupt_frac=1.0000 acc_corrupt=0.9192 loss_corrupt=0.2388 wrong_frac=0.5028 init_acc_corrupt=0.4964 acc_corrupt_t_0p0_0p2=0.6102 corrupt_frac_t_0p0_0p2=0.2071 acc_corrupt_t_0p2_0p4=0.9996 corrupt_frac_t_0p2_0p4=0.1988 acc_corrupt_t_0p4_0p6=0.9999 corrupt_frac_t_0p4_0p6=0.1976 acc_corrupt_t_0p6_0p8=1.0000 corrupt_frac_t_0p6_0p8=0.1951 acc_corrupt_t_0p8_1p0=1.0000 corrupt_frac_t_0p8_1p0=0.2015 out_w_norm=12.7665 out_g_norm=0.2375 loss_all=0.1860 init_gold_top10=0.6049 init_gold_top100=0.6969 rollout_applied_pos_frac=0.3516 init_acc_rollout_applied=0.5464 init_acc_rollout_kept=0.4393 logit_acc_rollout_applied=0.9101 logit_acc_rollout_kept=0.9438
|
| 1026 |
+
step=4200 epoch=4200/5000 epoch_step=1/1 micro_steps=4200 elapsed=19.0s lr=2.000000e-03 loss=0.2291 loss_recon=0.2291 loss_meanflow=0.0000 mean_model_t=0.5015 mean_corrupt_t=0.5015 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.3505 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.9221 corrupt_frac=1.0000 acc_corrupt=0.9221 loss_corrupt=0.2291 wrong_frac=0.4984 init_acc_corrupt=0.5025 acc_corrupt_t_0p0_0p2=0.6120 corrupt_frac_t_0p0_0p2=0.2005 acc_corrupt_t_0p2_0p4=0.9995 corrupt_frac_t_0p2_0p4=0.1979 acc_corrupt_t_0p4_0p6=0.9999 corrupt_frac_t_0p4_0p6=0.1994 acc_corrupt_t_0p6_0p8=1.0000 corrupt_frac_t_0p6_0p8=0.2007 acc_corrupt_t_0p8_1p0=1.0000 corrupt_frac_t_0p8_1p0=0.2015 out_w_norm=12.7798 out_g_norm=0.2175 loss_all=0.2915 init_gold_top10=0.6130 init_gold_top100=0.7113 rollout_applied_pos_frac=0.3594 init_acc_rollout_applied=0.5497 init_acc_rollout_kept=0.4824 logit_acc_rollout_applied=0.8851 logit_acc_rollout_kept=0.8986
|
| 1027 |
+
step=4300 epoch=4300/5000 epoch_step=1/1 micro_steps=4300 elapsed=19.0s lr=2.000000e-03 loss=0.2208 loss_recon=0.2208 loss_meanflow=0.0000 mean_model_t=0.5015 mean_corrupt_t=0.5015 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.3535 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.9235 corrupt_frac=1.0000 acc_corrupt=0.9235 loss_corrupt=0.2208 wrong_frac=0.4985 init_acc_corrupt=0.5016 acc_corrupt_t_0p0_0p2=0.6076 corrupt_frac_t_0p0_0p2=0.1947 acc_corrupt_t_0p2_0p4=0.9997 corrupt_frac_t_0p2_0p4=0.2030 acc_corrupt_t_0p4_0p6=1.0000 corrupt_frac_t_0p4_0p6=0.1991 acc_corrupt_t_0p6_0p8=1.0000 corrupt_frac_t_0p6_0p8=0.2045 acc_corrupt_t_0p8_1p0=1.0000 corrupt_frac_t_0p8_1p0=0.1988 out_w_norm=12.7930 out_g_norm=0.2144 loss_all=0.2136 init_gold_top10=0.6264 init_gold_top100=0.7122 rollout_applied_pos_frac=0.4062 init_acc_rollout_applied=0.5958 init_acc_rollout_kept=0.4471 logit_acc_rollout_applied=0.9151 logit_acc_rollout_kept=0.9313
|
| 1028 |
+
step=4400 epoch=4400/5000 epoch_step=1/1 micro_steps=4400 elapsed=19.0s lr=2.000000e-03 loss=0.2217 loss_recon=0.2217 loss_meanflow=0.0000 mean_model_t=0.4986 mean_corrupt_t=0.4986 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.3515 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.9240 corrupt_frac=1.0000 acc_corrupt=0.9240 loss_corrupt=0.2217 wrong_frac=0.5016 init_acc_corrupt=0.4998 acc_corrupt_t_0p0_0p2=0.6214 corrupt_frac_t_0p0_0p2=0.2005 acc_corrupt_t_0p2_0p4=0.9996 corrupt_frac_t_0p2_0p4=0.2001 acc_corrupt_t_0p4_0p6=0.9999 corrupt_frac_t_0p4_0p6=0.2023 acc_corrupt_t_0p6_0p8=0.9999 corrupt_frac_t_0p6_0p8=0.1992 acc_corrupt_t_0p8_1p0=1.0000 corrupt_frac_t_0p8_1p0=0.1979 out_w_norm=12.8076 out_g_norm=0.2126 loss_all=0.1889 init_gold_top10=0.6095 init_gold_top100=0.7086 rollout_applied_pos_frac=0.3281 init_acc_rollout_applied=0.6095 init_acc_rollout_kept=0.4320 logit_acc_rollout_applied=0.9396 logit_acc_rollout_kept=0.9343
|
| 1029 |
+
step=4500 epoch=4500/5000 epoch_step=1/1 micro_steps=4500 elapsed=18.9s lr=2.000000e-03 loss=0.2272 loss_recon=0.2272 loss_meanflow=0.0000 mean_model_t=0.4977 mean_corrupt_t=0.4977 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.3470 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.9222 corrupt_frac=1.0000 acc_corrupt=0.9222 loss_corrupt=0.2272 wrong_frac=0.5023 init_acc_corrupt=0.4989 acc_corrupt_t_0p0_0p2=0.6227 corrupt_frac_t_0p0_0p2=0.2059 acc_corrupt_t_0p2_0p4=0.9996 corrupt_frac_t_0p2_0p4=0.1985 acc_corrupt_t_0p4_0p6=1.0000 corrupt_frac_t_0p4_0p6=0.1969 acc_corrupt_t_0p6_0p8=1.0000 corrupt_frac_t_0p6_0p8=0.1927 acc_corrupt_t_0p8_1p0=1.0000 corrupt_frac_t_0p8_1p0=0.2059 out_w_norm=12.8130 out_g_norm=0.1987 loss_all=0.1815 init_gold_top10=0.5693 init_gold_top100=0.6631 rollout_applied_pos_frac=0.3438 init_acc_rollout_applied=0.5343 init_acc_rollout_kept=0.4433 logit_acc_rollout_applied=0.9338 logit_acc_rollout_kept=0.9350
|
| 1030 |
+
step=4600 epoch=4600/5000 epoch_step=1/1 micro_steps=4600 elapsed=19.1s lr=2.000000e-03 loss=0.2009 loss_recon=0.2009 loss_meanflow=0.0000 mean_model_t=0.5012 mean_corrupt_t=0.5012 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.3562 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.9311 corrupt_frac=1.0000 acc_corrupt=0.9311 loss_corrupt=0.2009 wrong_frac=0.4987 init_acc_corrupt=0.5041 acc_corrupt_t_0p0_0p2=0.6509 corrupt_frac_t_0p0_0p2=0.1972 acc_corrupt_t_0p2_0p4=0.9998 corrupt_frac_t_0p2_0p4=0.2024 acc_corrupt_t_0p4_0p6=1.0000 corrupt_frac_t_0p4_0p6=0.1990 acc_corrupt_t_0p6_0p8=1.0000 corrupt_frac_t_0p6_0p8=0.2017 acc_corrupt_t_0p8_1p0=1.0000 corrupt_frac_t_0p8_1p0=0.1997 out_w_norm=12.8104 out_g_norm=0.1711 loss_all=0.1198 init_gold_top10=0.6808 init_gold_top100=0.7468 rollout_applied_pos_frac=0.3438 init_acc_rollout_applied=0.5819 init_acc_rollout_kept=0.5727 logit_acc_rollout_applied=0.9010 logit_acc_rollout_kept=0.9871
|
| 1031 |
+
step=4700 epoch=4700/5000 epoch_step=1/1 micro_steps=4700 elapsed=18.8s lr=2.000000e-03 loss=0.2080 loss_recon=0.2080 loss_meanflow=0.0000 mean_model_t=0.5008 mean_corrupt_t=0.5008 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.3412 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.9287 corrupt_frac=1.0000 acc_corrupt=0.9287 loss_corrupt=0.2080 wrong_frac=0.4994 init_acc_corrupt=0.5013 acc_corrupt_t_0p0_0p2=0.6493 corrupt_frac_t_0p0_0p2=0.2030 acc_corrupt_t_0p2_0p4=0.9995 corrupt_frac_t_0p2_0p4=0.1915 acc_corrupt_t_0p4_0p6=0.9999 corrupt_frac_t_0p4_0p6=0.2023 acc_corrupt_t_0p6_0p8=1.0000 corrupt_frac_t_0p6_0p8=0.2001 acc_corrupt_t_0p8_1p0=1.0000 corrupt_frac_t_0p8_1p0=0.2032 out_w_norm=12.8228 out_g_norm=0.1683 loss_all=0.2800 init_gold_top10=0.6261 init_gold_top100=0.7245 rollout_applied_pos_frac=0.3828 init_acc_rollout_applied=0.5064 init_acc_rollout_kept=0.5121 logit_acc_rollout_applied=0.8821 logit_acc_rollout_kept=0.9122
|
| 1032 |
+
step=4800 epoch=4800/5000 epoch_step=1/1 micro_steps=4800 elapsed=19.0s lr=2.000000e-03 loss=0.2172 loss_recon=0.2172 loss_meanflow=0.0000 mean_model_t=0.4962 mean_corrupt_t=0.4962 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.3430 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.9244 corrupt_frac=1.0000 acc_corrupt=0.9244 loss_corrupt=0.2172 wrong_frac=0.5037 init_acc_corrupt=0.4952 acc_corrupt_t_0p0_0p2=0.6273 corrupt_frac_t_0p0_0p2=0.2026 acc_corrupt_t_0p2_0p4=0.9996 corrupt_frac_t_0p2_0p4=0.2020 acc_corrupt_t_0p4_0p6=0.9998 corrupt_frac_t_0p4_0p6=0.2005 acc_corrupt_t_0p6_0p8=1.0000 corrupt_frac_t_0p6_0p8=0.1965 acc_corrupt_t_0p8_1p0=1.0000 corrupt_frac_t_0p8_1p0=0.1984 out_w_norm=12.8338 out_g_norm=0.1678 loss_all=0.2473 init_gold_top10=0.5767 init_gold_top100=0.6934 rollout_applied_pos_frac=0.3047 init_acc_rollout_applied=0.4607 init_acc_rollout_kept=0.4622 logit_acc_rollout_applied=0.9255 logit_acc_rollout_kept=0.9273
|
| 1033 |
+
step=4900 epoch=4900/5000 epoch_step=1/1 micro_steps=4900 elapsed=19.1s lr=2.000000e-03 loss=0.2005 loss_recon=0.2005 loss_meanflow=0.0000 mean_model_t=0.5040 mean_corrupt_t=0.5040 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.3546 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.9313 corrupt_frac=1.0000 acc_corrupt=0.9313 loss_corrupt=0.2005 wrong_frac=0.4960 init_acc_corrupt=0.5079 acc_corrupt_t_0p0_0p2=0.6448 corrupt_frac_t_0p0_0p2=0.1931 acc_corrupt_t_0p2_0p4=0.9996 corrupt_frac_t_0p2_0p4=0.2013 acc_corrupt_t_0p4_0p6=0.9999 corrupt_frac_t_0p4_0p6=0.2019 acc_corrupt_t_0p6_0p8=0.9999 corrupt_frac_t_0p6_0p8=0.1981 acc_corrupt_t_0p8_1p0=1.0000 corrupt_frac_t_0p8_1p0=0.2056 out_w_norm=12.8373 out_g_norm=0.1631 loss_all=0.0820 init_gold_top10=0.6659 init_gold_top100=0.7375 rollout_applied_pos_frac=0.3125 init_acc_rollout_applied=0.5385 init_acc_rollout_kept=0.5155 logit_acc_rollout_applied=0.9767 logit_acc_rollout_kept=0.9712
|
| 1034 |
+
step=5000 epoch=5000/5000 epoch_step=1/1 micro_steps=5000 elapsed=19.0s lr=2.000000e-03 loss=0.2097 loss_recon=0.2097 loss_meanflow=0.0000 mean_model_t=0.5008 mean_corrupt_t=0.5008 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.3534 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.9280 corrupt_frac=1.0000 acc_corrupt=0.9280 loss_corrupt=0.2097 wrong_frac=0.4993 init_acc_corrupt=0.5014 acc_corrupt_t_0p0_0p2=0.6470 corrupt_frac_t_0p0_0p2=0.2034 acc_corrupt_t_0p2_0p4=0.9993 corrupt_frac_t_0p2_0p4=0.1967 acc_corrupt_t_0p4_0p6=0.9999 corrupt_frac_t_0p4_0p6=0.2010 acc_corrupt_t_0p6_0p8=1.0000 corrupt_frac_t_0p6_0p8=0.1933 acc_corrupt_t_0p8_1p0=1.0000 corrupt_frac_t_0p8_1p0=0.2055 out_w_norm=12.8513 out_g_norm=0.1607 loss_all=0.3370 init_gold_top10=0.6228 init_gold_top100=0.7234 rollout_applied_pos_frac=0.4141 init_acc_rollout_applied=0.6155 init_acc_rollout_kept=0.4381 logit_acc_rollout_applied=0.9016 logit_acc_rollout_kept=0.8619
|
LTA_openwebtext_dualt/logs/softendpoint_mn_pilot_4gpu/train8_ctx1024_t5tok_p50_rand0_4_unif0_0p25_outwdm1_t5tok_ctx1024_randk_20260518_014620.log
ADDED
|
@@ -0,0 +1,196 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
NCCL version 2.25.1+cuda12.8
|
| 2 |
+
{
|
| 3 |
+
"device": "cuda:0",
|
| 4 |
+
"rank": 0,
|
| 5 |
+
"world_size": 4,
|
| 6 |
+
"samples": "owt_cached_chunks:8",
|
| 7 |
+
"vocab_size": 2423,
|
| 8 |
+
"tokenizer_vocab_size": 32100,
|
| 9 |
+
"save_dir": "runs/train8_ctx1024_t5tok_p50_rand0_4_unif0_0p25_outwdm1_t5tok_ctx1024_randk_20260518_014620",
|
| 10 |
+
"batch_size": 128,
|
| 11 |
+
"grad_accum": 1,
|
| 12 |
+
"effective_batch_size": 512,
|
| 13 |
+
"global_batch_size": 512,
|
| 14 |
+
"lr_schedule": "constant_warmup",
|
| 15 |
+
"optimizer": "muon",
|
| 16 |
+
"epochs": 0.0,
|
| 17 |
+
"steps_per_epoch": 1,
|
| 18 |
+
"total_steps": 1000,
|
| 19 |
+
"warmup_steps": 10,
|
| 20 |
+
"warmup_epochs": -1.0,
|
| 21 |
+
"min_lr": 0.0,
|
| 22 |
+
"weight_decay": 0.1,
|
| 23 |
+
"output_weight_decay": -1.0,
|
| 24 |
+
"adamw_param_groups": "nanogpt",
|
| 25 |
+
"adam_beta1": 0.9,
|
| 26 |
+
"adam_beta2": 0.95,
|
| 27 |
+
"adam_eps": 1e-08,
|
| 28 |
+
"muon_impl": "legacy",
|
| 29 |
+
"muon_momentum": 0.95,
|
| 30 |
+
"muon_ns_steps": 5,
|
| 31 |
+
"muon_update_scale": 1.0,
|
| 32 |
+
"muon_nesterov": false,
|
| 33 |
+
"muon_width_scale": false,
|
| 34 |
+
"muon_grouping": "legacy_dim_ge_2",
|
| 35 |
+
"muon_param_count": 2523776,
|
| 36 |
+
"muon_adam_param_count": 8192,
|
| 37 |
+
"muon_param_names": [
|
| 38 |
+
"vocab_embed.embedding",
|
| 39 |
+
"sigma_map.net.0.weight",
|
| 40 |
+
"sigma_map.net.2.weight",
|
| 41 |
+
"blocks.0.attn_qkv.weight",
|
| 42 |
+
"blocks.0.attn_out.weight",
|
| 43 |
+
"blocks.0.mlp.0.weight",
|
| 44 |
+
"blocks.0.mlp.2.weight",
|
| 45 |
+
"blocks.0.adaLN_modulation.weight",
|
| 46 |
+
"blocks.1.attn_qkv.weight",
|
| 47 |
+
"blocks.1.attn_out.weight",
|
| 48 |
+
"blocks.1.mlp.0.weight",
|
| 49 |
+
"blocks.1.mlp.2.weight",
|
| 50 |
+
"blocks.1.adaLN_modulation.weight",
|
| 51 |
+
"blocks.2.attn_qkv.weight",
|
| 52 |
+
"blocks.2.attn_out.weight",
|
| 53 |
+
"blocks.2.mlp.0.weight",
|
| 54 |
+
"blocks.2.mlp.2.weight",
|
| 55 |
+
"blocks.2.adaLN_modulation.weight",
|
| 56 |
+
"output_layer.linear.weight",
|
| 57 |
+
"output_layer.adaLN_modulation.weight"
|
| 58 |
+
],
|
| 59 |
+
"muon_adam_param_names": [
|
| 60 |
+
"sigma_map.net.0.bias",
|
| 61 |
+
"sigma_map.net.2.bias",
|
| 62 |
+
"blocks.0.norm1.weight",
|
| 63 |
+
"blocks.0.norm2.weight",
|
| 64 |
+
"blocks.0.mlp.0.bias",
|
| 65 |
+
"blocks.0.mlp.2.bias",
|
| 66 |
+
"blocks.0.adaLN_modulation.bias",
|
| 67 |
+
"blocks.1.norm1.weight",
|
| 68 |
+
"blocks.1.norm2.weight",
|
| 69 |
+
"blocks.1.mlp.0.bias",
|
| 70 |
+
"blocks.1.mlp.2.bias",
|
| 71 |
+
"blocks.1.adaLN_modulation.bias",
|
| 72 |
+
"blocks.2.norm1.weight",
|
| 73 |
+
"blocks.2.norm2.weight",
|
| 74 |
+
"blocks.2.mlp.0.bias",
|
| 75 |
+
"blocks.2.mlp.2.bias",
|
| 76 |
+
"blocks.2.adaLN_modulation.bias",
|
| 77 |
+
"output_layer.norm_final.weight",
|
| 78 |
+
"output_layer.adaLN_modulation.bias"
|
| 79 |
+
],
|
| 80 |
+
"muon_effective_nesterov": false,
|
| 81 |
+
"muon_effective_width_scale": false,
|
| 82 |
+
"muon_effective_weight_decay": 0.1,
|
| 83 |
+
"muon_adam_fallback_nesterov": false,
|
| 84 |
+
"muon_adam_fallback_weight_decay": 0.1,
|
| 85 |
+
"ema_decay": 0.9999,
|
| 86 |
+
"ema_start_step": 0,
|
| 87 |
+
"model_type": "ddit",
|
| 88 |
+
"ddit_mlp_type": "gelu",
|
| 89 |
+
"elf_num_time_tokens": 4,
|
| 90 |
+
"elf_num_model_mode_tokens": 0,
|
| 91 |
+
"qk_norm": true,
|
| 92 |
+
"output_bias": false,
|
| 93 |
+
"output_init_std": -1.0,
|
| 94 |
+
"norm_type": "rmsnorm",
|
| 95 |
+
"target_loss": "hard_ce",
|
| 96 |
+
"linear_soft_target_power": 1.0,
|
| 97 |
+
"linear_soft_target_min_conf": 0.0,
|
| 98 |
+
"linear_soft_target_max_conf": 1.0,
|
| 99 |
+
"t_sampling_mode": "uniform",
|
| 100 |
+
"t_sampling_power": 1.0,
|
| 101 |
+
"t_sampling_eps": 0.0001,
|
| 102 |
+
"t_sampling_logit_mean": -1.5,
|
| 103 |
+
"t_sampling_logit_std": 0.8,
|
| 104 |
+
"dual_t": true,
|
| 105 |
+
"corrupt_t_mode": "same",
|
| 106 |
+
"corrupt_min_t": 0.0,
|
| 107 |
+
"corrupt_max_t": 1.0,
|
| 108 |
+
"prefix_block_prob": 0.0,
|
| 109 |
+
"prefix_block_len": 128,
|
| 110 |
+
"mask_ratio_floor_schedule": "none",
|
| 111 |
+
"dirichlet_endpoint_mode": "categorical_dual_t",
|
| 112 |
+
"dirichlet_semantic_t_mode": "same",
|
| 113 |
+
"dirichlet_semantic_t_value": 0.0,
|
| 114 |
+
"dirichlet_semantic_t_curve": "linear",
|
| 115 |
+
"dirichlet_semantic_t_power": 1.0,
|
| 116 |
+
"endpoint_sequence_random_prob_alpha": 0.0,
|
| 117 |
+
"categorical_wrong_from_full_vocab": true,
|
| 118 |
+
"categorical_wrong_from_batch_valid_tokens": false,
|
| 119 |
+
"categorical_wrong_basin_token_ids": "",
|
| 120 |
+
"categorical_wrong_basin_prob": 0.0,
|
| 121 |
+
"categorical_wrong_unigram_prob": 0.0,
|
| 122 |
+
"categorical_wrong_uniform_prob": 0.0,
|
| 123 |
+
"categorical_wrong_prob_floor": 0.0,
|
| 124 |
+
"categorical_wrong_corpus_unigram_path": "",
|
| 125 |
+
"categorical_wrong_corpus_unigram_alpha": 1.0,
|
| 126 |
+
"categorical_wrong_basin_shared_prob": 0.0,
|
| 127 |
+
"categorical_wrong_unigram_shared_prob": 0.0,
|
| 128 |
+
"mask_mixture_original_prob": 0.0,
|
| 129 |
+
"mask_mixture_lowk_prob": 0.0,
|
| 130 |
+
"mask_mixture_lowcorrupt_prob": 0.0,
|
| 131 |
+
"mask_mixture_block_prob": 0.0,
|
| 132 |
+
"mask_mixture_all_prob": 1.0,
|
| 133 |
+
"mask_mixture_lowk_clean_tokens": "0",
|
| 134 |
+
"mask_mixture_lowcorrupt_tokens": "1,2,4,8,16,32,64",
|
| 135 |
+
"mask_mixture_block_tokens": "64,128",
|
| 136 |
+
"simplex_bridge_sampler": "dirichlet",
|
| 137 |
+
"logistic_normal_sigma_min": 0.1,
|
| 138 |
+
"logistic_normal_sigma_max": 1.0,
|
| 139 |
+
"logistic_normal_tau_min": 1.0,
|
| 140 |
+
"logistic_normal_tau_max": 1.0,
|
| 141 |
+
"torch_compile": false,
|
| 142 |
+
"compile_mode": "max-autotune",
|
| 143 |
+
"state_format": "prob",
|
| 144 |
+
"meanflow_weight": 0.0,
|
| 145 |
+
"rollout_train_prob": 0.5,
|
| 146 |
+
"rollout_train_steps": 4,
|
| 147 |
+
"rollout_train_steps_min": 0,
|
| 148 |
+
"rollout_train_infer_steps": 1,
|
| 149 |
+
"rollout_train_time_mode": "sampled_path",
|
| 150 |
+
"rollout_train_s_dist": "uniform",
|
| 151 |
+
"rollout_train_s_min_frac": 0.0,
|
| 152 |
+
"rollout_train_s_max_frac": 0.25,
|
| 153 |
+
"rollout_train_s_beta_alpha": 2.0,
|
| 154 |
+
"rollout_train_s_beta_beta": 6.0,
|
| 155 |
+
"rollout_train_temp": 1.0,
|
| 156 |
+
"rollout_train_max_gamma": 1.0,
|
| 157 |
+
"rollout_train_corrupt_only": true,
|
| 158 |
+
"rollout_train_samplewise": true,
|
| 159 |
+
"rollout_train_compute_always": false,
|
| 160 |
+
"rollout_train_sync_t": true,
|
| 161 |
+
"bridge_noise_init": "logistic_normal",
|
| 162 |
+
"noise_sigma": -1.0,
|
| 163 |
+
"allow_tf32": true,
|
| 164 |
+
"activation_checkpointing": false,
|
| 165 |
+
"activation_checkpoint_interval": 1,
|
| 166 |
+
"activation_checkpoint_scope": "block",
|
| 167 |
+
"ddp_static_graph": false,
|
| 168 |
+
"ddp_gradient_as_bucket_view": true,
|
| 169 |
+
"blocking_data_transfer": false,
|
| 170 |
+
"dataloader_prefetch_factor": 4,
|
| 171 |
+
"full_train_stats": false,
|
| 172 |
+
"tokenized_hf": false,
|
| 173 |
+
"tokenized_pad_token": "pad",
|
| 174 |
+
"elf_conditional_hf": false,
|
| 175 |
+
"record_pad_truncate": false,
|
| 176 |
+
"record_add_eos": false,
|
| 177 |
+
"record_add_special_tokens": false,
|
| 178 |
+
"record_pad_token": "pad",
|
| 179 |
+
"record_shuffle_buffer": 10000,
|
| 180 |
+
"wrap": true,
|
| 181 |
+
"wrap_mode": "stream",
|
| 182 |
+
"wrap_record_buffer_size": 200,
|
| 183 |
+
"owt_cached_chunks": true,
|
| 184 |
+
"owt_chunk_cache_dir": "/e2e-data/evad-tech-vla/wanghan58/data/small_benchmarks/langflow_2604_11748/openwebtext_lta_cached_chunks/t5_len1024_train8_compact_overfit",
|
| 185 |
+
"owt_chunk_cache_rebuild": false,
|
| 186 |
+
"owt_chunk_cache_write_batch": 4096,
|
| 187 |
+
"owt_exact_repeat_per_chunk": 64,
|
| 188 |
+
"online_chunk_shuffle": false,
|
| 189 |
+
"online_chunk_shuffle_buffer": 10000,
|
| 190 |
+
"openwebtext_split": "train_minus_100k",
|
| 191 |
+
"detokenizer": "auto",
|
| 192 |
+
"resolved_detokenizer": null,
|
| 193 |
+
"num_workers": 0,
|
| 194 |
+
"latest_every": 1000,
|
| 195 |
+
"resume_path": ""
|
| 196 |
+
}
|
LTA_openwebtext_dualt/logs/softendpoint_mn_pilot_4gpu/train8_ctx1024_tradeoff_p35_unif0_0p25_outwdm1_ctx1024_tradeoff_dual_20260517_225705.log
ADDED
|
@@ -0,0 +1,1024 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
NCCL version 2.25.1+cuda12.8
|
| 2 |
+
{
|
| 3 |
+
"device": "cuda:0",
|
| 4 |
+
"rank": 0,
|
| 5 |
+
"world_size": 4,
|
| 6 |
+
"samples": "owt_cached_chunks:8",
|
| 7 |
+
"vocab_size": 2664,
|
| 8 |
+
"tokenizer_vocab_size": 50257,
|
| 9 |
+
"save_dir": "runs/train8_ctx1024_tradeoff_p35_unif0_0p25_outwdm1_ctx1024_tradeoff_dual_20260517_225705",
|
| 10 |
+
"batch_size": 128,
|
| 11 |
+
"grad_accum": 1,
|
| 12 |
+
"effective_batch_size": 512,
|
| 13 |
+
"global_batch_size": 512,
|
| 14 |
+
"lr_schedule": "constant_warmup",
|
| 15 |
+
"optimizer": "muon",
|
| 16 |
+
"epochs": 0.0,
|
| 17 |
+
"steps_per_epoch": 1,
|
| 18 |
+
"total_steps": 1000,
|
| 19 |
+
"warmup_steps": 10,
|
| 20 |
+
"warmup_epochs": -1.0,
|
| 21 |
+
"min_lr": 0.0,
|
| 22 |
+
"weight_decay": 0.1,
|
| 23 |
+
"output_weight_decay": -1.0,
|
| 24 |
+
"adamw_param_groups": "nanogpt",
|
| 25 |
+
"adam_beta1": 0.9,
|
| 26 |
+
"adam_beta2": 0.95,
|
| 27 |
+
"adam_eps": 1e-08,
|
| 28 |
+
"muon_impl": "legacy",
|
| 29 |
+
"muon_momentum": 0.95,
|
| 30 |
+
"muon_ns_steps": 5,
|
| 31 |
+
"muon_update_scale": 1.0,
|
| 32 |
+
"muon_nesterov": false,
|
| 33 |
+
"muon_width_scale": false,
|
| 34 |
+
"muon_grouping": "legacy_dim_ge_2",
|
| 35 |
+
"muon_param_count": 2616320,
|
| 36 |
+
"muon_adam_param_count": 8192,
|
| 37 |
+
"muon_param_names": [
|
| 38 |
+
"vocab_embed.embedding",
|
| 39 |
+
"sigma_map.net.0.weight",
|
| 40 |
+
"sigma_map.net.2.weight",
|
| 41 |
+
"blocks.0.attn_qkv.weight",
|
| 42 |
+
"blocks.0.attn_out.weight",
|
| 43 |
+
"blocks.0.mlp.0.weight",
|
| 44 |
+
"blocks.0.mlp.2.weight",
|
| 45 |
+
"blocks.0.adaLN_modulation.weight",
|
| 46 |
+
"blocks.1.attn_qkv.weight",
|
| 47 |
+
"blocks.1.attn_out.weight",
|
| 48 |
+
"blocks.1.mlp.0.weight",
|
| 49 |
+
"blocks.1.mlp.2.weight",
|
| 50 |
+
"blocks.1.adaLN_modulation.weight",
|
| 51 |
+
"blocks.2.attn_qkv.weight",
|
| 52 |
+
"blocks.2.attn_out.weight",
|
| 53 |
+
"blocks.2.mlp.0.weight",
|
| 54 |
+
"blocks.2.mlp.2.weight",
|
| 55 |
+
"blocks.2.adaLN_modulation.weight",
|
| 56 |
+
"output_layer.linear.weight",
|
| 57 |
+
"output_layer.adaLN_modulation.weight"
|
| 58 |
+
],
|
| 59 |
+
"muon_adam_param_names": [
|
| 60 |
+
"sigma_map.net.0.bias",
|
| 61 |
+
"sigma_map.net.2.bias",
|
| 62 |
+
"blocks.0.norm1.weight",
|
| 63 |
+
"blocks.0.norm2.weight",
|
| 64 |
+
"blocks.0.mlp.0.bias",
|
| 65 |
+
"blocks.0.mlp.2.bias",
|
| 66 |
+
"blocks.0.adaLN_modulation.bias",
|
| 67 |
+
"blocks.1.norm1.weight",
|
| 68 |
+
"blocks.1.norm2.weight",
|
| 69 |
+
"blocks.1.mlp.0.bias",
|
| 70 |
+
"blocks.1.mlp.2.bias",
|
| 71 |
+
"blocks.1.adaLN_modulation.bias",
|
| 72 |
+
"blocks.2.norm1.weight",
|
| 73 |
+
"blocks.2.norm2.weight",
|
| 74 |
+
"blocks.2.mlp.0.bias",
|
| 75 |
+
"blocks.2.mlp.2.bias",
|
| 76 |
+
"blocks.2.adaLN_modulation.bias",
|
| 77 |
+
"output_layer.norm_final.weight",
|
| 78 |
+
"output_layer.adaLN_modulation.bias"
|
| 79 |
+
],
|
| 80 |
+
"muon_effective_nesterov": false,
|
| 81 |
+
"muon_effective_width_scale": false,
|
| 82 |
+
"muon_effective_weight_decay": 0.1,
|
| 83 |
+
"muon_adam_fallback_nesterov": false,
|
| 84 |
+
"muon_adam_fallback_weight_decay": 0.1,
|
| 85 |
+
"ema_decay": 0.9999,
|
| 86 |
+
"ema_start_step": 0,
|
| 87 |
+
"model_type": "ddit",
|
| 88 |
+
"ddit_mlp_type": "gelu",
|
| 89 |
+
"elf_num_time_tokens": 4,
|
| 90 |
+
"elf_num_model_mode_tokens": 0,
|
| 91 |
+
"qk_norm": true,
|
| 92 |
+
"output_bias": false,
|
| 93 |
+
"output_init_std": -1.0,
|
| 94 |
+
"norm_type": "rmsnorm",
|
| 95 |
+
"target_loss": "hard_ce",
|
| 96 |
+
"linear_soft_target_power": 1.0,
|
| 97 |
+
"linear_soft_target_min_conf": 0.0,
|
| 98 |
+
"linear_soft_target_max_conf": 1.0,
|
| 99 |
+
"t_sampling_mode": "logit_normal",
|
| 100 |
+
"t_sampling_power": 1.0,
|
| 101 |
+
"t_sampling_eps": 0.0001,
|
| 102 |
+
"t_sampling_logit_mean": -1.5,
|
| 103 |
+
"t_sampling_logit_std": 0.8,
|
| 104 |
+
"dual_t": true,
|
| 105 |
+
"corrupt_t_mode": "same",
|
| 106 |
+
"corrupt_min_t": 0.0,
|
| 107 |
+
"corrupt_max_t": 1.0,
|
| 108 |
+
"prefix_block_prob": 0.0,
|
| 109 |
+
"prefix_block_len": 128,
|
| 110 |
+
"mask_ratio_floor_schedule": "none",
|
| 111 |
+
"dirichlet_endpoint_mode": "categorical_dual_t",
|
| 112 |
+
"dirichlet_semantic_t_mode": "same",
|
| 113 |
+
"dirichlet_semantic_t_value": 0.0,
|
| 114 |
+
"dirichlet_semantic_t_curve": "linear",
|
| 115 |
+
"dirichlet_semantic_t_power": 1.0,
|
| 116 |
+
"endpoint_sequence_random_prob_alpha": 0.0,
|
| 117 |
+
"categorical_wrong_from_full_vocab": true,
|
| 118 |
+
"categorical_wrong_from_batch_valid_tokens": false,
|
| 119 |
+
"categorical_wrong_basin_token_ids": "",
|
| 120 |
+
"categorical_wrong_basin_prob": 0.0,
|
| 121 |
+
"categorical_wrong_unigram_prob": 0.0,
|
| 122 |
+
"categorical_wrong_uniform_prob": 0.0,
|
| 123 |
+
"categorical_wrong_prob_floor": 0.0,
|
| 124 |
+
"categorical_wrong_corpus_unigram_path": "",
|
| 125 |
+
"categorical_wrong_corpus_unigram_alpha": 1.0,
|
| 126 |
+
"categorical_wrong_basin_shared_prob": 0.0,
|
| 127 |
+
"categorical_wrong_unigram_shared_prob": 0.0,
|
| 128 |
+
"mask_mixture_original_prob": 0.0,
|
| 129 |
+
"mask_mixture_lowk_prob": 0.0,
|
| 130 |
+
"mask_mixture_lowcorrupt_prob": 0.0,
|
| 131 |
+
"mask_mixture_block_prob": 0.0,
|
| 132 |
+
"mask_mixture_all_prob": 1.0,
|
| 133 |
+
"mask_mixture_lowk_clean_tokens": "0",
|
| 134 |
+
"mask_mixture_lowcorrupt_tokens": "1,2,4,8,16,32,64",
|
| 135 |
+
"mask_mixture_block_tokens": "64,128",
|
| 136 |
+
"simplex_bridge_sampler": "dirichlet",
|
| 137 |
+
"logistic_normal_sigma_min": 0.1,
|
| 138 |
+
"logistic_normal_sigma_max": 1.0,
|
| 139 |
+
"logistic_normal_tau_min": 1.0,
|
| 140 |
+
"logistic_normal_tau_max": 1.0,
|
| 141 |
+
"torch_compile": false,
|
| 142 |
+
"compile_mode": "max-autotune",
|
| 143 |
+
"state_format": "prob",
|
| 144 |
+
"meanflow_weight": 0.0,
|
| 145 |
+
"rollout_train_prob": 0.35,
|
| 146 |
+
"rollout_train_steps": 1,
|
| 147 |
+
"rollout_train_infer_steps": 1,
|
| 148 |
+
"rollout_train_time_mode": "sampled_s",
|
| 149 |
+
"rollout_train_s_dist": "uniform",
|
| 150 |
+
"rollout_train_s_min_frac": 0.0,
|
| 151 |
+
"rollout_train_s_max_frac": 0.25,
|
| 152 |
+
"rollout_train_s_beta_alpha": 2.0,
|
| 153 |
+
"rollout_train_s_beta_beta": 6.0,
|
| 154 |
+
"rollout_train_temp": 1.45,
|
| 155 |
+
"rollout_train_max_gamma": 1.0,
|
| 156 |
+
"rollout_train_corrupt_only": true,
|
| 157 |
+
"rollout_train_samplewise": true,
|
| 158 |
+
"rollout_train_compute_always": false,
|
| 159 |
+
"rollout_train_sync_t": true,
|
| 160 |
+
"bridge_noise_init": "logistic_normal",
|
| 161 |
+
"noise_sigma": -1.0,
|
| 162 |
+
"allow_tf32": true,
|
| 163 |
+
"activation_checkpointing": false,
|
| 164 |
+
"activation_checkpoint_interval": 1,
|
| 165 |
+
"activation_checkpoint_scope": "block",
|
| 166 |
+
"ddp_static_graph": false,
|
| 167 |
+
"ddp_gradient_as_bucket_view": true,
|
| 168 |
+
"blocking_data_transfer": false,
|
| 169 |
+
"dataloader_prefetch_factor": 4,
|
| 170 |
+
"full_train_stats": false,
|
| 171 |
+
"tokenized_hf": false,
|
| 172 |
+
"tokenized_pad_token": "pad",
|
| 173 |
+
"elf_conditional_hf": false,
|
| 174 |
+
"record_pad_truncate": false,
|
| 175 |
+
"record_add_eos": false,
|
| 176 |
+
"record_add_special_tokens": false,
|
| 177 |
+
"record_pad_token": "pad",
|
| 178 |
+
"record_shuffle_buffer": 10000,
|
| 179 |
+
"wrap": true,
|
| 180 |
+
"wrap_mode": "stream",
|
| 181 |
+
"wrap_record_buffer_size": 200,
|
| 182 |
+
"owt_cached_chunks": true,
|
| 183 |
+
"owt_chunk_cache_dir": "/e2e-data/evad-tech-vla/wanghan58/data/small_benchmarks/langflow_2604_11748/openwebtext_lta_cached_chunks/gpt2_len1024_train8_compact_overfit",
|
| 184 |
+
"owt_chunk_cache_rebuild": false,
|
| 185 |
+
"owt_chunk_cache_write_batch": 4096,
|
| 186 |
+
"owt_exact_repeat_per_chunk": 64,
|
| 187 |
+
"online_chunk_shuffle": false,
|
| 188 |
+
"online_chunk_shuffle_buffer": 10000,
|
| 189 |
+
"openwebtext_split": "train_minus_100k",
|
| 190 |
+
"detokenizer": "auto",
|
| 191 |
+
"resolved_detokenizer": null,
|
| 192 |
+
"num_workers": 0,
|
| 193 |
+
"latest_every": 1000,
|
| 194 |
+
"resume_path": ""
|
| 195 |
+
}
|
| 196 |
+
step=100 epoch=100/1000 epoch_step=1/1 micro_steps=100 elapsed=14.5s lr=2.000000e-03 loss=7.7225 loss_recon=7.7225 loss_meanflow=0.0000 mean_model_t=0.2070 mean_corrupt_t=0.2070 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.3566 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.0944 corrupt_frac=1.0000 acc_corrupt=0.0944 loss_corrupt=7.7225 wrong_frac=0.7930 init_acc_corrupt=0.1137 acc_corrupt_t_0p0_0p2=0.0502 corrupt_frac_t_0p0_0p2=0.5646 acc_corrupt_t_0p2_0p4=0.1261 corrupt_frac_t_0p2_0p4=0.3537 acc_corrupt_t_0p4_0p6=0.2510 corrupt_frac_t_0p4_0p6=0.0731 acc_corrupt_t_0p6_0p8=0.3593 corrupt_frac_t_0p6_0p8=0.0131 out_w_norm=1.0070 out_g_norm=1.0880 acc_corrupt_t_0p8_1p0=0.4619 corrupt_frac_t_0p8_1p0=0.0078 loss_all=7.4683 init_gold_top10=0.1940 init_gold_top100=0.3863 rollout_applied_pos_frac=0.3750 init_acc_rollout_applied=0.1046 init_acc_rollout_kept=0.1284 logit_acc_rollout_applied=0.0924 logit_acc_rollout_kept=0.1034
|
| 197 |
+
step=200 epoch=200/1000 epoch_step=1/1 micro_steps=200 elapsed=13.5s lr=2.000000e-03 loss=7.0875 loss_recon=7.0875 loss_meanflow=0.0000 mean_model_t=0.2093 mean_corrupt_t=0.2093 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.3467 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.1033 corrupt_frac=1.0000 acc_corrupt=0.1033 loss_corrupt=7.0875 wrong_frac=0.7905 init_acc_corrupt=0.1168 acc_corrupt_t_0p0_0p2=0.0558 corrupt_frac_t_0p0_0p2=0.5550 acc_corrupt_t_0p2_0p4=0.1391 corrupt_frac_t_0p2_0p4=0.3600 acc_corrupt_t_0p4_0p6=0.2535 corrupt_frac_t_0p4_0p6=0.0760 acc_corrupt_t_0p6_0p8=0.3377 corrupt_frac_t_0p6_0p8=0.0131 out_w_norm=2.8606 out_g_norm=1.7741 acc_corrupt_t_0p8_1p0=0.2988 corrupt_frac_t_0p8_1p0=0.0078 loss_all=6.6883 init_gold_top10=0.2127 init_gold_top100=0.4090 rollout_applied_pos_frac=0.3828 init_acc_rollout_applied=0.1394 init_acc_rollout_kept=0.1165 logit_acc_rollout_applied=0.1224 logit_acc_rollout_kept=0.1105
|
| 198 |
+
step=300 epoch=300/1000 epoch_step=1/1 micro_steps=300 elapsed=13.5s lr=2.000000e-03 loss=6.4572 loss_recon=6.4572 loss_meanflow=0.0000 mean_model_t=0.2077 mean_corrupt_t=0.2077 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.3452 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.1128 corrupt_frac=1.0000 acc_corrupt=0.1128 loss_corrupt=6.4572 wrong_frac=0.7923 init_acc_corrupt=0.1157 acc_corrupt_t_0p0_0p2=0.0592 corrupt_frac_t_0p0_0p2=0.5677 acc_corrupt_t_0p2_0p4=0.1556 corrupt_frac_t_0p2_0p4=0.3483 acc_corrupt_t_0p4_0p6=0.2841 corrupt_frac_t_0p4_0p6=0.0743 acc_corrupt_t_0p6_0p8=0.3918 corrupt_frac_t_0p6_0p8=0.0139 out_w_norm=4.3503 out_g_norm=1.3203 acc_corrupt_t_0p8_1p0=0.5454 corrupt_frac_t_0p8_1p0=0.0078 loss_all=6.2518 init_gold_top10=0.1851 init_gold_top100=0.3919 rollout_applied_pos_frac=0.2891 init_acc_rollout_applied=0.0706 init_acc_rollout_kept=0.1125 logit_acc_rollout_applied=0.0955 logit_acc_rollout_kept=0.1151
|
| 199 |
+
step=400 epoch=400/1000 epoch_step=1/1 micro_steps=400 elapsed=13.6s lr=2.000000e-03 loss=5.9689 loss_recon=5.9689 loss_meanflow=0.0000 mean_model_t=0.2094 mean_corrupt_t=0.2094 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.3509 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.1251 corrupt_frac=1.0000 acc_corrupt=0.1251 loss_corrupt=5.9689 wrong_frac=0.7904 init_acc_corrupt=0.1177 acc_corrupt_t_0p0_0p2=0.0635 corrupt_frac_t_0p0_0p2=0.5561 acc_corrupt_t_0p2_0p4=0.1711 corrupt_frac_t_0p2_0p4=0.3568 acc_corrupt_t_0p4_0p6=0.3157 corrupt_frac_t_0p4_0p6=0.0770 acc_corrupt_t_0p6_0p8=0.4369 corrupt_frac_t_0p6_0p8=0.0138 out_w_norm=5.5056 out_g_norm=0.5242 acc_corrupt_t_0p8_1p0=0.5684 corrupt_frac_t_0p8_1p0=0.0078 loss_all=5.7489 init_gold_top10=0.1987 init_gold_top100=0.4110 rollout_applied_pos_frac=0.2891 init_acc_rollout_applied=0.1260 init_acc_rollout_kept=0.1193 logit_acc_rollout_applied=0.1379 logit_acc_rollout_kept=0.1298
|
| 200 |
+
step=500 epoch=500/1000 epoch_step=1/1 micro_steps=500 elapsed=13.6s lr=2.000000e-03 loss=5.4717 loss_recon=5.4717 loss_meanflow=0.0000 mean_model_t=0.2075 mean_corrupt_t=0.2075 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.3543 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.1360 corrupt_frac=1.0000 acc_corrupt=0.1360 loss_corrupt=5.4717 wrong_frac=0.7924 init_acc_corrupt=0.1157 acc_corrupt_t_0p0_0p2=0.0675 corrupt_frac_t_0p0_0p2=0.5574 acc_corrupt_t_0p2_0p4=0.1901 corrupt_frac_t_0p2_0p4=0.3599 acc_corrupt_t_0p4_0p6=0.3503 corrupt_frac_t_0p4_0p6=0.0752 acc_corrupt_t_0p6_0p8=0.4852 corrupt_frac_t_0p6_0p8=0.0118 out_w_norm=6.7726 out_g_norm=0.4126 acc_corrupt_t_0p8_1p0=0.6328 corrupt_frac_t_0p8_1p0=0.0078 loss_all=5.1699 init_gold_top10=0.2023 init_gold_top100=0.4501 rollout_applied_pos_frac=0.3594 init_acc_rollout_applied=0.1088 init_acc_rollout_kept=0.1267 logit_acc_rollout_applied=0.1348 logit_acc_rollout_kept=0.1538
|
| 201 |
+
step=600 epoch=600/1000 epoch_step=1/1 micro_steps=600 elapsed=13.5s lr=2.000000e-03 loss=4.8275 loss_recon=4.8275 loss_meanflow=0.0000 mean_model_t=0.2080 mean_corrupt_t=0.2080 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.3501 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.1514 corrupt_frac=1.0000 acc_corrupt=0.1514 loss_corrupt=4.8275 wrong_frac=0.7919 init_acc_corrupt=0.1154 acc_corrupt_t_0p0_0p2=0.0729 corrupt_frac_t_0p0_0p2=0.5587 acc_corrupt_t_0p2_0p4=0.2141 corrupt_frac_t_0p2_0p4=0.3601 acc_corrupt_t_0p4_0p6=0.3970 corrupt_frac_t_0p4_0p6=0.0734 acc_corrupt_t_0p6_0p8=0.5683 corrupt_frac_t_0p6_0p8=0.0131 out_w_norm=8.0690 out_g_norm=0.4267 acc_corrupt_t_0p8_1p0=0.6172 corrupt_frac_t_0p8_1p0=0.0078 loss_all=4.3866 init_gold_top10=0.2056 init_gold_top100=0.5031 rollout_applied_pos_frac=0.3438 init_acc_rollout_applied=0.1209 init_acc_rollout_kept=0.1135 logit_acc_rollout_applied=0.1647 logit_acc_rollout_kept=0.1675
|
| 202 |
+
step=700 epoch=700/1000 epoch_step=1/1 micro_steps=700 elapsed=13.5s lr=2.000000e-03 loss=4.2103 loss_recon=4.2103 loss_meanflow=0.0000 mean_model_t=0.2082 mean_corrupt_t=0.2082 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.3470 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.1766 corrupt_frac=1.0000 acc_corrupt=0.1766 loss_corrupt=4.2103 wrong_frac=0.7918 init_acc_corrupt=0.1171 acc_corrupt_t_0p0_0p2=0.0787 corrupt_frac_t_0p0_0p2=0.5605 acc_corrupt_t_0p2_0p4=0.2543 corrupt_frac_t_0p2_0p4=0.3566 acc_corrupt_t_0p4_0p6=0.4862 corrupt_frac_t_0p4_0p6=0.0748 acc_corrupt_t_0p6_0p8=0.6680 corrupt_frac_t_0p6_0p8=0.0125 out_w_norm=9.2577 out_g_norm=0.4974 acc_corrupt_t_0p8_1p0=0.8032 corrupt_frac_t_0p8_1p0=0.0078 loss_all=3.9837 init_gold_top10=0.2096 init_gold_top100=0.5203 rollout_applied_pos_frac=0.3516 init_acc_rollout_applied=0.1317 init_acc_rollout_kept=0.1155 logit_acc_rollout_applied=0.2035 logit_acc_rollout_kept=0.1879
|
| 203 |
+
step=800 epoch=800/1000 epoch_step=1/1 micro_steps=800 elapsed=13.5s lr=2.000000e-03 loss=3.7190 loss_recon=3.7190 loss_meanflow=0.0000 mean_model_t=0.2085 mean_corrupt_t=0.2085 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.3430 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.1987 corrupt_frac=1.0000 acc_corrupt=0.1987 loss_corrupt=3.7190 wrong_frac=0.7914 init_acc_corrupt=0.1172 acc_corrupt_t_0p0_0p2=0.0885 corrupt_frac_t_0p0_0p2=0.5616 acc_corrupt_t_0p2_0p4=0.2933 corrupt_frac_t_0p2_0p4=0.3564 acc_corrupt_t_0p4_0p6=0.5263 corrupt_frac_t_0p4_0p6=0.0741 acc_corrupt_t_0p6_0p8=0.6865 corrupt_frac_t_0p6_0p8=0.0125 out_w_norm=10.1103 out_g_norm=0.6562 loss_all=3.4861 init_gold_top10=0.2151 init_gold_top100=0.5456 rollout_applied_pos_frac=0.3594 init_acc_rollout_applied=0.1216 init_acc_rollout_kept=0.1331 logit_acc_rollout_applied=0.2087 logit_acc_rollout_kept=0.2242
|
| 204 |
+
step=900 epoch=900/1000 epoch_step=1/1 micro_steps=900 elapsed=13.6s lr=2.000000e-03 loss=3.3010 loss_recon=3.3010 loss_meanflow=0.0000 mean_model_t=0.2097 mean_corrupt_t=0.2097 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.3524 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.2325 corrupt_frac=1.0000 acc_corrupt=0.2325 loss_corrupt=3.3010 wrong_frac=0.7903 init_acc_corrupt=0.1189 acc_corrupt_t_0p0_0p2=0.1019 corrupt_frac_t_0p0_0p2=0.5486 acc_corrupt_t_0p2_0p4=0.3438 corrupt_frac_t_0p2_0p4=0.3675 acc_corrupt_t_0p4_0p6=0.5826 corrupt_frac_t_0p4_0p6=0.0754 out_w_norm=10.6246 out_g_norm=0.8460 acc_corrupt_t_0p6_0p8=0.7352 corrupt_frac_t_0p6_0p8=0.0131 acc_corrupt_t_0p8_1p0=0.8434 corrupt_frac_t_0p8_1p0=0.0078 loss_all=3.1073 init_gold_top10=0.2286 init_gold_top100=0.5535 rollout_applied_pos_frac=0.3594 init_acc_rollout_applied=0.1193 init_acc_rollout_kept=0.1251 logit_acc_rollout_applied=0.2425 logit_acc_rollout_kept=0.2695
|
| 205 |
+
step=1000 epoch=1000/1000 epoch_step=1/1 micro_steps=1000 elapsed=13.5s lr=2.000000e-03 loss=2.9096 loss_recon=2.9096 loss_meanflow=0.0000 mean_model_t=0.2090 mean_corrupt_t=0.2090 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.3507 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.2852 corrupt_frac=1.0000 acc_corrupt=0.2852 loss_corrupt=2.9096 wrong_frac=0.7909 init_acc_corrupt=0.1190 acc_corrupt_t_0p0_0p2=0.1272 corrupt_frac_t_0p0_0p2=0.5551 acc_corrupt_t_0p2_0p4=0.4349 corrupt_frac_t_0p2_0p4=0.3608 acc_corrupt_t_0p4_0p6=0.6706 corrupt_frac_t_0p4_0p6=0.0745 acc_corrupt_t_0p6_0p8=0.8008 corrupt_frac_t_0p6_0p8=0.0131 out_w_norm=10.9832 out_g_norm=1.0880 acc_corrupt_t_0p8_1p0=0.8379 corrupt_frac_t_0p8_1p0=0.0078 loss_all=2.7501 init_gold_top10=0.2276 init_gold_top100=0.5129 rollout_applied_pos_frac=0.3047 init_acc_rollout_applied=0.0947 init_acc_rollout_kept=0.1007 logit_acc_rollout_applied=0.2757 logit_acc_rollout_kept=0.3128
|
| 206 |
+
NCCL version 2.25.1+cuda12.8
|
| 207 |
+
resumed_from=runs/train8_ctx1024_tradeoff_p35_unif0_0p25_outwdm1_ctx1024_tradeoff_dual_20260517_225705/latest.pt start_step=1001
|
| 208 |
+
{
|
| 209 |
+
"device": "cuda:0",
|
| 210 |
+
"rank": 0,
|
| 211 |
+
"world_size": 4,
|
| 212 |
+
"samples": "owt_cached_chunks:8",
|
| 213 |
+
"vocab_size": 2664,
|
| 214 |
+
"tokenizer_vocab_size": 50257,
|
| 215 |
+
"save_dir": "runs/train8_ctx1024_tradeoff_p35_unif0_0p25_outwdm1_ctx1024_tradeoff_dual_20260517_225705",
|
| 216 |
+
"batch_size": 128,
|
| 217 |
+
"grad_accum": 1,
|
| 218 |
+
"effective_batch_size": 512,
|
| 219 |
+
"global_batch_size": 512,
|
| 220 |
+
"lr_schedule": "constant_warmup",
|
| 221 |
+
"optimizer": "muon",
|
| 222 |
+
"epochs": 0.0,
|
| 223 |
+
"steps_per_epoch": 1,
|
| 224 |
+
"total_steps": 2000,
|
| 225 |
+
"warmup_steps": 10,
|
| 226 |
+
"warmup_epochs": -1.0,
|
| 227 |
+
"min_lr": 0.0,
|
| 228 |
+
"weight_decay": 0.1,
|
| 229 |
+
"output_weight_decay": -1.0,
|
| 230 |
+
"adamw_param_groups": "nanogpt",
|
| 231 |
+
"adam_beta1": 0.9,
|
| 232 |
+
"adam_beta2": 0.95,
|
| 233 |
+
"adam_eps": 1e-08,
|
| 234 |
+
"muon_impl": "legacy",
|
| 235 |
+
"muon_momentum": 0.95,
|
| 236 |
+
"muon_ns_steps": 5,
|
| 237 |
+
"muon_update_scale": 1.0,
|
| 238 |
+
"muon_nesterov": false,
|
| 239 |
+
"muon_width_scale": false,
|
| 240 |
+
"muon_grouping": "legacy_dim_ge_2",
|
| 241 |
+
"muon_param_count": 2616320,
|
| 242 |
+
"muon_adam_param_count": 8192,
|
| 243 |
+
"muon_param_names": [
|
| 244 |
+
"vocab_embed.embedding",
|
| 245 |
+
"sigma_map.net.0.weight",
|
| 246 |
+
"sigma_map.net.2.weight",
|
| 247 |
+
"blocks.0.attn_qkv.weight",
|
| 248 |
+
"blocks.0.attn_out.weight",
|
| 249 |
+
"blocks.0.mlp.0.weight",
|
| 250 |
+
"blocks.0.mlp.2.weight",
|
| 251 |
+
"blocks.0.adaLN_modulation.weight",
|
| 252 |
+
"blocks.1.attn_qkv.weight",
|
| 253 |
+
"blocks.1.attn_out.weight",
|
| 254 |
+
"blocks.1.mlp.0.weight",
|
| 255 |
+
"blocks.1.mlp.2.weight",
|
| 256 |
+
"blocks.1.adaLN_modulation.weight",
|
| 257 |
+
"blocks.2.attn_qkv.weight",
|
| 258 |
+
"blocks.2.attn_out.weight",
|
| 259 |
+
"blocks.2.mlp.0.weight",
|
| 260 |
+
"blocks.2.mlp.2.weight",
|
| 261 |
+
"blocks.2.adaLN_modulation.weight",
|
| 262 |
+
"output_layer.linear.weight",
|
| 263 |
+
"output_layer.adaLN_modulation.weight"
|
| 264 |
+
],
|
| 265 |
+
"muon_adam_param_names": [
|
| 266 |
+
"sigma_map.net.0.bias",
|
| 267 |
+
"sigma_map.net.2.bias",
|
| 268 |
+
"blocks.0.norm1.weight",
|
| 269 |
+
"blocks.0.norm2.weight",
|
| 270 |
+
"blocks.0.mlp.0.bias",
|
| 271 |
+
"blocks.0.mlp.2.bias",
|
| 272 |
+
"blocks.0.adaLN_modulation.bias",
|
| 273 |
+
"blocks.1.norm1.weight",
|
| 274 |
+
"blocks.1.norm2.weight",
|
| 275 |
+
"blocks.1.mlp.0.bias",
|
| 276 |
+
"blocks.1.mlp.2.bias",
|
| 277 |
+
"blocks.1.adaLN_modulation.bias",
|
| 278 |
+
"blocks.2.norm1.weight",
|
| 279 |
+
"blocks.2.norm2.weight",
|
| 280 |
+
"blocks.2.mlp.0.bias",
|
| 281 |
+
"blocks.2.mlp.2.bias",
|
| 282 |
+
"blocks.2.adaLN_modulation.bias",
|
| 283 |
+
"output_layer.norm_final.weight",
|
| 284 |
+
"output_layer.adaLN_modulation.bias"
|
| 285 |
+
],
|
| 286 |
+
"muon_effective_nesterov": false,
|
| 287 |
+
"muon_effective_width_scale": false,
|
| 288 |
+
"muon_effective_weight_decay": 0.1,
|
| 289 |
+
"muon_adam_fallback_nesterov": false,
|
| 290 |
+
"muon_adam_fallback_weight_decay": 0.1,
|
| 291 |
+
"ema_decay": 0.9999,
|
| 292 |
+
"ema_start_step": 0,
|
| 293 |
+
"model_type": "ddit",
|
| 294 |
+
"ddit_mlp_type": "gelu",
|
| 295 |
+
"elf_num_time_tokens": 4,
|
| 296 |
+
"elf_num_model_mode_tokens": 0,
|
| 297 |
+
"qk_norm": true,
|
| 298 |
+
"output_bias": false,
|
| 299 |
+
"output_init_std": -1.0,
|
| 300 |
+
"norm_type": "rmsnorm",
|
| 301 |
+
"target_loss": "hard_ce",
|
| 302 |
+
"linear_soft_target_power": 1.0,
|
| 303 |
+
"linear_soft_target_min_conf": 0.0,
|
| 304 |
+
"linear_soft_target_max_conf": 1.0,
|
| 305 |
+
"t_sampling_mode": "logit_normal",
|
| 306 |
+
"t_sampling_power": 1.0,
|
| 307 |
+
"t_sampling_eps": 0.0001,
|
| 308 |
+
"t_sampling_logit_mean": -1.5,
|
| 309 |
+
"t_sampling_logit_std": 0.8,
|
| 310 |
+
"dual_t": true,
|
| 311 |
+
"corrupt_t_mode": "same",
|
| 312 |
+
"corrupt_min_t": 0.0,
|
| 313 |
+
"corrupt_max_t": 1.0,
|
| 314 |
+
"prefix_block_prob": 0.0,
|
| 315 |
+
"prefix_block_len": 128,
|
| 316 |
+
"mask_ratio_floor_schedule": "none",
|
| 317 |
+
"dirichlet_endpoint_mode": "categorical_dual_t",
|
| 318 |
+
"dirichlet_semantic_t_mode": "same",
|
| 319 |
+
"dirichlet_semantic_t_value": 0.0,
|
| 320 |
+
"dirichlet_semantic_t_curve": "linear",
|
| 321 |
+
"dirichlet_semantic_t_power": 1.0,
|
| 322 |
+
"endpoint_sequence_random_prob_alpha": 0.0,
|
| 323 |
+
"categorical_wrong_from_full_vocab": true,
|
| 324 |
+
"categorical_wrong_from_batch_valid_tokens": false,
|
| 325 |
+
"categorical_wrong_basin_token_ids": "",
|
| 326 |
+
"categorical_wrong_basin_prob": 0.0,
|
| 327 |
+
"categorical_wrong_unigram_prob": 0.0,
|
| 328 |
+
"categorical_wrong_uniform_prob": 0.0,
|
| 329 |
+
"categorical_wrong_prob_floor": 0.0,
|
| 330 |
+
"categorical_wrong_corpus_unigram_path": "",
|
| 331 |
+
"categorical_wrong_corpus_unigram_alpha": 1.0,
|
| 332 |
+
"categorical_wrong_basin_shared_prob": 0.0,
|
| 333 |
+
"categorical_wrong_unigram_shared_prob": 0.0,
|
| 334 |
+
"mask_mixture_original_prob": 0.0,
|
| 335 |
+
"mask_mixture_lowk_prob": 0.0,
|
| 336 |
+
"mask_mixture_lowcorrupt_prob": 0.0,
|
| 337 |
+
"mask_mixture_block_prob": 0.0,
|
| 338 |
+
"mask_mixture_all_prob": 1.0,
|
| 339 |
+
"mask_mixture_lowk_clean_tokens": "0",
|
| 340 |
+
"mask_mixture_lowcorrupt_tokens": "1,2,4,8,16,32,64",
|
| 341 |
+
"mask_mixture_block_tokens": "64,128",
|
| 342 |
+
"simplex_bridge_sampler": "dirichlet",
|
| 343 |
+
"logistic_normal_sigma_min": 0.1,
|
| 344 |
+
"logistic_normal_sigma_max": 1.0,
|
| 345 |
+
"logistic_normal_tau_min": 1.0,
|
| 346 |
+
"logistic_normal_tau_max": 1.0,
|
| 347 |
+
"torch_compile": false,
|
| 348 |
+
"compile_mode": "max-autotune",
|
| 349 |
+
"state_format": "prob",
|
| 350 |
+
"meanflow_weight": 0.0,
|
| 351 |
+
"rollout_train_prob": 0.35,
|
| 352 |
+
"rollout_train_steps": 1,
|
| 353 |
+
"rollout_train_infer_steps": 1,
|
| 354 |
+
"rollout_train_time_mode": "sampled_s",
|
| 355 |
+
"rollout_train_s_dist": "uniform",
|
| 356 |
+
"rollout_train_s_min_frac": 0.0,
|
| 357 |
+
"rollout_train_s_max_frac": 0.25,
|
| 358 |
+
"rollout_train_s_beta_alpha": 2.0,
|
| 359 |
+
"rollout_train_s_beta_beta": 6.0,
|
| 360 |
+
"rollout_train_temp": 1.45,
|
| 361 |
+
"rollout_train_max_gamma": 1.0,
|
| 362 |
+
"rollout_train_corrupt_only": true,
|
| 363 |
+
"rollout_train_samplewise": true,
|
| 364 |
+
"rollout_train_compute_always": false,
|
| 365 |
+
"rollout_train_sync_t": true,
|
| 366 |
+
"bridge_noise_init": "logistic_normal",
|
| 367 |
+
"noise_sigma": -1.0,
|
| 368 |
+
"allow_tf32": true,
|
| 369 |
+
"activation_checkpointing": false,
|
| 370 |
+
"activation_checkpoint_interval": 1,
|
| 371 |
+
"activation_checkpoint_scope": "block",
|
| 372 |
+
"ddp_static_graph": false,
|
| 373 |
+
"ddp_gradient_as_bucket_view": true,
|
| 374 |
+
"blocking_data_transfer": false,
|
| 375 |
+
"dataloader_prefetch_factor": 4,
|
| 376 |
+
"full_train_stats": false,
|
| 377 |
+
"tokenized_hf": false,
|
| 378 |
+
"tokenized_pad_token": "pad",
|
| 379 |
+
"elf_conditional_hf": false,
|
| 380 |
+
"record_pad_truncate": false,
|
| 381 |
+
"record_add_eos": false,
|
| 382 |
+
"record_add_special_tokens": false,
|
| 383 |
+
"record_pad_token": "pad",
|
| 384 |
+
"record_shuffle_buffer": 10000,
|
| 385 |
+
"wrap": true,
|
| 386 |
+
"wrap_mode": "stream",
|
| 387 |
+
"wrap_record_buffer_size": 200,
|
| 388 |
+
"owt_cached_chunks": true,
|
| 389 |
+
"owt_chunk_cache_dir": "/e2e-data/evad-tech-vla/wanghan58/data/small_benchmarks/langflow_2604_11748/openwebtext_lta_cached_chunks/gpt2_len1024_train8_compact_overfit",
|
| 390 |
+
"owt_chunk_cache_rebuild": false,
|
| 391 |
+
"owt_chunk_cache_write_batch": 4096,
|
| 392 |
+
"owt_exact_repeat_per_chunk": 64,
|
| 393 |
+
"online_chunk_shuffle": false,
|
| 394 |
+
"online_chunk_shuffle_buffer": 10000,
|
| 395 |
+
"openwebtext_split": "train_minus_100k",
|
| 396 |
+
"detokenizer": "auto",
|
| 397 |
+
"resolved_detokenizer": null,
|
| 398 |
+
"num_workers": 0,
|
| 399 |
+
"latest_every": 1000,
|
| 400 |
+
"resume_path": "runs/train8_ctx1024_tradeoff_p35_unif0_0p25_outwdm1_ctx1024_tradeoff_dual_20260517_225705/latest.pt"
|
| 401 |
+
}
|
| 402 |
+
step=1100 epoch=1100/2000 epoch_step=1/1 micro_steps=1100 elapsed=14.5s lr=2.000000e-03 loss=2.5661 loss_recon=2.5661 loss_meanflow=0.0000 mean_model_t=0.2070 mean_corrupt_t=0.2070 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.3566 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.3462 corrupt_frac=1.0000 acc_corrupt=0.3462 loss_corrupt=2.5661 wrong_frac=0.7930 init_acc_corrupt=0.1172 acc_corrupt_t_0p0_0p2=0.1600 corrupt_frac_t_0p0_0p2=0.5646 acc_corrupt_t_0p2_0p4=0.5433 corrupt_frac_t_0p2_0p4=0.3537 acc_corrupt_t_0p4_0p6=0.7710 corrupt_frac_t_0p4_0p6=0.0731 acc_corrupt_t_0p6_0p8=0.8578 corrupt_frac_t_0p6_0p8=0.0131 out_w_norm=11.2564 out_g_norm=1.2812 acc_corrupt_t_0p8_1p0=0.9385 corrupt_frac_t_0p8_1p0=0.0078 loss_all=2.4453 init_gold_top10=0.2908 init_gold_top100=0.5703 rollout_applied_pos_frac=0.3750 init_acc_rollout_applied=0.1161 init_acc_rollout_kept=0.1284 logit_acc_rollout_applied=0.3787 logit_acc_rollout_kept=0.3859
|
| 403 |
+
step=1200 epoch=1200/2000 epoch_step=1/1 micro_steps=1200 elapsed=13.5s lr=2.000000e-03 loss=2.2222 loss_recon=2.2222 loss_meanflow=0.0000 mean_model_t=0.2093 mean_corrupt_t=0.2093 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.3467 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.4209 corrupt_frac=1.0000 acc_corrupt=0.4209 loss_corrupt=2.2222 wrong_frac=0.7905 init_acc_corrupt=0.1209 acc_corrupt_t_0p0_0p2=0.1983 corrupt_frac_t_0p0_0p2=0.5550 acc_corrupt_t_0p2_0p4=0.6590 corrupt_frac_t_0p2_0p4=0.3600 acc_corrupt_t_0p4_0p6=0.8595 corrupt_frac_t_0p4_0p6=0.0760 acc_corrupt_t_0p6_0p8=0.9181 corrupt_frac_t_0p6_0p8=0.0131 out_w_norm=11.5179 out_g_norm=1.4931 acc_corrupt_t_0p8_1p0=0.9355 corrupt_frac_t_0p8_1p0=0.0078 loss_all=1.7875 init_gold_top10=0.3507 init_gold_top100=0.5781 rollout_applied_pos_frac=0.3828 init_acc_rollout_applied=0.1554 init_acc_rollout_kept=0.1165 logit_acc_rollout_applied=0.5391 logit_acc_rollout_kept=0.4889
|
| 404 |
+
step=1300 epoch=1300/2000 epoch_step=1/1 micro_steps=1300 elapsed=13.5s lr=2.000000e-03 loss=1.9908 loss_recon=1.9908 loss_meanflow=0.0000 mean_model_t=0.2077 mean_corrupt_t=0.2077 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.3452 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.4756 corrupt_frac=1.0000 acc_corrupt=0.4756 loss_corrupt=1.9908 wrong_frac=0.7923 init_acc_corrupt=0.1205 acc_corrupt_t_0p0_0p2=0.2363 corrupt_frac_t_0p0_0p2=0.5677 acc_corrupt_t_0p2_0p4=0.7568 corrupt_frac_t_0p2_0p4=0.3483 acc_corrupt_t_0p4_0p6=0.9233 corrupt_frac_t_0p4_0p6=0.0743 acc_corrupt_t_0p6_0p8=0.9486 corrupt_frac_t_0p6_0p8=0.0139 out_w_norm=11.7426 out_g_norm=1.6005 acc_corrupt_t_0p8_1p0=0.9565 corrupt_frac_t_0p8_1p0=0.0078 loss_all=1.8900 init_gold_top10=0.3031 init_gold_top100=0.5160 rollout_applied_pos_frac=0.2891 init_acc_rollout_applied=0.0895 init_acc_rollout_kept=0.1125 logit_acc_rollout_applied=0.5288 logit_acc_rollout_kept=0.4775
|
| 405 |
+
step=1400 epoch=1400/2000 epoch_step=1/1 micro_steps=1400 elapsed=13.6s lr=2.000000e-03 loss=1.7732 loss_recon=1.7732 loss_meanflow=0.0000 mean_model_t=0.2094 mean_corrupt_t=0.2094 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.3509 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.5297 corrupt_frac=1.0000 acc_corrupt=0.5297 loss_corrupt=1.7732 wrong_frac=0.7904 init_acc_corrupt=0.1238 acc_corrupt_t_0p0_0p2=0.2730 corrupt_frac_t_0p0_0p2=0.5561 acc_corrupt_t_0p2_0p4=0.8254 corrupt_frac_t_0p2_0p4=0.3568 acc_corrupt_t_0p4_0p6=0.9553 corrupt_frac_t_0p4_0p6=0.0770 acc_corrupt_t_0p6_0p8=0.9705 corrupt_frac_t_0p6_0p8=0.0138 out_w_norm=11.8904 out_g_norm=1.7736 acc_corrupt_t_0p8_1p0=0.9485 corrupt_frac_t_0p8_1p0=0.0078 loss_all=1.6616 init_gold_top10=0.3314 init_gold_top100=0.5167 rollout_applied_pos_frac=0.2891 init_acc_rollout_applied=0.1414 init_acc_rollout_kept=0.1193 logit_acc_rollout_applied=0.6786 logit_acc_rollout_kept=0.5277
|
| 406 |
+
step=1500 epoch=1500/2000 epoch_step=1/1 micro_steps=1500 elapsed=13.6s lr=2.000000e-03 loss=1.6078 loss_recon=1.6078 loss_meanflow=0.0000 mean_model_t=0.2075 mean_corrupt_t=0.2075 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.3543 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.5728 corrupt_frac=1.0000 acc_corrupt=0.5728 loss_corrupt=1.6078 wrong_frac=0.7924 init_acc_corrupt=0.1231 acc_corrupt_t_0p0_0p2=0.3100 corrupt_frac_t_0p0_0p2=0.5574 acc_corrupt_t_0p2_0p4=0.8863 corrupt_frac_t_0p2_0p4=0.3599 acc_corrupt_t_0p4_0p6=0.9791 corrupt_frac_t_0p4_0p6=0.0752 acc_corrupt_t_0p6_0p8=0.9870 corrupt_frac_t_0p6_0p8=0.0118 out_w_norm=12.0140 out_g_norm=1.7297 acc_corrupt_t_0p8_1p0=0.9795 corrupt_frac_t_0p8_1p0=0.0078 loss_all=1.6049 init_gold_top10=0.3586 init_gold_top100=0.5673 rollout_applied_pos_frac=0.3594 init_acc_rollout_applied=0.1354 init_acc_rollout_kept=0.1267 logit_acc_rollout_applied=0.5946 logit_acc_rollout_kept=0.5775
|
| 407 |
+
step=1600 epoch=1600/2000 epoch_step=1/1 micro_steps=1600 elapsed=13.6s lr=2.000000e-03 loss=1.4398 loss_recon=1.4398 loss_meanflow=0.0000 mean_model_t=0.2080 mean_corrupt_t=0.2080 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.3501 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.6148 corrupt_frac=1.0000 acc_corrupt=0.6148 loss_corrupt=1.4398 wrong_frac=0.7919 init_acc_corrupt=0.1239 acc_corrupt_t_0p0_0p2=0.3616 corrupt_frac_t_0p0_0p2=0.5587 acc_corrupt_t_0p2_0p4=0.9233 corrupt_frac_t_0p2_0p4=0.3601 acc_corrupt_t_0p4_0p6=0.9885 corrupt_frac_t_0p4_0p6=0.0734 acc_corrupt_t_0p6_0p8=0.9892 corrupt_frac_t_0p6_0p8=0.0131 out_w_norm=12.1155 out_g_norm=1.7083 acc_corrupt_t_0p8_1p0=0.9883 corrupt_frac_t_0p8_1p0=0.0078 loss_all=1.2419 init_gold_top10=0.3671 init_gold_top100=0.5536 rollout_applied_pos_frac=0.3438 init_acc_rollout_applied=0.1364 init_acc_rollout_kept=0.1135 logit_acc_rollout_applied=0.6601 logit_acc_rollout_kept=0.6458
|
| 408 |
+
step=1700 epoch=1700/2000 epoch_step=1/1 micro_steps=1700 elapsed=13.5s lr=2.000000e-03 loss=1.3160 loss_recon=1.3160 loss_meanflow=0.0000 mean_model_t=0.2082 mean_corrupt_t=0.2082 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.3470 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.6473 corrupt_frac=1.0000 acc_corrupt=0.6473 loss_corrupt=1.3160 wrong_frac=0.7918 init_acc_corrupt=0.1264 acc_corrupt_t_0p0_0p2=0.4046 corrupt_frac_t_0p0_0p2=0.5605 acc_corrupt_t_0p2_0p4=0.9485 corrupt_frac_t_0p2_0p4=0.3566 acc_corrupt_t_0p4_0p6=0.9930 corrupt_frac_t_0p4_0p6=0.0748 acc_corrupt_t_0p6_0p8=0.9935 corrupt_frac_t_0p6_0p8=0.0125 out_w_norm=12.1736 out_g_norm=1.7850 acc_corrupt_t_0p8_1p0=0.9946 corrupt_frac_t_0p8_1p0=0.0078 loss_all=1.3934 init_gold_top10=0.3819 init_gold_top100=0.5598 rollout_applied_pos_frac=0.3516 init_acc_rollout_applied=0.1794 init_acc_rollout_kept=0.1155 logit_acc_rollout_applied=0.7141 logit_acc_rollout_kept=0.6128
|
| 409 |
+
step=1800 epoch=1800/2000 epoch_step=1/1 micro_steps=1800 elapsed=13.5s lr=2.000000e-03 loss=1.1661 loss_recon=1.1661 loss_meanflow=0.0000 mean_model_t=0.2085 mean_corrupt_t=0.2085 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.3430 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.6826 corrupt_frac=1.0000 acc_corrupt=0.6826 loss_corrupt=1.1661 wrong_frac=0.7914 init_acc_corrupt=0.1274 acc_corrupt_t_0p0_0p2=0.4589 corrupt_frac_t_0p0_0p2=0.5616 acc_corrupt_t_0p2_0p4=0.9632 corrupt_frac_t_0p2_0p4=0.3564 acc_corrupt_t_0p4_0p6=0.9957 corrupt_frac_t_0p4_0p6=0.0741 acc_corrupt_t_0p6_0p8=0.9944 corrupt_frac_t_0p6_0p8=0.0125 out_w_norm=12.2279 out_g_norm=1.7180 loss_all=1.1713 init_gold_top10=0.3758 init_gold_top100=0.5692 rollout_applied_pos_frac=0.3594 init_acc_rollout_applied=0.1465 init_acc_rollout_kept=0.1331 logit_acc_rollout_applied=0.7110 logit_acc_rollout_kept=0.6753
|
| 410 |
+
step=1900 epoch=1900/2000 epoch_step=1/1 micro_steps=1900 elapsed=13.6s lr=2.000000e-03 loss=1.0032 loss_recon=1.0032 loss_meanflow=0.0000 mean_model_t=0.2097 mean_corrupt_t=0.2097 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.3524 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.7219 corrupt_frac=1.0000 acc_corrupt=0.7219 loss_corrupt=1.0032 wrong_frac=0.7903 init_acc_corrupt=0.1292 acc_corrupt_t_0p0_0p2=0.5132 corrupt_frac_t_0p0_0p2=0.5486 acc_corrupt_t_0p2_0p4=0.9708 corrupt_frac_t_0p2_0p4=0.3675 acc_corrupt_t_0p4_0p6=0.9962 corrupt_frac_t_0p4_0p6=0.0754 out_w_norm=12.2700 out_g_norm=1.8058 acc_corrupt_t_0p6_0p8=0.9950 corrupt_frac_t_0p6_0p8=0.0131 acc_corrupt_t_0p8_1p0=0.9883 corrupt_frac_t_0p8_1p0=0.0078 loss_all=0.8831 init_gold_top10=0.3894 init_gold_top100=0.5688 rollout_applied_pos_frac=0.3594 init_acc_rollout_applied=0.1433 init_acc_rollout_kept=0.1251 logit_acc_rollout_applied=0.7317 logit_acc_rollout_kept=0.7577
|
| 411 |
+
step=2000 epoch=2000/2000 epoch_step=1/1 micro_steps=2000 elapsed=13.6s lr=2.000000e-03 loss=0.7741 loss_recon=0.7741 loss_meanflow=0.0000 mean_model_t=0.2090 mean_corrupt_t=0.2090 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.3507 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.7780 corrupt_frac=1.0000 acc_corrupt=0.7780 loss_corrupt=0.7741 wrong_frac=0.7909 init_acc_corrupt=0.1300 acc_corrupt_t_0p0_0p2=0.6133 corrupt_frac_t_0p0_0p2=0.5551 acc_corrupt_t_0p2_0p4=0.9805 corrupt_frac_t_0p2_0p4=0.3608 acc_corrupt_t_0p4_0p6=0.9966 corrupt_frac_t_0p4_0p6=0.0745 acc_corrupt_t_0p6_0p8=0.9938 corrupt_frac_t_0p6_0p8=0.0131 out_w_norm=12.3161 out_g_norm=1.6323 acc_corrupt_t_0p8_1p0=0.9678 corrupt_frac_t_0p8_1p0=0.0078 loss_all=0.6070 init_gold_top10=0.3857 init_gold_top100=0.5204 rollout_applied_pos_frac=0.3047 init_acc_rollout_applied=0.1180 init_acc_rollout_kept=0.1007 logit_acc_rollout_applied=0.8249 logit_acc_rollout_kept=0.8256
|
| 412 |
+
NCCL version 2.25.1+cuda12.8
|
| 413 |
+
resumed_from=runs/train8_ctx1024_tradeoff_p35_unif0_0p25_outwdm1_ctx1024_tradeoff_dual_20260517_225705/latest.pt start_step=2001
|
| 414 |
+
{
|
| 415 |
+
"device": "cuda:0",
|
| 416 |
+
"rank": 0,
|
| 417 |
+
"world_size": 4,
|
| 418 |
+
"samples": "owt_cached_chunks:8",
|
| 419 |
+
"vocab_size": 2664,
|
| 420 |
+
"tokenizer_vocab_size": 50257,
|
| 421 |
+
"save_dir": "runs/train8_ctx1024_tradeoff_p35_unif0_0p25_outwdm1_ctx1024_tradeoff_dual_20260517_225705",
|
| 422 |
+
"batch_size": 128,
|
| 423 |
+
"grad_accum": 1,
|
| 424 |
+
"effective_batch_size": 512,
|
| 425 |
+
"global_batch_size": 512,
|
| 426 |
+
"lr_schedule": "constant_warmup",
|
| 427 |
+
"optimizer": "muon",
|
| 428 |
+
"epochs": 0.0,
|
| 429 |
+
"steps_per_epoch": 1,
|
| 430 |
+
"total_steps": 3000,
|
| 431 |
+
"warmup_steps": 10,
|
| 432 |
+
"warmup_epochs": -1.0,
|
| 433 |
+
"min_lr": 0.0,
|
| 434 |
+
"weight_decay": 0.1,
|
| 435 |
+
"output_weight_decay": -1.0,
|
| 436 |
+
"adamw_param_groups": "nanogpt",
|
| 437 |
+
"adam_beta1": 0.9,
|
| 438 |
+
"adam_beta2": 0.95,
|
| 439 |
+
"adam_eps": 1e-08,
|
| 440 |
+
"muon_impl": "legacy",
|
| 441 |
+
"muon_momentum": 0.95,
|
| 442 |
+
"muon_ns_steps": 5,
|
| 443 |
+
"muon_update_scale": 1.0,
|
| 444 |
+
"muon_nesterov": false,
|
| 445 |
+
"muon_width_scale": false,
|
| 446 |
+
"muon_grouping": "legacy_dim_ge_2",
|
| 447 |
+
"muon_param_count": 2616320,
|
| 448 |
+
"muon_adam_param_count": 8192,
|
| 449 |
+
"muon_param_names": [
|
| 450 |
+
"vocab_embed.embedding",
|
| 451 |
+
"sigma_map.net.0.weight",
|
| 452 |
+
"sigma_map.net.2.weight",
|
| 453 |
+
"blocks.0.attn_qkv.weight",
|
| 454 |
+
"blocks.0.attn_out.weight",
|
| 455 |
+
"blocks.0.mlp.0.weight",
|
| 456 |
+
"blocks.0.mlp.2.weight",
|
| 457 |
+
"blocks.0.adaLN_modulation.weight",
|
| 458 |
+
"blocks.1.attn_qkv.weight",
|
| 459 |
+
"blocks.1.attn_out.weight",
|
| 460 |
+
"blocks.1.mlp.0.weight",
|
| 461 |
+
"blocks.1.mlp.2.weight",
|
| 462 |
+
"blocks.1.adaLN_modulation.weight",
|
| 463 |
+
"blocks.2.attn_qkv.weight",
|
| 464 |
+
"blocks.2.attn_out.weight",
|
| 465 |
+
"blocks.2.mlp.0.weight",
|
| 466 |
+
"blocks.2.mlp.2.weight",
|
| 467 |
+
"blocks.2.adaLN_modulation.weight",
|
| 468 |
+
"output_layer.linear.weight",
|
| 469 |
+
"output_layer.adaLN_modulation.weight"
|
| 470 |
+
],
|
| 471 |
+
"muon_adam_param_names": [
|
| 472 |
+
"sigma_map.net.0.bias",
|
| 473 |
+
"sigma_map.net.2.bias",
|
| 474 |
+
"blocks.0.norm1.weight",
|
| 475 |
+
"blocks.0.norm2.weight",
|
| 476 |
+
"blocks.0.mlp.0.bias",
|
| 477 |
+
"blocks.0.mlp.2.bias",
|
| 478 |
+
"blocks.0.adaLN_modulation.bias",
|
| 479 |
+
"blocks.1.norm1.weight",
|
| 480 |
+
"blocks.1.norm2.weight",
|
| 481 |
+
"blocks.1.mlp.0.bias",
|
| 482 |
+
"blocks.1.mlp.2.bias",
|
| 483 |
+
"blocks.1.adaLN_modulation.bias",
|
| 484 |
+
"blocks.2.norm1.weight",
|
| 485 |
+
"blocks.2.norm2.weight",
|
| 486 |
+
"blocks.2.mlp.0.bias",
|
| 487 |
+
"blocks.2.mlp.2.bias",
|
| 488 |
+
"blocks.2.adaLN_modulation.bias",
|
| 489 |
+
"output_layer.norm_final.weight",
|
| 490 |
+
"output_layer.adaLN_modulation.bias"
|
| 491 |
+
],
|
| 492 |
+
"muon_effective_nesterov": false,
|
| 493 |
+
"muon_effective_width_scale": false,
|
| 494 |
+
"muon_effective_weight_decay": 0.1,
|
| 495 |
+
"muon_adam_fallback_nesterov": false,
|
| 496 |
+
"muon_adam_fallback_weight_decay": 0.1,
|
| 497 |
+
"ema_decay": 0.9999,
|
| 498 |
+
"ema_start_step": 0,
|
| 499 |
+
"model_type": "ddit",
|
| 500 |
+
"ddit_mlp_type": "gelu",
|
| 501 |
+
"elf_num_time_tokens": 4,
|
| 502 |
+
"elf_num_model_mode_tokens": 0,
|
| 503 |
+
"qk_norm": true,
|
| 504 |
+
"output_bias": false,
|
| 505 |
+
"output_init_std": -1.0,
|
| 506 |
+
"norm_type": "rmsnorm",
|
| 507 |
+
"target_loss": "hard_ce",
|
| 508 |
+
"linear_soft_target_power": 1.0,
|
| 509 |
+
"linear_soft_target_min_conf": 0.0,
|
| 510 |
+
"linear_soft_target_max_conf": 1.0,
|
| 511 |
+
"t_sampling_mode": "logit_normal",
|
| 512 |
+
"t_sampling_power": 1.0,
|
| 513 |
+
"t_sampling_eps": 0.0001,
|
| 514 |
+
"t_sampling_logit_mean": -1.5,
|
| 515 |
+
"t_sampling_logit_std": 0.8,
|
| 516 |
+
"dual_t": true,
|
| 517 |
+
"corrupt_t_mode": "same",
|
| 518 |
+
"corrupt_min_t": 0.0,
|
| 519 |
+
"corrupt_max_t": 1.0,
|
| 520 |
+
"prefix_block_prob": 0.0,
|
| 521 |
+
"prefix_block_len": 128,
|
| 522 |
+
"mask_ratio_floor_schedule": "none",
|
| 523 |
+
"dirichlet_endpoint_mode": "categorical_dual_t",
|
| 524 |
+
"dirichlet_semantic_t_mode": "same",
|
| 525 |
+
"dirichlet_semantic_t_value": 0.0,
|
| 526 |
+
"dirichlet_semantic_t_curve": "linear",
|
| 527 |
+
"dirichlet_semantic_t_power": 1.0,
|
| 528 |
+
"endpoint_sequence_random_prob_alpha": 0.0,
|
| 529 |
+
"categorical_wrong_from_full_vocab": true,
|
| 530 |
+
"categorical_wrong_from_batch_valid_tokens": false,
|
| 531 |
+
"categorical_wrong_basin_token_ids": "",
|
| 532 |
+
"categorical_wrong_basin_prob": 0.0,
|
| 533 |
+
"categorical_wrong_unigram_prob": 0.0,
|
| 534 |
+
"categorical_wrong_uniform_prob": 0.0,
|
| 535 |
+
"categorical_wrong_prob_floor": 0.0,
|
| 536 |
+
"categorical_wrong_corpus_unigram_path": "",
|
| 537 |
+
"categorical_wrong_corpus_unigram_alpha": 1.0,
|
| 538 |
+
"categorical_wrong_basin_shared_prob": 0.0,
|
| 539 |
+
"categorical_wrong_unigram_shared_prob": 0.0,
|
| 540 |
+
"mask_mixture_original_prob": 0.0,
|
| 541 |
+
"mask_mixture_lowk_prob": 0.0,
|
| 542 |
+
"mask_mixture_lowcorrupt_prob": 0.0,
|
| 543 |
+
"mask_mixture_block_prob": 0.0,
|
| 544 |
+
"mask_mixture_all_prob": 1.0,
|
| 545 |
+
"mask_mixture_lowk_clean_tokens": "0",
|
| 546 |
+
"mask_mixture_lowcorrupt_tokens": "1,2,4,8,16,32,64",
|
| 547 |
+
"mask_mixture_block_tokens": "64,128",
|
| 548 |
+
"simplex_bridge_sampler": "dirichlet",
|
| 549 |
+
"logistic_normal_sigma_min": 0.1,
|
| 550 |
+
"logistic_normal_sigma_max": 1.0,
|
| 551 |
+
"logistic_normal_tau_min": 1.0,
|
| 552 |
+
"logistic_normal_tau_max": 1.0,
|
| 553 |
+
"torch_compile": false,
|
| 554 |
+
"compile_mode": "max-autotune",
|
| 555 |
+
"state_format": "prob",
|
| 556 |
+
"meanflow_weight": 0.0,
|
| 557 |
+
"rollout_train_prob": 0.35,
|
| 558 |
+
"rollout_train_steps": 1,
|
| 559 |
+
"rollout_train_infer_steps": 1,
|
| 560 |
+
"rollout_train_time_mode": "sampled_s",
|
| 561 |
+
"rollout_train_s_dist": "uniform",
|
| 562 |
+
"rollout_train_s_min_frac": 0.0,
|
| 563 |
+
"rollout_train_s_max_frac": 0.25,
|
| 564 |
+
"rollout_train_s_beta_alpha": 2.0,
|
| 565 |
+
"rollout_train_s_beta_beta": 6.0,
|
| 566 |
+
"rollout_train_temp": 1.45,
|
| 567 |
+
"rollout_train_max_gamma": 1.0,
|
| 568 |
+
"rollout_train_corrupt_only": true,
|
| 569 |
+
"rollout_train_samplewise": true,
|
| 570 |
+
"rollout_train_compute_always": false,
|
| 571 |
+
"rollout_train_sync_t": true,
|
| 572 |
+
"bridge_noise_init": "logistic_normal",
|
| 573 |
+
"noise_sigma": -1.0,
|
| 574 |
+
"allow_tf32": true,
|
| 575 |
+
"activation_checkpointing": false,
|
| 576 |
+
"activation_checkpoint_interval": 1,
|
| 577 |
+
"activation_checkpoint_scope": "block",
|
| 578 |
+
"ddp_static_graph": false,
|
| 579 |
+
"ddp_gradient_as_bucket_view": true,
|
| 580 |
+
"blocking_data_transfer": false,
|
| 581 |
+
"dataloader_prefetch_factor": 4,
|
| 582 |
+
"full_train_stats": false,
|
| 583 |
+
"tokenized_hf": false,
|
| 584 |
+
"tokenized_pad_token": "pad",
|
| 585 |
+
"elf_conditional_hf": false,
|
| 586 |
+
"record_pad_truncate": false,
|
| 587 |
+
"record_add_eos": false,
|
| 588 |
+
"record_add_special_tokens": false,
|
| 589 |
+
"record_pad_token": "pad",
|
| 590 |
+
"record_shuffle_buffer": 10000,
|
| 591 |
+
"wrap": true,
|
| 592 |
+
"wrap_mode": "stream",
|
| 593 |
+
"wrap_record_buffer_size": 200,
|
| 594 |
+
"owt_cached_chunks": true,
|
| 595 |
+
"owt_chunk_cache_dir": "/e2e-data/evad-tech-vla/wanghan58/data/small_benchmarks/langflow_2604_11748/openwebtext_lta_cached_chunks/gpt2_len1024_train8_compact_overfit",
|
| 596 |
+
"owt_chunk_cache_rebuild": false,
|
| 597 |
+
"owt_chunk_cache_write_batch": 4096,
|
| 598 |
+
"owt_exact_repeat_per_chunk": 64,
|
| 599 |
+
"online_chunk_shuffle": false,
|
| 600 |
+
"online_chunk_shuffle_buffer": 10000,
|
| 601 |
+
"openwebtext_split": "train_minus_100k",
|
| 602 |
+
"detokenizer": "auto",
|
| 603 |
+
"resolved_detokenizer": null,
|
| 604 |
+
"num_workers": 0,
|
| 605 |
+
"latest_every": 1000,
|
| 606 |
+
"resume_path": "runs/train8_ctx1024_tradeoff_p35_unif0_0p25_outwdm1_ctx1024_tradeoff_dual_20260517_225705/latest.pt"
|
| 607 |
+
}
|
| 608 |
+
step=2100 epoch=2100/3000 epoch_step=1/1 micro_steps=2100 elapsed=14.7s lr=2.000000e-03 loss=0.6071 loss_recon=0.6071 loss_meanflow=0.0000 mean_model_t=0.2070 mean_corrupt_t=0.2070 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.3566 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.8207 corrupt_frac=1.0000 acc_corrupt=0.8207 loss_corrupt=0.6071 wrong_frac=0.7930 init_acc_corrupt=0.1295 acc_corrupt_t_0p0_0p2=0.6896 corrupt_frac_t_0p0_0p2=0.5646 acc_corrupt_t_0p2_0p4=0.9893 corrupt_frac_t_0p2_0p4=0.3537 acc_corrupt_t_0p4_0p6=0.9978 corrupt_frac_t_0p4_0p6=0.0731 acc_corrupt_t_0p6_0p8=0.9961 corrupt_frac_t_0p6_0p8=0.0131 out_w_norm=12.3759 out_g_norm=1.3563 acc_corrupt_t_0p8_1p0=0.9873 corrupt_frac_t_0p8_1p0=0.0078 loss_all=0.5944 init_gold_top10=0.4541 init_gold_top100=0.5788 rollout_applied_pos_frac=0.3750 init_acc_rollout_applied=0.1540 init_acc_rollout_kept=0.1284 logit_acc_rollout_applied=0.8264 logit_acc_rollout_kept=0.8117
|
| 609 |
+
step=2200 epoch=2200/3000 epoch_step=1/1 micro_steps=2200 elapsed=13.8s lr=2.000000e-03 loss=0.5238 loss_recon=0.5238 loss_meanflow=0.0000 mean_model_t=0.2093 mean_corrupt_t=0.2093 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.3467 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.8450 corrupt_frac=1.0000 acc_corrupt=0.8450 loss_corrupt=0.5238 wrong_frac=0.7905 init_acc_corrupt=0.1334 acc_corrupt_t_0p0_0p2=0.7252 corrupt_frac_t_0p0_0p2=0.5550 acc_corrupt_t_0p2_0p4=0.9936 corrupt_frac_t_0p2_0p4=0.3600 acc_corrupt_t_0p4_0p6=0.9980 corrupt_frac_t_0p4_0p6=0.0760 acc_corrupt_t_0p6_0p8=0.9972 corrupt_frac_t_0p6_0p8=0.0131 out_w_norm=12.3868 out_g_norm=1.3063 acc_corrupt_t_0p8_1p0=0.9883 corrupt_frac_t_0p8_1p0=0.0078 loss_all=0.3427 init_gold_top10=0.4658 init_gold_top100=0.5809 rollout_applied_pos_frac=0.3828 init_acc_rollout_applied=0.2087 init_acc_rollout_kept=0.1165 logit_acc_rollout_applied=0.8794 logit_acc_rollout_kept=0.9044
|
| 610 |
+
step=2300 epoch=2300/3000 epoch_step=1/1 micro_steps=2300 elapsed=13.8s lr=2.000000e-03 loss=0.4988 loss_recon=0.4988 loss_meanflow=0.0000 mean_model_t=0.2077 mean_corrupt_t=0.2077 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.3452 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.8515 corrupt_frac=1.0000 acc_corrupt=0.8515 loss_corrupt=0.4988 wrong_frac=0.7923 init_acc_corrupt=0.1331 acc_corrupt_t_0p0_0p2=0.7407 corrupt_frac_t_0p0_0p2=0.5677 acc_corrupt_t_0p2_0p4=0.9965 corrupt_frac_t_0p2_0p4=0.3483 acc_corrupt_t_0p4_0p6=0.9987 corrupt_frac_t_0p4_0p6=0.0743 acc_corrupt_t_0p6_0p8=0.9984 corrupt_frac_t_0p6_0p8=0.0139 out_w_norm=12.3557 out_g_norm=1.2158 acc_corrupt_t_0p8_1p0=0.9985 corrupt_frac_t_0p8_1p0=0.0078 loss_all=0.4199 init_gold_top10=0.3863 init_gold_top100=0.5180 rollout_applied_pos_frac=0.2891 init_acc_rollout_applied=0.1357 init_acc_rollout_kept=0.1125 logit_acc_rollout_applied=0.8559 logit_acc_rollout_kept=0.8958
|
| 611 |
+
step=2400 epoch=2400/3000 epoch_step=1/1 micro_steps=2400 elapsed=13.9s lr=2.000000e-03 loss=0.4452 loss_recon=0.4452 loss_meanflow=0.0000 mean_model_t=0.2094 mean_corrupt_t=0.2094 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.3509 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.8646 corrupt_frac=1.0000 acc_corrupt=0.8646 loss_corrupt=0.4452 wrong_frac=0.7904 init_acc_corrupt=0.1363 acc_corrupt_t_0p0_0p2=0.7581 corrupt_frac_t_0p0_0p2=0.5561 acc_corrupt_t_0p2_0p4=0.9976 corrupt_frac_t_0p2_0p4=0.3568 acc_corrupt_t_0p4_0p6=0.9989 corrupt_frac_t_0p4_0p6=0.0770 acc_corrupt_t_0p6_0p8=0.9987 corrupt_frac_t_0p6_0p8=0.0138 out_w_norm=12.3157 out_g_norm=1.0273 acc_corrupt_t_0p8_1p0=0.9968 corrupt_frac_t_0p8_1p0=0.0078 loss_all=0.5021 init_gold_top10=0.3939 init_gold_top100=0.5171 rollout_applied_pos_frac=0.2891 init_acc_rollout_applied=0.1624 init_acc_rollout_kept=0.1193 logit_acc_rollout_applied=0.9255 logit_acc_rollout_kept=0.8112
|
| 612 |
+
step=2500 epoch=2500/3000 epoch_step=1/1 micro_steps=2500 elapsed=13.9s lr=2.000000e-03 loss=0.4050 loss_recon=0.4050 loss_meanflow=0.0000 mean_model_t=0.2075 mean_corrupt_t=0.2075 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.3543 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.8737 corrupt_frac=1.0000 acc_corrupt=0.8737 loss_corrupt=0.4050 wrong_frac=0.7924 init_acc_corrupt=0.1349 acc_corrupt_t_0p0_0p2=0.7741 corrupt_frac_t_0p0_0p2=0.5574 acc_corrupt_t_0p2_0p4=0.9991 corrupt_frac_t_0p2_0p4=0.3599 acc_corrupt_t_0p4_0p6=0.9996 corrupt_frac_t_0p4_0p6=0.0752 acc_corrupt_t_0p6_0p8=0.9992 corrupt_frac_t_0p6_0p8=0.0118 out_w_norm=12.2846 out_g_norm=0.8842 acc_corrupt_t_0p8_1p0=0.9990 corrupt_frac_t_0p8_1p0=0.0078 loss_all=0.5535 init_gold_top10=0.4440 init_gold_top100=0.5682 rollout_applied_pos_frac=0.3594 init_acc_rollout_applied=0.1681 init_acc_rollout_kept=0.1267 logit_acc_rollout_applied=0.8005 logit_acc_rollout_kept=0.8542
|
| 613 |
+
step=2600 epoch=2600/3000 epoch_step=1/1 micro_steps=2600 elapsed=13.8s lr=2.000000e-03 loss=0.3667 loss_recon=0.3667 loss_meanflow=0.0000 mean_model_t=0.2080 mean_corrupt_t=0.2080 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.3501 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.8866 corrupt_frac=1.0000 acc_corrupt=0.8866 loss_corrupt=0.3667 wrong_frac=0.7919 init_acc_corrupt=0.1346 acc_corrupt_t_0p0_0p2=0.7975 corrupt_frac_t_0p0_0p2=0.5587 acc_corrupt_t_0p2_0p4=0.9993 corrupt_frac_t_0p2_0p4=0.3601 acc_corrupt_t_0p4_0p6=0.9996 corrupt_frac_t_0p4_0p6=0.0734 acc_corrupt_t_0p6_0p8=0.9992 corrupt_frac_t_0p6_0p8=0.0131 out_w_norm=12.2575 out_g_norm=0.8410 acc_corrupt_t_0p8_1p0=1.0000 corrupt_frac_t_0p8_1p0=0.0078 loss_all=0.2945 init_gold_top10=0.4462 init_gold_top100=0.5540 rollout_applied_pos_frac=0.3438 init_acc_rollout_applied=0.1552 init_acc_rollout_kept=0.1135 logit_acc_rollout_applied=0.8925 logit_acc_rollout_kept=0.9056
|
| 614 |
+
step=2700 epoch=2700/3000 epoch_step=1/1 micro_steps=2700 elapsed=13.8s lr=2.000000e-03 loss=0.3555 loss_recon=0.3555 loss_meanflow=0.0000 mean_model_t=0.2082 mean_corrupt_t=0.2082 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.3470 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.8894 corrupt_frac=1.0000 acc_corrupt=0.8894 loss_corrupt=0.3555 wrong_frac=0.7918 init_acc_corrupt=0.1361 acc_corrupt_t_0p0_0p2=0.8030 corrupt_frac_t_0p0_0p2=0.5605 acc_corrupt_t_0p2_0p4=0.9995 corrupt_frac_t_0p2_0p4=0.3566 acc_corrupt_t_0p4_0p6=0.9997 corrupt_frac_t_0p4_0p6=0.0748 acc_corrupt_t_0p6_0p8=0.9994 corrupt_frac_t_0p6_0p8=0.0125 out_w_norm=12.2373 out_g_norm=0.7415 acc_corrupt_t_0p8_1p0=1.0000 corrupt_frac_t_0p8_1p0=0.0078 loss_all=0.4078 init_gold_top10=0.4547 init_gold_top100=0.5600 rollout_applied_pos_frac=0.3516 init_acc_rollout_applied=0.2303 init_acc_rollout_kept=0.1155 logit_acc_rollout_applied=0.8906 logit_acc_rollout_kept=0.8987
|
| 615 |
+
step=2800 epoch=2800/3000 epoch_step=1/1 micro_steps=2800 elapsed=13.8s lr=2.000000e-03 loss=0.3279 loss_recon=0.3279 loss_meanflow=0.0000 mean_model_t=0.2085 mean_corrupt_t=0.2085 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.3430 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.8969 corrupt_frac=1.0000 acc_corrupt=0.8969 loss_corrupt=0.3279 wrong_frac=0.7914 init_acc_corrupt=0.1362 acc_corrupt_t_0p0_0p2=0.8168 corrupt_frac_t_0p0_0p2=0.5616 acc_corrupt_t_0p2_0p4=0.9994 corrupt_frac_t_0p2_0p4=0.3564 acc_corrupt_t_0p4_0p6=0.9997 corrupt_frac_t_0p4_0p6=0.0741 acc_corrupt_t_0p6_0p8=0.9994 corrupt_frac_t_0p6_0p8=0.0125 out_w_norm=12.2266 out_g_norm=0.7464 loss_all=0.3466 init_gold_top10=0.4644 init_gold_top100=0.5694 rollout_applied_pos_frac=0.3594 init_acc_rollout_applied=0.1603 init_acc_rollout_kept=0.1331 logit_acc_rollout_applied=0.8982 logit_acc_rollout_kept=0.8894
|
| 616 |
+
step=2900 epoch=2900/3000 epoch_step=1/1 micro_steps=2900 elapsed=13.8s lr=2.000000e-03 loss=0.3187 loss_recon=0.3187 loss_meanflow=0.0000 mean_model_t=0.2097 mean_corrupt_t=0.2097 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.3524 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.9008 corrupt_frac=1.0000 acc_corrupt=0.9008 loss_corrupt=0.3187 wrong_frac=0.7903 init_acc_corrupt=0.1369 acc_corrupt_t_0p0_0p2=0.8194 corrupt_frac_t_0p0_0p2=0.5486 acc_corrupt_t_0p2_0p4=0.9996 corrupt_frac_t_0p2_0p4=0.3675 acc_corrupt_t_0p4_0p6=0.9998 corrupt_frac_t_0p4_0p6=0.0754 out_w_norm=12.2105 out_g_norm=0.6139 acc_corrupt_t_0p6_0p8=0.9996 corrupt_frac_t_0p6_0p8=0.0131 acc_corrupt_t_0p8_1p0=0.9997 corrupt_frac_t_0p8_1p0=0.0078 loss_all=0.3607 init_gold_top10=0.4493 init_gold_top100=0.5688 rollout_applied_pos_frac=0.3594 init_acc_rollout_applied=0.1612 init_acc_rollout_kept=0.1251 logit_acc_rollout_applied=0.8823 logit_acc_rollout_kept=0.8900
|
| 617 |
+
step=3000 epoch=3000/3000 epoch_step=1/1 micro_steps=3000 elapsed=13.7s lr=2.000000e-03 loss=0.3103 loss_recon=0.3103 loss_meanflow=0.0000 mean_model_t=0.2090 mean_corrupt_t=0.2090 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.3507 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.9017 corrupt_frac=1.0000 acc_corrupt=0.9017 loss_corrupt=0.3103 wrong_frac=0.7909 init_acc_corrupt=0.1370 acc_corrupt_t_0p0_0p2=0.8231 corrupt_frac_t_0p0_0p2=0.5551 acc_corrupt_t_0p2_0p4=0.9997 corrupt_frac_t_0p2_0p4=0.3608 acc_corrupt_t_0p4_0p6=0.9998 corrupt_frac_t_0p4_0p6=0.0745 acc_corrupt_t_0p6_0p8=0.9993 corrupt_frac_t_0p6_0p8=0.0131 out_w_norm=12.2031 out_g_norm=0.6367 acc_corrupt_t_0p8_1p0=1.0000 corrupt_frac_t_0p8_1p0=0.0078 loss_all=0.2591 init_gold_top10=0.4073 init_gold_top100=0.5204 rollout_applied_pos_frac=0.3047 init_acc_rollout_applied=0.1303 init_acc_rollout_kept=0.1007 logit_acc_rollout_applied=0.8910 logit_acc_rollout_kept=0.9292
|
| 618 |
+
NCCL version 2.25.1+cuda12.8
|
| 619 |
+
resumed_from=runs/train8_ctx1024_tradeoff_p35_unif0_0p25_outwdm1_ctx1024_tradeoff_dual_20260517_225705/latest.pt start_step=3001
|
| 620 |
+
{
|
| 621 |
+
"device": "cuda:0",
|
| 622 |
+
"rank": 0,
|
| 623 |
+
"world_size": 4,
|
| 624 |
+
"samples": "owt_cached_chunks:8",
|
| 625 |
+
"vocab_size": 2664,
|
| 626 |
+
"tokenizer_vocab_size": 50257,
|
| 627 |
+
"save_dir": "runs/train8_ctx1024_tradeoff_p35_unif0_0p25_outwdm1_ctx1024_tradeoff_dual_20260517_225705",
|
| 628 |
+
"batch_size": 128,
|
| 629 |
+
"grad_accum": 1,
|
| 630 |
+
"effective_batch_size": 512,
|
| 631 |
+
"global_batch_size": 512,
|
| 632 |
+
"lr_schedule": "constant_warmup",
|
| 633 |
+
"optimizer": "muon",
|
| 634 |
+
"epochs": 0.0,
|
| 635 |
+
"steps_per_epoch": 1,
|
| 636 |
+
"total_steps": 4000,
|
| 637 |
+
"warmup_steps": 10,
|
| 638 |
+
"warmup_epochs": -1.0,
|
| 639 |
+
"min_lr": 0.0,
|
| 640 |
+
"weight_decay": 0.1,
|
| 641 |
+
"output_weight_decay": -1.0,
|
| 642 |
+
"adamw_param_groups": "nanogpt",
|
| 643 |
+
"adam_beta1": 0.9,
|
| 644 |
+
"adam_beta2": 0.95,
|
| 645 |
+
"adam_eps": 1e-08,
|
| 646 |
+
"muon_impl": "legacy",
|
| 647 |
+
"muon_momentum": 0.95,
|
| 648 |
+
"muon_ns_steps": 5,
|
| 649 |
+
"muon_update_scale": 1.0,
|
| 650 |
+
"muon_nesterov": false,
|
| 651 |
+
"muon_width_scale": false,
|
| 652 |
+
"muon_grouping": "legacy_dim_ge_2",
|
| 653 |
+
"muon_param_count": 2616320,
|
| 654 |
+
"muon_adam_param_count": 8192,
|
| 655 |
+
"muon_param_names": [
|
| 656 |
+
"vocab_embed.embedding",
|
| 657 |
+
"sigma_map.net.0.weight",
|
| 658 |
+
"sigma_map.net.2.weight",
|
| 659 |
+
"blocks.0.attn_qkv.weight",
|
| 660 |
+
"blocks.0.attn_out.weight",
|
| 661 |
+
"blocks.0.mlp.0.weight",
|
| 662 |
+
"blocks.0.mlp.2.weight",
|
| 663 |
+
"blocks.0.adaLN_modulation.weight",
|
| 664 |
+
"blocks.1.attn_qkv.weight",
|
| 665 |
+
"blocks.1.attn_out.weight",
|
| 666 |
+
"blocks.1.mlp.0.weight",
|
| 667 |
+
"blocks.1.mlp.2.weight",
|
| 668 |
+
"blocks.1.adaLN_modulation.weight",
|
| 669 |
+
"blocks.2.attn_qkv.weight",
|
| 670 |
+
"blocks.2.attn_out.weight",
|
| 671 |
+
"blocks.2.mlp.0.weight",
|
| 672 |
+
"blocks.2.mlp.2.weight",
|
| 673 |
+
"blocks.2.adaLN_modulation.weight",
|
| 674 |
+
"output_layer.linear.weight",
|
| 675 |
+
"output_layer.adaLN_modulation.weight"
|
| 676 |
+
],
|
| 677 |
+
"muon_adam_param_names": [
|
| 678 |
+
"sigma_map.net.0.bias",
|
| 679 |
+
"sigma_map.net.2.bias",
|
| 680 |
+
"blocks.0.norm1.weight",
|
| 681 |
+
"blocks.0.norm2.weight",
|
| 682 |
+
"blocks.0.mlp.0.bias",
|
| 683 |
+
"blocks.0.mlp.2.bias",
|
| 684 |
+
"blocks.0.adaLN_modulation.bias",
|
| 685 |
+
"blocks.1.norm1.weight",
|
| 686 |
+
"blocks.1.norm2.weight",
|
| 687 |
+
"blocks.1.mlp.0.bias",
|
| 688 |
+
"blocks.1.mlp.2.bias",
|
| 689 |
+
"blocks.1.adaLN_modulation.bias",
|
| 690 |
+
"blocks.2.norm1.weight",
|
| 691 |
+
"blocks.2.norm2.weight",
|
| 692 |
+
"blocks.2.mlp.0.bias",
|
| 693 |
+
"blocks.2.mlp.2.bias",
|
| 694 |
+
"blocks.2.adaLN_modulation.bias",
|
| 695 |
+
"output_layer.norm_final.weight",
|
| 696 |
+
"output_layer.adaLN_modulation.bias"
|
| 697 |
+
],
|
| 698 |
+
"muon_effective_nesterov": false,
|
| 699 |
+
"muon_effective_width_scale": false,
|
| 700 |
+
"muon_effective_weight_decay": 0.1,
|
| 701 |
+
"muon_adam_fallback_nesterov": false,
|
| 702 |
+
"muon_adam_fallback_weight_decay": 0.1,
|
| 703 |
+
"ema_decay": 0.9999,
|
| 704 |
+
"ema_start_step": 0,
|
| 705 |
+
"model_type": "ddit",
|
| 706 |
+
"ddit_mlp_type": "gelu",
|
| 707 |
+
"elf_num_time_tokens": 4,
|
| 708 |
+
"elf_num_model_mode_tokens": 0,
|
| 709 |
+
"qk_norm": true,
|
| 710 |
+
"output_bias": false,
|
| 711 |
+
"output_init_std": -1.0,
|
| 712 |
+
"norm_type": "rmsnorm",
|
| 713 |
+
"target_loss": "hard_ce",
|
| 714 |
+
"linear_soft_target_power": 1.0,
|
| 715 |
+
"linear_soft_target_min_conf": 0.0,
|
| 716 |
+
"linear_soft_target_max_conf": 1.0,
|
| 717 |
+
"t_sampling_mode": "logit_normal",
|
| 718 |
+
"t_sampling_power": 1.0,
|
| 719 |
+
"t_sampling_eps": 0.0001,
|
| 720 |
+
"t_sampling_logit_mean": -1.5,
|
| 721 |
+
"t_sampling_logit_std": 0.8,
|
| 722 |
+
"dual_t": true,
|
| 723 |
+
"corrupt_t_mode": "same",
|
| 724 |
+
"corrupt_min_t": 0.0,
|
| 725 |
+
"corrupt_max_t": 1.0,
|
| 726 |
+
"prefix_block_prob": 0.0,
|
| 727 |
+
"prefix_block_len": 128,
|
| 728 |
+
"mask_ratio_floor_schedule": "none",
|
| 729 |
+
"dirichlet_endpoint_mode": "categorical_dual_t",
|
| 730 |
+
"dirichlet_semantic_t_mode": "same",
|
| 731 |
+
"dirichlet_semantic_t_value": 0.0,
|
| 732 |
+
"dirichlet_semantic_t_curve": "linear",
|
| 733 |
+
"dirichlet_semantic_t_power": 1.0,
|
| 734 |
+
"endpoint_sequence_random_prob_alpha": 0.0,
|
| 735 |
+
"categorical_wrong_from_full_vocab": true,
|
| 736 |
+
"categorical_wrong_from_batch_valid_tokens": false,
|
| 737 |
+
"categorical_wrong_basin_token_ids": "",
|
| 738 |
+
"categorical_wrong_basin_prob": 0.0,
|
| 739 |
+
"categorical_wrong_unigram_prob": 0.0,
|
| 740 |
+
"categorical_wrong_uniform_prob": 0.0,
|
| 741 |
+
"categorical_wrong_prob_floor": 0.0,
|
| 742 |
+
"categorical_wrong_corpus_unigram_path": "",
|
| 743 |
+
"categorical_wrong_corpus_unigram_alpha": 1.0,
|
| 744 |
+
"categorical_wrong_basin_shared_prob": 0.0,
|
| 745 |
+
"categorical_wrong_unigram_shared_prob": 0.0,
|
| 746 |
+
"mask_mixture_original_prob": 0.0,
|
| 747 |
+
"mask_mixture_lowk_prob": 0.0,
|
| 748 |
+
"mask_mixture_lowcorrupt_prob": 0.0,
|
| 749 |
+
"mask_mixture_block_prob": 0.0,
|
| 750 |
+
"mask_mixture_all_prob": 1.0,
|
| 751 |
+
"mask_mixture_lowk_clean_tokens": "0",
|
| 752 |
+
"mask_mixture_lowcorrupt_tokens": "1,2,4,8,16,32,64",
|
| 753 |
+
"mask_mixture_block_tokens": "64,128",
|
| 754 |
+
"simplex_bridge_sampler": "dirichlet",
|
| 755 |
+
"logistic_normal_sigma_min": 0.1,
|
| 756 |
+
"logistic_normal_sigma_max": 1.0,
|
| 757 |
+
"logistic_normal_tau_min": 1.0,
|
| 758 |
+
"logistic_normal_tau_max": 1.0,
|
| 759 |
+
"torch_compile": false,
|
| 760 |
+
"compile_mode": "max-autotune",
|
| 761 |
+
"state_format": "prob",
|
| 762 |
+
"meanflow_weight": 0.0,
|
| 763 |
+
"rollout_train_prob": 0.35,
|
| 764 |
+
"rollout_train_steps": 1,
|
| 765 |
+
"rollout_train_infer_steps": 1,
|
| 766 |
+
"rollout_train_time_mode": "sampled_s",
|
| 767 |
+
"rollout_train_s_dist": "uniform",
|
| 768 |
+
"rollout_train_s_min_frac": 0.0,
|
| 769 |
+
"rollout_train_s_max_frac": 0.25,
|
| 770 |
+
"rollout_train_s_beta_alpha": 2.0,
|
| 771 |
+
"rollout_train_s_beta_beta": 6.0,
|
| 772 |
+
"rollout_train_temp": 1.45,
|
| 773 |
+
"rollout_train_max_gamma": 1.0,
|
| 774 |
+
"rollout_train_corrupt_only": true,
|
| 775 |
+
"rollout_train_samplewise": true,
|
| 776 |
+
"rollout_train_compute_always": false,
|
| 777 |
+
"rollout_train_sync_t": true,
|
| 778 |
+
"bridge_noise_init": "logistic_normal",
|
| 779 |
+
"noise_sigma": -1.0,
|
| 780 |
+
"allow_tf32": true,
|
| 781 |
+
"activation_checkpointing": false,
|
| 782 |
+
"activation_checkpoint_interval": 1,
|
| 783 |
+
"activation_checkpoint_scope": "block",
|
| 784 |
+
"ddp_static_graph": false,
|
| 785 |
+
"ddp_gradient_as_bucket_view": true,
|
| 786 |
+
"blocking_data_transfer": false,
|
| 787 |
+
"dataloader_prefetch_factor": 4,
|
| 788 |
+
"full_train_stats": false,
|
| 789 |
+
"tokenized_hf": false,
|
| 790 |
+
"tokenized_pad_token": "pad",
|
| 791 |
+
"elf_conditional_hf": false,
|
| 792 |
+
"record_pad_truncate": false,
|
| 793 |
+
"record_add_eos": false,
|
| 794 |
+
"record_add_special_tokens": false,
|
| 795 |
+
"record_pad_token": "pad",
|
| 796 |
+
"record_shuffle_buffer": 10000,
|
| 797 |
+
"wrap": true,
|
| 798 |
+
"wrap_mode": "stream",
|
| 799 |
+
"wrap_record_buffer_size": 200,
|
| 800 |
+
"owt_cached_chunks": true,
|
| 801 |
+
"owt_chunk_cache_dir": "/e2e-data/evad-tech-vla/wanghan58/data/small_benchmarks/langflow_2604_11748/openwebtext_lta_cached_chunks/gpt2_len1024_train8_compact_overfit",
|
| 802 |
+
"owt_chunk_cache_rebuild": false,
|
| 803 |
+
"owt_chunk_cache_write_batch": 4096,
|
| 804 |
+
"owt_exact_repeat_per_chunk": 64,
|
| 805 |
+
"online_chunk_shuffle": false,
|
| 806 |
+
"online_chunk_shuffle_buffer": 10000,
|
| 807 |
+
"openwebtext_split": "train_minus_100k",
|
| 808 |
+
"detokenizer": "auto",
|
| 809 |
+
"resolved_detokenizer": null,
|
| 810 |
+
"num_workers": 0,
|
| 811 |
+
"latest_every": 1000,
|
| 812 |
+
"resume_path": "runs/train8_ctx1024_tradeoff_p35_unif0_0p25_outwdm1_ctx1024_tradeoff_dual_20260517_225705/latest.pt"
|
| 813 |
+
}
|
| 814 |
+
step=3100 epoch=3100/4000 epoch_step=1/1 micro_steps=3100 elapsed=14.5s lr=2.000000e-03 loss=0.2988 loss_recon=0.2988 loss_meanflow=0.0000 mean_model_t=0.2070 mean_corrupt_t=0.2070 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.3566 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.9057 corrupt_frac=1.0000 acc_corrupt=0.9057 loss_corrupt=0.2988 wrong_frac=0.7930 init_acc_corrupt=0.1349 acc_corrupt_t_0p0_0p2=0.8331 corrupt_frac_t_0p0_0p2=0.5646 acc_corrupt_t_0p2_0p4=0.9999 corrupt_frac_t_0p2_0p4=0.3537 acc_corrupt_t_0p4_0p6=0.9999 corrupt_frac_t_0p4_0p6=0.0731 acc_corrupt_t_0p6_0p8=0.9998 corrupt_frac_t_0p6_0p8=0.0131 out_w_norm=12.2055 out_g_norm=0.5154 acc_corrupt_t_0p8_1p0=1.0000 corrupt_frac_t_0p8_1p0=0.0078 loss_all=0.4181 init_gold_top10=0.4698 init_gold_top100=0.5788 rollout_applied_pos_frac=0.3750 init_acc_rollout_applied=0.1684 init_acc_rollout_kept=0.1284 logit_acc_rollout_applied=0.8597 logit_acc_rollout_kept=0.8469
|
| 815 |
+
step=3200 epoch=3200/4000 epoch_step=1/1 micro_steps=3200 elapsed=13.5s lr=2.000000e-03 loss=0.2808 loss_recon=0.2808 loss_meanflow=0.0000 mean_model_t=0.2093 mean_corrupt_t=0.2093 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.3467 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.9119 corrupt_frac=1.0000 acc_corrupt=0.9119 loss_corrupt=0.2808 wrong_frac=0.7905 init_acc_corrupt=0.1374 acc_corrupt_t_0p0_0p2=0.8414 corrupt_frac_t_0p0_0p2=0.5550 acc_corrupt_t_0p2_0p4=0.9998 corrupt_frac_t_0p2_0p4=0.3600 acc_corrupt_t_0p4_0p6=0.9999 corrupt_frac_t_0p4_0p6=0.0760 acc_corrupt_t_0p6_0p8=0.9995 corrupt_frac_t_0p6_0p8=0.0131 out_w_norm=12.2055 out_g_norm=0.6080 acc_corrupt_t_0p8_1p0=0.9980 corrupt_frac_t_0p8_1p0=0.0078 loss_all=0.1874 init_gold_top10=0.4693 init_gold_top100=0.5809 rollout_applied_pos_frac=0.3828 init_acc_rollout_applied=0.2221 init_acc_rollout_kept=0.1165 logit_acc_rollout_applied=0.8865 logit_acc_rollout_kept=0.9701
|
| 816 |
+
step=3300 epoch=3300/4000 epoch_step=1/1 micro_steps=3300 elapsed=13.5s lr=2.000000e-03 loss=0.2906 loss_recon=0.2906 loss_meanflow=0.0000 mean_model_t=0.2077 mean_corrupt_t=0.2077 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.3452 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.9074 corrupt_frac=1.0000 acc_corrupt=0.9074 loss_corrupt=0.2906 wrong_frac=0.7923 init_acc_corrupt=0.1364 acc_corrupt_t_0p0_0p2=0.8369 corrupt_frac_t_0p0_0p2=0.5677 acc_corrupt_t_0p2_0p4=0.9999 corrupt_frac_t_0p2_0p4=0.3483 acc_corrupt_t_0p4_0p6=0.9999 corrupt_frac_t_0p4_0p6=0.0743 acc_corrupt_t_0p6_0p8=0.9998 corrupt_frac_t_0p6_0p8=0.0139 out_w_norm=12.2204 out_g_norm=0.4741 acc_corrupt_t_0p8_1p0=1.0000 corrupt_frac_t_0p8_1p0=0.0078 loss_all=0.3067 init_gold_top10=0.4022 init_gold_top100=0.5180 rollout_applied_pos_frac=0.2891 init_acc_rollout_applied=0.1471 init_acc_rollout_kept=0.1125 logit_acc_rollout_applied=0.9373 logit_acc_rollout_kept=0.8873
|
| 817 |
+
step=3400 epoch=3400/4000 epoch_step=1/1 micro_steps=3400 elapsed=13.6s lr=2.000000e-03 loss=0.2800 loss_recon=0.2800 loss_meanflow=0.0000 mean_model_t=0.2094 mean_corrupt_t=0.2094 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.3509 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.9107 corrupt_frac=1.0000 acc_corrupt=0.9107 loss_corrupt=0.2800 wrong_frac=0.7904 init_acc_corrupt=0.1393 acc_corrupt_t_0p0_0p2=0.8396 corrupt_frac_t_0p0_0p2=0.5561 acc_corrupt_t_0p2_0p4=0.9999 corrupt_frac_t_0p2_0p4=0.3568 acc_corrupt_t_0p4_0p6=0.9999 corrupt_frac_t_0p4_0p6=0.0770 acc_corrupt_t_0p6_0p8=0.9998 corrupt_frac_t_0p6_0p8=0.0138 out_w_norm=12.2315 out_g_norm=0.4718 acc_corrupt_t_0p8_1p0=0.9988 corrupt_frac_t_0p8_1p0=0.0078 loss_all=0.4144 init_gold_top10=0.3943 init_gold_top100=0.5171 rollout_applied_pos_frac=0.2891 init_acc_rollout_applied=0.1679 init_acc_rollout_kept=0.1193 logit_acc_rollout_applied=0.9457 logit_acc_rollout_kept=0.8529
|
| 818 |
+
step=3500 epoch=3500/4000 epoch_step=1/1 micro_steps=3500 elapsed=13.6s lr=2.000000e-03 loss=0.2717 loss_recon=0.2717 loss_meanflow=0.0000 mean_model_t=0.2075 mean_corrupt_t=0.2075 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.3543 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.9142 corrupt_frac=1.0000 acc_corrupt=0.9142 loss_corrupt=0.2717 wrong_frac=0.7924 init_acc_corrupt=0.1371 acc_corrupt_t_0p0_0p2=0.8463 corrupt_frac_t_0p0_0p2=0.5574 acc_corrupt_t_0p2_0p4=0.9999 corrupt_frac_t_0p2_0p4=0.3599 acc_corrupt_t_0p4_0p6=0.9999 corrupt_frac_t_0p4_0p6=0.0752 acc_corrupt_t_0p6_0p8=0.9997 corrupt_frac_t_0p6_0p8=0.0118 out_w_norm=12.2636 out_g_norm=0.4406 acc_corrupt_t_0p8_1p0=0.9990 corrupt_frac_t_0p8_1p0=0.0078 loss_all=0.3554 init_gold_top10=0.4580 init_gold_top100=0.5682 rollout_applied_pos_frac=0.3594 init_acc_rollout_applied=0.1740 init_acc_rollout_kept=0.1267 logit_acc_rollout_applied=0.8787 logit_acc_rollout_kept=0.8976
|
| 819 |
+
step=3600 epoch=3600/4000 epoch_step=1/1 micro_steps=3600 elapsed=13.5s lr=2.000000e-03 loss=0.2541 loss_recon=0.2541 loss_meanflow=0.0000 mean_model_t=0.2080 mean_corrupt_t=0.2080 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.3501 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.9199 corrupt_frac=1.0000 acc_corrupt=0.9199 loss_corrupt=0.2541 wrong_frac=0.7919 init_acc_corrupt=0.1365 acc_corrupt_t_0p0_0p2=0.8567 corrupt_frac_t_0p0_0p2=0.5587 acc_corrupt_t_0p2_0p4=0.9998 corrupt_frac_t_0p2_0p4=0.3601 acc_corrupt_t_0p4_0p6=0.9999 corrupt_frac_t_0p4_0p6=0.0734 acc_corrupt_t_0p6_0p8=0.9997 corrupt_frac_t_0p6_0p8=0.0131 out_w_norm=12.2850 out_g_norm=0.3934 acc_corrupt_t_0p8_1p0=1.0000 corrupt_frac_t_0p8_1p0=0.0078 loss_all=0.2026 init_gold_top10=0.4512 init_gold_top100=0.5540 rollout_applied_pos_frac=0.3438 init_acc_rollout_applied=0.1588 init_acc_rollout_kept=0.1135 logit_acc_rollout_applied=0.9273 logit_acc_rollout_kept=0.9316
|
| 820 |
+
step=3700 epoch=3700/4000 epoch_step=1/1 micro_steps=3700 elapsed=13.5s lr=2.000000e-03 loss=0.2640 loss_recon=0.2640 loss_meanflow=0.0000 mean_model_t=0.2082 mean_corrupt_t=0.2082 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.3470 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.9161 corrupt_frac=1.0000 acc_corrupt=0.9161 loss_corrupt=0.2640 wrong_frac=0.7918 init_acc_corrupt=0.1378 acc_corrupt_t_0p0_0p2=0.8504 corrupt_frac_t_0p0_0p2=0.5605 acc_corrupt_t_0p2_0p4=0.9999 corrupt_frac_t_0p2_0p4=0.3566 acc_corrupt_t_0p4_0p6=0.9999 corrupt_frac_t_0p4_0p6=0.0748 acc_corrupt_t_0p6_0p8=0.9998 corrupt_frac_t_0p6_0p8=0.0125 out_w_norm=12.3074 out_g_norm=0.3603 acc_corrupt_t_0p8_1p0=1.0000 corrupt_frac_t_0p8_1p0=0.0078 loss_all=0.2453 init_gold_top10=0.4635 init_gold_top100=0.5600 rollout_applied_pos_frac=0.3516 init_acc_rollout_applied=0.2373 init_acc_rollout_kept=0.1155 logit_acc_rollout_applied=0.9115 logit_acc_rollout_kept=0.9218
|
| 821 |
+
step=3800 epoch=3800/4000 epoch_step=1/1 micro_steps=3800 elapsed=13.5s lr=2.000000e-03 loss=0.2382 loss_recon=0.2382 loss_meanflow=0.0000 mean_model_t=0.2085 mean_corrupt_t=0.2085 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.3430 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.9229 corrupt_frac=1.0000 acc_corrupt=0.9229 loss_corrupt=0.2382 wrong_frac=0.7914 init_acc_corrupt=0.1377 acc_corrupt_t_0p0_0p2=0.8628 corrupt_frac_t_0p0_0p2=0.5616 acc_corrupt_t_0p2_0p4=0.9999 corrupt_frac_t_0p2_0p4=0.3564 acc_corrupt_t_0p4_0p6=0.9999 corrupt_frac_t_0p4_0p6=0.0741 acc_corrupt_t_0p6_0p8=0.9998 corrupt_frac_t_0p6_0p8=0.0125 out_w_norm=12.3346 out_g_norm=0.3186 loss_all=0.2397 init_gold_top10=0.4725 init_gold_top100=0.5694 rollout_applied_pos_frac=0.3594 init_acc_rollout_applied=0.1680 init_acc_rollout_kept=0.1331 logit_acc_rollout_applied=0.9598 logit_acc_rollout_kept=0.9054
|
| 822 |
+
step=3900 epoch=3900/4000 epoch_step=1/1 micro_steps=3900 elapsed=13.6s lr=2.000000e-03 loss=0.2536 loss_recon=0.2536 loss_meanflow=0.0000 mean_model_t=0.2097 mean_corrupt_t=0.2097 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.3524 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.9199 corrupt_frac=1.0000 acc_corrupt=0.9199 loss_corrupt=0.2536 wrong_frac=0.7903 init_acc_corrupt=0.1382 acc_corrupt_t_0p0_0p2=0.8541 corrupt_frac_t_0p0_0p2=0.5486 acc_corrupt_t_0p2_0p4=0.9999 corrupt_frac_t_0p2_0p4=0.3675 acc_corrupt_t_0p4_0p6=0.9999 corrupt_frac_t_0p4_0p6=0.0754 out_w_norm=12.3579 out_g_norm=0.3368 acc_corrupt_t_0p6_0p8=0.9998 corrupt_frac_t_0p6_0p8=0.0131 acc_corrupt_t_0p8_1p0=0.9993 corrupt_frac_t_0p8_1p0=0.0078 loss_all=0.2855 init_gold_top10=0.4527 init_gold_top100=0.5688 rollout_applied_pos_frac=0.3594 init_acc_rollout_applied=0.1643 init_acc_rollout_kept=0.1251 logit_acc_rollout_applied=0.8880 logit_acc_rollout_kept=0.9133
|
| 823 |
+
step=4000 epoch=4000/4000 epoch_step=1/1 micro_steps=4000 elapsed=13.5s lr=2.000000e-03 loss=0.2502 loss_recon=0.2502 loss_meanflow=0.0000 mean_model_t=0.2090 mean_corrupt_t=0.2090 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.3507 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.9205 corrupt_frac=1.0000 acc_corrupt=0.9205 loss_corrupt=0.2502 wrong_frac=0.7909 init_acc_corrupt=0.1380 acc_corrupt_t_0p0_0p2=0.8569 corrupt_frac_t_0p0_0p2=0.5551 acc_corrupt_t_0p2_0p4=0.9999 corrupt_frac_t_0p2_0p4=0.3608 acc_corrupt_t_0p4_0p6=0.9999 corrupt_frac_t_0p4_0p6=0.0745 acc_corrupt_t_0p6_0p8=0.9997 corrupt_frac_t_0p6_0p8=0.0131 out_w_norm=12.3747 out_g_norm=0.3521 acc_corrupt_t_0p8_1p0=0.9990 corrupt_frac_t_0p8_1p0=0.0078 loss_all=0.2211 init_gold_top10=0.4130 init_gold_top100=0.5204 rollout_applied_pos_frac=0.3047 init_acc_rollout_applied=0.1318 init_acc_rollout_kept=0.1007 logit_acc_rollout_applied=0.9164 logit_acc_rollout_kept=0.9363
|
| 824 |
+
NCCL version 2.25.1+cuda12.8
|
| 825 |
+
resumed_from=runs/train8_ctx1024_tradeoff_p35_unif0_0p25_outwdm1_ctx1024_tradeoff_dual_20260517_225705/latest.pt start_step=4001
|
| 826 |
+
{
|
| 827 |
+
"device": "cuda:0",
|
| 828 |
+
"rank": 0,
|
| 829 |
+
"world_size": 4,
|
| 830 |
+
"samples": "owt_cached_chunks:8",
|
| 831 |
+
"vocab_size": 2664,
|
| 832 |
+
"tokenizer_vocab_size": 50257,
|
| 833 |
+
"save_dir": "runs/train8_ctx1024_tradeoff_p35_unif0_0p25_outwdm1_ctx1024_tradeoff_dual_20260517_225705",
|
| 834 |
+
"batch_size": 128,
|
| 835 |
+
"grad_accum": 1,
|
| 836 |
+
"effective_batch_size": 512,
|
| 837 |
+
"global_batch_size": 512,
|
| 838 |
+
"lr_schedule": "constant_warmup",
|
| 839 |
+
"optimizer": "muon",
|
| 840 |
+
"epochs": 0.0,
|
| 841 |
+
"steps_per_epoch": 1,
|
| 842 |
+
"total_steps": 5000,
|
| 843 |
+
"warmup_steps": 10,
|
| 844 |
+
"warmup_epochs": -1.0,
|
| 845 |
+
"min_lr": 0.0,
|
| 846 |
+
"weight_decay": 0.1,
|
| 847 |
+
"output_weight_decay": -1.0,
|
| 848 |
+
"adamw_param_groups": "nanogpt",
|
| 849 |
+
"adam_beta1": 0.9,
|
| 850 |
+
"adam_beta2": 0.95,
|
| 851 |
+
"adam_eps": 1e-08,
|
| 852 |
+
"muon_impl": "legacy",
|
| 853 |
+
"muon_momentum": 0.95,
|
| 854 |
+
"muon_ns_steps": 5,
|
| 855 |
+
"muon_update_scale": 1.0,
|
| 856 |
+
"muon_nesterov": false,
|
| 857 |
+
"muon_width_scale": false,
|
| 858 |
+
"muon_grouping": "legacy_dim_ge_2",
|
| 859 |
+
"muon_param_count": 2616320,
|
| 860 |
+
"muon_adam_param_count": 8192,
|
| 861 |
+
"muon_param_names": [
|
| 862 |
+
"vocab_embed.embedding",
|
| 863 |
+
"sigma_map.net.0.weight",
|
| 864 |
+
"sigma_map.net.2.weight",
|
| 865 |
+
"blocks.0.attn_qkv.weight",
|
| 866 |
+
"blocks.0.attn_out.weight",
|
| 867 |
+
"blocks.0.mlp.0.weight",
|
| 868 |
+
"blocks.0.mlp.2.weight",
|
| 869 |
+
"blocks.0.adaLN_modulation.weight",
|
| 870 |
+
"blocks.1.attn_qkv.weight",
|
| 871 |
+
"blocks.1.attn_out.weight",
|
| 872 |
+
"blocks.1.mlp.0.weight",
|
| 873 |
+
"blocks.1.mlp.2.weight",
|
| 874 |
+
"blocks.1.adaLN_modulation.weight",
|
| 875 |
+
"blocks.2.attn_qkv.weight",
|
| 876 |
+
"blocks.2.attn_out.weight",
|
| 877 |
+
"blocks.2.mlp.0.weight",
|
| 878 |
+
"blocks.2.mlp.2.weight",
|
| 879 |
+
"blocks.2.adaLN_modulation.weight",
|
| 880 |
+
"output_layer.linear.weight",
|
| 881 |
+
"output_layer.adaLN_modulation.weight"
|
| 882 |
+
],
|
| 883 |
+
"muon_adam_param_names": [
|
| 884 |
+
"sigma_map.net.0.bias",
|
| 885 |
+
"sigma_map.net.2.bias",
|
| 886 |
+
"blocks.0.norm1.weight",
|
| 887 |
+
"blocks.0.norm2.weight",
|
| 888 |
+
"blocks.0.mlp.0.bias",
|
| 889 |
+
"blocks.0.mlp.2.bias",
|
| 890 |
+
"blocks.0.adaLN_modulation.bias",
|
| 891 |
+
"blocks.1.norm1.weight",
|
| 892 |
+
"blocks.1.norm2.weight",
|
| 893 |
+
"blocks.1.mlp.0.bias",
|
| 894 |
+
"blocks.1.mlp.2.bias",
|
| 895 |
+
"blocks.1.adaLN_modulation.bias",
|
| 896 |
+
"blocks.2.norm1.weight",
|
| 897 |
+
"blocks.2.norm2.weight",
|
| 898 |
+
"blocks.2.mlp.0.bias",
|
| 899 |
+
"blocks.2.mlp.2.bias",
|
| 900 |
+
"blocks.2.adaLN_modulation.bias",
|
| 901 |
+
"output_layer.norm_final.weight",
|
| 902 |
+
"output_layer.adaLN_modulation.bias"
|
| 903 |
+
],
|
| 904 |
+
"muon_effective_nesterov": false,
|
| 905 |
+
"muon_effective_width_scale": false,
|
| 906 |
+
"muon_effective_weight_decay": 0.1,
|
| 907 |
+
"muon_adam_fallback_nesterov": false,
|
| 908 |
+
"muon_adam_fallback_weight_decay": 0.1,
|
| 909 |
+
"ema_decay": 0.9999,
|
| 910 |
+
"ema_start_step": 0,
|
| 911 |
+
"model_type": "ddit",
|
| 912 |
+
"ddit_mlp_type": "gelu",
|
| 913 |
+
"elf_num_time_tokens": 4,
|
| 914 |
+
"elf_num_model_mode_tokens": 0,
|
| 915 |
+
"qk_norm": true,
|
| 916 |
+
"output_bias": false,
|
| 917 |
+
"output_init_std": -1.0,
|
| 918 |
+
"norm_type": "rmsnorm",
|
| 919 |
+
"target_loss": "hard_ce",
|
| 920 |
+
"linear_soft_target_power": 1.0,
|
| 921 |
+
"linear_soft_target_min_conf": 0.0,
|
| 922 |
+
"linear_soft_target_max_conf": 1.0,
|
| 923 |
+
"t_sampling_mode": "logit_normal",
|
| 924 |
+
"t_sampling_power": 1.0,
|
| 925 |
+
"t_sampling_eps": 0.0001,
|
| 926 |
+
"t_sampling_logit_mean": -1.5,
|
| 927 |
+
"t_sampling_logit_std": 0.8,
|
| 928 |
+
"dual_t": true,
|
| 929 |
+
"corrupt_t_mode": "same",
|
| 930 |
+
"corrupt_min_t": 0.0,
|
| 931 |
+
"corrupt_max_t": 1.0,
|
| 932 |
+
"prefix_block_prob": 0.0,
|
| 933 |
+
"prefix_block_len": 128,
|
| 934 |
+
"mask_ratio_floor_schedule": "none",
|
| 935 |
+
"dirichlet_endpoint_mode": "categorical_dual_t",
|
| 936 |
+
"dirichlet_semantic_t_mode": "same",
|
| 937 |
+
"dirichlet_semantic_t_value": 0.0,
|
| 938 |
+
"dirichlet_semantic_t_curve": "linear",
|
| 939 |
+
"dirichlet_semantic_t_power": 1.0,
|
| 940 |
+
"endpoint_sequence_random_prob_alpha": 0.0,
|
| 941 |
+
"categorical_wrong_from_full_vocab": true,
|
| 942 |
+
"categorical_wrong_from_batch_valid_tokens": false,
|
| 943 |
+
"categorical_wrong_basin_token_ids": "",
|
| 944 |
+
"categorical_wrong_basin_prob": 0.0,
|
| 945 |
+
"categorical_wrong_unigram_prob": 0.0,
|
| 946 |
+
"categorical_wrong_uniform_prob": 0.0,
|
| 947 |
+
"categorical_wrong_prob_floor": 0.0,
|
| 948 |
+
"categorical_wrong_corpus_unigram_path": "",
|
| 949 |
+
"categorical_wrong_corpus_unigram_alpha": 1.0,
|
| 950 |
+
"categorical_wrong_basin_shared_prob": 0.0,
|
| 951 |
+
"categorical_wrong_unigram_shared_prob": 0.0,
|
| 952 |
+
"mask_mixture_original_prob": 0.0,
|
| 953 |
+
"mask_mixture_lowk_prob": 0.0,
|
| 954 |
+
"mask_mixture_lowcorrupt_prob": 0.0,
|
| 955 |
+
"mask_mixture_block_prob": 0.0,
|
| 956 |
+
"mask_mixture_all_prob": 1.0,
|
| 957 |
+
"mask_mixture_lowk_clean_tokens": "0",
|
| 958 |
+
"mask_mixture_lowcorrupt_tokens": "1,2,4,8,16,32,64",
|
| 959 |
+
"mask_mixture_block_tokens": "64,128",
|
| 960 |
+
"simplex_bridge_sampler": "dirichlet",
|
| 961 |
+
"logistic_normal_sigma_min": 0.1,
|
| 962 |
+
"logistic_normal_sigma_max": 1.0,
|
| 963 |
+
"logistic_normal_tau_min": 1.0,
|
| 964 |
+
"logistic_normal_tau_max": 1.0,
|
| 965 |
+
"torch_compile": false,
|
| 966 |
+
"compile_mode": "max-autotune",
|
| 967 |
+
"state_format": "prob",
|
| 968 |
+
"meanflow_weight": 0.0,
|
| 969 |
+
"rollout_train_prob": 0.35,
|
| 970 |
+
"rollout_train_steps": 1,
|
| 971 |
+
"rollout_train_infer_steps": 1,
|
| 972 |
+
"rollout_train_time_mode": "sampled_s",
|
| 973 |
+
"rollout_train_s_dist": "uniform",
|
| 974 |
+
"rollout_train_s_min_frac": 0.0,
|
| 975 |
+
"rollout_train_s_max_frac": 0.25,
|
| 976 |
+
"rollout_train_s_beta_alpha": 2.0,
|
| 977 |
+
"rollout_train_s_beta_beta": 6.0,
|
| 978 |
+
"rollout_train_temp": 1.45,
|
| 979 |
+
"rollout_train_max_gamma": 1.0,
|
| 980 |
+
"rollout_train_corrupt_only": true,
|
| 981 |
+
"rollout_train_samplewise": true,
|
| 982 |
+
"rollout_train_compute_always": false,
|
| 983 |
+
"rollout_train_sync_t": true,
|
| 984 |
+
"bridge_noise_init": "logistic_normal",
|
| 985 |
+
"noise_sigma": -1.0,
|
| 986 |
+
"allow_tf32": true,
|
| 987 |
+
"activation_checkpointing": false,
|
| 988 |
+
"activation_checkpoint_interval": 1,
|
| 989 |
+
"activation_checkpoint_scope": "block",
|
| 990 |
+
"ddp_static_graph": false,
|
| 991 |
+
"ddp_gradient_as_bucket_view": true,
|
| 992 |
+
"blocking_data_transfer": false,
|
| 993 |
+
"dataloader_prefetch_factor": 4,
|
| 994 |
+
"full_train_stats": false,
|
| 995 |
+
"tokenized_hf": false,
|
| 996 |
+
"tokenized_pad_token": "pad",
|
| 997 |
+
"elf_conditional_hf": false,
|
| 998 |
+
"record_pad_truncate": false,
|
| 999 |
+
"record_add_eos": false,
|
| 1000 |
+
"record_add_special_tokens": false,
|
| 1001 |
+
"record_pad_token": "pad",
|
| 1002 |
+
"record_shuffle_buffer": 10000,
|
| 1003 |
+
"wrap": true,
|
| 1004 |
+
"wrap_mode": "stream",
|
| 1005 |
+
"wrap_record_buffer_size": 200,
|
| 1006 |
+
"owt_cached_chunks": true,
|
| 1007 |
+
"owt_chunk_cache_dir": "/e2e-data/evad-tech-vla/wanghan58/data/small_benchmarks/langflow_2604_11748/openwebtext_lta_cached_chunks/gpt2_len1024_train8_compact_overfit",
|
| 1008 |
+
"owt_chunk_cache_rebuild": false,
|
| 1009 |
+
"owt_chunk_cache_write_batch": 4096,
|
| 1010 |
+
"owt_exact_repeat_per_chunk": 64,
|
| 1011 |
+
"online_chunk_shuffle": false,
|
| 1012 |
+
"online_chunk_shuffle_buffer": 10000,
|
| 1013 |
+
"openwebtext_split": "train_minus_100k",
|
| 1014 |
+
"detokenizer": "auto",
|
| 1015 |
+
"resolved_detokenizer": null,
|
| 1016 |
+
"num_workers": 0,
|
| 1017 |
+
"latest_every": 1000,
|
| 1018 |
+
"resume_path": "runs/train8_ctx1024_tradeoff_p35_unif0_0p25_outwdm1_ctx1024_tradeoff_dual_20260517_225705/latest.pt"
|
| 1019 |
+
}
|
| 1020 |
+
step=4100 epoch=4100/5000 epoch_step=1/1 micro_steps=4100 elapsed=14.7s lr=2.000000e-03 loss=0.2475 loss_recon=0.2475 loss_meanflow=0.0000 mean_model_t=0.2070 mean_corrupt_t=0.2070 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.3566 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.9226 corrupt_frac=1.0000 acc_corrupt=0.9226 loss_corrupt=0.2475 wrong_frac=0.7930 init_acc_corrupt=0.1356 acc_corrupt_t_0p0_0p2=0.8630 corrupt_frac_t_0p0_0p2=0.5646 acc_corrupt_t_0p2_0p4=0.9999 corrupt_frac_t_0p2_0p4=0.3537 acc_corrupt_t_0p4_0p6=1.0000 corrupt_frac_t_0p4_0p6=0.0731 acc_corrupt_t_0p6_0p8=0.9998 corrupt_frac_t_0p6_0p8=0.0131 out_w_norm=12.3913 out_g_norm=0.2801 acc_corrupt_t_0p8_1p0=1.0000 corrupt_frac_t_0p8_1p0=0.0078 loss_all=0.3210 init_gold_top10=0.4741 init_gold_top100=0.5788 rollout_applied_pos_frac=0.3750 init_acc_rollout_applied=0.1707 init_acc_rollout_kept=0.1284 logit_acc_rollout_applied=0.8816 logit_acc_rollout_kept=0.8989
|
| 1021 |
+
step=4200 epoch=4200/5000 epoch_step=1/1 micro_steps=4200 elapsed=13.8s lr=2.000000e-03 loss=0.2207 loss_recon=0.2207 loss_meanflow=0.0000 mean_model_t=0.2093 mean_corrupt_t=0.2093 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.3467 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.9305 corrupt_frac=1.0000 acc_corrupt=0.9305 loss_corrupt=0.2207 wrong_frac=0.7905 init_acc_corrupt=0.1383 acc_corrupt_t_0p0_0p2=0.8749 corrupt_frac_t_0p0_0p2=0.5550 acc_corrupt_t_0p2_0p4=0.9999 corrupt_frac_t_0p2_0p4=0.3600 acc_corrupt_t_0p4_0p6=0.9999 corrupt_frac_t_0p4_0p6=0.0760 acc_corrupt_t_0p6_0p8=0.9996 corrupt_frac_t_0p6_0p8=0.0131 out_w_norm=12.4189 out_g_norm=0.2572 acc_corrupt_t_0p8_1p0=1.0000 corrupt_frac_t_0p8_1p0=0.0078 loss_all=0.1595 init_gold_top10=0.4790 init_gold_top100=0.5809 rollout_applied_pos_frac=0.3828 init_acc_rollout_applied=0.2249 init_acc_rollout_kept=0.1165 logit_acc_rollout_applied=0.9164 logit_acc_rollout_kept=0.9773
|
| 1022 |
+
step=4300 epoch=4300/5000 epoch_step=1/1 micro_steps=4300 elapsed=13.8s lr=2.000000e-03 loss=0.2405 loss_recon=0.2405 loss_meanflow=0.0000 mean_model_t=0.2077 mean_corrupt_t=0.2077 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.3452 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.9226 corrupt_frac=1.0000 acc_corrupt=0.9226 loss_corrupt=0.2405 wrong_frac=0.7923 init_acc_corrupt=0.1372 acc_corrupt_t_0p0_0p2=0.8638 corrupt_frac_t_0p0_0p2=0.5677 acc_corrupt_t_0p2_0p4=0.9999 corrupt_frac_t_0p2_0p4=0.3483 acc_corrupt_t_0p4_0p6=0.9999 corrupt_frac_t_0p4_0p6=0.0743 acc_corrupt_t_0p6_0p8=0.9999 corrupt_frac_t_0p6_0p8=0.0139 out_w_norm=12.4505 out_g_norm=0.2480 acc_corrupt_t_0p8_1p0=0.9995 corrupt_frac_t_0p8_1p0=0.0078 loss_all=0.2279 init_gold_top10=0.3981 init_gold_top100=0.5180 rollout_applied_pos_frac=0.2891 init_acc_rollout_applied=0.1497 init_acc_rollout_kept=0.1125 logit_acc_rollout_applied=0.9164 logit_acc_rollout_kept=0.9304
|
| 1023 |
+
step=4400 epoch=4400/5000 epoch_step=1/1 micro_steps=4400 elapsed=13.8s lr=2.000000e-03 loss=0.2299 loss_recon=0.2299 loss_meanflow=0.0000 mean_model_t=0.2094 mean_corrupt_t=0.2094 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.3509 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.9278 corrupt_frac=1.0000 acc_corrupt=0.9278 loss_corrupt=0.2299 wrong_frac=0.7904 init_acc_corrupt=0.1401 acc_corrupt_t_0p0_0p2=0.8702 corrupt_frac_t_0p0_0p2=0.5561 acc_corrupt_t_0p2_0p4=1.0000 corrupt_frac_t_0p2_0p4=0.3568 acc_corrupt_t_0p4_0p6=1.0000 corrupt_frac_t_0p4_0p6=0.0770 acc_corrupt_t_0p6_0p8=1.0000 corrupt_frac_t_0p6_0p8=0.0138 out_w_norm=12.4714 out_g_norm=0.2465 acc_corrupt_t_0p8_1p0=0.9983 corrupt_frac_t_0p8_1p0=0.0078 loss_all=0.5788 init_gold_top10=0.3953 init_gold_top100=0.5171 rollout_applied_pos_frac=0.2891 init_acc_rollout_applied=0.1687 init_acc_rollout_kept=0.1193 logit_acc_rollout_applied=0.9465 logit_acc_rollout_kept=0.8181
|
| 1024 |
+
step=4500 epoch=4500/5000 epoch_step=1/1 micro_steps=4500 elapsed=13.9s lr=2.000000e-03 loss=0.2353 loss_recon=0.2353 loss_meanflow=0.0000 mean_model_t=0.2075 mean_corrupt_t=0.2075 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.3543 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.9258 corrupt_frac=1.0000 acc_corrupt=0.9258 loss_corrupt=0.2353 wrong_frac=0.7924 init_acc_corrupt=0.1377 acc_corrupt_t_0p0_0p2=0.8669 corrupt_frac_t_0p0_0p2=0.5574 acc_corrupt_t_0p2_0p4=0.9999 corrupt_frac_t_0p2_0p4=0.3599 acc_corrupt_t_0p4_0p6=1.0000 corrupt_frac_t_0p4_0p6=0.0752 acc_corrupt_t_0p6_0p8=0.9999 corrupt_frac_t_0p6_0p8=0.0118 out_w_norm=12.4896 out_g_norm=0.2453 acc_corrupt_t_0p8_1p0=1.0000 corrupt_frac_t_0p8_1p0=0.0078 loss_all=0.3040 init_gold_top10=0.4599 init_gold_top100=0.5682 rollout_applied_pos_frac=0.3594 init_acc_rollout_applied=0.1761 init_acc_rollout_kept=0.1267 logit_acc_rollout_applied=0.8716 logit_acc_rollout_kept=0.9091
|
LTA_openwebtext_dualt/logs/softendpoint_mn_pilot_4gpu/train8_n1024_compactv2664_3l_bs512_hard_ce_allcorrupt.log
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
LTA_openwebtext_dualt/logs/softendpoint_mn_pilot_4gpu/train8_n1024_compactv2664_3l_bs512_hard_ce_onehot.log
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
LTA_openwebtext_dualt/logs/softendpoint_mn_pilot_4gpu/train8_n512_compactv1635_3l_bs512_hard_ce_allcorrupt.log
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
LTA_openwebtext_dualt/logs/softendpoint_mn_pilot_4gpu/train8_n64_compactv335_3l_bs512_hard_ce_allcorrupt.log
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
LTA_openwebtext_dualt/logs/softendpoint_mn_pilot_4gpu/train8_n8_compactv47_3l_hard_ce_onehot.log
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
LTA_openwebtext_dualt/logs/softendpoint_mn_pilot_4gpu/train8_n8_linear_soft_kl_onehot_20260517_train8ctx8_overfit.log
ADDED
|
@@ -0,0 +1,326 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
NCCL version 2.25.1+cuda12.8
|
| 2 |
+
{
|
| 3 |
+
"device": "cuda:0",
|
| 4 |
+
"rank": 0,
|
| 5 |
+
"world_size": 4,
|
| 6 |
+
"samples": "owt_cached_chunks:8",
|
| 7 |
+
"vocab_size": 50257,
|
| 8 |
+
"tokenizer_vocab_size": 50257,
|
| 9 |
+
"save_dir": "runs/train8_n8_linear_soft_kl_onehot_20260517_train8ctx8_overfit",
|
| 10 |
+
"batch_size": 1,
|
| 11 |
+
"grad_accum": 1,
|
| 12 |
+
"effective_batch_size": 4,
|
| 13 |
+
"global_batch_size": 4,
|
| 14 |
+
"lr_schedule": "constant_warmup",
|
| 15 |
+
"optimizer": "muon",
|
| 16 |
+
"epochs": 0.0,
|
| 17 |
+
"steps_per_epoch": 2,
|
| 18 |
+
"total_steps": 500,
|
| 19 |
+
"warmup_steps": 10,
|
| 20 |
+
"warmup_epochs": -1.0,
|
| 21 |
+
"min_lr": 0.0,
|
| 22 |
+
"weight_decay": 0.1,
|
| 23 |
+
"output_weight_decay": -1.0,
|
| 24 |
+
"adamw_param_groups": "nanogpt",
|
| 25 |
+
"adam_beta1": 0.9,
|
| 26 |
+
"adam_beta2": 0.95,
|
| 27 |
+
"adam_eps": 1e-08,
|
| 28 |
+
"muon_impl": "legacy",
|
| 29 |
+
"muon_momentum": 0.95,
|
| 30 |
+
"muon_ns_steps": 5,
|
| 31 |
+
"muon_update_scale": 1.0,
|
| 32 |
+
"muon_nesterov": false,
|
| 33 |
+
"muon_width_scale": false,
|
| 34 |
+
"muon_grouping": "legacy_dim_ge_2",
|
| 35 |
+
"muon_param_count": 169453056,
|
| 36 |
+
"muon_adam_param_count": 122368,
|
| 37 |
+
"muon_param_names": [
|
| 38 |
+
"vocab_embed.embedding",
|
| 39 |
+
"sigma_map.net.0.weight",
|
| 40 |
+
"sigma_map.net.2.weight",
|
| 41 |
+
"blocks.0.attn_qkv.weight",
|
| 42 |
+
"blocks.0.attn_out.weight",
|
| 43 |
+
"blocks.0.mlp.0.weight",
|
| 44 |
+
"blocks.0.mlp.2.weight",
|
| 45 |
+
"blocks.0.adaLN_modulation.weight",
|
| 46 |
+
"blocks.1.attn_qkv.weight",
|
| 47 |
+
"blocks.1.attn_out.weight",
|
| 48 |
+
"blocks.1.mlp.0.weight",
|
| 49 |
+
"blocks.1.mlp.2.weight",
|
| 50 |
+
"blocks.1.adaLN_modulation.weight",
|
| 51 |
+
"blocks.2.attn_qkv.weight",
|
| 52 |
+
"blocks.2.attn_out.weight",
|
| 53 |
+
"blocks.2.mlp.0.weight",
|
| 54 |
+
"blocks.2.mlp.2.weight",
|
| 55 |
+
"blocks.2.adaLN_modulation.weight",
|
| 56 |
+
"blocks.3.attn_qkv.weight",
|
| 57 |
+
"blocks.3.attn_out.weight",
|
| 58 |
+
"blocks.3.mlp.0.weight",
|
| 59 |
+
"blocks.3.mlp.2.weight",
|
| 60 |
+
"blocks.3.adaLN_modulation.weight",
|
| 61 |
+
"blocks.4.attn_qkv.weight",
|
| 62 |
+
"blocks.4.attn_out.weight",
|
| 63 |
+
"blocks.4.mlp.0.weight",
|
| 64 |
+
"blocks.4.mlp.2.weight",
|
| 65 |
+
"blocks.4.adaLN_modulation.weight",
|
| 66 |
+
"blocks.5.attn_qkv.weight",
|
| 67 |
+
"blocks.5.attn_out.weight",
|
| 68 |
+
"blocks.5.mlp.0.weight",
|
| 69 |
+
"blocks.5.mlp.2.weight",
|
| 70 |
+
"blocks.5.adaLN_modulation.weight",
|
| 71 |
+
"blocks.6.attn_qkv.weight",
|
| 72 |
+
"blocks.6.attn_out.weight",
|
| 73 |
+
"blocks.6.mlp.0.weight",
|
| 74 |
+
"blocks.6.mlp.2.weight",
|
| 75 |
+
"blocks.6.adaLN_modulation.weight",
|
| 76 |
+
"blocks.7.attn_qkv.weight",
|
| 77 |
+
"blocks.7.attn_out.weight",
|
| 78 |
+
"blocks.7.mlp.0.weight",
|
| 79 |
+
"blocks.7.mlp.2.weight",
|
| 80 |
+
"blocks.7.adaLN_modulation.weight",
|
| 81 |
+
"blocks.8.attn_qkv.weight",
|
| 82 |
+
"blocks.8.attn_out.weight",
|
| 83 |
+
"blocks.8.mlp.0.weight",
|
| 84 |
+
"blocks.8.mlp.2.weight",
|
| 85 |
+
"blocks.8.adaLN_modulation.weight",
|
| 86 |
+
"blocks.9.attn_qkv.weight",
|
| 87 |
+
"blocks.9.attn_out.weight",
|
| 88 |
+
"blocks.9.mlp.0.weight",
|
| 89 |
+
"blocks.9.mlp.2.weight",
|
| 90 |
+
"blocks.9.adaLN_modulation.weight",
|
| 91 |
+
"blocks.10.attn_qkv.weight",
|
| 92 |
+
"blocks.10.attn_out.weight",
|
| 93 |
+
"blocks.10.mlp.0.weight",
|
| 94 |
+
"blocks.10.mlp.2.weight",
|
| 95 |
+
"blocks.10.adaLN_modulation.weight",
|
| 96 |
+
"blocks.11.attn_qkv.weight",
|
| 97 |
+
"blocks.11.attn_out.weight",
|
| 98 |
+
"blocks.11.mlp.0.weight",
|
| 99 |
+
"blocks.11.mlp.2.weight",
|
| 100 |
+
"blocks.11.adaLN_modulation.weight",
|
| 101 |
+
"output_layer.linear.weight",
|
| 102 |
+
"output_layer.adaLN_modulation.weight"
|
| 103 |
+
],
|
| 104 |
+
"muon_adam_param_names": [
|
| 105 |
+
"sigma_map.net.0.bias",
|
| 106 |
+
"sigma_map.net.2.bias",
|
| 107 |
+
"blocks.0.norm1.weight",
|
| 108 |
+
"blocks.0.norm2.weight",
|
| 109 |
+
"blocks.0.mlp.0.bias",
|
| 110 |
+
"blocks.0.mlp.2.bias",
|
| 111 |
+
"blocks.0.adaLN_modulation.bias",
|
| 112 |
+
"blocks.1.norm1.weight",
|
| 113 |
+
"blocks.1.norm2.weight",
|
| 114 |
+
"blocks.1.mlp.0.bias",
|
| 115 |
+
"blocks.1.mlp.2.bias",
|
| 116 |
+
"blocks.1.adaLN_modulation.bias",
|
| 117 |
+
"blocks.2.norm1.weight",
|
| 118 |
+
"blocks.2.norm2.weight",
|
| 119 |
+
"blocks.2.mlp.0.bias",
|
| 120 |
+
"blocks.2.mlp.2.bias",
|
| 121 |
+
"blocks.2.adaLN_modulation.bias",
|
| 122 |
+
"blocks.3.norm1.weight",
|
| 123 |
+
"blocks.3.norm2.weight",
|
| 124 |
+
"blocks.3.mlp.0.bias",
|
| 125 |
+
"blocks.3.mlp.2.bias",
|
| 126 |
+
"blocks.3.adaLN_modulation.bias",
|
| 127 |
+
"blocks.4.norm1.weight",
|
| 128 |
+
"blocks.4.norm2.weight",
|
| 129 |
+
"blocks.4.mlp.0.bias",
|
| 130 |
+
"blocks.4.mlp.2.bias",
|
| 131 |
+
"blocks.4.adaLN_modulation.bias",
|
| 132 |
+
"blocks.5.norm1.weight",
|
| 133 |
+
"blocks.5.norm2.weight",
|
| 134 |
+
"blocks.5.mlp.0.bias",
|
| 135 |
+
"blocks.5.mlp.2.bias",
|
| 136 |
+
"blocks.5.adaLN_modulation.bias",
|
| 137 |
+
"blocks.6.norm1.weight",
|
| 138 |
+
"blocks.6.norm2.weight",
|
| 139 |
+
"blocks.6.mlp.0.bias",
|
| 140 |
+
"blocks.6.mlp.2.bias",
|
| 141 |
+
"blocks.6.adaLN_modulation.bias",
|
| 142 |
+
"blocks.7.norm1.weight",
|
| 143 |
+
"blocks.7.norm2.weight",
|
| 144 |
+
"blocks.7.mlp.0.bias",
|
| 145 |
+
"blocks.7.mlp.2.bias",
|
| 146 |
+
"blocks.7.adaLN_modulation.bias",
|
| 147 |
+
"blocks.8.norm1.weight",
|
| 148 |
+
"blocks.8.norm2.weight",
|
| 149 |
+
"blocks.8.mlp.0.bias",
|
| 150 |
+
"blocks.8.mlp.2.bias",
|
| 151 |
+
"blocks.8.adaLN_modulation.bias",
|
| 152 |
+
"blocks.9.norm1.weight",
|
| 153 |
+
"blocks.9.norm2.weight",
|
| 154 |
+
"blocks.9.mlp.0.bias",
|
| 155 |
+
"blocks.9.mlp.2.bias",
|
| 156 |
+
"blocks.9.adaLN_modulation.bias",
|
| 157 |
+
"blocks.10.norm1.weight",
|
| 158 |
+
"blocks.10.norm2.weight",
|
| 159 |
+
"blocks.10.mlp.0.bias",
|
| 160 |
+
"blocks.10.mlp.2.bias",
|
| 161 |
+
"blocks.10.adaLN_modulation.bias",
|
| 162 |
+
"blocks.11.norm1.weight",
|
| 163 |
+
"blocks.11.norm2.weight",
|
| 164 |
+
"blocks.11.mlp.0.bias",
|
| 165 |
+
"blocks.11.mlp.2.bias",
|
| 166 |
+
"blocks.11.adaLN_modulation.bias",
|
| 167 |
+
"output_layer.norm_final.weight",
|
| 168 |
+
"output_layer.adaLN_modulation.bias"
|
| 169 |
+
],
|
| 170 |
+
"muon_effective_nesterov": false,
|
| 171 |
+
"muon_effective_width_scale": false,
|
| 172 |
+
"muon_effective_weight_decay": 0.1,
|
| 173 |
+
"muon_adam_fallback_nesterov": false,
|
| 174 |
+
"muon_adam_fallback_weight_decay": 0.1,
|
| 175 |
+
"ema_decay": 0.9999,
|
| 176 |
+
"ema_start_step": 0,
|
| 177 |
+
"model_type": "ddit",
|
| 178 |
+
"elf_num_time_tokens": 4,
|
| 179 |
+
"elf_num_model_mode_tokens": 0,
|
| 180 |
+
"qk_norm": true,
|
| 181 |
+
"output_bias": false,
|
| 182 |
+
"output_init_std": -1.0,
|
| 183 |
+
"norm_type": "rmsnorm",
|
| 184 |
+
"target_loss": "linear_soft_kl",
|
| 185 |
+
"linear_soft_target_power": 1.0,
|
| 186 |
+
"linear_soft_target_min_conf": 0.0,
|
| 187 |
+
"linear_soft_target_max_conf": 1.0,
|
| 188 |
+
"t_sampling_mode": "logit_normal",
|
| 189 |
+
"t_sampling_power": 1.0,
|
| 190 |
+
"t_sampling_eps": 0.0001,
|
| 191 |
+
"t_sampling_logit_mean": -1.5,
|
| 192 |
+
"t_sampling_logit_std": 0.8,
|
| 193 |
+
"dual_t": true,
|
| 194 |
+
"corrupt_t_mode": "same",
|
| 195 |
+
"corrupt_min_t": 0.0,
|
| 196 |
+
"corrupt_max_t": 1.0,
|
| 197 |
+
"prefix_block_prob": 0.0,
|
| 198 |
+
"prefix_block_len": 128,
|
| 199 |
+
"mask_ratio_floor_schedule": "none",
|
| 200 |
+
"dirichlet_endpoint_mode": "categorical_dual_t",
|
| 201 |
+
"dirichlet_semantic_t_mode": "same",
|
| 202 |
+
"dirichlet_semantic_t_value": 0.0,
|
| 203 |
+
"dirichlet_semantic_t_curve": "linear",
|
| 204 |
+
"dirichlet_semantic_t_power": 1.0,
|
| 205 |
+
"endpoint_sequence_random_prob_alpha": 0.0,
|
| 206 |
+
"categorical_wrong_from_full_vocab": true,
|
| 207 |
+
"categorical_wrong_from_batch_valid_tokens": false,
|
| 208 |
+
"categorical_wrong_basin_token_ids": "",
|
| 209 |
+
"categorical_wrong_basin_prob": 0.0,
|
| 210 |
+
"categorical_wrong_unigram_prob": 0.0,
|
| 211 |
+
"categorical_wrong_uniform_prob": 0.0,
|
| 212 |
+
"categorical_wrong_corpus_unigram_path": "",
|
| 213 |
+
"categorical_wrong_corpus_unigram_alpha": 1.0,
|
| 214 |
+
"categorical_wrong_basin_shared_prob": 0.0,
|
| 215 |
+
"categorical_wrong_unigram_shared_prob": 0.0,
|
| 216 |
+
"mask_mixture_original_prob": 0.0,
|
| 217 |
+
"mask_mixture_lowk_prob": 1.0,
|
| 218 |
+
"mask_mixture_lowcorrupt_prob": 0.0,
|
| 219 |
+
"mask_mixture_block_prob": 0.0,
|
| 220 |
+
"mask_mixture_all_prob": 0.0,
|
| 221 |
+
"mask_mixture_lowk_clean_tokens": "1,2,4",
|
| 222 |
+
"mask_mixture_lowcorrupt_tokens": "1,2,4,8,16,32,64",
|
| 223 |
+
"mask_mixture_block_tokens": "64,128",
|
| 224 |
+
"simplex_bridge_sampler": "dirichlet",
|
| 225 |
+
"logistic_normal_sigma_min": 0.18,
|
| 226 |
+
"logistic_normal_sigma_max": 2.2,
|
| 227 |
+
"logistic_normal_tau_min": 0.65,
|
| 228 |
+
"logistic_normal_tau_max": 1.15,
|
| 229 |
+
"torch_compile": false,
|
| 230 |
+
"compile_mode": "max-autotune",
|
| 231 |
+
"state_format": "prob",
|
| 232 |
+
"meanflow_weight": 0.0,
|
| 233 |
+
"rollout_train_prob": 0.0,
|
| 234 |
+
"rollout_train_steps": 1,
|
| 235 |
+
"rollout_train_infer_steps": 64,
|
| 236 |
+
"rollout_train_temp": 1.45,
|
| 237 |
+
"rollout_train_max_gamma": 1.0,
|
| 238 |
+
"rollout_train_corrupt_only": true,
|
| 239 |
+
"rollout_train_samplewise": false,
|
| 240 |
+
"rollout_train_compute_always": false,
|
| 241 |
+
"bridge_noise_init": "logistic_normal",
|
| 242 |
+
"noise_sigma": -1.0,
|
| 243 |
+
"allow_tf32": true,
|
| 244 |
+
"activation_checkpointing": false,
|
| 245 |
+
"activation_checkpoint_interval": 1,
|
| 246 |
+
"activation_checkpoint_scope": "block",
|
| 247 |
+
"ddp_static_graph": false,
|
| 248 |
+
"ddp_gradient_as_bucket_view": true,
|
| 249 |
+
"blocking_data_transfer": false,
|
| 250 |
+
"dataloader_prefetch_factor": 4,
|
| 251 |
+
"full_train_stats": false,
|
| 252 |
+
"tokenized_hf": false,
|
| 253 |
+
"tokenized_pad_token": "pad",
|
| 254 |
+
"elf_conditional_hf": false,
|
| 255 |
+
"record_pad_truncate": false,
|
| 256 |
+
"record_add_eos": false,
|
| 257 |
+
"record_add_special_tokens": false,
|
| 258 |
+
"record_pad_token": "pad",
|
| 259 |
+
"record_shuffle_buffer": 10000,
|
| 260 |
+
"wrap": true,
|
| 261 |
+
"wrap_mode": "stream",
|
| 262 |
+
"wrap_record_buffer_size": 200,
|
| 263 |
+
"owt_cached_chunks": true,
|
| 264 |
+
"owt_chunk_cache_dir": "/e2e-data/evad-tech-vla/wanghan58/data/small_benchmarks/langflow_2604_11748/openwebtext_lta_cached_chunks/gpt2_len8_train8_overfit",
|
| 265 |
+
"owt_chunk_cache_rebuild": false,
|
| 266 |
+
"owt_chunk_cache_write_batch": 4096,
|
| 267 |
+
"owt_exact_repeat_per_chunk": 0,
|
| 268 |
+
"online_chunk_shuffle": false,
|
| 269 |
+
"online_chunk_shuffle_buffer": 10000,
|
| 270 |
+
"openwebtext_split": "train_minus_100k",
|
| 271 |
+
"detokenizer": "auto",
|
| 272 |
+
"resolved_detokenizer": null,
|
| 273 |
+
"num_workers": 0,
|
| 274 |
+
"latest_every": 10,
|
| 275 |
+
"resume_path": ""
|
| 276 |
+
}
|
| 277 |
+
step=10 epoch=5/250 epoch_step=2/2 micro_steps=10 elapsed=2.1s lr=2.000000e-03 loss=1.4378 loss_recon=1.4378 loss_meanflow=0.0000 mean_model_t=0.1662 mean_corrupt_t=0.1662 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.1662 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.2500 corrupt_frac=0.7125 acc_corrupt=0.1228 loss_corrupt=2.1079 wrong_frac=0.7895 init_acc_corrupt=0.0877 acc_corrupt_t_0p0_0p2=0.0513 corrupt_frac_t_0p0_0p2=1.0000 out_w_norm=0.0168 out_g_norm=1.4755 acc_corrupt_t_0p2_0p4=0.2778 corrupt_frac_t_0p2_0p4=1.0000 loss_all=10.7422 init_gold_top10=0.5000 init_gold_top100=0.5000
|
| 278 |
+
step=20 epoch=10/250 epoch_step=2/2 micro_steps=20 elapsed=5.4s lr=2.000000e-03 loss=1.6261 loss_recon=1.6261 loss_meanflow=0.0000 mean_model_t=0.1939 mean_corrupt_t=0.1939 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.1939 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.3000 corrupt_frac=0.6375 acc_corrupt=0.1569 loss_corrupt=2.2662 wrong_frac=0.8824 init_acc_corrupt=0.0588 acc_corrupt_t_0p0_0p2=0.1538 corrupt_frac_t_0p0_0p2=1.0000 out_w_norm=0.1012 out_g_norm=1.7398 acc_corrupt_t_0p2_0p4=0.1579 corrupt_frac_t_0p2_0p4=1.0000 acc_corrupt_t_0p4_0p6=0.1667 corrupt_frac_t_0p4_0p6=1.0000 loss_all=10.6484 init_gold_top10=0.0000 init_gold_top100=0.0000
|
| 279 |
+
step=30 epoch=15/250 epoch_step=2/2 micro_steps=30 elapsed=5.0s lr=2.000000e-03 loss=1.4998 loss_recon=1.4998 loss_meanflow=0.0000 mean_model_t=0.1869 mean_corrupt_t=0.1869 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.1869 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.1875 corrupt_frac=0.7125 acc_corrupt=0.1579 loss_corrupt=2.0982 wrong_frac=0.8772 init_acc_corrupt=0.0175 acc_corrupt_t_0p0_0p2=0.1379 corrupt_frac_t_0p0_0p2=1.0000 out_w_norm=0.2113 out_g_norm=1.9543 acc_corrupt_t_0p2_0p4=0.1786 corrupt_frac_t_0p2_0p4=1.0000 loss_all=10.3594 init_gold_top10=0.0000 init_gold_top100=0.0000
|
| 280 |
+
step=40 epoch=20/250 epoch_step=2/2 micro_steps=40 elapsed=4.8s lr=2.000000e-03 loss=2.1429 loss_recon=2.1429 loss_meanflow=0.0000 mean_model_t=0.2433 mean_corrupt_t=0.2433 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.2433 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.2750 corrupt_frac=0.6625 acc_corrupt=0.1887 loss_corrupt=2.8585 wrong_frac=0.7925 init_acc_corrupt=0.1509 acc_corrupt_t_0p0_0p2=0.2222 corrupt_frac_t_0p0_0p2=1.0000 out_w_norm=0.3226 out_g_norm=1.9545 acc_corrupt_t_0p2_0p4=0.1724 corrupt_frac_t_0p2_0p4=1.0000 acc_corrupt_t_0p6_0p8=0.1667 corrupt_frac_t_0p6_0p8=1.0000 loss_all=10.1250 init_gold_top10=0.0000 init_gold_top100=0.0000
|
| 281 |
+
step=50 epoch=25/250 epoch_step=2/2 micro_steps=50 elapsed=5.3s lr=2.000000e-03 loss=2.0644 loss_recon=2.0644 loss_meanflow=0.0000 mean_model_t=0.2392 mean_corrupt_t=0.2392 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.2392 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.2500 corrupt_frac=0.7875 acc_corrupt=0.1111 loss_corrupt=3.0162 wrong_frac=0.7778 init_acc_corrupt=0.1111 acc_corrupt_t_0p0_0p2=0.0833 corrupt_frac_t_0p0_0p2=1.0000 out_w_norm=0.4197 out_g_norm=1.8317 acc_corrupt_t_0p4_0p6=0.1429 corrupt_frac_t_0p4_0p6=1.0000 acc_corrupt_t_0p2_0p4=0.1200 corrupt_frac_t_0p2_0p4=1.0000 loss_all=10.3516 init_gold_top10=0.2857 init_gold_top100=0.4286
|
| 282 |
+
step=60 epoch=30/250 epoch_step=2/2 micro_steps=60 elapsed=3.9s lr=2.000000e-03 loss=0.6981 loss_recon=0.6981 loss_meanflow=0.0000 mean_model_t=0.0956 mean_corrupt_t=0.0956 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0956 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.3125 corrupt_frac=0.7125 acc_corrupt=0.1404 loss_corrupt=1.7183 wrong_frac=0.8596 init_acc_corrupt=0.0175 acc_corrupt_t_0p0_0p2=0.1400 corrupt_frac_t_0p0_0p2=1.0000 out_w_norm=0.5080 out_g_norm=1.7802 acc_corrupt_t_0p2_0p4=0.1429 corrupt_frac_t_0p2_0p4=1.0000 loss_all=9.8359 init_gold_top10=0.0000 init_gold_top100=0.1667
|
| 283 |
+
step=70 epoch=35/250 epoch_step=2/2 micro_steps=70 elapsed=5.3s lr=2.000000e-03 loss=1.2525 loss_recon=1.2525 loss_meanflow=0.0000 mean_model_t=0.1739 mean_corrupt_t=0.1739 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.1739 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.4500 corrupt_frac=0.6875 acc_corrupt=0.2909 loss_corrupt=2.1904 wrong_frac=0.8000 init_acc_corrupt=0.1091 acc_corrupt_t_0p0_0p2=0.2051 corrupt_frac_t_0p0_0p2=1.0000 out_w_norm=0.5936 out_g_norm=2.0018 acc_corrupt_t_0p2_0p4=0.5000 corrupt_frac_t_0p2_0p4=1.0000 loss_all=9.6328 init_gold_top10=0.1667 init_gold_top100=0.1667
|
| 284 |
+
step=80 epoch=40/250 epoch_step=2/2 micro_steps=80 elapsed=4.3s lr=2.000000e-03 loss=1.0250 loss_recon=1.0250 loss_meanflow=0.0000 mean_model_t=0.1471 mean_corrupt_t=0.1471 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.1471 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.3375 corrupt_frac=0.7250 acc_corrupt=0.2414 loss_corrupt=1.5685 wrong_frac=0.8103 init_acc_corrupt=0.0517 acc_corrupt_t_0p0_0p2=0.2308 corrupt_frac_t_0p0_0p2=1.0000 out_w_norm=0.6759 out_g_norm=1.6317 acc_corrupt_t_0p2_0p4=0.3333 corrupt_frac_t_0p2_0p4=1.0000 loss_all=8.8125 init_gold_top10=0.2500 init_gold_top100=0.2500
|
| 285 |
+
step=90 epoch=45/250 epoch_step=2/2 micro_steps=90 elapsed=4.3s lr=2.000000e-03 loss=2.0293 loss_recon=2.0293 loss_meanflow=0.0000 mean_model_t=0.2610 mean_corrupt_t=0.2610 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.2610 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.4125 corrupt_frac=0.7125 acc_corrupt=0.2281 loss_corrupt=2.8313 wrong_frac=0.7719 init_acc_corrupt=0.2105 acc_corrupt_t_0p4_0p6=0.3333 corrupt_frac_t_0p4_0p6=1.0000 out_w_norm=0.7576 out_g_norm=2.3043 acc_corrupt_t_0p2_0p4=0.2083 corrupt_frac_t_0p2_0p4=1.0000 acc_corrupt_t_0p0_0p2=0.1905 corrupt_frac_t_0p0_0p2=1.0000 loss_all=9.1992 init_gold_top10=0.1667 init_gold_top100=0.1667
|
| 286 |
+
step=100 epoch=50/250 epoch_step=2/2 micro_steps=100 elapsed=5.3s lr=2.000000e-03 loss=1.3626 loss_recon=1.3626 loss_meanflow=0.0000 mean_model_t=0.2061 mean_corrupt_t=0.2061 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.2061 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.4375 corrupt_frac=0.7000 acc_corrupt=0.2321 loss_corrupt=1.9125 wrong_frac=0.8393 init_acc_corrupt=0.0893 acc_corrupt_t_0p0_0p2=0.2059 corrupt_frac_t_0p0_0p2=1.0000 out_w_norm=0.8440 out_g_norm=2.1251 acc_corrupt_t_0p2_0p4=0.2727 corrupt_frac_t_0p2_0p4=1.0000 loss_all=8.1016 init_gold_top10=0.2500 init_gold_top100=0.2500
|
| 287 |
+
step=110 epoch=55/250 epoch_step=2/2 micro_steps=110 elapsed=4.3s lr=2.000000e-03 loss=1.2112 loss_recon=1.2112 loss_meanflow=0.0000 mean_model_t=0.1964 mean_corrupt_t=0.1964 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.1964 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.5375 corrupt_frac=0.6625 acc_corrupt=0.3208 loss_corrupt=2.1580 wrong_frac=0.6981 init_acc_corrupt=0.1509 acc_corrupt_t_0p2_0p4=0.3182 corrupt_frac_t_0p2_0p4=1.0000 out_w_norm=0.9333 out_g_norm=2.1929 acc_corrupt_t_0p0_0p2=0.2000 corrupt_frac_t_0p0_0p2=1.0000 acc_corrupt_t_0p4_0p6=0.8333 corrupt_frac_t_0p4_0p6=1.0000 loss_all=7.6875 init_gold_top10=0.2857 init_gold_top100=0.2857
|
| 288 |
+
step=120 epoch=60/250 epoch_step=2/2 micro_steps=120 elapsed=3.9s lr=2.000000e-03 loss=1.3013 loss_recon=1.3013 loss_meanflow=0.0000 mean_model_t=0.2019 mean_corrupt_t=0.2019 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.2019 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.4750 corrupt_frac=0.7500 acc_corrupt=0.3000 loss_corrupt=1.7847 wrong_frac=0.7500 init_acc_corrupt=0.2167 acc_corrupt_t_0p0_0p2=0.1786 corrupt_frac_t_0p0_0p2=1.0000 out_w_norm=1.0233 out_g_norm=1.7968 acc_corrupt_t_0p2_0p4=0.3077 corrupt_frac_t_0p2_0p4=1.0000 acc_corrupt_t_0p4_0p6=0.8333 corrupt_frac_t_0p4_0p6=1.0000 loss_all=6.5352 init_gold_top10=0.6667 init_gold_top100=0.6667
|
| 289 |
+
step=130 epoch=65/250 epoch_step=2/2 micro_steps=130 elapsed=5.0s lr=2.000000e-03 loss=0.9233 loss_recon=0.9233 loss_meanflow=0.0000 mean_model_t=0.1694 mean_corrupt_t=0.1694 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.1694 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.4125 corrupt_frac=0.7125 acc_corrupt=0.2105 loss_corrupt=1.8001 wrong_frac=0.7895 init_acc_corrupt=0.0175 acc_corrupt_t_0p0_0p2=0.1944 corrupt_frac_t_0p0_0p2=1.0000 out_w_norm=1.1115 out_g_norm=2.2608 acc_corrupt_t_0p2_0p4=0.2381 corrupt_frac_t_0p2_0p4=1.0000 loss_all=7.4980 init_gold_top10=0.3333 init_gold_top100=0.3333
|
| 290 |
+
step=140 epoch=70/250 epoch_step=2/2 micro_steps=140 elapsed=4.3s lr=2.000000e-03 loss=1.2486 loss_recon=1.2486 loss_meanflow=0.0000 mean_model_t=0.2216 mean_corrupt_t=0.2216 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.2216 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.4000 corrupt_frac=0.8000 acc_corrupt=0.2500 loss_corrupt=1.7026 wrong_frac=0.7969 init_acc_corrupt=0.1094 acc_corrupt_t_0p2_0p4=0.1579 corrupt_frac_t_0p2_0p4=1.0000 out_w_norm=1.2048 out_g_norm=2.0386 acc_corrupt_t_0p4_0p6=0.7143 corrupt_frac_t_0p4_0p6=1.0000 acc_corrupt_t_0p0_0p2=0.2105 corrupt_frac_t_0p0_0p2=1.0000 loss_all=5.8379 init_gold_top10=0.3333 init_gold_top100=0.3333
|
| 291 |
+
step=150 epoch=75/250 epoch_step=2/2 micro_steps=150 elapsed=3.9s lr=2.000000e-03 loss=0.9679 loss_recon=0.9679 loss_meanflow=0.0000 mean_model_t=0.2049 mean_corrupt_t=0.2049 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.2049 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.5250 corrupt_frac=0.6875 acc_corrupt=0.3091 loss_corrupt=1.9291 wrong_frac=0.8727 init_acc_corrupt=0.0727 acc_corrupt_t_0p4_0p6=0.3333 corrupt_frac_t_0p4_0p6=1.0000 out_w_norm=1.3014 out_g_norm=2.1012 acc_corrupt_t_0p2_0p4=0.4286 corrupt_frac_t_0p2_0p4=1.0000 acc_corrupt_t_0p0_0p2=0.2143 corrupt_frac_t_0p0_0p2=1.0000 loss_all=7.2578 init_gold_top10=0.0000 init_gold_top100=0.2857
|
| 292 |
+
step=160 epoch=80/250 epoch_step=2/2 micro_steps=160 elapsed=4.4s lr=2.000000e-03 loss=0.9315 loss_recon=0.9315 loss_meanflow=0.0000 mean_model_t=0.2270 mean_corrupt_t=0.2270 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.2270 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.5625 corrupt_frac=0.7125 acc_corrupt=0.3860 loss_corrupt=1.5416 wrong_frac=0.7018 init_acc_corrupt=0.1754 acc_corrupt_t_0p0_0p2=0.3750 corrupt_frac_t_0p0_0p2=1.0000 out_w_norm=1.3993 out_g_norm=2.1721 acc_corrupt_t_0p2_0p4=0.2632 corrupt_frac_t_0p2_0p4=1.0000 acc_corrupt_t_0p6_0p8=0.8333 corrupt_frac_t_0p6_0p8=1.0000 loss_all=5.0039 init_gold_top10=0.1667 init_gold_top100=0.1667
|
| 293 |
+
step=170 epoch=85/250 epoch_step=2/2 micro_steps=170 elapsed=4.3s lr=2.000000e-03 loss=0.9990 loss_recon=0.9990 loss_meanflow=0.0000 mean_model_t=0.2447 mean_corrupt_t=0.2447 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.2447 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.5875 corrupt_frac=0.6500 acc_corrupt=0.3654 loss_corrupt=1.2837 wrong_frac=0.8654 init_acc_corrupt=0.0962 acc_corrupt_t_0p2_0p4=0.3793 corrupt_frac_t_0p2_0p4=1.0000 out_w_norm=1.4993 out_g_norm=2.1622 acc_corrupt_t_0p4_0p6=0.5714 corrupt_frac_t_0p4_0p6=1.0000 acc_corrupt_t_0p0_0p2=0.2500 corrupt_frac_t_0p0_0p2=1.0000 loss_all=2.9458 init_gold_top10=0.0000 init_gold_top100=0.2500
|
| 294 |
+
step=180 epoch=90/250 epoch_step=2/2 micro_steps=180 elapsed=3.9s lr=2.000000e-03 loss=0.7330 loss_recon=0.7330 loss_meanflow=0.0000 mean_model_t=0.2271 mean_corrupt_t=0.2271 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.2271 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.6000 corrupt_frac=0.6375 acc_corrupt=0.3725 loss_corrupt=1.1424 wrong_frac=0.7843 init_acc_corrupt=0.1569 acc_corrupt_t_0p2_0p4=0.4444 corrupt_frac_t_0p2_0p4=1.0000 out_w_norm=1.5973 out_g_norm=2.0392 acc_corrupt_t_0p0_0p2=0.2963 corrupt_frac_t_0p0_0p2=1.0000 acc_corrupt_t_0p4_0p6=0.5000 corrupt_frac_t_0p4_0p6=1.0000 loss_all=4.4272 init_gold_top10=0.2500 init_gold_top100=0.2500
|
| 295 |
+
step=190 epoch=95/250 epoch_step=2/2 micro_steps=190 elapsed=4.4s lr=2.000000e-03 loss=0.6392 loss_recon=0.6392 loss_meanflow=0.0000 mean_model_t=0.1793 mean_corrupt_t=0.1793 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.1793 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.4500 corrupt_frac=0.7125 acc_corrupt=0.2456 loss_corrupt=1.4030 wrong_frac=0.8070 init_acc_corrupt=0.0702 acc_corrupt_t_0p0_0p2=0.1429 corrupt_frac_t_0p0_0p2=1.0000 out_w_norm=1.6917 out_g_norm=2.2318 acc_corrupt_t_0p2_0p4=0.3636 corrupt_frac_t_0p2_0p4=1.0000 acc_corrupt_t_0p4_0p6=1.0000 corrupt_frac_t_0p4_0p6=1.0000 loss_all=6.1974 init_gold_top10=0.5000 init_gold_top100=0.5000
|
| 296 |
+
step=200 epoch=100/250 epoch_step=2/2 micro_steps=200 elapsed=4.3s lr=2.000000e-03 loss=0.6127 loss_recon=0.6127 loss_meanflow=0.0000 mean_model_t=0.1869 mean_corrupt_t=0.1869 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.1869 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.5125 corrupt_frac=0.7625 acc_corrupt=0.3607 loss_corrupt=0.9250 wrong_frac=0.8033 init_acc_corrupt=0.1148 acc_corrupt_t_0p2_0p4=0.5000 corrupt_frac_t_0p2_0p4=1.0000 out_w_norm=1.7771 out_g_norm=2.0441 acc_corrupt_t_0p0_0p2=0.2069 corrupt_frac_t_0p0_0p2=1.0000 loss_all=2.7065 init_gold_top10=0.0000 init_gold_top100=0.2500
|
| 297 |
+
step=210 epoch=105/250 epoch_step=2/2 micro_steps=210 elapsed=3.9s lr=2.000000e-03 loss=0.3842 loss_recon=0.3842 loss_meanflow=0.0000 mean_model_t=0.1656 mean_corrupt_t=0.1656 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.1656 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.4875 corrupt_frac=0.7625 acc_corrupt=0.3279 loss_corrupt=0.8346 wrong_frac=0.8689 init_acc_corrupt=0.0656 acc_corrupt_t_0p0_0p2=0.2391 corrupt_frac_t_0p0_0p2=1.0000 out_w_norm=1.8511 out_g_norm=1.9414 acc_corrupt_t_0p2_0p4=0.5455 corrupt_frac_t_0p2_0p4=1.0000 acc_corrupt_t_0p4_0p6=0.7500 corrupt_frac_t_0p4_0p6=1.0000 loss_all=3.7437 init_gold_top10=0.0000 init_gold_top100=0.0000
|
| 298 |
+
step=220 epoch=110/250 epoch_step=2/2 micro_steps=220 elapsed=4.4s lr=2.000000e-03 loss=0.5714 loss_recon=0.5714 loss_meanflow=0.0000 mean_model_t=0.2338 mean_corrupt_t=0.2338 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.2338 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.5750 corrupt_frac=0.7125 acc_corrupt=0.4035 loss_corrupt=0.9256 wrong_frac=0.7193 init_acc_corrupt=0.1754 acc_corrupt_t_0p2_0p4=0.3871 corrupt_frac_t_0p2_0p4=1.0000 out_w_norm=1.9223 out_g_norm=2.8620 acc_corrupt_t_0p0_0p2=0.3500 corrupt_frac_t_0p0_0p2=1.0000 acc_corrupt_t_0p4_0p6=0.6667 corrupt_frac_t_0p4_0p6=1.0000 loss_all=2.8745 init_gold_top10=0.4286 init_gold_top100=0.4286
|
| 299 |
+
step=230 epoch=115/250 epoch_step=2/2 micro_steps=230 elapsed=4.3s lr=2.000000e-03 loss=0.3316 loss_recon=0.3316 loss_meanflow=0.0000 mean_model_t=0.1464 mean_corrupt_t=0.1464 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.1464 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.4875 corrupt_frac=0.6875 acc_corrupt=0.2545 loss_corrupt=0.6993 wrong_frac=0.8545 init_acc_corrupt=0.0545 acc_corrupt_t_0p0_0p2=0.2500 corrupt_frac_t_0p0_0p2=1.0000 out_w_norm=1.9838 out_g_norm=1.9833 acc_corrupt_t_0p2_0p4=0.2632 corrupt_frac_t_0p2_0p4=1.0000 loss_all=3.0620 init_gold_top10=0.3333 init_gold_top100=0.3333
|
| 300 |
+
step=240 epoch=120/250 epoch_step=2/2 micro_steps=240 elapsed=3.9s lr=2.000000e-03 loss=0.5485 loss_recon=0.5485 loss_meanflow=0.0000 mean_model_t=0.1917 mean_corrupt_t=0.1917 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.1917 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.4250 corrupt_frac=0.8125 acc_corrupt=0.2923 loss_corrupt=1.0804 wrong_frac=0.8769 init_acc_corrupt=0.0462 acc_corrupt_t_0p2_0p4=0.1579 corrupt_frac_t_0p2_0p4=1.0000 out_w_norm=2.0305 out_g_norm=2.2724 acc_corrupt_t_0p4_0p6=0.8333 corrupt_frac_t_0p4_0p6=1.0000 acc_corrupt_t_0p0_0p2=0.2750 corrupt_frac_t_0p0_0p2=1.0000 loss_all=4.9873 init_gold_top10=0.0000 init_gold_top100=0.0000
|
| 301 |
+
step=250 epoch=125/250 epoch_step=2/2 micro_steps=250 elapsed=4.3s lr=2.000000e-03 loss=0.5894 loss_recon=0.5894 loss_meanflow=0.0000 mean_model_t=0.2324 mean_corrupt_t=0.2324 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.2324 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.5000 corrupt_frac=0.7375 acc_corrupt=0.3220 loss_corrupt=1.1833 wrong_frac=0.7966 init_acc_corrupt=0.1356 acc_corrupt_t_0p2_0p4=0.3571 corrupt_frac_t_0p2_0p4=1.0000 out_w_norm=2.0676 out_g_norm=2.6213 acc_corrupt_t_0p0_0p2=0.2353 corrupt_frac_t_0p0_0p2=1.0000 loss_all=5.5378 init_gold_top10=0.0000 init_gold_top100=0.0000
|
| 302 |
+
step=260 epoch=130/250 epoch_step=2/2 micro_steps=260 elapsed=4.3s lr=2.000000e-03 loss=0.4236 loss_recon=0.4236 loss_meanflow=0.0000 mean_model_t=0.2037 mean_corrupt_t=0.2037 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.2037 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.6125 corrupt_frac=0.7000 acc_corrupt=0.4464 loss_corrupt=0.6426 wrong_frac=0.7857 init_acc_corrupt=0.0893 acc_corrupt_t_0p0_0p2=0.2500 corrupt_frac_t_0p0_0p2=1.0000 out_w_norm=2.0902 out_g_norm=2.7820 acc_corrupt_t_0p2_0p4=0.6429 corrupt_frac_t_0p2_0p4=1.0000 loss_all=1.9424 init_gold_top10=0.0000 init_gold_top100=0.2500
|
| 303 |
+
step=270 epoch=135/250 epoch_step=2/2 micro_steps=270 elapsed=3.9s lr=2.000000e-03 loss=0.4216 loss_recon=0.4216 loss_meanflow=0.0000 mean_model_t=0.2494 mean_corrupt_t=0.2494 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.2494 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.6375 corrupt_frac=0.6375 acc_corrupt=0.4314 loss_corrupt=0.6980 wrong_frac=0.8235 init_acc_corrupt=0.1373 acc_corrupt_t_0p0_0p2=0.3103 corrupt_frac_t_0p0_0p2=1.0000 out_w_norm=2.0985 out_g_norm=2.1731 acc_corrupt_t_0p2_0p4=0.6667 corrupt_frac_t_0p2_0p4=1.0000 acc_corrupt_t_0p4_0p6=0.5000 corrupt_frac_t_0p4_0p6=1.0000 loss_all=2.8916 init_gold_top10=0.3333 init_gold_top100=0.3333
|
| 304 |
+
step=280 epoch=140/250 epoch_step=2/2 micro_steps=280 elapsed=4.4s lr=2.000000e-03 loss=0.4478 loss_recon=0.4478 loss_meanflow=0.0000 mean_model_t=0.2913 mean_corrupt_t=0.2913 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.2913 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.6750 corrupt_frac=0.6125 acc_corrupt=0.4694 loss_corrupt=0.6763 wrong_frac=0.7143 init_acc_corrupt=0.2041 acc_corrupt_t_0p0_0p2=0.3600 corrupt_frac_t_0p0_0p2=1.0000 out_w_norm=2.1126 out_g_norm=2.5709 acc_corrupt_t_0p6_0p8=0.7500 corrupt_frac_t_0p6_0p8=1.0000 acc_corrupt_t_0p2_0p4=0.5833 corrupt_frac_t_0p2_0p4=1.0000 acc_corrupt_t_0p4_0p6=0.5000 corrupt_frac_t_0p4_0p6=1.0000 loss_all=2.2793 init_gold_top10=0.0000 init_gold_top100=0.0000
|
| 305 |
+
step=290 epoch=145/250 epoch_step=2/2 micro_steps=290 elapsed=4.3s lr=2.000000e-03 loss=0.2915 loss_recon=0.2915 loss_meanflow=0.0000 mean_model_t=0.1777 mean_corrupt_t=0.1777 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.1777 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.6000 corrupt_frac=0.7125 acc_corrupt=0.4386 loss_corrupt=0.5437 wrong_frac=0.8947 init_acc_corrupt=0.0526 acc_corrupt_t_0p0_0p2=0.3333 corrupt_frac_t_0p0_0p2=1.0000 out_w_norm=2.1215 out_g_norm=2.7226 acc_corrupt_t_0p2_0p4=0.5833 corrupt_frac_t_0p2_0p4=1.0000 loss_all=2.3824 init_gold_top10=0.1667 init_gold_top100=0.3333
|
| 306 |
+
step=300 epoch=150/250 epoch_step=2/2 micro_steps=300 elapsed=3.9s lr=2.000000e-03 loss=0.4178 loss_recon=0.4178 loss_meanflow=0.0000 mean_model_t=0.2061 mean_corrupt_t=0.2061 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.2061 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.6125 corrupt_frac=0.7375 acc_corrupt=0.4746 loss_corrupt=0.7239 wrong_frac=0.7627 init_acc_corrupt=0.0678 acc_corrupt_t_0p0_0p2=0.6154 corrupt_frac_t_0p0_0p2=1.0000 out_w_norm=2.1224 out_g_norm=2.1706 acc_corrupt_t_0p2_0p4=0.3636 corrupt_frac_t_0p2_0p4=1.0000 loss_all=2.5565 init_gold_top10=0.1429 init_gold_top100=0.5714
|
| 307 |
+
step=310 epoch=155/250 epoch_step=2/2 micro_steps=310 elapsed=4.9s lr=2.000000e-03 loss=0.4163 loss_recon=0.4163 loss_meanflow=0.0000 mean_model_t=0.1841 mean_corrupt_t=0.1841 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.1841 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.6125 corrupt_frac=0.7000 acc_corrupt=0.4464 loss_corrupt=0.6763 wrong_frac=0.7857 init_acc_corrupt=0.1429 acc_corrupt_t_0p0_0p2=0.3889 corrupt_frac_t_0p0_0p2=1.0000 out_w_norm=2.1090 out_g_norm=2.5876 acc_corrupt_t_0p4_0p6=0.5714 corrupt_frac_t_0p4_0p6=1.0000 acc_corrupt_t_0p2_0p4=0.5385 corrupt_frac_t_0p2_0p4=1.0000 loss_all=2.2671 init_gold_top10=0.5714 init_gold_top100=0.5714
|
| 308 |
+
step=320 epoch=160/250 epoch_step=2/2 micro_steps=320 elapsed=4.2s lr=2.000000e-03 loss=0.4162 loss_recon=0.4162 loss_meanflow=0.0000 mean_model_t=0.2237 mean_corrupt_t=0.2237 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.2237 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.6500 corrupt_frac=0.7625 acc_corrupt=0.5410 loss_corrupt=0.6050 wrong_frac=0.7213 init_acc_corrupt=0.1803 acc_corrupt_t_0p0_0p2=0.5185 corrupt_frac_t_0p0_0p2=1.0000 out_w_norm=2.0998 out_g_norm=2.0437 acc_corrupt_t_0p2_0p4=0.5556 corrupt_frac_t_0p2_0p4=1.0000 acc_corrupt_t_0p4_0p6=0.5714 corrupt_frac_t_0p4_0p6=1.0000 loss_all=2.4873 init_gold_top10=0.2500 init_gold_top100=0.5000
|
| 309 |
+
step=330 epoch=165/250 epoch_step=2/2 micro_steps=330 elapsed=4.0s lr=2.000000e-03 loss=0.3596 loss_recon=0.3596 loss_meanflow=0.0000 mean_model_t=0.1733 mean_corrupt_t=0.1733 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.1733 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.4375 corrupt_frac=0.7750 acc_corrupt=0.2742 loss_corrupt=0.9846 wrong_frac=0.8710 init_acc_corrupt=0.0968 acc_corrupt_t_0p0_0p2=0.1277 corrupt_frac_t_0p0_0p2=1.0000 out_w_norm=2.0984 out_g_norm=2.1073 acc_corrupt_t_0p2_0p4=0.7333 corrupt_frac_t_0p2_0p4=1.0000 loss_all=5.2998 init_gold_top10=0.0000 init_gold_top100=0.0000
|
| 310 |
+
step=340 epoch=170/250 epoch_step=2/2 micro_steps=340 elapsed=4.4s lr=2.000000e-03 loss=0.2857 loss_recon=0.2857 loss_meanflow=0.0000 mean_model_t=0.1706 mean_corrupt_t=0.1706 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.1706 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.6625 corrupt_frac=0.6875 acc_corrupt=0.5273 loss_corrupt=0.4981 wrong_frac=0.8364 init_acc_corrupt=0.0545 acc_corrupt_t_0p4_0p6=0.7500 corrupt_frac_t_0p4_0p6=1.0000 out_w_norm=2.0925 out_g_norm=2.0339 acc_corrupt_t_0p0_0p2=0.5000 corrupt_frac_t_0p0_0p2=1.0000 acc_corrupt_t_0p2_0p4=0.5714 corrupt_frac_t_0p2_0p4=1.0000 loss_all=2.0654 init_gold_top10=0.0000 init_gold_top100=0.0000
|
| 311 |
+
step=350 epoch=175/250 epoch_step=2/2 micro_steps=350 elapsed=4.2s lr=2.000000e-03 loss=0.1813 loss_recon=0.1813 loss_meanflow=0.0000 mean_model_t=0.1718 mean_corrupt_t=0.1718 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.1718 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.7750 corrupt_frac=0.7000 acc_corrupt=0.6786 loss_corrupt=0.4732 wrong_frac=0.8214 init_acc_corrupt=0.1607 acc_corrupt_t_0p0_0p2=0.5500 corrupt_frac_t_0p0_0p2=1.0000 out_w_norm=2.0854 out_g_norm=2.3255 acc_corrupt_t_0p4_0p6=1.0000 corrupt_frac_t_0p4_0p6=1.0000 acc_corrupt_t_0p2_0p4=1.0000 corrupt_frac_t_0p2_0p4=1.0000 loss_all=2.5088 init_gold_top10=0.1667 init_gold_top100=0.3333
|
| 312 |
+
step=360 epoch=180/250 epoch_step=2/2 micro_steps=360 elapsed=3.9s lr=2.000000e-03 loss=0.1510 loss_recon=0.1510 loss_meanflow=0.0000 mean_model_t=0.2313 mean_corrupt_t=0.2313 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.2313 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.7875 corrupt_frac=0.7125 acc_corrupt=0.7018 loss_corrupt=0.3167 wrong_frac=0.7368 init_acc_corrupt=0.1930 acc_corrupt_t_0p2_0p4=0.8571 corrupt_frac_t_0p2_0p4=1.0000 out_w_norm=2.0792 out_g_norm=1.7478 acc_corrupt_t_0p0_0p2=0.4828 corrupt_frac_t_0p0_0p2=1.0000 acc_corrupt_t_0p4_0p6=1.0000 corrupt_frac_t_0p4_0p6=1.0000 loss_all=1.8887 init_gold_top10=0.0000 init_gold_top100=0.5000
|
| 313 |
+
step=370 epoch=185/250 epoch_step=2/2 micro_steps=370 elapsed=4.3s lr=2.000000e-03 loss=0.2450 loss_recon=0.2450 loss_meanflow=0.0000 mean_model_t=0.1875 mean_corrupt_t=0.1875 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.1875 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.8000 corrupt_frac=0.6250 acc_corrupt=0.6800 loss_corrupt=0.3616 wrong_frac=0.8000 init_acc_corrupt=0.1000 acc_corrupt_t_0p0_0p2=0.7586 corrupt_frac_t_0p0_0p2=1.0000 out_w_norm=2.0847 out_g_norm=2.1802 acc_corrupt_t_0p2_0p4=0.5714 corrupt_frac_t_0p2_0p4=1.0000 loss_all=1.4507 init_gold_top10=0.2500 init_gold_top100=0.2500
|
| 314 |
+
step=380 epoch=190/250 epoch_step=2/2 micro_steps=380 elapsed=4.3s lr=2.000000e-03 loss=0.1449 loss_recon=0.1449 loss_meanflow=0.0000 mean_model_t=0.1769 mean_corrupt_t=0.1769 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.1769 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.8000 corrupt_frac=0.6500 acc_corrupt=0.6923 loss_corrupt=0.3072 wrong_frac=0.7692 init_acc_corrupt=0.0962 acc_corrupt_t_0p0_0p2=0.6250 corrupt_frac_t_0p0_0p2=1.0000 out_w_norm=2.0835 out_g_norm=1.5933 acc_corrupt_t_0p2_0p4=0.9167 corrupt_frac_t_0p2_0p4=1.0000 loss_all=1.4590 init_gold_top10=0.5000 init_gold_top100=0.5000
|
| 315 |
+
step=390 epoch=195/250 epoch_step=2/2 micro_steps=390 elapsed=3.9s lr=2.000000e-03 loss=0.1706 loss_recon=0.1706 loss_meanflow=0.0000 mean_model_t=0.2492 mean_corrupt_t=0.2492 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.2492 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.8250 corrupt_frac=0.7750 acc_corrupt=0.7742 loss_corrupt=0.3738 wrong_frac=0.7097 init_acc_corrupt=0.0968 acc_corrupt_t_0p2_0p4=0.7843 corrupt_frac_t_0p2_0p4=1.0000 out_w_norm=2.0944 out_g_norm=2.0398 acc_corrupt_t_0p0_0p2=0.7273 corrupt_frac_t_0p0_0p2=1.0000 loss_all=1.9468 init_gold_top10=0.1429 init_gold_top100=0.1429
|
| 316 |
+
step=400 epoch=200/250 epoch_step=2/2 micro_steps=400 elapsed=4.3s lr=2.000000e-03 loss=0.1752 loss_recon=0.1752 loss_meanflow=0.0000 mean_model_t=0.2467 mean_corrupt_t=0.2467 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.2467 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.8000 corrupt_frac=0.7750 acc_corrupt=0.7419 loss_corrupt=0.3860 wrong_frac=0.7903 init_acc_corrupt=0.1613 acc_corrupt_t_0p0_0p2=0.5000 corrupt_frac_t_0p0_0p2=1.0000 out_w_norm=2.0922 out_g_norm=1.7354 acc_corrupt_t_0p4_0p6=1.0000 corrupt_frac_t_0p4_0p6=1.0000 acc_corrupt_t_0p2_0p4=0.8750 corrupt_frac_t_0p2_0p4=1.0000 loss_all=2.0479 init_gold_top10=0.4286 init_gold_top100=0.4286
|
| 317 |
+
step=410 epoch=205/250 epoch_step=2/2 micro_steps=410 elapsed=4.3s lr=2.000000e-03 loss=0.2606 loss_recon=0.2606 loss_meanflow=0.0000 mean_model_t=0.2461 mean_corrupt_t=0.2461 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.2461 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.7625 corrupt_frac=0.6250 acc_corrupt=0.6200 loss_corrupt=1.0780 wrong_frac=0.7600 init_acc_corrupt=0.1200 acc_corrupt_t_0p0_0p2=0.4286 corrupt_frac_t_0p0_0p2=1.0000 out_w_norm=2.0802 out_g_norm=2.0628 acc_corrupt_t_0p2_0p4=0.8000 corrupt_frac_t_0p2_0p4=1.0000 acc_corrupt_t_0p4_0p6=0.9167 corrupt_frac_t_0p4_0p6=1.0000 loss_all=5.3184 init_gold_top10=0.0000 init_gold_top100=0.2857
|
| 318 |
+
step=420 epoch=210/250 epoch_step=2/2 micro_steps=420 elapsed=3.9s lr=2.000000e-03 loss=0.2090 loss_recon=0.2090 loss_meanflow=0.0000 mean_model_t=0.2875 mean_corrupt_t=0.2875 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.2875 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.8750 corrupt_frac=0.7625 acc_corrupt=0.8361 loss_corrupt=0.3340 wrong_frac=0.7049 init_acc_corrupt=0.2459 acc_corrupt_t_0p0_0p2=0.7407 corrupt_frac_t_0p0_0p2=1.0000 out_w_norm=2.0755 out_g_norm=1.5891 acc_corrupt_t_0p4_0p6=0.9000 corrupt_frac_t_0p4_0p6=1.0000 acc_corrupt_t_0p2_0p4=0.9286 corrupt_frac_t_0p2_0p4=1.0000 loss_all=1.5884 init_gold_top10=0.5000 init_gold_top100=0.5000
|
| 319 |
+
step=430 epoch=215/250 epoch_step=2/2 micro_steps=430 elapsed=4.3s lr=2.000000e-03 loss=0.2835 loss_recon=0.2835 loss_meanflow=0.0000 mean_model_t=0.2302 mean_corrupt_t=0.2302 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.2302 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.8000 corrupt_frac=0.6875 acc_corrupt=0.7091 loss_corrupt=0.9789 wrong_frac=0.8727 init_acc_corrupt=0.0727 acc_corrupt_t_0p0_0p2=0.7778 corrupt_frac_t_0p0_0p2=1.0000 out_w_norm=2.0719 out_g_norm=1.7877 acc_corrupt_t_0p2_0p4=0.5833 corrupt_frac_t_0p2_0p4=1.0000 acc_corrupt_t_0p4_0p6=1.0000 corrupt_frac_t_0p4_0p6=1.0000 loss_all=5.6514 init_gold_top10=0.0000 init_gold_top100=0.2857
|
| 320 |
+
step=440 epoch=220/250 epoch_step=2/2 micro_steps=440 elapsed=4.3s lr=2.000000e-03 loss=0.1238 loss_recon=0.1238 loss_meanflow=0.0000 mean_model_t=0.2328 mean_corrupt_t=0.2328 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.2328 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.8625 corrupt_frac=0.6500 acc_corrupt=0.7885 loss_corrupt=0.6087 wrong_frac=0.7692 init_acc_corrupt=0.1731 acc_corrupt_t_0p0_0p2=0.6765 corrupt_frac_t_0p0_0p2=1.0000 out_w_norm=2.0703 out_g_norm=1.9463 acc_corrupt_t_0p2_0p4=1.0000 corrupt_frac_t_0p2_0p4=1.0000 acc_corrupt_t_0p6_0p8=1.0000 corrupt_frac_t_0p6_0p8=1.0000 loss_all=3.3848 init_gold_top10=0.0000 init_gold_top100=0.2857
|
| 321 |
+
step=450 epoch=225/250 epoch_step=2/2 micro_steps=450 elapsed=3.9s lr=2.000000e-03 loss=0.1397 loss_recon=0.1397 loss_meanflow=0.0000 mean_model_t=0.1586 mean_corrupt_t=0.1586 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.1586 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.8875 corrupt_frac=0.6750 acc_corrupt=0.8333 loss_corrupt=0.4628 wrong_frac=0.8333 init_acc_corrupt=0.0185 acc_corrupt_t_0p2_0p4=0.6471 corrupt_frac_t_0p2_0p4=1.0000 out_w_norm=2.0704 out_g_norm=1.4328 acc_corrupt_t_0p0_0p2=0.9189 corrupt_frac_t_0p0_0p2=1.0000 loss_all=2.4082 init_gold_top10=0.1429 init_gold_top100=0.1429
|
| 322 |
+
step=460 epoch=230/250 epoch_step=2/2 micro_steps=460 elapsed=4.4s lr=2.000000e-03 loss=0.1377 loss_recon=0.1377 loss_meanflow=0.0000 mean_model_t=0.2081 mean_corrupt_t=0.2081 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.2081 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.9125 corrupt_frac=0.6375 acc_corrupt=0.8627 loss_corrupt=0.2271 wrong_frac=0.7255 init_acc_corrupt=0.1765 acc_corrupt_t_0p0_0p2=0.8437 corrupt_frac_t_0p0_0p2=1.0000 out_w_norm=2.0668 out_g_norm=1.6259 acc_corrupt_t_0p2_0p4=0.9091 corrupt_frac_t_0p2_0p4=1.0000 acc_corrupt_t_0p4_0p6=0.8750 corrupt_frac_t_0p4_0p6=1.0000 loss_all=1.0952 init_gold_top10=0.5000 init_gold_top100=0.5000
|
| 323 |
+
step=470 epoch=235/250 epoch_step=2/2 micro_steps=470 elapsed=4.3s lr=2.000000e-03 loss=0.1759 loss_recon=0.1759 loss_meanflow=0.0000 mean_model_t=0.1753 mean_corrupt_t=0.1753 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.1753 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.8500 corrupt_frac=0.7000 acc_corrupt=0.7857 loss_corrupt=0.4325 wrong_frac=0.8393 init_acc_corrupt=0.0893 acc_corrupt_t_0p0_0p2=0.7436 corrupt_frac_t_0p0_0p2=1.0000 out_w_norm=2.0613 out_g_norm=1.8623 acc_corrupt_t_0p4_0p6=1.0000 corrupt_frac_t_0p4_0p6=1.0000 acc_corrupt_t_0p2_0p4=0.8182 corrupt_frac_t_0p2_0p4=1.0000 loss_all=1.9712 init_gold_top10=0.1429 init_gold_top100=0.1429
|
| 324 |
+
step=480 epoch=240/250 epoch_step=2/2 micro_steps=480 elapsed=3.9s lr=2.000000e-03 loss=0.1247 loss_recon=0.1247 loss_meanflow=0.0000 mean_model_t=0.2083 mean_corrupt_t=0.2083 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.2083 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.9375 corrupt_frac=0.6500 acc_corrupt=0.9038 loss_corrupt=0.2654 wrong_frac=0.7500 init_acc_corrupt=0.1731 acc_corrupt_t_0p0_0p2=0.8750 corrupt_frac_t_0p0_0p2=1.0000 out_w_norm=2.0751 out_g_norm=1.8762 acc_corrupt_t_0p2_0p4=0.9091 corrupt_frac_t_0p2_0p4=1.0000 acc_corrupt_t_0p4_0p6=1.0000 corrupt_frac_t_0p4_0p6=1.0000 loss_all=1.2505 init_gold_top10=0.0000 init_gold_top100=0.5000
|
| 325 |
+
step=490 epoch=245/250 epoch_step=2/2 micro_steps=490 elapsed=4.8s lr=2.000000e-03 loss=0.0926 loss_recon=0.0926 loss_meanflow=0.0000 mean_model_t=0.2059 mean_corrupt_t=0.2059 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.2059 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.9625 corrupt_frac=0.7125 acc_corrupt=0.9474 loss_corrupt=0.3796 wrong_frac=0.7895 init_acc_corrupt=0.0877 acc_corrupt_t_0p0_0p2=0.9302 corrupt_frac_t_0p0_0p2=1.0000 out_w_norm=2.0723 out_g_norm=1.4834 acc_corrupt_t_0p2_0p4=1.0000 corrupt_frac_t_0p2_0p4=1.0000 loss_all=2.5068 init_gold_top10=0.1667 init_gold_top100=0.1667
|
| 326 |
+
step=500 epoch=250/250 epoch_step=2/2 micro_steps=500 elapsed=4.2s lr=2.000000e-03 loss=0.1136 loss_recon=0.1136 loss_meanflow=0.0000 mean_model_t=0.2092 mean_corrupt_t=0.2092 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.2092 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.8250 corrupt_frac=0.8000 acc_corrupt=0.7813 loss_corrupt=0.2750 wrong_frac=0.7656 init_acc_corrupt=0.1563 acc_corrupt_t_0p2_0p4=0.8800 corrupt_frac_t_0p2_0p4=1.0000 out_w_norm=2.0694 out_g_norm=1.7062 acc_corrupt_t_0p0_0p2=0.6667 corrupt_frac_t_0p0_0p2=1.0000 acc_corrupt_t_0p4_0p6=1.0000 corrupt_frac_t_0p4_0p6=1.0000 loss_all=1.8398 init_gold_top10=0.3333 init_gold_top100=0.3333
|
LTA_openwebtext_dualt/logs/softendpoint_mn_pilot_4gpu/train8_noisegeo_len256_allcorrupt_highC64_4096_20260517_163805.log
ADDED
|
@@ -0,0 +1,987 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
NCCL version 2.25.1+cuda12.8
|
| 2 |
+
{
|
| 3 |
+
"device": "cuda:0",
|
| 4 |
+
"rank": 0,
|
| 5 |
+
"world_size": 4,
|
| 6 |
+
"samples": "owt_cached_chunks:8",
|
| 7 |
+
"vocab_size": 969,
|
| 8 |
+
"tokenizer_vocab_size": 50257,
|
| 9 |
+
"save_dir": "runs/train8_noisegeo_len256_allcorrupt_highC64_4096_20260517_163805",
|
| 10 |
+
"batch_size": 128,
|
| 11 |
+
"grad_accum": 1,
|
| 12 |
+
"effective_batch_size": 512,
|
| 13 |
+
"global_batch_size": 512,
|
| 14 |
+
"lr_schedule": "constant_warmup",
|
| 15 |
+
"optimizer": "muon",
|
| 16 |
+
"epochs": 0.0,
|
| 17 |
+
"steps_per_epoch": 1,
|
| 18 |
+
"total_steps": 1000,
|
| 19 |
+
"warmup_steps": 10,
|
| 20 |
+
"warmup_epochs": -1.0,
|
| 21 |
+
"min_lr": 0.0,
|
| 22 |
+
"weight_decay": 0.1,
|
| 23 |
+
"output_weight_decay": -1.0,
|
| 24 |
+
"adamw_param_groups": "nanogpt",
|
| 25 |
+
"adam_beta1": 0.9,
|
| 26 |
+
"adam_beta2": 0.95,
|
| 27 |
+
"adam_eps": 1e-08,
|
| 28 |
+
"muon_impl": "legacy",
|
| 29 |
+
"muon_momentum": 0.95,
|
| 30 |
+
"muon_ns_steps": 5,
|
| 31 |
+
"muon_update_scale": 1.0,
|
| 32 |
+
"muon_nesterov": false,
|
| 33 |
+
"muon_width_scale": false,
|
| 34 |
+
"muon_grouping": "legacy_dim_ge_2",
|
| 35 |
+
"muon_param_count": 1965440,
|
| 36 |
+
"muon_adam_param_count": 8192,
|
| 37 |
+
"muon_param_names": [
|
| 38 |
+
"vocab_embed.embedding",
|
| 39 |
+
"sigma_map.net.0.weight",
|
| 40 |
+
"sigma_map.net.2.weight",
|
| 41 |
+
"blocks.0.attn_qkv.weight",
|
| 42 |
+
"blocks.0.attn_out.weight",
|
| 43 |
+
"blocks.0.mlp.0.weight",
|
| 44 |
+
"blocks.0.mlp.2.weight",
|
| 45 |
+
"blocks.0.adaLN_modulation.weight",
|
| 46 |
+
"blocks.1.attn_qkv.weight",
|
| 47 |
+
"blocks.1.attn_out.weight",
|
| 48 |
+
"blocks.1.mlp.0.weight",
|
| 49 |
+
"blocks.1.mlp.2.weight",
|
| 50 |
+
"blocks.1.adaLN_modulation.weight",
|
| 51 |
+
"blocks.2.attn_qkv.weight",
|
| 52 |
+
"blocks.2.attn_out.weight",
|
| 53 |
+
"blocks.2.mlp.0.weight",
|
| 54 |
+
"blocks.2.mlp.2.weight",
|
| 55 |
+
"blocks.2.adaLN_modulation.weight",
|
| 56 |
+
"output_layer.linear.weight",
|
| 57 |
+
"output_layer.adaLN_modulation.weight"
|
| 58 |
+
],
|
| 59 |
+
"muon_adam_param_names": [
|
| 60 |
+
"sigma_map.net.0.bias",
|
| 61 |
+
"sigma_map.net.2.bias",
|
| 62 |
+
"blocks.0.norm1.weight",
|
| 63 |
+
"blocks.0.norm2.weight",
|
| 64 |
+
"blocks.0.mlp.0.bias",
|
| 65 |
+
"blocks.0.mlp.2.bias",
|
| 66 |
+
"blocks.0.adaLN_modulation.bias",
|
| 67 |
+
"blocks.1.norm1.weight",
|
| 68 |
+
"blocks.1.norm2.weight",
|
| 69 |
+
"blocks.1.mlp.0.bias",
|
| 70 |
+
"blocks.1.mlp.2.bias",
|
| 71 |
+
"blocks.1.adaLN_modulation.bias",
|
| 72 |
+
"blocks.2.norm1.weight",
|
| 73 |
+
"blocks.2.norm2.weight",
|
| 74 |
+
"blocks.2.mlp.0.bias",
|
| 75 |
+
"blocks.2.mlp.2.bias",
|
| 76 |
+
"blocks.2.adaLN_modulation.bias",
|
| 77 |
+
"output_layer.norm_final.weight",
|
| 78 |
+
"output_layer.adaLN_modulation.bias"
|
| 79 |
+
],
|
| 80 |
+
"muon_effective_nesterov": false,
|
| 81 |
+
"muon_effective_width_scale": false,
|
| 82 |
+
"muon_effective_weight_decay": 0.1,
|
| 83 |
+
"muon_adam_fallback_nesterov": false,
|
| 84 |
+
"muon_adam_fallback_weight_decay": 0.1,
|
| 85 |
+
"ema_decay": 0.9999,
|
| 86 |
+
"ema_start_step": 0,
|
| 87 |
+
"model_type": "ddit",
|
| 88 |
+
"ddit_mlp_type": "gelu",
|
| 89 |
+
"elf_num_time_tokens": 4,
|
| 90 |
+
"elf_num_model_mode_tokens": 0,
|
| 91 |
+
"qk_norm": true,
|
| 92 |
+
"output_bias": false,
|
| 93 |
+
"output_init_std": -1.0,
|
| 94 |
+
"norm_type": "rmsnorm",
|
| 95 |
+
"target_loss": "hard_ce",
|
| 96 |
+
"linear_soft_target_power": 1.0,
|
| 97 |
+
"linear_soft_target_min_conf": 0.0,
|
| 98 |
+
"linear_soft_target_max_conf": 1.0,
|
| 99 |
+
"t_sampling_mode": "logit_normal",
|
| 100 |
+
"t_sampling_power": 1.0,
|
| 101 |
+
"t_sampling_eps": 0.0001,
|
| 102 |
+
"t_sampling_logit_mean": -1.5,
|
| 103 |
+
"t_sampling_logit_std": 0.8,
|
| 104 |
+
"dual_t": true,
|
| 105 |
+
"corrupt_t_mode": "same",
|
| 106 |
+
"corrupt_min_t": 0.0,
|
| 107 |
+
"corrupt_max_t": 1.0,
|
| 108 |
+
"prefix_block_prob": 0.0,
|
| 109 |
+
"prefix_block_len": 128,
|
| 110 |
+
"mask_ratio_floor_schedule": "none",
|
| 111 |
+
"dirichlet_endpoint_mode": "categorical_dual_t",
|
| 112 |
+
"dirichlet_semantic_t_mode": "same",
|
| 113 |
+
"dirichlet_semantic_t_value": 0.0,
|
| 114 |
+
"dirichlet_semantic_t_curve": "linear",
|
| 115 |
+
"dirichlet_semantic_t_power": 1.0,
|
| 116 |
+
"endpoint_sequence_random_prob_alpha": 0.0,
|
| 117 |
+
"categorical_wrong_from_full_vocab": true,
|
| 118 |
+
"categorical_wrong_from_batch_valid_tokens": false,
|
| 119 |
+
"categorical_wrong_basin_token_ids": "",
|
| 120 |
+
"categorical_wrong_basin_prob": 0.0,
|
| 121 |
+
"categorical_wrong_unigram_prob": 0.0,
|
| 122 |
+
"categorical_wrong_uniform_prob": 0.0,
|
| 123 |
+
"categorical_wrong_corpus_unigram_path": "",
|
| 124 |
+
"categorical_wrong_corpus_unigram_alpha": 1.0,
|
| 125 |
+
"categorical_wrong_basin_shared_prob": 0.0,
|
| 126 |
+
"categorical_wrong_unigram_shared_prob": 0.0,
|
| 127 |
+
"mask_mixture_original_prob": 0.0,
|
| 128 |
+
"mask_mixture_lowk_prob": 0.0,
|
| 129 |
+
"mask_mixture_lowcorrupt_prob": 0.0,
|
| 130 |
+
"mask_mixture_block_prob": 0.0,
|
| 131 |
+
"mask_mixture_all_prob": 1.0,
|
| 132 |
+
"mask_mixture_lowk_clean_tokens": "0",
|
| 133 |
+
"mask_mixture_lowcorrupt_tokens": "1,2,4,8,16,32,64",
|
| 134 |
+
"mask_mixture_block_tokens": "64,128",
|
| 135 |
+
"simplex_bridge_sampler": "dirichlet",
|
| 136 |
+
"logistic_normal_sigma_min": 0.1,
|
| 137 |
+
"logistic_normal_sigma_max": 1.0,
|
| 138 |
+
"logistic_normal_tau_min": 1.0,
|
| 139 |
+
"logistic_normal_tau_max": 1.0,
|
| 140 |
+
"torch_compile": false,
|
| 141 |
+
"compile_mode": "max-autotune",
|
| 142 |
+
"state_format": "prob",
|
| 143 |
+
"meanflow_weight": 0.0,
|
| 144 |
+
"rollout_train_prob": 0.0,
|
| 145 |
+
"rollout_train_steps": 1,
|
| 146 |
+
"rollout_train_infer_steps": 64,
|
| 147 |
+
"rollout_train_temp": 1.45,
|
| 148 |
+
"rollout_train_max_gamma": 1.0,
|
| 149 |
+
"rollout_train_corrupt_only": true,
|
| 150 |
+
"rollout_train_samplewise": false,
|
| 151 |
+
"rollout_train_compute_always": false,
|
| 152 |
+
"bridge_noise_init": "logistic_normal",
|
| 153 |
+
"noise_sigma": -1.0,
|
| 154 |
+
"allow_tf32": true,
|
| 155 |
+
"activation_checkpointing": false,
|
| 156 |
+
"activation_checkpoint_interval": 1,
|
| 157 |
+
"activation_checkpoint_scope": "block",
|
| 158 |
+
"ddp_static_graph": false,
|
| 159 |
+
"ddp_gradient_as_bucket_view": true,
|
| 160 |
+
"blocking_data_transfer": false,
|
| 161 |
+
"dataloader_prefetch_factor": 4,
|
| 162 |
+
"full_train_stats": false,
|
| 163 |
+
"tokenized_hf": false,
|
| 164 |
+
"tokenized_pad_token": "pad",
|
| 165 |
+
"elf_conditional_hf": false,
|
| 166 |
+
"record_pad_truncate": false,
|
| 167 |
+
"record_add_eos": false,
|
| 168 |
+
"record_add_special_tokens": false,
|
| 169 |
+
"record_pad_token": "pad",
|
| 170 |
+
"record_shuffle_buffer": 10000,
|
| 171 |
+
"wrap": true,
|
| 172 |
+
"wrap_mode": "stream",
|
| 173 |
+
"wrap_record_buffer_size": 200,
|
| 174 |
+
"owt_cached_chunks": true,
|
| 175 |
+
"owt_chunk_cache_dir": "/e2e-data/evad-tech-vla/wanghan58/data/small_benchmarks/langflow_2604_11748/openwebtext_lta_cached_chunks/gpt2_len256_train8_compact_overfit",
|
| 176 |
+
"owt_chunk_cache_rebuild": false,
|
| 177 |
+
"owt_chunk_cache_write_batch": 4096,
|
| 178 |
+
"owt_exact_repeat_per_chunk": 64,
|
| 179 |
+
"online_chunk_shuffle": false,
|
| 180 |
+
"online_chunk_shuffle_buffer": 10000,
|
| 181 |
+
"openwebtext_split": "train_minus_100k",
|
| 182 |
+
"detokenizer": "auto",
|
| 183 |
+
"resolved_detokenizer": null,
|
| 184 |
+
"num_workers": 0,
|
| 185 |
+
"latest_every": 1000,
|
| 186 |
+
"resume_path": ""
|
| 187 |
+
}
|
| 188 |
+
step=100 epoch=100/1000 epoch_step=1/1 micro_steps=100 elapsed=4.4s lr=2.000000e-03 loss=6.6495 loss_recon=6.6495 loss_meanflow=0.0000 mean_model_t=0.2082 mean_corrupt_t=0.2082 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.1702 corrupt_frac=1.0000 acc_corrupt=0.1702 loss_corrupt=6.6495 wrong_frac=0.7917 init_acc_corrupt=0.2067 acc_corrupt_t_0p0_0p2=0.0878 corrupt_frac_t_0p0_0p2=0.5588 acc_corrupt_t_0p2_0p4=0.2294 corrupt_frac_t_0p2_0p4=0.3579 acc_corrupt_t_0p4_0p6=0.4491 corrupt_frac_t_0p4_0p6=0.0752 acc_corrupt_t_0p6_0p8=0.6387 corrupt_frac_t_0p6_0p8=0.0125 out_w_norm=1.2350 out_g_norm=0.9734 acc_corrupt_t_0p8_1p0=0.8477 corrupt_frac_t_0p8_1p0=0.0078 loss_all=6.3486 init_gold_top10=0.2053 init_gold_top100=0.2800
|
| 189 |
+
step=200 epoch=200/1000 epoch_step=1/1 micro_steps=200 elapsed=3.7s lr=2.000000e-03 loss=5.8699 loss_recon=5.8699 loss_meanflow=0.0000 mean_model_t=0.2081 mean_corrupt_t=0.2081 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.1640 corrupt_frac=1.0000 acc_corrupt=0.1640 loss_corrupt=5.8699 wrong_frac=0.7925 init_acc_corrupt=0.2059 acc_corrupt_t_0p0_0p2=0.0976 corrupt_frac_t_0p0_0p2=0.5577 acc_corrupt_t_0p2_0p4=0.2106 corrupt_frac_t_0p2_0p4=0.3608 acc_corrupt_t_0p4_0p6=0.3899 corrupt_frac_t_0p4_0p6=0.0734 acc_corrupt_t_0p6_0p8=0.6051 corrupt_frac_t_0p6_0p8=0.0119 out_w_norm=4.0392 out_g_norm=1.3294 acc_corrupt_t_0p8_1p0=0.7979 corrupt_frac_t_0p8_1p0=0.0078 loss_all=5.5159 init_gold_top10=0.2043 init_gold_top100=0.2783
|
| 190 |
+
step=300 epoch=300/1000 epoch_step=1/1 micro_steps=300 elapsed=3.7s lr=2.000000e-03 loss=5.1215 loss_recon=5.1215 loss_meanflow=0.0000 mean_model_t=0.2117 mean_corrupt_t=0.2117 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.1844 corrupt_frac=1.0000 acc_corrupt=0.1844 loss_corrupt=5.1215 wrong_frac=0.7884 init_acc_corrupt=0.2102 acc_corrupt_t_0p0_0p2=0.1071 corrupt_frac_t_0p0_0p2=0.5484 acc_corrupt_t_0p2_0p4=0.2350 corrupt_frac_t_0p2_0p4=0.3611 acc_corrupt_t_0p4_0p6=0.4284 corrupt_frac_t_0p4_0p6=0.0803 acc_corrupt_t_0p6_0p8=0.6246 corrupt_frac_t_0p6_0p8=0.0139 out_w_norm=6.5345 out_g_norm=0.7810 acc_corrupt_t_0p8_1p0=0.7812 corrupt_frac_t_0p8_1p0=0.0078 loss_all=4.7567 init_gold_top10=0.2224 init_gold_top100=0.2981
|
| 191 |
+
step=400 epoch=400/1000 epoch_step=1/1 micro_steps=400 elapsed=3.7s lr=2.000000e-03 loss=4.3914 loss_recon=4.3914 loss_meanflow=0.0000 mean_model_t=0.2073 mean_corrupt_t=0.2073 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.2275 corrupt_frac=1.0000 acc_corrupt=0.2275 loss_corrupt=4.3914 wrong_frac=0.7927 init_acc_corrupt=0.2056 acc_corrupt_t_0p0_0p2=0.1322 corrupt_frac_t_0p0_0p2=0.5613 acc_corrupt_t_0p2_0p4=0.3118 corrupt_frac_t_0p2_0p4=0.3571 acc_corrupt_t_0p4_0p6=0.4954 corrupt_frac_t_0p4_0p6=0.0727 acc_corrupt_t_0p6_0p8=0.6688 corrupt_frac_t_0p6_0p8=0.0126 out_w_norm=8.2403 out_g_norm=0.4520 loss_all=3.9405 init_gold_top10=0.2197 init_gold_top100=0.2939
|
| 192 |
+
step=500 epoch=500/1000 epoch_step=1/1 micro_steps=500 elapsed=3.7s lr=2.000000e-03 loss=3.4904 loss_recon=3.4904 loss_meanflow=0.0000 mean_model_t=0.2097 mean_corrupt_t=0.2097 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.2580 corrupt_frac=1.0000 acc_corrupt=0.2580 loss_corrupt=3.4904 wrong_frac=0.7904 init_acc_corrupt=0.2080 acc_corrupt_t_0p0_0p2=0.1592 corrupt_frac_t_0p0_0p2=0.5540 acc_corrupt_t_0p2_0p4=0.3420 corrupt_frac_t_0p2_0p4=0.3605 acc_corrupt_t_0p4_0p6=0.5218 corrupt_frac_t_0p4_0p6=0.0745 out_w_norm=9.6795 out_g_norm=0.4054 acc_corrupt_t_0p6_0p8=0.6903 corrupt_frac_t_0p6_0p8=0.0139 acc_corrupt_t_0p8_1p0=0.8594 corrupt_frac_t_0p8_1p0=0.0078 loss_all=2.9689 init_gold_top10=0.2225 init_gold_top100=0.2952
|
| 193 |
+
step=600 epoch=600/1000 epoch_step=1/1 micro_steps=600 elapsed=3.7s lr=2.000000e-03 loss=2.5069 loss_recon=2.5069 loss_meanflow=0.0000 mean_model_t=0.2089 mean_corrupt_t=0.2089 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.3478 corrupt_frac=1.0000 acc_corrupt=0.3478 loss_corrupt=2.5069 wrong_frac=0.7910 init_acc_corrupt=0.2074 acc_corrupt_t_0p0_0p2=0.2274 corrupt_frac_t_0p0_0p2=0.5598 acc_corrupt_t_0p2_0p4=0.4660 corrupt_frac_t_0p2_0p4=0.3568 acc_corrupt_t_0p4_0p6=0.6365 corrupt_frac_t_0p4_0p6=0.0750 acc_corrupt_t_0p6_0p8=0.7711 corrupt_frac_t_0p6_0p8=0.0120 out_w_norm=10.4910 out_g_norm=0.4782 acc_corrupt_t_0p8_1p0=0.8952 corrupt_frac_t_0p8_1p0=0.0094 loss_all=2.0582 init_gold_top10=0.2072 init_gold_top100=0.2815
|
| 194 |
+
step=700 epoch=700/1000 epoch_step=1/1 micro_steps=700 elapsed=3.7s lr=2.000000e-03 loss=1.6159 loss_recon=1.6159 loss_meanflow=0.0000 mean_model_t=0.2095 mean_corrupt_t=0.2095 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.5261 corrupt_frac=1.0000 acc_corrupt=0.5261 loss_corrupt=1.6159 wrong_frac=0.7907 init_acc_corrupt=0.2077 acc_corrupt_t_0p0_0p2=0.3700 corrupt_frac_t_0p0_0p2=0.5552 acc_corrupt_t_0p2_0p4=0.6933 corrupt_frac_t_0p2_0p4=0.3584 acc_corrupt_t_0p4_0p6=0.8285 corrupt_frac_t_0p4_0p6=0.0764 out_w_norm=11.0190 out_g_norm=0.5898 acc_corrupt_t_0p6_0p8=0.8964 corrupt_frac_t_0p6_0p8=0.0132 acc_corrupt_t_0p8_1p0=0.9167 corrupt_frac_t_0p8_1p0=0.0078 loss_all=1.3305 init_gold_top10=0.2117 init_gold_top100=0.2832
|
| 195 |
+
step=800 epoch=800/1000 epoch_step=1/1 micro_steps=800 elapsed=3.7s lr=2.000000e-03 loss=0.9537 loss_recon=0.9537 loss_meanflow=0.0000 mean_model_t=0.2103 mean_corrupt_t=0.2103 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.7156 corrupt_frac=1.0000 acc_corrupt=0.7156 loss_corrupt=0.9537 wrong_frac=0.7897 init_acc_corrupt=0.2086 acc_corrupt_t_0p0_0p2=0.5634 corrupt_frac_t_0p0_0p2=0.5525 acc_corrupt_t_0p2_0p4=0.8899 corrupt_frac_t_0p2_0p4=0.3592 acc_corrupt_t_0p4_0p6=0.9571 corrupt_frac_t_0p4_0p6=0.0800 acc_corrupt_t_0p6_0p8=0.9757 corrupt_frac_t_0p6_0p8=0.0141 out_w_norm=11.3997 out_g_norm=0.6929 acc_corrupt_t_0p8_1p0=1.0000 corrupt_frac_t_0p8_1p0=0.0078 loss_all=0.5768 init_gold_top10=0.2208 init_gold_top100=0.2931
|
| 196 |
+
step=900 epoch=900/1000 epoch_step=1/1 micro_steps=900 elapsed=3.7s lr=2.000000e-03 loss=0.5253 loss_recon=0.5253 loss_meanflow=0.0000 mean_model_t=0.2094 mean_corrupt_t=0.2094 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.8455 corrupt_frac=1.0000 acc_corrupt=0.8455 loss_corrupt=0.5253 wrong_frac=0.7906 init_acc_corrupt=0.2078 acc_corrupt_t_0p0_0p2=0.7406 corrupt_frac_t_0p0_0p2=0.5584 acc_corrupt_t_0p2_0p4=0.9744 corrupt_frac_t_0p2_0p4=0.3530 acc_corrupt_t_0p4_0p6=0.9931 corrupt_frac_t_0p4_0p6=0.0801 acc_corrupt_t_0p6_0p8=0.9967 corrupt_frac_t_0p6_0p8=0.0125 out_w_norm=11.7285 out_g_norm=0.5915 acc_corrupt_t_0p8_1p0=0.9974 corrupt_frac_t_0p8_1p0=0.0078 loss_all=0.3668 init_gold_top10=0.2190 init_gold_top100=0.2921
|
| 197 |
+
step=1000 epoch=1000/1000 epoch_step=1/1 micro_steps=1000 elapsed=3.7s lr=2.000000e-03 loss=0.3128 loss_recon=0.3128 loss_meanflow=0.0000 mean_model_t=0.2094 mean_corrupt_t=0.2094 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.9050 corrupt_frac=1.0000 acc_corrupt=0.9050 loss_corrupt=0.3128 wrong_frac=0.7908 init_acc_corrupt=0.2077 acc_corrupt_t_0p0_0p2=0.8329 corrupt_frac_t_0p0_0p2=0.5561 acc_corrupt_t_0p2_0p4=0.9944 corrupt_frac_t_0p2_0p4=0.3595 acc_corrupt_t_0p4_0p6=0.9985 corrupt_frac_t_0p4_0p6=0.0759 acc_corrupt_t_0p6_0p8=0.9982 corrupt_frac_t_0p6_0p8=0.0129 out_w_norm=11.9476 out_g_norm=0.4867 acc_corrupt_t_0p8_1p0=0.9980 corrupt_frac_t_0p8_1p0=0.0078 loss_all=0.2410 init_gold_top10=0.2187 init_gold_top100=0.2915
|
| 198 |
+
NCCL version 2.25.1+cuda12.8
|
| 199 |
+
resumed_from=runs/train8_noisegeo_len256_allcorrupt_highC64_4096_20260517_163805/latest.pt start_step=1001
|
| 200 |
+
{
|
| 201 |
+
"device": "cuda:0",
|
| 202 |
+
"rank": 0,
|
| 203 |
+
"world_size": 4,
|
| 204 |
+
"samples": "owt_cached_chunks:8",
|
| 205 |
+
"vocab_size": 969,
|
| 206 |
+
"tokenizer_vocab_size": 50257,
|
| 207 |
+
"save_dir": "runs/train8_noisegeo_len256_allcorrupt_highC64_4096_20260517_163805",
|
| 208 |
+
"batch_size": 128,
|
| 209 |
+
"grad_accum": 1,
|
| 210 |
+
"effective_batch_size": 512,
|
| 211 |
+
"global_batch_size": 512,
|
| 212 |
+
"lr_schedule": "constant_warmup",
|
| 213 |
+
"optimizer": "muon",
|
| 214 |
+
"epochs": 0.0,
|
| 215 |
+
"steps_per_epoch": 1,
|
| 216 |
+
"total_steps": 2000,
|
| 217 |
+
"warmup_steps": 10,
|
| 218 |
+
"warmup_epochs": -1.0,
|
| 219 |
+
"min_lr": 0.0,
|
| 220 |
+
"weight_decay": 0.1,
|
| 221 |
+
"output_weight_decay": -1.0,
|
| 222 |
+
"adamw_param_groups": "nanogpt",
|
| 223 |
+
"adam_beta1": 0.9,
|
| 224 |
+
"adam_beta2": 0.95,
|
| 225 |
+
"adam_eps": 1e-08,
|
| 226 |
+
"muon_impl": "legacy",
|
| 227 |
+
"muon_momentum": 0.95,
|
| 228 |
+
"muon_ns_steps": 5,
|
| 229 |
+
"muon_update_scale": 1.0,
|
| 230 |
+
"muon_nesterov": false,
|
| 231 |
+
"muon_width_scale": false,
|
| 232 |
+
"muon_grouping": "legacy_dim_ge_2",
|
| 233 |
+
"muon_param_count": 1965440,
|
| 234 |
+
"muon_adam_param_count": 8192,
|
| 235 |
+
"muon_param_names": [
|
| 236 |
+
"vocab_embed.embedding",
|
| 237 |
+
"sigma_map.net.0.weight",
|
| 238 |
+
"sigma_map.net.2.weight",
|
| 239 |
+
"blocks.0.attn_qkv.weight",
|
| 240 |
+
"blocks.0.attn_out.weight",
|
| 241 |
+
"blocks.0.mlp.0.weight",
|
| 242 |
+
"blocks.0.mlp.2.weight",
|
| 243 |
+
"blocks.0.adaLN_modulation.weight",
|
| 244 |
+
"blocks.1.attn_qkv.weight",
|
| 245 |
+
"blocks.1.attn_out.weight",
|
| 246 |
+
"blocks.1.mlp.0.weight",
|
| 247 |
+
"blocks.1.mlp.2.weight",
|
| 248 |
+
"blocks.1.adaLN_modulation.weight",
|
| 249 |
+
"blocks.2.attn_qkv.weight",
|
| 250 |
+
"blocks.2.attn_out.weight",
|
| 251 |
+
"blocks.2.mlp.0.weight",
|
| 252 |
+
"blocks.2.mlp.2.weight",
|
| 253 |
+
"blocks.2.adaLN_modulation.weight",
|
| 254 |
+
"output_layer.linear.weight",
|
| 255 |
+
"output_layer.adaLN_modulation.weight"
|
| 256 |
+
],
|
| 257 |
+
"muon_adam_param_names": [
|
| 258 |
+
"sigma_map.net.0.bias",
|
| 259 |
+
"sigma_map.net.2.bias",
|
| 260 |
+
"blocks.0.norm1.weight",
|
| 261 |
+
"blocks.0.norm2.weight",
|
| 262 |
+
"blocks.0.mlp.0.bias",
|
| 263 |
+
"blocks.0.mlp.2.bias",
|
| 264 |
+
"blocks.0.adaLN_modulation.bias",
|
| 265 |
+
"blocks.1.norm1.weight",
|
| 266 |
+
"blocks.1.norm2.weight",
|
| 267 |
+
"blocks.1.mlp.0.bias",
|
| 268 |
+
"blocks.1.mlp.2.bias",
|
| 269 |
+
"blocks.1.adaLN_modulation.bias",
|
| 270 |
+
"blocks.2.norm1.weight",
|
| 271 |
+
"blocks.2.norm2.weight",
|
| 272 |
+
"blocks.2.mlp.0.bias",
|
| 273 |
+
"blocks.2.mlp.2.bias",
|
| 274 |
+
"blocks.2.adaLN_modulation.bias",
|
| 275 |
+
"output_layer.norm_final.weight",
|
| 276 |
+
"output_layer.adaLN_modulation.bias"
|
| 277 |
+
],
|
| 278 |
+
"muon_effective_nesterov": false,
|
| 279 |
+
"muon_effective_width_scale": false,
|
| 280 |
+
"muon_effective_weight_decay": 0.1,
|
| 281 |
+
"muon_adam_fallback_nesterov": false,
|
| 282 |
+
"muon_adam_fallback_weight_decay": 0.1,
|
| 283 |
+
"ema_decay": 0.9999,
|
| 284 |
+
"ema_start_step": 0,
|
| 285 |
+
"model_type": "ddit",
|
| 286 |
+
"ddit_mlp_type": "gelu",
|
| 287 |
+
"elf_num_time_tokens": 4,
|
| 288 |
+
"elf_num_model_mode_tokens": 0,
|
| 289 |
+
"qk_norm": true,
|
| 290 |
+
"output_bias": false,
|
| 291 |
+
"output_init_std": -1.0,
|
| 292 |
+
"norm_type": "rmsnorm",
|
| 293 |
+
"target_loss": "hard_ce",
|
| 294 |
+
"linear_soft_target_power": 1.0,
|
| 295 |
+
"linear_soft_target_min_conf": 0.0,
|
| 296 |
+
"linear_soft_target_max_conf": 1.0,
|
| 297 |
+
"t_sampling_mode": "logit_normal",
|
| 298 |
+
"t_sampling_power": 1.0,
|
| 299 |
+
"t_sampling_eps": 0.0001,
|
| 300 |
+
"t_sampling_logit_mean": -1.5,
|
| 301 |
+
"t_sampling_logit_std": 0.8,
|
| 302 |
+
"dual_t": true,
|
| 303 |
+
"corrupt_t_mode": "same",
|
| 304 |
+
"corrupt_min_t": 0.0,
|
| 305 |
+
"corrupt_max_t": 1.0,
|
| 306 |
+
"prefix_block_prob": 0.0,
|
| 307 |
+
"prefix_block_len": 128,
|
| 308 |
+
"mask_ratio_floor_schedule": "none",
|
| 309 |
+
"dirichlet_endpoint_mode": "categorical_dual_t",
|
| 310 |
+
"dirichlet_semantic_t_mode": "same",
|
| 311 |
+
"dirichlet_semantic_t_value": 0.0,
|
| 312 |
+
"dirichlet_semantic_t_curve": "linear",
|
| 313 |
+
"dirichlet_semantic_t_power": 1.0,
|
| 314 |
+
"endpoint_sequence_random_prob_alpha": 0.0,
|
| 315 |
+
"categorical_wrong_from_full_vocab": true,
|
| 316 |
+
"categorical_wrong_from_batch_valid_tokens": false,
|
| 317 |
+
"categorical_wrong_basin_token_ids": "",
|
| 318 |
+
"categorical_wrong_basin_prob": 0.0,
|
| 319 |
+
"categorical_wrong_unigram_prob": 0.0,
|
| 320 |
+
"categorical_wrong_uniform_prob": 0.0,
|
| 321 |
+
"categorical_wrong_corpus_unigram_path": "",
|
| 322 |
+
"categorical_wrong_corpus_unigram_alpha": 1.0,
|
| 323 |
+
"categorical_wrong_basin_shared_prob": 0.0,
|
| 324 |
+
"categorical_wrong_unigram_shared_prob": 0.0,
|
| 325 |
+
"mask_mixture_original_prob": 0.0,
|
| 326 |
+
"mask_mixture_lowk_prob": 0.0,
|
| 327 |
+
"mask_mixture_lowcorrupt_prob": 0.0,
|
| 328 |
+
"mask_mixture_block_prob": 0.0,
|
| 329 |
+
"mask_mixture_all_prob": 1.0,
|
| 330 |
+
"mask_mixture_lowk_clean_tokens": "0",
|
| 331 |
+
"mask_mixture_lowcorrupt_tokens": "1,2,4,8,16,32,64",
|
| 332 |
+
"mask_mixture_block_tokens": "64,128",
|
| 333 |
+
"simplex_bridge_sampler": "dirichlet",
|
| 334 |
+
"logistic_normal_sigma_min": 0.1,
|
| 335 |
+
"logistic_normal_sigma_max": 1.0,
|
| 336 |
+
"logistic_normal_tau_min": 1.0,
|
| 337 |
+
"logistic_normal_tau_max": 1.0,
|
| 338 |
+
"torch_compile": false,
|
| 339 |
+
"compile_mode": "max-autotune",
|
| 340 |
+
"state_format": "prob",
|
| 341 |
+
"meanflow_weight": 0.0,
|
| 342 |
+
"rollout_train_prob": 0.0,
|
| 343 |
+
"rollout_train_steps": 1,
|
| 344 |
+
"rollout_train_infer_steps": 64,
|
| 345 |
+
"rollout_train_temp": 1.45,
|
| 346 |
+
"rollout_train_max_gamma": 1.0,
|
| 347 |
+
"rollout_train_corrupt_only": true,
|
| 348 |
+
"rollout_train_samplewise": false,
|
| 349 |
+
"rollout_train_compute_always": false,
|
| 350 |
+
"bridge_noise_init": "logistic_normal",
|
| 351 |
+
"noise_sigma": -1.0,
|
| 352 |
+
"allow_tf32": true,
|
| 353 |
+
"activation_checkpointing": false,
|
| 354 |
+
"activation_checkpoint_interval": 1,
|
| 355 |
+
"activation_checkpoint_scope": "block",
|
| 356 |
+
"ddp_static_graph": false,
|
| 357 |
+
"ddp_gradient_as_bucket_view": true,
|
| 358 |
+
"blocking_data_transfer": false,
|
| 359 |
+
"dataloader_prefetch_factor": 4,
|
| 360 |
+
"full_train_stats": false,
|
| 361 |
+
"tokenized_hf": false,
|
| 362 |
+
"tokenized_pad_token": "pad",
|
| 363 |
+
"elf_conditional_hf": false,
|
| 364 |
+
"record_pad_truncate": false,
|
| 365 |
+
"record_add_eos": false,
|
| 366 |
+
"record_add_special_tokens": false,
|
| 367 |
+
"record_pad_token": "pad",
|
| 368 |
+
"record_shuffle_buffer": 10000,
|
| 369 |
+
"wrap": true,
|
| 370 |
+
"wrap_mode": "stream",
|
| 371 |
+
"wrap_record_buffer_size": 200,
|
| 372 |
+
"owt_cached_chunks": true,
|
| 373 |
+
"owt_chunk_cache_dir": "/e2e-data/evad-tech-vla/wanghan58/data/small_benchmarks/langflow_2604_11748/openwebtext_lta_cached_chunks/gpt2_len256_train8_compact_overfit",
|
| 374 |
+
"owt_chunk_cache_rebuild": false,
|
| 375 |
+
"owt_chunk_cache_write_batch": 4096,
|
| 376 |
+
"owt_exact_repeat_per_chunk": 64,
|
| 377 |
+
"online_chunk_shuffle": false,
|
| 378 |
+
"online_chunk_shuffle_buffer": 10000,
|
| 379 |
+
"openwebtext_split": "train_minus_100k",
|
| 380 |
+
"detokenizer": "auto",
|
| 381 |
+
"resolved_detokenizer": null,
|
| 382 |
+
"num_workers": 0,
|
| 383 |
+
"latest_every": 1000,
|
| 384 |
+
"resume_path": "runs/train8_noisegeo_len256_allcorrupt_highC64_4096_20260517_163805/latest.pt"
|
| 385 |
+
}
|
| 386 |
+
step=1100 epoch=1100/2000 epoch_step=1/1 micro_steps=1100 elapsed=4.4s lr=2.000000e-03 loss=0.2449 loss_recon=0.2449 loss_meanflow=0.0000 mean_model_t=0.2082 mean_corrupt_t=0.2082 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.9244 corrupt_frac=1.0000 acc_corrupt=0.9244 loss_corrupt=0.2449 wrong_frac=0.7917 init_acc_corrupt=0.2067 acc_corrupt_t_0p0_0p2=0.8659 corrupt_frac_t_0p0_0p2=0.5588 acc_corrupt_t_0p2_0p4=0.9983 corrupt_frac_t_0p2_0p4=0.3579 acc_corrupt_t_0p4_0p6=0.9992 corrupt_frac_t_0p4_0p6=0.0752 acc_corrupt_t_0p6_0p8=0.9987 corrupt_frac_t_0p6_0p8=0.0125 out_w_norm=12.0079 out_g_norm=0.3943 acc_corrupt_t_0p8_1p0=0.9961 corrupt_frac_t_0p8_1p0=0.0078 loss_all=0.2226 init_gold_top10=0.2053 init_gold_top100=0.2800
|
| 387 |
+
step=1200 epoch=1200/2000 epoch_step=1/1 micro_steps=1200 elapsed=3.9s lr=2.000000e-03 loss=0.2175 loss_recon=0.2175 loss_meanflow=0.0000 mean_model_t=0.2081 mean_corrupt_t=0.2081 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.9306 corrupt_frac=1.0000 acc_corrupt=0.9306 loss_corrupt=0.2175 wrong_frac=0.7925 init_acc_corrupt=0.2059 acc_corrupt_t_0p0_0p2=0.8761 corrupt_frac_t_0p0_0p2=0.5577 acc_corrupt_t_0p2_0p4=0.9994 corrupt_frac_t_0p2_0p4=0.3608 acc_corrupt_t_0p4_0p6=0.9996 corrupt_frac_t_0p4_0p6=0.0734 acc_corrupt_t_0p6_0p8=0.9987 corrupt_frac_t_0p6_0p8=0.0119 out_w_norm=12.0269 out_g_norm=0.3266 acc_corrupt_t_0p8_1p0=0.9980 corrupt_frac_t_0p8_1p0=0.0078 loss_all=0.2401 init_gold_top10=0.2043 init_gold_top100=0.2783
|
| 388 |
+
step=1300 epoch=1300/2000 epoch_step=1/1 micro_steps=1300 elapsed=3.9s lr=2.000000e-03 loss=0.1832 loss_recon=0.1832 loss_meanflow=0.0000 mean_model_t=0.2117 mean_corrupt_t=0.2117 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.9416 corrupt_frac=1.0000 acc_corrupt=0.9416 loss_corrupt=0.1832 wrong_frac=0.7884 init_acc_corrupt=0.2102 acc_corrupt_t_0p0_0p2=0.8938 corrupt_frac_t_0p0_0p2=0.5484 acc_corrupt_t_0p2_0p4=0.9996 corrupt_frac_t_0p2_0p4=0.3611 acc_corrupt_t_0p4_0p6=0.9998 corrupt_frac_t_0p4_0p6=0.0803 acc_corrupt_t_0p6_0p8=0.9995 corrupt_frac_t_0p6_0p8=0.0139 out_w_norm=12.0197 out_g_norm=0.3052 acc_corrupt_t_0p8_1p0=0.9961 corrupt_frac_t_0p8_1p0=0.0078 loss_all=0.1627 init_gold_top10=0.2224 init_gold_top100=0.2981
|
| 389 |
+
step=1400 epoch=1400/2000 epoch_step=1/1 micro_steps=1400 elapsed=3.9s lr=2.000000e-03 loss=0.1813 loss_recon=0.1813 loss_meanflow=0.0000 mean_model_t=0.2073 mean_corrupt_t=0.2073 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.9413 corrupt_frac=1.0000 acc_corrupt=0.9413 loss_corrupt=0.1813 wrong_frac=0.7927 init_acc_corrupt=0.2056 acc_corrupt_t_0p0_0p2=0.8956 corrupt_frac_t_0p0_0p2=0.5613 acc_corrupt_t_0p2_0p4=0.9997 corrupt_frac_t_0p2_0p4=0.3571 acc_corrupt_t_0p4_0p6=0.9998 corrupt_frac_t_0p4_0p6=0.0727 acc_corrupt_t_0p6_0p8=0.9996 corrupt_frac_t_0p6_0p8=0.0126 out_w_norm=12.0342 out_g_norm=0.2857 loss_all=0.0911 init_gold_top10=0.2197 init_gold_top100=0.2939
|
| 390 |
+
step=1500 epoch=1500/2000 epoch_step=1/1 micro_steps=1500 elapsed=3.9s lr=2.000000e-03 loss=0.1679 loss_recon=0.1679 loss_meanflow=0.0000 mean_model_t=0.2097 mean_corrupt_t=0.2097 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.9460 corrupt_frac=1.0000 acc_corrupt=0.9460 loss_corrupt=0.1679 wrong_frac=0.7904 init_acc_corrupt=0.2080 acc_corrupt_t_0p0_0p2=0.9026 corrupt_frac_t_0p0_0p2=0.5540 acc_corrupt_t_0p2_0p4=0.9999 corrupt_frac_t_0p2_0p4=0.3605 acc_corrupt_t_0p4_0p6=0.9999 corrupt_frac_t_0p4_0p6=0.0745 out_w_norm=12.0459 out_g_norm=0.2685 acc_corrupt_t_0p6_0p8=0.9998 corrupt_frac_t_0p6_0p8=0.0139 acc_corrupt_t_0p8_1p0=0.9987 corrupt_frac_t_0p8_1p0=0.0078 loss_all=0.1933 init_gold_top10=0.2225 init_gold_top100=0.2952
|
| 391 |
+
step=1600 epoch=1600/2000 epoch_step=1/1 micro_steps=1600 elapsed=3.9s lr=2.000000e-03 loss=0.1488 loss_recon=0.1488 loss_meanflow=0.0000 mean_model_t=0.2089 mean_corrupt_t=0.2089 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.9519 corrupt_frac=1.0000 acc_corrupt=0.9519 loss_corrupt=0.1488 wrong_frac=0.7910 init_acc_corrupt=0.2074 acc_corrupt_t_0p0_0p2=0.9142 corrupt_frac_t_0p0_0p2=0.5598 acc_corrupt_t_0p2_0p4=0.9999 corrupt_frac_t_0p2_0p4=0.3568 acc_corrupt_t_0p4_0p6=0.9999 corrupt_frac_t_0p4_0p6=0.0750 acc_corrupt_t_0p6_0p8=0.9996 corrupt_frac_t_0p6_0p8=0.0120 out_w_norm=12.0548 out_g_norm=0.2284 acc_corrupt_t_0p8_1p0=0.9987 corrupt_frac_t_0p8_1p0=0.0094 loss_all=0.1261 init_gold_top10=0.2072 init_gold_top100=0.2815
|
| 392 |
+
step=1700 epoch=1700/2000 epoch_step=1/1 micro_steps=1700 elapsed=3.9s lr=2.000000e-03 loss=0.1446 loss_recon=0.1446 loss_meanflow=0.0000 mean_model_t=0.2095 mean_corrupt_t=0.2095 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.9526 corrupt_frac=1.0000 acc_corrupt=0.9526 loss_corrupt=0.1446 wrong_frac=0.7907 init_acc_corrupt=0.2077 acc_corrupt_t_0p0_0p2=0.9147 corrupt_frac_t_0p0_0p2=0.5552 acc_corrupt_t_0p2_0p4=0.9999 corrupt_frac_t_0p2_0p4=0.3584 acc_corrupt_t_0p4_0p6=0.9999 corrupt_frac_t_0p4_0p6=0.0764 out_w_norm=12.0717 out_g_norm=0.1855 acc_corrupt_t_0p6_0p8=0.9999 corrupt_frac_t_0p6_0p8=0.0132 acc_corrupt_t_0p8_1p0=0.9987 corrupt_frac_t_0p8_1p0=0.0078 loss_all=0.1127 init_gold_top10=0.2117 init_gold_top100=0.2832
|
| 393 |
+
step=1800 epoch=1800/2000 epoch_step=1/1 micro_steps=1800 elapsed=3.9s lr=2.000000e-03 loss=0.1385 loss_recon=0.1385 loss_meanflow=0.0000 mean_model_t=0.2103 mean_corrupt_t=0.2103 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.9550 corrupt_frac=1.0000 acc_corrupt=0.9550 loss_corrupt=0.1385 wrong_frac=0.7897 init_acc_corrupt=0.2086 acc_corrupt_t_0p0_0p2=0.9186 corrupt_frac_t_0p0_0p2=0.5525 acc_corrupt_t_0p2_0p4=0.9999 corrupt_frac_t_0p2_0p4=0.3592 acc_corrupt_t_0p4_0p6=0.9999 corrupt_frac_t_0p4_0p6=0.0800 acc_corrupt_t_0p6_0p8=0.9999 corrupt_frac_t_0p6_0p8=0.0141 out_w_norm=12.0831 out_g_norm=0.1934 acc_corrupt_t_0p8_1p0=1.0000 corrupt_frac_t_0p8_1p0=0.0078 loss_all=0.0903 init_gold_top10=0.2208 init_gold_top100=0.2931
|
| 394 |
+
step=1900 epoch=1900/2000 epoch_step=1/1 micro_steps=1900 elapsed=3.9s lr=2.000000e-03 loss=0.1373 loss_recon=0.1373 loss_meanflow=0.0000 mean_model_t=0.2094 mean_corrupt_t=0.2094 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.9553 corrupt_frac=1.0000 acc_corrupt=0.9553 loss_corrupt=0.1373 wrong_frac=0.7906 init_acc_corrupt=0.2078 acc_corrupt_t_0p0_0p2=0.9199 corrupt_frac_t_0p0_0p2=0.5584 acc_corrupt_t_0p2_0p4=0.9999 corrupt_frac_t_0p2_0p4=0.3530 acc_corrupt_t_0p4_0p6=0.9999 corrupt_frac_t_0p4_0p6=0.0801 acc_corrupt_t_0p6_0p8=0.9999 corrupt_frac_t_0p6_0p8=0.0125 out_w_norm=12.0937 out_g_norm=0.1728 acc_corrupt_t_0p8_1p0=0.9961 corrupt_frac_t_0p8_1p0=0.0078 loss_all=0.1211 init_gold_top10=0.2190 init_gold_top100=0.2921
|
| 395 |
+
step=2000 epoch=2000/2000 epoch_step=1/1 micro_steps=2000 elapsed=3.9s lr=2.000000e-03 loss=0.1201 loss_recon=0.1201 loss_meanflow=0.0000 mean_model_t=0.2094 mean_corrupt_t=0.2094 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.9606 corrupt_frac=1.0000 acc_corrupt=0.9606 loss_corrupt=0.1201 wrong_frac=0.7908 init_acc_corrupt=0.2077 acc_corrupt_t_0p0_0p2=0.9292 corrupt_frac_t_0p0_0p2=0.5561 acc_corrupt_t_0p2_0p4=0.9999 corrupt_frac_t_0p2_0p4=0.3595 acc_corrupt_t_0p4_0p6=0.9999 corrupt_frac_t_0p4_0p6=0.0759 acc_corrupt_t_0p6_0p8=0.9999 corrupt_frac_t_0p6_0p8=0.0129 out_w_norm=12.1029 out_g_norm=0.1619 acc_corrupt_t_0p8_1p0=1.0000 corrupt_frac_t_0p8_1p0=0.0078 loss_all=0.0543 init_gold_top10=0.2187 init_gold_top100=0.2915
|
| 396 |
+
NCCL version 2.25.1+cuda12.8
|
| 397 |
+
resumed_from=runs/train8_noisegeo_len256_allcorrupt_highC64_4096_20260517_163805/latest.pt start_step=2001
|
| 398 |
+
{
|
| 399 |
+
"device": "cuda:0",
|
| 400 |
+
"rank": 0,
|
| 401 |
+
"world_size": 4,
|
| 402 |
+
"samples": "owt_cached_chunks:8",
|
| 403 |
+
"vocab_size": 969,
|
| 404 |
+
"tokenizer_vocab_size": 50257,
|
| 405 |
+
"save_dir": "runs/train8_noisegeo_len256_allcorrupt_highC64_4096_20260517_163805",
|
| 406 |
+
"batch_size": 128,
|
| 407 |
+
"grad_accum": 1,
|
| 408 |
+
"effective_batch_size": 512,
|
| 409 |
+
"global_batch_size": 512,
|
| 410 |
+
"lr_schedule": "constant_warmup",
|
| 411 |
+
"optimizer": "muon",
|
| 412 |
+
"epochs": 0.0,
|
| 413 |
+
"steps_per_epoch": 1,
|
| 414 |
+
"total_steps": 3000,
|
| 415 |
+
"warmup_steps": 10,
|
| 416 |
+
"warmup_epochs": -1.0,
|
| 417 |
+
"min_lr": 0.0,
|
| 418 |
+
"weight_decay": 0.1,
|
| 419 |
+
"output_weight_decay": -1.0,
|
| 420 |
+
"adamw_param_groups": "nanogpt",
|
| 421 |
+
"adam_beta1": 0.9,
|
| 422 |
+
"adam_beta2": 0.95,
|
| 423 |
+
"adam_eps": 1e-08,
|
| 424 |
+
"muon_impl": "legacy",
|
| 425 |
+
"muon_momentum": 0.95,
|
| 426 |
+
"muon_ns_steps": 5,
|
| 427 |
+
"muon_update_scale": 1.0,
|
| 428 |
+
"muon_nesterov": false,
|
| 429 |
+
"muon_width_scale": false,
|
| 430 |
+
"muon_grouping": "legacy_dim_ge_2",
|
| 431 |
+
"muon_param_count": 1965440,
|
| 432 |
+
"muon_adam_param_count": 8192,
|
| 433 |
+
"muon_param_names": [
|
| 434 |
+
"vocab_embed.embedding",
|
| 435 |
+
"sigma_map.net.0.weight",
|
| 436 |
+
"sigma_map.net.2.weight",
|
| 437 |
+
"blocks.0.attn_qkv.weight",
|
| 438 |
+
"blocks.0.attn_out.weight",
|
| 439 |
+
"blocks.0.mlp.0.weight",
|
| 440 |
+
"blocks.0.mlp.2.weight",
|
| 441 |
+
"blocks.0.adaLN_modulation.weight",
|
| 442 |
+
"blocks.1.attn_qkv.weight",
|
| 443 |
+
"blocks.1.attn_out.weight",
|
| 444 |
+
"blocks.1.mlp.0.weight",
|
| 445 |
+
"blocks.1.mlp.2.weight",
|
| 446 |
+
"blocks.1.adaLN_modulation.weight",
|
| 447 |
+
"blocks.2.attn_qkv.weight",
|
| 448 |
+
"blocks.2.attn_out.weight",
|
| 449 |
+
"blocks.2.mlp.0.weight",
|
| 450 |
+
"blocks.2.mlp.2.weight",
|
| 451 |
+
"blocks.2.adaLN_modulation.weight",
|
| 452 |
+
"output_layer.linear.weight",
|
| 453 |
+
"output_layer.adaLN_modulation.weight"
|
| 454 |
+
],
|
| 455 |
+
"muon_adam_param_names": [
|
| 456 |
+
"sigma_map.net.0.bias",
|
| 457 |
+
"sigma_map.net.2.bias",
|
| 458 |
+
"blocks.0.norm1.weight",
|
| 459 |
+
"blocks.0.norm2.weight",
|
| 460 |
+
"blocks.0.mlp.0.bias",
|
| 461 |
+
"blocks.0.mlp.2.bias",
|
| 462 |
+
"blocks.0.adaLN_modulation.bias",
|
| 463 |
+
"blocks.1.norm1.weight",
|
| 464 |
+
"blocks.1.norm2.weight",
|
| 465 |
+
"blocks.1.mlp.0.bias",
|
| 466 |
+
"blocks.1.mlp.2.bias",
|
| 467 |
+
"blocks.1.adaLN_modulation.bias",
|
| 468 |
+
"blocks.2.norm1.weight",
|
| 469 |
+
"blocks.2.norm2.weight",
|
| 470 |
+
"blocks.2.mlp.0.bias",
|
| 471 |
+
"blocks.2.mlp.2.bias",
|
| 472 |
+
"blocks.2.adaLN_modulation.bias",
|
| 473 |
+
"output_layer.norm_final.weight",
|
| 474 |
+
"output_layer.adaLN_modulation.bias"
|
| 475 |
+
],
|
| 476 |
+
"muon_effective_nesterov": false,
|
| 477 |
+
"muon_effective_width_scale": false,
|
| 478 |
+
"muon_effective_weight_decay": 0.1,
|
| 479 |
+
"muon_adam_fallback_nesterov": false,
|
| 480 |
+
"muon_adam_fallback_weight_decay": 0.1,
|
| 481 |
+
"ema_decay": 0.9999,
|
| 482 |
+
"ema_start_step": 0,
|
| 483 |
+
"model_type": "ddit",
|
| 484 |
+
"ddit_mlp_type": "gelu",
|
| 485 |
+
"elf_num_time_tokens": 4,
|
| 486 |
+
"elf_num_model_mode_tokens": 0,
|
| 487 |
+
"qk_norm": true,
|
| 488 |
+
"output_bias": false,
|
| 489 |
+
"output_init_std": -1.0,
|
| 490 |
+
"norm_type": "rmsnorm",
|
| 491 |
+
"target_loss": "hard_ce",
|
| 492 |
+
"linear_soft_target_power": 1.0,
|
| 493 |
+
"linear_soft_target_min_conf": 0.0,
|
| 494 |
+
"linear_soft_target_max_conf": 1.0,
|
| 495 |
+
"t_sampling_mode": "logit_normal",
|
| 496 |
+
"t_sampling_power": 1.0,
|
| 497 |
+
"t_sampling_eps": 0.0001,
|
| 498 |
+
"t_sampling_logit_mean": -1.5,
|
| 499 |
+
"t_sampling_logit_std": 0.8,
|
| 500 |
+
"dual_t": true,
|
| 501 |
+
"corrupt_t_mode": "same",
|
| 502 |
+
"corrupt_min_t": 0.0,
|
| 503 |
+
"corrupt_max_t": 1.0,
|
| 504 |
+
"prefix_block_prob": 0.0,
|
| 505 |
+
"prefix_block_len": 128,
|
| 506 |
+
"mask_ratio_floor_schedule": "none",
|
| 507 |
+
"dirichlet_endpoint_mode": "categorical_dual_t",
|
| 508 |
+
"dirichlet_semantic_t_mode": "same",
|
| 509 |
+
"dirichlet_semantic_t_value": 0.0,
|
| 510 |
+
"dirichlet_semantic_t_curve": "linear",
|
| 511 |
+
"dirichlet_semantic_t_power": 1.0,
|
| 512 |
+
"endpoint_sequence_random_prob_alpha": 0.0,
|
| 513 |
+
"categorical_wrong_from_full_vocab": true,
|
| 514 |
+
"categorical_wrong_from_batch_valid_tokens": false,
|
| 515 |
+
"categorical_wrong_basin_token_ids": "",
|
| 516 |
+
"categorical_wrong_basin_prob": 0.0,
|
| 517 |
+
"categorical_wrong_unigram_prob": 0.0,
|
| 518 |
+
"categorical_wrong_uniform_prob": 0.0,
|
| 519 |
+
"categorical_wrong_corpus_unigram_path": "",
|
| 520 |
+
"categorical_wrong_corpus_unigram_alpha": 1.0,
|
| 521 |
+
"categorical_wrong_basin_shared_prob": 0.0,
|
| 522 |
+
"categorical_wrong_unigram_shared_prob": 0.0,
|
| 523 |
+
"mask_mixture_original_prob": 0.0,
|
| 524 |
+
"mask_mixture_lowk_prob": 0.0,
|
| 525 |
+
"mask_mixture_lowcorrupt_prob": 0.0,
|
| 526 |
+
"mask_mixture_block_prob": 0.0,
|
| 527 |
+
"mask_mixture_all_prob": 1.0,
|
| 528 |
+
"mask_mixture_lowk_clean_tokens": "0",
|
| 529 |
+
"mask_mixture_lowcorrupt_tokens": "1,2,4,8,16,32,64",
|
| 530 |
+
"mask_mixture_block_tokens": "64,128",
|
| 531 |
+
"simplex_bridge_sampler": "dirichlet",
|
| 532 |
+
"logistic_normal_sigma_min": 0.1,
|
| 533 |
+
"logistic_normal_sigma_max": 1.0,
|
| 534 |
+
"logistic_normal_tau_min": 1.0,
|
| 535 |
+
"logistic_normal_tau_max": 1.0,
|
| 536 |
+
"torch_compile": false,
|
| 537 |
+
"compile_mode": "max-autotune",
|
| 538 |
+
"state_format": "prob",
|
| 539 |
+
"meanflow_weight": 0.0,
|
| 540 |
+
"rollout_train_prob": 0.0,
|
| 541 |
+
"rollout_train_steps": 1,
|
| 542 |
+
"rollout_train_infer_steps": 64,
|
| 543 |
+
"rollout_train_temp": 1.45,
|
| 544 |
+
"rollout_train_max_gamma": 1.0,
|
| 545 |
+
"rollout_train_corrupt_only": true,
|
| 546 |
+
"rollout_train_samplewise": false,
|
| 547 |
+
"rollout_train_compute_always": false,
|
| 548 |
+
"bridge_noise_init": "logistic_normal",
|
| 549 |
+
"noise_sigma": -1.0,
|
| 550 |
+
"allow_tf32": true,
|
| 551 |
+
"activation_checkpointing": false,
|
| 552 |
+
"activation_checkpoint_interval": 1,
|
| 553 |
+
"activation_checkpoint_scope": "block",
|
| 554 |
+
"ddp_static_graph": false,
|
| 555 |
+
"ddp_gradient_as_bucket_view": true,
|
| 556 |
+
"blocking_data_transfer": false,
|
| 557 |
+
"dataloader_prefetch_factor": 4,
|
| 558 |
+
"full_train_stats": false,
|
| 559 |
+
"tokenized_hf": false,
|
| 560 |
+
"tokenized_pad_token": "pad",
|
| 561 |
+
"elf_conditional_hf": false,
|
| 562 |
+
"record_pad_truncate": false,
|
| 563 |
+
"record_add_eos": false,
|
| 564 |
+
"record_add_special_tokens": false,
|
| 565 |
+
"record_pad_token": "pad",
|
| 566 |
+
"record_shuffle_buffer": 10000,
|
| 567 |
+
"wrap": true,
|
| 568 |
+
"wrap_mode": "stream",
|
| 569 |
+
"wrap_record_buffer_size": 200,
|
| 570 |
+
"owt_cached_chunks": true,
|
| 571 |
+
"owt_chunk_cache_dir": "/e2e-data/evad-tech-vla/wanghan58/data/small_benchmarks/langflow_2604_11748/openwebtext_lta_cached_chunks/gpt2_len256_train8_compact_overfit",
|
| 572 |
+
"owt_chunk_cache_rebuild": false,
|
| 573 |
+
"owt_chunk_cache_write_batch": 4096,
|
| 574 |
+
"owt_exact_repeat_per_chunk": 64,
|
| 575 |
+
"online_chunk_shuffle": false,
|
| 576 |
+
"online_chunk_shuffle_buffer": 10000,
|
| 577 |
+
"openwebtext_split": "train_minus_100k",
|
| 578 |
+
"detokenizer": "auto",
|
| 579 |
+
"resolved_detokenizer": null,
|
| 580 |
+
"num_workers": 0,
|
| 581 |
+
"latest_every": 1000,
|
| 582 |
+
"resume_path": "runs/train8_noisegeo_len256_allcorrupt_highC64_4096_20260517_163805/latest.pt"
|
| 583 |
+
}
|
| 584 |
+
step=2100 epoch=2100/3000 epoch_step=1/1 micro_steps=2100 elapsed=4.4s lr=2.000000e-03 loss=0.1151 loss_recon=0.1151 loss_meanflow=0.0000 mean_model_t=0.2082 mean_corrupt_t=0.2082 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.9627 corrupt_frac=1.0000 acc_corrupt=0.9627 loss_corrupt=0.1151 wrong_frac=0.7917 init_acc_corrupt=0.2067 acc_corrupt_t_0p0_0p2=0.9332 corrupt_frac_t_0p0_0p2=0.5588 acc_corrupt_t_0p2_0p4=0.9999 corrupt_frac_t_0p2_0p4=0.3579 acc_corrupt_t_0p4_0p6=0.9999 corrupt_frac_t_0p4_0p6=0.0752 acc_corrupt_t_0p6_0p8=0.9995 corrupt_frac_t_0p6_0p8=0.0125 out_w_norm=12.1280 out_g_norm=0.1673 acc_corrupt_t_0p8_1p0=1.0000 corrupt_frac_t_0p8_1p0=0.0078 loss_all=0.0847 init_gold_top10=0.2053 init_gold_top100=0.2800
|
| 585 |
+
step=2200 epoch=2200/3000 epoch_step=1/1 micro_steps=2200 elapsed=3.7s lr=2.000000e-03 loss=0.1195 loss_recon=0.1195 loss_meanflow=0.0000 mean_model_t=0.2081 mean_corrupt_t=0.2081 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.9618 corrupt_frac=1.0000 acc_corrupt=0.9618 loss_corrupt=0.1195 wrong_frac=0.7925 init_acc_corrupt=0.2059 acc_corrupt_t_0p0_0p2=0.9315 corrupt_frac_t_0p0_0p2=0.5577 acc_corrupt_t_0p2_0p4=1.0000 corrupt_frac_t_0p2_0p4=0.3608 acc_corrupt_t_0p4_0p6=1.0000 corrupt_frac_t_0p4_0p6=0.0734 acc_corrupt_t_0p6_0p8=1.0000 corrupt_frac_t_0p6_0p8=0.0119 out_w_norm=12.1464 out_g_norm=0.1466 acc_corrupt_t_0p8_1p0=1.0000 corrupt_frac_t_0p8_1p0=0.0078 loss_all=0.1061 init_gold_top10=0.2043 init_gold_top100=0.2783
|
| 586 |
+
step=2300 epoch=2300/3000 epoch_step=1/1 micro_steps=2300 elapsed=3.7s lr=2.000000e-03 loss=0.1005 loss_recon=0.1005 loss_meanflow=0.0000 mean_model_t=0.2117 mean_corrupt_t=0.2117 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.9673 corrupt_frac=1.0000 acc_corrupt=0.9673 loss_corrupt=0.1005 wrong_frac=0.7884 init_acc_corrupt=0.2102 acc_corrupt_t_0p0_0p2=0.9405 corrupt_frac_t_0p0_0p2=0.5484 acc_corrupt_t_0p2_0p4=1.0000 corrupt_frac_t_0p2_0p4=0.3611 acc_corrupt_t_0p4_0p6=1.0000 corrupt_frac_t_0p4_0p6=0.0803 acc_corrupt_t_0p6_0p8=0.9998 corrupt_frac_t_0p6_0p8=0.0139 out_w_norm=12.1477 out_g_norm=0.1316 acc_corrupt_t_0p8_1p0=1.0000 corrupt_frac_t_0p8_1p0=0.0078 loss_all=0.0869 init_gold_top10=0.2224 init_gold_top100=0.2981
|
| 587 |
+
step=2400 epoch=2400/3000 epoch_step=1/1 micro_steps=2400 elapsed=3.7s lr=2.000000e-03 loss=0.1115 loss_recon=0.1115 loss_meanflow=0.0000 mean_model_t=0.2073 mean_corrupt_t=0.2073 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.9634 corrupt_frac=1.0000 acc_corrupt=0.9634 loss_corrupt=0.1115 wrong_frac=0.7927 init_acc_corrupt=0.2056 acc_corrupt_t_0p0_0p2=0.9349 corrupt_frac_t_0p0_0p2=0.5613 acc_corrupt_t_0p2_0p4=1.0000 corrupt_frac_t_0p2_0p4=0.3571 acc_corrupt_t_0p4_0p6=0.9999 corrupt_frac_t_0p4_0p6=0.0727 acc_corrupt_t_0p6_0p8=0.9999 corrupt_frac_t_0p6_0p8=0.0126 out_w_norm=12.1573 out_g_norm=0.1343 loss_all=0.0942 init_gold_top10=0.2197 init_gold_top100=0.2939
|
| 588 |
+
step=2500 epoch=2500/3000 epoch_step=1/1 micro_steps=2500 elapsed=3.7s lr=2.000000e-03 loss=0.1057 loss_recon=0.1057 loss_meanflow=0.0000 mean_model_t=0.2097 mean_corrupt_t=0.2097 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.9654 corrupt_frac=1.0000 acc_corrupt=0.9654 loss_corrupt=0.1057 wrong_frac=0.7904 init_acc_corrupt=0.2080 acc_corrupt_t_0p0_0p2=0.9376 corrupt_frac_t_0p0_0p2=0.5540 acc_corrupt_t_0p2_0p4=1.0000 corrupt_frac_t_0p2_0p4=0.3605 acc_corrupt_t_0p4_0p6=0.9999 corrupt_frac_t_0p4_0p6=0.0745 out_w_norm=12.1754 out_g_norm=0.1418 acc_corrupt_t_0p6_0p8=0.9996 corrupt_frac_t_0p6_0p8=0.0139 acc_corrupt_t_0p8_1p0=1.0000 corrupt_frac_t_0p8_1p0=0.0078 loss_all=0.1002 init_gold_top10=0.2225 init_gold_top100=0.2952
|
| 589 |
+
step=2600 epoch=2600/3000 epoch_step=1/1 micro_steps=2600 elapsed=3.7s lr=2.000000e-03 loss=0.0998 loss_recon=0.0998 loss_meanflow=0.0000 mean_model_t=0.2089 mean_corrupt_t=0.2089 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.9678 corrupt_frac=1.0000 acc_corrupt=0.9678 loss_corrupt=0.0998 wrong_frac=0.7910 init_acc_corrupt=0.2074 acc_corrupt_t_0p0_0p2=0.9426 corrupt_frac_t_0p0_0p2=0.5598 acc_corrupt_t_0p2_0p4=1.0000 corrupt_frac_t_0p2_0p4=0.3568 acc_corrupt_t_0p4_0p6=1.0000 corrupt_frac_t_0p4_0p6=0.0750 acc_corrupt_t_0p6_0p8=1.0000 corrupt_frac_t_0p6_0p8=0.0120 out_w_norm=12.1861 out_g_norm=0.1271 acc_corrupt_t_0p8_1p0=0.9987 corrupt_frac_t_0p8_1p0=0.0094 loss_all=0.0707 init_gold_top10=0.2072 init_gold_top100=0.2815
|
| 590 |
+
step=2700 epoch=2700/3000 epoch_step=1/1 micro_steps=2700 elapsed=3.7s lr=2.000000e-03 loss=0.0970 loss_recon=0.0970 loss_meanflow=0.0000 mean_model_t=0.2095 mean_corrupt_t=0.2095 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.9686 corrupt_frac=1.0000 acc_corrupt=0.9686 loss_corrupt=0.0970 wrong_frac=0.7907 init_acc_corrupt=0.2077 acc_corrupt_t_0p0_0p2=0.9435 corrupt_frac_t_0p0_0p2=0.5552 acc_corrupt_t_0p2_0p4=1.0000 corrupt_frac_t_0p2_0p4=0.3584 acc_corrupt_t_0p4_0p6=1.0000 corrupt_frac_t_0p4_0p6=0.0764 out_w_norm=12.1979 out_g_norm=0.1102 acc_corrupt_t_0p6_0p8=0.9999 corrupt_frac_t_0p6_0p8=0.0132 acc_corrupt_t_0p8_1p0=1.0000 corrupt_frac_t_0p8_1p0=0.0078 loss_all=0.0570 init_gold_top10=0.2117 init_gold_top100=0.2832
|
| 591 |
+
step=2800 epoch=2800/3000 epoch_step=1/1 micro_steps=2800 elapsed=3.7s lr=2.000000e-03 loss=0.0949 loss_recon=0.0949 loss_meanflow=0.0000 mean_model_t=0.2103 mean_corrupt_t=0.2103 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.9698 corrupt_frac=1.0000 acc_corrupt=0.9698 loss_corrupt=0.0949 wrong_frac=0.7897 init_acc_corrupt=0.2086 acc_corrupt_t_0p0_0p2=0.9453 corrupt_frac_t_0p0_0p2=0.5525 acc_corrupt_t_0p2_0p4=0.9999 corrupt_frac_t_0p2_0p4=0.3592 acc_corrupt_t_0p4_0p6=0.9999 corrupt_frac_t_0p4_0p6=0.0800 acc_corrupt_t_0p6_0p8=0.9997 corrupt_frac_t_0p6_0p8=0.0141 out_w_norm=12.2015 out_g_norm=0.1196 acc_corrupt_t_0p8_1p0=1.0000 corrupt_frac_t_0p8_1p0=0.0078 loss_all=0.0685 init_gold_top10=0.2208 init_gold_top100=0.2931
|
| 592 |
+
step=2900 epoch=2900/3000 epoch_step=1/1 micro_steps=2900 elapsed=3.7s lr=2.000000e-03 loss=0.0920 loss_recon=0.0920 loss_meanflow=0.0000 mean_model_t=0.2094 mean_corrupt_t=0.2094 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.9697 corrupt_frac=1.0000 acc_corrupt=0.9697 loss_corrupt=0.0920 wrong_frac=0.7906 init_acc_corrupt=0.2078 acc_corrupt_t_0p0_0p2=0.9458 corrupt_frac_t_0p0_0p2=0.5584 acc_corrupt_t_0p2_0p4=1.0000 corrupt_frac_t_0p2_0p4=0.3530 acc_corrupt_t_0p4_0p6=1.0000 corrupt_frac_t_0p4_0p6=0.0801 acc_corrupt_t_0p6_0p8=0.9999 corrupt_frac_t_0p6_0p8=0.0125 out_w_norm=12.2185 out_g_norm=0.1084 acc_corrupt_t_0p8_1p0=0.9987 corrupt_frac_t_0p8_1p0=0.0078 loss_all=0.0760 init_gold_top10=0.2190 init_gold_top100=0.2921
|
| 593 |
+
step=3000 epoch=3000/3000 epoch_step=1/1 micro_steps=3000 elapsed=3.7s lr=2.000000e-03 loss=0.0899 loss_recon=0.0899 loss_meanflow=0.0000 mean_model_t=0.2094 mean_corrupt_t=0.2094 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.9708 corrupt_frac=1.0000 acc_corrupt=0.9708 loss_corrupt=0.0899 wrong_frac=0.7908 init_acc_corrupt=0.2077 acc_corrupt_t_0p0_0p2=0.9476 corrupt_frac_t_0p0_0p2=0.5561 acc_corrupt_t_0p2_0p4=1.0000 corrupt_frac_t_0p2_0p4=0.3595 acc_corrupt_t_0p4_0p6=1.0000 corrupt_frac_t_0p4_0p6=0.0759 acc_corrupt_t_0p6_0p8=0.9997 corrupt_frac_t_0p6_0p8=0.0129 out_w_norm=12.2416 out_g_norm=0.1068 acc_corrupt_t_0p8_1p0=1.0000 corrupt_frac_t_0p8_1p0=0.0078 loss_all=0.0347 init_gold_top10=0.2187 init_gold_top100=0.2915
|
| 594 |
+
NCCL version 2.25.1+cuda12.8
|
| 595 |
+
resumed_from=runs/train8_noisegeo_len256_allcorrupt_highC64_4096_20260517_163805/latest.pt start_step=3001
|
| 596 |
+
{
|
| 597 |
+
"device": "cuda:0",
|
| 598 |
+
"rank": 0,
|
| 599 |
+
"world_size": 4,
|
| 600 |
+
"samples": "owt_cached_chunks:8",
|
| 601 |
+
"vocab_size": 969,
|
| 602 |
+
"tokenizer_vocab_size": 50257,
|
| 603 |
+
"save_dir": "runs/train8_noisegeo_len256_allcorrupt_highC64_4096_20260517_163805",
|
| 604 |
+
"batch_size": 128,
|
| 605 |
+
"grad_accum": 1,
|
| 606 |
+
"effective_batch_size": 512,
|
| 607 |
+
"global_batch_size": 512,
|
| 608 |
+
"lr_schedule": "constant_warmup",
|
| 609 |
+
"optimizer": "muon",
|
| 610 |
+
"epochs": 0.0,
|
| 611 |
+
"steps_per_epoch": 1,
|
| 612 |
+
"total_steps": 4000,
|
| 613 |
+
"warmup_steps": 10,
|
| 614 |
+
"warmup_epochs": -1.0,
|
| 615 |
+
"min_lr": 0.0,
|
| 616 |
+
"weight_decay": 0.1,
|
| 617 |
+
"output_weight_decay": -1.0,
|
| 618 |
+
"adamw_param_groups": "nanogpt",
|
| 619 |
+
"adam_beta1": 0.9,
|
| 620 |
+
"adam_beta2": 0.95,
|
| 621 |
+
"adam_eps": 1e-08,
|
| 622 |
+
"muon_impl": "legacy",
|
| 623 |
+
"muon_momentum": 0.95,
|
| 624 |
+
"muon_ns_steps": 5,
|
| 625 |
+
"muon_update_scale": 1.0,
|
| 626 |
+
"muon_nesterov": false,
|
| 627 |
+
"muon_width_scale": false,
|
| 628 |
+
"muon_grouping": "legacy_dim_ge_2",
|
| 629 |
+
"muon_param_count": 1965440,
|
| 630 |
+
"muon_adam_param_count": 8192,
|
| 631 |
+
"muon_param_names": [
|
| 632 |
+
"vocab_embed.embedding",
|
| 633 |
+
"sigma_map.net.0.weight",
|
| 634 |
+
"sigma_map.net.2.weight",
|
| 635 |
+
"blocks.0.attn_qkv.weight",
|
| 636 |
+
"blocks.0.attn_out.weight",
|
| 637 |
+
"blocks.0.mlp.0.weight",
|
| 638 |
+
"blocks.0.mlp.2.weight",
|
| 639 |
+
"blocks.0.adaLN_modulation.weight",
|
| 640 |
+
"blocks.1.attn_qkv.weight",
|
| 641 |
+
"blocks.1.attn_out.weight",
|
| 642 |
+
"blocks.1.mlp.0.weight",
|
| 643 |
+
"blocks.1.mlp.2.weight",
|
| 644 |
+
"blocks.1.adaLN_modulation.weight",
|
| 645 |
+
"blocks.2.attn_qkv.weight",
|
| 646 |
+
"blocks.2.attn_out.weight",
|
| 647 |
+
"blocks.2.mlp.0.weight",
|
| 648 |
+
"blocks.2.mlp.2.weight",
|
| 649 |
+
"blocks.2.adaLN_modulation.weight",
|
| 650 |
+
"output_layer.linear.weight",
|
| 651 |
+
"output_layer.adaLN_modulation.weight"
|
| 652 |
+
],
|
| 653 |
+
"muon_adam_param_names": [
|
| 654 |
+
"sigma_map.net.0.bias",
|
| 655 |
+
"sigma_map.net.2.bias",
|
| 656 |
+
"blocks.0.norm1.weight",
|
| 657 |
+
"blocks.0.norm2.weight",
|
| 658 |
+
"blocks.0.mlp.0.bias",
|
| 659 |
+
"blocks.0.mlp.2.bias",
|
| 660 |
+
"blocks.0.adaLN_modulation.bias",
|
| 661 |
+
"blocks.1.norm1.weight",
|
| 662 |
+
"blocks.1.norm2.weight",
|
| 663 |
+
"blocks.1.mlp.0.bias",
|
| 664 |
+
"blocks.1.mlp.2.bias",
|
| 665 |
+
"blocks.1.adaLN_modulation.bias",
|
| 666 |
+
"blocks.2.norm1.weight",
|
| 667 |
+
"blocks.2.norm2.weight",
|
| 668 |
+
"blocks.2.mlp.0.bias",
|
| 669 |
+
"blocks.2.mlp.2.bias",
|
| 670 |
+
"blocks.2.adaLN_modulation.bias",
|
| 671 |
+
"output_layer.norm_final.weight",
|
| 672 |
+
"output_layer.adaLN_modulation.bias"
|
| 673 |
+
],
|
| 674 |
+
"muon_effective_nesterov": false,
|
| 675 |
+
"muon_effective_width_scale": false,
|
| 676 |
+
"muon_effective_weight_decay": 0.1,
|
| 677 |
+
"muon_adam_fallback_nesterov": false,
|
| 678 |
+
"muon_adam_fallback_weight_decay": 0.1,
|
| 679 |
+
"ema_decay": 0.9999,
|
| 680 |
+
"ema_start_step": 0,
|
| 681 |
+
"model_type": "ddit",
|
| 682 |
+
"ddit_mlp_type": "gelu",
|
| 683 |
+
"elf_num_time_tokens": 4,
|
| 684 |
+
"elf_num_model_mode_tokens": 0,
|
| 685 |
+
"qk_norm": true,
|
| 686 |
+
"output_bias": false,
|
| 687 |
+
"output_init_std": -1.0,
|
| 688 |
+
"norm_type": "rmsnorm",
|
| 689 |
+
"target_loss": "hard_ce",
|
| 690 |
+
"linear_soft_target_power": 1.0,
|
| 691 |
+
"linear_soft_target_min_conf": 0.0,
|
| 692 |
+
"linear_soft_target_max_conf": 1.0,
|
| 693 |
+
"t_sampling_mode": "logit_normal",
|
| 694 |
+
"t_sampling_power": 1.0,
|
| 695 |
+
"t_sampling_eps": 0.0001,
|
| 696 |
+
"t_sampling_logit_mean": -1.5,
|
| 697 |
+
"t_sampling_logit_std": 0.8,
|
| 698 |
+
"dual_t": true,
|
| 699 |
+
"corrupt_t_mode": "same",
|
| 700 |
+
"corrupt_min_t": 0.0,
|
| 701 |
+
"corrupt_max_t": 1.0,
|
| 702 |
+
"prefix_block_prob": 0.0,
|
| 703 |
+
"prefix_block_len": 128,
|
| 704 |
+
"mask_ratio_floor_schedule": "none",
|
| 705 |
+
"dirichlet_endpoint_mode": "categorical_dual_t",
|
| 706 |
+
"dirichlet_semantic_t_mode": "same",
|
| 707 |
+
"dirichlet_semantic_t_value": 0.0,
|
| 708 |
+
"dirichlet_semantic_t_curve": "linear",
|
| 709 |
+
"dirichlet_semantic_t_power": 1.0,
|
| 710 |
+
"endpoint_sequence_random_prob_alpha": 0.0,
|
| 711 |
+
"categorical_wrong_from_full_vocab": true,
|
| 712 |
+
"categorical_wrong_from_batch_valid_tokens": false,
|
| 713 |
+
"categorical_wrong_basin_token_ids": "",
|
| 714 |
+
"categorical_wrong_basin_prob": 0.0,
|
| 715 |
+
"categorical_wrong_unigram_prob": 0.0,
|
| 716 |
+
"categorical_wrong_uniform_prob": 0.0,
|
| 717 |
+
"categorical_wrong_corpus_unigram_path": "",
|
| 718 |
+
"categorical_wrong_corpus_unigram_alpha": 1.0,
|
| 719 |
+
"categorical_wrong_basin_shared_prob": 0.0,
|
| 720 |
+
"categorical_wrong_unigram_shared_prob": 0.0,
|
| 721 |
+
"mask_mixture_original_prob": 0.0,
|
| 722 |
+
"mask_mixture_lowk_prob": 0.0,
|
| 723 |
+
"mask_mixture_lowcorrupt_prob": 0.0,
|
| 724 |
+
"mask_mixture_block_prob": 0.0,
|
| 725 |
+
"mask_mixture_all_prob": 1.0,
|
| 726 |
+
"mask_mixture_lowk_clean_tokens": "0",
|
| 727 |
+
"mask_mixture_lowcorrupt_tokens": "1,2,4,8,16,32,64",
|
| 728 |
+
"mask_mixture_block_tokens": "64,128",
|
| 729 |
+
"simplex_bridge_sampler": "dirichlet",
|
| 730 |
+
"logistic_normal_sigma_min": 0.1,
|
| 731 |
+
"logistic_normal_sigma_max": 1.0,
|
| 732 |
+
"logistic_normal_tau_min": 1.0,
|
| 733 |
+
"logistic_normal_tau_max": 1.0,
|
| 734 |
+
"torch_compile": false,
|
| 735 |
+
"compile_mode": "max-autotune",
|
| 736 |
+
"state_format": "prob",
|
| 737 |
+
"meanflow_weight": 0.0,
|
| 738 |
+
"rollout_train_prob": 0.0,
|
| 739 |
+
"rollout_train_steps": 1,
|
| 740 |
+
"rollout_train_infer_steps": 64,
|
| 741 |
+
"rollout_train_temp": 1.45,
|
| 742 |
+
"rollout_train_max_gamma": 1.0,
|
| 743 |
+
"rollout_train_corrupt_only": true,
|
| 744 |
+
"rollout_train_samplewise": false,
|
| 745 |
+
"rollout_train_compute_always": false,
|
| 746 |
+
"bridge_noise_init": "logistic_normal",
|
| 747 |
+
"noise_sigma": -1.0,
|
| 748 |
+
"allow_tf32": true,
|
| 749 |
+
"activation_checkpointing": false,
|
| 750 |
+
"activation_checkpoint_interval": 1,
|
| 751 |
+
"activation_checkpoint_scope": "block",
|
| 752 |
+
"ddp_static_graph": false,
|
| 753 |
+
"ddp_gradient_as_bucket_view": true,
|
| 754 |
+
"blocking_data_transfer": false,
|
| 755 |
+
"dataloader_prefetch_factor": 4,
|
| 756 |
+
"full_train_stats": false,
|
| 757 |
+
"tokenized_hf": false,
|
| 758 |
+
"tokenized_pad_token": "pad",
|
| 759 |
+
"elf_conditional_hf": false,
|
| 760 |
+
"record_pad_truncate": false,
|
| 761 |
+
"record_add_eos": false,
|
| 762 |
+
"record_add_special_tokens": false,
|
| 763 |
+
"record_pad_token": "pad",
|
| 764 |
+
"record_shuffle_buffer": 10000,
|
| 765 |
+
"wrap": true,
|
| 766 |
+
"wrap_mode": "stream",
|
| 767 |
+
"wrap_record_buffer_size": 200,
|
| 768 |
+
"owt_cached_chunks": true,
|
| 769 |
+
"owt_chunk_cache_dir": "/e2e-data/evad-tech-vla/wanghan58/data/small_benchmarks/langflow_2604_11748/openwebtext_lta_cached_chunks/gpt2_len256_train8_compact_overfit",
|
| 770 |
+
"owt_chunk_cache_rebuild": false,
|
| 771 |
+
"owt_chunk_cache_write_batch": 4096,
|
| 772 |
+
"owt_exact_repeat_per_chunk": 64,
|
| 773 |
+
"online_chunk_shuffle": false,
|
| 774 |
+
"online_chunk_shuffle_buffer": 10000,
|
| 775 |
+
"openwebtext_split": "train_minus_100k",
|
| 776 |
+
"detokenizer": "auto",
|
| 777 |
+
"resolved_detokenizer": null,
|
| 778 |
+
"num_workers": 0,
|
| 779 |
+
"latest_every": 1000,
|
| 780 |
+
"resume_path": "runs/train8_noisegeo_len256_allcorrupt_highC64_4096_20260517_163805/latest.pt"
|
| 781 |
+
}
|
| 782 |
+
step=3100 epoch=3100/4000 epoch_step=1/1 micro_steps=3100 elapsed=4.4s lr=2.000000e-03 loss=0.0858 loss_recon=0.0858 loss_meanflow=0.0000 mean_model_t=0.2082 mean_corrupt_t=0.2082 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.9728 corrupt_frac=1.0000 acc_corrupt=0.9728 loss_corrupt=0.0858 wrong_frac=0.7917 init_acc_corrupt=0.2067 acc_corrupt_t_0p0_0p2=0.9514 corrupt_frac_t_0p0_0p2=0.5588 acc_corrupt_t_0p2_0p4=0.9999 corrupt_frac_t_0p2_0p4=0.3579 acc_corrupt_t_0p4_0p6=0.9999 corrupt_frac_t_0p4_0p6=0.0752 acc_corrupt_t_0p6_0p8=0.9996 corrupt_frac_t_0p6_0p8=0.0125 out_w_norm=12.2445 out_g_norm=0.1039 acc_corrupt_t_0p8_1p0=0.9980 corrupt_frac_t_0p8_1p0=0.0078 loss_all=0.0670 init_gold_top10=0.2053 init_gold_top100=0.2800
|
| 783 |
+
step=3200 epoch=3200/4000 epoch_step=1/1 micro_steps=3200 elapsed=3.7s lr=2.000000e-03 loss=0.0896 loss_recon=0.0896 loss_meanflow=0.0000 mean_model_t=0.2081 mean_corrupt_t=0.2081 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.9710 corrupt_frac=1.0000 acc_corrupt=0.9710 loss_corrupt=0.0896 wrong_frac=0.7925 init_acc_corrupt=0.2059 acc_corrupt_t_0p0_0p2=0.9480 corrupt_frac_t_0p0_0p2=0.5577 acc_corrupt_t_0p2_0p4=1.0000 corrupt_frac_t_0p2_0p4=0.3608 acc_corrupt_t_0p4_0p6=0.9999 corrupt_frac_t_0p4_0p6=0.0734 acc_corrupt_t_0p6_0p8=0.9998 corrupt_frac_t_0p6_0p8=0.0119 out_w_norm=12.2565 out_g_norm=0.1005 acc_corrupt_t_0p8_1p0=0.9980 corrupt_frac_t_0p8_1p0=0.0078 loss_all=0.0698 init_gold_top10=0.2043 init_gold_top100=0.2783
|
| 784 |
+
step=3300 epoch=3300/4000 epoch_step=1/1 micro_steps=3300 elapsed=3.7s lr=2.000000e-03 loss=0.0751 loss_recon=0.0751 loss_meanflow=0.0000 mean_model_t=0.2117 mean_corrupt_t=0.2117 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.9757 corrupt_frac=1.0000 acc_corrupt=0.9757 loss_corrupt=0.0751 wrong_frac=0.7884 init_acc_corrupt=0.2102 acc_corrupt_t_0p0_0p2=0.9558 corrupt_frac_t_0p0_0p2=0.5484 acc_corrupt_t_0p2_0p4=1.0000 corrupt_frac_t_0p2_0p4=0.3611 acc_corrupt_t_0p4_0p6=1.0000 corrupt_frac_t_0p4_0p6=0.0803 acc_corrupt_t_0p6_0p8=0.9999 corrupt_frac_t_0p6_0p8=0.0139 out_w_norm=12.2656 out_g_norm=0.0997 acc_corrupt_t_0p8_1p0=1.0000 corrupt_frac_t_0p8_1p0=0.0078 loss_all=0.0877 init_gold_top10=0.2224 init_gold_top100=0.2981
|
| 785 |
+
step=3400 epoch=3400/4000 epoch_step=1/1 micro_steps=3400 elapsed=3.7s lr=2.000000e-03 loss=0.0874 loss_recon=0.0874 loss_meanflow=0.0000 mean_model_t=0.2073 mean_corrupt_t=0.2073 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.9713 corrupt_frac=1.0000 acc_corrupt=0.9713 loss_corrupt=0.0874 wrong_frac=0.7927 init_acc_corrupt=0.2056 acc_corrupt_t_0p0_0p2=0.9490 corrupt_frac_t_0p0_0p2=0.5613 acc_corrupt_t_0p2_0p4=1.0000 corrupt_frac_t_0p2_0p4=0.3571 acc_corrupt_t_0p4_0p6=1.0000 corrupt_frac_t_0p4_0p6=0.0727 acc_corrupt_t_0p6_0p8=0.9999 corrupt_frac_t_0p6_0p8=0.0126 out_w_norm=12.2718 out_g_norm=0.0918 loss_all=0.0652 init_gold_top10=0.2197 init_gold_top100=0.2939
|
| 786 |
+
step=3500 epoch=3500/4000 epoch_step=1/1 micro_steps=3500 elapsed=3.7s lr=2.000000e-03 loss=0.0786 loss_recon=0.0786 loss_meanflow=0.0000 mean_model_t=0.2097 mean_corrupt_t=0.2097 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.9741 corrupt_frac=1.0000 acc_corrupt=0.9741 loss_corrupt=0.0786 wrong_frac=0.7904 init_acc_corrupt=0.2080 acc_corrupt_t_0p0_0p2=0.9533 corrupt_frac_t_0p0_0p2=0.5540 acc_corrupt_t_0p2_0p4=1.0000 corrupt_frac_t_0p2_0p4=0.3605 acc_corrupt_t_0p4_0p6=0.9999 corrupt_frac_t_0p4_0p6=0.0745 out_w_norm=12.2720 out_g_norm=0.0895 acc_corrupt_t_0p6_0p8=0.9999 corrupt_frac_t_0p6_0p8=0.0139 acc_corrupt_t_0p8_1p0=0.9987 corrupt_frac_t_0p8_1p0=0.0078 loss_all=0.0935 init_gold_top10=0.2225 init_gold_top100=0.2952
|
| 787 |
+
step=3600 epoch=3600/4000 epoch_step=1/1 micro_steps=3600 elapsed=3.7s lr=2.000000e-03 loss=0.0763 loss_recon=0.0763 loss_meanflow=0.0000 mean_model_t=0.2089 mean_corrupt_t=0.2089 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.9753 corrupt_frac=1.0000 acc_corrupt=0.9753 loss_corrupt=0.0763 wrong_frac=0.7910 init_acc_corrupt=0.2074 acc_corrupt_t_0p0_0p2=0.9560 corrupt_frac_t_0p0_0p2=0.5598 acc_corrupt_t_0p2_0p4=1.0000 corrupt_frac_t_0p2_0p4=0.3568 acc_corrupt_t_0p4_0p6=1.0000 corrupt_frac_t_0p4_0p6=0.0750 acc_corrupt_t_0p6_0p8=1.0000 corrupt_frac_t_0p6_0p8=0.0120 out_w_norm=12.2727 out_g_norm=0.0911 acc_corrupt_t_0p8_1p0=1.0000 corrupt_frac_t_0p8_1p0=0.0094 loss_all=0.0643 init_gold_top10=0.2072 init_gold_top100=0.2815
|
| 788 |
+
step=3700 epoch=3700/4000 epoch_step=1/1 micro_steps=3700 elapsed=3.7s lr=2.000000e-03 loss=0.0781 loss_recon=0.0781 loss_meanflow=0.0000 mean_model_t=0.2095 mean_corrupt_t=0.2095 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.9748 corrupt_frac=1.0000 acc_corrupt=0.9748 loss_corrupt=0.0781 wrong_frac=0.7907 init_acc_corrupt=0.2077 acc_corrupt_t_0p0_0p2=0.9546 corrupt_frac_t_0p0_0p2=0.5552 acc_corrupt_t_0p2_0p4=1.0000 corrupt_frac_t_0p2_0p4=0.3584 acc_corrupt_t_0p4_0p6=1.0000 corrupt_frac_t_0p4_0p6=0.0764 out_w_norm=12.2705 out_g_norm=0.0846 acc_corrupt_t_0p6_0p8=1.0000 corrupt_frac_t_0p6_0p8=0.0132 acc_corrupt_t_0p8_1p0=1.0000 corrupt_frac_t_0p8_1p0=0.0078 loss_all=0.0330 init_gold_top10=0.2117 init_gold_top100=0.2832
|
| 789 |
+
step=3800 epoch=3800/4000 epoch_step=1/1 micro_steps=3800 elapsed=3.7s lr=2.000000e-03 loss=0.0692 loss_recon=0.0692 loss_meanflow=0.0000 mean_model_t=0.2103 mean_corrupt_t=0.2103 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.9780 corrupt_frac=1.0000 acc_corrupt=0.9780 loss_corrupt=0.0692 wrong_frac=0.7897 init_acc_corrupt=0.2086 acc_corrupt_t_0p0_0p2=0.9602 corrupt_frac_t_0p0_0p2=0.5525 acc_corrupt_t_0p2_0p4=1.0000 corrupt_frac_t_0p2_0p4=0.3592 acc_corrupt_t_0p4_0p6=1.0000 corrupt_frac_t_0p4_0p6=0.0800 acc_corrupt_t_0p6_0p8=1.0000 corrupt_frac_t_0p6_0p8=0.0141 out_w_norm=12.2632 out_g_norm=0.0868 acc_corrupt_t_0p8_1p0=1.0000 corrupt_frac_t_0p8_1p0=0.0078 loss_all=0.0457 init_gold_top10=0.2208 init_gold_top100=0.2931
|
| 790 |
+
step=3900 epoch=3900/4000 epoch_step=1/1 micro_steps=3900 elapsed=3.7s lr=2.000000e-03 loss=0.0706 loss_recon=0.0706 loss_meanflow=0.0000 mean_model_t=0.2094 mean_corrupt_t=0.2094 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.9766 corrupt_frac=1.0000 acc_corrupt=0.9766 loss_corrupt=0.0706 wrong_frac=0.7906 init_acc_corrupt=0.2078 acc_corrupt_t_0p0_0p2=0.9581 corrupt_frac_t_0p0_0p2=0.5584 acc_corrupt_t_0p2_0p4=1.0000 corrupt_frac_t_0p2_0p4=0.3530 acc_corrupt_t_0p4_0p6=1.0000 corrupt_frac_t_0p4_0p6=0.0801 acc_corrupt_t_0p6_0p8=1.0000 corrupt_frac_t_0p6_0p8=0.0125 out_w_norm=12.2531 out_g_norm=0.0820 acc_corrupt_t_0p8_1p0=1.0000 corrupt_frac_t_0p8_1p0=0.0078 loss_all=0.0530 init_gold_top10=0.2190 init_gold_top100=0.2921
|
| 791 |
+
step=4000 epoch=4000/4000 epoch_step=1/1 micro_steps=4000 elapsed=3.7s lr=2.000000e-03 loss=0.0658 loss_recon=0.0658 loss_meanflow=0.0000 mean_model_t=0.2094 mean_corrupt_t=0.2094 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.9795 corrupt_frac=1.0000 acc_corrupt=0.9795 loss_corrupt=0.0658 wrong_frac=0.7908 init_acc_corrupt=0.2077 acc_corrupt_t_0p0_0p2=0.9631 corrupt_frac_t_0p0_0p2=0.5561 acc_corrupt_t_0p2_0p4=1.0000 corrupt_frac_t_0p2_0p4=0.3595 acc_corrupt_t_0p4_0p6=0.9999 corrupt_frac_t_0p4_0p6=0.0759 acc_corrupt_t_0p6_0p8=1.0000 corrupt_frac_t_0p6_0p8=0.0129 out_w_norm=12.2590 out_g_norm=0.0768 acc_corrupt_t_0p8_1p0=1.0000 corrupt_frac_t_0p8_1p0=0.0078 loss_all=0.0183 init_gold_top10=0.2187 init_gold_top100=0.2915
|
| 792 |
+
NCCL version 2.25.1+cuda12.8
|
| 793 |
+
resumed_from=runs/train8_noisegeo_len256_allcorrupt_highC64_4096_20260517_163805/latest.pt start_step=4001
|
| 794 |
+
{
|
| 795 |
+
"device": "cuda:0",
|
| 796 |
+
"rank": 0,
|
| 797 |
+
"world_size": 4,
|
| 798 |
+
"samples": "owt_cached_chunks:8",
|
| 799 |
+
"vocab_size": 969,
|
| 800 |
+
"tokenizer_vocab_size": 50257,
|
| 801 |
+
"save_dir": "runs/train8_noisegeo_len256_allcorrupt_highC64_4096_20260517_163805",
|
| 802 |
+
"batch_size": 128,
|
| 803 |
+
"grad_accum": 1,
|
| 804 |
+
"effective_batch_size": 512,
|
| 805 |
+
"global_batch_size": 512,
|
| 806 |
+
"lr_schedule": "constant_warmup",
|
| 807 |
+
"optimizer": "muon",
|
| 808 |
+
"epochs": 0.0,
|
| 809 |
+
"steps_per_epoch": 1,
|
| 810 |
+
"total_steps": 5000,
|
| 811 |
+
"warmup_steps": 10,
|
| 812 |
+
"warmup_epochs": -1.0,
|
| 813 |
+
"min_lr": 0.0,
|
| 814 |
+
"weight_decay": 0.1,
|
| 815 |
+
"output_weight_decay": -1.0,
|
| 816 |
+
"adamw_param_groups": "nanogpt",
|
| 817 |
+
"adam_beta1": 0.9,
|
| 818 |
+
"adam_beta2": 0.95,
|
| 819 |
+
"adam_eps": 1e-08,
|
| 820 |
+
"muon_impl": "legacy",
|
| 821 |
+
"muon_momentum": 0.95,
|
| 822 |
+
"muon_ns_steps": 5,
|
| 823 |
+
"muon_update_scale": 1.0,
|
| 824 |
+
"muon_nesterov": false,
|
| 825 |
+
"muon_width_scale": false,
|
| 826 |
+
"muon_grouping": "legacy_dim_ge_2",
|
| 827 |
+
"muon_param_count": 1965440,
|
| 828 |
+
"muon_adam_param_count": 8192,
|
| 829 |
+
"muon_param_names": [
|
| 830 |
+
"vocab_embed.embedding",
|
| 831 |
+
"sigma_map.net.0.weight",
|
| 832 |
+
"sigma_map.net.2.weight",
|
| 833 |
+
"blocks.0.attn_qkv.weight",
|
| 834 |
+
"blocks.0.attn_out.weight",
|
| 835 |
+
"blocks.0.mlp.0.weight",
|
| 836 |
+
"blocks.0.mlp.2.weight",
|
| 837 |
+
"blocks.0.adaLN_modulation.weight",
|
| 838 |
+
"blocks.1.attn_qkv.weight",
|
| 839 |
+
"blocks.1.attn_out.weight",
|
| 840 |
+
"blocks.1.mlp.0.weight",
|
| 841 |
+
"blocks.1.mlp.2.weight",
|
| 842 |
+
"blocks.1.adaLN_modulation.weight",
|
| 843 |
+
"blocks.2.attn_qkv.weight",
|
| 844 |
+
"blocks.2.attn_out.weight",
|
| 845 |
+
"blocks.2.mlp.0.weight",
|
| 846 |
+
"blocks.2.mlp.2.weight",
|
| 847 |
+
"blocks.2.adaLN_modulation.weight",
|
| 848 |
+
"output_layer.linear.weight",
|
| 849 |
+
"output_layer.adaLN_modulation.weight"
|
| 850 |
+
],
|
| 851 |
+
"muon_adam_param_names": [
|
| 852 |
+
"sigma_map.net.0.bias",
|
| 853 |
+
"sigma_map.net.2.bias",
|
| 854 |
+
"blocks.0.norm1.weight",
|
| 855 |
+
"blocks.0.norm2.weight",
|
| 856 |
+
"blocks.0.mlp.0.bias",
|
| 857 |
+
"blocks.0.mlp.2.bias",
|
| 858 |
+
"blocks.0.adaLN_modulation.bias",
|
| 859 |
+
"blocks.1.norm1.weight",
|
| 860 |
+
"blocks.1.norm2.weight",
|
| 861 |
+
"blocks.1.mlp.0.bias",
|
| 862 |
+
"blocks.1.mlp.2.bias",
|
| 863 |
+
"blocks.1.adaLN_modulation.bias",
|
| 864 |
+
"blocks.2.norm1.weight",
|
| 865 |
+
"blocks.2.norm2.weight",
|
| 866 |
+
"blocks.2.mlp.0.bias",
|
| 867 |
+
"blocks.2.mlp.2.bias",
|
| 868 |
+
"blocks.2.adaLN_modulation.bias",
|
| 869 |
+
"output_layer.norm_final.weight",
|
| 870 |
+
"output_layer.adaLN_modulation.bias"
|
| 871 |
+
],
|
| 872 |
+
"muon_effective_nesterov": false,
|
| 873 |
+
"muon_effective_width_scale": false,
|
| 874 |
+
"muon_effective_weight_decay": 0.1,
|
| 875 |
+
"muon_adam_fallback_nesterov": false,
|
| 876 |
+
"muon_adam_fallback_weight_decay": 0.1,
|
| 877 |
+
"ema_decay": 0.9999,
|
| 878 |
+
"ema_start_step": 0,
|
| 879 |
+
"model_type": "ddit",
|
| 880 |
+
"ddit_mlp_type": "gelu",
|
| 881 |
+
"elf_num_time_tokens": 4,
|
| 882 |
+
"elf_num_model_mode_tokens": 0,
|
| 883 |
+
"qk_norm": true,
|
| 884 |
+
"output_bias": false,
|
| 885 |
+
"output_init_std": -1.0,
|
| 886 |
+
"norm_type": "rmsnorm",
|
| 887 |
+
"target_loss": "hard_ce",
|
| 888 |
+
"linear_soft_target_power": 1.0,
|
| 889 |
+
"linear_soft_target_min_conf": 0.0,
|
| 890 |
+
"linear_soft_target_max_conf": 1.0,
|
| 891 |
+
"t_sampling_mode": "logit_normal",
|
| 892 |
+
"t_sampling_power": 1.0,
|
| 893 |
+
"t_sampling_eps": 0.0001,
|
| 894 |
+
"t_sampling_logit_mean": -1.5,
|
| 895 |
+
"t_sampling_logit_std": 0.8,
|
| 896 |
+
"dual_t": true,
|
| 897 |
+
"corrupt_t_mode": "same",
|
| 898 |
+
"corrupt_min_t": 0.0,
|
| 899 |
+
"corrupt_max_t": 1.0,
|
| 900 |
+
"prefix_block_prob": 0.0,
|
| 901 |
+
"prefix_block_len": 128,
|
| 902 |
+
"mask_ratio_floor_schedule": "none",
|
| 903 |
+
"dirichlet_endpoint_mode": "categorical_dual_t",
|
| 904 |
+
"dirichlet_semantic_t_mode": "same",
|
| 905 |
+
"dirichlet_semantic_t_value": 0.0,
|
| 906 |
+
"dirichlet_semantic_t_curve": "linear",
|
| 907 |
+
"dirichlet_semantic_t_power": 1.0,
|
| 908 |
+
"endpoint_sequence_random_prob_alpha": 0.0,
|
| 909 |
+
"categorical_wrong_from_full_vocab": true,
|
| 910 |
+
"categorical_wrong_from_batch_valid_tokens": false,
|
| 911 |
+
"categorical_wrong_basin_token_ids": "",
|
| 912 |
+
"categorical_wrong_basin_prob": 0.0,
|
| 913 |
+
"categorical_wrong_unigram_prob": 0.0,
|
| 914 |
+
"categorical_wrong_uniform_prob": 0.0,
|
| 915 |
+
"categorical_wrong_corpus_unigram_path": "",
|
| 916 |
+
"categorical_wrong_corpus_unigram_alpha": 1.0,
|
| 917 |
+
"categorical_wrong_basin_shared_prob": 0.0,
|
| 918 |
+
"categorical_wrong_unigram_shared_prob": 0.0,
|
| 919 |
+
"mask_mixture_original_prob": 0.0,
|
| 920 |
+
"mask_mixture_lowk_prob": 0.0,
|
| 921 |
+
"mask_mixture_lowcorrupt_prob": 0.0,
|
| 922 |
+
"mask_mixture_block_prob": 0.0,
|
| 923 |
+
"mask_mixture_all_prob": 1.0,
|
| 924 |
+
"mask_mixture_lowk_clean_tokens": "0",
|
| 925 |
+
"mask_mixture_lowcorrupt_tokens": "1,2,4,8,16,32,64",
|
| 926 |
+
"mask_mixture_block_tokens": "64,128",
|
| 927 |
+
"simplex_bridge_sampler": "dirichlet",
|
| 928 |
+
"logistic_normal_sigma_min": 0.1,
|
| 929 |
+
"logistic_normal_sigma_max": 1.0,
|
| 930 |
+
"logistic_normal_tau_min": 1.0,
|
| 931 |
+
"logistic_normal_tau_max": 1.0,
|
| 932 |
+
"torch_compile": false,
|
| 933 |
+
"compile_mode": "max-autotune",
|
| 934 |
+
"state_format": "prob",
|
| 935 |
+
"meanflow_weight": 0.0,
|
| 936 |
+
"rollout_train_prob": 0.0,
|
| 937 |
+
"rollout_train_steps": 1,
|
| 938 |
+
"rollout_train_infer_steps": 64,
|
| 939 |
+
"rollout_train_temp": 1.45,
|
| 940 |
+
"rollout_train_max_gamma": 1.0,
|
| 941 |
+
"rollout_train_corrupt_only": true,
|
| 942 |
+
"rollout_train_samplewise": false,
|
| 943 |
+
"rollout_train_compute_always": false,
|
| 944 |
+
"bridge_noise_init": "logistic_normal",
|
| 945 |
+
"noise_sigma": -1.0,
|
| 946 |
+
"allow_tf32": true,
|
| 947 |
+
"activation_checkpointing": false,
|
| 948 |
+
"activation_checkpoint_interval": 1,
|
| 949 |
+
"activation_checkpoint_scope": "block",
|
| 950 |
+
"ddp_static_graph": false,
|
| 951 |
+
"ddp_gradient_as_bucket_view": true,
|
| 952 |
+
"blocking_data_transfer": false,
|
| 953 |
+
"dataloader_prefetch_factor": 4,
|
| 954 |
+
"full_train_stats": false,
|
| 955 |
+
"tokenized_hf": false,
|
| 956 |
+
"tokenized_pad_token": "pad",
|
| 957 |
+
"elf_conditional_hf": false,
|
| 958 |
+
"record_pad_truncate": false,
|
| 959 |
+
"record_add_eos": false,
|
| 960 |
+
"record_add_special_tokens": false,
|
| 961 |
+
"record_pad_token": "pad",
|
| 962 |
+
"record_shuffle_buffer": 10000,
|
| 963 |
+
"wrap": true,
|
| 964 |
+
"wrap_mode": "stream",
|
| 965 |
+
"wrap_record_buffer_size": 200,
|
| 966 |
+
"owt_cached_chunks": true,
|
| 967 |
+
"owt_chunk_cache_dir": "/e2e-data/evad-tech-vla/wanghan58/data/small_benchmarks/langflow_2604_11748/openwebtext_lta_cached_chunks/gpt2_len256_train8_compact_overfit",
|
| 968 |
+
"owt_chunk_cache_rebuild": false,
|
| 969 |
+
"owt_chunk_cache_write_batch": 4096,
|
| 970 |
+
"owt_exact_repeat_per_chunk": 64,
|
| 971 |
+
"online_chunk_shuffle": false,
|
| 972 |
+
"online_chunk_shuffle_buffer": 10000,
|
| 973 |
+
"openwebtext_split": "train_minus_100k",
|
| 974 |
+
"detokenizer": "auto",
|
| 975 |
+
"resolved_detokenizer": null,
|
| 976 |
+
"num_workers": 0,
|
| 977 |
+
"latest_every": 1000,
|
| 978 |
+
"resume_path": "runs/train8_noisegeo_len256_allcorrupt_highC64_4096_20260517_163805/latest.pt"
|
| 979 |
+
}
|
| 980 |
+
step=4100 epoch=4100/5000 epoch_step=1/1 micro_steps=4100 elapsed=4.4s lr=2.000000e-03 loss=0.0738 loss_recon=0.0738 loss_meanflow=0.0000 mean_model_t=0.2082 mean_corrupt_t=0.2082 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.9772 corrupt_frac=1.0000 acc_corrupt=0.9772 loss_corrupt=0.0738 wrong_frac=0.7917 init_acc_corrupt=0.2067 acc_corrupt_t_0p0_0p2=0.9592 corrupt_frac_t_0p0_0p2=0.5588 acc_corrupt_t_0p2_0p4=1.0000 corrupt_frac_t_0p2_0p4=0.3579 acc_corrupt_t_0p4_0p6=0.9999 corrupt_frac_t_0p4_0p6=0.0752 acc_corrupt_t_0p6_0p8=0.9999 corrupt_frac_t_0p6_0p8=0.0125 out_w_norm=12.2596 out_g_norm=0.0802 acc_corrupt_t_0p8_1p0=1.0000 corrupt_frac_t_0p8_1p0=0.0078 loss_all=0.1209 init_gold_top10=0.2053 init_gold_top100=0.2800
|
| 981 |
+
step=4200 epoch=4200/5000 epoch_step=1/1 micro_steps=4200 elapsed=3.7s lr=2.000000e-03 loss=0.0741 loss_recon=0.0741 loss_meanflow=0.0000 mean_model_t=0.2081 mean_corrupt_t=0.2081 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.9760 corrupt_frac=1.0000 acc_corrupt=0.9760 loss_corrupt=0.0741 wrong_frac=0.7925 init_acc_corrupt=0.2059 acc_corrupt_t_0p0_0p2=0.9570 corrupt_frac_t_0p0_0p2=0.5577 acc_corrupt_t_0p2_0p4=1.0000 corrupt_frac_t_0p2_0p4=0.3608 acc_corrupt_t_0p4_0p6=0.9998 corrupt_frac_t_0p4_0p6=0.0734 acc_corrupt_t_0p6_0p8=1.0000 corrupt_frac_t_0p6_0p8=0.0119 out_w_norm=12.2460 out_g_norm=0.0796 acc_corrupt_t_0p8_1p0=1.0000 corrupt_frac_t_0p8_1p0=0.0078 loss_all=0.0588 init_gold_top10=0.2043 init_gold_top100=0.2783
|
| 982 |
+
step=4300 epoch=4300/5000 epoch_step=1/1 micro_steps=4300 elapsed=3.7s lr=2.000000e-03 loss=0.0609 loss_recon=0.0609 loss_meanflow=0.0000 mean_model_t=0.2117 mean_corrupt_t=0.2117 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.9804 corrupt_frac=1.0000 acc_corrupt=0.9804 loss_corrupt=0.0609 wrong_frac=0.7884 init_acc_corrupt=0.2102 acc_corrupt_t_0p0_0p2=0.9642 corrupt_frac_t_0p0_0p2=0.5484 acc_corrupt_t_0p2_0p4=1.0000 corrupt_frac_t_0p2_0p4=0.3611 acc_corrupt_t_0p4_0p6=1.0000 corrupt_frac_t_0p4_0p6=0.0803 acc_corrupt_t_0p6_0p8=0.9999 corrupt_frac_t_0p6_0p8=0.0139 out_w_norm=12.2396 out_g_norm=0.0777 acc_corrupt_t_0p8_1p0=1.0000 corrupt_frac_t_0p8_1p0=0.0078 loss_all=0.0815 init_gold_top10=0.2224 init_gold_top100=0.2981
|
| 983 |
+
step=4400 epoch=4400/5000 epoch_step=1/1 micro_steps=4400 elapsed=3.7s lr=2.000000e-03 loss=0.0665 loss_recon=0.0665 loss_meanflow=0.0000 mean_model_t=0.2073 mean_corrupt_t=0.2073 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.9787 corrupt_frac=1.0000 acc_corrupt=0.9787 loss_corrupt=0.0665 wrong_frac=0.7927 init_acc_corrupt=0.2056 acc_corrupt_t_0p0_0p2=0.9621 corrupt_frac_t_0p0_0p2=0.5613 acc_corrupt_t_0p2_0p4=1.0000 corrupt_frac_t_0p2_0p4=0.3571 acc_corrupt_t_0p4_0p6=1.0000 corrupt_frac_t_0p4_0p6=0.0727 acc_corrupt_t_0p6_0p8=1.0000 corrupt_frac_t_0p6_0p8=0.0126 out_w_norm=12.2321 out_g_norm=0.0750 loss_all=0.0665 init_gold_top10=0.2197 init_gold_top100=0.2939
|
| 984 |
+
step=4500 epoch=4500/5000 epoch_step=1/1 micro_steps=4500 elapsed=3.9s lr=2.000000e-03 loss=0.0639 loss_recon=0.0639 loss_meanflow=0.0000 mean_model_t=0.2097 mean_corrupt_t=0.2097 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.9799 corrupt_frac=1.0000 acc_corrupt=0.9799 loss_corrupt=0.0639 wrong_frac=0.7904 init_acc_corrupt=0.2080 acc_corrupt_t_0p0_0p2=0.9637 corrupt_frac_t_0p0_0p2=0.5540 acc_corrupt_t_0p2_0p4=1.0000 corrupt_frac_t_0p2_0p4=0.3605 acc_corrupt_t_0p4_0p6=1.0000 corrupt_frac_t_0p4_0p6=0.0745 out_w_norm=12.2223 out_g_norm=0.0766 acc_corrupt_t_0p6_0p8=0.9997 corrupt_frac_t_0p6_0p8=0.0139 acc_corrupt_t_0p8_1p0=1.0000 corrupt_frac_t_0p8_1p0=0.0078 loss_all=0.0581 init_gold_top10=0.2225 init_gold_top100=0.2952
|
| 985 |
+
step=4600 epoch=4600/5000 epoch_step=1/1 micro_steps=4600 elapsed=3.7s lr=2.000000e-03 loss=0.0625 loss_recon=0.0625 loss_meanflow=0.0000 mean_model_t=0.2089 mean_corrupt_t=0.2089 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.9799 corrupt_frac=1.0000 acc_corrupt=0.9799 loss_corrupt=0.0625 wrong_frac=0.7910 init_acc_corrupt=0.2074 acc_corrupt_t_0p0_0p2=0.9642 corrupt_frac_t_0p0_0p2=0.5598 acc_corrupt_t_0p2_0p4=1.0000 corrupt_frac_t_0p2_0p4=0.3568 acc_corrupt_t_0p4_0p6=1.0000 corrupt_frac_t_0p4_0p6=0.0750 acc_corrupt_t_0p6_0p8=0.9998 corrupt_frac_t_0p6_0p8=0.0120 out_w_norm=12.1981 out_g_norm=0.0736 acc_corrupt_t_0p8_1p0=0.9987 corrupt_frac_t_0p8_1p0=0.0094 loss_all=0.0510 init_gold_top10=0.2072 init_gold_top100=0.2815
|
| 986 |
+
step=4700 epoch=4700/5000 epoch_step=1/1 micro_steps=4700 elapsed=3.7s lr=2.000000e-03 loss=0.0641 loss_recon=0.0641 loss_meanflow=0.0000 mean_model_t=0.2095 mean_corrupt_t=0.2095 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.9791 corrupt_frac=1.0000 acc_corrupt=0.9791 loss_corrupt=0.0641 wrong_frac=0.7907 init_acc_corrupt=0.2077 acc_corrupt_t_0p0_0p2=0.9624 corrupt_frac_t_0p0_0p2=0.5552 acc_corrupt_t_0p2_0p4=1.0000 corrupt_frac_t_0p2_0p4=0.3584 acc_corrupt_t_0p4_0p6=1.0000 corrupt_frac_t_0p4_0p6=0.0764 out_w_norm=12.1963 out_g_norm=0.0704 acc_corrupt_t_0p6_0p8=0.9999 corrupt_frac_t_0p6_0p8=0.0132 acc_corrupt_t_0p8_1p0=1.0000 corrupt_frac_t_0p8_1p0=0.0078 loss_all=0.0676 init_gold_top10=0.2117 init_gold_top100=0.2832
|
| 987 |
+
step=4800 epoch=4800/5000 epoch_step=1/1 micro_steps=4800 elapsed=3.7s lr=2.000000e-03 loss=0.0633 loss_recon=0.0633 loss_meanflow=0.0000 mean_model_t=0.2103 mean_corrupt_t=0.2103 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.9797 corrupt_frac=1.0000 acc_corrupt=0.9797 loss_corrupt=0.0633 wrong_frac=0.7897 init_acc_corrupt=0.2086 acc_corrupt_t_0p0_0p2=0.9632 corrupt_frac_t_0p0_0p2=0.5525 acc_corrupt_t_0p2_0p4=0.9999 corrupt_frac_t_0p2_0p4=0.3592 acc_corrupt_t_0p4_0p6=1.0000 corrupt_frac_t_0p4_0p6=0.0800 acc_corrupt_t_0p6_0p8=0.9999 corrupt_frac_t_0p6_0p8=0.0141 out_w_norm=12.1704 out_g_norm=0.0736 acc_corrupt_t_0p8_1p0=1.0000 corrupt_frac_t_0p8_1p0=0.0078 loss_all=0.0173 init_gold_top10=0.2208 init_gold_top100=0.2931
|
LTA_openwebtext_dualt/logs/softendpoint_mn_pilot_4gpu/train8_noisegeo_len256_allcorrupt_logistic_sig0p05_0p5_20260517_163805.log
ADDED
|
@@ -0,0 +1,791 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
NCCL version 2.25.1+cuda12.8
|
| 2 |
+
{
|
| 3 |
+
"device": "cuda:0",
|
| 4 |
+
"rank": 0,
|
| 5 |
+
"world_size": 4,
|
| 6 |
+
"samples": "owt_cached_chunks:8",
|
| 7 |
+
"vocab_size": 969,
|
| 8 |
+
"tokenizer_vocab_size": 50257,
|
| 9 |
+
"save_dir": "runs/train8_noisegeo_len256_allcorrupt_logistic_sig0p05_0p5_20260517_163805",
|
| 10 |
+
"batch_size": 128,
|
| 11 |
+
"grad_accum": 1,
|
| 12 |
+
"effective_batch_size": 512,
|
| 13 |
+
"global_batch_size": 512,
|
| 14 |
+
"lr_schedule": "constant_warmup",
|
| 15 |
+
"optimizer": "muon",
|
| 16 |
+
"epochs": 0.0,
|
| 17 |
+
"steps_per_epoch": 1,
|
| 18 |
+
"total_steps": 1000,
|
| 19 |
+
"warmup_steps": 10,
|
| 20 |
+
"warmup_epochs": -1.0,
|
| 21 |
+
"min_lr": 0.0,
|
| 22 |
+
"weight_decay": 0.1,
|
| 23 |
+
"output_weight_decay": -1.0,
|
| 24 |
+
"adamw_param_groups": "nanogpt",
|
| 25 |
+
"adam_beta1": 0.9,
|
| 26 |
+
"adam_beta2": 0.95,
|
| 27 |
+
"adam_eps": 1e-08,
|
| 28 |
+
"muon_impl": "legacy",
|
| 29 |
+
"muon_momentum": 0.95,
|
| 30 |
+
"muon_ns_steps": 5,
|
| 31 |
+
"muon_update_scale": 1.0,
|
| 32 |
+
"muon_nesterov": false,
|
| 33 |
+
"muon_width_scale": false,
|
| 34 |
+
"muon_grouping": "legacy_dim_ge_2",
|
| 35 |
+
"muon_param_count": 1965440,
|
| 36 |
+
"muon_adam_param_count": 8192,
|
| 37 |
+
"muon_param_names": [
|
| 38 |
+
"vocab_embed.embedding",
|
| 39 |
+
"sigma_map.net.0.weight",
|
| 40 |
+
"sigma_map.net.2.weight",
|
| 41 |
+
"blocks.0.attn_qkv.weight",
|
| 42 |
+
"blocks.0.attn_out.weight",
|
| 43 |
+
"blocks.0.mlp.0.weight",
|
| 44 |
+
"blocks.0.mlp.2.weight",
|
| 45 |
+
"blocks.0.adaLN_modulation.weight",
|
| 46 |
+
"blocks.1.attn_qkv.weight",
|
| 47 |
+
"blocks.1.attn_out.weight",
|
| 48 |
+
"blocks.1.mlp.0.weight",
|
| 49 |
+
"blocks.1.mlp.2.weight",
|
| 50 |
+
"blocks.1.adaLN_modulation.weight",
|
| 51 |
+
"blocks.2.attn_qkv.weight",
|
| 52 |
+
"blocks.2.attn_out.weight",
|
| 53 |
+
"blocks.2.mlp.0.weight",
|
| 54 |
+
"blocks.2.mlp.2.weight",
|
| 55 |
+
"blocks.2.adaLN_modulation.weight",
|
| 56 |
+
"output_layer.linear.weight",
|
| 57 |
+
"output_layer.adaLN_modulation.weight"
|
| 58 |
+
],
|
| 59 |
+
"muon_adam_param_names": [
|
| 60 |
+
"sigma_map.net.0.bias",
|
| 61 |
+
"sigma_map.net.2.bias",
|
| 62 |
+
"blocks.0.norm1.weight",
|
| 63 |
+
"blocks.0.norm2.weight",
|
| 64 |
+
"blocks.0.mlp.0.bias",
|
| 65 |
+
"blocks.0.mlp.2.bias",
|
| 66 |
+
"blocks.0.adaLN_modulation.bias",
|
| 67 |
+
"blocks.1.norm1.weight",
|
| 68 |
+
"blocks.1.norm2.weight",
|
| 69 |
+
"blocks.1.mlp.0.bias",
|
| 70 |
+
"blocks.1.mlp.2.bias",
|
| 71 |
+
"blocks.1.adaLN_modulation.bias",
|
| 72 |
+
"blocks.2.norm1.weight",
|
| 73 |
+
"blocks.2.norm2.weight",
|
| 74 |
+
"blocks.2.mlp.0.bias",
|
| 75 |
+
"blocks.2.mlp.2.bias",
|
| 76 |
+
"blocks.2.adaLN_modulation.bias",
|
| 77 |
+
"output_layer.norm_final.weight",
|
| 78 |
+
"output_layer.adaLN_modulation.bias"
|
| 79 |
+
],
|
| 80 |
+
"muon_effective_nesterov": false,
|
| 81 |
+
"muon_effective_width_scale": false,
|
| 82 |
+
"muon_effective_weight_decay": 0.1,
|
| 83 |
+
"muon_adam_fallback_nesterov": false,
|
| 84 |
+
"muon_adam_fallback_weight_decay": 0.1,
|
| 85 |
+
"ema_decay": 0.9999,
|
| 86 |
+
"ema_start_step": 0,
|
| 87 |
+
"model_type": "ddit",
|
| 88 |
+
"ddit_mlp_type": "gelu",
|
| 89 |
+
"elf_num_time_tokens": 4,
|
| 90 |
+
"elf_num_model_mode_tokens": 0,
|
| 91 |
+
"qk_norm": true,
|
| 92 |
+
"output_bias": false,
|
| 93 |
+
"output_init_std": -1.0,
|
| 94 |
+
"norm_type": "rmsnorm",
|
| 95 |
+
"target_loss": "hard_ce",
|
| 96 |
+
"linear_soft_target_power": 1.0,
|
| 97 |
+
"linear_soft_target_min_conf": 0.0,
|
| 98 |
+
"linear_soft_target_max_conf": 1.0,
|
| 99 |
+
"t_sampling_mode": "logit_normal",
|
| 100 |
+
"t_sampling_power": 1.0,
|
| 101 |
+
"t_sampling_eps": 0.0001,
|
| 102 |
+
"t_sampling_logit_mean": -1.5,
|
| 103 |
+
"t_sampling_logit_std": 0.8,
|
| 104 |
+
"dual_t": true,
|
| 105 |
+
"corrupt_t_mode": "same",
|
| 106 |
+
"corrupt_min_t": 0.0,
|
| 107 |
+
"corrupt_max_t": 1.0,
|
| 108 |
+
"prefix_block_prob": 0.0,
|
| 109 |
+
"prefix_block_len": 128,
|
| 110 |
+
"mask_ratio_floor_schedule": "none",
|
| 111 |
+
"dirichlet_endpoint_mode": "categorical_dual_t",
|
| 112 |
+
"dirichlet_semantic_t_mode": "same",
|
| 113 |
+
"dirichlet_semantic_t_value": 0.0,
|
| 114 |
+
"dirichlet_semantic_t_curve": "linear",
|
| 115 |
+
"dirichlet_semantic_t_power": 1.0,
|
| 116 |
+
"endpoint_sequence_random_prob_alpha": 0.0,
|
| 117 |
+
"categorical_wrong_from_full_vocab": true,
|
| 118 |
+
"categorical_wrong_from_batch_valid_tokens": false,
|
| 119 |
+
"categorical_wrong_basin_token_ids": "",
|
| 120 |
+
"categorical_wrong_basin_prob": 0.0,
|
| 121 |
+
"categorical_wrong_unigram_prob": 0.0,
|
| 122 |
+
"categorical_wrong_uniform_prob": 0.0,
|
| 123 |
+
"categorical_wrong_corpus_unigram_path": "",
|
| 124 |
+
"categorical_wrong_corpus_unigram_alpha": 1.0,
|
| 125 |
+
"categorical_wrong_basin_shared_prob": 0.0,
|
| 126 |
+
"categorical_wrong_unigram_shared_prob": 0.0,
|
| 127 |
+
"mask_mixture_original_prob": 0.0,
|
| 128 |
+
"mask_mixture_lowk_prob": 0.0,
|
| 129 |
+
"mask_mixture_lowcorrupt_prob": 0.0,
|
| 130 |
+
"mask_mixture_block_prob": 0.0,
|
| 131 |
+
"mask_mixture_all_prob": 1.0,
|
| 132 |
+
"mask_mixture_lowk_clean_tokens": "0",
|
| 133 |
+
"mask_mixture_lowcorrupt_tokens": "1,2,4,8,16,32,64",
|
| 134 |
+
"mask_mixture_block_tokens": "64,128",
|
| 135 |
+
"simplex_bridge_sampler": "logistic_normal_linear_mean",
|
| 136 |
+
"logistic_normal_sigma_min": 0.05,
|
| 137 |
+
"logistic_normal_sigma_max": 0.5,
|
| 138 |
+
"logistic_normal_tau_min": 1.0,
|
| 139 |
+
"logistic_normal_tau_max": 1.0,
|
| 140 |
+
"torch_compile": false,
|
| 141 |
+
"compile_mode": "max-autotune",
|
| 142 |
+
"state_format": "prob",
|
| 143 |
+
"meanflow_weight": 0.0,
|
| 144 |
+
"rollout_train_prob": 0.0,
|
| 145 |
+
"rollout_train_steps": 1,
|
| 146 |
+
"rollout_train_infer_steps": 64,
|
| 147 |
+
"rollout_train_temp": 1.45,
|
| 148 |
+
"rollout_train_max_gamma": 1.0,
|
| 149 |
+
"rollout_train_corrupt_only": true,
|
| 150 |
+
"rollout_train_samplewise": false,
|
| 151 |
+
"rollout_train_compute_always": false,
|
| 152 |
+
"bridge_noise_init": "logistic_normal",
|
| 153 |
+
"noise_sigma": -1.0,
|
| 154 |
+
"allow_tf32": true,
|
| 155 |
+
"activation_checkpointing": false,
|
| 156 |
+
"activation_checkpoint_interval": 1,
|
| 157 |
+
"activation_checkpoint_scope": "block",
|
| 158 |
+
"ddp_static_graph": false,
|
| 159 |
+
"ddp_gradient_as_bucket_view": true,
|
| 160 |
+
"blocking_data_transfer": false,
|
| 161 |
+
"dataloader_prefetch_factor": 4,
|
| 162 |
+
"full_train_stats": false,
|
| 163 |
+
"tokenized_hf": false,
|
| 164 |
+
"tokenized_pad_token": "pad",
|
| 165 |
+
"elf_conditional_hf": false,
|
| 166 |
+
"record_pad_truncate": false,
|
| 167 |
+
"record_add_eos": false,
|
| 168 |
+
"record_add_special_tokens": false,
|
| 169 |
+
"record_pad_token": "pad",
|
| 170 |
+
"record_shuffle_buffer": 10000,
|
| 171 |
+
"wrap": true,
|
| 172 |
+
"wrap_mode": "stream",
|
| 173 |
+
"wrap_record_buffer_size": 200,
|
| 174 |
+
"owt_cached_chunks": true,
|
| 175 |
+
"owt_chunk_cache_dir": "/e2e-data/evad-tech-vla/wanghan58/data/small_benchmarks/langflow_2604_11748/openwebtext_lta_cached_chunks/gpt2_len256_train8_compact_overfit",
|
| 176 |
+
"owt_chunk_cache_rebuild": false,
|
| 177 |
+
"owt_chunk_cache_write_batch": 4096,
|
| 178 |
+
"owt_exact_repeat_per_chunk": 64,
|
| 179 |
+
"online_chunk_shuffle": false,
|
| 180 |
+
"online_chunk_shuffle_buffer": 10000,
|
| 181 |
+
"openwebtext_split": "train_minus_100k",
|
| 182 |
+
"detokenizer": "auto",
|
| 183 |
+
"resolved_detokenizer": null,
|
| 184 |
+
"num_workers": 0,
|
| 185 |
+
"latest_every": 1000,
|
| 186 |
+
"resume_path": ""
|
| 187 |
+
}
|
| 188 |
+
step=100 epoch=100/1000 epoch_step=1/1 micro_steps=100 elapsed=4.5s lr=2.000000e-03 loss=6.6416 loss_recon=6.6416 loss_meanflow=0.0000 mean_model_t=0.2088 mean_corrupt_t=0.2088 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.1812 corrupt_frac=1.0000 acc_corrupt=0.1812 loss_corrupt=6.6416 wrong_frac=0.7916 init_acc_corrupt=0.2093 acc_corrupt_t_0p0_0p2=0.0970 corrupt_frac_t_0p0_0p2=0.5587 acc_corrupt_t_0p2_0p4=0.2433 corrupt_frac_t_0p2_0p4=0.3553 acc_corrupt_t_0p4_0p6=0.4524 corrupt_frac_t_0p4_0p6=0.0770 acc_corrupt_t_0p6_0p8=0.6374 corrupt_frac_t_0p6_0p8=0.0122 out_w_norm=1.2373 out_g_norm=0.9360 acc_corrupt_t_0p8_1p0=0.8027 corrupt_frac_t_0p8_1p0=0.0078 loss_all=6.3029 init_gold_top10=0.2177 init_gold_top100=0.2919
|
| 189 |
+
step=200 epoch=200/1000 epoch_step=1/1 micro_steps=200 elapsed=3.8s lr=2.000000e-03 loss=5.8451 loss_recon=5.8451 loss_meanflow=0.0000 mean_model_t=0.2067 mean_corrupt_t=0.2067 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.1702 corrupt_frac=1.0000 acc_corrupt=0.1702 loss_corrupt=5.8451 wrong_frac=0.7931 init_acc_corrupt=0.2077 acc_corrupt_t_0p0_0p2=0.1040 corrupt_frac_t_0p0_0p2=0.5635 acc_corrupt_t_0p2_0p4=0.2165 corrupt_frac_t_0p2_0p4=0.3538 acc_corrupt_t_0p4_0p6=0.4006 corrupt_frac_t_0p4_0p6=0.0742 acc_corrupt_t_0p6_0p8=0.6151 corrupt_frac_t_0p6_0p8=0.0127 out_w_norm=4.0542 out_g_norm=1.2907 acc_corrupt_t_0p8_1p0=0.7878 corrupt_frac_t_0p8_1p0=0.0078 loss_all=5.5064 init_gold_top10=0.1975 init_gold_top100=0.2742
|
| 190 |
+
step=300 epoch=300/1000 epoch_step=1/1 micro_steps=300 elapsed=3.8s lr=2.000000e-03 loss=5.0754 loss_recon=5.0754 loss_meanflow=0.0000 mean_model_t=0.2083 mean_corrupt_t=0.2083 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.1945 corrupt_frac=1.0000 acc_corrupt=0.1945 loss_corrupt=5.0754 wrong_frac=0.7916 init_acc_corrupt=0.2092 acc_corrupt_t_0p0_0p2=0.1171 corrupt_frac_t_0p0_0p2=0.5532 acc_corrupt_t_0p2_0p4=0.2511 corrupt_frac_t_0p2_0p4=0.3616 acc_corrupt_t_0p4_0p6=0.4385 corrupt_frac_t_0p4_0p6=0.0771 acc_corrupt_t_0p6_0p8=0.6358 corrupt_frac_t_0p6_0p8=0.0120 out_w_norm=6.6304 out_g_norm=0.7911 loss_all=4.7495 init_gold_top10=0.2067 init_gold_top100=0.2811
|
| 191 |
+
step=400 epoch=400/1000 epoch_step=1/1 micro_steps=400 elapsed=3.8s lr=2.000000e-03 loss=4.2417 loss_recon=4.2417 loss_meanflow=0.0000 mean_model_t=0.2074 mean_corrupt_t=0.2074 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.2396 corrupt_frac=1.0000 acc_corrupt=0.2396 loss_corrupt=4.2417 wrong_frac=0.7924 init_acc_corrupt=0.2084 acc_corrupt_t_0p0_0p2=0.1509 corrupt_frac_t_0p0_0p2=0.5611 acc_corrupt_t_0p2_0p4=0.3167 corrupt_frac_t_0p2_0p4=0.3568 acc_corrupt_t_0p4_0p6=0.4920 corrupt_frac_t_0p4_0p6=0.0734 out_w_norm=8.3769 out_g_norm=0.4660 acc_corrupt_t_0p6_0p8=0.6663 corrupt_frac_t_0p6_0p8=0.0122 loss_all=3.9329 init_gold_top10=0.1958 init_gold_top100=0.2700
|
| 192 |
+
step=500 epoch=500/1000 epoch_step=1/1 micro_steps=500 elapsed=3.8s lr=2.000000e-03 loss=3.2604 loss_recon=3.2604 loss_meanflow=0.0000 mean_model_t=0.2096 mean_corrupt_t=0.2096 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.2686 corrupt_frac=1.0000 acc_corrupt=0.2686 loss_corrupt=3.2604 wrong_frac=0.7902 init_acc_corrupt=0.2106 acc_corrupt_t_0p0_0p2=0.1795 corrupt_frac_t_0p0_0p2=0.5567 acc_corrupt_t_0p2_0p4=0.3418 corrupt_frac_t_0p2_0p4=0.3548 acc_corrupt_t_0p4_0p6=0.5201 corrupt_frac_t_0p4_0p6=0.0798 out_w_norm=9.7739 out_g_norm=0.4141 acc_corrupt_t_0p6_0p8=0.6828 corrupt_frac_t_0p6_0p8=0.0123 acc_corrupt_t_0p8_1p0=0.8711 corrupt_frac_t_0p8_1p0=0.0078 loss_all=2.7174 init_gold_top10=0.2355 init_gold_top100=0.3061
|
| 193 |
+
step=600 epoch=600/1000 epoch_step=1/1 micro_steps=600 elapsed=3.8s lr=2.000000e-03 loss=2.2324 loss_recon=2.2324 loss_meanflow=0.0000 mean_model_t=0.2084 mean_corrupt_t=0.2084 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.3783 corrupt_frac=1.0000 acc_corrupt=0.3783 loss_corrupt=2.2324 wrong_frac=0.7920 init_acc_corrupt=0.2088 acc_corrupt_t_0p0_0p2=0.2666 corrupt_frac_t_0p0_0p2=0.5574 acc_corrupt_t_0p2_0p4=0.4840 corrupt_frac_t_0p2_0p4=0.3566 acc_corrupt_t_0p4_0p6=0.6511 corrupt_frac_t_0p4_0p6=0.0777 acc_corrupt_t_0p6_0p8=0.7795 corrupt_frac_t_0p6_0p8=0.0123 out_w_norm=10.5161 out_g_norm=0.4345 acc_corrupt_t_0p8_1p0=0.8880 corrupt_frac_t_0p8_1p0=0.0078 loss_all=1.7136 init_gold_top10=0.2176 init_gold_top100=0.2910
|
| 194 |
+
step=700 epoch=700/1000 epoch_step=1/1 micro_steps=700 elapsed=3.8s lr=2.000000e-03 loss=1.1924 loss_recon=1.1924 loss_meanflow=0.0000 mean_model_t=0.2092 mean_corrupt_t=0.2092 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.6467 corrupt_frac=1.0000 acc_corrupt=0.6467 loss_corrupt=1.1924 wrong_frac=0.7910 init_acc_corrupt=0.2098 acc_corrupt_t_0p0_0p2=0.5041 corrupt_frac_t_0p0_0p2=0.5582 acc_corrupt_t_0p2_0p4=0.8081 corrupt_frac_t_0p2_0p4=0.3555 acc_corrupt_t_0p4_0p6=0.8997 corrupt_frac_t_0p4_0p6=0.0766 acc_corrupt_t_0p6_0p8=0.9358 corrupt_frac_t_0p6_0p8=0.0144 out_w_norm=11.0276 out_g_norm=0.5232 acc_corrupt_t_0p8_1p0=0.9854 corrupt_frac_t_0p8_1p0=0.0078 loss_all=0.7009 init_gold_top10=0.2223 init_gold_top100=0.2966
|
| 195 |
+
step=800 epoch=800/1000 epoch_step=1/1 micro_steps=800 elapsed=3.8s lr=2.000000e-03 loss=0.4993 loss_recon=0.4993 loss_meanflow=0.0000 mean_model_t=0.2108 mean_corrupt_t=0.2108 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.8635 corrupt_frac=1.0000 acc_corrupt=0.8635 loss_corrupt=0.4993 wrong_frac=0.7890 init_acc_corrupt=0.2118 acc_corrupt_t_0p0_0p2=0.7646 corrupt_frac_t_0p0_0p2=0.5512 acc_corrupt_t_0p2_0p4=0.9823 corrupt_frac_t_0p2_0p4=0.3605 acc_corrupt_t_0p4_0p6=0.9957 corrupt_frac_t_0p4_0p6=0.0805 out_w_norm=11.4562 out_g_norm=0.4135 acc_corrupt_t_0p6_0p8=0.9968 corrupt_frac_t_0p6_0p8=0.0126 acc_corrupt_t_0p8_1p0=1.0000 corrupt_frac_t_0p8_1p0=0.0078 loss_all=0.3302 init_gold_top10=0.2302 init_gold_top100=0.3015
|
| 196 |
+
step=900 epoch=900/1000 epoch_step=1/1 micro_steps=900 elapsed=3.8s lr=2.000000e-03 loss=0.2580 loss_recon=0.2580 loss_meanflow=0.0000 mean_model_t=0.2088 mean_corrupt_t=0.2088 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.9303 corrupt_frac=1.0000 acc_corrupt=0.9303 loss_corrupt=0.2580 wrong_frac=0.7914 init_acc_corrupt=0.2094 acc_corrupt_t_0p0_0p2=0.8760 corrupt_frac_t_0p0_0p2=0.5583 acc_corrupt_t_0p2_0p4=0.9987 corrupt_frac_t_0p2_0p4=0.3557 acc_corrupt_t_0p4_0p6=0.9997 corrupt_frac_t_0p4_0p6=0.0769 out_w_norm=11.7902 out_g_norm=0.3034 acc_corrupt_t_0p6_0p8=0.9991 corrupt_frac_t_0p6_0p8=0.0129 acc_corrupt_t_0p8_1p0=1.0000 corrupt_frac_t_0p8_1p0=0.0078 loss_all=0.2191 init_gold_top10=0.1997 init_gold_top100=0.2768
|
| 197 |
+
step=1000 epoch=1000/1000 epoch_step=1/1 micro_steps=1000 elapsed=3.8s lr=2.000000e-03 loss=0.1393 loss_recon=0.1393 loss_meanflow=0.0000 mean_model_t=0.2113 mean_corrupt_t=0.2113 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.9612 corrupt_frac=1.0000 acc_corrupt=0.9612 loss_corrupt=0.1393 wrong_frac=0.7887 init_acc_corrupt=0.2122 acc_corrupt_t_0p0_0p2=0.9298 corrupt_frac_t_0p0_0p2=0.5509 acc_corrupt_t_0p2_0p4=0.9997 corrupt_frac_t_0p2_0p4=0.3594 acc_corrupt_t_0p4_0p6=0.9999 corrupt_frac_t_0p4_0p6=0.0805 out_w_norm=11.9794 out_g_norm=0.2449 acc_corrupt_t_0p6_0p8=0.9995 corrupt_frac_t_0p6_0p8=0.0125 acc_corrupt_t_0p8_1p0=1.0000 corrupt_frac_t_0p8_1p0=0.0078 loss_all=0.0803 init_gold_top10=0.2300 init_gold_top100=0.3030
|
| 198 |
+
NCCL version 2.25.1+cuda12.8
|
| 199 |
+
resumed_from=runs/train8_noisegeo_len256_allcorrupt_logistic_sig0p05_0p5_20260517_163805/latest.pt start_step=1001
|
| 200 |
+
{
|
| 201 |
+
"device": "cuda:0",
|
| 202 |
+
"rank": 0,
|
| 203 |
+
"world_size": 4,
|
| 204 |
+
"samples": "owt_cached_chunks:8",
|
| 205 |
+
"vocab_size": 969,
|
| 206 |
+
"tokenizer_vocab_size": 50257,
|
| 207 |
+
"save_dir": "runs/train8_noisegeo_len256_allcorrupt_logistic_sig0p05_0p5_20260517_163805",
|
| 208 |
+
"batch_size": 128,
|
| 209 |
+
"grad_accum": 1,
|
| 210 |
+
"effective_batch_size": 512,
|
| 211 |
+
"global_batch_size": 512,
|
| 212 |
+
"lr_schedule": "constant_warmup",
|
| 213 |
+
"optimizer": "muon",
|
| 214 |
+
"epochs": 0.0,
|
| 215 |
+
"steps_per_epoch": 1,
|
| 216 |
+
"total_steps": 2000,
|
| 217 |
+
"warmup_steps": 10,
|
| 218 |
+
"warmup_epochs": -1.0,
|
| 219 |
+
"min_lr": 0.0,
|
| 220 |
+
"weight_decay": 0.1,
|
| 221 |
+
"output_weight_decay": -1.0,
|
| 222 |
+
"adamw_param_groups": "nanogpt",
|
| 223 |
+
"adam_beta1": 0.9,
|
| 224 |
+
"adam_beta2": 0.95,
|
| 225 |
+
"adam_eps": 1e-08,
|
| 226 |
+
"muon_impl": "legacy",
|
| 227 |
+
"muon_momentum": 0.95,
|
| 228 |
+
"muon_ns_steps": 5,
|
| 229 |
+
"muon_update_scale": 1.0,
|
| 230 |
+
"muon_nesterov": false,
|
| 231 |
+
"muon_width_scale": false,
|
| 232 |
+
"muon_grouping": "legacy_dim_ge_2",
|
| 233 |
+
"muon_param_count": 1965440,
|
| 234 |
+
"muon_adam_param_count": 8192,
|
| 235 |
+
"muon_param_names": [
|
| 236 |
+
"vocab_embed.embedding",
|
| 237 |
+
"sigma_map.net.0.weight",
|
| 238 |
+
"sigma_map.net.2.weight",
|
| 239 |
+
"blocks.0.attn_qkv.weight",
|
| 240 |
+
"blocks.0.attn_out.weight",
|
| 241 |
+
"blocks.0.mlp.0.weight",
|
| 242 |
+
"blocks.0.mlp.2.weight",
|
| 243 |
+
"blocks.0.adaLN_modulation.weight",
|
| 244 |
+
"blocks.1.attn_qkv.weight",
|
| 245 |
+
"blocks.1.attn_out.weight",
|
| 246 |
+
"blocks.1.mlp.0.weight",
|
| 247 |
+
"blocks.1.mlp.2.weight",
|
| 248 |
+
"blocks.1.adaLN_modulation.weight",
|
| 249 |
+
"blocks.2.attn_qkv.weight",
|
| 250 |
+
"blocks.2.attn_out.weight",
|
| 251 |
+
"blocks.2.mlp.0.weight",
|
| 252 |
+
"blocks.2.mlp.2.weight",
|
| 253 |
+
"blocks.2.adaLN_modulation.weight",
|
| 254 |
+
"output_layer.linear.weight",
|
| 255 |
+
"output_layer.adaLN_modulation.weight"
|
| 256 |
+
],
|
| 257 |
+
"muon_adam_param_names": [
|
| 258 |
+
"sigma_map.net.0.bias",
|
| 259 |
+
"sigma_map.net.2.bias",
|
| 260 |
+
"blocks.0.norm1.weight",
|
| 261 |
+
"blocks.0.norm2.weight",
|
| 262 |
+
"blocks.0.mlp.0.bias",
|
| 263 |
+
"blocks.0.mlp.2.bias",
|
| 264 |
+
"blocks.0.adaLN_modulation.bias",
|
| 265 |
+
"blocks.1.norm1.weight",
|
| 266 |
+
"blocks.1.norm2.weight",
|
| 267 |
+
"blocks.1.mlp.0.bias",
|
| 268 |
+
"blocks.1.mlp.2.bias",
|
| 269 |
+
"blocks.1.adaLN_modulation.bias",
|
| 270 |
+
"blocks.2.norm1.weight",
|
| 271 |
+
"blocks.2.norm2.weight",
|
| 272 |
+
"blocks.2.mlp.0.bias",
|
| 273 |
+
"blocks.2.mlp.2.bias",
|
| 274 |
+
"blocks.2.adaLN_modulation.bias",
|
| 275 |
+
"output_layer.norm_final.weight",
|
| 276 |
+
"output_layer.adaLN_modulation.bias"
|
| 277 |
+
],
|
| 278 |
+
"muon_effective_nesterov": false,
|
| 279 |
+
"muon_effective_width_scale": false,
|
| 280 |
+
"muon_effective_weight_decay": 0.1,
|
| 281 |
+
"muon_adam_fallback_nesterov": false,
|
| 282 |
+
"muon_adam_fallback_weight_decay": 0.1,
|
| 283 |
+
"ema_decay": 0.9999,
|
| 284 |
+
"ema_start_step": 0,
|
| 285 |
+
"model_type": "ddit",
|
| 286 |
+
"ddit_mlp_type": "gelu",
|
| 287 |
+
"elf_num_time_tokens": 4,
|
| 288 |
+
"elf_num_model_mode_tokens": 0,
|
| 289 |
+
"qk_norm": true,
|
| 290 |
+
"output_bias": false,
|
| 291 |
+
"output_init_std": -1.0,
|
| 292 |
+
"norm_type": "rmsnorm",
|
| 293 |
+
"target_loss": "hard_ce",
|
| 294 |
+
"linear_soft_target_power": 1.0,
|
| 295 |
+
"linear_soft_target_min_conf": 0.0,
|
| 296 |
+
"linear_soft_target_max_conf": 1.0,
|
| 297 |
+
"t_sampling_mode": "logit_normal",
|
| 298 |
+
"t_sampling_power": 1.0,
|
| 299 |
+
"t_sampling_eps": 0.0001,
|
| 300 |
+
"t_sampling_logit_mean": -1.5,
|
| 301 |
+
"t_sampling_logit_std": 0.8,
|
| 302 |
+
"dual_t": true,
|
| 303 |
+
"corrupt_t_mode": "same",
|
| 304 |
+
"corrupt_min_t": 0.0,
|
| 305 |
+
"corrupt_max_t": 1.0,
|
| 306 |
+
"prefix_block_prob": 0.0,
|
| 307 |
+
"prefix_block_len": 128,
|
| 308 |
+
"mask_ratio_floor_schedule": "none",
|
| 309 |
+
"dirichlet_endpoint_mode": "categorical_dual_t",
|
| 310 |
+
"dirichlet_semantic_t_mode": "same",
|
| 311 |
+
"dirichlet_semantic_t_value": 0.0,
|
| 312 |
+
"dirichlet_semantic_t_curve": "linear",
|
| 313 |
+
"dirichlet_semantic_t_power": 1.0,
|
| 314 |
+
"endpoint_sequence_random_prob_alpha": 0.0,
|
| 315 |
+
"categorical_wrong_from_full_vocab": true,
|
| 316 |
+
"categorical_wrong_from_batch_valid_tokens": false,
|
| 317 |
+
"categorical_wrong_basin_token_ids": "",
|
| 318 |
+
"categorical_wrong_basin_prob": 0.0,
|
| 319 |
+
"categorical_wrong_unigram_prob": 0.0,
|
| 320 |
+
"categorical_wrong_uniform_prob": 0.0,
|
| 321 |
+
"categorical_wrong_corpus_unigram_path": "",
|
| 322 |
+
"categorical_wrong_corpus_unigram_alpha": 1.0,
|
| 323 |
+
"categorical_wrong_basin_shared_prob": 0.0,
|
| 324 |
+
"categorical_wrong_unigram_shared_prob": 0.0,
|
| 325 |
+
"mask_mixture_original_prob": 0.0,
|
| 326 |
+
"mask_mixture_lowk_prob": 0.0,
|
| 327 |
+
"mask_mixture_lowcorrupt_prob": 0.0,
|
| 328 |
+
"mask_mixture_block_prob": 0.0,
|
| 329 |
+
"mask_mixture_all_prob": 1.0,
|
| 330 |
+
"mask_mixture_lowk_clean_tokens": "0",
|
| 331 |
+
"mask_mixture_lowcorrupt_tokens": "1,2,4,8,16,32,64",
|
| 332 |
+
"mask_mixture_block_tokens": "64,128",
|
| 333 |
+
"simplex_bridge_sampler": "logistic_normal_linear_mean",
|
| 334 |
+
"logistic_normal_sigma_min": 0.05,
|
| 335 |
+
"logistic_normal_sigma_max": 0.5,
|
| 336 |
+
"logistic_normal_tau_min": 1.0,
|
| 337 |
+
"logistic_normal_tau_max": 1.0,
|
| 338 |
+
"torch_compile": false,
|
| 339 |
+
"compile_mode": "max-autotune",
|
| 340 |
+
"state_format": "prob",
|
| 341 |
+
"meanflow_weight": 0.0,
|
| 342 |
+
"rollout_train_prob": 0.0,
|
| 343 |
+
"rollout_train_steps": 1,
|
| 344 |
+
"rollout_train_infer_steps": 64,
|
| 345 |
+
"rollout_train_temp": 1.45,
|
| 346 |
+
"rollout_train_max_gamma": 1.0,
|
| 347 |
+
"rollout_train_corrupt_only": true,
|
| 348 |
+
"rollout_train_samplewise": false,
|
| 349 |
+
"rollout_train_compute_always": false,
|
| 350 |
+
"bridge_noise_init": "logistic_normal",
|
| 351 |
+
"noise_sigma": -1.0,
|
| 352 |
+
"allow_tf32": true,
|
| 353 |
+
"activation_checkpointing": false,
|
| 354 |
+
"activation_checkpoint_interval": 1,
|
| 355 |
+
"activation_checkpoint_scope": "block",
|
| 356 |
+
"ddp_static_graph": false,
|
| 357 |
+
"ddp_gradient_as_bucket_view": true,
|
| 358 |
+
"blocking_data_transfer": false,
|
| 359 |
+
"dataloader_prefetch_factor": 4,
|
| 360 |
+
"full_train_stats": false,
|
| 361 |
+
"tokenized_hf": false,
|
| 362 |
+
"tokenized_pad_token": "pad",
|
| 363 |
+
"elf_conditional_hf": false,
|
| 364 |
+
"record_pad_truncate": false,
|
| 365 |
+
"record_add_eos": false,
|
| 366 |
+
"record_add_special_tokens": false,
|
| 367 |
+
"record_pad_token": "pad",
|
| 368 |
+
"record_shuffle_buffer": 10000,
|
| 369 |
+
"wrap": true,
|
| 370 |
+
"wrap_mode": "stream",
|
| 371 |
+
"wrap_record_buffer_size": 200,
|
| 372 |
+
"owt_cached_chunks": true,
|
| 373 |
+
"owt_chunk_cache_dir": "/e2e-data/evad-tech-vla/wanghan58/data/small_benchmarks/langflow_2604_11748/openwebtext_lta_cached_chunks/gpt2_len256_train8_compact_overfit",
|
| 374 |
+
"owt_chunk_cache_rebuild": false,
|
| 375 |
+
"owt_chunk_cache_write_batch": 4096,
|
| 376 |
+
"owt_exact_repeat_per_chunk": 64,
|
| 377 |
+
"online_chunk_shuffle": false,
|
| 378 |
+
"online_chunk_shuffle_buffer": 10000,
|
| 379 |
+
"openwebtext_split": "train_minus_100k",
|
| 380 |
+
"detokenizer": "auto",
|
| 381 |
+
"resolved_detokenizer": null,
|
| 382 |
+
"num_workers": 0,
|
| 383 |
+
"latest_every": 1000,
|
| 384 |
+
"resume_path": "runs/train8_noisegeo_len256_allcorrupt_logistic_sig0p05_0p5_20260517_163805/latest.pt"
|
| 385 |
+
}
|
| 386 |
+
step=1100 epoch=1100/2000 epoch_step=1/1 micro_steps=1100 elapsed=4.5s lr=2.000000e-03 loss=0.1004 loss_recon=0.1004 loss_meanflow=0.0000 mean_model_t=0.2088 mean_corrupt_t=0.2088 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.9707 corrupt_frac=1.0000 acc_corrupt=0.9707 loss_corrupt=0.1004 wrong_frac=0.7916 init_acc_corrupt=0.2093 acc_corrupt_t_0p0_0p2=0.9476 corrupt_frac_t_0p0_0p2=0.5587 acc_corrupt_t_0p2_0p4=0.9999 corrupt_frac_t_0p2_0p4=0.3553 acc_corrupt_t_0p4_0p6=1.0000 corrupt_frac_t_0p4_0p6=0.0770 acc_corrupt_t_0p6_0p8=0.9997 corrupt_frac_t_0p6_0p8=0.0122 out_w_norm=12.0726 out_g_norm=0.1836 acc_corrupt_t_0p8_1p0=0.9961 corrupt_frac_t_0p8_1p0=0.0078 loss_all=0.0791 init_gold_top10=0.2177 init_gold_top100=0.2919
|
| 387 |
+
step=1200 epoch=1200/2000 epoch_step=1/1 micro_steps=1200 elapsed=3.8s lr=2.000000e-03 loss=0.0856 loss_recon=0.0856 loss_meanflow=0.0000 mean_model_t=0.2067 mean_corrupt_t=0.2067 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.9749 corrupt_frac=1.0000 acc_corrupt=0.9749 loss_corrupt=0.0856 wrong_frac=0.7931 init_acc_corrupt=0.2077 acc_corrupt_t_0p0_0p2=0.9556 corrupt_frac_t_0p0_0p2=0.5635 acc_corrupt_t_0p2_0p4=0.9999 corrupt_frac_t_0p2_0p4=0.3538 acc_corrupt_t_0p4_0p6=0.9999 corrupt_frac_t_0p4_0p6=0.0742 acc_corrupt_t_0p6_0p8=0.9997 corrupt_frac_t_0p6_0p8=0.0127 out_w_norm=12.1000 out_g_norm=0.1557 acc_corrupt_t_0p8_1p0=1.0000 corrupt_frac_t_0p8_1p0=0.0078 loss_all=0.0377 init_gold_top10=0.1975 init_gold_top100=0.2742
|
| 388 |
+
step=1300 epoch=1300/2000 epoch_step=1/1 micro_steps=1300 elapsed=3.8s lr=2.000000e-03 loss=0.0679 loss_recon=0.0679 loss_meanflow=0.0000 mean_model_t=0.2083 mean_corrupt_t=0.2083 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.9793 corrupt_frac=1.0000 acc_corrupt=0.9793 loss_corrupt=0.0679 wrong_frac=0.7916 init_acc_corrupt=0.2092 acc_corrupt_t_0p0_0p2=0.9626 corrupt_frac_t_0p0_0p2=0.5532 acc_corrupt_t_0p2_0p4=1.0000 corrupt_frac_t_0p2_0p4=0.3616 acc_corrupt_t_0p4_0p6=0.9999 corrupt_frac_t_0p4_0p6=0.0771 acc_corrupt_t_0p6_0p8=0.9999 corrupt_frac_t_0p6_0p8=0.0120 out_w_norm=12.0992 out_g_norm=0.1191 loss_all=0.0899 init_gold_top10=0.2067 init_gold_top100=0.2811
|
| 389 |
+
step=1400 epoch=1400/2000 epoch_step=1/1 micro_steps=1400 elapsed=3.8s lr=2.000000e-03 loss=0.0679 loss_recon=0.0679 loss_meanflow=0.0000 mean_model_t=0.2074 mean_corrupt_t=0.2074 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.9794 corrupt_frac=1.0000 acc_corrupt=0.9794 loss_corrupt=0.0679 wrong_frac=0.7924 init_acc_corrupt=0.2084 acc_corrupt_t_0p0_0p2=0.9633 corrupt_frac_t_0p0_0p2=0.5611 acc_corrupt_t_0p2_0p4=1.0000 corrupt_frac_t_0p2_0p4=0.3568 acc_corrupt_t_0p4_0p6=1.0000 corrupt_frac_t_0p4_0p6=0.0734 out_w_norm=12.0845 out_g_norm=0.1117 acc_corrupt_t_0p6_0p8=0.9999 corrupt_frac_t_0p6_0p8=0.0122 loss_all=0.0877 init_gold_top10=0.1958 init_gold_top100=0.2700
|
| 390 |
+
step=1500 epoch=1500/2000 epoch_step=1/1 micro_steps=1500 elapsed=3.8s lr=2.000000e-03 loss=0.0575 loss_recon=0.0575 loss_meanflow=0.0000 mean_model_t=0.2096 mean_corrupt_t=0.2096 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.9828 corrupt_frac=1.0000 acc_corrupt=0.9828 loss_corrupt=0.0575 wrong_frac=0.7902 init_acc_corrupt=0.2106 acc_corrupt_t_0p0_0p2=0.9692 corrupt_frac_t_0p0_0p2=0.5567 acc_corrupt_t_0p2_0p4=1.0000 corrupt_frac_t_0p2_0p4=0.3548 acc_corrupt_t_0p4_0p6=1.0000 corrupt_frac_t_0p4_0p6=0.0798 out_w_norm=12.0797 out_g_norm=0.1056 acc_corrupt_t_0p6_0p8=0.9999 corrupt_frac_t_0p6_0p8=0.0123 acc_corrupt_t_0p8_1p0=1.0000 corrupt_frac_t_0p8_1p0=0.0078 loss_all=0.0339 init_gold_top10=0.2355 init_gold_top100=0.3061
|
| 391 |
+
step=1600 epoch=1600/2000 epoch_step=1/1 micro_steps=1600 elapsed=3.8s lr=2.000000e-03 loss=0.0564 loss_recon=0.0564 loss_meanflow=0.0000 mean_model_t=0.2084 mean_corrupt_t=0.2084 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.9829 corrupt_frac=1.0000 acc_corrupt=0.9829 loss_corrupt=0.0564 wrong_frac=0.7920 init_acc_corrupt=0.2088 acc_corrupt_t_0p0_0p2=0.9693 corrupt_frac_t_0p0_0p2=0.5574 acc_corrupt_t_0p2_0p4=1.0000 corrupt_frac_t_0p2_0p4=0.3566 acc_corrupt_t_0p4_0p6=1.0000 corrupt_frac_t_0p4_0p6=0.0777 acc_corrupt_t_0p6_0p8=0.9998 corrupt_frac_t_0p6_0p8=0.0123 out_w_norm=12.0764 out_g_norm=0.0942 acc_corrupt_t_0p8_1p0=1.0000 corrupt_frac_t_0p8_1p0=0.0078 loss_all=0.1145 init_gold_top10=0.2176 init_gold_top100=0.2910
|
| 392 |
+
step=1700 epoch=1700/2000 epoch_step=1/1 micro_steps=1700 elapsed=3.8s lr=2.000000e-03 loss=0.0459 loss_recon=0.0459 loss_meanflow=0.0000 mean_model_t=0.2092 mean_corrupt_t=0.2092 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.9860 corrupt_frac=1.0000 acc_corrupt=0.9860 loss_corrupt=0.0459 wrong_frac=0.7910 init_acc_corrupt=0.2098 acc_corrupt_t_0p0_0p2=0.9750 corrupt_frac_t_0p0_0p2=0.5582 acc_corrupt_t_0p2_0p4=1.0000 corrupt_frac_t_0p2_0p4=0.3555 acc_corrupt_t_0p4_0p6=1.0000 corrupt_frac_t_0p4_0p6=0.0766 acc_corrupt_t_0p6_0p8=0.9999 corrupt_frac_t_0p6_0p8=0.0144 out_w_norm=12.0780 out_g_norm=0.0834 acc_corrupt_t_0p8_1p0=1.0000 corrupt_frac_t_0p8_1p0=0.0078 loss_all=0.0271 init_gold_top10=0.2223 init_gold_top100=0.2966
|
| 393 |
+
step=1800 epoch=1800/2000 epoch_step=1/1 micro_steps=1800 elapsed=3.8s lr=2.000000e-03 loss=0.0491 loss_recon=0.0491 loss_meanflow=0.0000 mean_model_t=0.2108 mean_corrupt_t=0.2108 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.9850 corrupt_frac=1.0000 acc_corrupt=0.9850 loss_corrupt=0.0491 wrong_frac=0.7890 init_acc_corrupt=0.2118 acc_corrupt_t_0p0_0p2=0.9729 corrupt_frac_t_0p0_0p2=0.5512 acc_corrupt_t_0p2_0p4=1.0000 corrupt_frac_t_0p2_0p4=0.3605 acc_corrupt_t_0p4_0p6=1.0000 corrupt_frac_t_0p4_0p6=0.0805 out_w_norm=12.0707 out_g_norm=0.0727 acc_corrupt_t_0p6_0p8=1.0000 corrupt_frac_t_0p6_0p8=0.0126 acc_corrupt_t_0p8_1p0=1.0000 corrupt_frac_t_0p8_1p0=0.0078 loss_all=0.0427 init_gold_top10=0.2302 init_gold_top100=0.3015
|
| 394 |
+
step=1900 epoch=1900/2000 epoch_step=1/1 micro_steps=1900 elapsed=3.8s lr=2.000000e-03 loss=0.0457 loss_recon=0.0457 loss_meanflow=0.0000 mean_model_t=0.2088 mean_corrupt_t=0.2088 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.9862 corrupt_frac=1.0000 acc_corrupt=0.9862 loss_corrupt=0.0457 wrong_frac=0.7914 init_acc_corrupt=0.2094 acc_corrupt_t_0p0_0p2=0.9752 corrupt_frac_t_0p0_0p2=0.5583 acc_corrupt_t_0p2_0p4=1.0000 corrupt_frac_t_0p2_0p4=0.3557 acc_corrupt_t_0p4_0p6=1.0000 corrupt_frac_t_0p4_0p6=0.0769 out_w_norm=12.0792 out_g_norm=0.0672 acc_corrupt_t_0p6_0p8=0.9999 corrupt_frac_t_0p6_0p8=0.0129 acc_corrupt_t_0p8_1p0=1.0000 corrupt_frac_t_0p8_1p0=0.0078 loss_all=0.0452 init_gold_top10=0.1997 init_gold_top100=0.2768
|
| 395 |
+
step=2000 epoch=2000/2000 epoch_step=1/1 micro_steps=2000 elapsed=3.8s lr=2.000000e-03 loss=0.0360 loss_recon=0.0360 loss_meanflow=0.0000 mean_model_t=0.2113 mean_corrupt_t=0.2113 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.9890 corrupt_frac=1.0000 acc_corrupt=0.9890 loss_corrupt=0.0360 wrong_frac=0.7887 init_acc_corrupt=0.2122 acc_corrupt_t_0p0_0p2=0.9800 corrupt_frac_t_0p0_0p2=0.5509 acc_corrupt_t_0p2_0p4=1.0000 corrupt_frac_t_0p2_0p4=0.3594 acc_corrupt_t_0p4_0p6=1.0000 corrupt_frac_t_0p4_0p6=0.0805 out_w_norm=12.0823 out_g_norm=0.0653 acc_corrupt_t_0p6_0p8=1.0000 corrupt_frac_t_0p6_0p8=0.0125 acc_corrupt_t_0p8_1p0=0.9987 corrupt_frac_t_0p8_1p0=0.0078 loss_all=0.0224 init_gold_top10=0.2300 init_gold_top100=0.3030
|
| 396 |
+
NCCL version 2.25.1+cuda12.8
|
| 397 |
+
resumed_from=runs/train8_noisegeo_len256_allcorrupt_logistic_sig0p05_0p5_20260517_163805/latest.pt start_step=2001
|
| 398 |
+
{
|
| 399 |
+
"device": "cuda:0",
|
| 400 |
+
"rank": 0,
|
| 401 |
+
"world_size": 4,
|
| 402 |
+
"samples": "owt_cached_chunks:8",
|
| 403 |
+
"vocab_size": 969,
|
| 404 |
+
"tokenizer_vocab_size": 50257,
|
| 405 |
+
"save_dir": "runs/train8_noisegeo_len256_allcorrupt_logistic_sig0p05_0p5_20260517_163805",
|
| 406 |
+
"batch_size": 128,
|
| 407 |
+
"grad_accum": 1,
|
| 408 |
+
"effective_batch_size": 512,
|
| 409 |
+
"global_batch_size": 512,
|
| 410 |
+
"lr_schedule": "constant_warmup",
|
| 411 |
+
"optimizer": "muon",
|
| 412 |
+
"epochs": 0.0,
|
| 413 |
+
"steps_per_epoch": 1,
|
| 414 |
+
"total_steps": 3000,
|
| 415 |
+
"warmup_steps": 10,
|
| 416 |
+
"warmup_epochs": -1.0,
|
| 417 |
+
"min_lr": 0.0,
|
| 418 |
+
"weight_decay": 0.1,
|
| 419 |
+
"output_weight_decay": -1.0,
|
| 420 |
+
"adamw_param_groups": "nanogpt",
|
| 421 |
+
"adam_beta1": 0.9,
|
| 422 |
+
"adam_beta2": 0.95,
|
| 423 |
+
"adam_eps": 1e-08,
|
| 424 |
+
"muon_impl": "legacy",
|
| 425 |
+
"muon_momentum": 0.95,
|
| 426 |
+
"muon_ns_steps": 5,
|
| 427 |
+
"muon_update_scale": 1.0,
|
| 428 |
+
"muon_nesterov": false,
|
| 429 |
+
"muon_width_scale": false,
|
| 430 |
+
"muon_grouping": "legacy_dim_ge_2",
|
| 431 |
+
"muon_param_count": 1965440,
|
| 432 |
+
"muon_adam_param_count": 8192,
|
| 433 |
+
"muon_param_names": [
|
| 434 |
+
"vocab_embed.embedding",
|
| 435 |
+
"sigma_map.net.0.weight",
|
| 436 |
+
"sigma_map.net.2.weight",
|
| 437 |
+
"blocks.0.attn_qkv.weight",
|
| 438 |
+
"blocks.0.attn_out.weight",
|
| 439 |
+
"blocks.0.mlp.0.weight",
|
| 440 |
+
"blocks.0.mlp.2.weight",
|
| 441 |
+
"blocks.0.adaLN_modulation.weight",
|
| 442 |
+
"blocks.1.attn_qkv.weight",
|
| 443 |
+
"blocks.1.attn_out.weight",
|
| 444 |
+
"blocks.1.mlp.0.weight",
|
| 445 |
+
"blocks.1.mlp.2.weight",
|
| 446 |
+
"blocks.1.adaLN_modulation.weight",
|
| 447 |
+
"blocks.2.attn_qkv.weight",
|
| 448 |
+
"blocks.2.attn_out.weight",
|
| 449 |
+
"blocks.2.mlp.0.weight",
|
| 450 |
+
"blocks.2.mlp.2.weight",
|
| 451 |
+
"blocks.2.adaLN_modulation.weight",
|
| 452 |
+
"output_layer.linear.weight",
|
| 453 |
+
"output_layer.adaLN_modulation.weight"
|
| 454 |
+
],
|
| 455 |
+
"muon_adam_param_names": [
|
| 456 |
+
"sigma_map.net.0.bias",
|
| 457 |
+
"sigma_map.net.2.bias",
|
| 458 |
+
"blocks.0.norm1.weight",
|
| 459 |
+
"blocks.0.norm2.weight",
|
| 460 |
+
"blocks.0.mlp.0.bias",
|
| 461 |
+
"blocks.0.mlp.2.bias",
|
| 462 |
+
"blocks.0.adaLN_modulation.bias",
|
| 463 |
+
"blocks.1.norm1.weight",
|
| 464 |
+
"blocks.1.norm2.weight",
|
| 465 |
+
"blocks.1.mlp.0.bias",
|
| 466 |
+
"blocks.1.mlp.2.bias",
|
| 467 |
+
"blocks.1.adaLN_modulation.bias",
|
| 468 |
+
"blocks.2.norm1.weight",
|
| 469 |
+
"blocks.2.norm2.weight",
|
| 470 |
+
"blocks.2.mlp.0.bias",
|
| 471 |
+
"blocks.2.mlp.2.bias",
|
| 472 |
+
"blocks.2.adaLN_modulation.bias",
|
| 473 |
+
"output_layer.norm_final.weight",
|
| 474 |
+
"output_layer.adaLN_modulation.bias"
|
| 475 |
+
],
|
| 476 |
+
"muon_effective_nesterov": false,
|
| 477 |
+
"muon_effective_width_scale": false,
|
| 478 |
+
"muon_effective_weight_decay": 0.1,
|
| 479 |
+
"muon_adam_fallback_nesterov": false,
|
| 480 |
+
"muon_adam_fallback_weight_decay": 0.1,
|
| 481 |
+
"ema_decay": 0.9999,
|
| 482 |
+
"ema_start_step": 0,
|
| 483 |
+
"model_type": "ddit",
|
| 484 |
+
"ddit_mlp_type": "gelu",
|
| 485 |
+
"elf_num_time_tokens": 4,
|
| 486 |
+
"elf_num_model_mode_tokens": 0,
|
| 487 |
+
"qk_norm": true,
|
| 488 |
+
"output_bias": false,
|
| 489 |
+
"output_init_std": -1.0,
|
| 490 |
+
"norm_type": "rmsnorm",
|
| 491 |
+
"target_loss": "hard_ce",
|
| 492 |
+
"linear_soft_target_power": 1.0,
|
| 493 |
+
"linear_soft_target_min_conf": 0.0,
|
| 494 |
+
"linear_soft_target_max_conf": 1.0,
|
| 495 |
+
"t_sampling_mode": "logit_normal",
|
| 496 |
+
"t_sampling_power": 1.0,
|
| 497 |
+
"t_sampling_eps": 0.0001,
|
| 498 |
+
"t_sampling_logit_mean": -1.5,
|
| 499 |
+
"t_sampling_logit_std": 0.8,
|
| 500 |
+
"dual_t": true,
|
| 501 |
+
"corrupt_t_mode": "same",
|
| 502 |
+
"corrupt_min_t": 0.0,
|
| 503 |
+
"corrupt_max_t": 1.0,
|
| 504 |
+
"prefix_block_prob": 0.0,
|
| 505 |
+
"prefix_block_len": 128,
|
| 506 |
+
"mask_ratio_floor_schedule": "none",
|
| 507 |
+
"dirichlet_endpoint_mode": "categorical_dual_t",
|
| 508 |
+
"dirichlet_semantic_t_mode": "same",
|
| 509 |
+
"dirichlet_semantic_t_value": 0.0,
|
| 510 |
+
"dirichlet_semantic_t_curve": "linear",
|
| 511 |
+
"dirichlet_semantic_t_power": 1.0,
|
| 512 |
+
"endpoint_sequence_random_prob_alpha": 0.0,
|
| 513 |
+
"categorical_wrong_from_full_vocab": true,
|
| 514 |
+
"categorical_wrong_from_batch_valid_tokens": false,
|
| 515 |
+
"categorical_wrong_basin_token_ids": "",
|
| 516 |
+
"categorical_wrong_basin_prob": 0.0,
|
| 517 |
+
"categorical_wrong_unigram_prob": 0.0,
|
| 518 |
+
"categorical_wrong_uniform_prob": 0.0,
|
| 519 |
+
"categorical_wrong_corpus_unigram_path": "",
|
| 520 |
+
"categorical_wrong_corpus_unigram_alpha": 1.0,
|
| 521 |
+
"categorical_wrong_basin_shared_prob": 0.0,
|
| 522 |
+
"categorical_wrong_unigram_shared_prob": 0.0,
|
| 523 |
+
"mask_mixture_original_prob": 0.0,
|
| 524 |
+
"mask_mixture_lowk_prob": 0.0,
|
| 525 |
+
"mask_mixture_lowcorrupt_prob": 0.0,
|
| 526 |
+
"mask_mixture_block_prob": 0.0,
|
| 527 |
+
"mask_mixture_all_prob": 1.0,
|
| 528 |
+
"mask_mixture_lowk_clean_tokens": "0",
|
| 529 |
+
"mask_mixture_lowcorrupt_tokens": "1,2,4,8,16,32,64",
|
| 530 |
+
"mask_mixture_block_tokens": "64,128",
|
| 531 |
+
"simplex_bridge_sampler": "logistic_normal_linear_mean",
|
| 532 |
+
"logistic_normal_sigma_min": 0.05,
|
| 533 |
+
"logistic_normal_sigma_max": 0.5,
|
| 534 |
+
"logistic_normal_tau_min": 1.0,
|
| 535 |
+
"logistic_normal_tau_max": 1.0,
|
| 536 |
+
"torch_compile": false,
|
| 537 |
+
"compile_mode": "max-autotune",
|
| 538 |
+
"state_format": "prob",
|
| 539 |
+
"meanflow_weight": 0.0,
|
| 540 |
+
"rollout_train_prob": 0.0,
|
| 541 |
+
"rollout_train_steps": 1,
|
| 542 |
+
"rollout_train_infer_steps": 64,
|
| 543 |
+
"rollout_train_temp": 1.45,
|
| 544 |
+
"rollout_train_max_gamma": 1.0,
|
| 545 |
+
"rollout_train_corrupt_only": true,
|
| 546 |
+
"rollout_train_samplewise": false,
|
| 547 |
+
"rollout_train_compute_always": false,
|
| 548 |
+
"bridge_noise_init": "logistic_normal",
|
| 549 |
+
"noise_sigma": -1.0,
|
| 550 |
+
"allow_tf32": true,
|
| 551 |
+
"activation_checkpointing": false,
|
| 552 |
+
"activation_checkpoint_interval": 1,
|
| 553 |
+
"activation_checkpoint_scope": "block",
|
| 554 |
+
"ddp_static_graph": false,
|
| 555 |
+
"ddp_gradient_as_bucket_view": true,
|
| 556 |
+
"blocking_data_transfer": false,
|
| 557 |
+
"dataloader_prefetch_factor": 4,
|
| 558 |
+
"full_train_stats": false,
|
| 559 |
+
"tokenized_hf": false,
|
| 560 |
+
"tokenized_pad_token": "pad",
|
| 561 |
+
"elf_conditional_hf": false,
|
| 562 |
+
"record_pad_truncate": false,
|
| 563 |
+
"record_add_eos": false,
|
| 564 |
+
"record_add_special_tokens": false,
|
| 565 |
+
"record_pad_token": "pad",
|
| 566 |
+
"record_shuffle_buffer": 10000,
|
| 567 |
+
"wrap": true,
|
| 568 |
+
"wrap_mode": "stream",
|
| 569 |
+
"wrap_record_buffer_size": 200,
|
| 570 |
+
"owt_cached_chunks": true,
|
| 571 |
+
"owt_chunk_cache_dir": "/e2e-data/evad-tech-vla/wanghan58/data/small_benchmarks/langflow_2604_11748/openwebtext_lta_cached_chunks/gpt2_len256_train8_compact_overfit",
|
| 572 |
+
"owt_chunk_cache_rebuild": false,
|
| 573 |
+
"owt_chunk_cache_write_batch": 4096,
|
| 574 |
+
"owt_exact_repeat_per_chunk": 64,
|
| 575 |
+
"online_chunk_shuffle": false,
|
| 576 |
+
"online_chunk_shuffle_buffer": 10000,
|
| 577 |
+
"openwebtext_split": "train_minus_100k",
|
| 578 |
+
"detokenizer": "auto",
|
| 579 |
+
"resolved_detokenizer": null,
|
| 580 |
+
"num_workers": 0,
|
| 581 |
+
"latest_every": 1000,
|
| 582 |
+
"resume_path": "runs/train8_noisegeo_len256_allcorrupt_logistic_sig0p05_0p5_20260517_163805/latest.pt"
|
| 583 |
+
}
|
| 584 |
+
step=2100 epoch=2100/3000 epoch_step=1/1 micro_steps=2100 elapsed=4.5s lr=2.000000e-03 loss=0.0377 loss_recon=0.0377 loss_meanflow=0.0000 mean_model_t=0.2088 mean_corrupt_t=0.2088 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.9883 corrupt_frac=1.0000 acc_corrupt=0.9883 loss_corrupt=0.0377 wrong_frac=0.7916 init_acc_corrupt=0.2093 acc_corrupt_t_0p0_0p2=0.9791 corrupt_frac_t_0p0_0p2=0.5587 acc_corrupt_t_0p2_0p4=1.0000 corrupt_frac_t_0p2_0p4=0.3553 acc_corrupt_t_0p4_0p6=1.0000 corrupt_frac_t_0p4_0p6=0.0770 acc_corrupt_t_0p6_0p8=1.0000 corrupt_frac_t_0p6_0p8=0.0122 out_w_norm=12.0951 out_g_norm=0.0622 acc_corrupt_t_0p8_1p0=1.0000 corrupt_frac_t_0p8_1p0=0.0078 loss_all=0.0344 init_gold_top10=0.2177 init_gold_top100=0.2919
|
| 585 |
+
step=2200 epoch=2200/3000 epoch_step=1/1 micro_steps=2200 elapsed=4.0s lr=2.000000e-03 loss=0.0402 loss_recon=0.0402 loss_meanflow=0.0000 mean_model_t=0.2067 mean_corrupt_t=0.2067 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.9881 corrupt_frac=1.0000 acc_corrupt=0.9881 loss_corrupt=0.0402 wrong_frac=0.7931 init_acc_corrupt=0.2077 acc_corrupt_t_0p0_0p2=0.9788 corrupt_frac_t_0p0_0p2=0.5635 acc_corrupt_t_0p2_0p4=1.0000 corrupt_frac_t_0p2_0p4=0.3538 acc_corrupt_t_0p4_0p6=1.0000 corrupt_frac_t_0p4_0p6=0.0742 acc_corrupt_t_0p6_0p8=0.9999 corrupt_frac_t_0p6_0p8=0.0127 out_w_norm=12.1114 out_g_norm=0.0591 acc_corrupt_t_0p8_1p0=1.0000 corrupt_frac_t_0p8_1p0=0.0078 loss_all=0.0402 init_gold_top10=0.1975 init_gold_top100=0.2742
|
| 586 |
+
step=2300 epoch=2300/3000 epoch_step=1/1 micro_steps=2300 elapsed=3.9s lr=2.000000e-03 loss=0.0340 loss_recon=0.0340 loss_meanflow=0.0000 mean_model_t=0.2083 mean_corrupt_t=0.2083 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.9892 corrupt_frac=1.0000 acc_corrupt=0.9892 loss_corrupt=0.0340 wrong_frac=0.7916 init_acc_corrupt=0.2092 acc_corrupt_t_0p0_0p2=0.9805 corrupt_frac_t_0p0_0p2=0.5532 acc_corrupt_t_0p2_0p4=1.0000 corrupt_frac_t_0p2_0p4=0.3616 acc_corrupt_t_0p4_0p6=1.0000 corrupt_frac_t_0p4_0p6=0.0771 acc_corrupt_t_0p6_0p8=0.9999 corrupt_frac_t_0p6_0p8=0.0120 out_w_norm=12.1281 out_g_norm=0.0558 loss_all=0.0586 init_gold_top10=0.2067 init_gold_top100=0.2811
|
| 587 |
+
step=2400 epoch=2400/3000 epoch_step=1/1 micro_steps=2400 elapsed=3.9s lr=2.000000e-03 loss=0.0344 loss_recon=0.0344 loss_meanflow=0.0000 mean_model_t=0.2074 mean_corrupt_t=0.2074 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.9894 corrupt_frac=1.0000 acc_corrupt=0.9894 loss_corrupt=0.0344 wrong_frac=0.7924 init_acc_corrupt=0.2084 acc_corrupt_t_0p0_0p2=0.9811 corrupt_frac_t_0p0_0p2=0.5611 acc_corrupt_t_0p2_0p4=1.0000 corrupt_frac_t_0p2_0p4=0.3568 acc_corrupt_t_0p4_0p6=1.0000 corrupt_frac_t_0p4_0p6=0.0734 out_w_norm=12.1489 out_g_norm=0.0537 acc_corrupt_t_0p6_0p8=1.0000 corrupt_frac_t_0p6_0p8=0.0122 loss_all=0.0615 init_gold_top10=0.1958 init_gold_top100=0.2700
|
| 588 |
+
step=2500 epoch=2500/3000 epoch_step=1/1 micro_steps=2500 elapsed=3.9s lr=2.000000e-03 loss=0.0312 loss_recon=0.0312 loss_meanflow=0.0000 mean_model_t=0.2096 mean_corrupt_t=0.2096 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.9905 corrupt_frac=1.0000 acc_corrupt=0.9905 loss_corrupt=0.0312 wrong_frac=0.7902 init_acc_corrupt=0.2106 acc_corrupt_t_0p0_0p2=0.9830 corrupt_frac_t_0p0_0p2=0.5567 acc_corrupt_t_0p2_0p4=1.0000 corrupt_frac_t_0p2_0p4=0.3548 acc_corrupt_t_0p4_0p6=1.0000 corrupt_frac_t_0p4_0p6=0.0798 out_w_norm=12.1605 out_g_norm=0.0507 acc_corrupt_t_0p6_0p8=1.0000 corrupt_frac_t_0p6_0p8=0.0123 acc_corrupt_t_0p8_1p0=1.0000 corrupt_frac_t_0p8_1p0=0.0078 loss_all=0.0138 init_gold_top10=0.2355 init_gold_top100=0.3061
|
| 589 |
+
step=2600 epoch=2600/3000 epoch_step=1/1 micro_steps=2600 elapsed=3.9s lr=2.000000e-03 loss=0.0313 loss_recon=0.0313 loss_meanflow=0.0000 mean_model_t=0.2084 mean_corrupt_t=0.2084 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.9904 corrupt_frac=1.0000 acc_corrupt=0.9904 loss_corrupt=0.0313 wrong_frac=0.7920 init_acc_corrupt=0.2088 acc_corrupt_t_0p0_0p2=0.9828 corrupt_frac_t_0p0_0p2=0.5574 acc_corrupt_t_0p2_0p4=1.0000 corrupt_frac_t_0p2_0p4=0.3566 acc_corrupt_t_0p4_0p6=1.0000 corrupt_frac_t_0p4_0p6=0.0777 acc_corrupt_t_0p6_0p8=1.0000 corrupt_frac_t_0p6_0p8=0.0123 out_w_norm=12.1766 out_g_norm=0.0494 acc_corrupt_t_0p8_1p0=1.0000 corrupt_frac_t_0p8_1p0=0.0078 loss_all=0.0815 init_gold_top10=0.2176 init_gold_top100=0.2910
|
| 590 |
+
step=2700 epoch=2700/3000 epoch_step=1/1 micro_steps=2700 elapsed=3.9s lr=2.000000e-03 loss=0.0249 loss_recon=0.0249 loss_meanflow=0.0000 mean_model_t=0.2092 mean_corrupt_t=0.2092 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.9922 corrupt_frac=1.0000 acc_corrupt=0.9922 loss_corrupt=0.0249 wrong_frac=0.7910 init_acc_corrupt=0.2098 acc_corrupt_t_0p0_0p2=0.9860 corrupt_frac_t_0p0_0p2=0.5582 acc_corrupt_t_0p2_0p4=1.0000 corrupt_frac_t_0p2_0p4=0.3555 acc_corrupt_t_0p4_0p6=1.0000 corrupt_frac_t_0p4_0p6=0.0766 acc_corrupt_t_0p6_0p8=1.0000 corrupt_frac_t_0p6_0p8=0.0144 out_w_norm=12.1834 out_g_norm=0.0451 acc_corrupt_t_0p8_1p0=1.0000 corrupt_frac_t_0p8_1p0=0.0078 loss_all=0.0019 init_gold_top10=0.2223 init_gold_top100=0.2966
|
| 591 |
+
step=2800 epoch=2800/3000 epoch_step=1/1 micro_steps=2800 elapsed=3.9s lr=2.000000e-03 loss=0.0284 loss_recon=0.0284 loss_meanflow=0.0000 mean_model_t=0.2108 mean_corrupt_t=0.2108 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.9912 corrupt_frac=1.0000 acc_corrupt=0.9912 loss_corrupt=0.0284 wrong_frac=0.7890 init_acc_corrupt=0.2118 acc_corrupt_t_0p0_0p2=0.9841 corrupt_frac_t_0p0_0p2=0.5512 acc_corrupt_t_0p2_0p4=1.0000 corrupt_frac_t_0p2_0p4=0.3605 acc_corrupt_t_0p4_0p6=1.0000 corrupt_frac_t_0p4_0p6=0.0805 out_w_norm=12.2032 out_g_norm=0.0412 acc_corrupt_t_0p6_0p8=1.0000 corrupt_frac_t_0p6_0p8=0.0126 acc_corrupt_t_0p8_1p0=1.0000 corrupt_frac_t_0p8_1p0=0.0078 loss_all=0.0442 init_gold_top10=0.2302 init_gold_top100=0.3015
|
| 592 |
+
step=2900 epoch=2900/3000 epoch_step=1/1 micro_steps=2900 elapsed=3.9s lr=2.000000e-03 loss=0.0276 loss_recon=0.0276 loss_meanflow=0.0000 mean_model_t=0.2088 mean_corrupt_t=0.2088 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.9916 corrupt_frac=1.0000 acc_corrupt=0.9916 loss_corrupt=0.0276 wrong_frac=0.7914 init_acc_corrupt=0.2094 acc_corrupt_t_0p0_0p2=0.9849 corrupt_frac_t_0p0_0p2=0.5583 acc_corrupt_t_0p2_0p4=1.0000 corrupt_frac_t_0p2_0p4=0.3557 acc_corrupt_t_0p4_0p6=1.0000 corrupt_frac_t_0p4_0p6=0.0769 out_w_norm=12.2219 out_g_norm=0.0415 acc_corrupt_t_0p6_0p8=1.0000 corrupt_frac_t_0p6_0p8=0.0129 acc_corrupt_t_0p8_1p0=1.0000 corrupt_frac_t_0p8_1p0=0.0078 loss_all=0.0432 init_gold_top10=0.1997 init_gold_top100=0.2768
|
| 593 |
+
step=3000 epoch=3000/3000 epoch_step=1/1 micro_steps=3000 elapsed=3.9s lr=2.000000e-03 loss=0.0224 loss_recon=0.0224 loss_meanflow=0.0000 mean_model_t=0.2113 mean_corrupt_t=0.2113 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.9929 corrupt_frac=1.0000 acc_corrupt=0.9929 loss_corrupt=0.0224 wrong_frac=0.7887 init_acc_corrupt=0.2122 acc_corrupt_t_0p0_0p2=0.9872 corrupt_frac_t_0p0_0p2=0.5509 acc_corrupt_t_0p2_0p4=1.0000 corrupt_frac_t_0p2_0p4=0.3594 acc_corrupt_t_0p4_0p6=1.0000 corrupt_frac_t_0p4_0p6=0.0805 out_w_norm=12.2261 out_g_norm=0.0409 acc_corrupt_t_0p6_0p8=1.0000 corrupt_frac_t_0p6_0p8=0.0125 acc_corrupt_t_0p8_1p0=1.0000 corrupt_frac_t_0p8_1p0=0.0078 loss_all=0.0203 init_gold_top10=0.2300 init_gold_top100=0.3030
|
| 594 |
+
NCCL version 2.25.1+cuda12.8
|
| 595 |
+
resumed_from=runs/train8_noisegeo_len256_allcorrupt_logistic_sig0p05_0p5_20260517_163805/latest.pt start_step=3001
|
| 596 |
+
{
|
| 597 |
+
"device": "cuda:0",
|
| 598 |
+
"rank": 0,
|
| 599 |
+
"world_size": 4,
|
| 600 |
+
"samples": "owt_cached_chunks:8",
|
| 601 |
+
"vocab_size": 969,
|
| 602 |
+
"tokenizer_vocab_size": 50257,
|
| 603 |
+
"save_dir": "runs/train8_noisegeo_len256_allcorrupt_logistic_sig0p05_0p5_20260517_163805",
|
| 604 |
+
"batch_size": 128,
|
| 605 |
+
"grad_accum": 1,
|
| 606 |
+
"effective_batch_size": 512,
|
| 607 |
+
"global_batch_size": 512,
|
| 608 |
+
"lr_schedule": "constant_warmup",
|
| 609 |
+
"optimizer": "muon",
|
| 610 |
+
"epochs": 0.0,
|
| 611 |
+
"steps_per_epoch": 1,
|
| 612 |
+
"total_steps": 4000,
|
| 613 |
+
"warmup_steps": 10,
|
| 614 |
+
"warmup_epochs": -1.0,
|
| 615 |
+
"min_lr": 0.0,
|
| 616 |
+
"weight_decay": 0.1,
|
| 617 |
+
"output_weight_decay": -1.0,
|
| 618 |
+
"adamw_param_groups": "nanogpt",
|
| 619 |
+
"adam_beta1": 0.9,
|
| 620 |
+
"adam_beta2": 0.95,
|
| 621 |
+
"adam_eps": 1e-08,
|
| 622 |
+
"muon_impl": "legacy",
|
| 623 |
+
"muon_momentum": 0.95,
|
| 624 |
+
"muon_ns_steps": 5,
|
| 625 |
+
"muon_update_scale": 1.0,
|
| 626 |
+
"muon_nesterov": false,
|
| 627 |
+
"muon_width_scale": false,
|
| 628 |
+
"muon_grouping": "legacy_dim_ge_2",
|
| 629 |
+
"muon_param_count": 1965440,
|
| 630 |
+
"muon_adam_param_count": 8192,
|
| 631 |
+
"muon_param_names": [
|
| 632 |
+
"vocab_embed.embedding",
|
| 633 |
+
"sigma_map.net.0.weight",
|
| 634 |
+
"sigma_map.net.2.weight",
|
| 635 |
+
"blocks.0.attn_qkv.weight",
|
| 636 |
+
"blocks.0.attn_out.weight",
|
| 637 |
+
"blocks.0.mlp.0.weight",
|
| 638 |
+
"blocks.0.mlp.2.weight",
|
| 639 |
+
"blocks.0.adaLN_modulation.weight",
|
| 640 |
+
"blocks.1.attn_qkv.weight",
|
| 641 |
+
"blocks.1.attn_out.weight",
|
| 642 |
+
"blocks.1.mlp.0.weight",
|
| 643 |
+
"blocks.1.mlp.2.weight",
|
| 644 |
+
"blocks.1.adaLN_modulation.weight",
|
| 645 |
+
"blocks.2.attn_qkv.weight",
|
| 646 |
+
"blocks.2.attn_out.weight",
|
| 647 |
+
"blocks.2.mlp.0.weight",
|
| 648 |
+
"blocks.2.mlp.2.weight",
|
| 649 |
+
"blocks.2.adaLN_modulation.weight",
|
| 650 |
+
"output_layer.linear.weight",
|
| 651 |
+
"output_layer.adaLN_modulation.weight"
|
| 652 |
+
],
|
| 653 |
+
"muon_adam_param_names": [
|
| 654 |
+
"sigma_map.net.0.bias",
|
| 655 |
+
"sigma_map.net.2.bias",
|
| 656 |
+
"blocks.0.norm1.weight",
|
| 657 |
+
"blocks.0.norm2.weight",
|
| 658 |
+
"blocks.0.mlp.0.bias",
|
| 659 |
+
"blocks.0.mlp.2.bias",
|
| 660 |
+
"blocks.0.adaLN_modulation.bias",
|
| 661 |
+
"blocks.1.norm1.weight",
|
| 662 |
+
"blocks.1.norm2.weight",
|
| 663 |
+
"blocks.1.mlp.0.bias",
|
| 664 |
+
"blocks.1.mlp.2.bias",
|
| 665 |
+
"blocks.1.adaLN_modulation.bias",
|
| 666 |
+
"blocks.2.norm1.weight",
|
| 667 |
+
"blocks.2.norm2.weight",
|
| 668 |
+
"blocks.2.mlp.0.bias",
|
| 669 |
+
"blocks.2.mlp.2.bias",
|
| 670 |
+
"blocks.2.adaLN_modulation.bias",
|
| 671 |
+
"output_layer.norm_final.weight",
|
| 672 |
+
"output_layer.adaLN_modulation.bias"
|
| 673 |
+
],
|
| 674 |
+
"muon_effective_nesterov": false,
|
| 675 |
+
"muon_effective_width_scale": false,
|
| 676 |
+
"muon_effective_weight_decay": 0.1,
|
| 677 |
+
"muon_adam_fallback_nesterov": false,
|
| 678 |
+
"muon_adam_fallback_weight_decay": 0.1,
|
| 679 |
+
"ema_decay": 0.9999,
|
| 680 |
+
"ema_start_step": 0,
|
| 681 |
+
"model_type": "ddit",
|
| 682 |
+
"ddit_mlp_type": "gelu",
|
| 683 |
+
"elf_num_time_tokens": 4,
|
| 684 |
+
"elf_num_model_mode_tokens": 0,
|
| 685 |
+
"qk_norm": true,
|
| 686 |
+
"output_bias": false,
|
| 687 |
+
"output_init_std": -1.0,
|
| 688 |
+
"norm_type": "rmsnorm",
|
| 689 |
+
"target_loss": "hard_ce",
|
| 690 |
+
"linear_soft_target_power": 1.0,
|
| 691 |
+
"linear_soft_target_min_conf": 0.0,
|
| 692 |
+
"linear_soft_target_max_conf": 1.0,
|
| 693 |
+
"t_sampling_mode": "logit_normal",
|
| 694 |
+
"t_sampling_power": 1.0,
|
| 695 |
+
"t_sampling_eps": 0.0001,
|
| 696 |
+
"t_sampling_logit_mean": -1.5,
|
| 697 |
+
"t_sampling_logit_std": 0.8,
|
| 698 |
+
"dual_t": true,
|
| 699 |
+
"corrupt_t_mode": "same",
|
| 700 |
+
"corrupt_min_t": 0.0,
|
| 701 |
+
"corrupt_max_t": 1.0,
|
| 702 |
+
"prefix_block_prob": 0.0,
|
| 703 |
+
"prefix_block_len": 128,
|
| 704 |
+
"mask_ratio_floor_schedule": "none",
|
| 705 |
+
"dirichlet_endpoint_mode": "categorical_dual_t",
|
| 706 |
+
"dirichlet_semantic_t_mode": "same",
|
| 707 |
+
"dirichlet_semantic_t_value": 0.0,
|
| 708 |
+
"dirichlet_semantic_t_curve": "linear",
|
| 709 |
+
"dirichlet_semantic_t_power": 1.0,
|
| 710 |
+
"endpoint_sequence_random_prob_alpha": 0.0,
|
| 711 |
+
"categorical_wrong_from_full_vocab": true,
|
| 712 |
+
"categorical_wrong_from_batch_valid_tokens": false,
|
| 713 |
+
"categorical_wrong_basin_token_ids": "",
|
| 714 |
+
"categorical_wrong_basin_prob": 0.0,
|
| 715 |
+
"categorical_wrong_unigram_prob": 0.0,
|
| 716 |
+
"categorical_wrong_uniform_prob": 0.0,
|
| 717 |
+
"categorical_wrong_corpus_unigram_path": "",
|
| 718 |
+
"categorical_wrong_corpus_unigram_alpha": 1.0,
|
| 719 |
+
"categorical_wrong_basin_shared_prob": 0.0,
|
| 720 |
+
"categorical_wrong_unigram_shared_prob": 0.0,
|
| 721 |
+
"mask_mixture_original_prob": 0.0,
|
| 722 |
+
"mask_mixture_lowk_prob": 0.0,
|
| 723 |
+
"mask_mixture_lowcorrupt_prob": 0.0,
|
| 724 |
+
"mask_mixture_block_prob": 0.0,
|
| 725 |
+
"mask_mixture_all_prob": 1.0,
|
| 726 |
+
"mask_mixture_lowk_clean_tokens": "0",
|
| 727 |
+
"mask_mixture_lowcorrupt_tokens": "1,2,4,8,16,32,64",
|
| 728 |
+
"mask_mixture_block_tokens": "64,128",
|
| 729 |
+
"simplex_bridge_sampler": "logistic_normal_linear_mean",
|
| 730 |
+
"logistic_normal_sigma_min": 0.05,
|
| 731 |
+
"logistic_normal_sigma_max": 0.5,
|
| 732 |
+
"logistic_normal_tau_min": 1.0,
|
| 733 |
+
"logistic_normal_tau_max": 1.0,
|
| 734 |
+
"torch_compile": false,
|
| 735 |
+
"compile_mode": "max-autotune",
|
| 736 |
+
"state_format": "prob",
|
| 737 |
+
"meanflow_weight": 0.0,
|
| 738 |
+
"rollout_train_prob": 0.0,
|
| 739 |
+
"rollout_train_steps": 1,
|
| 740 |
+
"rollout_train_infer_steps": 64,
|
| 741 |
+
"rollout_train_temp": 1.45,
|
| 742 |
+
"rollout_train_max_gamma": 1.0,
|
| 743 |
+
"rollout_train_corrupt_only": true,
|
| 744 |
+
"rollout_train_samplewise": false,
|
| 745 |
+
"rollout_train_compute_always": false,
|
| 746 |
+
"bridge_noise_init": "logistic_normal",
|
| 747 |
+
"noise_sigma": -1.0,
|
| 748 |
+
"allow_tf32": true,
|
| 749 |
+
"activation_checkpointing": false,
|
| 750 |
+
"activation_checkpoint_interval": 1,
|
| 751 |
+
"activation_checkpoint_scope": "block",
|
| 752 |
+
"ddp_static_graph": false,
|
| 753 |
+
"ddp_gradient_as_bucket_view": true,
|
| 754 |
+
"blocking_data_transfer": false,
|
| 755 |
+
"dataloader_prefetch_factor": 4,
|
| 756 |
+
"full_train_stats": false,
|
| 757 |
+
"tokenized_hf": false,
|
| 758 |
+
"tokenized_pad_token": "pad",
|
| 759 |
+
"elf_conditional_hf": false,
|
| 760 |
+
"record_pad_truncate": false,
|
| 761 |
+
"record_add_eos": false,
|
| 762 |
+
"record_add_special_tokens": false,
|
| 763 |
+
"record_pad_token": "pad",
|
| 764 |
+
"record_shuffle_buffer": 10000,
|
| 765 |
+
"wrap": true,
|
| 766 |
+
"wrap_mode": "stream",
|
| 767 |
+
"wrap_record_buffer_size": 200,
|
| 768 |
+
"owt_cached_chunks": true,
|
| 769 |
+
"owt_chunk_cache_dir": "/e2e-data/evad-tech-vla/wanghan58/data/small_benchmarks/langflow_2604_11748/openwebtext_lta_cached_chunks/gpt2_len256_train8_compact_overfit",
|
| 770 |
+
"owt_chunk_cache_rebuild": false,
|
| 771 |
+
"owt_chunk_cache_write_batch": 4096,
|
| 772 |
+
"owt_exact_repeat_per_chunk": 64,
|
| 773 |
+
"online_chunk_shuffle": false,
|
| 774 |
+
"online_chunk_shuffle_buffer": 10000,
|
| 775 |
+
"openwebtext_split": "train_minus_100k",
|
| 776 |
+
"detokenizer": "auto",
|
| 777 |
+
"resolved_detokenizer": null,
|
| 778 |
+
"num_workers": 0,
|
| 779 |
+
"latest_every": 1000,
|
| 780 |
+
"resume_path": "runs/train8_noisegeo_len256_allcorrupt_logistic_sig0p05_0p5_20260517_163805/latest.pt"
|
| 781 |
+
}
|
| 782 |
+
step=3100 epoch=3100/4000 epoch_step=1/1 micro_steps=3100 elapsed=4.4s lr=2.000000e-03 loss=0.0224 loss_recon=0.0224 loss_meanflow=0.0000 mean_model_t=0.2088 mean_corrupt_t=0.2088 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.9929 corrupt_frac=1.0000 acc_corrupt=0.9929 loss_corrupt=0.0224 wrong_frac=0.7916 init_acc_corrupt=0.2093 acc_corrupt_t_0p0_0p2=0.9873 corrupt_frac_t_0p0_0p2=0.5587 acc_corrupt_t_0p2_0p4=1.0000 corrupt_frac_t_0p2_0p4=0.3553 acc_corrupt_t_0p4_0p6=1.0000 corrupt_frac_t_0p4_0p6=0.0770 acc_corrupt_t_0p6_0p8=1.0000 corrupt_frac_t_0p6_0p8=0.0122 out_w_norm=12.2369 out_g_norm=0.0382 acc_corrupt_t_0p8_1p0=1.0000 corrupt_frac_t_0p8_1p0=0.0078 loss_all=0.0017 init_gold_top10=0.2177 init_gold_top100=0.2919
|
| 783 |
+
step=3200 epoch=3200/4000 epoch_step=1/1 micro_steps=3200 elapsed=4.0s lr=2.000000e-03 loss=0.0248 loss_recon=0.0248 loss_meanflow=0.0000 mean_model_t=0.2067 mean_corrupt_t=0.2067 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.9927 corrupt_frac=1.0000 acc_corrupt=0.9927 loss_corrupt=0.0248 wrong_frac=0.7931 init_acc_corrupt=0.2077 acc_corrupt_t_0p0_0p2=0.9870 corrupt_frac_t_0p0_0p2=0.5635 acc_corrupt_t_0p2_0p4=1.0000 corrupt_frac_t_0p2_0p4=0.3538 acc_corrupt_t_0p4_0p6=1.0000 corrupt_frac_t_0p4_0p6=0.0742 acc_corrupt_t_0p6_0p8=0.9999 corrupt_frac_t_0p6_0p8=0.0127 out_w_norm=12.2663 out_g_norm=0.0392 acc_corrupt_t_0p8_1p0=1.0000 corrupt_frac_t_0p8_1p0=0.0078 loss_all=0.0048 init_gold_top10=0.1975 init_gold_top100=0.2742
|
| 784 |
+
step=3300 epoch=3300/4000 epoch_step=1/1 micro_steps=3300 elapsed=4.0s lr=2.000000e-03 loss=0.0220 loss_recon=0.0220 loss_meanflow=0.0000 mean_model_t=0.2083 mean_corrupt_t=0.2083 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.9929 corrupt_frac=1.0000 acc_corrupt=0.9929 loss_corrupt=0.0220 wrong_frac=0.7916 init_acc_corrupt=0.2092 acc_corrupt_t_0p0_0p2=0.9872 corrupt_frac_t_0p0_0p2=0.5532 acc_corrupt_t_0p2_0p4=1.0000 corrupt_frac_t_0p2_0p4=0.3616 acc_corrupt_t_0p4_0p6=1.0000 corrupt_frac_t_0p4_0p6=0.0771 acc_corrupt_t_0p6_0p8=1.0000 corrupt_frac_t_0p6_0p8=0.0120 out_w_norm=12.2826 out_g_norm=0.0395 loss_all=0.0236 init_gold_top10=0.2067 init_gold_top100=0.2811
|
| 785 |
+
step=3400 epoch=3400/4000 epoch_step=1/1 micro_steps=3400 elapsed=3.9s lr=2.000000e-03 loss=0.0221 loss_recon=0.0221 loss_meanflow=0.0000 mean_model_t=0.2074 mean_corrupt_t=0.2074 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.9931 corrupt_frac=1.0000 acc_corrupt=0.9931 loss_corrupt=0.0221 wrong_frac=0.7924 init_acc_corrupt=0.2084 acc_corrupt_t_0p0_0p2=0.9878 corrupt_frac_t_0p0_0p2=0.5611 acc_corrupt_t_0p2_0p4=1.0000 corrupt_frac_t_0p2_0p4=0.3568 acc_corrupt_t_0p4_0p6=1.0000 corrupt_frac_t_0p4_0p6=0.0734 out_w_norm=12.2881 out_g_norm=0.0364 acc_corrupt_t_0p6_0p8=0.9999 corrupt_frac_t_0p6_0p8=0.0122 loss_all=0.0122 init_gold_top10=0.1958 init_gold_top100=0.2700
|
| 786 |
+
step=3500 epoch=3500/4000 epoch_step=1/1 micro_steps=3500 elapsed=3.9s lr=2.000000e-03 loss=0.0195 loss_recon=0.0195 loss_meanflow=0.0000 mean_model_t=0.2096 mean_corrupt_t=0.2096 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.9940 corrupt_frac=1.0000 acc_corrupt=0.9940 loss_corrupt=0.0195 wrong_frac=0.7902 init_acc_corrupt=0.2106 acc_corrupt_t_0p0_0p2=0.9892 corrupt_frac_t_0p0_0p2=0.5567 acc_corrupt_t_0p2_0p4=1.0000 corrupt_frac_t_0p2_0p4=0.3548 acc_corrupt_t_0p4_0p6=1.0000 corrupt_frac_t_0p4_0p6=0.0798 out_w_norm=12.2898 out_g_norm=0.0367 acc_corrupt_t_0p6_0p8=1.0000 corrupt_frac_t_0p6_0p8=0.0123 acc_corrupt_t_0p8_1p0=1.0000 corrupt_frac_t_0p8_1p0=0.0078 loss_all=0.0037 init_gold_top10=0.2355 init_gold_top100=0.3061
|
| 787 |
+
step=3600 epoch=3600/4000 epoch_step=1/1 micro_steps=3600 elapsed=3.9s lr=2.000000e-03 loss=0.0201 loss_recon=0.0201 loss_meanflow=0.0000 mean_model_t=0.2084 mean_corrupt_t=0.2084 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.9938 corrupt_frac=1.0000 acc_corrupt=0.9938 loss_corrupt=0.0201 wrong_frac=0.7920 init_acc_corrupt=0.2088 acc_corrupt_t_0p0_0p2=0.9889 corrupt_frac_t_0p0_0p2=0.5574 acc_corrupt_t_0p2_0p4=1.0000 corrupt_frac_t_0p2_0p4=0.3566 acc_corrupt_t_0p4_0p6=1.0000 corrupt_frac_t_0p4_0p6=0.0777 acc_corrupt_t_0p6_0p8=1.0000 corrupt_frac_t_0p6_0p8=0.0123 out_w_norm=12.2942 out_g_norm=0.0362 acc_corrupt_t_0p8_1p0=1.0000 corrupt_frac_t_0p8_1p0=0.0078 loss_all=0.0312 init_gold_top10=0.2176 init_gold_top100=0.2910
|
| 788 |
+
step=3700 epoch=3700/4000 epoch_step=1/1 micro_steps=3700 elapsed=3.9s lr=2.000000e-03 loss=0.0160 loss_recon=0.0160 loss_meanflow=0.0000 mean_model_t=0.2092 mean_corrupt_t=0.2092 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.9951 corrupt_frac=1.0000 acc_corrupt=0.9951 loss_corrupt=0.0160 wrong_frac=0.7910 init_acc_corrupt=0.2098 acc_corrupt_t_0p0_0p2=0.9912 corrupt_frac_t_0p0_0p2=0.5582 acc_corrupt_t_0p2_0p4=1.0000 corrupt_frac_t_0p2_0p4=0.3555 acc_corrupt_t_0p4_0p6=1.0000 corrupt_frac_t_0p4_0p6=0.0766 acc_corrupt_t_0p6_0p8=1.0000 corrupt_frac_t_0p6_0p8=0.0144 out_w_norm=12.2805 out_g_norm=0.0321 acc_corrupt_t_0p8_1p0=1.0000 corrupt_frac_t_0p8_1p0=0.0078 loss_all=0.0018 init_gold_top10=0.2223 init_gold_top100=0.2966
|
| 789 |
+
step=3800 epoch=3800/4000 epoch_step=1/1 micro_steps=3800 elapsed=3.9s lr=2.000000e-03 loss=0.0190 loss_recon=0.0190 loss_meanflow=0.0000 mean_model_t=0.2108 mean_corrupt_t=0.2108 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.9940 corrupt_frac=1.0000 acc_corrupt=0.9940 loss_corrupt=0.0190 wrong_frac=0.7890 init_acc_corrupt=0.2118 acc_corrupt_t_0p0_0p2=0.9891 corrupt_frac_t_0p0_0p2=0.5512 acc_corrupt_t_0p2_0p4=1.0000 corrupt_frac_t_0p2_0p4=0.3605 acc_corrupt_t_0p4_0p6=1.0000 corrupt_frac_t_0p4_0p6=0.0805 out_w_norm=12.2694 out_g_norm=0.0318 acc_corrupt_t_0p6_0p8=1.0000 corrupt_frac_t_0p6_0p8=0.0126 acc_corrupt_t_0p8_1p0=1.0000 corrupt_frac_t_0p8_1p0=0.0078 loss_all=0.0473 init_gold_top10=0.2302 init_gold_top100=0.3015
|
| 790 |
+
step=3900 epoch=3900/4000 epoch_step=1/1 micro_steps=3900 elapsed=3.9s lr=2.000000e-03 loss=0.0213 loss_recon=0.0213 loss_meanflow=0.0000 mean_model_t=0.2088 mean_corrupt_t=0.2088 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.9935 corrupt_frac=1.0000 acc_corrupt=0.9935 loss_corrupt=0.0213 wrong_frac=0.7914 init_acc_corrupt=0.2094 acc_corrupt_t_0p0_0p2=0.9883 corrupt_frac_t_0p0_0p2=0.5583 acc_corrupt_t_0p2_0p4=1.0000 corrupt_frac_t_0p2_0p4=0.3557 acc_corrupt_t_0p4_0p6=1.0000 corrupt_frac_t_0p4_0p6=0.0769 out_w_norm=12.2660 out_g_norm=0.0325 acc_corrupt_t_0p6_0p8=0.9999 corrupt_frac_t_0p6_0p8=0.0129 acc_corrupt_t_0p8_1p0=1.0000 corrupt_frac_t_0p8_1p0=0.0078 loss_all=0.0110 init_gold_top10=0.1997 init_gold_top100=0.2768
|
| 791 |
+
step=4000 epoch=4000/4000 epoch_step=1/1 micro_steps=4000 elapsed=3.9s lr=2.000000e-03 loss=0.0161 loss_recon=0.0161 loss_meanflow=0.0000 mean_model_t=0.2113 mean_corrupt_t=0.2113 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.9951 corrupt_frac=1.0000 acc_corrupt=0.9951 loss_corrupt=0.0161 wrong_frac=0.7887 init_acc_corrupt=0.2122 acc_corrupt_t_0p0_0p2=0.9910 corrupt_frac_t_0p0_0p2=0.5509 acc_corrupt_t_0p2_0p4=1.0000 corrupt_frac_t_0p2_0p4=0.3594 acc_corrupt_t_0p4_0p6=1.0000 corrupt_frac_t_0p4_0p6=0.0805 out_w_norm=12.2661 out_g_norm=0.0345 acc_corrupt_t_0p6_0p8=0.9999 corrupt_frac_t_0p6_0p8=0.0125 acc_corrupt_t_0p8_1p0=1.0000 corrupt_frac_t_0p8_1p0=0.0078 loss_all=0.0059 init_gold_top10=0.2300 init_gold_top100=0.3030
|
LTA_openwebtext_dualt/logs/softendpoint_mn_pilot_4gpu/train8_noisegeo_len256_allcorrupt_unigram_shared0p5_20260517_163805.log
ADDED
|
@@ -0,0 +1,634 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
NCCL version 2.25.1+cuda12.8
|
| 2 |
+
{
|
| 3 |
+
"device": "cuda:0",
|
| 4 |
+
"rank": 0,
|
| 5 |
+
"world_size": 4,
|
| 6 |
+
"samples": "owt_cached_chunks:8",
|
| 7 |
+
"vocab_size": 969,
|
| 8 |
+
"tokenizer_vocab_size": 50257,
|
| 9 |
+
"save_dir": "runs/train8_noisegeo_len256_allcorrupt_unigram_shared0p5_20260517_163805",
|
| 10 |
+
"batch_size": 128,
|
| 11 |
+
"grad_accum": 1,
|
| 12 |
+
"effective_batch_size": 512,
|
| 13 |
+
"global_batch_size": 512,
|
| 14 |
+
"lr_schedule": "constant_warmup",
|
| 15 |
+
"optimizer": "muon",
|
| 16 |
+
"epochs": 0.0,
|
| 17 |
+
"steps_per_epoch": 1,
|
| 18 |
+
"total_steps": 1000,
|
| 19 |
+
"warmup_steps": 10,
|
| 20 |
+
"warmup_epochs": -1.0,
|
| 21 |
+
"min_lr": 0.0,
|
| 22 |
+
"weight_decay": 0.1,
|
| 23 |
+
"output_weight_decay": -1.0,
|
| 24 |
+
"adamw_param_groups": "nanogpt",
|
| 25 |
+
"adam_beta1": 0.9,
|
| 26 |
+
"adam_beta2": 0.95,
|
| 27 |
+
"adam_eps": 1e-08,
|
| 28 |
+
"muon_impl": "legacy",
|
| 29 |
+
"muon_momentum": 0.95,
|
| 30 |
+
"muon_ns_steps": 5,
|
| 31 |
+
"muon_update_scale": 1.0,
|
| 32 |
+
"muon_nesterov": false,
|
| 33 |
+
"muon_width_scale": false,
|
| 34 |
+
"muon_grouping": "legacy_dim_ge_2",
|
| 35 |
+
"muon_param_count": 1965440,
|
| 36 |
+
"muon_adam_param_count": 8192,
|
| 37 |
+
"muon_param_names": [
|
| 38 |
+
"vocab_embed.embedding",
|
| 39 |
+
"sigma_map.net.0.weight",
|
| 40 |
+
"sigma_map.net.2.weight",
|
| 41 |
+
"blocks.0.attn_qkv.weight",
|
| 42 |
+
"blocks.0.attn_out.weight",
|
| 43 |
+
"blocks.0.mlp.0.weight",
|
| 44 |
+
"blocks.0.mlp.2.weight",
|
| 45 |
+
"blocks.0.adaLN_modulation.weight",
|
| 46 |
+
"blocks.1.attn_qkv.weight",
|
| 47 |
+
"blocks.1.attn_out.weight",
|
| 48 |
+
"blocks.1.mlp.0.weight",
|
| 49 |
+
"blocks.1.mlp.2.weight",
|
| 50 |
+
"blocks.1.adaLN_modulation.weight",
|
| 51 |
+
"blocks.2.attn_qkv.weight",
|
| 52 |
+
"blocks.2.attn_out.weight",
|
| 53 |
+
"blocks.2.mlp.0.weight",
|
| 54 |
+
"blocks.2.mlp.2.weight",
|
| 55 |
+
"blocks.2.adaLN_modulation.weight",
|
| 56 |
+
"output_layer.linear.weight",
|
| 57 |
+
"output_layer.adaLN_modulation.weight"
|
| 58 |
+
],
|
| 59 |
+
"muon_adam_param_names": [
|
| 60 |
+
"sigma_map.net.0.bias",
|
| 61 |
+
"sigma_map.net.2.bias",
|
| 62 |
+
"blocks.0.norm1.weight",
|
| 63 |
+
"blocks.0.norm2.weight",
|
| 64 |
+
"blocks.0.mlp.0.bias",
|
| 65 |
+
"blocks.0.mlp.2.bias",
|
| 66 |
+
"blocks.0.adaLN_modulation.bias",
|
| 67 |
+
"blocks.1.norm1.weight",
|
| 68 |
+
"blocks.1.norm2.weight",
|
| 69 |
+
"blocks.1.mlp.0.bias",
|
| 70 |
+
"blocks.1.mlp.2.bias",
|
| 71 |
+
"blocks.1.adaLN_modulation.bias",
|
| 72 |
+
"blocks.2.norm1.weight",
|
| 73 |
+
"blocks.2.norm2.weight",
|
| 74 |
+
"blocks.2.mlp.0.bias",
|
| 75 |
+
"blocks.2.mlp.2.bias",
|
| 76 |
+
"blocks.2.adaLN_modulation.bias",
|
| 77 |
+
"output_layer.norm_final.weight",
|
| 78 |
+
"output_layer.adaLN_modulation.bias"
|
| 79 |
+
],
|
| 80 |
+
"muon_effective_nesterov": false,
|
| 81 |
+
"muon_effective_width_scale": false,
|
| 82 |
+
"muon_effective_weight_decay": 0.1,
|
| 83 |
+
"muon_adam_fallback_nesterov": false,
|
| 84 |
+
"muon_adam_fallback_weight_decay": 0.1,
|
| 85 |
+
"ema_decay": 0.9999,
|
| 86 |
+
"ema_start_step": 0,
|
| 87 |
+
"model_type": "ddit",
|
| 88 |
+
"ddit_mlp_type": "gelu",
|
| 89 |
+
"elf_num_time_tokens": 4,
|
| 90 |
+
"elf_num_model_mode_tokens": 0,
|
| 91 |
+
"qk_norm": true,
|
| 92 |
+
"output_bias": false,
|
| 93 |
+
"output_init_std": -1.0,
|
| 94 |
+
"norm_type": "rmsnorm",
|
| 95 |
+
"target_loss": "hard_ce",
|
| 96 |
+
"linear_soft_target_power": 1.0,
|
| 97 |
+
"linear_soft_target_min_conf": 0.0,
|
| 98 |
+
"linear_soft_target_max_conf": 1.0,
|
| 99 |
+
"t_sampling_mode": "logit_normal",
|
| 100 |
+
"t_sampling_power": 1.0,
|
| 101 |
+
"t_sampling_eps": 0.0001,
|
| 102 |
+
"t_sampling_logit_mean": -1.5,
|
| 103 |
+
"t_sampling_logit_std": 0.8,
|
| 104 |
+
"dual_t": true,
|
| 105 |
+
"corrupt_t_mode": "same",
|
| 106 |
+
"corrupt_min_t": 0.0,
|
| 107 |
+
"corrupt_max_t": 1.0,
|
| 108 |
+
"prefix_block_prob": 0.0,
|
| 109 |
+
"prefix_block_len": 128,
|
| 110 |
+
"mask_ratio_floor_schedule": "none",
|
| 111 |
+
"dirichlet_endpoint_mode": "categorical_dual_t",
|
| 112 |
+
"dirichlet_semantic_t_mode": "same",
|
| 113 |
+
"dirichlet_semantic_t_value": 0.0,
|
| 114 |
+
"dirichlet_semantic_t_curve": "linear",
|
| 115 |
+
"dirichlet_semantic_t_power": 1.0,
|
| 116 |
+
"endpoint_sequence_random_prob_alpha": 0.0,
|
| 117 |
+
"categorical_wrong_from_full_vocab": true,
|
| 118 |
+
"categorical_wrong_from_batch_valid_tokens": false,
|
| 119 |
+
"categorical_wrong_basin_token_ids": "",
|
| 120 |
+
"categorical_wrong_basin_prob": 0.0,
|
| 121 |
+
"categorical_wrong_unigram_prob": 1.0,
|
| 122 |
+
"categorical_wrong_uniform_prob": 0.0,
|
| 123 |
+
"categorical_wrong_corpus_unigram_path": "",
|
| 124 |
+
"categorical_wrong_corpus_unigram_alpha": 1.0,
|
| 125 |
+
"categorical_wrong_basin_shared_prob": 0.0,
|
| 126 |
+
"categorical_wrong_unigram_shared_prob": 0.5,
|
| 127 |
+
"mask_mixture_original_prob": 0.0,
|
| 128 |
+
"mask_mixture_lowk_prob": 0.0,
|
| 129 |
+
"mask_mixture_lowcorrupt_prob": 0.0,
|
| 130 |
+
"mask_mixture_block_prob": 0.0,
|
| 131 |
+
"mask_mixture_all_prob": 1.0,
|
| 132 |
+
"mask_mixture_lowk_clean_tokens": "0",
|
| 133 |
+
"mask_mixture_lowcorrupt_tokens": "1,2,4,8,16,32,64",
|
| 134 |
+
"mask_mixture_block_tokens": "64,128",
|
| 135 |
+
"simplex_bridge_sampler": "dirichlet",
|
| 136 |
+
"logistic_normal_sigma_min": 0.1,
|
| 137 |
+
"logistic_normal_sigma_max": 1.0,
|
| 138 |
+
"logistic_normal_tau_min": 1.0,
|
| 139 |
+
"logistic_normal_tau_max": 1.0,
|
| 140 |
+
"torch_compile": false,
|
| 141 |
+
"compile_mode": "max-autotune",
|
| 142 |
+
"state_format": "prob",
|
| 143 |
+
"meanflow_weight": 0.0,
|
| 144 |
+
"rollout_train_prob": 0.0,
|
| 145 |
+
"rollout_train_steps": 1,
|
| 146 |
+
"rollout_train_infer_steps": 64,
|
| 147 |
+
"rollout_train_temp": 1.45,
|
| 148 |
+
"rollout_train_max_gamma": 1.0,
|
| 149 |
+
"rollout_train_corrupt_only": true,
|
| 150 |
+
"rollout_train_samplewise": false,
|
| 151 |
+
"rollout_train_compute_always": false,
|
| 152 |
+
"bridge_noise_init": "logistic_normal",
|
| 153 |
+
"noise_sigma": -1.0,
|
| 154 |
+
"allow_tf32": true,
|
| 155 |
+
"activation_checkpointing": false,
|
| 156 |
+
"activation_checkpoint_interval": 1,
|
| 157 |
+
"activation_checkpoint_scope": "block",
|
| 158 |
+
"ddp_static_graph": false,
|
| 159 |
+
"ddp_gradient_as_bucket_view": true,
|
| 160 |
+
"blocking_data_transfer": false,
|
| 161 |
+
"dataloader_prefetch_factor": 4,
|
| 162 |
+
"full_train_stats": false,
|
| 163 |
+
"tokenized_hf": false,
|
| 164 |
+
"tokenized_pad_token": "pad",
|
| 165 |
+
"elf_conditional_hf": false,
|
| 166 |
+
"record_pad_truncate": false,
|
| 167 |
+
"record_add_eos": false,
|
| 168 |
+
"record_add_special_tokens": false,
|
| 169 |
+
"record_pad_token": "pad",
|
| 170 |
+
"record_shuffle_buffer": 10000,
|
| 171 |
+
"wrap": true,
|
| 172 |
+
"wrap_mode": "stream",
|
| 173 |
+
"wrap_record_buffer_size": 200,
|
| 174 |
+
"owt_cached_chunks": true,
|
| 175 |
+
"owt_chunk_cache_dir": "/e2e-data/evad-tech-vla/wanghan58/data/small_benchmarks/langflow_2604_11748/openwebtext_lta_cached_chunks/gpt2_len256_train8_compact_overfit",
|
| 176 |
+
"owt_chunk_cache_rebuild": false,
|
| 177 |
+
"owt_chunk_cache_write_batch": 4096,
|
| 178 |
+
"owt_exact_repeat_per_chunk": 64,
|
| 179 |
+
"online_chunk_shuffle": false,
|
| 180 |
+
"online_chunk_shuffle_buffer": 10000,
|
| 181 |
+
"openwebtext_split": "train_minus_100k",
|
| 182 |
+
"detokenizer": "auto",
|
| 183 |
+
"resolved_detokenizer": null,
|
| 184 |
+
"num_workers": 0,
|
| 185 |
+
"latest_every": 1000,
|
| 186 |
+
"resume_path": ""
|
| 187 |
+
}
|
| 188 |
+
step=100 epoch=100/1000 epoch_step=1/1 micro_steps=100 elapsed=4.5s lr=2.000000e-03 loss=6.7171 loss_recon=6.7171 loss_meanflow=0.0000 mean_model_t=0.2078 mean_corrupt_t=0.2078 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.0942 corrupt_frac=1.0000 acc_corrupt=0.0942 loss_corrupt=6.7171 wrong_frac=0.7922 init_acc_corrupt=0.1148 acc_corrupt_t_0p0_0p2=0.0455 corrupt_frac_t_0p0_0p2=0.5620 acc_corrupt_t_0p2_0p4=0.1254 corrupt_frac_t_0p2_0p4=0.3533 acc_corrupt_t_0p4_0p6=0.2713 corrupt_frac_t_0p4_0p6=0.0755 acc_corrupt_t_0p6_0p8=0.4107 corrupt_frac_t_0p6_0p8=0.0117 out_w_norm=1.0887 out_g_norm=1.0092 acc_corrupt_t_0p8_1p0=0.8945 corrupt_frac_t_0p8_1p0=0.0078 loss_all=6.4630 init_gold_top10=0.2117 init_gold_top100=0.4247
|
| 189 |
+
step=200 epoch=200/1000 epoch_step=1/1 micro_steps=200 elapsed=3.8s lr=2.000000e-03 loss=6.1879 loss_recon=6.1879 loss_meanflow=0.0000 mean_model_t=0.2081 mean_corrupt_t=0.2081 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.1016 corrupt_frac=1.0000 acc_corrupt=0.1016 loss_corrupt=6.1879 wrong_frac=0.7916 init_acc_corrupt=0.1160 acc_corrupt_t_0p0_0p2=0.0495 corrupt_frac_t_0p0_0p2=0.5577 acc_corrupt_t_0p2_0p4=0.1388 corrupt_frac_t_0p2_0p4=0.3599 acc_corrupt_t_0p4_0p6=0.2810 corrupt_frac_t_0p4_0p6=0.0744 acc_corrupt_t_0p6_0p8=0.3941 corrupt_frac_t_0p6_0p8=0.0126 acc_corrupt_t_0p8_1p0=0.4678 corrupt_frac_t_0p8_1p0=0.0078 out_w_norm=3.3136 out_g_norm=1.4051 loss_all=5.9306 init_gold_top10=0.2016 init_gold_top100=0.4192
|
| 190 |
+
step=300 epoch=300/1000 epoch_step=1/1 micro_steps=300 elapsed=3.8s lr=2.000000e-03 loss=5.7129 loss_recon=5.7129 loss_meanflow=0.0000 mean_model_t=0.2105 mean_corrupt_t=0.2105 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.1216 corrupt_frac=1.0000 acc_corrupt=0.1216 loss_corrupt=5.7129 wrong_frac=0.7894 init_acc_corrupt=0.1187 acc_corrupt_t_0p0_0p2=0.0537 corrupt_frac_t_0p0_0p2=0.5542 acc_corrupt_t_0p2_0p4=0.1615 corrupt_frac_t_0p2_0p4=0.3559 acc_corrupt_t_0p4_0p6=0.3620 corrupt_frac_t_0p4_0p6=0.0813 acc_corrupt_t_0p6_0p8=0.5778 corrupt_frac_t_0p6_0p8=0.0125 out_w_norm=5.3362 out_g_norm=0.6855 acc_corrupt_t_0p8_1p0=0.5938 corrupt_frac_t_0p8_1p0=0.0078 loss_all=5.4734 init_gold_top10=0.2133 init_gold_top100=0.4237
|
| 191 |
+
step=400 epoch=400/1000 epoch_step=1/1 micro_steps=400 elapsed=3.8s lr=2.000000e-03 loss=5.3493 loss_recon=5.3493 loss_meanflow=0.0000 mean_model_t=0.2085 mean_corrupt_t=0.2085 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.1453 corrupt_frac=1.0000 acc_corrupt=0.1453 loss_corrupt=5.3493 wrong_frac=0.7915 init_acc_corrupt=0.1161 acc_corrupt_t_0p0_0p2=0.0574 corrupt_frac_t_0p0_0p2=0.5597 acc_corrupt_t_0p2_0p4=0.2021 corrupt_frac_t_0p2_0p4=0.3559 acc_corrupt_t_0p4_0p6=0.4675 corrupt_frac_t_0p4_0p6=0.0752 acc_corrupt_t_0p6_0p8=0.6597 corrupt_frac_t_0p6_0p8=0.0131 out_w_norm=7.0676 out_g_norm=0.3587 acc_corrupt_t_0p8_1p0=0.7930 corrupt_frac_t_0p8_1p0=0.0078 loss_all=5.0547 init_gold_top10=0.2146 init_gold_top100=0.4290
|
| 192 |
+
step=500 epoch=500/1000 epoch_step=1/1 micro_steps=500 elapsed=3.8s lr=2.000000e-03 loss=4.9052 loss_recon=4.9052 loss_meanflow=0.0000 mean_model_t=0.2084 mean_corrupt_t=0.2084 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.1578 corrupt_frac=1.0000 acc_corrupt=0.1578 loss_corrupt=4.9052 wrong_frac=0.7914 init_acc_corrupt=0.1160 acc_corrupt_t_0p0_0p2=0.0608 corrupt_frac_t_0p0_0p2=0.5616 acc_corrupt_t_0p2_0p4=0.2300 corrupt_frac_t_0p2_0p4=0.3550 acc_corrupt_t_0p4_0p6=0.4843 corrupt_frac_t_0p4_0p6=0.0741 acc_corrupt_t_0p6_0p8=0.6602 corrupt_frac_t_0p6_0p8=0.0143 out_w_norm=8.6464 out_g_norm=0.4603 loss_all=4.6661 init_gold_top10=0.1906 init_gold_top100=0.4129
|
| 193 |
+
step=600 epoch=600/1000 epoch_step=1/1 micro_steps=600 elapsed=3.8s lr=2.000000e-03 loss=4.3633 loss_recon=4.3633 loss_meanflow=0.0000 mean_model_t=0.2084 mean_corrupt_t=0.2084 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.1713 corrupt_frac=1.0000 acc_corrupt=0.1713 loss_corrupt=4.3633 wrong_frac=0.7917 init_acc_corrupt=0.1154 acc_corrupt_t_0p0_0p2=0.0670 corrupt_frac_t_0p0_0p2=0.5568 acc_corrupt_t_0p2_0p4=0.2545 corrupt_frac_t_0p2_0p4=0.3606 acc_corrupt_t_0p4_0p6=0.4927 corrupt_frac_t_0p4_0p6=0.0742 acc_corrupt_t_0p6_0p8=0.6751 corrupt_frac_t_0p6_0p8=0.0123 out_w_norm=9.6855 out_g_norm=0.4965 loss_all=4.2113 init_gold_top10=0.1772 init_gold_top100=0.4130
|
| 194 |
+
step=700 epoch=700/1000 epoch_step=1/1 micro_steps=700 elapsed=3.8s lr=2.000000e-03 loss=3.8335 loss_recon=3.8335 loss_meanflow=0.0000 mean_model_t=0.2079 mean_corrupt_t=0.2079 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.1847 corrupt_frac=1.0000 acc_corrupt=0.1847 loss_corrupt=3.8335 wrong_frac=0.7925 init_acc_corrupt=0.1147 acc_corrupt_t_0p0_0p2=0.0766 corrupt_frac_t_0p0_0p2=0.5607 acc_corrupt_t_0p2_0p4=0.2754 corrupt_frac_t_0p2_0p4=0.3568 acc_corrupt_t_0p4_0p6=0.5085 corrupt_frac_t_0p4_0p6=0.0739 acc_corrupt_t_0p6_0p8=0.6886 corrupt_frac_t_0p6_0p8=0.0132 out_w_norm=10.2232 out_g_norm=0.6982 loss_all=3.5264 init_gold_top10=0.2137 init_gold_top100=0.4244
|
| 195 |
+
step=800 epoch=800/1000 epoch_step=1/1 micro_steps=800 elapsed=3.8s lr=2.000000e-03 loss=3.3860 loss_recon=3.3860 loss_meanflow=0.0000 mean_model_t=0.2099 mean_corrupt_t=0.2099 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.2059 corrupt_frac=1.0000 acc_corrupt=0.2059 loss_corrupt=3.3860 wrong_frac=0.7904 init_acc_corrupt=0.1168 acc_corrupt_t_0p0_0p2=0.0875 corrupt_frac_t_0p0_0p2=0.5505 acc_corrupt_t_0p2_0p4=0.3057 corrupt_frac_t_0p2_0p4=0.3649 acc_corrupt_t_0p4_0p6=0.5299 corrupt_frac_t_0p4_0p6=0.0765 out_w_norm=10.5877 out_g_norm=0.9937 acc_corrupt_t_0p6_0p8=0.6989 corrupt_frac_t_0p6_0p8=0.0123 acc_corrupt_t_0p8_1p0=0.8398 corrupt_frac_t_0p8_1p0=0.0078 loss_all=3.1965 init_gold_top10=0.1935 init_gold_top100=0.4118
|
| 196 |
+
step=900 epoch=900/1000 epoch_step=1/1 micro_steps=900 elapsed=3.8s lr=2.000000e-03 loss=3.0038 loss_recon=3.0038 loss_meanflow=0.0000 mean_model_t=0.2102 mean_corrupt_t=0.2102 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.2384 corrupt_frac=1.0000 acc_corrupt=0.2384 loss_corrupt=3.0038 wrong_frac=0.7901 init_acc_corrupt=0.1169 acc_corrupt_t_0p0_0p2=0.1048 corrupt_frac_t_0p0_0p2=0.5513 acc_corrupt_t_0p2_0p4=0.3572 corrupt_frac_t_0p2_0p4=0.3645 acc_corrupt_t_0p4_0p6=0.5844 corrupt_frac_t_0p4_0p6=0.0755 out_w_norm=10.8302 out_g_norm=1.2164 acc_corrupt_t_0p6_0p8=0.7286 corrupt_frac_t_0p6_0p8=0.0127 acc_corrupt_t_0p8_1p0=0.8242 corrupt_frac_t_0p8_1p0=0.0078 loss_all=2.9094 init_gold_top10=0.1910 init_gold_top100=0.4159
|
| 197 |
+
step=1000 epoch=1000/1000 epoch_step=1/1 micro_steps=1000 elapsed=3.8s lr=2.000000e-03 loss=2.6625 loss_recon=2.6625 loss_meanflow=0.0000 mean_model_t=0.2101 mean_corrupt_t=0.2101 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.2880 corrupt_frac=1.0000 acc_corrupt=0.2880 loss_corrupt=2.6625 wrong_frac=0.7897 init_acc_corrupt=0.1182 acc_corrupt_t_0p0_0p2=0.1255 corrupt_frac_t_0p0_0p2=0.5591 acc_corrupt_t_0p2_0p4=0.4443 corrupt_frac_t_0p2_0p4=0.3526 acc_corrupt_t_0p4_0p6=0.6811 corrupt_frac_t_0p4_0p6=0.0791 acc_corrupt_t_0p8_1p0=0.8555 corrupt_frac_t_0p8_1p0=0.0078 out_w_norm=10.9865 out_g_norm=1.5042 acc_corrupt_t_0p6_0p8=0.7904 corrupt_frac_t_0p6_0p8=0.0133 loss_all=2.5068 init_gold_top10=0.1951 init_gold_top100=0.4188
|
| 198 |
+
Traceback (most recent call last):
|
| 199 |
+
File "<frozen runpy>", line 198, in _run_module_as_main
|
| 200 |
+
File "<frozen runpy>", line 88, in _run_code
|
| 201 |
+
File "/usr/local/lib/python3.12/dist-packages/torch/distributed/run.py", line 922, in <module>
|
| 202 |
+
main()
|
| 203 |
+
File "/usr/local/lib/python3.12/dist-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 355, in wrapper
|
| 204 |
+
return f(*args, **kwargs)
|
| 205 |
+
^^^^^^^^^^^^^^^^^^
|
| 206 |
+
File "/usr/local/lib/python3.12/dist-packages/torch/distributed/run.py", line 918, in main
|
| 207 |
+
run(args)
|
| 208 |
+
File "/usr/local/lib/python3.12/dist-packages/torch/distributed/run.py", line 909, in run
|
| 209 |
+
elastic_launch(
|
| 210 |
+
File "/usr/local/lib/python3.12/dist-packages/torch/distributed/launcher/api.py", line 139, in __call__
|
| 211 |
+
return launch_agent(self._config, self._entrypoint, list(args))
|
| 212 |
+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 213 |
+
File "/usr/local/lib/python3.12/dist-packages/torch/distributed/launcher/api.py", line 261, in launch_agent
|
| 214 |
+
result = agent.run()
|
| 215 |
+
^^^^^^^^^^^
|
| 216 |
+
File "/usr/local/lib/python3.12/dist-packages/torch/distributed/elastic/metrics/api.py", line 137, in wrapper
|
| 217 |
+
result = f(*args, **kwargs)
|
| 218 |
+
^^^^^^^^^^^^^^^^^^
|
| 219 |
+
File "/usr/local/lib/python3.12/dist-packages/torch/distributed/elastic/agent/server/api.py", line 711, in run
|
| 220 |
+
result = self._invoke_run(role)
|
| 221 |
+
^^^^^^^^^^^^^^^^^^^^^^
|
| 222 |
+
File "/usr/local/lib/python3.12/dist-packages/torch/distributed/elastic/agent/server/api.py", line 864, in _invoke_run
|
| 223 |
+
self._initialize_workers(self._worker_group)
|
| 224 |
+
File "/usr/local/lib/python3.12/dist-packages/torch/distributed/elastic/metrics/api.py", line 137, in wrapper
|
| 225 |
+
result = f(*args, **kwargs)
|
| 226 |
+
^^^^^^^^^^^^^^^^^^
|
| 227 |
+
File "/usr/local/lib/python3.12/dist-packages/torch/distributed/elastic/agent/server/api.py", line 683, in _initialize_workers
|
| 228 |
+
self._rendezvous(worker_group)
|
| 229 |
+
File "/usr/local/lib/python3.12/dist-packages/torch/distributed/elastic/metrics/api.py", line 137, in wrapper
|
| 230 |
+
result = f(*args, **kwargs)
|
| 231 |
+
^^^^^^^^^^^^^^^^^^
|
| 232 |
+
File "/usr/local/lib/python3.12/dist-packages/torch/distributed/elastic/agent/server/api.py", line 500, in _rendezvous
|
| 233 |
+
rdzv_info = spec.rdzv_handler.next_rendezvous()
|
| 234 |
+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 235 |
+
File "/usr/local/lib/python3.12/dist-packages/torch/distributed/elastic/rendezvous/static_tcp_rendezvous.py", line 67, in next_rendezvous
|
| 236 |
+
self._store = TCPStore( # type: ignore[call-arg]
|
| 237 |
+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 238 |
+
RuntimeError: The server socket has failed to listen on any local network address. port: 32976, useIpv6: 0, code: -98, name: EADDRINUSE, message: address already in use
|
| 239 |
+
NCCL version 2.25.1+cuda12.8
|
| 240 |
+
resumed_from=runs/train8_noisegeo_len256_allcorrupt_unigram_shared0p5_20260517_163805/latest.pt start_step=1001
|
| 241 |
+
{
|
| 242 |
+
"device": "cuda:0",
|
| 243 |
+
"rank": 0,
|
| 244 |
+
"world_size": 4,
|
| 245 |
+
"samples": "owt_cached_chunks:8",
|
| 246 |
+
"vocab_size": 969,
|
| 247 |
+
"tokenizer_vocab_size": 50257,
|
| 248 |
+
"save_dir": "runs/train8_noisegeo_len256_allcorrupt_unigram_shared0p5_20260517_163805",
|
| 249 |
+
"batch_size": 128,
|
| 250 |
+
"grad_accum": 1,
|
| 251 |
+
"effective_batch_size": 512,
|
| 252 |
+
"global_batch_size": 512,
|
| 253 |
+
"lr_schedule": "constant_warmup",
|
| 254 |
+
"optimizer": "muon",
|
| 255 |
+
"epochs": 0.0,
|
| 256 |
+
"steps_per_epoch": 1,
|
| 257 |
+
"total_steps": 2000,
|
| 258 |
+
"warmup_steps": 10,
|
| 259 |
+
"warmup_epochs": -1.0,
|
| 260 |
+
"min_lr": 0.0,
|
| 261 |
+
"weight_decay": 0.1,
|
| 262 |
+
"output_weight_decay": -1.0,
|
| 263 |
+
"adamw_param_groups": "nanogpt",
|
| 264 |
+
"adam_beta1": 0.9,
|
| 265 |
+
"adam_beta2": 0.95,
|
| 266 |
+
"adam_eps": 1e-08,
|
| 267 |
+
"muon_impl": "legacy",
|
| 268 |
+
"muon_momentum": 0.95,
|
| 269 |
+
"muon_ns_steps": 5,
|
| 270 |
+
"muon_update_scale": 1.0,
|
| 271 |
+
"muon_nesterov": false,
|
| 272 |
+
"muon_width_scale": false,
|
| 273 |
+
"muon_grouping": "legacy_dim_ge_2",
|
| 274 |
+
"muon_param_count": 1965440,
|
| 275 |
+
"muon_adam_param_count": 8192,
|
| 276 |
+
"muon_param_names": [
|
| 277 |
+
"vocab_embed.embedding",
|
| 278 |
+
"sigma_map.net.0.weight",
|
| 279 |
+
"sigma_map.net.2.weight",
|
| 280 |
+
"blocks.0.attn_qkv.weight",
|
| 281 |
+
"blocks.0.attn_out.weight",
|
| 282 |
+
"blocks.0.mlp.0.weight",
|
| 283 |
+
"blocks.0.mlp.2.weight",
|
| 284 |
+
"blocks.0.adaLN_modulation.weight",
|
| 285 |
+
"blocks.1.attn_qkv.weight",
|
| 286 |
+
"blocks.1.attn_out.weight",
|
| 287 |
+
"blocks.1.mlp.0.weight",
|
| 288 |
+
"blocks.1.mlp.2.weight",
|
| 289 |
+
"blocks.1.adaLN_modulation.weight",
|
| 290 |
+
"blocks.2.attn_qkv.weight",
|
| 291 |
+
"blocks.2.attn_out.weight",
|
| 292 |
+
"blocks.2.mlp.0.weight",
|
| 293 |
+
"blocks.2.mlp.2.weight",
|
| 294 |
+
"blocks.2.adaLN_modulation.weight",
|
| 295 |
+
"output_layer.linear.weight",
|
| 296 |
+
"output_layer.adaLN_modulation.weight"
|
| 297 |
+
],
|
| 298 |
+
"muon_adam_param_names": [
|
| 299 |
+
"sigma_map.net.0.bias",
|
| 300 |
+
"sigma_map.net.2.bias",
|
| 301 |
+
"blocks.0.norm1.weight",
|
| 302 |
+
"blocks.0.norm2.weight",
|
| 303 |
+
"blocks.0.mlp.0.bias",
|
| 304 |
+
"blocks.0.mlp.2.bias",
|
| 305 |
+
"blocks.0.adaLN_modulation.bias",
|
| 306 |
+
"blocks.1.norm1.weight",
|
| 307 |
+
"blocks.1.norm2.weight",
|
| 308 |
+
"blocks.1.mlp.0.bias",
|
| 309 |
+
"blocks.1.mlp.2.bias",
|
| 310 |
+
"blocks.1.adaLN_modulation.bias",
|
| 311 |
+
"blocks.2.norm1.weight",
|
| 312 |
+
"blocks.2.norm2.weight",
|
| 313 |
+
"blocks.2.mlp.0.bias",
|
| 314 |
+
"blocks.2.mlp.2.bias",
|
| 315 |
+
"blocks.2.adaLN_modulation.bias",
|
| 316 |
+
"output_layer.norm_final.weight",
|
| 317 |
+
"output_layer.adaLN_modulation.bias"
|
| 318 |
+
],
|
| 319 |
+
"muon_effective_nesterov": false,
|
| 320 |
+
"muon_effective_width_scale": false,
|
| 321 |
+
"muon_effective_weight_decay": 0.1,
|
| 322 |
+
"muon_adam_fallback_nesterov": false,
|
| 323 |
+
"muon_adam_fallback_weight_decay": 0.1,
|
| 324 |
+
"ema_decay": 0.9999,
|
| 325 |
+
"ema_start_step": 0,
|
| 326 |
+
"model_type": "ddit",
|
| 327 |
+
"ddit_mlp_type": "gelu",
|
| 328 |
+
"elf_num_time_tokens": 4,
|
| 329 |
+
"elf_num_model_mode_tokens": 0,
|
| 330 |
+
"qk_norm": true,
|
| 331 |
+
"output_bias": false,
|
| 332 |
+
"output_init_std": -1.0,
|
| 333 |
+
"norm_type": "rmsnorm",
|
| 334 |
+
"target_loss": "hard_ce",
|
| 335 |
+
"linear_soft_target_power": 1.0,
|
| 336 |
+
"linear_soft_target_min_conf": 0.0,
|
| 337 |
+
"linear_soft_target_max_conf": 1.0,
|
| 338 |
+
"t_sampling_mode": "logit_normal",
|
| 339 |
+
"t_sampling_power": 1.0,
|
| 340 |
+
"t_sampling_eps": 0.0001,
|
| 341 |
+
"t_sampling_logit_mean": -1.5,
|
| 342 |
+
"t_sampling_logit_std": 0.8,
|
| 343 |
+
"dual_t": true,
|
| 344 |
+
"corrupt_t_mode": "same",
|
| 345 |
+
"corrupt_min_t": 0.0,
|
| 346 |
+
"corrupt_max_t": 1.0,
|
| 347 |
+
"prefix_block_prob": 0.0,
|
| 348 |
+
"prefix_block_len": 128,
|
| 349 |
+
"mask_ratio_floor_schedule": "none",
|
| 350 |
+
"dirichlet_endpoint_mode": "categorical_dual_t",
|
| 351 |
+
"dirichlet_semantic_t_mode": "same",
|
| 352 |
+
"dirichlet_semantic_t_value": 0.0,
|
| 353 |
+
"dirichlet_semantic_t_curve": "linear",
|
| 354 |
+
"dirichlet_semantic_t_power": 1.0,
|
| 355 |
+
"endpoint_sequence_random_prob_alpha": 0.0,
|
| 356 |
+
"categorical_wrong_from_full_vocab": true,
|
| 357 |
+
"categorical_wrong_from_batch_valid_tokens": false,
|
| 358 |
+
"categorical_wrong_basin_token_ids": "",
|
| 359 |
+
"categorical_wrong_basin_prob": 0.0,
|
| 360 |
+
"categorical_wrong_unigram_prob": 1.0,
|
| 361 |
+
"categorical_wrong_uniform_prob": 0.0,
|
| 362 |
+
"categorical_wrong_corpus_unigram_path": "",
|
| 363 |
+
"categorical_wrong_corpus_unigram_alpha": 1.0,
|
| 364 |
+
"categorical_wrong_basin_shared_prob": 0.0,
|
| 365 |
+
"categorical_wrong_unigram_shared_prob": 0.5,
|
| 366 |
+
"mask_mixture_original_prob": 0.0,
|
| 367 |
+
"mask_mixture_lowk_prob": 0.0,
|
| 368 |
+
"mask_mixture_lowcorrupt_prob": 0.0,
|
| 369 |
+
"mask_mixture_block_prob": 0.0,
|
| 370 |
+
"mask_mixture_all_prob": 1.0,
|
| 371 |
+
"mask_mixture_lowk_clean_tokens": "0",
|
| 372 |
+
"mask_mixture_lowcorrupt_tokens": "1,2,4,8,16,32,64",
|
| 373 |
+
"mask_mixture_block_tokens": "64,128",
|
| 374 |
+
"simplex_bridge_sampler": "dirichlet",
|
| 375 |
+
"logistic_normal_sigma_min": 0.1,
|
| 376 |
+
"logistic_normal_sigma_max": 1.0,
|
| 377 |
+
"logistic_normal_tau_min": 1.0,
|
| 378 |
+
"logistic_normal_tau_max": 1.0,
|
| 379 |
+
"torch_compile": false,
|
| 380 |
+
"compile_mode": "max-autotune",
|
| 381 |
+
"state_format": "prob",
|
| 382 |
+
"meanflow_weight": 0.0,
|
| 383 |
+
"rollout_train_prob": 0.0,
|
| 384 |
+
"rollout_train_steps": 1,
|
| 385 |
+
"rollout_train_infer_steps": 64,
|
| 386 |
+
"rollout_train_temp": 1.45,
|
| 387 |
+
"rollout_train_max_gamma": 1.0,
|
| 388 |
+
"rollout_train_corrupt_only": true,
|
| 389 |
+
"rollout_train_samplewise": false,
|
| 390 |
+
"rollout_train_compute_always": false,
|
| 391 |
+
"bridge_noise_init": "logistic_normal",
|
| 392 |
+
"noise_sigma": -1.0,
|
| 393 |
+
"allow_tf32": true,
|
| 394 |
+
"activation_checkpointing": false,
|
| 395 |
+
"activation_checkpoint_interval": 1,
|
| 396 |
+
"activation_checkpoint_scope": "block",
|
| 397 |
+
"ddp_static_graph": false,
|
| 398 |
+
"ddp_gradient_as_bucket_view": true,
|
| 399 |
+
"blocking_data_transfer": false,
|
| 400 |
+
"dataloader_prefetch_factor": 4,
|
| 401 |
+
"full_train_stats": false,
|
| 402 |
+
"tokenized_hf": false,
|
| 403 |
+
"tokenized_pad_token": "pad",
|
| 404 |
+
"elf_conditional_hf": false,
|
| 405 |
+
"record_pad_truncate": false,
|
| 406 |
+
"record_add_eos": false,
|
| 407 |
+
"record_add_special_tokens": false,
|
| 408 |
+
"record_pad_token": "pad",
|
| 409 |
+
"record_shuffle_buffer": 10000,
|
| 410 |
+
"wrap": true,
|
| 411 |
+
"wrap_mode": "stream",
|
| 412 |
+
"wrap_record_buffer_size": 200,
|
| 413 |
+
"owt_cached_chunks": true,
|
| 414 |
+
"owt_chunk_cache_dir": "/e2e-data/evad-tech-vla/wanghan58/data/small_benchmarks/langflow_2604_11748/openwebtext_lta_cached_chunks/gpt2_len256_train8_compact_overfit",
|
| 415 |
+
"owt_chunk_cache_rebuild": false,
|
| 416 |
+
"owt_chunk_cache_write_batch": 4096,
|
| 417 |
+
"owt_exact_repeat_per_chunk": 64,
|
| 418 |
+
"online_chunk_shuffle": false,
|
| 419 |
+
"online_chunk_shuffle_buffer": 10000,
|
| 420 |
+
"openwebtext_split": "train_minus_100k",
|
| 421 |
+
"detokenizer": "auto",
|
| 422 |
+
"resolved_detokenizer": null,
|
| 423 |
+
"num_workers": 0,
|
| 424 |
+
"latest_every": 1000,
|
| 425 |
+
"resume_path": "runs/train8_noisegeo_len256_allcorrupt_unigram_shared0p5_20260517_163805/latest.pt"
|
| 426 |
+
}
|
| 427 |
+
step=1100 epoch=1100/2000 epoch_step=1/1 micro_steps=1100 elapsed=4.8s lr=2.000000e-03 loss=2.2796 loss_recon=2.2796 loss_meanflow=0.0000 mean_model_t=0.2078 mean_corrupt_t=0.2078 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.3636 corrupt_frac=1.0000 acc_corrupt=0.3636 loss_corrupt=2.2796 wrong_frac=0.7922 init_acc_corrupt=0.1148 acc_corrupt_t_0p0_0p2=0.1696 corrupt_frac_t_0p0_0p2=0.5620 acc_corrupt_t_0p2_0p4=0.5667 corrupt_frac_t_0p2_0p4=0.3533 acc_corrupt_t_0p4_0p6=0.7971 corrupt_frac_t_0p4_0p6=0.0755 acc_corrupt_t_0p6_0p8=0.8522 corrupt_frac_t_0p6_0p8=0.0117 out_w_norm=11.0831 out_g_norm=1.6545 acc_corrupt_t_0p8_1p0=0.9609 corrupt_frac_t_0p8_1p0=0.0078 loss_all=1.9833 init_gold_top10=0.2117 init_gold_top100=0.4247
|
| 428 |
+
step=1200 epoch=1200/2000 epoch_step=1/1 micro_steps=1200 elapsed=4.1s lr=2.000000e-03 loss=1.8033 loss_recon=1.8033 loss_meanflow=0.0000 mean_model_t=0.2081 mean_corrupt_t=0.2081 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.4715 corrupt_frac=1.0000 acc_corrupt=0.4715 loss_corrupt=1.8033 wrong_frac=0.7916 init_acc_corrupt=0.1160 acc_corrupt_t_0p0_0p2=0.2501 corrupt_frac_t_0p0_0p2=0.5577 acc_corrupt_t_0p2_0p4=0.7186 corrupt_frac_t_0p2_0p4=0.3599 acc_corrupt_t_0p4_0p6=0.8878 corrupt_frac_t_0p4_0p6=0.0744 acc_corrupt_t_0p6_0p8=0.9219 corrupt_frac_t_0p6_0p8=0.0126 acc_corrupt_t_0p8_1p0=0.9346 corrupt_frac_t_0p8_1p0=0.0078 out_w_norm=11.1814 out_g_norm=1.7198 loss_all=1.5160 init_gold_top10=0.2016 init_gold_top100=0.4192
|
| 429 |
+
step=1300 epoch=1300/2000 epoch_step=1/1 micro_steps=1300 elapsed=4.1s lr=2.000000e-03 loss=1.4037 loss_recon=1.4037 loss_meanflow=0.0000 mean_model_t=0.2105 mean_corrupt_t=0.2105 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.5691 corrupt_frac=1.0000 acc_corrupt=0.5691 loss_corrupt=1.4037 wrong_frac=0.7894 init_acc_corrupt=0.1187 acc_corrupt_t_0p0_0p2=0.3382 corrupt_frac_t_0p0_0p2=0.5542 acc_corrupt_t_0p2_0p4=0.8347 corrupt_frac_t_0p2_0p4=0.3559 acc_corrupt_t_0p4_0p6=0.9404 corrupt_frac_t_0p4_0p6=0.0813 acc_corrupt_t_0p6_0p8=0.9533 corrupt_frac_t_0p6_0p8=0.0125 out_w_norm=11.3091 out_g_norm=1.9176 acc_corrupt_t_0p8_1p0=0.9688 corrupt_frac_t_0p8_1p0=0.0078 loss_all=1.0970 init_gold_top10=0.2133 init_gold_top100=0.4237
|
| 430 |
+
step=1400 epoch=1400/2000 epoch_step=1/1 micro_steps=1400 elapsed=4.1s lr=2.000000e-03 loss=1.1179 loss_recon=1.1179 loss_meanflow=0.0000 mean_model_t=0.2085 mean_corrupt_t=0.2085 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.6413 corrupt_frac=1.0000 acc_corrupt=0.6413 loss_corrupt=1.1179 wrong_frac=0.7915 init_acc_corrupt=0.1161 acc_corrupt_t_0p0_0p2=0.4109 corrupt_frac_t_0p0_0p2=0.5597 acc_corrupt_t_0p2_0p4=0.9243 corrupt_frac_t_0p2_0p4=0.3559 acc_corrupt_t_0p4_0p6=0.9755 corrupt_frac_t_0p4_0p6=0.0752 acc_corrupt_t_0p6_0p8=0.9771 corrupt_frac_t_0p6_0p8=0.0131 out_w_norm=11.4008 out_g_norm=1.8839 acc_corrupt_t_0p8_1p0=0.9844 corrupt_frac_t_0p8_1p0=0.0078 loss_all=1.0257 init_gold_top10=0.2146 init_gold_top100=0.4290
|
| 431 |
+
step=1500 epoch=1500/2000 epoch_step=1/1 micro_steps=1500 elapsed=4.1s lr=2.000000e-03 loss=0.9652 loss_recon=0.9652 loss_meanflow=0.0000 mean_model_t=0.2084 mean_corrupt_t=0.2084 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.6788 corrupt_frac=1.0000 acc_corrupt=0.6788 loss_corrupt=0.9652 wrong_frac=0.7914 init_acc_corrupt=0.1160 acc_corrupt_t_0p0_0p2=0.4517 corrupt_frac_t_0p0_0p2=0.5616 acc_corrupt_t_0p2_0p4=0.9651 corrupt_frac_t_0p2_0p4=0.3550 acc_corrupt_t_0p4_0p6=0.9896 corrupt_frac_t_0p4_0p6=0.0741 acc_corrupt_t_0p6_0p8=0.9916 corrupt_frac_t_0p6_0p8=0.0143 out_w_norm=11.4319 out_g_norm=1.6646 loss_all=0.9362 init_gold_top10=0.1906 init_gold_top100=0.4129
|
| 432 |
+
step=1600 epoch=1600/2000 epoch_step=1/1 micro_steps=1600 elapsed=4.1s lr=2.000000e-03 loss=0.8711 loss_recon=0.8711 loss_meanflow=0.0000 mean_model_t=0.2084 mean_corrupt_t=0.2084 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.7041 corrupt_frac=1.0000 acc_corrupt=0.7041 loss_corrupt=0.8711 wrong_frac=0.7917 init_acc_corrupt=0.1154 acc_corrupt_t_0p0_0p2=0.4811 corrupt_frac_t_0p0_0p2=0.5568 acc_corrupt_t_0p2_0p4=0.9818 corrupt_frac_t_0p2_0p4=0.3606 acc_corrupt_t_0p4_0p6=0.9955 corrupt_frac_t_0p4_0p6=0.0742 acc_corrupt_t_0p6_0p8=0.9946 corrupt_frac_t_0p6_0p8=0.0123 out_w_norm=11.4282 out_g_norm=1.3996 loss_all=0.8841 init_gold_top10=0.1772 init_gold_top100=0.4130
|
| 433 |
+
step=1700 epoch=1700/2000 epoch_step=1/1 micro_steps=1700 elapsed=4.1s lr=2.000000e-03 loss=0.8025 loss_recon=0.8025 loss_meanflow=0.0000 mean_model_t=0.2079 mean_corrupt_t=0.2079 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.7223 corrupt_frac=1.0000 acc_corrupt=0.7223 loss_corrupt=0.8025 wrong_frac=0.7925 init_acc_corrupt=0.1147 acc_corrupt_t_0p0_0p2=0.5119 corrupt_frac_t_0p0_0p2=0.5607 acc_corrupt_t_0p2_0p4=0.9893 corrupt_frac_t_0p2_0p4=0.3568 acc_corrupt_t_0p4_0p6=0.9980 corrupt_frac_t_0p4_0p6=0.0739 acc_corrupt_t_0p6_0p8=0.9964 corrupt_frac_t_0p6_0p8=0.0132 out_w_norm=11.4249 out_g_norm=1.2780 loss_all=0.7132 init_gold_top10=0.2137 init_gold_top100=0.4244
|
| 434 |
+
step=1800 epoch=1800/2000 epoch_step=1/1 micro_steps=1800 elapsed=4.1s lr=2.000000e-03 loss=0.7516 loss_recon=0.7516 loss_meanflow=0.0000 mean_model_t=0.2099 mean_corrupt_t=0.2099 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.7385 corrupt_frac=1.0000 acc_corrupt=0.7385 loss_corrupt=0.7516 wrong_frac=0.7904 init_acc_corrupt=0.1168 acc_corrupt_t_0p0_0p2=0.5306 corrupt_frac_t_0p0_0p2=0.5505 acc_corrupt_t_0p2_0p4=0.9921 corrupt_frac_t_0p2_0p4=0.3649 acc_corrupt_t_0p4_0p6=0.9986 corrupt_frac_t_0p4_0p6=0.0765 out_w_norm=11.4288 out_g_norm=1.0979 acc_corrupt_t_0p6_0p8=0.9968 corrupt_frac_t_0p6_0p8=0.0123 acc_corrupt_t_0p8_1p0=0.9922 corrupt_frac_t_0p8_1p0=0.0078 loss_all=0.6790 init_gold_top10=0.1935 init_gold_top100=0.4118
|
| 435 |
+
step=1900 epoch=1900/2000 epoch_step=1/1 micro_steps=1900 elapsed=4.1s lr=2.000000e-03 loss=0.7025 loss_recon=0.7025 loss_meanflow=0.0000 mean_model_t=0.2102 mean_corrupt_t=0.2102 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.7553 corrupt_frac=1.0000 acc_corrupt=0.7553 loss_corrupt=0.7025 wrong_frac=0.7901 init_acc_corrupt=0.1169 acc_corrupt_t_0p0_0p2=0.5593 corrupt_frac_t_0p0_0p2=0.5513 acc_corrupt_t_0p2_0p4=0.9956 corrupt_frac_t_0p2_0p4=0.3645 acc_corrupt_t_0p4_0p6=0.9994 corrupt_frac_t_0p4_0p6=0.0755 out_w_norm=11.4342 out_g_norm=0.9973 acc_corrupt_t_0p6_0p8=0.9980 corrupt_frac_t_0p6_0p8=0.0127 acc_corrupt_t_0p8_1p0=0.9961 corrupt_frac_t_0p8_1p0=0.0078 loss_all=0.6958 init_gold_top10=0.1910 init_gold_top100=0.4159
|
| 436 |
+
step=2000 epoch=2000/2000 epoch_step=1/1 micro_steps=2000 elapsed=4.1s lr=2.000000e-03 loss=0.6663 loss_recon=0.6663 loss_meanflow=0.0000 mean_model_t=0.2101 mean_corrupt_t=0.2101 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.7647 corrupt_frac=1.0000 acc_corrupt=0.7647 loss_corrupt=0.6663 wrong_frac=0.7897 init_acc_corrupt=0.1182 acc_corrupt_t_0p0_0p2=0.5808 corrupt_frac_t_0p0_0p2=0.5591 acc_corrupt_t_0p2_0p4=0.9975 corrupt_frac_t_0p2_0p4=0.3526 acc_corrupt_t_0p4_0p6=0.9996 corrupt_frac_t_0p4_0p6=0.0791 acc_corrupt_t_0p8_1p0=0.9785 corrupt_frac_t_0p8_1p0=0.0078 out_w_norm=11.4442 out_g_norm=0.8627 acc_corrupt_t_0p6_0p8=0.9991 corrupt_frac_t_0p6_0p8=0.0133 loss_all=0.6893 init_gold_top10=0.1951 init_gold_top100=0.4188
|
| 437 |
+
NCCL version 2.25.1+cuda12.8
|
| 438 |
+
resumed_from=runs/train8_noisegeo_len256_allcorrupt_unigram_shared0p5_20260517_163805/latest.pt start_step=2001
|
| 439 |
+
{
|
| 440 |
+
"device": "cuda:0",
|
| 441 |
+
"rank": 0,
|
| 442 |
+
"world_size": 4,
|
| 443 |
+
"samples": "owt_cached_chunks:8",
|
| 444 |
+
"vocab_size": 969,
|
| 445 |
+
"tokenizer_vocab_size": 50257,
|
| 446 |
+
"save_dir": "runs/train8_noisegeo_len256_allcorrupt_unigram_shared0p5_20260517_163805",
|
| 447 |
+
"batch_size": 128,
|
| 448 |
+
"grad_accum": 1,
|
| 449 |
+
"effective_batch_size": 512,
|
| 450 |
+
"global_batch_size": 512,
|
| 451 |
+
"lr_schedule": "constant_warmup",
|
| 452 |
+
"optimizer": "muon",
|
| 453 |
+
"epochs": 0.0,
|
| 454 |
+
"steps_per_epoch": 1,
|
| 455 |
+
"total_steps": 3000,
|
| 456 |
+
"warmup_steps": 10,
|
| 457 |
+
"warmup_epochs": -1.0,
|
| 458 |
+
"min_lr": 0.0,
|
| 459 |
+
"weight_decay": 0.1,
|
| 460 |
+
"output_weight_decay": -1.0,
|
| 461 |
+
"adamw_param_groups": "nanogpt",
|
| 462 |
+
"adam_beta1": 0.9,
|
| 463 |
+
"adam_beta2": 0.95,
|
| 464 |
+
"adam_eps": 1e-08,
|
| 465 |
+
"muon_impl": "legacy",
|
| 466 |
+
"muon_momentum": 0.95,
|
| 467 |
+
"muon_ns_steps": 5,
|
| 468 |
+
"muon_update_scale": 1.0,
|
| 469 |
+
"muon_nesterov": false,
|
| 470 |
+
"muon_width_scale": false,
|
| 471 |
+
"muon_grouping": "legacy_dim_ge_2",
|
| 472 |
+
"muon_param_count": 1965440,
|
| 473 |
+
"muon_adam_param_count": 8192,
|
| 474 |
+
"muon_param_names": [
|
| 475 |
+
"vocab_embed.embedding",
|
| 476 |
+
"sigma_map.net.0.weight",
|
| 477 |
+
"sigma_map.net.2.weight",
|
| 478 |
+
"blocks.0.attn_qkv.weight",
|
| 479 |
+
"blocks.0.attn_out.weight",
|
| 480 |
+
"blocks.0.mlp.0.weight",
|
| 481 |
+
"blocks.0.mlp.2.weight",
|
| 482 |
+
"blocks.0.adaLN_modulation.weight",
|
| 483 |
+
"blocks.1.attn_qkv.weight",
|
| 484 |
+
"blocks.1.attn_out.weight",
|
| 485 |
+
"blocks.1.mlp.0.weight",
|
| 486 |
+
"blocks.1.mlp.2.weight",
|
| 487 |
+
"blocks.1.adaLN_modulation.weight",
|
| 488 |
+
"blocks.2.attn_qkv.weight",
|
| 489 |
+
"blocks.2.attn_out.weight",
|
| 490 |
+
"blocks.2.mlp.0.weight",
|
| 491 |
+
"blocks.2.mlp.2.weight",
|
| 492 |
+
"blocks.2.adaLN_modulation.weight",
|
| 493 |
+
"output_layer.linear.weight",
|
| 494 |
+
"output_layer.adaLN_modulation.weight"
|
| 495 |
+
],
|
| 496 |
+
"muon_adam_param_names": [
|
| 497 |
+
"sigma_map.net.0.bias",
|
| 498 |
+
"sigma_map.net.2.bias",
|
| 499 |
+
"blocks.0.norm1.weight",
|
| 500 |
+
"blocks.0.norm2.weight",
|
| 501 |
+
"blocks.0.mlp.0.bias",
|
| 502 |
+
"blocks.0.mlp.2.bias",
|
| 503 |
+
"blocks.0.adaLN_modulation.bias",
|
| 504 |
+
"blocks.1.norm1.weight",
|
| 505 |
+
"blocks.1.norm2.weight",
|
| 506 |
+
"blocks.1.mlp.0.bias",
|
| 507 |
+
"blocks.1.mlp.2.bias",
|
| 508 |
+
"blocks.1.adaLN_modulation.bias",
|
| 509 |
+
"blocks.2.norm1.weight",
|
| 510 |
+
"blocks.2.norm2.weight",
|
| 511 |
+
"blocks.2.mlp.0.bias",
|
| 512 |
+
"blocks.2.mlp.2.bias",
|
| 513 |
+
"blocks.2.adaLN_modulation.bias",
|
| 514 |
+
"output_layer.norm_final.weight",
|
| 515 |
+
"output_layer.adaLN_modulation.bias"
|
| 516 |
+
],
|
| 517 |
+
"muon_effective_nesterov": false,
|
| 518 |
+
"muon_effective_width_scale": false,
|
| 519 |
+
"muon_effective_weight_decay": 0.1,
|
| 520 |
+
"muon_adam_fallback_nesterov": false,
|
| 521 |
+
"muon_adam_fallback_weight_decay": 0.1,
|
| 522 |
+
"ema_decay": 0.9999,
|
| 523 |
+
"ema_start_step": 0,
|
| 524 |
+
"model_type": "ddit",
|
| 525 |
+
"ddit_mlp_type": "gelu",
|
| 526 |
+
"elf_num_time_tokens": 4,
|
| 527 |
+
"elf_num_model_mode_tokens": 0,
|
| 528 |
+
"qk_norm": true,
|
| 529 |
+
"output_bias": false,
|
| 530 |
+
"output_init_std": -1.0,
|
| 531 |
+
"norm_type": "rmsnorm",
|
| 532 |
+
"target_loss": "hard_ce",
|
| 533 |
+
"linear_soft_target_power": 1.0,
|
| 534 |
+
"linear_soft_target_min_conf": 0.0,
|
| 535 |
+
"linear_soft_target_max_conf": 1.0,
|
| 536 |
+
"t_sampling_mode": "logit_normal",
|
| 537 |
+
"t_sampling_power": 1.0,
|
| 538 |
+
"t_sampling_eps": 0.0001,
|
| 539 |
+
"t_sampling_logit_mean": -1.5,
|
| 540 |
+
"t_sampling_logit_std": 0.8,
|
| 541 |
+
"dual_t": true,
|
| 542 |
+
"corrupt_t_mode": "same",
|
| 543 |
+
"corrupt_min_t": 0.0,
|
| 544 |
+
"corrupt_max_t": 1.0,
|
| 545 |
+
"prefix_block_prob": 0.0,
|
| 546 |
+
"prefix_block_len": 128,
|
| 547 |
+
"mask_ratio_floor_schedule": "none",
|
| 548 |
+
"dirichlet_endpoint_mode": "categorical_dual_t",
|
| 549 |
+
"dirichlet_semantic_t_mode": "same",
|
| 550 |
+
"dirichlet_semantic_t_value": 0.0,
|
| 551 |
+
"dirichlet_semantic_t_curve": "linear",
|
| 552 |
+
"dirichlet_semantic_t_power": 1.0,
|
| 553 |
+
"endpoint_sequence_random_prob_alpha": 0.0,
|
| 554 |
+
"categorical_wrong_from_full_vocab": true,
|
| 555 |
+
"categorical_wrong_from_batch_valid_tokens": false,
|
| 556 |
+
"categorical_wrong_basin_token_ids": "",
|
| 557 |
+
"categorical_wrong_basin_prob": 0.0,
|
| 558 |
+
"categorical_wrong_unigram_prob": 1.0,
|
| 559 |
+
"categorical_wrong_uniform_prob": 0.0,
|
| 560 |
+
"categorical_wrong_corpus_unigram_path": "",
|
| 561 |
+
"categorical_wrong_corpus_unigram_alpha": 1.0,
|
| 562 |
+
"categorical_wrong_basin_shared_prob": 0.0,
|
| 563 |
+
"categorical_wrong_unigram_shared_prob": 0.5,
|
| 564 |
+
"mask_mixture_original_prob": 0.0,
|
| 565 |
+
"mask_mixture_lowk_prob": 0.0,
|
| 566 |
+
"mask_mixture_lowcorrupt_prob": 0.0,
|
| 567 |
+
"mask_mixture_block_prob": 0.0,
|
| 568 |
+
"mask_mixture_all_prob": 1.0,
|
| 569 |
+
"mask_mixture_lowk_clean_tokens": "0",
|
| 570 |
+
"mask_mixture_lowcorrupt_tokens": "1,2,4,8,16,32,64",
|
| 571 |
+
"mask_mixture_block_tokens": "64,128",
|
| 572 |
+
"simplex_bridge_sampler": "dirichlet",
|
| 573 |
+
"logistic_normal_sigma_min": 0.1,
|
| 574 |
+
"logistic_normal_sigma_max": 1.0,
|
| 575 |
+
"logistic_normal_tau_min": 1.0,
|
| 576 |
+
"logistic_normal_tau_max": 1.0,
|
| 577 |
+
"torch_compile": false,
|
| 578 |
+
"compile_mode": "max-autotune",
|
| 579 |
+
"state_format": "prob",
|
| 580 |
+
"meanflow_weight": 0.0,
|
| 581 |
+
"rollout_train_prob": 0.0,
|
| 582 |
+
"rollout_train_steps": 1,
|
| 583 |
+
"rollout_train_infer_steps": 64,
|
| 584 |
+
"rollout_train_temp": 1.45,
|
| 585 |
+
"rollout_train_max_gamma": 1.0,
|
| 586 |
+
"rollout_train_corrupt_only": true,
|
| 587 |
+
"rollout_train_samplewise": false,
|
| 588 |
+
"rollout_train_compute_always": false,
|
| 589 |
+
"bridge_noise_init": "logistic_normal",
|
| 590 |
+
"noise_sigma": -1.0,
|
| 591 |
+
"allow_tf32": true,
|
| 592 |
+
"activation_checkpointing": false,
|
| 593 |
+
"activation_checkpoint_interval": 1,
|
| 594 |
+
"activation_checkpoint_scope": "block",
|
| 595 |
+
"ddp_static_graph": false,
|
| 596 |
+
"ddp_gradient_as_bucket_view": true,
|
| 597 |
+
"blocking_data_transfer": false,
|
| 598 |
+
"dataloader_prefetch_factor": 4,
|
| 599 |
+
"full_train_stats": false,
|
| 600 |
+
"tokenized_hf": false,
|
| 601 |
+
"tokenized_pad_token": "pad",
|
| 602 |
+
"elf_conditional_hf": false,
|
| 603 |
+
"record_pad_truncate": false,
|
| 604 |
+
"record_add_eos": false,
|
| 605 |
+
"record_add_special_tokens": false,
|
| 606 |
+
"record_pad_token": "pad",
|
| 607 |
+
"record_shuffle_buffer": 10000,
|
| 608 |
+
"wrap": true,
|
| 609 |
+
"wrap_mode": "stream",
|
| 610 |
+
"wrap_record_buffer_size": 200,
|
| 611 |
+
"owt_cached_chunks": true,
|
| 612 |
+
"owt_chunk_cache_dir": "/e2e-data/evad-tech-vla/wanghan58/data/small_benchmarks/langflow_2604_11748/openwebtext_lta_cached_chunks/gpt2_len256_train8_compact_overfit",
|
| 613 |
+
"owt_chunk_cache_rebuild": false,
|
| 614 |
+
"owt_chunk_cache_write_batch": 4096,
|
| 615 |
+
"owt_exact_repeat_per_chunk": 64,
|
| 616 |
+
"online_chunk_shuffle": false,
|
| 617 |
+
"online_chunk_shuffle_buffer": 10000,
|
| 618 |
+
"openwebtext_split": "train_minus_100k",
|
| 619 |
+
"detokenizer": "auto",
|
| 620 |
+
"resolved_detokenizer": null,
|
| 621 |
+
"num_workers": 0,
|
| 622 |
+
"latest_every": 1000,
|
| 623 |
+
"resume_path": "runs/train8_noisegeo_len256_allcorrupt_unigram_shared0p5_20260517_163805/latest.pt"
|
| 624 |
+
}
|
| 625 |
+
step=2100 epoch=2100/3000 epoch_step=1/1 micro_steps=2100 elapsed=4.5s lr=2.000000e-03 loss=0.6586 loss_recon=0.6586 loss_meanflow=0.0000 mean_model_t=0.2078 mean_corrupt_t=0.2078 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.7668 corrupt_frac=1.0000 acc_corrupt=0.7668 loss_corrupt=0.6586 wrong_frac=0.7922 init_acc_corrupt=0.1148 acc_corrupt_t_0p0_0p2=0.5865 corrupt_frac_t_0p0_0p2=0.5620 acc_corrupt_t_0p2_0p4=0.9977 corrupt_frac_t_0p2_0p4=0.3533 acc_corrupt_t_0p4_0p6=0.9996 corrupt_frac_t_0p4_0p6=0.0755 acc_corrupt_t_0p6_0p8=0.9993 corrupt_frac_t_0p6_0p8=0.0117 out_w_norm=11.4584 out_g_norm=0.8178 acc_corrupt_t_0p8_1p0=0.9922 corrupt_frac_t_0p8_1p0=0.0078 loss_all=0.6532 init_gold_top10=0.2117 init_gold_top100=0.4247
|
| 626 |
+
step=2200 epoch=2200/3000 epoch_step=1/1 micro_steps=2200 elapsed=3.8s lr=2.000000e-03 loss=0.6295 loss_recon=0.6295 loss_meanflow=0.0000 mean_model_t=0.2081 mean_corrupt_t=0.2081 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.7760 corrupt_frac=1.0000 acc_corrupt=0.7760 loss_corrupt=0.6295 wrong_frac=0.7916 init_acc_corrupt=0.1160 acc_corrupt_t_0p0_0p2=0.5995 corrupt_frac_t_0p0_0p2=0.5577 acc_corrupt_t_0p2_0p4=0.9984 corrupt_frac_t_0p2_0p4=0.3599 acc_corrupt_t_0p4_0p6=0.9998 corrupt_frac_t_0p4_0p6=0.0744 acc_corrupt_t_0p6_0p8=0.9990 corrupt_frac_t_0p6_0p8=0.0126 acc_corrupt_t_0p8_1p0=0.9961 corrupt_frac_t_0p8_1p0=0.0078 out_w_norm=11.4663 out_g_norm=0.7199 loss_all=0.6324 init_gold_top10=0.2016 init_gold_top100=0.4192
|
| 627 |
+
step=2300 epoch=2300/3000 epoch_step=1/1 micro_steps=2300 elapsed=3.8s lr=2.000000e-03 loss=0.6050 loss_recon=0.6050 loss_meanflow=0.0000 mean_model_t=0.2105 mean_corrupt_t=0.2105 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.7846 corrupt_frac=1.0000 acc_corrupt=0.7846 loss_corrupt=0.6050 wrong_frac=0.7894 init_acc_corrupt=0.1187 acc_corrupt_t_0p0_0p2=0.6123 corrupt_frac_t_0p0_0p2=0.5542 acc_corrupt_t_0p2_0p4=0.9985 corrupt_frac_t_0p2_0p4=0.3559 acc_corrupt_t_0p4_0p6=0.9998 corrupt_frac_t_0p4_0p6=0.0813 acc_corrupt_t_0p6_0p8=0.9993 corrupt_frac_t_0p6_0p8=0.0125 out_w_norm=11.4696 out_g_norm=0.6746 acc_corrupt_t_0p8_1p0=0.9961 corrupt_frac_t_0p8_1p0=0.0078 loss_all=0.4587 init_gold_top10=0.2133 init_gold_top100=0.4237
|
| 628 |
+
step=2400 epoch=2400/3000 epoch_step=1/1 micro_steps=2400 elapsed=3.8s lr=2.000000e-03 loss=0.5927 loss_recon=0.5927 loss_meanflow=0.0000 mean_model_t=0.2085 mean_corrupt_t=0.2085 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.7908 corrupt_frac=1.0000 acc_corrupt=0.7908 loss_corrupt=0.5927 wrong_frac=0.7915 init_acc_corrupt=0.1161 acc_corrupt_t_0p0_0p2=0.6268 corrupt_frac_t_0p0_0p2=0.5597 acc_corrupt_t_0p2_0p4=0.9991 corrupt_frac_t_0p2_0p4=0.3559 acc_corrupt_t_0p4_0p6=0.9998 corrupt_frac_t_0p4_0p6=0.0752 acc_corrupt_t_0p6_0p8=0.9984 corrupt_frac_t_0p6_0p8=0.0131 out_w_norm=11.4838 out_g_norm=0.6499 acc_corrupt_t_0p8_1p0=0.9980 corrupt_frac_t_0p8_1p0=0.0078 loss_all=0.5744 init_gold_top10=0.2146 init_gold_top100=0.4290
|
| 629 |
+
step=2500 epoch=2500/3000 epoch_step=1/1 micro_steps=2500 elapsed=3.8s lr=2.000000e-03 loss=0.5809 loss_recon=0.5809 loss_meanflow=0.0000 mean_model_t=0.2084 mean_corrupt_t=0.2084 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.7926 corrupt_frac=1.0000 acc_corrupt=0.7926 loss_corrupt=0.5809 wrong_frac=0.7914 init_acc_corrupt=0.1160 acc_corrupt_t_0p0_0p2=0.6313 corrupt_frac_t_0p0_0p2=0.5616 acc_corrupt_t_0p2_0p4=0.9991 corrupt_frac_t_0p2_0p4=0.3550 acc_corrupt_t_0p4_0p6=0.9998 corrupt_frac_t_0p4_0p6=0.0741 acc_corrupt_t_0p6_0p8=0.9993 corrupt_frac_t_0p6_0p8=0.0143 out_w_norm=11.5089 out_g_norm=0.5936 loss_all=0.5771 init_gold_top10=0.1906 init_gold_top100=0.4129
|
| 630 |
+
step=2600 epoch=2600/3000 epoch_step=1/1 micro_steps=2600 elapsed=3.8s lr=2.000000e-03 loss=0.5694 loss_recon=0.5694 loss_meanflow=0.0000 mean_model_t=0.2084 mean_corrupt_t=0.2084 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.7986 corrupt_frac=1.0000 acc_corrupt=0.7986 loss_corrupt=0.5694 wrong_frac=0.7917 init_acc_corrupt=0.1154 acc_corrupt_t_0p0_0p2=0.6388 corrupt_frac_t_0p0_0p2=0.5568 acc_corrupt_t_0p2_0p4=0.9994 corrupt_frac_t_0p2_0p4=0.3606 acc_corrupt_t_0p4_0p6=0.9998 corrupt_frac_t_0p4_0p6=0.0742 acc_corrupt_t_0p6_0p8=0.9993 corrupt_frac_t_0p6_0p8=0.0123 out_w_norm=11.5321 out_g_norm=0.5380 loss_all=0.6365 init_gold_top10=0.1772 init_gold_top100=0.4130
|
| 631 |
+
step=2700 epoch=2700/3000 epoch_step=1/1 micro_steps=2700 elapsed=3.8s lr=2.000000e-03 loss=0.5522 loss_recon=0.5522 loss_meanflow=0.0000 mean_model_t=0.2079 mean_corrupt_t=0.2079 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.8042 corrupt_frac=1.0000 acc_corrupt=0.8042 loss_corrupt=0.5522 wrong_frac=0.7925 init_acc_corrupt=0.1147 acc_corrupt_t_0p0_0p2=0.6515 corrupt_frac_t_0p0_0p2=0.5607 acc_corrupt_t_0p2_0p4=0.9990 corrupt_frac_t_0p2_0p4=0.3568 acc_corrupt_t_0p4_0p6=0.9999 corrupt_frac_t_0p4_0p6=0.0739 acc_corrupt_t_0p6_0p8=0.9993 corrupt_frac_t_0p6_0p8=0.0132 out_w_norm=11.5532 out_g_norm=0.5052 loss_all=0.5069 init_gold_top10=0.2137 init_gold_top100=0.4244
|
| 632 |
+
step=2800 epoch=2800/3000 epoch_step=1/1 micro_steps=2800 elapsed=3.8s lr=2.000000e-03 loss=0.5327 loss_recon=0.5327 loss_meanflow=0.0000 mean_model_t=0.2099 mean_corrupt_t=0.2099 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.8109 corrupt_frac=1.0000 acc_corrupt=0.8109 loss_corrupt=0.5327 wrong_frac=0.7904 init_acc_corrupt=0.1168 acc_corrupt_t_0p0_0p2=0.6569 corrupt_frac_t_0p0_0p2=0.5505 acc_corrupt_t_0p2_0p4=0.9994 corrupt_frac_t_0p2_0p4=0.3649 acc_corrupt_t_0p4_0p6=0.9998 corrupt_frac_t_0p4_0p6=0.0765 out_w_norm=11.5678 out_g_norm=0.4714 acc_corrupt_t_0p6_0p8=0.9993 corrupt_frac_t_0p6_0p8=0.0123 acc_corrupt_t_0p8_1p0=0.9922 corrupt_frac_t_0p8_1p0=0.0078 loss_all=0.4982 init_gold_top10=0.1935 init_gold_top100=0.4118
|
| 633 |
+
step=2900 epoch=2900/3000 epoch_step=1/1 micro_steps=2900 elapsed=3.8s lr=2.000000e-03 loss=0.5118 loss_recon=0.5118 loss_meanflow=0.0000 mean_model_t=0.2102 mean_corrupt_t=0.2102 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.8184 corrupt_frac=1.0000 acc_corrupt=0.8184 loss_corrupt=0.5118 wrong_frac=0.7901 init_acc_corrupt=0.1169 acc_corrupt_t_0p0_0p2=0.6710 corrupt_frac_t_0p0_0p2=0.5513 acc_corrupt_t_0p2_0p4=0.9994 corrupt_frac_t_0p2_0p4=0.3645 acc_corrupt_t_0p4_0p6=0.9999 corrupt_frac_t_0p4_0p6=0.0755 out_w_norm=11.5866 out_g_norm=0.4311 acc_corrupt_t_0p6_0p8=0.9997 corrupt_frac_t_0p6_0p8=0.0127 acc_corrupt_t_0p8_1p0=1.0000 corrupt_frac_t_0p8_1p0=0.0078 loss_all=0.4894 init_gold_top10=0.1910 init_gold_top100=0.4159
|
| 634 |
+
step=3000 epoch=3000/3000 epoch_step=1/1 micro_steps=3000 elapsed=3.8s lr=2.000000e-03 loss=0.5007 loss_recon=0.5007 loss_meanflow=0.0000 mean_model_t=0.2101 mean_corrupt_t=0.2101 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.8232 corrupt_frac=1.0000 acc_corrupt=0.8232 loss_corrupt=0.5007 wrong_frac=0.7897 init_acc_corrupt=0.1182 acc_corrupt_t_0p0_0p2=0.6841 corrupt_frac_t_0p0_0p2=0.5591 acc_corrupt_t_0p2_0p4=0.9997 corrupt_frac_t_0p2_0p4=0.3526 acc_corrupt_t_0p4_0p6=0.9998 corrupt_frac_t_0p4_0p6=0.0791 acc_corrupt_t_0p8_1p0=0.9922 corrupt_frac_t_0p8_1p0=0.0078 out_w_norm=11.6083 out_g_norm=0.3939 acc_corrupt_t_0p6_0p8=0.9997 corrupt_frac_t_0p6_0p8=0.0133 loss_all=0.5446 init_gold_top10=0.1951 init_gold_top100=0.4188
|
LTA_openwebtext_dualt/logs/softendpoint_mn_pilot_4gpu/train8_rollin_focused_len256_rollin_p100_s4_i32_20260517_1733focused.log
ADDED
|
@@ -0,0 +1,193 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
NCCL version 2.25.1+cuda12.8
|
| 2 |
+
{
|
| 3 |
+
"device": "cuda:0",
|
| 4 |
+
"rank": 0,
|
| 5 |
+
"world_size": 4,
|
| 6 |
+
"samples": "owt_cached_chunks:8",
|
| 7 |
+
"vocab_size": 969,
|
| 8 |
+
"tokenizer_vocab_size": 50257,
|
| 9 |
+
"save_dir": "runs/train8_rollin_focused_len256_rollin_p100_s4_i32_20260517_1733focused",
|
| 10 |
+
"batch_size": 128,
|
| 11 |
+
"grad_accum": 1,
|
| 12 |
+
"effective_batch_size": 512,
|
| 13 |
+
"global_batch_size": 512,
|
| 14 |
+
"lr_schedule": "constant_warmup",
|
| 15 |
+
"optimizer": "muon",
|
| 16 |
+
"epochs": 0.0,
|
| 17 |
+
"steps_per_epoch": 1,
|
| 18 |
+
"total_steps": 500,
|
| 19 |
+
"warmup_steps": 10,
|
| 20 |
+
"warmup_epochs": -1.0,
|
| 21 |
+
"min_lr": 0.0,
|
| 22 |
+
"weight_decay": 0.1,
|
| 23 |
+
"output_weight_decay": -1.0,
|
| 24 |
+
"adamw_param_groups": "nanogpt",
|
| 25 |
+
"adam_beta1": 0.9,
|
| 26 |
+
"adam_beta2": 0.95,
|
| 27 |
+
"adam_eps": 1e-08,
|
| 28 |
+
"muon_impl": "legacy",
|
| 29 |
+
"muon_momentum": 0.95,
|
| 30 |
+
"muon_ns_steps": 5,
|
| 31 |
+
"muon_update_scale": 1.0,
|
| 32 |
+
"muon_nesterov": false,
|
| 33 |
+
"muon_width_scale": false,
|
| 34 |
+
"muon_grouping": "legacy_dim_ge_2",
|
| 35 |
+
"muon_param_count": 1965440,
|
| 36 |
+
"muon_adam_param_count": 8192,
|
| 37 |
+
"muon_param_names": [
|
| 38 |
+
"vocab_embed.embedding",
|
| 39 |
+
"sigma_map.net.0.weight",
|
| 40 |
+
"sigma_map.net.2.weight",
|
| 41 |
+
"blocks.0.attn_qkv.weight",
|
| 42 |
+
"blocks.0.attn_out.weight",
|
| 43 |
+
"blocks.0.mlp.0.weight",
|
| 44 |
+
"blocks.0.mlp.2.weight",
|
| 45 |
+
"blocks.0.adaLN_modulation.weight",
|
| 46 |
+
"blocks.1.attn_qkv.weight",
|
| 47 |
+
"blocks.1.attn_out.weight",
|
| 48 |
+
"blocks.1.mlp.0.weight",
|
| 49 |
+
"blocks.1.mlp.2.weight",
|
| 50 |
+
"blocks.1.adaLN_modulation.weight",
|
| 51 |
+
"blocks.2.attn_qkv.weight",
|
| 52 |
+
"blocks.2.attn_out.weight",
|
| 53 |
+
"blocks.2.mlp.0.weight",
|
| 54 |
+
"blocks.2.mlp.2.weight",
|
| 55 |
+
"blocks.2.adaLN_modulation.weight",
|
| 56 |
+
"output_layer.linear.weight",
|
| 57 |
+
"output_layer.adaLN_modulation.weight"
|
| 58 |
+
],
|
| 59 |
+
"muon_adam_param_names": [
|
| 60 |
+
"sigma_map.net.0.bias",
|
| 61 |
+
"sigma_map.net.2.bias",
|
| 62 |
+
"blocks.0.norm1.weight",
|
| 63 |
+
"blocks.0.norm2.weight",
|
| 64 |
+
"blocks.0.mlp.0.bias",
|
| 65 |
+
"blocks.0.mlp.2.bias",
|
| 66 |
+
"blocks.0.adaLN_modulation.bias",
|
| 67 |
+
"blocks.1.norm1.weight",
|
| 68 |
+
"blocks.1.norm2.weight",
|
| 69 |
+
"blocks.1.mlp.0.bias",
|
| 70 |
+
"blocks.1.mlp.2.bias",
|
| 71 |
+
"blocks.1.adaLN_modulation.bias",
|
| 72 |
+
"blocks.2.norm1.weight",
|
| 73 |
+
"blocks.2.norm2.weight",
|
| 74 |
+
"blocks.2.mlp.0.bias",
|
| 75 |
+
"blocks.2.mlp.2.bias",
|
| 76 |
+
"blocks.2.adaLN_modulation.bias",
|
| 77 |
+
"output_layer.norm_final.weight",
|
| 78 |
+
"output_layer.adaLN_modulation.bias"
|
| 79 |
+
],
|
| 80 |
+
"muon_effective_nesterov": false,
|
| 81 |
+
"muon_effective_width_scale": false,
|
| 82 |
+
"muon_effective_weight_decay": 0.1,
|
| 83 |
+
"muon_adam_fallback_nesterov": false,
|
| 84 |
+
"muon_adam_fallback_weight_decay": 0.1,
|
| 85 |
+
"ema_decay": 0.9999,
|
| 86 |
+
"ema_start_step": 0,
|
| 87 |
+
"model_type": "ddit",
|
| 88 |
+
"ddit_mlp_type": "gelu",
|
| 89 |
+
"elf_num_time_tokens": 4,
|
| 90 |
+
"elf_num_model_mode_tokens": 0,
|
| 91 |
+
"qk_norm": true,
|
| 92 |
+
"output_bias": false,
|
| 93 |
+
"output_init_std": -1.0,
|
| 94 |
+
"norm_type": "rmsnorm",
|
| 95 |
+
"target_loss": "hard_ce",
|
| 96 |
+
"linear_soft_target_power": 1.0,
|
| 97 |
+
"linear_soft_target_min_conf": 0.0,
|
| 98 |
+
"linear_soft_target_max_conf": 1.0,
|
| 99 |
+
"t_sampling_mode": "logit_normal",
|
| 100 |
+
"t_sampling_power": 1.0,
|
| 101 |
+
"t_sampling_eps": 0.0001,
|
| 102 |
+
"t_sampling_logit_mean": -1.5,
|
| 103 |
+
"t_sampling_logit_std": 0.8,
|
| 104 |
+
"dual_t": true,
|
| 105 |
+
"corrupt_t_mode": "same",
|
| 106 |
+
"corrupt_min_t": 0.0,
|
| 107 |
+
"corrupt_max_t": 1.0,
|
| 108 |
+
"prefix_block_prob": 0.0,
|
| 109 |
+
"prefix_block_len": 128,
|
| 110 |
+
"mask_ratio_floor_schedule": "none",
|
| 111 |
+
"dirichlet_endpoint_mode": "categorical_dual_t",
|
| 112 |
+
"dirichlet_semantic_t_mode": "same",
|
| 113 |
+
"dirichlet_semantic_t_value": 0.0,
|
| 114 |
+
"dirichlet_semantic_t_curve": "linear",
|
| 115 |
+
"dirichlet_semantic_t_power": 1.0,
|
| 116 |
+
"endpoint_sequence_random_prob_alpha": 0.0,
|
| 117 |
+
"categorical_wrong_from_full_vocab": true,
|
| 118 |
+
"categorical_wrong_from_batch_valid_tokens": false,
|
| 119 |
+
"categorical_wrong_basin_token_ids": "",
|
| 120 |
+
"categorical_wrong_basin_prob": 0.0,
|
| 121 |
+
"categorical_wrong_unigram_prob": 0.0,
|
| 122 |
+
"categorical_wrong_uniform_prob": 0.0,
|
| 123 |
+
"categorical_wrong_prob_floor": 0.0,
|
| 124 |
+
"categorical_wrong_corpus_unigram_path": "",
|
| 125 |
+
"categorical_wrong_corpus_unigram_alpha": 1.0,
|
| 126 |
+
"categorical_wrong_basin_shared_prob": 0.0,
|
| 127 |
+
"categorical_wrong_unigram_shared_prob": 0.0,
|
| 128 |
+
"mask_mixture_original_prob": 0.0,
|
| 129 |
+
"mask_mixture_lowk_prob": 0.0,
|
| 130 |
+
"mask_mixture_lowcorrupt_prob": 0.0,
|
| 131 |
+
"mask_mixture_block_prob": 0.0,
|
| 132 |
+
"mask_mixture_all_prob": 1.0,
|
| 133 |
+
"mask_mixture_lowk_clean_tokens": "0",
|
| 134 |
+
"mask_mixture_lowcorrupt_tokens": "1,2,4,8,16,32,64",
|
| 135 |
+
"mask_mixture_block_tokens": "64,128",
|
| 136 |
+
"simplex_bridge_sampler": "dirichlet",
|
| 137 |
+
"logistic_normal_sigma_min": 0.1,
|
| 138 |
+
"logistic_normal_sigma_max": 1.0,
|
| 139 |
+
"logistic_normal_tau_min": 1.0,
|
| 140 |
+
"logistic_normal_tau_max": 1.0,
|
| 141 |
+
"torch_compile": false,
|
| 142 |
+
"compile_mode": "max-autotune",
|
| 143 |
+
"state_format": "prob",
|
| 144 |
+
"meanflow_weight": 0.0,
|
| 145 |
+
"rollout_train_prob": 1.0,
|
| 146 |
+
"rollout_train_steps": 4,
|
| 147 |
+
"rollout_train_infer_steps": 32,
|
| 148 |
+
"rollout_train_temp": 1.45,
|
| 149 |
+
"rollout_train_max_gamma": 1.0,
|
| 150 |
+
"rollout_train_corrupt_only": true,
|
| 151 |
+
"rollout_train_samplewise": true,
|
| 152 |
+
"rollout_train_compute_always": false,
|
| 153 |
+
"bridge_noise_init": "logistic_normal",
|
| 154 |
+
"noise_sigma": -1.0,
|
| 155 |
+
"allow_tf32": true,
|
| 156 |
+
"activation_checkpointing": false,
|
| 157 |
+
"activation_checkpoint_interval": 1,
|
| 158 |
+
"activation_checkpoint_scope": "block",
|
| 159 |
+
"ddp_static_graph": false,
|
| 160 |
+
"ddp_gradient_as_bucket_view": true,
|
| 161 |
+
"blocking_data_transfer": false,
|
| 162 |
+
"dataloader_prefetch_factor": 4,
|
| 163 |
+
"full_train_stats": false,
|
| 164 |
+
"tokenized_hf": false,
|
| 165 |
+
"tokenized_pad_token": "pad",
|
| 166 |
+
"elf_conditional_hf": false,
|
| 167 |
+
"record_pad_truncate": false,
|
| 168 |
+
"record_add_eos": false,
|
| 169 |
+
"record_add_special_tokens": false,
|
| 170 |
+
"record_pad_token": "pad",
|
| 171 |
+
"record_shuffle_buffer": 10000,
|
| 172 |
+
"wrap": true,
|
| 173 |
+
"wrap_mode": "stream",
|
| 174 |
+
"wrap_record_buffer_size": 200,
|
| 175 |
+
"owt_cached_chunks": true,
|
| 176 |
+
"owt_chunk_cache_dir": "/e2e-data/evad-tech-vla/wanghan58/data/small_benchmarks/langflow_2604_11748/openwebtext_lta_cached_chunks/gpt2_len256_train8_compact_overfit",
|
| 177 |
+
"owt_chunk_cache_rebuild": false,
|
| 178 |
+
"owt_chunk_cache_write_batch": 4096,
|
| 179 |
+
"owt_exact_repeat_per_chunk": 64,
|
| 180 |
+
"online_chunk_shuffle": false,
|
| 181 |
+
"online_chunk_shuffle_buffer": 10000,
|
| 182 |
+
"openwebtext_split": "train_minus_100k",
|
| 183 |
+
"detokenizer": "auto",
|
| 184 |
+
"resolved_detokenizer": null,
|
| 185 |
+
"num_workers": 0,
|
| 186 |
+
"latest_every": 500,
|
| 187 |
+
"resume_path": ""
|
| 188 |
+
}
|
| 189 |
+
step=100 epoch=100/500 epoch_step=1/1 micro_steps=100 elapsed=8.2s lr=2.000000e-03 loss=6.7057 loss_recon=6.7057 loss_meanflow=0.0000 mean_model_t=0.2083 mean_corrupt_t=0.2083 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=1.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.0995 corrupt_frac=1.0000 acc_corrupt=0.0995 loss_corrupt=6.7057 wrong_frac=0.7915 init_acc_corrupt=0.1159 acc_corrupt_t_0p0_0p2=0.0488 corrupt_frac_t_0p0_0p2=0.5559 acc_corrupt_t_0p2_0p4=0.1333 corrupt_frac_t_0p2_0p4=0.3589 acc_corrupt_t_0p4_0p6=0.2777 corrupt_frac_t_0p4_0p6=0.0773 acc_corrupt_t_0p6_0p8=0.3853 corrupt_frac_t_0p6_0p8=0.0121 out_w_norm=1.0988 out_g_norm=1.0104 loss_all=6.4460 init_gold_top10=0.2110 init_gold_top100=0.5461 rollout_applied_pos_frac=1.0000 init_acc_rollout_applied=0.1169 init_acc_rollout_kept=0.0000 logit_acc_rollout_applied=0.1051 logit_acc_rollout_kept=0.0000
|
| 190 |
+
step=200 epoch=200/500 epoch_step=1/1 micro_steps=200 elapsed=7.5s lr=2.000000e-03 loss=6.0920 loss_recon=6.0920 loss_meanflow=0.0000 mean_model_t=0.2108 mean_corrupt_t=0.2108 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=1.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.1116 corrupt_frac=1.0000 acc_corrupt=0.1116 loss_corrupt=6.0920 wrong_frac=0.7892 init_acc_corrupt=0.1190 acc_corrupt_t_0p0_0p2=0.0551 corrupt_frac_t_0p0_0p2=0.5516 acc_corrupt_t_0p2_0p4=0.1512 corrupt_frac_t_0p2_0p4=0.3621 acc_corrupt_t_0p4_0p6=0.2945 corrupt_frac_t_0p4_0p6=0.0781 acc_corrupt_t_0p6_0p8=0.4229 corrupt_frac_t_0p6_0p8=0.0123 out_w_norm=3.3334 out_g_norm=1.4060 acc_corrupt_t_0p8_1p0=0.4766 corrupt_frac_t_0p8_1p0=0.0078 loss_all=5.8240 init_gold_top10=0.2049 init_gold_top100=0.5963 rollout_applied_pos_frac=1.0000 init_acc_rollout_applied=0.1107 init_acc_rollout_kept=0.0000 logit_acc_rollout_applied=0.1096 logit_acc_rollout_kept=0.0000
|
| 191 |
+
step=300 epoch=300/500 epoch_step=1/1 micro_steps=300 elapsed=7.4s lr=2.000000e-03 loss=5.5560 loss_recon=5.5560 loss_meanflow=0.0000 mean_model_t=0.2067 mean_corrupt_t=0.2067 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=1.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.1210 corrupt_frac=1.0000 acc_corrupt=0.1210 loss_corrupt=5.5560 wrong_frac=0.7935 init_acc_corrupt=0.1153 acc_corrupt_t_0p0_0p2=0.0590 corrupt_frac_t_0p0_0p2=0.5641 acc_corrupt_t_0p2_0p4=0.1694 corrupt_frac_t_0p2_0p4=0.3542 acc_corrupt_t_0p4_0p6=0.3234 corrupt_frac_t_0p4_0p6=0.0734 acc_corrupt_t_0p6_0p8=0.4773 corrupt_frac_t_0p6_0p8=0.0121 out_w_norm=5.2201 out_g_norm=0.7125 acc_corrupt_t_0p8_1p0=0.6380 corrupt_frac_t_0p8_1p0=0.0078 loss_all=5.2826 init_gold_top10=0.2209 init_gold_top100=0.6553 rollout_applied_pos_frac=1.0000 init_acc_rollout_applied=0.1227 init_acc_rollout_kept=0.0000 logit_acc_rollout_applied=0.1303 logit_acc_rollout_kept=0.0000
|
| 192 |
+
step=400 epoch=400/500 epoch_step=1/1 micro_steps=400 elapsed=7.3s lr=2.000000e-03 loss=4.9781 loss_recon=4.9781 loss_meanflow=0.0000 mean_model_t=0.2085 mean_corrupt_t=0.2085 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=1.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.1483 corrupt_frac=1.0000 acc_corrupt=0.1483 loss_corrupt=4.9781 wrong_frac=0.7917 init_acc_corrupt=0.1181 acc_corrupt_t_0p0_0p2=0.0642 corrupt_frac_t_0p0_0p2=0.5564 acc_corrupt_t_0p2_0p4=0.2056 corrupt_frac_t_0p2_0p4=0.3620 acc_corrupt_t_0p4_0p6=0.4439 corrupt_frac_t_0p4_0p6=0.0719 out_w_norm=6.9063 out_g_norm=0.4180 acc_corrupt_t_0p6_0p8=0.6502 corrupt_frac_t_0p6_0p8=0.0131 acc_corrupt_t_0p8_1p0=0.7422 corrupt_frac_t_0p8_1p0=0.0078 loss_all=4.7369 init_gold_top10=0.2113 init_gold_top100=0.7779 rollout_applied_pos_frac=1.0000 init_acc_rollout_applied=0.1016 init_acc_rollout_kept=0.0000 logit_acc_rollout_applied=0.1530 logit_acc_rollout_kept=0.0000
|
| 193 |
+
step=500 epoch=500/500 epoch_step=1/1 micro_steps=500 elapsed=7.3s lr=2.000000e-03 loss=4.1805 loss_recon=4.1805 loss_meanflow=0.0000 mean_model_t=0.2071 mean_corrupt_t=0.2071 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=1.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.1890 corrupt_frac=1.0000 acc_corrupt=0.1890 loss_corrupt=4.1805 wrong_frac=0.7928 init_acc_corrupt=0.1178 acc_corrupt_t_0p0_0p2=0.0758 corrupt_frac_t_0p0_0p2=0.5632 acc_corrupt_t_0p2_0p4=0.2839 corrupt_frac_t_0p2_0p4=0.3546 acc_corrupt_t_0p4_0p6=0.5403 corrupt_frac_t_0p4_0p6=0.0745 acc_corrupt_t_0p6_0p8=0.7033 corrupt_frac_t_0p6_0p8=0.0118 acc_corrupt_t_0p8_1p0=0.8555 corrupt_frac_t_0p8_1p0=0.0078 out_w_norm=8.4044 out_g_norm=0.4606 loss_all=3.6550 init_gold_top10=0.2562 init_gold_top100=0.9090 rollout_applied_pos_frac=1.0000 init_acc_rollout_applied=0.1274 init_acc_rollout_kept=0.0000 logit_acc_rollout_applied=0.2250 logit_acc_rollout_kept=0.0000
|
LTA_openwebtext_dualt/logs/softendpoint_mn_pilot_4gpu/train8_rollin_len1024_rollin_p50_s4_i32_20260517_1840ctx1024.log
ADDED
|
@@ -0,0 +1,397 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
NCCL version 2.25.1+cuda12.8
|
| 2 |
+
{
|
| 3 |
+
"device": "cuda:0",
|
| 4 |
+
"rank": 0,
|
| 5 |
+
"world_size": 4,
|
| 6 |
+
"samples": "owt_cached_chunks:8",
|
| 7 |
+
"vocab_size": 2664,
|
| 8 |
+
"tokenizer_vocab_size": 50257,
|
| 9 |
+
"save_dir": "runs/train8_rollin_len1024_rollin_p50_s4_i32_20260517_1840ctx1024",
|
| 10 |
+
"batch_size": 32,
|
| 11 |
+
"grad_accum": 4,
|
| 12 |
+
"effective_batch_size": 512,
|
| 13 |
+
"global_batch_size": 512,
|
| 14 |
+
"lr_schedule": "constant_warmup",
|
| 15 |
+
"optimizer": "muon",
|
| 16 |
+
"epochs": 0.0,
|
| 17 |
+
"steps_per_epoch": 1,
|
| 18 |
+
"total_steps": 1000,
|
| 19 |
+
"warmup_steps": 10,
|
| 20 |
+
"warmup_epochs": -1.0,
|
| 21 |
+
"min_lr": 0.0,
|
| 22 |
+
"weight_decay": 0.1,
|
| 23 |
+
"output_weight_decay": -1.0,
|
| 24 |
+
"adamw_param_groups": "nanogpt",
|
| 25 |
+
"adam_beta1": 0.9,
|
| 26 |
+
"adam_beta2": 0.95,
|
| 27 |
+
"adam_eps": 1e-08,
|
| 28 |
+
"muon_impl": "legacy",
|
| 29 |
+
"muon_momentum": 0.95,
|
| 30 |
+
"muon_ns_steps": 5,
|
| 31 |
+
"muon_update_scale": 1.0,
|
| 32 |
+
"muon_nesterov": false,
|
| 33 |
+
"muon_width_scale": false,
|
| 34 |
+
"muon_grouping": "legacy_dim_ge_2",
|
| 35 |
+
"muon_param_count": 2616320,
|
| 36 |
+
"muon_adam_param_count": 8192,
|
| 37 |
+
"muon_param_names": [
|
| 38 |
+
"vocab_embed.embedding",
|
| 39 |
+
"sigma_map.net.0.weight",
|
| 40 |
+
"sigma_map.net.2.weight",
|
| 41 |
+
"blocks.0.attn_qkv.weight",
|
| 42 |
+
"blocks.0.attn_out.weight",
|
| 43 |
+
"blocks.0.mlp.0.weight",
|
| 44 |
+
"blocks.0.mlp.2.weight",
|
| 45 |
+
"blocks.0.adaLN_modulation.weight",
|
| 46 |
+
"blocks.1.attn_qkv.weight",
|
| 47 |
+
"blocks.1.attn_out.weight",
|
| 48 |
+
"blocks.1.mlp.0.weight",
|
| 49 |
+
"blocks.1.mlp.2.weight",
|
| 50 |
+
"blocks.1.adaLN_modulation.weight",
|
| 51 |
+
"blocks.2.attn_qkv.weight",
|
| 52 |
+
"blocks.2.attn_out.weight",
|
| 53 |
+
"blocks.2.mlp.0.weight",
|
| 54 |
+
"blocks.2.mlp.2.weight",
|
| 55 |
+
"blocks.2.adaLN_modulation.weight",
|
| 56 |
+
"output_layer.linear.weight",
|
| 57 |
+
"output_layer.adaLN_modulation.weight"
|
| 58 |
+
],
|
| 59 |
+
"muon_adam_param_names": [
|
| 60 |
+
"sigma_map.net.0.bias",
|
| 61 |
+
"sigma_map.net.2.bias",
|
| 62 |
+
"blocks.0.norm1.weight",
|
| 63 |
+
"blocks.0.norm2.weight",
|
| 64 |
+
"blocks.0.mlp.0.bias",
|
| 65 |
+
"blocks.0.mlp.2.bias",
|
| 66 |
+
"blocks.0.adaLN_modulation.bias",
|
| 67 |
+
"blocks.1.norm1.weight",
|
| 68 |
+
"blocks.1.norm2.weight",
|
| 69 |
+
"blocks.1.mlp.0.bias",
|
| 70 |
+
"blocks.1.mlp.2.bias",
|
| 71 |
+
"blocks.1.adaLN_modulation.bias",
|
| 72 |
+
"blocks.2.norm1.weight",
|
| 73 |
+
"blocks.2.norm2.weight",
|
| 74 |
+
"blocks.2.mlp.0.bias",
|
| 75 |
+
"blocks.2.mlp.2.bias",
|
| 76 |
+
"blocks.2.adaLN_modulation.bias",
|
| 77 |
+
"output_layer.norm_final.weight",
|
| 78 |
+
"output_layer.adaLN_modulation.bias"
|
| 79 |
+
],
|
| 80 |
+
"muon_effective_nesterov": false,
|
| 81 |
+
"muon_effective_width_scale": false,
|
| 82 |
+
"muon_effective_weight_decay": 0.1,
|
| 83 |
+
"muon_adam_fallback_nesterov": false,
|
| 84 |
+
"muon_adam_fallback_weight_decay": 0.1,
|
| 85 |
+
"ema_decay": 0.9999,
|
| 86 |
+
"ema_start_step": 0,
|
| 87 |
+
"model_type": "ddit",
|
| 88 |
+
"ddit_mlp_type": "gelu",
|
| 89 |
+
"elf_num_time_tokens": 4,
|
| 90 |
+
"elf_num_model_mode_tokens": 0,
|
| 91 |
+
"qk_norm": true,
|
| 92 |
+
"output_bias": false,
|
| 93 |
+
"output_init_std": -1.0,
|
| 94 |
+
"norm_type": "rmsnorm",
|
| 95 |
+
"target_loss": "hard_ce",
|
| 96 |
+
"linear_soft_target_power": 1.0,
|
| 97 |
+
"linear_soft_target_min_conf": 0.0,
|
| 98 |
+
"linear_soft_target_max_conf": 1.0,
|
| 99 |
+
"t_sampling_mode": "logit_normal",
|
| 100 |
+
"t_sampling_power": 1.0,
|
| 101 |
+
"t_sampling_eps": 0.0001,
|
| 102 |
+
"t_sampling_logit_mean": -1.5,
|
| 103 |
+
"t_sampling_logit_std": 0.8,
|
| 104 |
+
"dual_t": true,
|
| 105 |
+
"corrupt_t_mode": "same",
|
| 106 |
+
"corrupt_min_t": 0.0,
|
| 107 |
+
"corrupt_max_t": 1.0,
|
| 108 |
+
"prefix_block_prob": 0.0,
|
| 109 |
+
"prefix_block_len": 128,
|
| 110 |
+
"mask_ratio_floor_schedule": "none",
|
| 111 |
+
"dirichlet_endpoint_mode": "categorical_dual_t",
|
| 112 |
+
"dirichlet_semantic_t_mode": "same",
|
| 113 |
+
"dirichlet_semantic_t_value": 0.0,
|
| 114 |
+
"dirichlet_semantic_t_curve": "linear",
|
| 115 |
+
"dirichlet_semantic_t_power": 1.0,
|
| 116 |
+
"endpoint_sequence_random_prob_alpha": 0.0,
|
| 117 |
+
"categorical_wrong_from_full_vocab": true,
|
| 118 |
+
"categorical_wrong_from_batch_valid_tokens": false,
|
| 119 |
+
"categorical_wrong_basin_token_ids": "",
|
| 120 |
+
"categorical_wrong_basin_prob": 0.0,
|
| 121 |
+
"categorical_wrong_unigram_prob": 0.0,
|
| 122 |
+
"categorical_wrong_uniform_prob": 0.0,
|
| 123 |
+
"categorical_wrong_prob_floor": 0.0,
|
| 124 |
+
"categorical_wrong_corpus_unigram_path": "",
|
| 125 |
+
"categorical_wrong_corpus_unigram_alpha": 1.0,
|
| 126 |
+
"categorical_wrong_basin_shared_prob": 0.0,
|
| 127 |
+
"categorical_wrong_unigram_shared_prob": 0.0,
|
| 128 |
+
"mask_mixture_original_prob": 0.0,
|
| 129 |
+
"mask_mixture_lowk_prob": 0.0,
|
| 130 |
+
"mask_mixture_lowcorrupt_prob": 0.0,
|
| 131 |
+
"mask_mixture_block_prob": 0.0,
|
| 132 |
+
"mask_mixture_all_prob": 1.0,
|
| 133 |
+
"mask_mixture_lowk_clean_tokens": "0",
|
| 134 |
+
"mask_mixture_lowcorrupt_tokens": "1,2,4,8,16,32,64",
|
| 135 |
+
"mask_mixture_block_tokens": "64,128",
|
| 136 |
+
"simplex_bridge_sampler": "dirichlet",
|
| 137 |
+
"logistic_normal_sigma_min": 0.1,
|
| 138 |
+
"logistic_normal_sigma_max": 1.0,
|
| 139 |
+
"logistic_normal_tau_min": 1.0,
|
| 140 |
+
"logistic_normal_tau_max": 1.0,
|
| 141 |
+
"torch_compile": false,
|
| 142 |
+
"compile_mode": "max-autotune",
|
| 143 |
+
"state_format": "prob",
|
| 144 |
+
"meanflow_weight": 0.0,
|
| 145 |
+
"rollout_train_prob": 0.5,
|
| 146 |
+
"rollout_train_steps": 4,
|
| 147 |
+
"rollout_train_infer_steps": 32,
|
| 148 |
+
"rollout_train_temp": 1.45,
|
| 149 |
+
"rollout_train_max_gamma": 1.0,
|
| 150 |
+
"rollout_train_corrupt_only": true,
|
| 151 |
+
"rollout_train_samplewise": true,
|
| 152 |
+
"rollout_train_compute_always": false,
|
| 153 |
+
"rollout_train_sync_t": false,
|
| 154 |
+
"bridge_noise_init": "logistic_normal",
|
| 155 |
+
"noise_sigma": -1.0,
|
| 156 |
+
"allow_tf32": true,
|
| 157 |
+
"activation_checkpointing": false,
|
| 158 |
+
"activation_checkpoint_interval": 1,
|
| 159 |
+
"activation_checkpoint_scope": "block",
|
| 160 |
+
"ddp_static_graph": false,
|
| 161 |
+
"ddp_gradient_as_bucket_view": true,
|
| 162 |
+
"blocking_data_transfer": false,
|
| 163 |
+
"dataloader_prefetch_factor": 4,
|
| 164 |
+
"full_train_stats": false,
|
| 165 |
+
"tokenized_hf": false,
|
| 166 |
+
"tokenized_pad_token": "pad",
|
| 167 |
+
"elf_conditional_hf": false,
|
| 168 |
+
"record_pad_truncate": false,
|
| 169 |
+
"record_add_eos": false,
|
| 170 |
+
"record_add_special_tokens": false,
|
| 171 |
+
"record_pad_token": "pad",
|
| 172 |
+
"record_shuffle_buffer": 10000,
|
| 173 |
+
"wrap": true,
|
| 174 |
+
"wrap_mode": "stream",
|
| 175 |
+
"wrap_record_buffer_size": 200,
|
| 176 |
+
"owt_cached_chunks": true,
|
| 177 |
+
"owt_chunk_cache_dir": "/e2e-data/evad-tech-vla/wanghan58/data/small_benchmarks/langflow_2604_11748/openwebtext_lta_cached_chunks/gpt2_len1024_train8_compact_overfit",
|
| 178 |
+
"owt_chunk_cache_rebuild": false,
|
| 179 |
+
"owt_chunk_cache_write_batch": 4096,
|
| 180 |
+
"owt_exact_repeat_per_chunk": 64,
|
| 181 |
+
"online_chunk_shuffle": false,
|
| 182 |
+
"online_chunk_shuffle_buffer": 10000,
|
| 183 |
+
"openwebtext_split": "train_minus_100k",
|
| 184 |
+
"detokenizer": "auto",
|
| 185 |
+
"resolved_detokenizer": null,
|
| 186 |
+
"num_workers": 0,
|
| 187 |
+
"latest_every": 1000,
|
| 188 |
+
"resume_path": ""
|
| 189 |
+
}
|
| 190 |
+
step=100 epoch=100/1000 epoch_step=1/1 micro_steps=400 elapsed=38.3s lr=2.000000e-03 loss=7.7211 loss_recon=7.7211 loss_meanflow=0.0000 mean_model_t=0.2092 mean_corrupt_t=0.2092 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.5055 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.0955 corrupt_frac=1.0000 acc_corrupt=0.0955 loss_corrupt=7.7211 wrong_frac=0.7908 init_acc_corrupt=0.1161 acc_corrupt_t_0p0_0p2=0.0500 corrupt_frac_t_0p0_0p2=0.5551 acc_corrupt_t_0p2_0p4=0.1263 corrupt_frac_t_0p2_0p4=0.3604 acc_corrupt_t_0p4_0p6=0.2508 corrupt_frac_t_0p4_0p6=0.0822 acc_corrupt_t_0p6_0p8=0.3566 corrupt_frac_t_0p6_0p8=0.0356 out_w_norm=1.0063 out_g_norm=0.6737 loss_all=7.5189 init_gold_top10=0.1519 init_gold_top100=0.4115 rollout_applied_pos_frac=0.5312 init_acc_rollout_applied=0.0816 init_acc_rollout_kept=0.0578 logit_acc_rollout_applied=0.0869 logit_acc_rollout_kept=0.0730
|
| 191 |
+
step=200 epoch=200/1000 epoch_step=1/1 micro_steps=800 elapsed=37.5s lr=2.000000e-03 loss=7.0875 loss_recon=7.0875 loss_meanflow=0.0000 mean_model_t=0.2087 mean_corrupt_t=0.2087 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.5035 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.1032 corrupt_frac=1.0000 acc_corrupt=0.1032 loss_corrupt=7.0875 wrong_frac=0.7913 init_acc_corrupt=0.1163 acc_corrupt_t_0p0_0p2=0.0556 corrupt_frac_t_0p0_0p2=0.5574 acc_corrupt_t_0p2_0p4=0.1398 corrupt_frac_t_0p2_0p4=0.3595 acc_corrupt_t_0p4_0p6=0.2566 corrupt_frac_t_0p4_0p6=0.0807 acc_corrupt_t_0p8_1p0=0.4570 corrupt_frac_t_0p8_1p0=0.0312 out_w_norm=2.8644 out_g_norm=1.0978 acc_corrupt_t_0p6_0p8=0.3368 corrupt_frac_t_0p6_0p8=0.0357 loss_all=6.7515 init_gold_top10=0.1885 init_gold_top100=0.4529 rollout_applied_pos_frac=0.5625 init_acc_rollout_applied=0.1125 init_acc_rollout_kept=0.1085 logit_acc_rollout_applied=0.1083 logit_acc_rollout_kept=0.1037
|
| 192 |
+
step=300 epoch=300/1000 epoch_step=1/1 micro_steps=1200 elapsed=37.5s lr=2.000000e-03 loss=6.4511 loss_recon=6.4511 loss_meanflow=0.0000 mean_model_t=0.2118 mean_corrupt_t=0.2118 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.4957 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.1155 corrupt_frac=1.0000 acc_corrupt=0.1155 loss_corrupt=6.4511 wrong_frac=0.7884 init_acc_corrupt=0.1212 acc_corrupt_t_0p0_0p2=0.0595 corrupt_frac_t_0p0_0p2=0.5476 acc_corrupt_t_0p2_0p4=0.1565 corrupt_frac_t_0p2_0p4=0.3645 acc_corrupt_t_0p4_0p6=0.2815 corrupt_frac_t_0p4_0p6=0.0835 out_w_norm=4.3617 out_g_norm=0.8178 acc_corrupt_t_0p6_0p8=0.3852 corrupt_frac_t_0p6_0p8=0.0366 acc_corrupt_t_0p8_1p0=0.5366 corrupt_frac_t_0p8_1p0=0.0312 loss_all=6.2349 init_gold_top10=0.1833 init_gold_top100=0.4322 rollout_applied_pos_frac=0.4375 init_acc_rollout_applied=0.1633 init_acc_rollout_kept=0.0588 logit_acc_rollout_applied=0.1432 logit_acc_rollout_kept=0.0882
|
| 193 |
+
step=400 epoch=400/1000 epoch_step=1/1 micro_steps=1600 elapsed=37.5s lr=2.000000e-03 loss=5.9906 loss_recon=5.9906 loss_meanflow=0.0000 mean_model_t=0.2091 mean_corrupt_t=0.2091 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.5023 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.1250 corrupt_frac=1.0000 acc_corrupt=0.1250 loss_corrupt=5.9906 wrong_frac=0.7911 init_acc_corrupt=0.1188 acc_corrupt_t_0p0_0p2=0.0629 corrupt_frac_t_0p0_0p2=0.5556 acc_corrupt_t_0p2_0p4=0.1717 corrupt_frac_t_0p2_0p4=0.3572 acc_corrupt_t_0p4_0p6=0.3178 corrupt_frac_t_0p4_0p6=0.0865 out_w_norm=5.4987 out_g_norm=0.3310 acc_corrupt_t_0p6_0p8=0.4333 corrupt_frac_t_0p6_0p8=0.0366 acc_corrupt_t_0p8_1p0=0.5137 corrupt_frac_t_0p8_1p0=0.0312 loss_all=5.7675 init_gold_top10=0.1940 init_gold_top100=0.4395 rollout_applied_pos_frac=0.4062 init_acc_rollout_applied=0.1511 init_acc_rollout_kept=0.0853 logit_acc_rollout_applied=0.1493 logit_acc_rollout_kept=0.1149
|
| 194 |
+
step=500 epoch=500/1000 epoch_step=1/1 micro_steps=2000 elapsed=37.4s lr=2.000000e-03 loss=5.4943 loss_recon=5.4943 loss_meanflow=0.0000 mean_model_t=0.2104 mean_corrupt_t=0.2104 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.5032 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.1383 corrupt_frac=1.0000 acc_corrupt=0.1383 loss_corrupt=5.4943 wrong_frac=0.7895 init_acc_corrupt=0.1195 acc_corrupt_t_0p0_0p2=0.0679 corrupt_frac_t_0p0_0p2=0.5505 acc_corrupt_t_0p2_0p4=0.1911 corrupt_frac_t_0p2_0p4=0.3658 acc_corrupt_t_0p4_0p6=0.3549 corrupt_frac_t_0p4_0p6=0.0809 out_w_norm=6.7494 out_g_norm=0.2532 acc_corrupt_t_0p6_0p8=0.4935 corrupt_frac_t_0p6_0p8=0.0353 acc_corrupt_t_0p8_1p0=0.6230 corrupt_frac_t_0p8_1p0=0.0312 loss_all=5.4116 init_gold_top10=0.1764 init_gold_top100=0.4898 rollout_applied_pos_frac=0.5312 init_acc_rollout_applied=0.0849 init_acc_rollout_kept=0.1106 logit_acc_rollout_applied=0.1191 logit_acc_rollout_kept=0.1371
|
| 195 |
+
step=600 epoch=600/1000 epoch_step=1/1 micro_steps=2400 elapsed=37.4s lr=2.000000e-03 loss=4.8848 loss_recon=4.8848 loss_meanflow=0.0000 mean_model_t=0.2096 mean_corrupt_t=0.2096 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.4970 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.1534 corrupt_frac=1.0000 acc_corrupt=0.1534 loss_corrupt=4.8848 wrong_frac=0.7905 init_acc_corrupt=0.1196 acc_corrupt_t_0p0_0p2=0.0718 corrupt_frac_t_0p0_0p2=0.5511 acc_corrupt_t_0p2_0p4=0.2144 corrupt_frac_t_0p2_0p4=0.3627 out_w_norm=8.0570 out_g_norm=0.2940 acc_corrupt_t_0p4_0p6=0.3997 corrupt_frac_t_0p4_0p6=0.0831 acc_corrupt_t_0p6_0p8=0.5749 corrupt_frac_t_0p6_0p8=0.0372 acc_corrupt_t_0p8_1p0=0.7373 corrupt_frac_t_0p8_1p0=0.0312 loss_all=4.4114 init_gold_top10=0.2211 init_gold_top100=0.6172 rollout_applied_pos_frac=0.5312 init_acc_rollout_applied=0.1576 init_acc_rollout_kept=0.0989 logit_acc_rollout_applied=0.2024 logit_acc_rollout_kept=0.1540
|
| 196 |
+
step=700 epoch=700/1000 epoch_step=1/1 micro_steps=2800 elapsed=37.5s lr=2.000000e-03 loss=4.2577 loss_recon=4.2577 loss_meanflow=0.0000 mean_model_t=0.2102 mean_corrupt_t=0.2102 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.4981 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.1793 corrupt_frac=1.0000 acc_corrupt=0.1793 loss_corrupt=4.2577 wrong_frac=0.7899 init_acc_corrupt=0.1206 acc_corrupt_t_0p0_0p2=0.0785 corrupt_frac_t_0p0_0p2=0.5530 acc_corrupt_t_0p2_0p4=0.2532 corrupt_frac_t_0p2_0p4=0.3584 acc_corrupt_t_0p4_0p6=0.4900 corrupt_frac_t_0p4_0p6=0.0863 acc_corrupt_t_0p6_0p8=0.6753 corrupt_frac_t_0p6_0p8=0.0361 out_w_norm=9.2749 out_g_norm=0.3343 acc_corrupt_t_0p8_1p0=0.8145 corrupt_frac_t_0p8_1p0=0.0312 loss_all=4.1313 init_gold_top10=0.2126 init_gold_top100=0.5775 rollout_applied_pos_frac=0.4688 init_acc_rollout_applied=0.1626 init_acc_rollout_kept=0.1093 logit_acc_rollout_applied=0.2192 logit_acc_rollout_kept=0.1784
|
| 197 |
+
step=800 epoch=800/1000 epoch_step=1/1 micro_steps=3200 elapsed=37.5s lr=2.000000e-03 loss=3.7527 loss_recon=3.7527 loss_meanflow=0.0000 mean_model_t=0.2096 mean_corrupt_t=0.2096 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.5079 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.2030 corrupt_frac=1.0000 acc_corrupt=0.2030 loss_corrupt=3.7527 wrong_frac=0.7903 init_acc_corrupt=0.1207 acc_corrupt_t_0p0_0p2=0.0879 corrupt_frac_t_0p0_0p2=0.5517 acc_corrupt_t_0p2_0p4=0.2965 corrupt_frac_t_0p2_0p4=0.3616 acc_corrupt_t_0p4_0p6=0.5306 corrupt_frac_t_0p4_0p6=0.0850 out_w_norm=10.1655 out_g_norm=0.3946 acc_corrupt_t_0p6_0p8=0.7050 corrupt_frac_t_0p6_0p8=0.0338 acc_corrupt_t_0p8_1p0=0.8413 corrupt_frac_t_0p8_1p0=0.0312 loss_all=3.6608 init_gold_top10=0.2162 init_gold_top100=0.6223 rollout_applied_pos_frac=0.5312 init_acc_rollout_applied=0.1540 init_acc_rollout_kept=0.0570 logit_acc_rollout_applied=0.2421 logit_acc_rollout_kept=0.1555
|
| 198 |
+
step=900 epoch=900/1000 epoch_step=1/1 micro_steps=3600 elapsed=37.4s lr=2.000000e-03 loss=3.3035 loss_recon=3.3035 loss_meanflow=0.0000 mean_model_t=0.2108 mean_corrupt_t=0.2108 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.4956 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.2399 corrupt_frac=1.0000 acc_corrupt=0.2399 loss_corrupt=3.3035 wrong_frac=0.7892 init_acc_corrupt=0.1235 acc_corrupt_t_0p0_0p2=0.1037 corrupt_frac_t_0p0_0p2=0.5547 acc_corrupt_t_0p2_0p4=0.3581 corrupt_frac_t_0p2_0p4=0.3554 acc_corrupt_t_0p4_0p6=0.5989 corrupt_frac_t_0p4_0p6=0.0877 out_w_norm=10.7195 out_g_norm=0.5110 acc_corrupt_t_0p6_0p8=0.7433 corrupt_frac_t_0p6_0p8=0.0358 acc_corrupt_t_0p8_1p0=0.8540 corrupt_frac_t_0p8_1p0=0.0312 loss_all=3.0285 init_gold_top10=0.2676 init_gold_top100=0.6202 rollout_applied_pos_frac=0.4375 init_acc_rollout_applied=0.1009 init_acc_rollout_kept=0.1176 logit_acc_rollout_applied=0.2540 logit_acc_rollout_kept=0.2764
|
| 199 |
+
step=1000 epoch=1000/1000 epoch_step=1/1 micro_steps=4000 elapsed=37.5s lr=2.000000e-03 loss=2.8950 loss_recon=2.8950 loss_meanflow=0.0000 mean_model_t=0.2090 mean_corrupt_t=0.2090 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.4988 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.2938 corrupt_frac=1.0000 acc_corrupt=0.2938 loss_corrupt=2.8950 wrong_frac=0.7911 init_acc_corrupt=0.1224 acc_corrupt_t_0p0_0p2=0.1292 corrupt_frac_t_0p0_0p2=0.5598 acc_corrupt_t_0p2_0p4=0.4540 corrupt_frac_t_0p2_0p4=0.3541 acc_corrupt_t_0p4_0p6=0.6939 corrupt_frac_t_0p4_0p6=0.0843 out_w_norm=11.0862 out_g_norm=0.6754 acc_corrupt_t_0p6_0p8=0.8143 corrupt_frac_t_0p6_0p8=0.0341 acc_corrupt_t_0p8_1p0=0.9004 corrupt_frac_t_0p8_1p0=0.0312 loss_all=3.0296 init_gold_top10=0.2572 init_gold_top100=0.5604 rollout_applied_pos_frac=0.3750 init_acc_rollout_applied=0.0764 init_acc_rollout_kept=0.1180 logit_acc_rollout_applied=0.2254 logit_acc_rollout_kept=0.3271
|
| 200 |
+
NCCL version 2.25.1+cuda12.8
|
| 201 |
+
resumed_from=runs/train8_rollin_len1024_rollin_p50_s4_i32_20260517_1840ctx1024/latest.pt start_step=1001
|
| 202 |
+
{
|
| 203 |
+
"device": "cuda:0",
|
| 204 |
+
"rank": 0,
|
| 205 |
+
"world_size": 4,
|
| 206 |
+
"samples": "owt_cached_chunks:8",
|
| 207 |
+
"vocab_size": 2664,
|
| 208 |
+
"tokenizer_vocab_size": 50257,
|
| 209 |
+
"save_dir": "runs/train8_rollin_len1024_rollin_p50_s4_i32_20260517_1840ctx1024",
|
| 210 |
+
"batch_size": 32,
|
| 211 |
+
"grad_accum": 4,
|
| 212 |
+
"effective_batch_size": 512,
|
| 213 |
+
"global_batch_size": 512,
|
| 214 |
+
"lr_schedule": "constant_warmup",
|
| 215 |
+
"optimizer": "muon",
|
| 216 |
+
"epochs": 0.0,
|
| 217 |
+
"steps_per_epoch": 1,
|
| 218 |
+
"total_steps": 2000,
|
| 219 |
+
"warmup_steps": 10,
|
| 220 |
+
"warmup_epochs": -1.0,
|
| 221 |
+
"min_lr": 0.0,
|
| 222 |
+
"weight_decay": 0.1,
|
| 223 |
+
"output_weight_decay": -1.0,
|
| 224 |
+
"adamw_param_groups": "nanogpt",
|
| 225 |
+
"adam_beta1": 0.9,
|
| 226 |
+
"adam_beta2": 0.95,
|
| 227 |
+
"adam_eps": 1e-08,
|
| 228 |
+
"muon_impl": "legacy",
|
| 229 |
+
"muon_momentum": 0.95,
|
| 230 |
+
"muon_ns_steps": 5,
|
| 231 |
+
"muon_update_scale": 1.0,
|
| 232 |
+
"muon_nesterov": false,
|
| 233 |
+
"muon_width_scale": false,
|
| 234 |
+
"muon_grouping": "legacy_dim_ge_2",
|
| 235 |
+
"muon_param_count": 2616320,
|
| 236 |
+
"muon_adam_param_count": 8192,
|
| 237 |
+
"muon_param_names": [
|
| 238 |
+
"vocab_embed.embedding",
|
| 239 |
+
"sigma_map.net.0.weight",
|
| 240 |
+
"sigma_map.net.2.weight",
|
| 241 |
+
"blocks.0.attn_qkv.weight",
|
| 242 |
+
"blocks.0.attn_out.weight",
|
| 243 |
+
"blocks.0.mlp.0.weight",
|
| 244 |
+
"blocks.0.mlp.2.weight",
|
| 245 |
+
"blocks.0.adaLN_modulation.weight",
|
| 246 |
+
"blocks.1.attn_qkv.weight",
|
| 247 |
+
"blocks.1.attn_out.weight",
|
| 248 |
+
"blocks.1.mlp.0.weight",
|
| 249 |
+
"blocks.1.mlp.2.weight",
|
| 250 |
+
"blocks.1.adaLN_modulation.weight",
|
| 251 |
+
"blocks.2.attn_qkv.weight",
|
| 252 |
+
"blocks.2.attn_out.weight",
|
| 253 |
+
"blocks.2.mlp.0.weight",
|
| 254 |
+
"blocks.2.mlp.2.weight",
|
| 255 |
+
"blocks.2.adaLN_modulation.weight",
|
| 256 |
+
"output_layer.linear.weight",
|
| 257 |
+
"output_layer.adaLN_modulation.weight"
|
| 258 |
+
],
|
| 259 |
+
"muon_adam_param_names": [
|
| 260 |
+
"sigma_map.net.0.bias",
|
| 261 |
+
"sigma_map.net.2.bias",
|
| 262 |
+
"blocks.0.norm1.weight",
|
| 263 |
+
"blocks.0.norm2.weight",
|
| 264 |
+
"blocks.0.mlp.0.bias",
|
| 265 |
+
"blocks.0.mlp.2.bias",
|
| 266 |
+
"blocks.0.adaLN_modulation.bias",
|
| 267 |
+
"blocks.1.norm1.weight",
|
| 268 |
+
"blocks.1.norm2.weight",
|
| 269 |
+
"blocks.1.mlp.0.bias",
|
| 270 |
+
"blocks.1.mlp.2.bias",
|
| 271 |
+
"blocks.1.adaLN_modulation.bias",
|
| 272 |
+
"blocks.2.norm1.weight",
|
| 273 |
+
"blocks.2.norm2.weight",
|
| 274 |
+
"blocks.2.mlp.0.bias",
|
| 275 |
+
"blocks.2.mlp.2.bias",
|
| 276 |
+
"blocks.2.adaLN_modulation.bias",
|
| 277 |
+
"output_layer.norm_final.weight",
|
| 278 |
+
"output_layer.adaLN_modulation.bias"
|
| 279 |
+
],
|
| 280 |
+
"muon_effective_nesterov": false,
|
| 281 |
+
"muon_effective_width_scale": false,
|
| 282 |
+
"muon_effective_weight_decay": 0.1,
|
| 283 |
+
"muon_adam_fallback_nesterov": false,
|
| 284 |
+
"muon_adam_fallback_weight_decay": 0.1,
|
| 285 |
+
"ema_decay": 0.9999,
|
| 286 |
+
"ema_start_step": 0,
|
| 287 |
+
"model_type": "ddit",
|
| 288 |
+
"ddit_mlp_type": "gelu",
|
| 289 |
+
"elf_num_time_tokens": 4,
|
| 290 |
+
"elf_num_model_mode_tokens": 0,
|
| 291 |
+
"qk_norm": true,
|
| 292 |
+
"output_bias": false,
|
| 293 |
+
"output_init_std": -1.0,
|
| 294 |
+
"norm_type": "rmsnorm",
|
| 295 |
+
"target_loss": "hard_ce",
|
| 296 |
+
"linear_soft_target_power": 1.0,
|
| 297 |
+
"linear_soft_target_min_conf": 0.0,
|
| 298 |
+
"linear_soft_target_max_conf": 1.0,
|
| 299 |
+
"t_sampling_mode": "logit_normal",
|
| 300 |
+
"t_sampling_power": 1.0,
|
| 301 |
+
"t_sampling_eps": 0.0001,
|
| 302 |
+
"t_sampling_logit_mean": -1.5,
|
| 303 |
+
"t_sampling_logit_std": 0.8,
|
| 304 |
+
"dual_t": true,
|
| 305 |
+
"corrupt_t_mode": "same",
|
| 306 |
+
"corrupt_min_t": 0.0,
|
| 307 |
+
"corrupt_max_t": 1.0,
|
| 308 |
+
"prefix_block_prob": 0.0,
|
| 309 |
+
"prefix_block_len": 128,
|
| 310 |
+
"mask_ratio_floor_schedule": "none",
|
| 311 |
+
"dirichlet_endpoint_mode": "categorical_dual_t",
|
| 312 |
+
"dirichlet_semantic_t_mode": "same",
|
| 313 |
+
"dirichlet_semantic_t_value": 0.0,
|
| 314 |
+
"dirichlet_semantic_t_curve": "linear",
|
| 315 |
+
"dirichlet_semantic_t_power": 1.0,
|
| 316 |
+
"endpoint_sequence_random_prob_alpha": 0.0,
|
| 317 |
+
"categorical_wrong_from_full_vocab": true,
|
| 318 |
+
"categorical_wrong_from_batch_valid_tokens": false,
|
| 319 |
+
"categorical_wrong_basin_token_ids": "",
|
| 320 |
+
"categorical_wrong_basin_prob": 0.0,
|
| 321 |
+
"categorical_wrong_unigram_prob": 0.0,
|
| 322 |
+
"categorical_wrong_uniform_prob": 0.0,
|
| 323 |
+
"categorical_wrong_prob_floor": 0.0,
|
| 324 |
+
"categorical_wrong_corpus_unigram_path": "",
|
| 325 |
+
"categorical_wrong_corpus_unigram_alpha": 1.0,
|
| 326 |
+
"categorical_wrong_basin_shared_prob": 0.0,
|
| 327 |
+
"categorical_wrong_unigram_shared_prob": 0.0,
|
| 328 |
+
"mask_mixture_original_prob": 0.0,
|
| 329 |
+
"mask_mixture_lowk_prob": 0.0,
|
| 330 |
+
"mask_mixture_lowcorrupt_prob": 0.0,
|
| 331 |
+
"mask_mixture_block_prob": 0.0,
|
| 332 |
+
"mask_mixture_all_prob": 1.0,
|
| 333 |
+
"mask_mixture_lowk_clean_tokens": "0",
|
| 334 |
+
"mask_mixture_lowcorrupt_tokens": "1,2,4,8,16,32,64",
|
| 335 |
+
"mask_mixture_block_tokens": "64,128",
|
| 336 |
+
"simplex_bridge_sampler": "dirichlet",
|
| 337 |
+
"logistic_normal_sigma_min": 0.1,
|
| 338 |
+
"logistic_normal_sigma_max": 1.0,
|
| 339 |
+
"logistic_normal_tau_min": 1.0,
|
| 340 |
+
"logistic_normal_tau_max": 1.0,
|
| 341 |
+
"torch_compile": false,
|
| 342 |
+
"compile_mode": "max-autotune",
|
| 343 |
+
"state_format": "prob",
|
| 344 |
+
"meanflow_weight": 0.0,
|
| 345 |
+
"rollout_train_prob": 0.5,
|
| 346 |
+
"rollout_train_steps": 4,
|
| 347 |
+
"rollout_train_infer_steps": 32,
|
| 348 |
+
"rollout_train_temp": 1.45,
|
| 349 |
+
"rollout_train_max_gamma": 1.0,
|
| 350 |
+
"rollout_train_corrupt_only": true,
|
| 351 |
+
"rollout_train_samplewise": true,
|
| 352 |
+
"rollout_train_compute_always": false,
|
| 353 |
+
"rollout_train_sync_t": false,
|
| 354 |
+
"bridge_noise_init": "logistic_normal",
|
| 355 |
+
"noise_sigma": -1.0,
|
| 356 |
+
"allow_tf32": true,
|
| 357 |
+
"activation_checkpointing": false,
|
| 358 |
+
"activation_checkpoint_interval": 1,
|
| 359 |
+
"activation_checkpoint_scope": "block",
|
| 360 |
+
"ddp_static_graph": false,
|
| 361 |
+
"ddp_gradient_as_bucket_view": true,
|
| 362 |
+
"blocking_data_transfer": false,
|
| 363 |
+
"dataloader_prefetch_factor": 4,
|
| 364 |
+
"full_train_stats": false,
|
| 365 |
+
"tokenized_hf": false,
|
| 366 |
+
"tokenized_pad_token": "pad",
|
| 367 |
+
"elf_conditional_hf": false,
|
| 368 |
+
"record_pad_truncate": false,
|
| 369 |
+
"record_add_eos": false,
|
| 370 |
+
"record_add_special_tokens": false,
|
| 371 |
+
"record_pad_token": "pad",
|
| 372 |
+
"record_shuffle_buffer": 10000,
|
| 373 |
+
"wrap": true,
|
| 374 |
+
"wrap_mode": "stream",
|
| 375 |
+
"wrap_record_buffer_size": 200,
|
| 376 |
+
"owt_cached_chunks": true,
|
| 377 |
+
"owt_chunk_cache_dir": "/e2e-data/evad-tech-vla/wanghan58/data/small_benchmarks/langflow_2604_11748/openwebtext_lta_cached_chunks/gpt2_len1024_train8_compact_overfit",
|
| 378 |
+
"owt_chunk_cache_rebuild": false,
|
| 379 |
+
"owt_chunk_cache_write_batch": 4096,
|
| 380 |
+
"owt_exact_repeat_per_chunk": 64,
|
| 381 |
+
"online_chunk_shuffle": false,
|
| 382 |
+
"online_chunk_shuffle_buffer": 10000,
|
| 383 |
+
"openwebtext_split": "train_minus_100k",
|
| 384 |
+
"detokenizer": "auto",
|
| 385 |
+
"resolved_detokenizer": null,
|
| 386 |
+
"num_workers": 0,
|
| 387 |
+
"latest_every": 1000,
|
| 388 |
+
"resume_path": "runs/train8_rollin_len1024_rollin_p50_s4_i32_20260517_1840ctx1024/latest.pt"
|
| 389 |
+
}
|
| 390 |
+
step=1100 epoch=1100/2000 epoch_step=1/1 micro_steps=4400 elapsed=38.3s lr=2.000000e-03 loss=2.4945 loss_recon=2.4945 loss_meanflow=0.0000 mean_model_t=0.2092 mean_corrupt_t=0.2092 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.5055 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.3683 corrupt_frac=1.0000 acc_corrupt=0.3683 loss_corrupt=2.4945 wrong_frac=0.7908 init_acc_corrupt=0.1240 acc_corrupt_t_0p0_0p2=0.1675 corrupt_frac_t_0p0_0p2=0.5551 acc_corrupt_t_0p2_0p4=0.5746 corrupt_frac_t_0p2_0p4=0.3604 acc_corrupt_t_0p4_0p6=0.7983 corrupt_frac_t_0p4_0p6=0.0822 acc_corrupt_t_0p6_0p8=0.8784 corrupt_frac_t_0p6_0p8=0.0356 out_w_norm=11.3758 out_g_norm=0.7552 loss_all=2.5911 init_gold_top10=0.3336 init_gold_top100=0.6645 rollout_applied_pos_frac=0.5312 init_acc_rollout_applied=0.0988 init_acc_rollout_kept=0.0578 logit_acc_rollout_applied=0.3668 logit_acc_rollout_kept=0.3079
|
| 391 |
+
step=1200 epoch=1200/2000 epoch_step=1/1 micro_steps=4800 elapsed=37.6s lr=2.000000e-03 loss=2.1611 loss_recon=2.1611 loss_meanflow=0.0000 mean_model_t=0.2087 mean_corrupt_t=0.2087 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.5035 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.4426 corrupt_frac=1.0000 acc_corrupt=0.4426 loss_corrupt=2.1611 wrong_frac=0.7913 init_acc_corrupt=0.1255 acc_corrupt_t_0p0_0p2=0.2089 corrupt_frac_t_0p0_0p2=0.5574 acc_corrupt_t_0p2_0p4=0.7016 corrupt_frac_t_0p2_0p4=0.3595 acc_corrupt_t_0p4_0p6=0.8862 corrupt_frac_t_0p4_0p6=0.0807 acc_corrupt_t_0p8_1p0=0.9312 corrupt_frac_t_0p8_1p0=0.0312 out_w_norm=11.6450 out_g_norm=0.8699 acc_corrupt_t_0p6_0p8=0.9271 corrupt_frac_t_0p6_0p8=0.0357 loss_all=2.2175 init_gold_top10=0.4112 init_gold_top100=0.6921 rollout_applied_pos_frac=0.5625 init_acc_rollout_applied=0.1286 init_acc_rollout_kept=0.1085 logit_acc_rollout_applied=0.4788 logit_acc_rollout_kept=0.4031
|
| 392 |
+
step=1300 epoch=1300/2000 epoch_step=1/1 micro_steps=5200 elapsed=37.7s lr=2.000000e-03 loss=1.8630 loss_recon=1.8630 loss_meanflow=0.0000 mean_model_t=0.2118 mean_corrupt_t=0.2118 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.4957 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.5171 corrupt_frac=1.0000 acc_corrupt=0.5171 loss_corrupt=1.8630 wrong_frac=0.7884 init_acc_corrupt=0.1321 acc_corrupt_t_0p0_0p2=0.2547 corrupt_frac_t_0p0_0p2=0.5476 acc_corrupt_t_0p2_0p4=0.8082 corrupt_frac_t_0p2_0p4=0.3645 acc_corrupt_t_0p4_0p6=0.9420 corrupt_frac_t_0p4_0p6=0.0835 out_w_norm=11.8612 out_g_norm=0.9278 acc_corrupt_t_0p6_0p8=0.9599 corrupt_frac_t_0p6_0p8=0.0366 acc_corrupt_t_0p8_1p0=0.9707 corrupt_frac_t_0p8_1p0=0.0312 loss_all=1.9265 init_gold_top10=0.4385 init_gold_top100=0.6085 rollout_applied_pos_frac=0.4375 init_acc_rollout_applied=0.1980 init_acc_rollout_kept=0.0588 logit_acc_rollout_applied=0.7255 logit_acc_rollout_kept=0.3106
|
| 393 |
+
step=1400 epoch=1400/2000 epoch_step=1/1 micro_steps=5600 elapsed=37.6s lr=2.000000e-03 loss=1.6395 loss_recon=1.6395 loss_meanflow=0.0000 mean_model_t=0.2091 mean_corrupt_t=0.2091 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.5023 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.5740 corrupt_frac=1.0000 acc_corrupt=0.5740 loss_corrupt=1.6395 wrong_frac=0.7911 init_acc_corrupt=0.1323 acc_corrupt_t_0p0_0p2=0.3145 corrupt_frac_t_0p0_0p2=0.5556 acc_corrupt_t_0p2_0p4=0.8804 corrupt_frac_t_0p2_0p4=0.3572 acc_corrupt_t_0p4_0p6=0.9724 corrupt_frac_t_0p4_0p6=0.0865 out_w_norm=12.0236 out_g_norm=0.9774 acc_corrupt_t_0p6_0p8=0.9795 corrupt_frac_t_0p6_0p8=0.0366 acc_corrupt_t_0p8_1p0=0.9873 corrupt_frac_t_0p8_1p0=0.0312 loss_all=1.7346 init_gold_top10=0.3990 init_gold_top100=0.5863 rollout_applied_pos_frac=0.4062 init_acc_rollout_applied=0.1877 init_acc_rollout_kept=0.0853 logit_acc_rollout_applied=0.6870 logit_acc_rollout_kept=0.4878
|
| 394 |
+
step=1500 epoch=1500/2000 epoch_step=1/1 micro_steps=6000 elapsed=37.6s lr=2.000000e-03 loss=1.3980 loss_recon=1.3980 loss_meanflow=0.0000 mean_model_t=0.2104 mean_corrupt_t=0.2104 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.5032 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.6327 corrupt_frac=1.0000 acc_corrupt=0.6327 loss_corrupt=1.3980 wrong_frac=0.7895 init_acc_corrupt=0.1356 acc_corrupt_t_0p0_0p2=0.3876 corrupt_frac_t_0p0_0p2=0.5505 acc_corrupt_t_0p2_0p4=0.9209 corrupt_frac_t_0p2_0p4=0.3658 acc_corrupt_t_0p4_0p6=0.9857 corrupt_frac_t_0p4_0p6=0.0809 out_w_norm=12.1409 out_g_norm=0.9930 acc_corrupt_t_0p6_0p8=0.9875 corrupt_frac_t_0p6_0p8=0.0353 acc_corrupt_t_0p8_1p0=0.9740 corrupt_frac_t_0p8_1p0=0.0312 loss_all=2.0586 init_gold_top10=0.4348 init_gold_top100=0.6795 rollout_applied_pos_frac=0.5312 init_acc_rollout_applied=0.1116 init_acc_rollout_kept=0.1106 logit_acc_rollout_applied=0.5888 logit_acc_rollout_kept=0.4600
|
| 395 |
+
step=1600 epoch=1600/2000 epoch_step=1/1 micro_steps=6400 elapsed=37.5s lr=2.000000e-03 loss=1.2548 loss_recon=1.2548 loss_meanflow=0.0000 mean_model_t=0.2096 mean_corrupt_t=0.2096 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.4970 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.6669 corrupt_frac=1.0000 acc_corrupt=0.6669 loss_corrupt=1.2548 wrong_frac=0.7905 init_acc_corrupt=0.1381 acc_corrupt_t_0p0_0p2=0.4306 corrupt_frac_t_0p0_0p2=0.5511 acc_corrupt_t_0p2_0p4=0.9487 corrupt_frac_t_0p2_0p4=0.3627 out_w_norm=12.2261 out_g_norm=0.9112 acc_corrupt_t_0p4_0p6=0.9921 corrupt_frac_t_0p4_0p6=0.0831 acc_corrupt_t_0p6_0p8=0.9925 corrupt_frac_t_0p6_0p8=0.0372 acc_corrupt_t_0p8_1p0=0.9829 corrupt_frac_t_0p8_1p0=0.0312 loss_all=1.0855 init_gold_top10=0.5554 init_gold_top100=0.6855 rollout_applied_pos_frac=0.5312 init_acc_rollout_applied=0.2027 init_acc_rollout_kept=0.0989 logit_acc_rollout_applied=0.8286 logit_acc_rollout_kept=0.5480
|
| 396 |
+
step=1700 epoch=1700/2000 epoch_step=1/1 micro_steps=6800 elapsed=37.3s lr=2.000000e-03 loss=1.1174 loss_recon=1.1174 loss_meanflow=0.0000 mean_model_t=0.2102 mean_corrupt_t=0.2102 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.4981 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.6984 corrupt_frac=1.0000 acc_corrupt=0.6984 loss_corrupt=1.1174 wrong_frac=0.7899 init_acc_corrupt=0.1409 acc_corrupt_t_0p0_0p2=0.4781 corrupt_frac_t_0p0_0p2=0.5530 acc_corrupt_t_0p2_0p4=0.9650 corrupt_frac_t_0p2_0p4=0.3584 acc_corrupt_t_0p4_0p6=0.9951 corrupt_frac_t_0p4_0p6=0.0863 acc_corrupt_t_0p6_0p8=0.9940 corrupt_frac_t_0p6_0p8=0.0361 out_w_norm=12.2909 out_g_norm=0.8889 acc_corrupt_t_0p8_1p0=0.9893 corrupt_frac_t_0p8_1p0=0.0312 loss_all=1.3472 init_gold_top10=0.4938 init_gold_top100=0.6382 rollout_applied_pos_frac=0.4688 init_acc_rollout_applied=0.2066 init_acc_rollout_kept=0.1093 logit_acc_rollout_applied=0.7921 logit_acc_rollout_kept=0.5591
|
| 397 |
+
step=1800 epoch=1800/2000 epoch_step=1/1 micro_steps=7200 elapsed=37.6s lr=2.000000e-03 loss=1.0126 loss_recon=1.0126 loss_meanflow=0.0000 mean_model_t=0.2096 mean_corrupt_t=0.2096 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.5079 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.7211 corrupt_frac=1.0000 acc_corrupt=0.7211 loss_corrupt=1.0126 wrong_frac=0.7903 init_acc_corrupt=0.1416 acc_corrupt_t_0p0_0p2=0.5124 corrupt_frac_t_0p0_0p2=0.5517 acc_corrupt_t_0p2_0p4=0.9736 corrupt_frac_t_0p2_0p4=0.3616 acc_corrupt_t_0p4_0p6=0.9970 corrupt_frac_t_0p4_0p6=0.0850 out_w_norm=12.3265 out_g_norm=0.8711 acc_corrupt_t_0p6_0p8=0.9957 corrupt_frac_t_0p6_0p8=0.0338 acc_corrupt_t_0p8_1p0=0.9839 corrupt_frac_t_0p8_1p0=0.0312 loss_all=1.1469 init_gold_top10=0.5062 init_gold_top100=0.6721 rollout_applied_pos_frac=0.5312 init_acc_rollout_applied=0.2005 init_acc_rollout_kept=0.0570 logit_acc_rollout_applied=0.7336 logit_acc_rollout_kept=0.5972
|
LTA_openwebtext_dualt/logs/softendpoint_mn_pilot_4gpu/train8_rollin_len1024_rollin_p50_s4_i32_20260517_1855ctx1024bs128.log
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
LTA_openwebtext_dualt/logs/softendpoint_mn_pilot_4gpu/train8_rollin_synct_len256_synct_p50_s8_i64_20260517_1800synct.log
ADDED
|
@@ -0,0 +1,224 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
NCCL version 2.25.1+cuda12.8
|
| 2 |
+
{
|
| 3 |
+
"device": "cuda:0",
|
| 4 |
+
"rank": 0,
|
| 5 |
+
"world_size": 4,
|
| 6 |
+
"samples": "owt_cached_chunks:8",
|
| 7 |
+
"vocab_size": 969,
|
| 8 |
+
"tokenizer_vocab_size": 50257,
|
| 9 |
+
"save_dir": "runs/train8_rollin_synct_len256_synct_p50_s8_i64_20260517_1800synct",
|
| 10 |
+
"batch_size": 128,
|
| 11 |
+
"grad_accum": 1,
|
| 12 |
+
"effective_batch_size": 512,
|
| 13 |
+
"global_batch_size": 512,
|
| 14 |
+
"lr_schedule": "constant_warmup",
|
| 15 |
+
"optimizer": "muon",
|
| 16 |
+
"epochs": 0.0,
|
| 17 |
+
"steps_per_epoch": 1,
|
| 18 |
+
"total_steps": 500,
|
| 19 |
+
"warmup_steps": 10,
|
| 20 |
+
"warmup_epochs": -1.0,
|
| 21 |
+
"min_lr": 0.0,
|
| 22 |
+
"weight_decay": 0.1,
|
| 23 |
+
"output_weight_decay": -1.0,
|
| 24 |
+
"adamw_param_groups": "nanogpt",
|
| 25 |
+
"adam_beta1": 0.9,
|
| 26 |
+
"adam_beta2": 0.95,
|
| 27 |
+
"adam_eps": 1e-08,
|
| 28 |
+
"muon_impl": "legacy",
|
| 29 |
+
"muon_momentum": 0.95,
|
| 30 |
+
"muon_ns_steps": 5,
|
| 31 |
+
"muon_update_scale": 1.0,
|
| 32 |
+
"muon_nesterov": false,
|
| 33 |
+
"muon_width_scale": false,
|
| 34 |
+
"muon_grouping": "legacy_dim_ge_2",
|
| 35 |
+
"muon_param_count": 1965440,
|
| 36 |
+
"muon_adam_param_count": 8192,
|
| 37 |
+
"muon_param_names": [
|
| 38 |
+
"vocab_embed.embedding",
|
| 39 |
+
"sigma_map.net.0.weight",
|
| 40 |
+
"sigma_map.net.2.weight",
|
| 41 |
+
"blocks.0.attn_qkv.weight",
|
| 42 |
+
"blocks.0.attn_out.weight",
|
| 43 |
+
"blocks.0.mlp.0.weight",
|
| 44 |
+
"blocks.0.mlp.2.weight",
|
| 45 |
+
"blocks.0.adaLN_modulation.weight",
|
| 46 |
+
"blocks.1.attn_qkv.weight",
|
| 47 |
+
"blocks.1.attn_out.weight",
|
| 48 |
+
"blocks.1.mlp.0.weight",
|
| 49 |
+
"blocks.1.mlp.2.weight",
|
| 50 |
+
"blocks.1.adaLN_modulation.weight",
|
| 51 |
+
"blocks.2.attn_qkv.weight",
|
| 52 |
+
"blocks.2.attn_out.weight",
|
| 53 |
+
"blocks.2.mlp.0.weight",
|
| 54 |
+
"blocks.2.mlp.2.weight",
|
| 55 |
+
"blocks.2.adaLN_modulation.weight",
|
| 56 |
+
"output_layer.linear.weight",
|
| 57 |
+
"output_layer.adaLN_modulation.weight"
|
| 58 |
+
],
|
| 59 |
+
"muon_adam_param_names": [
|
| 60 |
+
"sigma_map.net.0.bias",
|
| 61 |
+
"sigma_map.net.2.bias",
|
| 62 |
+
"blocks.0.norm1.weight",
|
| 63 |
+
"blocks.0.norm2.weight",
|
| 64 |
+
"blocks.0.mlp.0.bias",
|
| 65 |
+
"blocks.0.mlp.2.bias",
|
| 66 |
+
"blocks.0.adaLN_modulation.bias",
|
| 67 |
+
"blocks.1.norm1.weight",
|
| 68 |
+
"blocks.1.norm2.weight",
|
| 69 |
+
"blocks.1.mlp.0.bias",
|
| 70 |
+
"blocks.1.mlp.2.bias",
|
| 71 |
+
"blocks.1.adaLN_modulation.bias",
|
| 72 |
+
"blocks.2.norm1.weight",
|
| 73 |
+
"blocks.2.norm2.weight",
|
| 74 |
+
"blocks.2.mlp.0.bias",
|
| 75 |
+
"blocks.2.mlp.2.bias",
|
| 76 |
+
"blocks.2.adaLN_modulation.bias",
|
| 77 |
+
"output_layer.norm_final.weight",
|
| 78 |
+
"output_layer.adaLN_modulation.bias"
|
| 79 |
+
],
|
| 80 |
+
"muon_effective_nesterov": false,
|
| 81 |
+
"muon_effective_width_scale": false,
|
| 82 |
+
"muon_effective_weight_decay": 0.1,
|
| 83 |
+
"muon_adam_fallback_nesterov": false,
|
| 84 |
+
"muon_adam_fallback_weight_decay": 0.1,
|
| 85 |
+
"ema_decay": 0.9999,
|
| 86 |
+
"ema_start_step": 0,
|
| 87 |
+
"model_type": "ddit",
|
| 88 |
+
"ddit_mlp_type": "gelu",
|
| 89 |
+
"elf_num_time_tokens": 4,
|
| 90 |
+
"elf_num_model_mode_tokens": 0,
|
| 91 |
+
"qk_norm": true,
|
| 92 |
+
"output_bias": false,
|
| 93 |
+
"output_init_std": -1.0,
|
| 94 |
+
"norm_type": "rmsnorm",
|
| 95 |
+
"target_loss": "hard_ce",
|
| 96 |
+
"linear_soft_target_power": 1.0,
|
| 97 |
+
"linear_soft_target_min_conf": 0.0,
|
| 98 |
+
"linear_soft_target_max_conf": 1.0,
|
| 99 |
+
"t_sampling_mode": "logit_normal",
|
| 100 |
+
"t_sampling_power": 1.0,
|
| 101 |
+
"t_sampling_eps": 0.0001,
|
| 102 |
+
"t_sampling_logit_mean": -1.5,
|
| 103 |
+
"t_sampling_logit_std": 0.8,
|
| 104 |
+
"dual_t": true,
|
| 105 |
+
"corrupt_t_mode": "same",
|
| 106 |
+
"corrupt_min_t": 0.0,
|
| 107 |
+
"corrupt_max_t": 1.0,
|
| 108 |
+
"prefix_block_prob": 0.0,
|
| 109 |
+
"prefix_block_len": 128,
|
| 110 |
+
"mask_ratio_floor_schedule": "none",
|
| 111 |
+
"dirichlet_endpoint_mode": "categorical_dual_t",
|
| 112 |
+
"dirichlet_semantic_t_mode": "same",
|
| 113 |
+
"dirichlet_semantic_t_value": 0.0,
|
| 114 |
+
"dirichlet_semantic_t_curve": "linear",
|
| 115 |
+
"dirichlet_semantic_t_power": 1.0,
|
| 116 |
+
"endpoint_sequence_random_prob_alpha": 0.0,
|
| 117 |
+
"categorical_wrong_from_full_vocab": true,
|
| 118 |
+
"categorical_wrong_from_batch_valid_tokens": false,
|
| 119 |
+
"categorical_wrong_basin_token_ids": "",
|
| 120 |
+
"categorical_wrong_basin_prob": 0.0,
|
| 121 |
+
"categorical_wrong_unigram_prob": 0.0,
|
| 122 |
+
"categorical_wrong_uniform_prob": 0.0,
|
| 123 |
+
"categorical_wrong_prob_floor": 0.0,
|
| 124 |
+
"categorical_wrong_corpus_unigram_path": "",
|
| 125 |
+
"categorical_wrong_corpus_unigram_alpha": 1.0,
|
| 126 |
+
"categorical_wrong_basin_shared_prob": 0.0,
|
| 127 |
+
"categorical_wrong_unigram_shared_prob": 0.0,
|
| 128 |
+
"mask_mixture_original_prob": 0.0,
|
| 129 |
+
"mask_mixture_lowk_prob": 0.0,
|
| 130 |
+
"mask_mixture_lowcorrupt_prob": 0.0,
|
| 131 |
+
"mask_mixture_block_prob": 0.0,
|
| 132 |
+
"mask_mixture_all_prob": 1.0,
|
| 133 |
+
"mask_mixture_lowk_clean_tokens": "0",
|
| 134 |
+
"mask_mixture_lowcorrupt_tokens": "1,2,4,8,16,32,64",
|
| 135 |
+
"mask_mixture_block_tokens": "64,128",
|
| 136 |
+
"simplex_bridge_sampler": "dirichlet",
|
| 137 |
+
"logistic_normal_sigma_min": 0.1,
|
| 138 |
+
"logistic_normal_sigma_max": 1.0,
|
| 139 |
+
"logistic_normal_tau_min": 1.0,
|
| 140 |
+
"logistic_normal_tau_max": 1.0,
|
| 141 |
+
"torch_compile": false,
|
| 142 |
+
"compile_mode": "max-autotune",
|
| 143 |
+
"state_format": "prob",
|
| 144 |
+
"meanflow_weight": 0.0,
|
| 145 |
+
"rollout_train_prob": 0.5,
|
| 146 |
+
"rollout_train_steps": 8,
|
| 147 |
+
"rollout_train_infer_steps": 64,
|
| 148 |
+
"rollout_train_temp": 1.45,
|
| 149 |
+
"rollout_train_max_gamma": 1.0,
|
| 150 |
+
"rollout_train_corrupt_only": true,
|
| 151 |
+
"rollout_train_samplewise": true,
|
| 152 |
+
"rollout_train_compute_always": false,
|
| 153 |
+
"rollout_train_sync_t": true,
|
| 154 |
+
"bridge_noise_init": "logistic_normal",
|
| 155 |
+
"noise_sigma": -1.0,
|
| 156 |
+
"allow_tf32": true,
|
| 157 |
+
"activation_checkpointing": false,
|
| 158 |
+
"activation_checkpoint_interval": 1,
|
| 159 |
+
"activation_checkpoint_scope": "block",
|
| 160 |
+
"ddp_static_graph": false,
|
| 161 |
+
"ddp_gradient_as_bucket_view": true,
|
| 162 |
+
"blocking_data_transfer": false,
|
| 163 |
+
"dataloader_prefetch_factor": 4,
|
| 164 |
+
"full_train_stats": false,
|
| 165 |
+
"tokenized_hf": false,
|
| 166 |
+
"tokenized_pad_token": "pad",
|
| 167 |
+
"elf_conditional_hf": false,
|
| 168 |
+
"record_pad_truncate": false,
|
| 169 |
+
"record_add_eos": false,
|
| 170 |
+
"record_add_special_tokens": false,
|
| 171 |
+
"record_pad_token": "pad",
|
| 172 |
+
"record_shuffle_buffer": 10000,
|
| 173 |
+
"wrap": true,
|
| 174 |
+
"wrap_mode": "stream",
|
| 175 |
+
"wrap_record_buffer_size": 200,
|
| 176 |
+
"owt_cached_chunks": true,
|
| 177 |
+
"owt_chunk_cache_dir": "/e2e-data/evad-tech-vla/wanghan58/data/small_benchmarks/langflow_2604_11748/openwebtext_lta_cached_chunks/gpt2_len256_train8_compact_overfit",
|
| 178 |
+
"owt_chunk_cache_rebuild": false,
|
| 179 |
+
"owt_chunk_cache_write_batch": 4096,
|
| 180 |
+
"owt_exact_repeat_per_chunk": 64,
|
| 181 |
+
"online_chunk_shuffle": false,
|
| 182 |
+
"online_chunk_shuffle_buffer": 10000,
|
| 183 |
+
"openwebtext_split": "train_minus_100k",
|
| 184 |
+
"detokenizer": "auto",
|
| 185 |
+
"resolved_detokenizer": null,
|
| 186 |
+
"num_workers": 0,
|
| 187 |
+
"latest_every": 500,
|
| 188 |
+
"resume_path": ""
|
| 189 |
+
}
|
| 190 |
+
step=100 epoch=100/500 epoch_step=1/1 micro_steps=100 elapsed=11.2s lr=2.000000e-03 loss=6.7066 loss_recon=6.7066 loss_meanflow=0.0000 mean_model_t=0.2083 mean_corrupt_t=0.2083 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.5128 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.0995 corrupt_frac=1.0000 acc_corrupt=0.0995 loss_corrupt=6.7066 wrong_frac=0.7915 init_acc_corrupt=0.1159 acc_corrupt_t_0p0_0p2=0.0486 corrupt_frac_t_0p0_0p2=0.5559 acc_corrupt_t_0p2_0p4=0.1326 corrupt_frac_t_0p2_0p4=0.3589 acc_corrupt_t_0p4_0p6=0.2811 corrupt_frac_t_0p4_0p6=0.0773 acc_corrupt_t_0p6_0p8=0.4045 corrupt_frac_t_0p6_0p8=0.0121 out_w_norm=1.0999 out_g_norm=1.0064 loss_all=6.4488 init_gold_top10=0.2091 init_gold_top100=0.4887 rollout_applied_pos_frac=0.4844 init_acc_rollout_applied=0.1132 init_acc_rollout_kept=0.1206 logit_acc_rollout_applied=0.1071 logit_acc_rollout_kept=0.0991
|
| 191 |
+
W0517 17:58:23.621000 251950 torch/distributed/elastic/agent/server/api.py:719] Received 15 death signal, shutting down workers
|
| 192 |
+
W0517 17:58:23.623000 251950 torch/distributed/elastic/multiprocessing/api.py:898] Sending process 251955 closing signal SIGTERM
|
| 193 |
+
W0517 17:58:23.624000 251950 torch/distributed/elastic/multiprocessing/api.py:898] Sending process 251956 closing signal SIGTERM
|
| 194 |
+
W0517 17:58:23.624000 251950 torch/distributed/elastic/multiprocessing/api.py:898] Sending process 251957 closing signal SIGTERM
|
| 195 |
+
W0517 17:58:23.625000 251950 torch/distributed/elastic/multiprocessing/api.py:898] Sending process 251958 closing signal SIGTERM
|
| 196 |
+
Traceback (most recent call last):
|
| 197 |
+
File "<frozen runpy>", line 198, in _run_module_as_main
|
| 198 |
+
File "<frozen runpy>", line 88, in _run_code
|
| 199 |
+
File "/usr/local/lib/python3.12/dist-packages/torch/distributed/run.py", line 922, in <module>
|
| 200 |
+
main()
|
| 201 |
+
File "/usr/local/lib/python3.12/dist-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 355, in wrapper
|
| 202 |
+
return f(*args, **kwargs)
|
| 203 |
+
^^^^^^^^^^^^^^^^^^
|
| 204 |
+
File "/usr/local/lib/python3.12/dist-packages/torch/distributed/run.py", line 918, in main
|
| 205 |
+
run(args)
|
| 206 |
+
File "/usr/local/lib/python3.12/dist-packages/torch/distributed/run.py", line 909, in run
|
| 207 |
+
elastic_launch(
|
| 208 |
+
File "/usr/local/lib/python3.12/dist-packages/torch/distributed/launcher/api.py", line 139, in __call__
|
| 209 |
+
return launch_agent(self._config, self._entrypoint, list(args))
|
| 210 |
+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 211 |
+
File "/usr/local/lib/python3.12/dist-packages/torch/distributed/launcher/api.py", line 261, in launch_agent
|
| 212 |
+
result = agent.run()
|
| 213 |
+
^^^^^^^^^^^
|
| 214 |
+
File "/usr/local/lib/python3.12/dist-packages/torch/distributed/elastic/metrics/api.py", line 137, in wrapper
|
| 215 |
+
result = f(*args, **kwargs)
|
| 216 |
+
^^^^^^^^^^^^^^^^^^
|
| 217 |
+
File "/usr/local/lib/python3.12/dist-packages/torch/distributed/elastic/agent/server/api.py", line 711, in run
|
| 218 |
+
result = self._invoke_run(role)
|
| 219 |
+
^^^^^^^^^^^^^^^^^^^^^^
|
| 220 |
+
File "/usr/local/lib/python3.12/dist-packages/torch/distributed/elastic/agent/server/api.py", line 870, in _invoke_run
|
| 221 |
+
time.sleep(monitor_interval)
|
| 222 |
+
File "/usr/local/lib/python3.12/dist-packages/torch/distributed/elastic/multiprocessing/api.py", line 84, in _terminate_process_handler
|
| 223 |
+
raise SignalException(f"Process {os.getpid()} got signal: {sigval}", sigval=sigval)
|
| 224 |
+
torch.distributed.elastic.multiprocessing.api.SignalException: Process 251950 got signal: 15
|
LTA_openwebtext_dualt/logs/softendpoint_mn_pilot_4gpu/train8_wrongfloor_len256_wrongfloor0p3_20260517_1815wrongfloor.log
ADDED
|
@@ -0,0 +1,199 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
NCCL version 2.25.1+cuda12.8
|
| 2 |
+
{
|
| 3 |
+
"device": "cuda:0",
|
| 4 |
+
"rank": 0,
|
| 5 |
+
"world_size": 4,
|
| 6 |
+
"samples": "owt_cached_chunks:8",
|
| 7 |
+
"vocab_size": 969,
|
| 8 |
+
"tokenizer_vocab_size": 50257,
|
| 9 |
+
"save_dir": "runs/train8_wrongfloor_len256_wrongfloor0p3_20260517_1815wrongfloor",
|
| 10 |
+
"batch_size": 128,
|
| 11 |
+
"grad_accum": 1,
|
| 12 |
+
"effective_batch_size": 512,
|
| 13 |
+
"global_batch_size": 512,
|
| 14 |
+
"lr_schedule": "constant_warmup",
|
| 15 |
+
"optimizer": "muon",
|
| 16 |
+
"epochs": 0.0,
|
| 17 |
+
"steps_per_epoch": 1,
|
| 18 |
+
"total_steps": 1000,
|
| 19 |
+
"warmup_steps": 10,
|
| 20 |
+
"warmup_epochs": -1.0,
|
| 21 |
+
"min_lr": 0.0,
|
| 22 |
+
"weight_decay": 0.1,
|
| 23 |
+
"output_weight_decay": -1.0,
|
| 24 |
+
"adamw_param_groups": "nanogpt",
|
| 25 |
+
"adam_beta1": 0.9,
|
| 26 |
+
"adam_beta2": 0.95,
|
| 27 |
+
"adam_eps": 1e-08,
|
| 28 |
+
"muon_impl": "legacy",
|
| 29 |
+
"muon_momentum": 0.95,
|
| 30 |
+
"muon_ns_steps": 5,
|
| 31 |
+
"muon_update_scale": 1.0,
|
| 32 |
+
"muon_nesterov": false,
|
| 33 |
+
"muon_width_scale": false,
|
| 34 |
+
"muon_grouping": "legacy_dim_ge_2",
|
| 35 |
+
"muon_param_count": 1965440,
|
| 36 |
+
"muon_adam_param_count": 8192,
|
| 37 |
+
"muon_param_names": [
|
| 38 |
+
"vocab_embed.embedding",
|
| 39 |
+
"sigma_map.net.0.weight",
|
| 40 |
+
"sigma_map.net.2.weight",
|
| 41 |
+
"blocks.0.attn_qkv.weight",
|
| 42 |
+
"blocks.0.attn_out.weight",
|
| 43 |
+
"blocks.0.mlp.0.weight",
|
| 44 |
+
"blocks.0.mlp.2.weight",
|
| 45 |
+
"blocks.0.adaLN_modulation.weight",
|
| 46 |
+
"blocks.1.attn_qkv.weight",
|
| 47 |
+
"blocks.1.attn_out.weight",
|
| 48 |
+
"blocks.1.mlp.0.weight",
|
| 49 |
+
"blocks.1.mlp.2.weight",
|
| 50 |
+
"blocks.1.adaLN_modulation.weight",
|
| 51 |
+
"blocks.2.attn_qkv.weight",
|
| 52 |
+
"blocks.2.attn_out.weight",
|
| 53 |
+
"blocks.2.mlp.0.weight",
|
| 54 |
+
"blocks.2.mlp.2.weight",
|
| 55 |
+
"blocks.2.adaLN_modulation.weight",
|
| 56 |
+
"output_layer.linear.weight",
|
| 57 |
+
"output_layer.adaLN_modulation.weight"
|
| 58 |
+
],
|
| 59 |
+
"muon_adam_param_names": [
|
| 60 |
+
"sigma_map.net.0.bias",
|
| 61 |
+
"sigma_map.net.2.bias",
|
| 62 |
+
"blocks.0.norm1.weight",
|
| 63 |
+
"blocks.0.norm2.weight",
|
| 64 |
+
"blocks.0.mlp.0.bias",
|
| 65 |
+
"blocks.0.mlp.2.bias",
|
| 66 |
+
"blocks.0.adaLN_modulation.bias",
|
| 67 |
+
"blocks.1.norm1.weight",
|
| 68 |
+
"blocks.1.norm2.weight",
|
| 69 |
+
"blocks.1.mlp.0.bias",
|
| 70 |
+
"blocks.1.mlp.2.bias",
|
| 71 |
+
"blocks.1.adaLN_modulation.bias",
|
| 72 |
+
"blocks.2.norm1.weight",
|
| 73 |
+
"blocks.2.norm2.weight",
|
| 74 |
+
"blocks.2.mlp.0.bias",
|
| 75 |
+
"blocks.2.mlp.2.bias",
|
| 76 |
+
"blocks.2.adaLN_modulation.bias",
|
| 77 |
+
"output_layer.norm_final.weight",
|
| 78 |
+
"output_layer.adaLN_modulation.bias"
|
| 79 |
+
],
|
| 80 |
+
"muon_effective_nesterov": false,
|
| 81 |
+
"muon_effective_width_scale": false,
|
| 82 |
+
"muon_effective_weight_decay": 0.1,
|
| 83 |
+
"muon_adam_fallback_nesterov": false,
|
| 84 |
+
"muon_adam_fallback_weight_decay": 0.1,
|
| 85 |
+
"ema_decay": 0.9999,
|
| 86 |
+
"ema_start_step": 0,
|
| 87 |
+
"model_type": "ddit",
|
| 88 |
+
"ddit_mlp_type": "gelu",
|
| 89 |
+
"elf_num_time_tokens": 4,
|
| 90 |
+
"elf_num_model_mode_tokens": 0,
|
| 91 |
+
"qk_norm": true,
|
| 92 |
+
"output_bias": false,
|
| 93 |
+
"output_init_std": -1.0,
|
| 94 |
+
"norm_type": "rmsnorm",
|
| 95 |
+
"target_loss": "hard_ce",
|
| 96 |
+
"linear_soft_target_power": 1.0,
|
| 97 |
+
"linear_soft_target_min_conf": 0.0,
|
| 98 |
+
"linear_soft_target_max_conf": 1.0,
|
| 99 |
+
"t_sampling_mode": "logit_normal",
|
| 100 |
+
"t_sampling_power": 1.0,
|
| 101 |
+
"t_sampling_eps": 0.0001,
|
| 102 |
+
"t_sampling_logit_mean": -1.5,
|
| 103 |
+
"t_sampling_logit_std": 0.8,
|
| 104 |
+
"dual_t": true,
|
| 105 |
+
"corrupt_t_mode": "same",
|
| 106 |
+
"corrupt_min_t": 0.0,
|
| 107 |
+
"corrupt_max_t": 1.0,
|
| 108 |
+
"prefix_block_prob": 0.0,
|
| 109 |
+
"prefix_block_len": 128,
|
| 110 |
+
"mask_ratio_floor_schedule": "none",
|
| 111 |
+
"dirichlet_endpoint_mode": "categorical_dual_t",
|
| 112 |
+
"dirichlet_semantic_t_mode": "same",
|
| 113 |
+
"dirichlet_semantic_t_value": 0.0,
|
| 114 |
+
"dirichlet_semantic_t_curve": "linear",
|
| 115 |
+
"dirichlet_semantic_t_power": 1.0,
|
| 116 |
+
"endpoint_sequence_random_prob_alpha": 0.0,
|
| 117 |
+
"categorical_wrong_from_full_vocab": true,
|
| 118 |
+
"categorical_wrong_from_batch_valid_tokens": false,
|
| 119 |
+
"categorical_wrong_basin_token_ids": "",
|
| 120 |
+
"categorical_wrong_basin_prob": 0.0,
|
| 121 |
+
"categorical_wrong_unigram_prob": 0.0,
|
| 122 |
+
"categorical_wrong_uniform_prob": 0.0,
|
| 123 |
+
"categorical_wrong_prob_floor": 0.3,
|
| 124 |
+
"categorical_wrong_corpus_unigram_path": "",
|
| 125 |
+
"categorical_wrong_corpus_unigram_alpha": 1.0,
|
| 126 |
+
"categorical_wrong_basin_shared_prob": 0.0,
|
| 127 |
+
"categorical_wrong_unigram_shared_prob": 0.0,
|
| 128 |
+
"mask_mixture_original_prob": 0.0,
|
| 129 |
+
"mask_mixture_lowk_prob": 0.0,
|
| 130 |
+
"mask_mixture_lowcorrupt_prob": 0.0,
|
| 131 |
+
"mask_mixture_block_prob": 0.0,
|
| 132 |
+
"mask_mixture_all_prob": 1.0,
|
| 133 |
+
"mask_mixture_lowk_clean_tokens": "0",
|
| 134 |
+
"mask_mixture_lowcorrupt_tokens": "1,2,4,8,16,32,64",
|
| 135 |
+
"mask_mixture_block_tokens": "64,128",
|
| 136 |
+
"simplex_bridge_sampler": "dirichlet",
|
| 137 |
+
"logistic_normal_sigma_min": 0.1,
|
| 138 |
+
"logistic_normal_sigma_max": 1.0,
|
| 139 |
+
"logistic_normal_tau_min": 1.0,
|
| 140 |
+
"logistic_normal_tau_max": 1.0,
|
| 141 |
+
"torch_compile": false,
|
| 142 |
+
"compile_mode": "max-autotune",
|
| 143 |
+
"state_format": "prob",
|
| 144 |
+
"meanflow_weight": 0.0,
|
| 145 |
+
"rollout_train_prob": 0.0,
|
| 146 |
+
"rollout_train_steps": 1,
|
| 147 |
+
"rollout_train_infer_steps": 64,
|
| 148 |
+
"rollout_train_temp": 1.45,
|
| 149 |
+
"rollout_train_max_gamma": 1.0,
|
| 150 |
+
"rollout_train_corrupt_only": true,
|
| 151 |
+
"rollout_train_samplewise": false,
|
| 152 |
+
"rollout_train_compute_always": false,
|
| 153 |
+
"rollout_train_sync_t": false,
|
| 154 |
+
"bridge_noise_init": "logistic_normal",
|
| 155 |
+
"noise_sigma": -1.0,
|
| 156 |
+
"allow_tf32": true,
|
| 157 |
+
"activation_checkpointing": false,
|
| 158 |
+
"activation_checkpoint_interval": 1,
|
| 159 |
+
"activation_checkpoint_scope": "block",
|
| 160 |
+
"ddp_static_graph": false,
|
| 161 |
+
"ddp_gradient_as_bucket_view": true,
|
| 162 |
+
"blocking_data_transfer": false,
|
| 163 |
+
"dataloader_prefetch_factor": 4,
|
| 164 |
+
"full_train_stats": false,
|
| 165 |
+
"tokenized_hf": false,
|
| 166 |
+
"tokenized_pad_token": "pad",
|
| 167 |
+
"elf_conditional_hf": false,
|
| 168 |
+
"record_pad_truncate": false,
|
| 169 |
+
"record_add_eos": false,
|
| 170 |
+
"record_add_special_tokens": false,
|
| 171 |
+
"record_pad_token": "pad",
|
| 172 |
+
"record_shuffle_buffer": 10000,
|
| 173 |
+
"wrap": true,
|
| 174 |
+
"wrap_mode": "stream",
|
| 175 |
+
"wrap_record_buffer_size": 200,
|
| 176 |
+
"owt_cached_chunks": true,
|
| 177 |
+
"owt_chunk_cache_dir": "/e2e-data/evad-tech-vla/wanghan58/data/small_benchmarks/langflow_2604_11748/openwebtext_lta_cached_chunks/gpt2_len256_train8_compact_overfit",
|
| 178 |
+
"owt_chunk_cache_rebuild": false,
|
| 179 |
+
"owt_chunk_cache_write_batch": 4096,
|
| 180 |
+
"owt_exact_repeat_per_chunk": 64,
|
| 181 |
+
"online_chunk_shuffle": false,
|
| 182 |
+
"online_chunk_shuffle_buffer": 10000,
|
| 183 |
+
"openwebtext_split": "train_minus_100k",
|
| 184 |
+
"detokenizer": "auto",
|
| 185 |
+
"resolved_detokenizer": null,
|
| 186 |
+
"num_workers": 0,
|
| 187 |
+
"latest_every": 1000,
|
| 188 |
+
"resume_path": ""
|
| 189 |
+
}
|
| 190 |
+
step=100 epoch=100/1000 epoch_step=1/1 micro_steps=100 elapsed=4.4s lr=2.000000e-03 loss=6.7048 loss_recon=6.7048 loss_meanflow=0.0000 mean_model_t=0.2082 mean_corrupt_t=0.2082 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.0998 corrupt_frac=1.0000 acc_corrupt=0.0998 loss_corrupt=6.7048 wrong_frac=0.7918 init_acc_corrupt=0.1152 acc_corrupt_t_0p0_0p2=0.0485 corrupt_frac_t_0p0_0p2=0.5588 acc_corrupt_t_0p2_0p4=0.1343 corrupt_frac_t_0p2_0p4=0.3579 acc_corrupt_t_0p4_0p6=0.2822 corrupt_frac_t_0p4_0p6=0.0752 acc_corrupt_t_0p6_0p8=0.4246 corrupt_frac_t_0p6_0p8=0.0125 out_w_norm=1.0992 out_g_norm=1.0008 acc_corrupt_t_0p8_1p0=0.4414 corrupt_frac_t_0p8_1p0=0.0078 loss_all=6.4736 init_gold_top10=0.1887 init_gold_top100=0.4165
|
| 191 |
+
step=200 epoch=200/1000 epoch_step=1/1 micro_steps=200 elapsed=3.7s lr=2.000000e-03 loss=6.1011 loss_recon=6.1011 loss_meanflow=0.0000 mean_model_t=0.2081 mean_corrupt_t=0.2081 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.1091 corrupt_frac=1.0000 acc_corrupt=0.1091 loss_corrupt=6.1011 wrong_frac=0.7926 init_acc_corrupt=0.1151 acc_corrupt_t_0p0_0p2=0.0546 corrupt_frac_t_0p0_0p2=0.5577 acc_corrupt_t_0p2_0p4=0.1490 corrupt_frac_t_0p2_0p4=0.3608 acc_corrupt_t_0p4_0p6=0.2945 corrupt_frac_t_0p4_0p6=0.0734 acc_corrupt_t_0p6_0p8=0.4062 corrupt_frac_t_0p6_0p8=0.0119 out_w_norm=3.3219 out_g_norm=1.4082 acc_corrupt_t_0p8_1p0=0.4609 corrupt_frac_t_0p8_1p0=0.0078 loss_all=5.8429 init_gold_top10=0.1850 init_gold_top100=0.4143
|
| 192 |
+
step=300 epoch=300/1000 epoch_step=1/1 micro_steps=300 elapsed=3.7s lr=2.000000e-03 loss=5.5386 loss_recon=5.5386 loss_meanflow=0.0000 mean_model_t=0.2117 mean_corrupt_t=0.2117 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.1243 corrupt_frac=1.0000 acc_corrupt=0.1243 loss_corrupt=5.5386 wrong_frac=0.7885 init_acc_corrupt=0.1197 acc_corrupt_t_0p0_0p2=0.0591 corrupt_frac_t_0p0_0p2=0.5484 acc_corrupt_t_0p2_0p4=0.1681 corrupt_frac_t_0p2_0p4=0.3611 acc_corrupt_t_0p4_0p6=0.3285 corrupt_frac_t_0p4_0p6=0.0803 acc_corrupt_t_0p6_0p8=0.4765 corrupt_frac_t_0p6_0p8=0.0139 out_w_norm=5.2102 out_g_norm=0.7079 acc_corrupt_t_0p8_1p0=0.4980 corrupt_frac_t_0p8_1p0=0.0078 loss_all=5.2636 init_gold_top10=0.2059 init_gold_top100=0.4209
|
| 193 |
+
step=400 epoch=400/1000 epoch_step=1/1 micro_steps=400 elapsed=3.7s lr=2.000000e-03 loss=5.0143 loss_recon=5.0143 loss_meanflow=0.0000 mean_model_t=0.2073 mean_corrupt_t=0.2073 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.1462 corrupt_frac=1.0000 acc_corrupt=0.1462 loss_corrupt=5.0143 wrong_frac=0.7928 init_acc_corrupt=0.1147 acc_corrupt_t_0p0_0p2=0.0638 corrupt_frac_t_0p0_0p2=0.5613 acc_corrupt_t_0p2_0p4=0.2018 corrupt_frac_t_0p2_0p4=0.3571 acc_corrupt_t_0p4_0p6=0.4463 corrupt_frac_t_0p4_0p6=0.0727 acc_corrupt_t_0p6_0p8=0.6573 corrupt_frac_t_0p6_0p8=0.0126 out_w_norm=6.8955 out_g_norm=0.4054 loss_all=4.6717 init_gold_top10=0.2018 init_gold_top100=0.4261
|
| 194 |
+
step=500 epoch=500/1000 epoch_step=1/1 micro_steps=500 elapsed=3.7s lr=2.000000e-03 loss=4.2633 loss_recon=4.2633 loss_meanflow=0.0000 mean_model_t=0.2097 mean_corrupt_t=0.2097 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.1848 corrupt_frac=1.0000 acc_corrupt=0.1848 loss_corrupt=4.2633 wrong_frac=0.7905 init_acc_corrupt=0.1175 acc_corrupt_t_0p0_0p2=0.0726 corrupt_frac_t_0p0_0p2=0.5540 acc_corrupt_t_0p2_0p4=0.2715 corrupt_frac_t_0p2_0p4=0.3605 acc_corrupt_t_0p4_0p6=0.5261 corrupt_frac_t_0p4_0p6=0.0745 out_w_norm=8.4270 out_g_norm=0.4608 acc_corrupt_t_0p6_0p8=0.6848 corrupt_frac_t_0p6_0p8=0.0139 acc_corrupt_t_0p8_1p0=0.7448 corrupt_frac_t_0p8_1p0=0.0078 loss_all=3.8613 init_gold_top10=0.2070 init_gold_top100=0.4169
|
| 195 |
+
step=600 epoch=600/1000 epoch_step=1/1 micro_steps=600 elapsed=3.7s lr=2.000000e-03 loss=3.4251 loss_recon=3.4251 loss_meanflow=0.0000 mean_model_t=0.2089 mean_corrupt_t=0.2089 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.2389 corrupt_frac=1.0000 acc_corrupt=0.2389 loss_corrupt=3.4251 wrong_frac=0.7911 init_acc_corrupt=0.1167 acc_corrupt_t_0p0_0p2=0.0945 corrupt_frac_t_0p0_0p2=0.5598 acc_corrupt_t_0p2_0p4=0.3712 corrupt_frac_t_0p2_0p4=0.3568 acc_corrupt_t_0p4_0p6=0.6290 corrupt_frac_t_0p4_0p6=0.0750 acc_corrupt_t_0p6_0p8=0.7620 corrupt_frac_t_0p6_0p8=0.0120 out_w_norm=9.5765 out_g_norm=0.5181 acc_corrupt_t_0p8_1p0=0.7689 corrupt_frac_t_0p8_1p0=0.0094 loss_all=3.1182 init_gold_top10=0.1900 init_gold_top100=0.4196
|
| 196 |
+
step=700 epoch=700/1000 epoch_step=1/1 micro_steps=700 elapsed=3.7s lr=2.000000e-03 loss=2.6250 loss_recon=2.6250 loss_meanflow=0.0000 mean_model_t=0.2095 mean_corrupt_t=0.2095 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.3370 corrupt_frac=1.0000 acc_corrupt=0.3370 loss_corrupt=2.6250 wrong_frac=0.7907 init_acc_corrupt=0.1176 acc_corrupt_t_0p0_0p2=0.1399 corrupt_frac_t_0p0_0p2=0.5552 acc_corrupt_t_0p2_0p4=0.5298 corrupt_frac_t_0p2_0p4=0.3584 acc_corrupt_t_0p4_0p6=0.7932 corrupt_frac_t_0p4_0p6=0.0764 out_w_norm=10.2365 out_g_norm=0.6321 acc_corrupt_t_0p6_0p8=0.8852 corrupt_frac_t_0p6_0p8=0.0132 acc_corrupt_t_0p8_1p0=0.8451 corrupt_frac_t_0p8_1p0=0.0078 loss_all=2.2547 init_gold_top10=0.1937 init_gold_top100=0.4183
|
| 197 |
+
step=800 epoch=800/1000 epoch_step=1/1 micro_steps=800 elapsed=3.7s lr=2.000000e-03 loss=1.8240 loss_recon=1.8240 loss_meanflow=0.0000 mean_model_t=0.2103 mean_corrupt_t=0.2103 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.4941 corrupt_frac=1.0000 acc_corrupt=0.4941 loss_corrupt=1.8240 wrong_frac=0.7898 init_acc_corrupt=0.1186 acc_corrupt_t_0p0_0p2=0.2539 corrupt_frac_t_0p0_0p2=0.5525 acc_corrupt_t_0p2_0p4=0.7542 corrupt_frac_t_0p2_0p4=0.3592 acc_corrupt_t_0p4_0p6=0.9359 corrupt_frac_t_0p4_0p6=0.0800 acc_corrupt_t_0p6_0p8=0.9643 corrupt_frac_t_0p6_0p8=0.0141 out_w_norm=10.6284 out_g_norm=0.8746 acc_corrupt_t_0p8_1p0=0.9531 corrupt_frac_t_0p8_1p0=0.0078 loss_all=1.4551 init_gold_top10=0.2038 init_gold_top100=0.4186
|
| 198 |
+
step=900 epoch=900/1000 epoch_step=1/1 micro_steps=900 elapsed=3.7s lr=2.000000e-03 loss=1.1915 loss_recon=1.1915 loss_meanflow=0.0000 mean_model_t=0.2094 mean_corrupt_t=0.2094 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.6347 corrupt_frac=1.0000 acc_corrupt=0.6347 loss_corrupt=1.1915 wrong_frac=0.7907 init_acc_corrupt=0.1180 acc_corrupt_t_0p0_0p2=0.3964 corrupt_frac_t_0p0_0p2=0.5584 acc_corrupt_t_0p2_0p4=0.9225 corrupt_frac_t_0p2_0p4=0.3530 acc_corrupt_t_0p4_0p6=0.9890 corrupt_frac_t_0p4_0p6=0.0801 acc_corrupt_t_0p6_0p8=0.9912 corrupt_frac_t_0p6_0p8=0.0125 out_w_norm=10.9390 out_g_norm=0.9790 acc_corrupt_t_0p8_1p0=0.9727 corrupt_frac_t_0p8_1p0=0.0078 loss_all=1.0933 init_gold_top10=0.2031 init_gold_top100=0.4228
|
| 199 |
+
step=1000 epoch=1000/1000 epoch_step=1/1 micro_steps=1000 elapsed=3.7s lr=2.000000e-03 loss=0.8912 loss_recon=0.8912 loss_meanflow=0.0000 mean_model_t=0.2094 mean_corrupt_t=0.2094 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.7007 corrupt_frac=1.0000 acc_corrupt=0.7007 loss_corrupt=0.8912 wrong_frac=0.7908 init_acc_corrupt=0.1167 acc_corrupt_t_0p0_0p2=0.4763 corrupt_frac_t_0p0_0p2=0.5561 acc_corrupt_t_0p2_0p4=0.9778 corrupt_frac_t_0p2_0p4=0.3595 acc_corrupt_t_0p4_0p6=0.9982 corrupt_frac_t_0p4_0p6=0.0759 acc_corrupt_t_0p6_0p8=0.9958 corrupt_frac_t_0p6_0p8=0.0129 out_w_norm=11.1022 out_g_norm=0.9477 acc_corrupt_t_0p8_1p0=0.9805 corrupt_frac_t_0p8_1p0=0.0078 loss_all=0.8570 init_gold_top10=0.2021 init_gold_top100=0.4184
|
LTA_openwebtext_dualt/mini_owt_logdirichlet/.venv_qwen35_uv/lib/python3.12/site-packages/hf_xet-1.5.0.dist-info/RECORD
ADDED
|
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
hf_xet-1.5.0.dist-info/INSTALLER,sha256=5hhM4Q4mYTT9z6QB6PGpUAW81PGNFrYrdXMj4oM_6ak,2
|
| 2 |
+
hf_xet-1.5.0.dist-info/METADATA,sha256=kiRjS5pSbyNKVHLD5pYclp87MWjinAwQrmuPjKJX8yA,4882
|
| 3 |
+
hf_xet-1.5.0.dist-info/RECORD,,
|
| 4 |
+
hf_xet-1.5.0.dist-info/REQUESTED,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
| 5 |
+
hf_xet-1.5.0.dist-info/WHEEL,sha256=LbLhSGTXlKRmTCKROpRhcqPcmllwkP3mQdJf3GnRbUM,143
|
| 6 |
+
hf_xet-1.5.0.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
|
| 7 |
+
hf_xet-1.5.0.dist-info/sboms/hf_xet.cyclonedx.json,sha256=cqFELPUEOVkJUBakT3vgAWYE1N939XMjcP_vAKt_0xw,305769
|
| 8 |
+
hf_xet/__init__.py,sha256=E8UDdyQ8glZ_nve9hHEf22bPang8-RKx4VuApXYeQUo,107
|
| 9 |
+
hf_xet/hf_xet.abi3.so,sha256=6s8jp7y5mQ7kVbKGwgTcVFDzZ8rBdC63i7CPjbm_wEk,11465992
|
LTA_openwebtext_dualt/mini_owt_logdirichlet/.venv_qwen35_uv/lib/python3.12/site-packages/hf_xet-1.5.0.dist-info/WHEEL
ADDED
|
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
Wheel-Version: 1.0
|
| 2 |
+
Generator: maturin (1.13.1)
|
| 3 |
+
Root-Is-Purelib: false
|
| 4 |
+
Tag: cp37-abi3-manylinux_2_17_x86_64
|
| 5 |
+
Tag: cp37-abi3-manylinux2014_x86_64
|
LTA_openwebtext_dualt/mini_owt_logdirichlet/.venv_qwen35_uv/lib/python3.12/site-packages/httpcore/_backends/__init__.py
ADDED
|
File without changes
|
LTA_openwebtext_dualt/mini_owt_logdirichlet/.venv_qwen35_uv/lib/python3.12/site-packages/httpcore/_backends/anyio.py
ADDED
|
@@ -0,0 +1,146 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
import ssl
|
| 4 |
+
import typing
|
| 5 |
+
|
| 6 |
+
import anyio
|
| 7 |
+
|
| 8 |
+
from .._exceptions import (
|
| 9 |
+
ConnectError,
|
| 10 |
+
ConnectTimeout,
|
| 11 |
+
ReadError,
|
| 12 |
+
ReadTimeout,
|
| 13 |
+
WriteError,
|
| 14 |
+
WriteTimeout,
|
| 15 |
+
map_exceptions,
|
| 16 |
+
)
|
| 17 |
+
from .._utils import is_socket_readable
|
| 18 |
+
from .base import SOCKET_OPTION, AsyncNetworkBackend, AsyncNetworkStream
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
class AnyIOStream(AsyncNetworkStream):
    """Async network stream that delegates every operation to an anyio stream."""

    def __init__(self, stream: anyio.abc.ByteStream) -> None:
        self._stream = stream

    async def read(self, max_bytes: int, timeout: float | None = None) -> bytes:
        """Receive at most ``max_bytes`` bytes; ``b""`` when the stream has ended.

        ``timeout`` is passed to ``anyio.fail_after``. Exceptions are translated
        by ``map_exceptions`` using the table built below.
        """
        read_errors = {
            TimeoutError: ReadTimeout,
            anyio.BrokenResourceError: ReadError,
            anyio.ClosedResourceError: ReadError,
            anyio.EndOfStream: ReadError,
        }
        with map_exceptions(read_errors), anyio.fail_after(timeout):
            try:
                return await self._stream.receive(max_bytes=max_bytes)
            except anyio.EndOfStream:  # pragma: nocover
                return b""

    async def write(self, buffer: bytes, timeout: float | None = None) -> None:
        """Send ``buffer`` on the stream; an empty buffer is a no-op."""
        if not buffer:
            return

        write_errors = {
            TimeoutError: WriteTimeout,
            anyio.BrokenResourceError: WriteError,
            anyio.ClosedResourceError: WriteError,
        }
        with map_exceptions(write_errors), anyio.fail_after(timeout):
            await self._stream.send(item=buffer)

    async def aclose(self) -> None:
        """Close the underlying anyio stream."""
        await self._stream.aclose()

    async def start_tls(
        self,
        ssl_context: ssl.SSLContext,
        server_hostname: str | None = None,
        timeout: float | None = None,
    ) -> AsyncNetworkStream:
        """Wrap the current stream in client-side TLS and return the new stream.

        If the handshake raises, this stream is closed before the error
        propagates.
        """
        tls_errors = {
            TimeoutError: ConnectTimeout,
            anyio.BrokenResourceError: ConnectError,
            anyio.EndOfStream: ConnectError,
            ssl.SSLError: ConnectError,
        }
        with map_exceptions(tls_errors):
            try:
                with anyio.fail_after(timeout):
                    tls_stream = await anyio.streams.tls.TLSStream.wrap(
                        self._stream,
                        ssl_context=ssl_context,
                        hostname=server_hostname,
                        standard_compatible=False,
                        server_side=False,
                    )
            except Exception as error:  # pragma: nocover
                await self.aclose()
                raise error
        return AnyIOStream(tls_stream)

    def get_extra_info(self, info: str) -> typing.Any:
        """Return the piece of connection metadata named ``info``, else ``None``.

        Recognised keys: ``"ssl_object"``, ``"client_addr"``, ``"server_addr"``,
        ``"socket"`` and ``"is_readable"``.
        """
        if info == "is_readable":
            raw_socket = self._stream.extra(anyio.abc.SocketAttribute.raw_socket, None)
            return is_socket_readable(raw_socket)

        # Resolve the attribute lazily so only the requested one is looked up.
        if info == "ssl_object":
            attribute = anyio.streams.tls.TLSAttribute.ssl_object
        elif info == "client_addr":
            attribute = anyio.abc.SocketAttribute.local_address
        elif info == "server_addr":
            attribute = anyio.abc.SocketAttribute.remote_address
        elif info == "socket":
            attribute = anyio.abc.SocketAttribute.raw_socket
        else:
            return None
        return self._stream.extra(attribute, None)
|
| 95 |
+
|
| 96 |
+
|
| 97 |
+
class AnyIOBackend(AsyncNetworkBackend):
    """Async network backend that opens connections through ``anyio``."""

    @staticmethod
    async def _apply_socket_options(
        stream: anyio.abc.ByteStream,
        socket_options: typing.Iterable[SOCKET_OPTION],
    ) -> None:
        """Apply each option with ``setsockopt``; close ``stream`` on failure.

        Without the cleanup, a failing ``setsockopt`` call would leave the
        freshly connected stream open with nobody holding a reference to it.
        """
        try:
            for option in socket_options:
                stream._raw_socket.setsockopt(*option)  # type: ignore[attr-defined] # pragma: no cover
        except BaseException:  # pragma: nocover
            await stream.aclose()
            raise

    async def connect_tcp(
        self,
        host: str,
        port: int,
        timeout: float | None = None,
        local_address: str | None = None,
        socket_options: typing.Iterable[SOCKET_OPTION] | None = None,
    ) -> AsyncNetworkStream:  # pragma: nocover
        """Open a TCP connection to ``host``:``port`` and wrap it in ``AnyIOStream``.

        ``timeout`` is passed to ``anyio.fail_after`` and covers both the
        connect and the application of ``socket_options``. Exceptions are
        translated by ``map_exceptions`` using the table built below.
        """
        if socket_options is None:
            socket_options = []
        exc_map = {
            TimeoutError: ConnectTimeout,
            OSError: ConnectError,
            anyio.BrokenResourceError: ConnectError,
        }
        with map_exceptions(exc_map):
            with anyio.fail_after(timeout):
                stream: anyio.abc.ByteStream = await anyio.connect_tcp(
                    remote_host=host,
                    remote_port=port,
                    local_host=local_address,
                )
                # By default TCP sockets opened in `asyncio` include TCP_NODELAY.
                await self._apply_socket_options(stream, socket_options)
        return AnyIOStream(stream)

    async def connect_unix_socket(
        self,
        path: str,
        timeout: float | None = None,
        socket_options: typing.Iterable[SOCKET_OPTION] | None = None,
    ) -> AsyncNetworkStream:  # pragma: nocover
        """Connect to the UNIX socket at ``path`` and wrap it in ``AnyIOStream``.

        ``timeout`` is passed to ``anyio.fail_after`` and covers both the
        connect and the application of ``socket_options``.
        """
        if socket_options is None:
            socket_options = []
        exc_map = {
            TimeoutError: ConnectTimeout,
            OSError: ConnectError,
            anyio.BrokenResourceError: ConnectError,
        }
        with map_exceptions(exc_map):
            with anyio.fail_after(timeout):
                stream: anyio.abc.ByteStream = await anyio.connect_unix(path)
                await self._apply_socket_options(stream, socket_options)
        return AnyIOStream(stream)

    async def sleep(self, seconds: float) -> None:
        """Suspend the current task for ``seconds`` using ``anyio.sleep``."""
        await anyio.sleep(seconds)  # pragma: nocover
|
LTA_openwebtext_dualt/mini_owt_logdirichlet/.venv_qwen35_uv/lib/python3.12/site-packages/httpcore/_backends/auto.py
ADDED
|
@@ -0,0 +1,52 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
import typing
|
| 4 |
+
|
| 5 |
+
from .._synchronization import current_async_library
|
| 6 |
+
from .base import SOCKET_OPTION, AsyncNetworkBackend, AsyncNetworkStream
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
class AutoBackend(AsyncNetworkBackend):
    """Backend that picks a concrete async backend on first use and delegates to it."""

    async def _init_backend(self) -> None:
        """Create ``self._backend`` once, based on ``current_async_library()``."""
        if hasattr(self, "_backend"):
            return

        # Backends are imported on demand, only for the library in use.
        if current_async_library() == "trio":
            from .trio import TrioBackend

            self._backend: AsyncNetworkBackend = TrioBackend()
        else:
            from .anyio import AnyIOBackend

            self._backend = AnyIOBackend()

    async def connect_tcp(
        self,
        host: str,
        port: int,
        timeout: float | None = None,
        local_address: str | None = None,
        socket_options: typing.Iterable[SOCKET_OPTION] | None = None,
    ) -> AsyncNetworkStream:
        """Forward ``connect_tcp`` to the selected backend."""
        await self._init_backend()
        stream = await self._backend.connect_tcp(
            host,
            port,
            timeout=timeout,
            local_address=local_address,
            socket_options=socket_options,
        )
        return stream

    async def connect_unix_socket(
        self,
        path: str,
        timeout: float | None = None,
        socket_options: typing.Iterable[SOCKET_OPTION] | None = None,
    ) -> AsyncNetworkStream:  # pragma: nocover
        """Forward ``connect_unix_socket`` to the selected backend."""
        await self._init_backend()
        stream = await self._backend.connect_unix_socket(
            path, timeout=timeout, socket_options=socket_options
        )
        return stream

    async def sleep(self, seconds: float) -> None:  # pragma: nocover
        """Forward ``sleep`` to the selected backend."""
        await self._init_backend()
        return await self._backend.sleep(seconds)
|
LTA_openwebtext_dualt/mini_owt_logdirichlet/.venv_qwen35_uv/lib/python3.12/site-packages/httpcore/_backends/base.py
ADDED
|
@@ -0,0 +1,101 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
import ssl
|
| 4 |
+
import time
|
| 5 |
+
import typing
|
| 6 |
+
|
| 7 |
+
# One entry of ``socket_options``. The backends in this package unpack each
# entry straight into ``setsockopt(*option)``.
_INT_VALUE_OPTION = typing.Tuple[int, int, int]
_BUFFER_VALUE_OPTION = typing.Tuple[int, int, typing.Union[bytes, bytearray]]
_NONE_WITH_LENGTH_OPTION = typing.Tuple[int, int, None, int]

SOCKET_OPTION = typing.Union[
    _INT_VALUE_OPTION,
    _BUFFER_VALUE_OPTION,
    _NONE_WITH_LENGTH_OPTION,
]
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
class NetworkStream:
    """Base interface for a synchronous network stream.

    Every I/O method raises ``NotImplementedError`` here; only
    ``get_extra_info`` has a default (it returns ``None``).
    """

    def read(self, max_bytes: int, timeout: float | None = None) -> bytes:
        """Read up to ``max_bytes`` bytes. Not implemented in the base class."""
        raise NotImplementedError()  # pragma: nocover

    def write(self, buffer: bytes, timeout: float | None = None) -> None:
        """Write ``buffer`` to the stream. Not implemented in the base class."""
        raise NotImplementedError()  # pragma: nocover

    def close(self) -> None:
        """Close the stream. Not implemented in the base class."""
        raise NotImplementedError()  # pragma: nocover

    def start_tls(
        self,
        ssl_context: ssl.SSLContext,
        server_hostname: str | None = None,
        timeout: float | None = None,
    ) -> NetworkStream:
        """Upgrade the stream to TLS. Not implemented in the base class."""
        raise NotImplementedError()  # pragma: nocover

    def get_extra_info(self, info: str) -> typing.Any:
        """Return extra information named ``info``; the base class has none."""
        return None  # pragma: nocover
|
| 34 |
+
|
| 35 |
+
|
| 36 |
+
class NetworkBackend:
    """Base interface for a synchronous network backend.

    The connect methods raise ``NotImplementedError`` here; ``sleep`` has a
    working default built on ``time.sleep``.
    """

    def connect_tcp(
        self,
        host: str,
        port: int,
        timeout: float | None = None,
        local_address: str | None = None,
        socket_options: typing.Iterable[SOCKET_OPTION] | None = None,
    ) -> NetworkStream:
        """Open a TCP connection. Not implemented in the base class."""
        raise NotImplementedError()  # pragma: nocover

    def connect_unix_socket(
        self,
        path: str,
        timeout: float | None = None,
        socket_options: typing.Iterable[SOCKET_OPTION] | None = None,
    ) -> NetworkStream:
        """Open a UNIX socket connection. Not implemented in the base class."""
        raise NotImplementedError()  # pragma: nocover

    def sleep(self, seconds: float) -> None:
        """Block the calling thread for ``seconds`` seconds."""
        time.sleep(seconds)  # pragma: nocover
|
| 57 |
+
|
| 58 |
+
|
| 59 |
+
class AsyncNetworkStream:
    """Base interface for an asynchronous network stream.

    Every coroutine raises ``NotImplementedError`` here; only the synchronous
    ``get_extra_info`` has a default (it returns ``None``).
    """

    async def read(self, max_bytes: int, timeout: float | None = None) -> bytes:
        """Read up to ``max_bytes`` bytes. Not implemented in the base class."""
        raise NotImplementedError()  # pragma: nocover

    async def write(self, buffer: bytes, timeout: float | None = None) -> None:
        """Write ``buffer`` to the stream. Not implemented in the base class."""
        raise NotImplementedError()  # pragma: nocover

    async def aclose(self) -> None:
        """Close the stream. Not implemented in the base class."""
        raise NotImplementedError()  # pragma: nocover

    async def start_tls(
        self,
        ssl_context: ssl.SSLContext,
        server_hostname: str | None = None,
        timeout: float | None = None,
    ) -> AsyncNetworkStream:
        """Upgrade the stream to TLS. Not implemented in the base class."""
        raise NotImplementedError()  # pragma: nocover

    def get_extra_info(self, info: str) -> typing.Any:
        """Return extra information named ``info``; the base class has none."""
        return None  # pragma: nocover
|
| 79 |
+
|
| 80 |
+
|
| 81 |
+
class AsyncNetworkBackend:
    """Base interface for an asynchronous network backend.

    Every coroutine here, including ``sleep``, raises ``NotImplementedError``.
    """

    async def connect_tcp(
        self,
        host: str,
        port: int,
        timeout: float | None = None,
        local_address: str | None = None,
        socket_options: typing.Iterable[SOCKET_OPTION] | None = None,
    ) -> AsyncNetworkStream:
        """Open a TCP connection. Not implemented in the base class."""
        raise NotImplementedError()  # pragma: nocover

    async def connect_unix_socket(
        self,
        path: str,
        timeout: float | None = None,
        socket_options: typing.Iterable[SOCKET_OPTION] | None = None,
    ) -> AsyncNetworkStream:
        """Open a UNIX socket connection. Not implemented in the base class."""
        raise NotImplementedError()  # pragma: nocover

    async def sleep(self, seconds: float) -> None:
        """Suspend for ``seconds`` seconds. Not implemented in the base class."""
        raise NotImplementedError()  # pragma: nocover
|
LTA_openwebtext_dualt/mini_owt_logdirichlet/.venv_qwen35_uv/lib/python3.12/site-packages/httpcore/_backends/mock.py
ADDED
|
@@ -0,0 +1,143 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
import ssl
|
| 4 |
+
import typing
|
| 5 |
+
|
| 6 |
+
from .._exceptions import ReadError
|
| 7 |
+
from .base import (
|
| 8 |
+
SOCKET_OPTION,
|
| 9 |
+
AsyncNetworkBackend,
|
| 10 |
+
AsyncNetworkStream,
|
| 11 |
+
NetworkBackend,
|
| 12 |
+
NetworkStream,
|
| 13 |
+
)
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
class MockSSLObject:
    """Stand-in SSL object that only reports a negotiated ALPN protocol."""

    def __init__(self, http2: bool):
        self._http2 = http2

    def selected_alpn_protocol(self) -> str:
        """Return ``"h2"`` when created with ``http2=True``, else ``"http/1.1"``."""
        if self._http2:
            return "h2"
        return "http/1.1"
|
| 22 |
+
|
| 23 |
+
|
| 24 |
+
class MockStream(NetworkStream):
    """Synchronous in-memory stream that replays a list of byte chunks."""

    def __init__(self, buffer: list[bytes], http2: bool = False) -> None:
        self._buffer = buffer
        self._http2 = http2
        self._closed = False

    def read(self, max_bytes: int, timeout: float | None = None) -> bytes:
        """Return the next queued chunk, or ``b""`` once the queue is empty.

        ``max_bytes`` and ``timeout`` are ignored. Raises ``ReadError`` after
        ``close()`` has been called.
        """
        if self._closed:
            raise ReadError("Connection closed")
        return self._buffer.pop(0) if self._buffer else b""

    def write(self, buffer: bytes, timeout: float | None = None) -> None:
        """Accept and discard ``buffer``."""
        pass

    def close(self) -> None:
        """Mark the stream closed so later reads raise ``ReadError``."""
        self._closed = True

    def start_tls(
        self,
        ssl_context: ssl.SSLContext,
        server_hostname: str | None = None,
        timeout: float | None = None,
    ) -> NetworkStream:
        """Return this same stream; no TLS handshake is performed."""
        return self

    def get_extra_info(self, info: str) -> typing.Any:
        """Return a ``MockSSLObject`` for ``"ssl_object"``; ``None`` otherwise."""
        if info == "ssl_object":
            return MockSSLObject(http2=self._http2)
        return None

    def __repr__(self) -> str:
        return "<httpcore.MockStream>"
|
| 56 |
+
|
| 57 |
+
|
| 58 |
+
class MockBackend(NetworkBackend):
    """Synchronous backend whose connections are ``MockStream`` objects."""

    def __init__(self, buffer: list[bytes], http2: bool = False) -> None:
        self._buffer = buffer
        self._http2 = http2

    def connect_tcp(
        self,
        host: str,
        port: int,
        timeout: float | None = None,
        local_address: str | None = None,
        socket_options: typing.Iterable[SOCKET_OPTION] | None = None,
    ) -> NetworkStream:
        """Return a ``MockStream`` over a copy of the configured chunks.

        All connection arguments are ignored.
        """
        chunks = list(self._buffer)  # copy, so each stream consumes its own list
        return MockStream(chunks, http2=self._http2)

    def connect_unix_socket(
        self,
        path: str,
        timeout: float | None = None,
        socket_options: typing.Iterable[SOCKET_OPTION] | None = None,
    ) -> NetworkStream:
        """Return a ``MockStream`` over a copy of the configured chunks.

        All connection arguments are ignored.
        """
        chunks = list(self._buffer)  # copy, so each stream consumes its own list
        return MockStream(chunks, http2=self._http2)

    def sleep(self, seconds: float) -> None:
        """Return immediately without sleeping."""
        pass
|
| 83 |
+
|
| 84 |
+
|
| 85 |
+
class AsyncMockStream(AsyncNetworkStream):
    """Asynchronous in-memory stream that replays a list of byte chunks."""

    def __init__(self, buffer: list[bytes], http2: bool = False) -> None:
        self._buffer = buffer
        self._http2 = http2
        self._closed = False

    async def read(self, max_bytes: int, timeout: float | None = None) -> bytes:
        """Return the next queued chunk, or ``b""`` once the queue is empty.

        ``max_bytes`` and ``timeout`` are ignored. Raises ``ReadError`` after
        ``aclose()`` has been called.
        """
        if self._closed:
            raise ReadError("Connection closed")
        return self._buffer.pop(0) if self._buffer else b""

    async def write(self, buffer: bytes, timeout: float | None = None) -> None:
        """Accept and discard ``buffer``."""
        pass

    async def aclose(self) -> None:
        """Mark the stream closed so later reads raise ``ReadError``."""
        self._closed = True

    async def start_tls(
        self,
        ssl_context: ssl.SSLContext,
        server_hostname: str | None = None,
        timeout: float | None = None,
    ) -> AsyncNetworkStream:
        """Return this same stream; no TLS handshake is performed."""
        return self

    def get_extra_info(self, info: str) -> typing.Any:
        """Return a ``MockSSLObject`` for ``"ssl_object"``; ``None`` otherwise."""
        if info == "ssl_object":
            return MockSSLObject(http2=self._http2)
        return None

    def __repr__(self) -> str:
        return "<httpcore.AsyncMockStream>"
|
| 117 |
+
|
| 118 |
+
|
| 119 |
+
class AsyncMockBackend(AsyncNetworkBackend):
    """Asynchronous backend whose connections are ``AsyncMockStream`` objects."""

    def __init__(self, buffer: list[bytes], http2: bool = False) -> None:
        self._buffer = buffer
        self._http2 = http2

    async def connect_tcp(
        self,
        host: str,
        port: int,
        timeout: float | None = None,
        local_address: str | None = None,
        socket_options: typing.Iterable[SOCKET_OPTION] | None = None,
    ) -> AsyncNetworkStream:
        """Return an ``AsyncMockStream`` over a copy of the configured chunks.

        All connection arguments are ignored.
        """
        chunks = list(self._buffer)  # copy, so each stream consumes its own list
        return AsyncMockStream(chunks, http2=self._http2)

    async def connect_unix_socket(
        self,
        path: str,
        timeout: float | None = None,
        socket_options: typing.Iterable[SOCKET_OPTION] | None = None,
    ) -> AsyncNetworkStream:
        """Return an ``AsyncMockStream`` over a copy of the configured chunks.

        All connection arguments are ignored.
        """
        chunks = list(self._buffer)  # copy, so each stream consumes its own list
        return AsyncMockStream(chunks, http2=self._http2)

    async def sleep(self, seconds: float) -> None:
        """Return immediately without sleeping."""
        pass
|
LTA_openwebtext_dualt/mini_owt_logdirichlet/.venv_qwen35_uv/lib/python3.12/site-packages/httpcore/_backends/sync.py
ADDED
|
@@ -0,0 +1,241 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
import functools
|
| 4 |
+
import socket
|
| 5 |
+
import ssl
|
| 6 |
+
import sys
|
| 7 |
+
import typing
|
| 8 |
+
|
| 9 |
+
from .._exceptions import (
|
| 10 |
+
ConnectError,
|
| 11 |
+
ConnectTimeout,
|
| 12 |
+
ExceptionMapping,
|
| 13 |
+
ReadError,
|
| 14 |
+
ReadTimeout,
|
| 15 |
+
WriteError,
|
| 16 |
+
WriteTimeout,
|
| 17 |
+
map_exceptions,
|
| 18 |
+
)
|
| 19 |
+
from .._utils import is_socket_readable
|
| 20 |
+
from .base import SOCKET_OPTION, NetworkBackend, NetworkStream
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
class TLSinTLSStream(NetworkStream):  # pragma: no cover
    """
    TLS stream layered on a socket through an in-memory `SSLObject`.

    The standard `SSLContext.wrap_socket` method does not work for
    `SSLSocket` objects, so TLS on top of TLS is implemented here with
    `SSLContext.wrap_bio` and a pair of `MemoryBIO` buffers that are
    pumped to and from the underlying socket by hand.
    """

    # Defined in RFC 8449
    TLS_RECORD_SIZE = 16384

    def __init__(
        self,
        sock: socket.socket,
        ssl_context: ssl.SSLContext,
        server_hostname: str | None = None,
        timeout: float | None = None,
    ):
        self._sock = sock
        self._incoming = ssl.MemoryBIO()
        self._outgoing = ssl.MemoryBIO()

        self.ssl_obj = ssl_context.wrap_bio(
            incoming=self._incoming,
            outgoing=self._outgoing,
            server_hostname=server_hostname,
        )

        # The handshake runs immediately, under the caller's timeout.
        self._sock.settimeout(timeout)
        self._perform_io(self.ssl_obj.do_handshake)

    def _perform_io(
        self,
        func: typing.Callable[..., typing.Any],
    ) -> typing.Any:
        """Call ``func`` until it stops asking for more I/O; return its result.

        After every attempt the outgoing BIO is flushed to the socket. When
        the attempt wanted to read, one chunk is received from the socket and
        fed to the incoming BIO (or EOF is signalled on an empty read).
        """
        result = None

        while True:
            want_code = None
            try:
                result = func()
            except (ssl.SSLWantReadError, ssl.SSLWantWriteError) as want:
                want_code = want.errno

            # Flush whatever the SSL object produced, success or not.
            self._sock.sendall(self._outgoing.read())

            if want_code == ssl.SSL_ERROR_WANT_READ:
                chunk = self._sock.recv(self.TLS_RECORD_SIZE)

                if chunk:
                    self._incoming.write(chunk)
                else:
                    self._incoming.write_eof()
            if want_code is None:
                return result

    def read(self, max_bytes: int, timeout: float | None = None) -> bytes:
        """Read up to ``max_bytes`` bytes of decrypted data."""
        read_errors: ExceptionMapping = {socket.timeout: ReadTimeout, OSError: ReadError}
        with map_exceptions(read_errors):
            self._sock.settimeout(timeout)
            reader = functools.partial(self.ssl_obj.read, max_bytes)
            return typing.cast(bytes, self._perform_io(reader))

    def write(self, buffer: bytes, timeout: float | None = None) -> None:
        """Write all of ``buffer``, looping until every byte has been accepted."""
        write_errors: ExceptionMapping = {socket.timeout: WriteTimeout, OSError: WriteError}
        with map_exceptions(write_errors):
            self._sock.settimeout(timeout)
            while buffer:
                written = self._perform_io(functools.partial(self.ssl_obj.write, buffer))
                buffer = buffer[written:]

    def close(self) -> None:
        """Close the underlying socket."""
        self._sock.close()

    def start_tls(
        self,
        ssl_context: ssl.SSLContext,
        server_hostname: str | None = None,
        timeout: float | None = None,
    ) -> NetworkStream:
        """Not supported on this stream; always raises ``NotImplementedError``."""
        raise NotImplementedError()

    def get_extra_info(self, info: str) -> typing.Any:
        """Return the piece of connection metadata named ``info``, else ``None``.

        Recognised keys: ``"ssl_object"``, ``"client_addr"``, ``"server_addr"``,
        ``"socket"`` and ``"is_readable"``.
        """
        if info == "ssl_object":
            return self.ssl_obj
        elif info == "client_addr":
            return self._sock.getsockname()
        elif info == "server_addr":
            return self._sock.getpeername()
        elif info == "socket":
            return self._sock
        elif info == "is_readable":
            return is_socket_readable(self._sock)
        else:
            return None
|
| 118 |
+
|
| 119 |
+
|
| 120 |
+
class SyncStream(NetworkStream):
    """Synchronous network stream backed directly by a socket object."""

    def __init__(self, sock: socket.socket) -> None:
        self._sock = sock

    def read(self, max_bytes: int, timeout: float | None = None) -> bytes:
        """Receive up to ``max_bytes`` bytes from the socket under ``timeout``."""
        read_errors: ExceptionMapping = {socket.timeout: ReadTimeout, OSError: ReadError}
        with map_exceptions(read_errors):
            self._sock.settimeout(timeout)
            return self._sock.recv(max_bytes)

    def write(self, buffer: bytes, timeout: float | None = None) -> None:
        """Send all of ``buffer``; an empty buffer is a no-op.

        The timeout is set again before every ``send`` call.
        """
        if not buffer:
            return

        write_errors: ExceptionMapping = {socket.timeout: WriteTimeout, OSError: WriteError}
        with map_exceptions(write_errors):
            while buffer:
                self._sock.settimeout(timeout)
                sent = self._sock.send(buffer)
                buffer = buffer[sent:]

    def close(self) -> None:
        """Close the underlying socket."""
        self._sock.close()

    def start_tls(
        self,
        ssl_context: ssl.SSLContext,
        server_hostname: str | None = None,
        timeout: float | None = None,
    ) -> NetworkStream:
        """Upgrade the connection to TLS and return the resulting stream.

        If anything raises during the upgrade, this stream is closed before
        the error propagates.
        """
        tls_errors: ExceptionMapping = {
            socket.timeout: ConnectTimeout,
            OSError: ConnectError,
        }
        with map_exceptions(tls_errors):
            try:
                if isinstance(self._sock, ssl.SSLSocket):  # pragma: no cover
                    # If the underlying socket has already been upgraded
                    # to the TLS layer (i.e. is an instance of SSLSocket),
                    # we need some additional smarts to support TLS-in-TLS.
                    return TLSinTLSStream(
                        self._sock, ssl_context, server_hostname, timeout
                    )

                self._sock.settimeout(timeout)
                tls_sock = ssl_context.wrap_socket(
                    self._sock, server_hostname=server_hostname
                )
            except Exception as error:  # pragma: nocover
                self.close()
                raise error
        return SyncStream(tls_sock)

    def get_extra_info(self, info: str) -> typing.Any:
        """Return the piece of connection metadata named ``info``, else ``None``.

        ``"ssl_object"`` is only answered when the socket is an
        ``ssl.SSLSocket``; the other keys are ``"client_addr"``,
        ``"server_addr"``, ``"socket"`` and ``"is_readable"``.
        """
        if info == "ssl_object" and isinstance(self._sock, ssl.SSLSocket):
            return self._sock._sslobj  # type: ignore
        elif info == "client_addr":
            return self._sock.getsockname()
        elif info == "server_addr":
            return self._sock.getpeername()
        elif info == "socket":
            return self._sock
        elif info == "is_readable":
            return is_socket_readable(self._sock)
        else:
            return None
|
| 185 |
+
|
| 186 |
+
|
| 187 |
+
class SyncBackend(NetworkBackend):
    """Network backend that opens blocking sockets and wraps them in ``SyncStream``."""

    def connect_tcp(
        self,
        host: str,
        port: int,
        timeout: float | None = None,
        local_address: str | None = None,
        socket_options: typing.Iterable[SOCKET_OPTION] | None = None,
    ) -> NetworkStream:
        """Open a TCP connection to ``(host, port)``.

        ``socket.timeout`` is re-raised as ``ConnectTimeout`` and any other
        ``OSError`` as ``ConnectError``. If configuring the freshly created
        socket fails, the socket is closed before the error propagates.
        """
        # Note that we automatically include `TCP_NODELAY`
        # in addition to any other custom socket options.
        if socket_options is None:
            socket_options = []  # pragma: no cover
        address = (host, port)
        source_address = None if local_address is None else (local_address, 0)
        exc_map: ExceptionMapping = {
            socket.timeout: ConnectTimeout,
            OSError: ConnectError,
        }

        with map_exceptions(exc_map):
            sock = socket.create_connection(
                address,
                timeout,
                source_address=source_address,
            )
            try:
                for option in socket_options:
                    sock.setsockopt(*option)  # pragma: no cover
                sock.setsockopt(socket.IPPROTO_TCP, socket.TCP_NODELAY, 1)
            except BaseException:  # pragma: no cover
                # Do not leak the connected socket when an option is rejected.
                sock.close()
                raise
        return SyncStream(sock)

    def connect_unix_socket(
        self,
        path: str,
        timeout: float | None = None,
        socket_options: typing.Iterable[SOCKET_OPTION] | None = None,
    ) -> NetworkStream:  # pragma: nocover
        """Open a stream connection to the UNIX domain socket at ``path``.

        Raises ``RuntimeError`` on Windows. ``socket.timeout`` is re-raised as
        ``ConnectTimeout`` and any other ``OSError`` as ``ConnectError``. The
        socket is closed if configuring or connecting it fails.
        """
        if sys.platform == "win32":
            raise RuntimeError(
                "Attempted to connect to a UNIX socket on a Windows system."
            )
        if socket_options is None:
            socket_options = []

        exc_map: ExceptionMapping = {
            socket.timeout: ConnectTimeout,
            OSError: ConnectError,
        }
        with map_exceptions(exc_map):
            sock = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM)
            try:
                for option in socket_options:
                    sock.setsockopt(*option)
                sock.settimeout(timeout)
                sock.connect(path)
            except BaseException:
                # A failed connect (e.g. missing path) must not leave the
                # file descriptor open.
                sock.close()
                raise
        return SyncStream(sock)
|
LTA_openwebtext_dualt/mini_owt_logdirichlet/.venv_qwen35_uv/lib/python3.12/site-packages/httpcore/_backends/trio.py
ADDED
|
@@ -0,0 +1,159 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
import ssl
|
| 4 |
+
import typing
|
| 5 |
+
|
| 6 |
+
import trio
|
| 7 |
+
|
| 8 |
+
from .._exceptions import (
|
| 9 |
+
ConnectError,
|
| 10 |
+
ConnectTimeout,
|
| 11 |
+
ExceptionMapping,
|
| 12 |
+
ReadError,
|
| 13 |
+
ReadTimeout,
|
| 14 |
+
WriteError,
|
| 15 |
+
WriteTimeout,
|
| 16 |
+
map_exceptions,
|
| 17 |
+
)
|
| 18 |
+
from .base import SOCKET_OPTION, AsyncNetworkBackend, AsyncNetworkStream
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
class TrioStream(AsyncNetworkStream):
    """Async network stream that delegates to a ``trio`` stream object."""

    def __init__(self, stream: trio.abc.Stream) -> None:
        self._stream = stream

    async def read(self, max_bytes: int, timeout: float | None = None) -> bytes:
        """Receive up to ``max_bytes`` bytes.

        ``trio.TooSlowError`` is re-raised as ``ReadTimeout``; broken or
        closed resource errors are re-raised as ``ReadError``.
        """
        # `None` means "no deadline", which trio expresses as infinity.
        deadline = timeout if timeout is not None else float("inf")
        read_errors: ExceptionMapping = {
            trio.TooSlowError: ReadTimeout,
            trio.BrokenResourceError: ReadError,
            trio.ClosedResourceError: ReadError,
        }
        with map_exceptions(read_errors), trio.fail_after(deadline):
            data: bytes = await self._stream.receive_some(max_bytes=max_bytes)
            return data

    async def write(self, buffer: bytes, timeout: float | None = None) -> None:
        """Send all of ``buffer``; an empty buffer is a no-op.

        ``trio.TooSlowError`` is re-raised as ``WriteTimeout``; broken or
        closed resource errors are re-raised as ``WriteError``.
        """
        if not buffer:
            return

        deadline = timeout if timeout is not None else float("inf")
        write_errors: ExceptionMapping = {
            trio.TooSlowError: WriteTimeout,
            trio.BrokenResourceError: WriteError,
            trio.ClosedResourceError: WriteError,
        }
        with map_exceptions(write_errors), trio.fail_after(deadline):
            await self._stream.send_all(data=buffer)

    async def aclose(self) -> None:
        """Close the wrapped stream."""
        await self._stream.aclose()

    async def start_tls(
        self,
        ssl_context: ssl.SSLContext,
        server_hostname: str | None = None,
        timeout: float | None = None,
    ) -> AsyncNetworkStream:
        """Perform a client-side TLS handshake over this stream.

        Returns a new ``TrioStream`` wrapping the ``trio.SSLStream``. On
        failure this stream is closed before the error propagates.
        ``trio.TooSlowError`` is re-raised as ``ConnectTimeout`` and
        ``trio.BrokenResourceError`` as ``ConnectError``.
        """
        deadline = timeout if timeout is not None else float("inf")
        tls_errors: ExceptionMapping = {
            trio.TooSlowError: ConnectTimeout,
            trio.BrokenResourceError: ConnectError,
        }
        ssl_stream = trio.SSLStream(
            self._stream,
            ssl_context=ssl_context,
            server_hostname=server_hostname,
            https_compatible=True,
            server_side=False,
        )
        with map_exceptions(tls_errors):
            try:
                with trio.fail_after(deadline):
                    await ssl_stream.do_handshake()
            except Exception as exc:  # pragma: nocover
                await self.aclose()
                raise exc
        return TrioStream(ssl_stream)

    def get_extra_info(self, info: str) -> typing.Any:
        """Look up a named piece of connection metadata.

        ``"ssl_object"`` is only answered when the wrapped stream is a
        ``trio.SSLStream``; unknown keys yield ``None``.
        """
        if info == "ssl_object" and isinstance(self._stream, trio.SSLStream):
            # Type checkers cannot see `_ssl_object` attribute because trio._ssl.SSLStream uses __getattr__/__setattr__.
            # Tracked at https://github.com/python-trio/trio/issues/542
            return self._stream._ssl_object  # type: ignore[attr-defined]
        if info == "client_addr":
            return self._get_socket_stream().socket.getsockname()
        if info == "server_addr":
            return self._get_socket_stream().socket.getpeername()
        if info == "socket":
            return self._get_socket_stream().socket
        if info == "is_readable":
            return self.get_extra_info("socket").is_readable()
        return None

    def _get_socket_stream(self) -> trio.SocketStream:
        """Unwrap nested ``trio.SSLStream`` layers down to the socket stream."""
        current = self._stream
        while isinstance(current, trio.SSLStream):
            current = current.transport_stream
        assert isinstance(current, trio.SocketStream)
        return current
|
| 107 |
+
|
| 108 |
+
|
| 109 |
+
class TrioBackend(AsyncNetworkBackend):
    """Network backend that opens ``trio`` streams and wraps them in ``TrioStream``."""

    async def connect_tcp(
        self,
        host: str,
        port: int,
        timeout: float | None = None,
        local_address: str | None = None,
        socket_options: typing.Iterable[SOCKET_OPTION] | None = None,
    ) -> AsyncNetworkStream:
        """Open a TCP stream to ``host:port``.

        ``trio.TooSlowError`` is re-raised as ``ConnectTimeout``;
        ``trio.BrokenResourceError`` and ``OSError`` as ``ConnectError``.
        If applying a socket option fails, the opened stream is closed
        before the error propagates.
        """
        # By default for TCP sockets, trio enables TCP_NODELAY.
        # https://trio.readthedocs.io/en/stable/reference-io.html#trio.SocketStream
        if socket_options is None:
            socket_options = []  # pragma: no cover
        timeout_or_inf = float("inf") if timeout is None else timeout
        exc_map: ExceptionMapping = {
            trio.TooSlowError: ConnectTimeout,
            trio.BrokenResourceError: ConnectError,
            OSError: ConnectError,
        }
        with map_exceptions(exc_map):
            with trio.fail_after(timeout_or_inf):
                stream: trio.abc.Stream = await trio.open_tcp_stream(
                    host=host, port=port, local_address=local_address
                )
                try:
                    for option in socket_options:
                        stream.setsockopt(*option)  # type: ignore[attr-defined] # pragma: no cover
                except BaseException:  # pragma: no cover
                    # Do not leak the connected stream when an option is rejected.
                    await trio.aclose_forcefully(stream)
                    raise
        return TrioStream(stream)

    async def connect_unix_socket(
        self,
        path: str,
        timeout: float | None = None,
        socket_options: typing.Iterable[SOCKET_OPTION] | None = None,
    ) -> AsyncNetworkStream:  # pragma: nocover
        """Open a stream to the UNIX domain socket at ``path``.

        Exceptions are mapped as in ``connect_tcp``, and the opened stream is
        closed if applying a socket option fails.
        """
        if socket_options is None:
            socket_options = []
        timeout_or_inf = float("inf") if timeout is None else timeout
        exc_map: ExceptionMapping = {
            trio.TooSlowError: ConnectTimeout,
            trio.BrokenResourceError: ConnectError,
            OSError: ConnectError,
        }
        with map_exceptions(exc_map):
            with trio.fail_after(timeout_or_inf):
                stream: trio.abc.Stream = await trio.open_unix_socket(path)
                try:
                    for option in socket_options:
                        stream.setsockopt(*option)  # type: ignore[attr-defined] # pragma: no cover
                except BaseException:
                    await trio.aclose_forcefully(stream)
                    raise
        return TrioStream(stream)

    async def sleep(self, seconds: float) -> None:
        """Suspend the current task for ``seconds`` seconds."""
        await trio.sleep(seconds)  # pragma: nocover
|
LTA_openwebtext_dualt/mini_owt_logdirichlet/.venv_qwen35_uv/lib/python3.12/site-packages/mdurl/__init__.py
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
__all__ = (
|
| 2 |
+
"decode",
|
| 3 |
+
"DECODE_DEFAULT_CHARS",
|
| 4 |
+
"DECODE_COMPONENT_CHARS",
|
| 5 |
+
"encode",
|
| 6 |
+
"ENCODE_DEFAULT_CHARS",
|
| 7 |
+
"ENCODE_COMPONENT_CHARS",
|
| 8 |
+
"format",
|
| 9 |
+
"parse",
|
| 10 |
+
"URL",
|
| 11 |
+
)
|
| 12 |
+
__version__ = "0.1.2" # DO NOT EDIT THIS LINE MANUALLY. LET bump2version UTILITY DO IT
|
| 13 |
+
|
| 14 |
+
from mdurl._decode import DECODE_COMPONENT_CHARS, DECODE_DEFAULT_CHARS, decode
|
| 15 |
+
from mdurl._encode import ENCODE_COMPONENT_CHARS, ENCODE_DEFAULT_CHARS, encode
|
| 16 |
+
from mdurl._format import format
|
| 17 |
+
from mdurl._parse import url_parse as parse
|
| 18 |
+
from mdurl._url import URL
|
LTA_openwebtext_dualt/mini_owt_logdirichlet/.venv_qwen35_uv/lib/python3.12/site-packages/mdurl/_decode.py
ADDED
|
@@ -0,0 +1,104 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
from collections.abc import Sequence
|
| 4 |
+
import functools
|
| 5 |
+
import re
|
| 6 |
+
|
| 7 |
+
DECODE_DEFAULT_CHARS = ";/?:@&=+$,#"
DECODE_COMPONENT_CHARS = ""

# Memoised lookup tables, keyed by the `exclude` string they were built for.
decode_cache: dict[str, list[str]] = {}


def get_decode_cache(exclude: str) -> Sequence[str]:
    """Return the 128-entry ASCII decode table for ``exclude``.

    Entry ``n`` is the text that replaces the escape for byte ``n``: the
    character itself, or its upper-case ``%XX`` escape when the character
    appears in ``exclude`` (so it stays encoded). Tables are memoised per
    ``exclude`` string.

    Non-ASCII characters in ``exclude`` are ignored: the table only covers
    code points below 128, and indexing past it used to raise ``IndexError``
    while leaving a half-built table in the cache.
    """
    try:
        return decode_cache[exclude]
    except KeyError:
        pass

    table = [chr(code) for code in range(128)]
    for ch in exclude:
        code = ord(ch)
        if code < 128:
            table[code] = f"%{code:02X}"

    # Register only a fully built table so a failure can never poison the cache.
    decode_cache[exclude] = table
    return table
|
| 29 |
+
|
| 30 |
+
|
| 31 |
+
def decode(string: str, exclude: str = DECODE_DEFAULT_CHARS) -> str:
    """Decode the percent-encoded sequences found in ``string``.

    Every run of consecutive ``%XX`` escapes (hex digits matched
    case-insensitively) is handed to ``repl_func_with_cache`` together with
    the lookup table built for ``exclude``.
    """
    table = get_decode_cache(exclude)

    def substitute(found: re.Match) -> str:
        return repl_func_with_cache(found, cache=table)

    return re.sub(r"(%[a-f0-9]{2})+", substitute, string, flags=re.IGNORECASE)
|
| 37 |
+
|
| 38 |
+
|
| 39 |
+
# (lead-byte mask, lead-byte pattern, sequence length) for every multi-byte
# UTF-8 form. The lead patterns are mutually exclusive.
_UTF8_MULTIBYTE_FORMS = (
    (0xE0, 0xC0, 2),  # 110xxxxx 10xxxxxx
    (0xF0, 0xE0, 3),  # 1110xxxx 10xxxxxx 10xxxxxx
    (0xF8, 0xF0, 4),  # 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
)


def repl_func_with_cache(match: re.Match, cache: Sequence[str]) -> str:
    """Decode one run of ``%XX`` escapes matched by ``decode``.

    Args:
        match: a match whose text is a sequence of three-character ``%XX``
            escapes.
        cache: table indexed by byte value (0-127) giving the replacement
            text for single-byte escapes.

    Returns:
        The decoded text. Bytes below 0x80 are replaced through ``cache``.
        A well-formed multi-byte UTF-8 sequence becomes its character, or one
        U+FFFD per byte when the bytes cannot be decoded. Any other byte
        becomes a single U+FFFD.
    """
    seq = match.group()
    length = len(seq)
    pieces: list[str] = []

    i = 0
    while i < length:
        b1 = int(seq[i + 1 : i + 3], 16)

        if b1 < 0x80:
            pieces.append(cache[b1])
            i += 3
            continue

        consumed = 0
        for mask, pattern, size in _UTF8_MULTIBYTE_FORMS:
            if (b1 & mask) != pattern:
                continue
            # Only try to decode when all continuation escapes are present.
            if i + 3 * (size - 1) < length:
                tail = [
                    int(seq[i + 3 * k + 1 : i + 3 * k + 3], 16)
                    for k in range(1, size)
                ]
                if all((b & 0xC0) == 0x80 for b in tail):
                    try:
                        pieces.append(bytes((b1, *tail)).decode())
                    except UnicodeDecodeError:
                        # e.g. overlong forms or encoded surrogates
                        pieces.append("\ufffd" * size)
                    consumed = 3 * size
            break

        if consumed:
            i += consumed
        else:
            # Stray continuation byte, truncated sequence or invalid lead.
            pieces.append("\ufffd")
            i += 3

    return "".join(pieces)
|
LTA_openwebtext_dualt/mini_owt_logdirichlet/.venv_qwen35_uv/lib/python3.12/site-packages/mdurl/_encode.py
ADDED
|
@@ -0,0 +1,85 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
from collections.abc import Sequence
|
| 4 |
+
from string import ascii_letters, digits, hexdigits
|
| 5 |
+
from urllib.parse import quote as encode_uri_component
|
| 6 |
+
|
| 7 |
+
ASCII_LETTERS_AND_DIGITS = ascii_letters + digits

ENCODE_DEFAULT_CHARS = ";/?:@&=+$,-_.!~*'()#"
ENCODE_COMPONENT_CHARS = "-_.!~*'()"

# Memoised lookup tables, keyed by the `exclude` string they were built for.
encode_cache: dict[str, list[str]] = {}


def get_encode_cache(exclude: str) -> Sequence[str]:
    """Return the 128-entry ASCII encode table for ``exclude``.

    Entry ``n`` is the text emitted for code point ``n``: the character
    itself when it is alphanumeric or listed in ``exclude``, otherwise its
    upper-case ``%XX`` escape. Tables are memoised per ``exclude`` string.

    Non-ASCII characters in ``exclude`` are ignored: the table only covers
    code points below 128, and indexing past it used to raise ``IndexError``
    while leaving a half-built table in the cache.
    """
    try:
        return encode_cache[exclude]
    except KeyError:
        pass

    table = []
    for code in range(128):
        ch = chr(code)
        # always allow unencoded alphanumeric characters
        table.append(ch if ch in ASCII_LETTERS_AND_DIGITS else f"%{code:02X}")

    for ch in exclude:
        code = ord(ch)
        if code < 128:
            table[code] = ch

    # Register only a fully built table so a failure can never poison the cache.
    encode_cache[exclude] = table
    return table
|
| 37 |
+
|
| 38 |
+
|
| 39 |
+
def encode(
    string: str, exclude: str = ENCODE_DEFAULT_CHARS, *, keep_escaped: bool = True
) -> str:
    """Percent-encode unsafe characters, skipping already encoded sequences.

    Args:
        string: text to encode.
        exclude: characters to leave untouched (in addition to a-zA-Z0-9).
        keep_escaped: when true, a ``%`` that starts a valid ``%XX`` escape
            is copied through instead of being encoded again.

    Returns:
        The encoded text. A high/low surrogate pair is combined into the
        code point it represents and encoded as UTF-8; a lone surrogate is
        emitted as ``%EF%BF%BD`` (U+FFFD).
    """
    pieces: list[str] = []
    table = get_encode_cache(exclude)

    length = len(string)
    i = 0
    while i < length:
        code = ord(string[i])

        # %
        if keep_escaped and code == 0x25 and i + 2 < length:
            if all(c in hexdigits for c in string[i + 1 : i + 3]):
                pieces.append(string[i : i + 3])
                i += 3
                continue

        if code < 128:
            pieces.append(table[code])
            i += 1
            continue

        if 0xD800 <= code <= 0xDFFF:
            if code <= 0xDBFF and i + 1 < length:
                next_code = ord(string[i + 1])
                if 0xDC00 <= next_code <= 0xDFFF:
                    # Two surrogate code points cannot be UTF-8 encoded as
                    # they stand (quote() raises UnicodeEncodeError), so
                    # merge them into the character they stand for first.
                    combined = chr(
                        0x10000 + ((code - 0xD800) << 10) + (next_code - 0xDC00)
                    )
                    pieces.append(encode_uri_component(combined))
                    i += 2
                    continue
            pieces.append("%EF%BF%BD")
            i += 1
            continue

        pieces.append(encode_uri_component(string[i]))
        i += 1

    return "".join(pieces)
|
LTA_openwebtext_dualt/mini_owt_logdirichlet/.venv_qwen35_uv/lib/python3.12/site-packages/mdurl/_format.py
ADDED
|
@@ -0,0 +1,27 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
from typing import TYPE_CHECKING
|
| 4 |
+
|
| 5 |
+
if TYPE_CHECKING:
|
| 6 |
+
from mdurl._url import URL
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
def format(url: URL) -> str:  # noqa: A001
    """Assemble the components of ``url`` back into a single string.

    Components are emitted in the order protocol, ``//`` (when ``slashes``
    is set), ``auth@``, hostname, ``:port``, pathname, search, hash. Falsy
    components contribute nothing. A hostname containing ``:`` is wrapped
    in square brackets.
    """
    hostname = url.hostname
    if hostname and ":" in hostname:
        # ipv6 address
        host = "[" + hostname + "]"
    else:
        host = hostname or ""

    pieces = [
        url.protocol or "",
        "//" if url.slashes else "",
        url.auth + "@" if url.auth else "",
        host,
        ":" + url.port if url.port else "",
        url.pathname or "",
        url.search or "",
        url.hash or "",
    ]
    return "".join(pieces)
|
LTA_openwebtext_dualt/mini_owt_logdirichlet/.venv_qwen35_uv/lib/python3.12/site-packages/mdurl/_parse.py
ADDED
|
@@ -0,0 +1,304 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright Joyent, Inc. and other Node contributors.
|
| 2 |
+
#
|
| 3 |
+
# Permission is hereby granted, free of charge, to any person obtaining a
|
| 4 |
+
# copy of this software and associated documentation files (the
|
| 5 |
+
# "Software"), to deal in the Software without restriction, including
|
| 6 |
+
# without limitation the rights to use, copy, modify, merge, publish,
|
| 7 |
+
# distribute, sublicense, and/or sell copies of the Software, and to permit
|
| 8 |
+
# persons to whom the Software is furnished to do so, subject to the
|
| 9 |
+
# following conditions:
|
| 10 |
+
#
|
| 11 |
+
# The above copyright notice and this permission notice shall be included
|
| 12 |
+
# in all copies or substantial portions of the Software.
|
| 13 |
+
#
|
| 14 |
+
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
|
| 15 |
+
# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
| 16 |
+
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN
|
| 17 |
+
# NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
|
| 18 |
+
# DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
|
| 19 |
+
# OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
|
| 20 |
+
# USE OR OTHER DEALINGS IN THE SOFTWARE.
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
# Changes from joyent/node:
|
| 24 |
+
#
|
| 25 |
+
# 1. No leading slash in paths,
|
| 26 |
+
# e.g. in `url.parse('http://foo?bar')` pathname is ``, not `/`
|
| 27 |
+
#
|
| 28 |
+
# 2. Backslashes are not replaced with slashes,
|
| 29 |
+
# so `http:\\example.org\` is treated like a relative path
|
| 30 |
+
#
|
| 31 |
+
# 3. Trailing colon is treated like a part of the path,
|
| 32 |
+
# i.e. in `http://example.org:foo` pathname is `:foo`
|
| 33 |
+
#
|
| 34 |
+
# 4. Nothing is URL-encoded in the resulting object,
|
| 35 |
+
# (in joyent/node some chars in auth and paths are encoded)
|
| 36 |
+
#
|
| 37 |
+
# 5. `url.parse()` does not have `parseQueryString` argument
|
| 38 |
+
#
|
| 39 |
+
# 6. Removed extraneous result properties: `host`, `path`, `query`, etc.,
|
| 40 |
+
# which can be constructed using other parts of the url.
|
| 41 |
+
|
| 42 |
+
from __future__ import annotations
|
| 43 |
+
|
| 44 |
+
from collections import defaultdict
|
| 45 |
+
import re
|
| 46 |
+
|
| 47 |
+
from mdurl._url import URL
|
| 48 |
+
|
| 49 |
+
# Reference: RFC 3986, RFC 1808, RFC 2396
|
| 50 |
+
|
| 51 |
+
# define these here so at least they only have to be
|
| 52 |
+
# compiled once on the first module load.
|
| 53 |
+
PROTOCOL_PATTERN = re.compile(r"^([a-z0-9.+-]+:)", re.IGNORECASE)
PORT_PATTERN = re.compile(r":[0-9]*$")

# Special case for a simple path URL
SIMPLE_PATH_PATTERN = re.compile(r"^(//?(?!/)[^?\s]*)(\?[^\s]*)?$")

# RFC 2396: characters reserved for delimiting URLs.
# We actually just auto-escape these.
DELIMS = ("<", ">", '"', "`", " ", "\r", "\n", "\t")

# RFC 2396: characters not allowed for various reasons.
UNWISE = ("{", "}", "|", "\\", "^", "`", *DELIMS)

# Allowed by RFCs, but cause of XSS attacks.  Always escape these.
AUTO_ESCAPE = ("'", *UNWISE)

# Characters that are never ever allowed in a hostname.
# Note that any invalid chars are also handled, but these
# are the ones that are *expected* to be seen, so we fast-path
# them.
NON_HOST_CHARS = ("%", "/", "?", ";", "#", *AUTO_ESCAPE)
HOST_ENDING_CHARS = ("/", "?", "#")
HOSTNAME_MAX_LEN = 255
HOSTNAME_PART_PATTERN = re.compile(r"^[+a-z0-9A-Z_-]{0,63}$")
HOSTNAME_PART_START = re.compile(r"^([+a-z0-9A-Z_-]{0,63})(.*)$")

# Both tables list each protocol with and without its trailing colon, and
# answer False for any key they do not contain.

# protocols that never have a hostname.
HOSTLESS_PROTOCOL = defaultdict(
    bool,
    {name + suffix: True for suffix in ("", ":") for name in ("javascript",)},
)

# protocols that always contain a // bit.
SLASHED_PROTOCOL = defaultdict(
    bool,
    {
        name + suffix: True
        for suffix in ("", ":")
        for name in ("http", "https", "ftp", "gopher", "file")
    },
)
|
| 103 |
+
|
| 104 |
+
|
| 105 |
+
class MutableURL:
|
| 106 |
+
def __init__(self) -> None:
|
| 107 |
+
self.protocol: str | None = None
|
| 108 |
+
self.slashes: bool = False
|
| 109 |
+
self.auth: str | None = None
|
| 110 |
+
self.port: str | None = None
|
| 111 |
+
self.hostname: str | None = None
|
| 112 |
+
self.hash: str | None = None
|
| 113 |
+
self.search: str | None = None
|
| 114 |
+
self.pathname: str | None = None
|
| 115 |
+
|
| 116 |
+
def parse(self, url: str, slashes_denote_host: bool) -> "MutableURL":
|
| 117 |
+
lower_proto = ""
|
| 118 |
+
slashes = False
|
| 119 |
+
rest = url
|
| 120 |
+
|
| 121 |
+
# trim before proceeding.
|
| 122 |
+
# This is to support parse stuff like " http://foo.com \n"
|
| 123 |
+
rest = rest.strip()
|
| 124 |
+
|
| 125 |
+
if not slashes_denote_host and len(url.split("#")) == 1:
|
| 126 |
+
# Try fast path regexp
|
| 127 |
+
simple_path = SIMPLE_PATH_PATTERN.match(rest)
|
| 128 |
+
if simple_path:
|
| 129 |
+
self.pathname = simple_path.group(1)
|
| 130 |
+
if simple_path.group(2):
|
| 131 |
+
self.search = simple_path.group(2)
|
| 132 |
+
return self
|
| 133 |
+
|
| 134 |
+
proto = ""
|
| 135 |
+
proto_match = PROTOCOL_PATTERN.match(rest)
|
| 136 |
+
if proto_match:
|
| 137 |
+
proto = proto_match.group()
|
| 138 |
+
lower_proto = proto.lower()
|
| 139 |
+
self.protocol = proto
|
| 140 |
+
rest = rest[len(proto) :]
|
| 141 |
+
|
| 142 |
+
# figure out if it's got a host
|
| 143 |
+
# user@server is *always* interpreted as a hostname, and url
|
| 144 |
+
# resolution will treat //foo/bar as host=foo,path=bar because that's
|
| 145 |
+
# how the browser resolves relative URLs.
|
| 146 |
+
if slashes_denote_host or proto or re.search(r"^//[^@/]+@[^@/]+", rest):
|
| 147 |
+
slashes = rest.startswith("//")
|
| 148 |
+
if slashes and not (proto and HOSTLESS_PROTOCOL[proto]):
|
| 149 |
+
rest = rest[2:]
|
| 150 |
+
self.slashes = True
|
| 151 |
+
|
| 152 |
+
if not HOSTLESS_PROTOCOL[proto] and (
|
| 153 |
+
slashes or (proto and not SLASHED_PROTOCOL[proto])
|
| 154 |
+
):
|
| 155 |
+
|
| 156 |
+
# there's a hostname.
|
| 157 |
+
# the first instance of /, ?, ;, or # ends the host.
|
| 158 |
+
#
|
| 159 |
+
# If there is an @ in the hostname, then non-host chars *are* allowed
|
| 160 |
+
# to the left of the last @ sign, unless some host-ending character
|
| 161 |
+
# comes *before* the @-sign.
|
| 162 |
+
# URLs are obnoxious.
|
| 163 |
+
#
|
| 164 |
+
# ex:
|
| 165 |
+
# http://a@b@c/ => user:a@b host:c
|
| 166 |
+
# http://a@b?@c => user:a host:c path:/?@c
|
| 167 |
+
|
| 168 |
+
# v0.12 TODO(isaacs): This is not quite how Chrome does things.
|
| 169 |
+
# Review our test case against browsers more comprehensively.
|
| 170 |
+
|
| 171 |
+
# find the first instance of any hostEndingChars
|
| 172 |
+
host_end = -1
|
| 173 |
+
for i in range(len(HOST_ENDING_CHARS)):
|
| 174 |
+
hec = rest.find(HOST_ENDING_CHARS[i])
|
| 175 |
+
if hec != -1 and (host_end == -1 or hec < host_end):
|
| 176 |
+
host_end = hec
|
| 177 |
+
|
| 178 |
+
# at this point, either we have an explicit point where the
|
| 179 |
+
# auth portion cannot go past, or the last @ char is the decider.
|
| 180 |
+
if host_end == -1:
|
| 181 |
+
# atSign can be anywhere.
|
| 182 |
+
at_sign = rest.rfind("@")
|
| 183 |
+
else:
|
| 184 |
+
# atSign must be in auth portion.
|
| 185 |
+
# http://a@b/c@d => host:b auth:a path:/c@d
|
| 186 |
+
at_sign = rest.rfind("@", 0, host_end + 1)
|
| 187 |
+
|
| 188 |
+
# Now we have a portion which is definitely the auth.
|
| 189 |
+
# Pull that off.
|
| 190 |
+
if at_sign != -1:
|
| 191 |
+
auth = rest[:at_sign]
|
| 192 |
+
rest = rest[at_sign + 1 :]
|
| 193 |
+
self.auth = auth
|
| 194 |
+
|
| 195 |
+
# the host is the remaining to the left of the first non-host char
|
| 196 |
+
host_end = -1
|
| 197 |
+
for i in range(len(NON_HOST_CHARS)):
|
| 198 |
+
hec = rest.find(NON_HOST_CHARS[i])
|
| 199 |
+
if hec != -1 and (host_end == -1 or hec < host_end):
|
| 200 |
+
host_end = hec
|
| 201 |
+
# if we still have not hit it, then the entire thing is a host.
|
| 202 |
+
if host_end == -1:
|
| 203 |
+
host_end = len(rest)
|
| 204 |
+
|
| 205 |
+
if host_end > 0 and rest[host_end - 1] == ":":
|
| 206 |
+
host_end -= 1
|
| 207 |
+
host = rest[:host_end]
|
| 208 |
+
rest = rest[host_end:]
|
| 209 |
+
|
| 210 |
+
# pull out port.
|
| 211 |
+
self.parse_host(host)
|
| 212 |
+
|
| 213 |
+
# we've indicated that there is a hostname,
|
| 214 |
+
# so even if it's empty, it has to be present.
|
| 215 |
+
self.hostname = self.hostname or ""
|
| 216 |
+
|
| 217 |
+
# if hostname begins with [ and ends with ]
|
| 218 |
+
# assume that it's an IPv6 address.
|
| 219 |
+
ipv6_hostname = self.hostname.startswith("[") and self.hostname.endswith(
|
| 220 |
+
"]"
|
| 221 |
+
)
|
| 222 |
+
|
| 223 |
+
# validate a little.
|
| 224 |
+
if not ipv6_hostname:
|
| 225 |
+
hostparts = self.hostname.split(".")
|
| 226 |
+
l = len(hostparts) # noqa: E741
|
| 227 |
+
i = 0
|
| 228 |
+
while i < l:
|
| 229 |
+
part = hostparts[i]
|
| 230 |
+
if not part:
|
| 231 |
+
i += 1 # emulate statement3 in JS for loop
|
| 232 |
+
continue
|
| 233 |
+
if not HOSTNAME_PART_PATTERN.search(part):
|
| 234 |
+
newpart = ""
|
| 235 |
+
k = len(part)
|
| 236 |
+
j = 0
|
| 237 |
+
while j < k:
|
| 238 |
+
if ord(part[j]) > 127:
|
| 239 |
+
# we replace non-ASCII char with a temporary placeholder
|
| 240 |
+
# we need this to make sure size of hostname is not
|
| 241 |
+
# broken by replacing non-ASCII by nothing
|
| 242 |
+
newpart += "x"
|
| 243 |
+
else:
|
| 244 |
+
newpart += part[j]
|
| 245 |
+
j += 1 # emulate statement3 in JS for loop
|
| 246 |
+
|
| 247 |
+
# we test again with ASCII char only
|
| 248 |
+
if not HOSTNAME_PART_PATTERN.search(newpart):
|
| 249 |
+
valid_parts = hostparts[:i]
|
| 250 |
+
not_host = hostparts[i + 1 :]
|
| 251 |
+
bit = HOSTNAME_PART_START.search(part)
|
| 252 |
+
if bit:
|
| 253 |
+
valid_parts.append(bit.group(1))
|
| 254 |
+
not_host.insert(0, bit.group(2))
|
| 255 |
+
if not_host:
|
| 256 |
+
rest = ".".join(not_host) + rest
|
| 257 |
+
self.hostname = ".".join(valid_parts)
|
| 258 |
+
break
|
| 259 |
+
i += 1 # emulate statement3 in JS for loop
|
| 260 |
+
|
| 261 |
+
if len(self.hostname) > HOSTNAME_MAX_LEN:
|
| 262 |
+
self.hostname = ""
|
| 263 |
+
|
| 264 |
+
# strip [ and ] from the hostname
|
| 265 |
+
# the host field still retains them, though
|
| 266 |
+
if ipv6_hostname:
|
| 267 |
+
self.hostname = self.hostname[1:-1]
|
| 268 |
+
|
| 269 |
+
# chop off from the tail first.
|
| 270 |
+
hash = rest.find("#") # noqa: A001
|
| 271 |
+
if hash != -1:
|
| 272 |
+
# got a fragment string.
|
| 273 |
+
self.hash = rest[hash:]
|
| 274 |
+
rest = rest[:hash]
|
| 275 |
+
qm = rest.find("?")
|
| 276 |
+
if qm != -1:
|
| 277 |
+
self.search = rest[qm:]
|
| 278 |
+
rest = rest[:qm]
|
| 279 |
+
if rest:
|
| 280 |
+
self.pathname = rest
|
| 281 |
+
if SLASHED_PROTOCOL[lower_proto] and self.hostname and not self.pathname:
|
| 282 |
+
self.pathname = ""
|
| 283 |
+
|
| 284 |
+
return self
|
| 285 |
+
|
| 286 |
+
def parse_host(self, host: str) -> None:
    """Split a trailing port off *host* and record the pieces on ``self``.

    ``PORT_PATTERN`` is searched in *host*. When it matches, the matched
    text is cut from the end of *host*, and everything after the first
    character of the match is stored in ``self.port`` (a match that is
    exactly ``":"`` stores no port). Whatever is left of *host*, if
    non-empty, is stored in ``self.hostname``.
    """
    matched = PORT_PATTERN.search(host)
    if matched:
        port_text = matched.group()
        # Drop the matched suffix first; this does not depend on whether
        # a usable port value follows the separator.
        # NOTE(review): slicing by length assumes the match sits at the very
        # end of host (pattern presumably end-anchored) - confirm.
        host = host[: -len(port_text)]
        if port_text != ":":
            self.port = port_text[1:]
    if host:
        self.hostname = host
|
| 295 |
+
|
| 296 |
+
|
| 297 |
+
def url_parse(url: URL | str, *, slashes_denote_host: bool = False) -> URL:
    """Return *url* as an immutable ``URL`` tuple.

    A value that is already a ``URL`` is handed back untouched. Anything
    else is run through a fresh ``MutableURL`` (``slashes_denote_host`` is
    forwarded to its ``parse`` method) and the resulting attributes are
    copied into a new ``URL``.
    """
    if isinstance(url, URL):
        return url

    mutable = MutableURL()
    mutable.parse(url, slashes_denote_host)
    # Field order matches the URL NamedTuple declaration.
    return URL(
        protocol=mutable.protocol,
        slashes=mutable.slashes,
        auth=mutable.auth,
        port=mutable.port,
        hostname=mutable.hostname,
        hash=mutable.hash,
        search=mutable.search,
        pathname=mutable.pathname,
    )
|
LTA_openwebtext_dualt/mini_owt_logdirichlet/.venv_qwen35_uv/lib/python3.12/site-packages/mdurl/_url.py
ADDED
|
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
from typing import NamedTuple
|
| 4 |
+
|
| 5 |
+
|
| 6 |
+
class URL(NamedTuple):
    """Immutable record of the components of a URL.

    Field order is part of the interface: instances are built positionally
    by ``url_parse`` in ``_parse.py``. The per-field notes below describe the
    values written by ``MutableURL.parse`` / ``parse_host`` where that code
    is visible; other notes are marked as assumptions.
    """

    # presumably the scheme part of the URL - TODO confirm against the parser
    protocol: str | None
    # NOTE(review): looks like a flag for "//" after the protocol - confirm
    slashes: bool
    # presumably the user-info part preceding the host - TODO confirm
    auth: str | None
    # Port text; parse_host stores the port match minus its first character.
    port: str | None
    # Host name; the parser strips the surrounding [ ] for IPv6 hostnames.
    hostname: str | None
    # Fragment; the parser stores the text from the "#" onward (inclusive).
    hash: str | None  # noqa: A003
    # Query; the parser stores the text from the "?" onward (inclusive).
    search: str | None
    # What remains after the fragment and query have been cut off.
    pathname: str | None
|
LTA_openwebtext_dualt/mini_owt_logdirichlet/.venv_qwen35_uv/lib/python3.12/site-packages/mdurl/py.typed
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
# Marker file for PEP 561
|
LTA_openwebtext_dualt/mini_owt_logdirichlet/.venv_qwen35_uv/lib/python3.12/site-packages/numpy-1.26.4.dist-info/INSTALLER
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
uv
|
LTA_openwebtext_dualt/mini_owt_logdirichlet/.venv_qwen35_uv/lib/python3.12/site-packages/numpy-1.26.4.dist-info/LICENSE.txt
ADDED
|
@@ -0,0 +1,971 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
Copyright (c) 2005-2023, NumPy Developers.
|
| 2 |
+
All rights reserved.
|
| 3 |
+
|
| 4 |
+
Redistribution and use in source and binary forms, with or without
|
| 5 |
+
modification, are permitted provided that the following conditions are
|
| 6 |
+
met:
|
| 7 |
+
|
| 8 |
+
* Redistributions of source code must retain the above copyright
|
| 9 |
+
notice, this list of conditions and the following disclaimer.
|
| 10 |
+
|
| 11 |
+
* Redistributions in binary form must reproduce the above
|
| 12 |
+
copyright notice, this list of conditions and the following
|
| 13 |
+
disclaimer in the documentation and/or other materials provided
|
| 14 |
+
with the distribution.
|
| 15 |
+
|
| 16 |
+
* Neither the name of the NumPy Developers nor the names of any
|
| 17 |
+
contributors may be used to endorse or promote products derived
|
| 18 |
+
from this software without specific prior written permission.
|
| 19 |
+
|
| 20 |
+
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
| 21 |
+
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
| 22 |
+
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
| 23 |
+
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
| 24 |
+
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
| 25 |
+
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
| 26 |
+
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
| 27 |
+
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
| 28 |
+
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
| 29 |
+
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
| 30 |
+
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
| 31 |
+
|
| 32 |
+
----
|
| 33 |
+
|
| 34 |
+
The NumPy repository and source distributions bundle several libraries that are
|
| 35 |
+
compatibly licensed. We list these here.
|
| 36 |
+
|
| 37 |
+
Name: lapack-lite
|
| 38 |
+
Files: numpy/linalg/lapack_lite/*
|
| 39 |
+
License: BSD-3-Clause
|
| 40 |
+
For details, see numpy/linalg/lapack_lite/LICENSE.txt
|
| 41 |
+
|
| 42 |
+
Name: tempita
|
| 43 |
+
Files: tools/npy_tempita/*
|
| 44 |
+
License: MIT
|
| 45 |
+
For details, see tools/npy_tempita/license.txt
|
| 46 |
+
|
| 47 |
+
Name: dragon4
|
| 48 |
+
Files: numpy/core/src/multiarray/dragon4.c
|
| 49 |
+
License: MIT
|
| 50 |
+
For license text, see numpy/core/src/multiarray/dragon4.c
|
| 51 |
+
|
| 52 |
+
Name: libdivide
|
| 53 |
+
Files: numpy/core/include/numpy/libdivide/*
|
| 54 |
+
License: Zlib
|
| 55 |
+
For license text, see numpy/core/include/numpy/libdivide/LICENSE.txt
|
| 56 |
+
|
| 57 |
+
|
| 58 |
+
Note that the following files are vendored in the repository and sdist but not
|
| 59 |
+
installed in built numpy packages:
|
| 60 |
+
|
| 61 |
+
Name: Meson
|
| 62 |
+
Files: vendored-meson/meson/*
|
| 63 |
+
License: Apache 2.0
|
| 64 |
+
For license text, see vendored-meson/meson/COPYING
|
| 65 |
+
|
| 66 |
+
Name: spin
|
| 67 |
+
Files: .spin/cmds.py
|
| 68 |
+
License: BSD-3
|
| 69 |
+
For license text, see .spin/LICENSE
|
| 70 |
+
|
| 71 |
+
----
|
| 72 |
+
|
| 73 |
+
This binary distribution of NumPy also bundles the following software:
|
| 74 |
+
|
| 75 |
+
|
| 76 |
+
Name: OpenBLAS
|
| 77 |
+
Files: numpy.libs/libopenblas*.so
|
| 78 |
+
Description: bundled as a dynamically linked library
|
| 79 |
+
Availability: https://github.com/OpenMathLib/OpenBLAS/
|
| 80 |
+
License: BSD-3-Clause
|
| 81 |
+
Copyright (c) 2011-2014, The OpenBLAS Project
|
| 82 |
+
All rights reserved.
|
| 83 |
+
|
| 84 |
+
Redistribution and use in source and binary forms, with or without
|
| 85 |
+
modification, are permitted provided that the following conditions are
|
| 86 |
+
met:
|
| 87 |
+
|
| 88 |
+
1. Redistributions of source code must retain the above copyright
|
| 89 |
+
notice, this list of conditions and the following disclaimer.
|
| 90 |
+
|
| 91 |
+
2. Redistributions in binary form must reproduce the above copyright
|
| 92 |
+
notice, this list of conditions and the following disclaimer in
|
| 93 |
+
the documentation and/or other materials provided with the
|
| 94 |
+
distribution.
|
| 95 |
+
3. Neither the name of the OpenBLAS project nor the names of
|
| 96 |
+
its contributors may be used to endorse or promote products
|
| 97 |
+
derived from this software without specific prior written
|
| 98 |
+
permission.
|
| 99 |
+
|
| 100 |
+
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
| 101 |
+
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
| 102 |
+
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
| 103 |
+
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
| 104 |
+
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
| 105 |
+
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
| 106 |
+
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
| 107 |
+
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
| 108 |
+
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
| 109 |
+
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
| 110 |
+
|
| 111 |
+
|
| 112 |
+
Name: LAPACK
|
| 113 |
+
Files: numpy.libs/libopenblas*.so
|
| 114 |
+
Description: bundled in OpenBLAS
|
| 115 |
+
Availability: https://github.com/OpenMathLib/OpenBLAS/
|
| 116 |
+
License: BSD-3-Clause-Attribution
|
| 117 |
+
Copyright (c) 1992-2013 The University of Tennessee and The University
|
| 118 |
+
of Tennessee Research Foundation. All rights
|
| 119 |
+
reserved.
|
| 120 |
+
Copyright (c) 2000-2013 The University of California Berkeley. All
|
| 121 |
+
rights reserved.
|
| 122 |
+
Copyright (c) 2006-2013 The University of Colorado Denver. All rights
|
| 123 |
+
reserved.
|
| 124 |
+
|
| 125 |
+
$COPYRIGHT$
|
| 126 |
+
|
| 127 |
+
Additional copyrights may follow
|
| 128 |
+
|
| 129 |
+
$HEADER$
|
| 130 |
+
|
| 131 |
+
Redistribution and use in source and binary forms, with or without
|
| 132 |
+
modification, are permitted provided that the following conditions are
|
| 133 |
+
met:
|
| 134 |
+
|
| 135 |
+
- Redistributions of source code must retain the above copyright
|
| 136 |
+
notice, this list of conditions and the following disclaimer.
|
| 137 |
+
|
| 138 |
+
- Redistributions in binary form must reproduce the above copyright
|
| 139 |
+
notice, this list of conditions and the following disclaimer listed
|
| 140 |
+
in this license in the documentation and/or other materials
|
| 141 |
+
provided with the distribution.
|
| 142 |
+
|
| 143 |
+
- Neither the name of the copyright holders nor the names of its
|
| 144 |
+
contributors may be used to endorse or promote products derived from
|
| 145 |
+
this software without specific prior written permission.
|
| 146 |
+
|
| 147 |
+
The copyright holders provide no reassurances that the source code
|
| 148 |
+
provided does not infringe any patent, copyright, or any other
|
| 149 |
+
intellectual property rights of third parties. The copyright holders
|
| 150 |
+
disclaim any liability to any recipient for claims brought against
|
| 151 |
+
recipient by any third party for infringement of that parties
|
| 152 |
+
intellectual property rights.
|
| 153 |
+
|
| 154 |
+
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
| 155 |
+
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
| 156 |
+
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
| 157 |
+
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
| 158 |
+
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
| 159 |
+
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
| 160 |
+
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
| 161 |
+
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
| 162 |
+
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
| 163 |
+
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
| 164 |
+
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
| 165 |
+
|
| 166 |
+
|
| 167 |
+
Name: GCC runtime library
|
| 168 |
+
Files: numpy.libs/libgfortran*.so
|
| 169 |
+
Description: dynamically linked to files compiled with gcc
|
| 170 |
+
Availability: https://gcc.gnu.org/git/?p=gcc.git;a=tree;f=libgfortran
|
| 171 |
+
License: GPL-3.0-with-GCC-exception
|
| 172 |
+
Copyright (C) 2002-2017 Free Software Foundation, Inc.
|
| 173 |
+
|
| 174 |
+
Libgfortran is free software; you can redistribute it and/or modify
|
| 175 |
+
it under the terms of the GNU General Public License as published by
|
| 176 |
+
the Free Software Foundation; either version 3, or (at your option)
|
| 177 |
+
any later version.
|
| 178 |
+
|
| 179 |
+
Libgfortran is distributed in the hope that it will be useful,
|
| 180 |
+
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
| 181 |
+
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
| 182 |
+
GNU General Public License for more details.
|
| 183 |
+
|
| 184 |
+
Under Section 7 of GPL version 3, you are granted additional
|
| 185 |
+
permissions described in the GCC Runtime Library Exception, version
|
| 186 |
+
3.1, as published by the Free Software Foundation.
|
| 187 |
+
|
| 188 |
+
You should have received a copy of the GNU General Public License and
|
| 189 |
+
a copy of the GCC Runtime Library Exception along with this program;
|
| 190 |
+
see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
|
| 191 |
+
<http://www.gnu.org/licenses/>.
|
| 192 |
+
|
| 193 |
+
----
|
| 194 |
+
|
| 195 |
+
Full text of license texts referred to above follows (that they are
|
| 196 |
+
listed below does not necessarily imply the conditions apply to the
|
| 197 |
+
present binary release):
|
| 198 |
+
|
| 199 |
+
----
|
| 200 |
+
|
| 201 |
+
GCC RUNTIME LIBRARY EXCEPTION
|
| 202 |
+
|
| 203 |
+
Version 3.1, 31 March 2009
|
| 204 |
+
|
| 205 |
+
Copyright (C) 2009 Free Software Foundation, Inc. <http://fsf.org/>
|
| 206 |
+
|
| 207 |
+
Everyone is permitted to copy and distribute verbatim copies of this
|
| 208 |
+
license document, but changing it is not allowed.
|
| 209 |
+
|
| 210 |
+
This GCC Runtime Library Exception ("Exception") is an additional
|
| 211 |
+
permission under section 7 of the GNU General Public License, version
|
| 212 |
+
3 ("GPLv3"). It applies to a given file (the "Runtime Library") that
|
| 213 |
+
bears a notice placed by the copyright holder of the file stating that
|
| 214 |
+
the file is governed by GPLv3 along with this Exception.
|
| 215 |
+
|
| 216 |
+
When you use GCC to compile a program, GCC may combine portions of
|
| 217 |
+
certain GCC header files and runtime libraries with the compiled
|
| 218 |
+
program. The purpose of this Exception is to allow compilation of
|
| 219 |
+
non-GPL (including proprietary) programs to use, in this way, the
|
| 220 |
+
header files and runtime libraries covered by this Exception.
|
| 221 |
+
|
| 222 |
+
0. Definitions.
|
| 223 |
+
|
| 224 |
+
A file is an "Independent Module" if it either requires the Runtime
|
| 225 |
+
Library for execution after a Compilation Process, or makes use of an
|
| 226 |
+
interface provided by the Runtime Library, but is not otherwise based
|
| 227 |
+
on the Runtime Library.
|
| 228 |
+
|
| 229 |
+
"GCC" means a version of the GNU Compiler Collection, with or without
|
| 230 |
+
modifications, governed by version 3 (or a specified later version) of
|
| 231 |
+
the GNU General Public License (GPL) with the option of using any
|
| 232 |
+
subsequent versions published by the FSF.
|
| 233 |
+
|
| 234 |
+
"GPL-compatible Software" is software whose conditions of propagation,
|
| 235 |
+
modification and use would permit combination with GCC in accord with
|
| 236 |
+
the license of GCC.
|
| 237 |
+
|
| 238 |
+
"Target Code" refers to output from any compiler for a real or virtual
|
| 239 |
+
target processor architecture, in executable form or suitable for
|
| 240 |
+
input to an assembler, loader, linker and/or execution
|
| 241 |
+
phase. Notwithstanding that, Target Code does not include data in any
|
| 242 |
+
format that is used as a compiler intermediate representation, or used
|
| 243 |
+
for producing a compiler intermediate representation.
|
| 244 |
+
|
| 245 |
+
The "Compilation Process" transforms code entirely represented in
|
| 246 |
+
non-intermediate languages designed for human-written code, and/or in
|
| 247 |
+
Java Virtual Machine byte code, into Target Code. Thus, for example,
|
| 248 |
+
use of source code generators and preprocessors need not be considered
|
| 249 |
+
part of the Compilation Process, since the Compilation Process can be
|
| 250 |
+
understood as starting with the output of the generators or
|
| 251 |
+
preprocessors.
|
| 252 |
+
|
| 253 |
+
A Compilation Process is "Eligible" if it is done using GCC, alone or
|
| 254 |
+
with other GPL-compatible software, or if it is done without using any
|
| 255 |
+
work based on GCC. For example, using non-GPL-compatible Software to
|
| 256 |
+
optimize any GCC intermediate representations would not qualify as an
|
| 257 |
+
Eligible Compilation Process.
|
| 258 |
+
|
| 259 |
+
1. Grant of Additional Permission.
|
| 260 |
+
|
| 261 |
+
You have permission to propagate a work of Target Code formed by
|
| 262 |
+
combining the Runtime Library with Independent Modules, even if such
|
| 263 |
+
propagation would otherwise violate the terms of GPLv3, provided that
|
| 264 |
+
all Target Code was generated by Eligible Compilation Processes. You
|
| 265 |
+
may then convey such a combination under terms of your choice,
|
| 266 |
+
consistent with the licensing of the Independent Modules.
|
| 267 |
+
|
| 268 |
+
2. No Weakening of GCC Copyleft.
|
| 269 |
+
|
| 270 |
+
The availability of this Exception does not imply any general
|
| 271 |
+
presumption that third-party software is unaffected by the copyleft
|
| 272 |
+
requirements of the license of GCC.
|
| 273 |
+
|
| 274 |
+
----
|
| 275 |
+
|
| 276 |
+
GNU GENERAL PUBLIC LICENSE
|
| 277 |
+
Version 3, 29 June 2007
|
| 278 |
+
|
| 279 |
+
Copyright (C) 2007 Free Software Foundation, Inc. <http://fsf.org/>
|
| 280 |
+
Everyone is permitted to copy and distribute verbatim copies
|
| 281 |
+
of this license document, but changing it is not allowed.
|
| 282 |
+
|
| 283 |
+
Preamble
|
| 284 |
+
|
| 285 |
+
The GNU General Public License is a free, copyleft license for
|
| 286 |
+
software and other kinds of works.
|
| 287 |
+
|
| 288 |
+
The licenses for most software and other practical works are designed
|
| 289 |
+
to take away your freedom to share and change the works. By contrast,
|
| 290 |
+
the GNU General Public License is intended to guarantee your freedom to
|
| 291 |
+
share and change all versions of a program--to make sure it remains free
|
| 292 |
+
software for all its users. We, the Free Software Foundation, use the
|
| 293 |
+
GNU General Public License for most of our software; it applies also to
|
| 294 |
+
any other work released this way by its authors. You can apply it to
|
| 295 |
+
your programs, too.
|
| 296 |
+
|
| 297 |
+
When we speak of free software, we are referring to freedom, not
|
| 298 |
+
price. Our General Public Licenses are designed to make sure that you
|
| 299 |
+
have the freedom to distribute copies of free software (and charge for
|
| 300 |
+
them if you wish), that you receive source code or can get it if you
|
| 301 |
+
want it, that you can change the software or use pieces of it in new
|
| 302 |
+
free programs, and that you know you can do these things.
|
| 303 |
+
|
| 304 |
+
To protect your rights, we need to prevent others from denying you
|
| 305 |
+
these rights or asking you to surrender the rights. Therefore, you have
|
| 306 |
+
certain responsibilities if you distribute copies of the software, or if
|
| 307 |
+
you modify it: responsibilities to respect the freedom of others.
|
| 308 |
+
|
| 309 |
+
For example, if you distribute copies of such a program, whether
|
| 310 |
+
gratis or for a fee, you must pass on to the recipients the same
|
| 311 |
+
freedoms that you received. You must make sure that they, too, receive
|
| 312 |
+
or can get the source code. And you must show them these terms so they
|
| 313 |
+
know their rights.
|
| 314 |
+
|
| 315 |
+
Developers that use the GNU GPL protect your rights with two steps:
|
| 316 |
+
(1) assert copyright on the software, and (2) offer you this License
|
| 317 |
+
giving you legal permission to copy, distribute and/or modify it.
|
| 318 |
+
|
| 319 |
+
For the developers' and authors' protection, the GPL clearly explains
|
| 320 |
+
that there is no warranty for this free software. For both users' and
|
| 321 |
+
authors' sake, the GPL requires that modified versions be marked as
|
| 322 |
+
changed, so that their problems will not be attributed erroneously to
|
| 323 |
+
authors of previous versions.
|
| 324 |
+
|
| 325 |
+
Some devices are designed to deny users access to install or run
|
| 326 |
+
modified versions of the software inside them, although the manufacturer
|
| 327 |
+
can do so. This is fundamentally incompatible with the aim of
|
| 328 |
+
protecting users' freedom to change the software. The systematic
|
| 329 |
+
pattern of such abuse occurs in the area of products for individuals to
|
| 330 |
+
use, which is precisely where it is most unacceptable. Therefore, we
|
| 331 |
+
have designed this version of the GPL to prohibit the practice for those
|
| 332 |
+
products. If such problems arise substantially in other domains, we
|
| 333 |
+
stand ready to extend this provision to those domains in future versions
|
| 334 |
+
of the GPL, as needed to protect the freedom of users.
|
| 335 |
+
|
| 336 |
+
Finally, every program is threatened constantly by software patents.
|
| 337 |
+
States should not allow patents to restrict development and use of
|
| 338 |
+
software on general-purpose computers, but in those that do, we wish to
|
| 339 |
+
avoid the special danger that patents applied to a free program could
|
| 340 |
+
make it effectively proprietary. To prevent this, the GPL assures that
|
| 341 |
+
patents cannot be used to render the program non-free.
|
| 342 |
+
|
| 343 |
+
The precise terms and conditions for copying, distribution and
|
| 344 |
+
modification follow.
|
| 345 |
+
|
| 346 |
+
TERMS AND CONDITIONS
|
| 347 |
+
|
| 348 |
+
0. Definitions.
|
| 349 |
+
|
| 350 |
+
"This License" refers to version 3 of the GNU General Public License.
|
| 351 |
+
|
| 352 |
+
"Copyright" also means copyright-like laws that apply to other kinds of
|
| 353 |
+
works, such as semiconductor masks.
|
| 354 |
+
|
| 355 |
+
"The Program" refers to any copyrightable work licensed under this
|
| 356 |
+
License. Each licensee is addressed as "you". "Licensees" and
|
| 357 |
+
"recipients" may be individuals or organizations.
|
| 358 |
+
|
| 359 |
+
To "modify" a work means to copy from or adapt all or part of the work
|
| 360 |
+
in a fashion requiring copyright permission, other than the making of an
|
| 361 |
+
exact copy. The resulting work is called a "modified version" of the
|
| 362 |
+
earlier work or a work "based on" the earlier work.
|
| 363 |
+
|
| 364 |
+
A "covered work" means either the unmodified Program or a work based
|
| 365 |
+
on the Program.
|
| 366 |
+
|
| 367 |
+
To "propagate" a work means to do anything with it that, without
|
| 368 |
+
permission, would make you directly or secondarily liable for
|
| 369 |
+
infringement under applicable copyright law, except executing it on a
|
| 370 |
+
computer or modifying a private copy. Propagation includes copying,
|
| 371 |
+
distribution (with or without modification), making available to the
|
| 372 |
+
public, and in some countries other activities as well.
|
| 373 |
+
|
| 374 |
+
To "convey" a work means any kind of propagation that enables other
|
| 375 |
+
parties to make or receive copies. Mere interaction with a user through
|
| 376 |
+
a computer network, with no transfer of a copy, is not conveying.
|
| 377 |
+
|
| 378 |
+
An interactive user interface displays "Appropriate Legal Notices"
|
| 379 |
+
to the extent that it includes a convenient and prominently visible
|
| 380 |
+
feature that (1) displays an appropriate copyright notice, and (2)
|
| 381 |
+
tells the user that there is no warranty for the work (except to the
|
| 382 |
+
extent that warranties are provided), that licensees may convey the
|
| 383 |
+
work under this License, and how to view a copy of this License. If
|
| 384 |
+
the interface presents a list of user commands or options, such as a
|
| 385 |
+
menu, a prominent item in the list meets this criterion.
|
| 386 |
+
|
| 387 |
+
1. Source Code.
|
| 388 |
+
|
| 389 |
+
The "source code" for a work means the preferred form of the work
|
| 390 |
+
for making modifications to it. "Object code" means any non-source
|
| 391 |
+
form of a work.
|
| 392 |
+
|
| 393 |
+
A "Standard Interface" means an interface that either is an official
|
| 394 |
+
standard defined by a recognized standards body, or, in the case of
|
| 395 |
+
interfaces specified for a particular programming language, one that
|
| 396 |
+
is widely used among developers working in that language.
|
| 397 |
+
|
| 398 |
+
The "System Libraries" of an executable work include anything, other
|
| 399 |
+
than the work as a whole, that (a) is included in the normal form of
|
| 400 |
+
packaging a Major Component, but which is not part of that Major
|
| 401 |
+
Component, and (b) serves only to enable use of the work with that
|
| 402 |
+
Major Component, or to implement a Standard Interface for which an
|
| 403 |
+
implementation is available to the public in source code form. A
|
| 404 |
+
"Major Component", in this context, means a major essential component
|
| 405 |
+
(kernel, window system, and so on) of the specific operating system
|
| 406 |
+
(if any) on which the executable work runs, or a compiler used to
|
| 407 |
+
produce the work, or an object code interpreter used to run it.
|
| 408 |
+
|
| 409 |
+
The "Corresponding Source" for a work in object code form means all
|
| 410 |
+
the source code needed to generate, install, and (for an executable
|
| 411 |
+
work) run the object code and to modify the work, including scripts to
|
| 412 |
+
control those activities. However, it does not include the work's
|
| 413 |
+
System Libraries, or general-purpose tools or generally available free
|
| 414 |
+
programs which are used unmodified in performing those activities but
|
| 415 |
+
which are not part of the work. For example, Corresponding Source
|
| 416 |
+
includes interface definition files associated with source files for
|
| 417 |
+
the work, and the source code for shared libraries and dynamically
|
| 418 |
+
linked subprograms that the work is specifically designed to require,
|
| 419 |
+
such as by intimate data communication or control flow between those
|
| 420 |
+
subprograms and other parts of the work.
|
| 421 |
+
|
| 422 |
+
The Corresponding Source need not include anything that users
|
| 423 |
+
can regenerate automatically from other parts of the Corresponding
|
| 424 |
+
Source.
|
| 425 |
+
|
| 426 |
+
The Corresponding Source for a work in source code form is that
|
| 427 |
+
same work.
|
| 428 |
+
|
| 429 |
+
2. Basic Permissions.
|
| 430 |
+
|
| 431 |
+
All rights granted under this License are granted for the term of
|
| 432 |
+
copyright on the Program, and are irrevocable provided the stated
|
| 433 |
+
conditions are met. This License explicitly affirms your unlimited
|
| 434 |
+
permission to run the unmodified Program. The output from running a
|
| 435 |
+
covered work is covered by this License only if the output, given its
|
| 436 |
+
content, constitutes a covered work. This License acknowledges your
|
| 437 |
+
rights of fair use or other equivalent, as provided by copyright law.
|
| 438 |
+
|
| 439 |
+
You may make, run and propagate covered works that you do not
|
| 440 |
+
convey, without conditions so long as your license otherwise remains
|
| 441 |
+
in force. You may convey covered works to others for the sole purpose
|
| 442 |
+
of having them make modifications exclusively for you, or provide you
|
| 443 |
+
with facilities for running those works, provided that you comply with
|
| 444 |
+
the terms of this License in conveying all material for which you do
|
| 445 |
+
not control copyright. Those thus making or running the covered works
|
| 446 |
+
for you must do so exclusively on your behalf, under your direction
|
| 447 |
+
and control, on terms that prohibit them from making any copies of
|
| 448 |
+
your copyrighted material outside their relationship with you.
|
| 449 |
+
|
| 450 |
+
Conveying under any other circumstances is permitted solely under
|
| 451 |
+
the conditions stated below. Sublicensing is not allowed; section 10
|
| 452 |
+
makes it unnecessary.
|
| 453 |
+
|
| 454 |
+
3. Protecting Users' Legal Rights From Anti-Circumvention Law.
|
| 455 |
+
|
| 456 |
+
No covered work shall be deemed part of an effective technological
|
| 457 |
+
measure under any applicable law fulfilling obligations under article
|
| 458 |
+
11 of the WIPO copyright treaty adopted on 20 December 1996, or
|
| 459 |
+
similar laws prohibiting or restricting circumvention of such
|
| 460 |
+
measures.
|
| 461 |
+
|
| 462 |
+
When you convey a covered work, you waive any legal power to forbid
|
| 463 |
+
circumvention of technological measures to the extent such circumvention
|
| 464 |
+
is effected by exercising rights under this License with respect to
|
| 465 |
+
the covered work, and you disclaim any intention to limit operation or
|
| 466 |
+
modification of the work as a means of enforcing, against the work's
|
| 467 |
+
users, your or third parties' legal rights to forbid circumvention of
|
| 468 |
+
technological measures.
|
| 469 |
+
|
| 470 |
+
4. Conveying Verbatim Copies.
|
| 471 |
+
|
| 472 |
+
You may convey verbatim copies of the Program's source code as you
|
| 473 |
+
receive it, in any medium, provided that you conspicuously and
|
| 474 |
+
appropriately publish on each copy an appropriate copyright notice;
|
| 475 |
+
keep intact all notices stating that this License and any
|
| 476 |
+
non-permissive terms added in accord with section 7 apply to the code;
|
| 477 |
+
keep intact all notices of the absence of any warranty; and give all
|
| 478 |
+
recipients a copy of this License along with the Program.
|
| 479 |
+
|
| 480 |
+
You may charge any price or no price for each copy that you convey,
|
| 481 |
+
and you may offer support or warranty protection for a fee.
|
| 482 |
+
|
| 483 |
+
5. Conveying Modified Source Versions.
|
| 484 |
+
|
| 485 |
+
You may convey a work based on the Program, or the modifications to
|
| 486 |
+
produce it from the Program, in the form of source code under the
|
| 487 |
+
terms of section 4, provided that you also meet all of these conditions:
|
| 488 |
+
|
| 489 |
+
a) The work must carry prominent notices stating that you modified
|
| 490 |
+
it, and giving a relevant date.
|
| 491 |
+
|
| 492 |
+
b) The work must carry prominent notices stating that it is
|
| 493 |
+
released under this License and any conditions added under section
|
| 494 |
+
7. This requirement modifies the requirement in section 4 to
|
| 495 |
+
"keep intact all notices".
|
| 496 |
+
|
| 497 |
+
c) You must license the entire work, as a whole, under this
|
| 498 |
+
License to anyone who comes into possession of a copy. This
|
| 499 |
+
License will therefore apply, along with any applicable section 7
|
| 500 |
+
additional terms, to the whole of the work, and all its parts,
|
| 501 |
+
regardless of how they are packaged. This License gives no
|
| 502 |
+
permission to license the work in any other way, but it does not
|
| 503 |
+
invalidate such permission if you have separately received it.
|
| 504 |
+
|
| 505 |
+
d) If the work has interactive user interfaces, each must display
|
| 506 |
+
Appropriate Legal Notices; however, if the Program has interactive
|
| 507 |
+
interfaces that do not display Appropriate Legal Notices, your
|
| 508 |
+
work need not make them do so.
|
| 509 |
+
|
| 510 |
+
A compilation of a covered work with other separate and independent
|
| 511 |
+
works, which are not by their nature extensions of the covered work,
|
| 512 |
+
and which are not combined with it such as to form a larger program,
|
| 513 |
+
in or on a volume of a storage or distribution medium, is called an
|
| 514 |
+
"aggregate" if the compilation and its resulting copyright are not
|
| 515 |
+
used to limit the access or legal rights of the compilation's users
|
| 516 |
+
beyond what the individual works permit. Inclusion of a covered work
|
| 517 |
+
in an aggregate does not cause this License to apply to the other
|
| 518 |
+
parts of the aggregate.
|
| 519 |
+
|
| 520 |
+
6. Conveying Non-Source Forms.
|
| 521 |
+
|
| 522 |
+
You may convey a covered work in object code form under the terms
|
| 523 |
+
of sections 4 and 5, provided that you also convey the
|
| 524 |
+
machine-readable Corresponding Source under the terms of this License,
|
| 525 |
+
in one of these ways:
|
| 526 |
+
|
| 527 |
+
a) Convey the object code in, or embodied in, a physical product
|
| 528 |
+
(including a physical distribution medium), accompanied by the
|
| 529 |
+
Corresponding Source fixed on a durable physical medium
|
| 530 |
+
customarily used for software interchange.
|
| 531 |
+
|
| 532 |
+
b) Convey the object code in, or embodied in, a physical product
|
| 533 |
+
(including a physical distribution medium), accompanied by a
|
| 534 |
+
written offer, valid for at least three years and valid for as
|
| 535 |
+
long as you offer spare parts or customer support for that product
|
| 536 |
+
model, to give anyone who possesses the object code either (1) a
|
| 537 |
+
copy of the Corresponding Source for all the software in the
|
| 538 |
+
product that is covered by this License, on a durable physical
|
| 539 |
+
medium customarily used for software interchange, for a price no
|
| 540 |
+
more than your reasonable cost of physically performing this
|
| 541 |
+
conveying of source, or (2) access to copy the
|
| 542 |
+
Corresponding Source from a network server at no charge.
|
| 543 |
+
|
| 544 |
+
c) Convey individual copies of the object code with a copy of the
|
| 545 |
+
written offer to provide the Corresponding Source. This
|
| 546 |
+
alternative is allowed only occasionally and noncommercially, and
|
| 547 |
+
only if you received the object code with such an offer, in accord
|
| 548 |
+
with subsection 6b.
|
| 549 |
+
|
| 550 |
+
d) Convey the object code by offering access from a designated
|
| 551 |
+
place (gratis or for a charge), and offer equivalent access to the
|
| 552 |
+
Corresponding Source in the same way through the same place at no
|
| 553 |
+
further charge. You need not require recipients to copy the
|
| 554 |
+
Corresponding Source along with the object code. If the place to
|
| 555 |
+
copy the object code is a network server, the Corresponding Source
|
| 556 |
+
may be on a different server (operated by you or a third party)
|
| 557 |
+
that supports equivalent copying facilities, provided you maintain
|
| 558 |
+
clear directions next to the object code saying where to find the
|
| 559 |
+
Corresponding Source. Regardless of what server hosts the
|
| 560 |
+
Corresponding Source, you remain obligated to ensure that it is
|
| 561 |
+
available for as long as needed to satisfy these requirements.
|
| 562 |
+
|
| 563 |
+
e) Convey the object code using peer-to-peer transmission, provided
|
| 564 |
+
you inform other peers where the object code and Corresponding
|
| 565 |
+
Source of the work are being offered to the general public at no
|
| 566 |
+
charge under subsection 6d.
|
| 567 |
+
|
| 568 |
+
A separable portion of the object code, whose source code is excluded
|
| 569 |
+
from the Corresponding Source as a System Library, need not be
|
| 570 |
+
included in conveying the object code work.
|
| 571 |
+
|
| 572 |
+
A "User Product" is either (1) a "consumer product", which means any
|
| 573 |
+
tangible personal property which is normally used for personal, family,
|
| 574 |
+
or household purposes, or (2) anything designed or sold for incorporation
|
| 575 |
+
into a dwelling. In determining whether a product is a consumer product,
|
| 576 |
+
doubtful cases shall be resolved in favor of coverage. For a particular
|
| 577 |
+
product received by a particular user, "normally used" refers to a
|
| 578 |
+
typical or common use of that class of product, regardless of the status
|
| 579 |
+
of the particular user or of the way in which the particular user
|
| 580 |
+
actually uses, or expects or is expected to use, the product. A product
|
| 581 |
+
is a consumer product regardless of whether the product has substantial
|
| 582 |
+
commercial, industrial or non-consumer uses, unless such uses represent
|
| 583 |
+
the only significant mode of use of the product.
|
| 584 |
+
|
| 585 |
+
"Installation Information" for a User Product means any methods,
|
| 586 |
+
procedures, authorization keys, or other information required to install
|
| 587 |
+
and execute modified versions of a covered work in that User Product from
|
| 588 |
+
a modified version of its Corresponding Source. The information must
|
| 589 |
+
suffice to ensure that the continued functioning of the modified object
|
| 590 |
+
code is in no case prevented or interfered with solely because
|
| 591 |
+
modification has been made.
|
| 592 |
+
|
| 593 |
+
If you convey an object code work under this section in, or with, or
|
| 594 |
+
specifically for use in, a User Product, and the conveying occurs as
|
| 595 |
+
part of a transaction in which the right of possession and use of the
|
| 596 |
+
User Product is transferred to the recipient in perpetuity or for a
|
| 597 |
+
fixed term (regardless of how the transaction is characterized), the
|
| 598 |
+
Corresponding Source conveyed under this section must be accompanied
|
| 599 |
+
by the Installation Information. But this requirement does not apply
|
| 600 |
+
if neither you nor any third party retains the ability to install
|
| 601 |
+
modified object code on the User Product (for example, the work has
|
| 602 |
+
been installed in ROM).
|
| 603 |
+
|
| 604 |
+
The requirement to provide Installation Information does not include a
|
| 605 |
+
requirement to continue to provide support service, warranty, or updates
|
| 606 |
+
for a work that has been modified or installed by the recipient, or for
|
| 607 |
+
the User Product in which it has been modified or installed. Access to a
|
| 608 |
+
network may be denied when the modification itself materially and
|
| 609 |
+
adversely affects the operation of the network or violates the rules and
|
| 610 |
+
protocols for communication across the network.
|
| 611 |
+
|
| 612 |
+
Corresponding Source conveyed, and Installation Information provided,
|
| 613 |
+
in accord with this section must be in a format that is publicly
|
| 614 |
+
documented (and with an implementation available to the public in
|
| 615 |
+
source code form), and must require no special password or key for
|
| 616 |
+
unpacking, reading or copying.
|
| 617 |
+
|
| 618 |
+
7. Additional Terms.
|
| 619 |
+
|
| 620 |
+
"Additional permissions" are terms that supplement the terms of this
|
| 621 |
+
License by making exceptions from one or more of its conditions.
|
| 622 |
+
Additional permissions that are applicable to the entire Program shall
|
| 623 |
+
be treated as though they were included in this License, to the extent
|
| 624 |
+
that they are valid under applicable law. If additional permissions
|
| 625 |
+
apply only to part of the Program, that part may be used separately
|
| 626 |
+
under those permissions, but the entire Program remains governed by
|
| 627 |
+
this License without regard to the additional permissions.
|
| 628 |
+
|
| 629 |
+
When you convey a copy of a covered work, you may at your option
|
| 630 |
+
remove any additional permissions from that copy, or from any part of
|
| 631 |
+
it. (Additional permissions may be written to require their own
|
| 632 |
+
removal in certain cases when you modify the work.) You may place
|
| 633 |
+
additional permissions on material, added by you to a covered work,
|
| 634 |
+
for which you have or can give appropriate copyright permission.
|
| 635 |
+
|
| 636 |
+
Notwithstanding any other provision of this License, for material you
|
| 637 |
+
add to a covered work, you may (if authorized by the copyright holders of
|
| 638 |
+
that material) supplement the terms of this License with terms:
|
| 639 |
+
|
| 640 |
+
a) Disclaiming warranty or limiting liability differently from the
|
| 641 |
+
terms of sections 15 and 16 of this License; or
|
| 642 |
+
|
| 643 |
+
b) Requiring preservation of specified reasonable legal notices or
|
| 644 |
+
author attributions in that material or in the Appropriate Legal
|
| 645 |
+
Notices displayed by works containing it; or
|
| 646 |
+
|
| 647 |
+
c) Prohibiting misrepresentation of the origin of that material, or
|
| 648 |
+
requiring that modified versions of such material be marked in
|
| 649 |
+
reasonable ways as different from the original version; or
|
| 650 |
+
|
| 651 |
+
d) Limiting the use for publicity purposes of names of licensors or
|
| 652 |
+
authors of the material; or
|
| 653 |
+
|
| 654 |
+
e) Declining to grant rights under trademark law for use of some
|
| 655 |
+
trade names, trademarks, or service marks; or
|
| 656 |
+
|
| 657 |
+
f) Requiring indemnification of licensors and authors of that
|
| 658 |
+
material by anyone who conveys the material (or modified versions of
|
| 659 |
+
it) with contractual assumptions of liability to the recipient, for
|
| 660 |
+
any liability that these contractual assumptions directly impose on
|
| 661 |
+
those licensors and authors.
|
| 662 |
+
|
| 663 |
+
All other non-permissive additional terms are considered "further
|
| 664 |
+
restrictions" within the meaning of section 10. If the Program as you
|
| 665 |
+
received it, or any part of it, contains a notice stating that it is
|
| 666 |
+
governed by this License along with a term that is a further
|
| 667 |
+
restriction, you may remove that term. If a license document contains
|
| 668 |
+
a further restriction but permits relicensing or conveying under this
|
| 669 |
+
License, you may add to a covered work material governed by the terms
|
| 670 |
+
of that license document, provided that the further restriction does
|
| 671 |
+
not survive such relicensing or conveying.
|
| 672 |
+
|
| 673 |
+
If you add terms to a covered work in accord with this section, you
|
| 674 |
+
must place, in the relevant source files, a statement of the
|
| 675 |
+
additional terms that apply to those files, or a notice indicating
|
| 676 |
+
where to find the applicable terms.
|
| 677 |
+
|
| 678 |
+
Additional terms, permissive or non-permissive, may be stated in the
|
| 679 |
+
form of a separately written license, or stated as exceptions;
|
| 680 |
+
the above requirements apply either way.
|
| 681 |
+
|
| 682 |
+
8. Termination.
|
| 683 |
+
|
| 684 |
+
You may not propagate or modify a covered work except as expressly
|
| 685 |
+
provided under this License. Any attempt otherwise to propagate or
|
| 686 |
+
modify it is void, and will automatically terminate your rights under
|
| 687 |
+
this License (including any patent licenses granted under the third
|
| 688 |
+
paragraph of section 11).
|
| 689 |
+
|
| 690 |
+
However, if you cease all violation of this License, then your
|
| 691 |
+
license from a particular copyright holder is reinstated (a)
|
| 692 |
+
provisionally, unless and until the copyright holder explicitly and
|
| 693 |
+
finally terminates your license, and (b) permanently, if the copyright
|
| 694 |
+
holder fails to notify you of the violation by some reasonable means
|
| 695 |
+
prior to 60 days after the cessation.
|
| 696 |
+
|
| 697 |
+
Moreover, your license from a particular copyright holder is
|
| 698 |
+
reinstated permanently if the copyright holder notifies you of the
|
| 699 |
+
violation by some reasonable means, this is the first time you have
|
| 700 |
+
received notice of violation of this License (for any work) from that
|
| 701 |
+
copyright holder, and you cure the violation prior to 30 days after
|
| 702 |
+
your receipt of the notice.
|
| 703 |
+
|
| 704 |
+
Termination of your rights under this section does not terminate the
|
| 705 |
+
licenses of parties who have received copies or rights from you under
|
| 706 |
+
this License. If your rights have been terminated and not permanently
|
| 707 |
+
reinstated, you do not qualify to receive new licenses for the same
|
| 708 |
+
material under section 10.
|
| 709 |
+
|
| 710 |
+
9. Acceptance Not Required for Having Copies.
|
| 711 |
+
|
| 712 |
+
You are not required to accept this License in order to receive or
|
| 713 |
+
run a copy of the Program. Ancillary propagation of a covered work
|
| 714 |
+
occurring solely as a consequence of using peer-to-peer transmission
|
| 715 |
+
to receive a copy likewise does not require acceptance. However,
|
| 716 |
+
nothing other than this License grants you permission to propagate or
|
| 717 |
+
modify any covered work. These actions infringe copyright if you do
|
| 718 |
+
not accept this License. Therefore, by modifying or propagating a
|
| 719 |
+
covered work, you indicate your acceptance of this License to do so.
|
| 720 |
+
|
| 721 |
+
10. Automatic Licensing of Downstream Recipients.
|
| 722 |
+
|
| 723 |
+
Each time you convey a covered work, the recipient automatically
|
| 724 |
+
receives a license from the original licensors, to run, modify and
|
| 725 |
+
propagate that work, subject to this License. You are not responsible
|
| 726 |
+
for enforcing compliance by third parties with this License.
|
| 727 |
+
|
| 728 |
+
An "entity transaction" is a transaction transferring control of an
|
| 729 |
+
organization, or substantially all assets of one, or subdividing an
|
| 730 |
+
organization, or merging organizations. If propagation of a covered
|
| 731 |
+
work results from an entity transaction, each party to that
|
| 732 |
+
transaction who receives a copy of the work also receives whatever
|
| 733 |
+
licenses to the work the party's predecessor in interest had or could
|
| 734 |
+
give under the previous paragraph, plus a right to possession of the
|
| 735 |
+
Corresponding Source of the work from the predecessor in interest, if
|
| 736 |
+
the predecessor has it or can get it with reasonable efforts.
|
| 737 |
+
|
| 738 |
+
You may not impose any further restrictions on the exercise of the
|
| 739 |
+
rights granted or affirmed under this License. For example, you may
|
| 740 |
+
not impose a license fee, royalty, or other charge for exercise of
|
| 741 |
+
rights granted under this License, and you may not initiate litigation
|
| 742 |
+
(including a cross-claim or counterclaim in a lawsuit) alleging that
|
| 743 |
+
any patent claim is infringed by making, using, selling, offering for
|
| 744 |
+
sale, or importing the Program or any portion of it.
|
| 745 |
+
|
| 746 |
+
11. Patents.
|
| 747 |
+
|
| 748 |
+
A "contributor" is a copyright holder who authorizes use under this
|
| 749 |
+
License of the Program or a work on which the Program is based. The
|
| 750 |
+
work thus licensed is called the contributor's "contributor version".
|
| 751 |
+
|
| 752 |
+
A contributor's "essential patent claims" are all patent claims
|
| 753 |
+
owned or controlled by the contributor, whether already acquired or
|
| 754 |
+
hereafter acquired, that would be infringed by some manner, permitted
|
| 755 |
+
by this License, of making, using, or selling its contributor version,
|
| 756 |
+
but do not include claims that would be infringed only as a
|
| 757 |
+
consequence of further modification of the contributor version. For
|
| 758 |
+
purposes of this definition, "control" includes the right to grant
|
| 759 |
+
patent sublicenses in a manner consistent with the requirements of
|
| 760 |
+
this License.
|
| 761 |
+
|
| 762 |
+
Each contributor grants you a non-exclusive, worldwide, royalty-free
|
| 763 |
+
patent license under the contributor's essential patent claims, to
|
| 764 |
+
make, use, sell, offer for sale, import and otherwise run, modify and
|
| 765 |
+
propagate the contents of its contributor version.
|
| 766 |
+
|
| 767 |
+
In the following three paragraphs, a "patent license" is any express
|
| 768 |
+
agreement or commitment, however denominated, not to enforce a patent
|
| 769 |
+
(such as an express permission to practice a patent or covenant not to
|
| 770 |
+
sue for patent infringement). To "grant" such a patent license to a
|
| 771 |
+
party means to make such an agreement or commitment not to enforce a
|
| 772 |
+
patent against the party.
|
| 773 |
+
|
| 774 |
+
If you convey a covered work, knowingly relying on a patent license,
|
| 775 |
+
and the Corresponding Source of the work is not available for anyone
|
| 776 |
+
to copy, free of charge and under the terms of this License, through a
|
| 777 |
+
publicly available network server or other readily accessible means,
|
| 778 |
+
then you must either (1) cause the Corresponding Source to be so
|
| 779 |
+
available, or (2) arrange to deprive yourself of the benefit of the
|
| 780 |
+
patent license for this particular work, or (3) arrange, in a manner
|
| 781 |
+
consistent with the requirements of this License, to extend the patent
|
| 782 |
+
license to downstream recipients. "Knowingly relying" means you have
|
| 783 |
+
actual knowledge that, but for the patent license, your conveying the
|
| 784 |
+
covered work in a country, or your recipient's use of the covered work
|
| 785 |
+
in a country, would infringe one or more identifiable patents in that
|
| 786 |
+
country that you have reason to believe are valid.
|
| 787 |
+
|
| 788 |
+
If, pursuant to or in connection with a single transaction or
|
| 789 |
+
arrangement, you convey, or propagate by procuring conveyance of, a
|
| 790 |
+
covered work, and grant a patent license to some of the parties
|
| 791 |
+
receiving the covered work authorizing them to use, propagate, modify
|
| 792 |
+
or convey a specific copy of the covered work, then the patent license
|
| 793 |
+
you grant is automatically extended to all recipients of the covered
|
| 794 |
+
work and works based on it.
|
| 795 |
+
|
| 796 |
+
A patent license is "discriminatory" if it does not include within
|
| 797 |
+
the scope of its coverage, prohibits the exercise of, or is
|
| 798 |
+
conditioned on the non-exercise of one or more of the rights that are
|
| 799 |
+
specifically granted under this License. You may not convey a covered
|
| 800 |
+
work if you are a party to an arrangement with a third party that is
|
| 801 |
+
in the business of distributing software, under which you make payment
|
| 802 |
+
to the third party based on the extent of your activity of conveying
|
| 803 |
+
the work, and under which the third party grants, to any of the
|
| 804 |
+
parties who would receive the covered work from you, a discriminatory
|
| 805 |
+
patent license (a) in connection with copies of the covered work
|
| 806 |
+
conveyed by you (or copies made from those copies), or (b) primarily
|
| 807 |
+
for and in connection with specific products or compilations that
|
| 808 |
+
contain the covered work, unless you entered into that arrangement,
|
| 809 |
+
or that patent license was granted, prior to 28 March 2007.
|
| 810 |
+
|
| 811 |
+
Nothing in this License shall be construed as excluding or limiting
|
| 812 |
+
any implied license or other defenses to infringement that may
|
| 813 |
+
otherwise be available to you under applicable patent law.
|
| 814 |
+
|
| 815 |
+
12. No Surrender of Others' Freedom.
|
| 816 |
+
|
| 817 |
+
If conditions are imposed on you (whether by court order, agreement or
|
| 818 |
+
otherwise) that contradict the conditions of this License, they do not
|
| 819 |
+
excuse you from the conditions of this License. If you cannot convey a
|
| 820 |
+
covered work so as to satisfy simultaneously your obligations under this
|
| 821 |
+
License and any other pertinent obligations, then as a consequence you may
|
| 822 |
+
not convey it at all. For example, if you agree to terms that obligate you
|
| 823 |
+
to collect a royalty for further conveying from those to whom you convey
|
| 824 |
+
the Program, the only way you could satisfy both those terms and this
|
| 825 |
+
License would be to refrain entirely from conveying the Program.
|
| 826 |
+
|
| 827 |
+
13. Use with the GNU Affero General Public License.
|
| 828 |
+
|
| 829 |
+
Notwithstanding any other provision of this License, you have
|
| 830 |
+
permission to link or combine any covered work with a work licensed
|
| 831 |
+
under version 3 of the GNU Affero General Public License into a single
|
| 832 |
+
combined work, and to convey the resulting work. The terms of this
|
| 833 |
+
License will continue to apply to the part which is the covered work,
|
| 834 |
+
but the special requirements of the GNU Affero General Public License,
|
| 835 |
+
section 13, concerning interaction through a network will apply to the
|
| 836 |
+
combination as such.
|
| 837 |
+
|
| 838 |
+
14. Revised Versions of this License.
|
| 839 |
+
|
| 840 |
+
The Free Software Foundation may publish revised and/or new versions of
|
| 841 |
+
the GNU General Public License from time to time. Such new versions will
|
| 842 |
+
be similar in spirit to the present version, but may differ in detail to
|
| 843 |
+
address new problems or concerns.
|
| 844 |
+
|
| 845 |
+
Each version is given a distinguishing version number. If the
|
| 846 |
+
Program specifies that a certain numbered version of the GNU General
|
| 847 |
+
Public License "or any later version" applies to it, you have the
|
| 848 |
+
option of following the terms and conditions either of that numbered
|
| 849 |
+
version or of any later version published by the Free Software
|
| 850 |
+
Foundation. If the Program does not specify a version number of the
|
| 851 |
+
GNU General Public License, you may choose any version ever published
|
| 852 |
+
by the Free Software Foundation.
|
| 853 |
+
|
| 854 |
+
If the Program specifies that a proxy can decide which future
|
| 855 |
+
versions of the GNU General Public License can be used, that proxy's
|
| 856 |
+
public statement of acceptance of a version permanently authorizes you
|
| 857 |
+
to choose that version for the Program.
|
| 858 |
+
|
| 859 |
+
Later license versions may give you additional or different
|
| 860 |
+
permissions. However, no additional obligations are imposed on any
|
| 861 |
+
author or copyright holder as a result of your choosing to follow a
|
| 862 |
+
later version.
|
| 863 |
+
|
| 864 |
+
15. Disclaimer of Warranty.
|
| 865 |
+
|
| 866 |
+
THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY
|
| 867 |
+
APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT
|
| 868 |
+
HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY
|
| 869 |
+
OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO,
|
| 870 |
+
THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
| 871 |
+
PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM
|
| 872 |
+
IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF
|
| 873 |
+
ALL NECESSARY SERVICING, REPAIR OR CORRECTION.
|
| 874 |
+
|
| 875 |
+
16. Limitation of Liability.
|
| 876 |
+
|
| 877 |
+
IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
|
| 878 |
+
WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS
|
| 879 |
+
THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY
|
| 880 |
+
GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE
|
| 881 |
+
USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF
|
| 882 |
+
DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD
|
| 883 |
+
PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS),
|
| 884 |
+
EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF
|
| 885 |
+
SUCH DAMAGES.
|
| 886 |
+
|
| 887 |
+
17. Interpretation of Sections 15 and 16.
|
| 888 |
+
|
| 889 |
+
If the disclaimer of warranty and limitation of liability provided
|
| 890 |
+
above cannot be given local legal effect according to their terms,
|
| 891 |
+
reviewing courts shall apply local law that most closely approximates
|
| 892 |
+
an absolute waiver of all civil liability in connection with the
|
| 893 |
+
Program, unless a warranty or assumption of liability accompanies a
|
| 894 |
+
copy of the Program in return for a fee.
|
| 895 |
+
|
| 896 |
+
END OF TERMS AND CONDITIONS
|
| 897 |
+
|
| 898 |
+
How to Apply These Terms to Your New Programs
|
| 899 |
+
|
| 900 |
+
If you develop a new program, and you want it to be of the greatest
|
| 901 |
+
possible use to the public, the best way to achieve this is to make it
|
| 902 |
+
free software which everyone can redistribute and change under these terms.
|
| 903 |
+
|
| 904 |
+
To do so, attach the following notices to the program. It is safest
|
| 905 |
+
to attach them to the start of each source file to most effectively
|
| 906 |
+
state the exclusion of warranty; and each file should have at least
|
| 907 |
+
the "copyright" line and a pointer to where the full notice is found.
|
| 908 |
+
|
| 909 |
+
<one line to give the program's name and a brief idea of what it does.>
|
| 910 |
+
Copyright (C) <year> <name of author>
|
| 911 |
+
|
| 912 |
+
This program is free software: you can redistribute it and/or modify
|
| 913 |
+
it under the terms of the GNU General Public License as published by
|
| 914 |
+
the Free Software Foundation, either version 3 of the License, or
|
| 915 |
+
(at your option) any later version.
|
| 916 |
+
|
| 917 |
+
This program is distributed in the hope that it will be useful,
|
| 918 |
+
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
| 919 |
+
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
| 920 |
+
GNU General Public License for more details.
|
| 921 |
+
|
| 922 |
+
You should have received a copy of the GNU General Public License
|
| 923 |
+
along with this program. If not, see <http://www.gnu.org/licenses/>.
|
| 924 |
+
|
| 925 |
+
Also add information on how to contact you by electronic and paper mail.
|
| 926 |
+
|
| 927 |
+
If the program does terminal interaction, make it output a short
|
| 928 |
+
notice like this when it starts in an interactive mode:
|
| 929 |
+
|
| 930 |
+
<program> Copyright (C) <year> <name of author>
|
| 931 |
+
This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
|
| 932 |
+
This is free software, and you are welcome to redistribute it
|
| 933 |
+
under certain conditions; type `show c' for details.
|
| 934 |
+
|
| 935 |
+
The hypothetical commands `show w' and `show c' should show the appropriate
|
| 936 |
+
parts of the General Public License. Of course, your program's commands
|
| 937 |
+
might be different; for a GUI interface, you would use an "about box".
|
| 938 |
+
|
| 939 |
+
You should also get your employer (if you work as a programmer) or school,
|
| 940 |
+
if any, to sign a "copyright disclaimer" for the program, if necessary.
|
| 941 |
+
For more information on this, and how to apply and follow the GNU GPL, see
|
| 942 |
+
<http://www.gnu.org/licenses/>.
|
| 943 |
+
|
| 944 |
+
The GNU General Public License does not permit incorporating your program
|
| 945 |
+
into proprietary programs. If your program is a subroutine library, you
|
| 946 |
+
may consider it more useful to permit linking proprietary applications with
|
| 947 |
+
the library. If this is what you want to do, use the GNU Lesser General
|
| 948 |
+
Public License instead of this License. But first, please read
|
| 949 |
+
<http://www.gnu.org/philosophy/why-not-lgpl.html>.
|
| 950 |
+
|
| 951 |
+
Name: libquadmath
|
| 952 |
+
Files: numpy.libs/libquadmath*.so
|
| 953 |
+
Description: dynamically linked to files compiled with gcc
|
| 954 |
+
Availability: https://gcc.gnu.org/git/?p=gcc.git;a=tree;f=libquadmath
|
| 955 |
+
License: LGPL-2.1-or-later
|
| 956 |
+
|
| 957 |
+
GCC Quad-Precision Math Library
|
| 958 |
+
Copyright (C) 2010-2019 Free Software Foundation, Inc.
|
| 959 |
+
Written by Francois-Xavier Coudert <fxcoudert@gcc.gnu.org>
|
| 960 |
+
|
| 961 |
+
This file is part of the libquadmath library.
|
| 962 |
+
Libquadmath is free software; you can redistribute it and/or
|
| 963 |
+
modify it under the terms of the GNU Library General Public
|
| 964 |
+
License as published by the Free Software Foundation; either
|
| 965 |
+
version 2.1 of the License, or (at your option) any later version.
|
| 966 |
+
|
| 967 |
+
Libquadmath is distributed in the hope that it will be useful,
|
| 968 |
+
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
| 969 |
+
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
| 970 |
+
Lesser General Public License for more details.
|
| 971 |
+
https://www.gnu.org/licenses/old-licenses/lgpl-2.1.html
|
LTA_openwebtext_dualt/mini_owt_logdirichlet/.venv_qwen35_uv/lib/python3.12/site-packages/numpy-1.26.4.dist-info/METADATA
ADDED
|
@@ -0,0 +1,1092 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
Metadata-Version: 2.1
|
| 2 |
+
Name: numpy
|
| 3 |
+
Version: 1.26.4
|
| 4 |
+
Summary: Fundamental package for array computing in Python
|
| 5 |
+
Home-page: https://numpy.org
|
| 6 |
+
Author: Travis E. Oliphant et al.
|
| 7 |
+
Maintainer-Email: NumPy Developers <numpy-discussion@python.org>
|
| 8 |
+
License: Copyright (c) 2005-2023, NumPy Developers.
|
| 9 |
+
All rights reserved.
|
| 10 |
+
|
| 11 |
+
Redistribution and use in source and binary forms, with or without
|
| 12 |
+
modification, are permitted provided that the following conditions are
|
| 13 |
+
met:
|
| 14 |
+
|
| 15 |
+
* Redistributions of source code must retain the above copyright
|
| 16 |
+
notice, this list of conditions and the following disclaimer.
|
| 17 |
+
|
| 18 |
+
* Redistributions in binary form must reproduce the above
|
| 19 |
+
copyright notice, this list of conditions and the following
|
| 20 |
+
disclaimer in the documentation and/or other materials provided
|
| 21 |
+
with the distribution.
|
| 22 |
+
|
| 23 |
+
* Neither the name of the NumPy Developers nor the names of any
|
| 24 |
+
contributors may be used to endorse or promote products derived
|
| 25 |
+
from this software without specific prior written permission.
|
| 26 |
+
|
| 27 |
+
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
| 28 |
+
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
| 29 |
+
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
| 30 |
+
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
| 31 |
+
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
| 32 |
+
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
| 33 |
+
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
| 34 |
+
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
| 35 |
+
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
| 36 |
+
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
| 37 |
+
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
| 38 |
+
|
| 39 |
+
----
|
| 40 |
+
|
| 41 |
+
The NumPy repository and source distributions bundle several libraries that are
|
| 42 |
+
compatibly licensed. We list these here.
|
| 43 |
+
|
| 44 |
+
Name: lapack-lite
|
| 45 |
+
Files: numpy/linalg/lapack_lite/*
|
| 46 |
+
License: BSD-3-Clause
|
| 47 |
+
For details, see numpy/linalg/lapack_lite/LICENSE.txt
|
| 48 |
+
|
| 49 |
+
Name: tempita
|
| 50 |
+
Files: tools/npy_tempita/*
|
| 51 |
+
License: MIT
|
| 52 |
+
For details, see tools/npy_tempita/license.txt
|
| 53 |
+
|
| 54 |
+
Name: dragon4
|
| 55 |
+
Files: numpy/core/src/multiarray/dragon4.c
|
| 56 |
+
License: MIT
|
| 57 |
+
For license text, see numpy/core/src/multiarray/dragon4.c
|
| 58 |
+
|
| 59 |
+
Name: libdivide
|
| 60 |
+
Files: numpy/core/include/numpy/libdivide/*
|
| 61 |
+
License: Zlib
|
| 62 |
+
For license text, see numpy/core/include/numpy/libdivide/LICENSE.txt
|
| 63 |
+
|
| 64 |
+
|
| 65 |
+
Note that the following files are vendored in the repository and sdist but not
|
| 66 |
+
installed in built numpy packages:
|
| 67 |
+
|
| 68 |
+
Name: Meson
|
| 69 |
+
Files: vendored-meson/meson/*
|
| 70 |
+
License: Apache 2.0
|
| 71 |
+
For license text, see vendored-meson/meson/COPYING
|
| 72 |
+
|
| 73 |
+
Name: spin
|
| 74 |
+
Files: .spin/cmds.py
|
| 75 |
+
License: BSD-3
|
| 76 |
+
For license text, see .spin/LICENSE
|
| 77 |
+
|
| 78 |
+
----
|
| 79 |
+
|
| 80 |
+
This binary distribution of NumPy also bundles the following software:
|
| 81 |
+
|
| 82 |
+
|
| 83 |
+
Name: OpenBLAS
|
| 84 |
+
Files: numpy.libs/libopenblas*.so
|
| 85 |
+
Description: bundled as a dynamically linked library
|
| 86 |
+
Availability: https://github.com/OpenMathLib/OpenBLAS/
|
| 87 |
+
License: BSD-3-Clause
|
| 88 |
+
Copyright (c) 2011-2014, The OpenBLAS Project
|
| 89 |
+
All rights reserved.
|
| 90 |
+
|
| 91 |
+
Redistribution and use in source and binary forms, with or without
|
| 92 |
+
modification, are permitted provided that the following conditions are
|
| 93 |
+
met:
|
| 94 |
+
|
| 95 |
+
1. Redistributions of source code must retain the above copyright
|
| 96 |
+
notice, this list of conditions and the following disclaimer.
|
| 97 |
+
|
| 98 |
+
2. Redistributions in binary form must reproduce the above copyright
|
| 99 |
+
notice, this list of conditions and the following disclaimer in
|
| 100 |
+
the documentation and/or other materials provided with the
|
| 101 |
+
distribution.
|
| 102 |
+
3. Neither the name of the OpenBLAS project nor the names of
|
| 103 |
+
its contributors may be used to endorse or promote products
|
| 104 |
+
derived from this software without specific prior written
|
| 105 |
+
permission.
|
| 106 |
+
|
| 107 |
+
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
| 108 |
+
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
| 109 |
+
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
| 110 |
+
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
| 111 |
+
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
| 112 |
+
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
| 113 |
+
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
| 114 |
+
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
| 115 |
+
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
| 116 |
+
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
| 117 |
+
|
| 118 |
+
|
| 119 |
+
Name: LAPACK
|
| 120 |
+
Files: numpy.libs/libopenblas*.so
|
| 121 |
+
Description: bundled in OpenBLAS
|
| 122 |
+
Availability: https://github.com/OpenMathLib/OpenBLAS/
|
| 123 |
+
License: BSD-3-Clause-Attribution
|
| 124 |
+
Copyright (c) 1992-2013 The University of Tennessee and The University
|
| 125 |
+
of Tennessee Research Foundation. All rights
|
| 126 |
+
reserved.
|
| 127 |
+
Copyright (c) 2000-2013 The University of California Berkeley. All
|
| 128 |
+
rights reserved.
|
| 129 |
+
Copyright (c) 2006-2013 The University of Colorado Denver. All rights
|
| 130 |
+
reserved.
|
| 131 |
+
|
| 132 |
+
$COPYRIGHT$
|
| 133 |
+
|
| 134 |
+
Additional copyrights may follow
|
| 135 |
+
|
| 136 |
+
$HEADER$
|
| 137 |
+
|
| 138 |
+
Redistribution and use in source and binary forms, with or without
|
| 139 |
+
modification, are permitted provided that the following conditions are
|
| 140 |
+
met:
|
| 141 |
+
|
| 142 |
+
- Redistributions of source code must retain the above copyright
|
| 143 |
+
notice, this list of conditions and the following disclaimer.
|
| 144 |
+
|
| 145 |
+
- Redistributions in binary form must reproduce the above copyright
|
| 146 |
+
notice, this list of conditions and the following disclaimer listed
|
| 147 |
+
in this license in the documentation and/or other materials
|
| 148 |
+
provided with the distribution.
|
| 149 |
+
|
| 150 |
+
- Neither the name of the copyright holders nor the names of its
|
| 151 |
+
contributors may be used to endorse or promote products derived from
|
| 152 |
+
this software without specific prior written permission.
|
| 153 |
+
|
| 154 |
+
The copyright holders provide no reassurances that the source code
|
| 155 |
+
provided does not infringe any patent, copyright, or any other
|
| 156 |
+
intellectual property rights of third parties. The copyright holders
|
| 157 |
+
disclaim any liability to any recipient for claims brought against
|
| 158 |
+
recipient by any third party for infringement of that parties
|
| 159 |
+
intellectual property rights.
|
| 160 |
+
|
| 161 |
+
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
| 162 |
+
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
| 163 |
+
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
| 164 |
+
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
| 165 |
+
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
| 166 |
+
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
| 167 |
+
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
| 168 |
+
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
| 169 |
+
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
| 170 |
+
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
| 171 |
+
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
| 172 |
+
|
| 173 |
+
|
| 174 |
+
Name: GCC runtime library
|
| 175 |
+
Files: numpy.libs/libgfortran*.so
|
| 176 |
+
Description: dynamically linked to files compiled with gcc
|
| 177 |
+
Availability: https://gcc.gnu.org/git/?p=gcc.git;a=tree;f=libgfortran
|
| 178 |
+
License: GPL-3.0-with-GCC-exception
|
| 179 |
+
Copyright (C) 2002-2017 Free Software Foundation, Inc.
|
| 180 |
+
|
| 181 |
+
Libgfortran is free software; you can redistribute it and/or modify
|
| 182 |
+
it under the terms of the GNU General Public License as published by
|
| 183 |
+
the Free Software Foundation; either version 3, or (at your option)
|
| 184 |
+
any later version.
|
| 185 |
+
|
| 186 |
+
Libgfortran is distributed in the hope that it will be useful,
|
| 187 |
+
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
| 188 |
+
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
| 189 |
+
GNU General Public License for more details.
|
| 190 |
+
|
| 191 |
+
Under Section 7 of GPL version 3, you are granted additional
|
| 192 |
+
permissions described in the GCC Runtime Library Exception, version
|
| 193 |
+
3.1, as published by the Free Software Foundation.
|
| 194 |
+
|
| 195 |
+
You should have received a copy of the GNU General Public License and
|
| 196 |
+
a copy of the GCC Runtime Library Exception along with this program;
|
| 197 |
+
see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
|
| 198 |
+
<http://www.gnu.org/licenses/>.
|
| 199 |
+
|
| 200 |
+
----
|
| 201 |
+
|
| 202 |
+
Full text of license texts referred to above follows (that they are
|
| 203 |
+
listed below does not necessarily imply the conditions apply to the
|
| 204 |
+
present binary release):
|
| 205 |
+
|
| 206 |
+
----
|
| 207 |
+
|
| 208 |
+
GCC RUNTIME LIBRARY EXCEPTION
|
| 209 |
+
|
| 210 |
+
Version 3.1, 31 March 2009
|
| 211 |
+
|
| 212 |
+
Copyright (C) 2009 Free Software Foundation, Inc. <http://fsf.org/>
|
| 213 |
+
|
| 214 |
+
Everyone is permitted to copy and distribute verbatim copies of this
|
| 215 |
+
license document, but changing it is not allowed.
|
| 216 |
+
|
| 217 |
+
This GCC Runtime Library Exception ("Exception") is an additional
|
| 218 |
+
permission under section 7 of the GNU General Public License, version
|
| 219 |
+
3 ("GPLv3"). It applies to a given file (the "Runtime Library") that
|
| 220 |
+
bears a notice placed by the copyright holder of the file stating that
|
| 221 |
+
the file is governed by GPLv3 along with this Exception.
|
| 222 |
+
|
| 223 |
+
When you use GCC to compile a program, GCC may combine portions of
|
| 224 |
+
certain GCC header files and runtime libraries with the compiled
|
| 225 |
+
program. The purpose of this Exception is to allow compilation of
|
| 226 |
+
non-GPL (including proprietary) programs to use, in this way, the
|
| 227 |
+
header files and runtime libraries covered by this Exception.
|
| 228 |
+
|
| 229 |
+
0. Definitions.
|
| 230 |
+
|
| 231 |
+
A file is an "Independent Module" if it either requires the Runtime
|
| 232 |
+
Library for execution after a Compilation Process, or makes use of an
|
| 233 |
+
interface provided by the Runtime Library, but is not otherwise based
|
| 234 |
+
on the Runtime Library.
|
| 235 |
+
|
| 236 |
+
"GCC" means a version of the GNU Compiler Collection, with or without
|
| 237 |
+
modifications, governed by version 3 (or a specified later version) of
|
| 238 |
+
the GNU General Public License (GPL) with the option of using any
|
| 239 |
+
subsequent versions published by the FSF.
|
| 240 |
+
|
| 241 |
+
"GPL-compatible Software" is software whose conditions of propagation,
|
| 242 |
+
modification and use would permit combination with GCC in accord with
|
| 243 |
+
the license of GCC.
|
| 244 |
+
|
| 245 |
+
"Target Code" refers to output from any compiler for a real or virtual
|
| 246 |
+
target processor architecture, in executable form or suitable for
|
| 247 |
+
input to an assembler, loader, linker and/or execution
|
| 248 |
+
phase. Notwithstanding that, Target Code does not include data in any
|
| 249 |
+
format that is used as a compiler intermediate representation, or used
|
| 250 |
+
for producing a compiler intermediate representation.
|
| 251 |
+
|
| 252 |
+
The "Compilation Process" transforms code entirely represented in
|
| 253 |
+
non-intermediate languages designed for human-written code, and/or in
|
| 254 |
+
Java Virtual Machine byte code, into Target Code. Thus, for example,
|
| 255 |
+
use of source code generators and preprocessors need not be considered
|
| 256 |
+
part of the Compilation Process, since the Compilation Process can be
|
| 257 |
+
understood as starting with the output of the generators or
|
| 258 |
+
preprocessors.
|
| 259 |
+
|
| 260 |
+
A Compilation Process is "Eligible" if it is done using GCC, alone or
|
| 261 |
+
with other GPL-compatible software, or if it is done without using any
|
| 262 |
+
work based on GCC. For example, using non-GPL-compatible Software to
|
| 263 |
+
optimize any GCC intermediate representations would not qualify as an
|
| 264 |
+
Eligible Compilation Process.
|
| 265 |
+
|
| 266 |
+
1. Grant of Additional Permission.
|
| 267 |
+
|
| 268 |
+
You have permission to propagate a work of Target Code formed by
|
| 269 |
+
combining the Runtime Library with Independent Modules, even if such
|
| 270 |
+
propagation would otherwise violate the terms of GPLv3, provided that
|
| 271 |
+
all Target Code was generated by Eligible Compilation Processes. You
|
| 272 |
+
may then convey such a combination under terms of your choice,
|
| 273 |
+
consistent with the licensing of the Independent Modules.
|
| 274 |
+
|
| 275 |
+
2. No Weakening of GCC Copyleft.
|
| 276 |
+
|
| 277 |
+
The availability of this Exception does not imply any general
|
| 278 |
+
presumption that third-party software is unaffected by the copyleft
|
| 279 |
+
requirements of the license of GCC.
|
| 280 |
+
|
| 281 |
+
----
|
| 282 |
+
|
| 283 |
+
GNU GENERAL PUBLIC LICENSE
|
| 284 |
+
Version 3, 29 June 2007
|
| 285 |
+
|
| 286 |
+
Copyright (C) 2007 Free Software Foundation, Inc. <http://fsf.org/>
|
| 287 |
+
Everyone is permitted to copy and distribute verbatim copies
|
| 288 |
+
of this license document, but changing it is not allowed.
|
| 289 |
+
|
| 290 |
+
Preamble
|
| 291 |
+
|
| 292 |
+
The GNU General Public License is a free, copyleft license for
|
| 293 |
+
software and other kinds of works.
|
| 294 |
+
|
| 295 |
+
The licenses for most software and other practical works are designed
|
| 296 |
+
to take away your freedom to share and change the works. By contrast,
|
| 297 |
+
the GNU General Public License is intended to guarantee your freedom to
|
| 298 |
+
share and change all versions of a program--to make sure it remains free
|
| 299 |
+
software for all its users. We, the Free Software Foundation, use the
|
| 300 |
+
GNU General Public License for most of our software; it applies also to
|
| 301 |
+
any other work released this way by its authors. You can apply it to
|
| 302 |
+
your programs, too.
|
| 303 |
+
|
| 304 |
+
When we speak of free software, we are referring to freedom, not
|
| 305 |
+
price. Our General Public Licenses are designed to make sure that you
|
| 306 |
+
have the freedom to distribute copies of free software (and charge for
|
| 307 |
+
them if you wish), that you receive source code or can get it if you
|
| 308 |
+
want it, that you can change the software or use pieces of it in new
|
| 309 |
+
free programs, and that you know you can do these things.
|
| 310 |
+
|
| 311 |
+
To protect your rights, we need to prevent others from denying you
|
| 312 |
+
these rights or asking you to surrender the rights. Therefore, you have
|
| 313 |
+
certain responsibilities if you distribute copies of the software, or if
|
| 314 |
+
you modify it: responsibilities to respect the freedom of others.
|
| 315 |
+
|
| 316 |
+
For example, if you distribute copies of such a program, whether
|
| 317 |
+
gratis or for a fee, you must pass on to the recipients the same
|
| 318 |
+
freedoms that you received. You must make sure that they, too, receive
|
| 319 |
+
or can get the source code. And you must show them these terms so they
|
| 320 |
+
know their rights.
|
| 321 |
+
|
| 322 |
+
Developers that use the GNU GPL protect your rights with two steps:
|
| 323 |
+
(1) assert copyright on the software, and (2) offer you this License
|
| 324 |
+
giving you legal permission to copy, distribute and/or modify it.
|
| 325 |
+
|
| 326 |
+
For the developers' and authors' protection, the GPL clearly explains
|
| 327 |
+
that there is no warranty for this free software. For both users' and
|
| 328 |
+
authors' sake, the GPL requires that modified versions be marked as
|
| 329 |
+
changed, so that their problems will not be attributed erroneously to
|
| 330 |
+
authors of previous versions.
|
| 331 |
+
|
| 332 |
+
Some devices are designed to deny users access to install or run
|
| 333 |
+
modified versions of the software inside them, although the manufacturer
|
| 334 |
+
can do so. This is fundamentally incompatible with the aim of
|
| 335 |
+
protecting users' freedom to change the software. The systematic
|
| 336 |
+
pattern of such abuse occurs in the area of products for individuals to
|
| 337 |
+
use, which is precisely where it is most unacceptable. Therefore, we
|
| 338 |
+
have designed this version of the GPL to prohibit the practice for those
|
| 339 |
+
products. If such problems arise substantially in other domains, we
|
| 340 |
+
stand ready to extend this provision to those domains in future versions
|
| 341 |
+
of the GPL, as needed to protect the freedom of users.
|
| 342 |
+
|
| 343 |
+
Finally, every program is threatened constantly by software patents.
|
| 344 |
+
States should not allow patents to restrict development and use of
|
| 345 |
+
software on general-purpose computers, but in those that do, we wish to
|
| 346 |
+
avoid the special danger that patents applied to a free program could
|
| 347 |
+
make it effectively proprietary. To prevent this, the GPL assures that
|
| 348 |
+
patents cannot be used to render the program non-free.
|
| 349 |
+
|
| 350 |
+
The precise terms and conditions for copying, distribution and
|
| 351 |
+
modification follow.
|
| 352 |
+
|
| 353 |
+
TERMS AND CONDITIONS
|
| 354 |
+
|
| 355 |
+
0. Definitions.
|
| 356 |
+
|
| 357 |
+
"This License" refers to version 3 of the GNU General Public License.
|
| 358 |
+
|
| 359 |
+
"Copyright" also means copyright-like laws that apply to other kinds of
|
| 360 |
+
works, such as semiconductor masks.
|
| 361 |
+
|
| 362 |
+
"The Program" refers to any copyrightable work licensed under this
|
| 363 |
+
License. Each licensee is addressed as "you". "Licensees" and
|
| 364 |
+
"recipients" may be individuals or organizations.
|
| 365 |
+
|
| 366 |
+
To "modify" a work means to copy from or adapt all or part of the work
|
| 367 |
+
in a fashion requiring copyright permission, other than the making of an
|
| 368 |
+
exact copy. The resulting work is called a "modified version" of the
|
| 369 |
+
earlier work or a work "based on" the earlier work.
|
| 370 |
+
|
| 371 |
+
A "covered work" means either the unmodified Program or a work based
|
| 372 |
+
on the Program.
|
| 373 |
+
|
| 374 |
+
To "propagate" a work means to do anything with it that, without
|
| 375 |
+
permission, would make you directly or secondarily liable for
|
| 376 |
+
infringement under applicable copyright law, except executing it on a
|
| 377 |
+
computer or modifying a private copy. Propagation includes copying,
|
| 378 |
+
distribution (with or without modification), making available to the
|
| 379 |
+
public, and in some countries other activities as well.
|
| 380 |
+
|
| 381 |
+
To "convey" a work means any kind of propagation that enables other
|
| 382 |
+
parties to make or receive copies. Mere interaction with a user through
|
| 383 |
+
a computer network, with no transfer of a copy, is not conveying.
|
| 384 |
+
|
| 385 |
+
An interactive user interface displays "Appropriate Legal Notices"
|
| 386 |
+
to the extent that it includes a convenient and prominently visible
|
| 387 |
+
feature that (1) displays an appropriate copyright notice, and (2)
|
| 388 |
+
tells the user that there is no warranty for the work (except to the
|
| 389 |
+
extent that warranties are provided), that licensees may convey the
|
| 390 |
+
work under this License, and how to view a copy of this License. If
|
| 391 |
+
the interface presents a list of user commands or options, such as a
|
| 392 |
+
menu, a prominent item in the list meets this criterion.
|
| 393 |
+
|
| 394 |
+
1. Source Code.
|
| 395 |
+
|
| 396 |
+
The "source code" for a work means the preferred form of the work
|
| 397 |
+
for making modifications to it. "Object code" means any non-source
|
| 398 |
+
form of a work.
|
| 399 |
+
|
| 400 |
+
A "Standard Interface" means an interface that either is an official
|
| 401 |
+
standard defined by a recognized standards body, or, in the case of
|
| 402 |
+
interfaces specified for a particular programming language, one that
|
| 403 |
+
is widely used among developers working in that language.
|
| 404 |
+
|
| 405 |
+
The "System Libraries" of an executable work include anything, other
|
| 406 |
+
than the work as a whole, that (a) is included in the normal form of
|
| 407 |
+
packaging a Major Component, but which is not part of that Major
|
| 408 |
+
Component, and (b) serves only to enable use of the work with that
|
| 409 |
+
Major Component, or to implement a Standard Interface for which an
|
| 410 |
+
implementation is available to the public in source code form. A
|
| 411 |
+
"Major Component", in this context, means a major essential component
|
| 412 |
+
(kernel, window system, and so on) of the specific operating system
|
| 413 |
+
(if any) on which the executable work runs, or a compiler used to
|
| 414 |
+
produce the work, or an object code interpreter used to run it.
|
| 415 |
+
|
| 416 |
+
The "Corresponding Source" for a work in object code form means all
|
| 417 |
+
the source code needed to generate, install, and (for an executable
|
| 418 |
+
work) run the object code and to modify the work, including scripts to
|
| 419 |
+
control those activities. However, it does not include the work's
|
| 420 |
+
System Libraries, or general-purpose tools or generally available free
|
| 421 |
+
programs which are used unmodified in performing those activities but
|
| 422 |
+
which are not part of the work. For example, Corresponding Source
|
| 423 |
+
includes interface definition files associated with source files for
|
| 424 |
+
the work, and the source code for shared libraries and dynamically
|
| 425 |
+
linked subprograms that the work is specifically designed to require,
|
| 426 |
+
such as by intimate data communication or control flow between those
|
| 427 |
+
subprograms and other parts of the work.
|
| 428 |
+
|
| 429 |
+
The Corresponding Source need not include anything that users
|
| 430 |
+
can regenerate automatically from other parts of the Corresponding
|
| 431 |
+
Source.
|
| 432 |
+
|
| 433 |
+
The Corresponding Source for a work in source code form is that
|
| 434 |
+
same work.
|
| 435 |
+
|
| 436 |
+
2. Basic Permissions.
|
| 437 |
+
|
| 438 |
+
All rights granted under this License are granted for the term of
|
| 439 |
+
copyright on the Program, and are irrevocable provided the stated
|
| 440 |
+
conditions are met. This License explicitly affirms your unlimited
|
| 441 |
+
permission to run the unmodified Program. The output from running a
|
| 442 |
+
covered work is covered by this License only if the output, given its
|
| 443 |
+
content, constitutes a covered work. This License acknowledges your
|
| 444 |
+
rights of fair use or other equivalent, as provided by copyright law.
|
| 445 |
+
|
| 446 |
+
You may make, run and propagate covered works that you do not
|
| 447 |
+
convey, without conditions so long as your license otherwise remains
|
| 448 |
+
in force. You may convey covered works to others for the sole purpose
|
| 449 |
+
of having them make modifications exclusively for you, or provide you
|
| 450 |
+
with facilities for running those works, provided that you comply with
|
| 451 |
+
the terms of this License in conveying all material for which you do
|
| 452 |
+
not control copyright. Those thus making or running the covered works
|
| 453 |
+
for you must do so exclusively on your behalf, under your direction
|
| 454 |
+
and control, on terms that prohibit them from making any copies of
|
| 455 |
+
your copyrighted material outside their relationship with you.
|
| 456 |
+
|
| 457 |
+
Conveying under any other circumstances is permitted solely under
|
| 458 |
+
the conditions stated below. Sublicensing is not allowed; section 10
|
| 459 |
+
makes it unnecessary.
|
| 460 |
+
|
| 461 |
+
3. Protecting Users' Legal Rights From Anti-Circumvention Law.
|
| 462 |
+
|
| 463 |
+
No covered work shall be deemed part of an effective technological
|
| 464 |
+
measure under any applicable law fulfilling obligations under article
|
| 465 |
+
11 of the WIPO copyright treaty adopted on 20 December 1996, or
|
| 466 |
+
similar laws prohibiting or restricting circumvention of such
|
| 467 |
+
measures.
|
| 468 |
+
|
| 469 |
+
When you convey a covered work, you waive any legal power to forbid
|
| 470 |
+
circumvention of technological measures to the extent such circumvention
|
| 471 |
+
is effected by exercising rights under this License with respect to
|
| 472 |
+
the covered work, and you disclaim any intention to limit operation or
|
| 473 |
+
modification of the work as a means of enforcing, against the work's
|
| 474 |
+
users, your or third parties' legal rights to forbid circumvention of
|
| 475 |
+
technological measures.
|
| 476 |
+
|
| 477 |
+
4. Conveying Verbatim Copies.
|
| 478 |
+
|
| 479 |
+
You may convey verbatim copies of the Program's source code as you
|
| 480 |
+
receive it, in any medium, provided that you conspicuously and
|
| 481 |
+
appropriately publish on each copy an appropriate copyright notice;
|
| 482 |
+
keep intact all notices stating that this License and any
|
| 483 |
+
non-permissive terms added in accord with section 7 apply to the code;
|
| 484 |
+
keep intact all notices of the absence of any warranty; and give all
|
| 485 |
+
recipients a copy of this License along with the Program.
|
| 486 |
+
|
| 487 |
+
You may charge any price or no price for each copy that you convey,
|
| 488 |
+
and you may offer support or warranty protection for a fee.
|
| 489 |
+
|
| 490 |
+
5. Conveying Modified Source Versions.
|
| 491 |
+
|
| 492 |
+
You may convey a work based on the Program, or the modifications to
|
| 493 |
+
produce it from the Program, in the form of source code under the
|
| 494 |
+
terms of section 4, provided that you also meet all of these conditions:
|
| 495 |
+
|
| 496 |
+
a) The work must carry prominent notices stating that you modified
|
| 497 |
+
it, and giving a relevant date.
|
| 498 |
+
|
| 499 |
+
b) The work must carry prominent notices stating that it is
|
| 500 |
+
released under this License and any conditions added under section
|
| 501 |
+
7. This requirement modifies the requirement in section 4 to
|
| 502 |
+
"keep intact all notices".
|
| 503 |
+
|
| 504 |
+
c) You must license the entire work, as a whole, under this
|
| 505 |
+
License to anyone who comes into possession of a copy. This
|
| 506 |
+
License will therefore apply, along with any applicable section 7
|
| 507 |
+
additional terms, to the whole of the work, and all its parts,
|
| 508 |
+
regardless of how they are packaged. This License gives no
|
| 509 |
+
permission to license the work in any other way, but it does not
|
| 510 |
+
invalidate such permission if you have separately received it.
|
| 511 |
+
|
| 512 |
+
d) If the work has interactive user interfaces, each must display
|
| 513 |
+
Appropriate Legal Notices; however, if the Program has interactive
|
| 514 |
+
interfaces that do not display Appropriate Legal Notices, your
|
| 515 |
+
work need not make them do so.
|
| 516 |
+
|
| 517 |
+
A compilation of a covered work with other separate and independent
|
| 518 |
+
works, which are not by their nature extensions of the covered work,
|
| 519 |
+
and which are not combined with it such as to form a larger program,
|
| 520 |
+
in or on a volume of a storage or distribution medium, is called an
|
| 521 |
+
"aggregate" if the compilation and its resulting copyright are not
|
| 522 |
+
used to limit the access or legal rights of the compilation's users
|
| 523 |
+
beyond what the individual works permit. Inclusion of a covered work
|
| 524 |
+
in an aggregate does not cause this License to apply to the other
|
| 525 |
+
parts of the aggregate.
|
| 526 |
+
|
| 527 |
+
6. Conveying Non-Source Forms.
|
| 528 |
+
|
| 529 |
+
You may convey a covered work in object code form under the terms
|
| 530 |
+
of sections 4 and 5, provided that you also convey the
|
| 531 |
+
machine-readable Corresponding Source under the terms of this License,
|
| 532 |
+
in one of these ways:
|
| 533 |
+
|
| 534 |
+
a) Convey the object code in, or embodied in, a physical product
|
| 535 |
+
(including a physical distribution medium), accompanied by the
|
| 536 |
+
Corresponding Source fixed on a durable physical medium
|
| 537 |
+
customarily used for software interchange.
|
| 538 |
+
|
| 539 |
+
b) Convey the object code in, or embodied in, a physical product
|
| 540 |
+
(including a physical distribution medium), accompanied by a
|
| 541 |
+
written offer, valid for at least three years and valid for as
|
| 542 |
+
long as you offer spare parts or customer support for that product
|
| 543 |
+
model, to give anyone who possesses the object code either (1) a
|
| 544 |
+
copy of the Corresponding Source for all the software in the
|
| 545 |
+
product that is covered by this License, on a durable physical
|
| 546 |
+
medium customarily used for software interchange, for a price no
|
| 547 |
+
more than your reasonable cost of physically performing this
|
| 548 |
+
conveying of source, or (2) access to copy the
|
| 549 |
+
Corresponding Source from a network server at no charge.
|
| 550 |
+
|
| 551 |
+
c) Convey individual copies of the object code with a copy of the
|
| 552 |
+
written offer to provide the Corresponding Source. This
|
| 553 |
+
alternative is allowed only occasionally and noncommercially, and
|
| 554 |
+
only if you received the object code with such an offer, in accord
|
| 555 |
+
with subsection 6b.
|
| 556 |
+
|
| 557 |
+
d) Convey the object code by offering access from a designated
|
| 558 |
+
place (gratis or for a charge), and offer equivalent access to the
|
| 559 |
+
Corresponding Source in the same way through the same place at no
|
| 560 |
+
further charge. You need not require recipients to copy the
|
| 561 |
+
Corresponding Source along with the object code. If the place to
|
| 562 |
+
copy the object code is a network server, the Corresponding Source
|
| 563 |
+
may be on a different server (operated by you or a third party)
|
| 564 |
+
that supports equivalent copying facilities, provided you maintain
|
| 565 |
+
clear directions next to the object code saying where to find the
|
| 566 |
+
Corresponding Source. Regardless of what server hosts the
|
| 567 |
+
Corresponding Source, you remain obligated to ensure that it is
|
| 568 |
+
available for as long as needed to satisfy these requirements.
|
| 569 |
+
|
| 570 |
+
e) Convey the object code using peer-to-peer transmission, provided
|
| 571 |
+
you inform other peers where the object code and Corresponding
|
| 572 |
+
Source of the work are being offered to the general public at no
|
| 573 |
+
charge under subsection 6d.
|
| 574 |
+
|
| 575 |
+
A separable portion of the object code, whose source code is excluded
|
| 576 |
+
from the Corresponding Source as a System Library, need not be
|
| 577 |
+
included in conveying the object code work.
|
| 578 |
+
|
| 579 |
+
A "User Product" is either (1) a "consumer product", which means any
|
| 580 |
+
tangible personal property which is normally used for personal, family,
|
| 581 |
+
or household purposes, or (2) anything designed or sold for incorporation
|
| 582 |
+
into a dwelling. In determining whether a product is a consumer product,
|
| 583 |
+
doubtful cases shall be resolved in favor of coverage. For a particular
|
| 584 |
+
product received by a particular user, "normally used" refers to a
|
| 585 |
+
typical or common use of that class of product, regardless of the status
|
| 586 |
+
of the particular user or of the way in which the particular user
|
| 587 |
+
actually uses, or expects or is expected to use, the product. A product
|
| 588 |
+
is a consumer product regardless of whether the product has substantial
|
| 589 |
+
commercial, industrial or non-consumer uses, unless such uses represent
|
| 590 |
+
the only significant mode of use of the product.
|
| 591 |
+
|
| 592 |
+
"Installation Information" for a User Product means any methods,
|
| 593 |
+
procedures, authorization keys, or other information required to install
|
| 594 |
+
and execute modified versions of a covered work in that User Product from
|
| 595 |
+
a modified version of its Corresponding Source. The information must
|
| 596 |
+
suffice to ensure that the continued functioning of the modified object
|
| 597 |
+
code is in no case prevented or interfered with solely because
|
| 598 |
+
modification has been made.
|
| 599 |
+
|
| 600 |
+
If you convey an object code work under this section in, or with, or
|
| 601 |
+
specifically for use in, a User Product, and the conveying occurs as
|
| 602 |
+
part of a transaction in which the right of possession and use of the
|
| 603 |
+
User Product is transferred to the recipient in perpetuity or for a
|
| 604 |
+
fixed term (regardless of how the transaction is characterized), the
|
| 605 |
+
Corresponding Source conveyed under this section must be accompanied
|
| 606 |
+
by the Installation Information. But this requirement does not apply
|
| 607 |
+
if neither you nor any third party retains the ability to install
|
| 608 |
+
modified object code on the User Product (for example, the work has
|
| 609 |
+
been installed in ROM).
|
| 610 |
+
|
| 611 |
+
The requirement to provide Installation Information does not include a
|
| 612 |
+
requirement to continue to provide support service, warranty, or updates
|
| 613 |
+
for a work that has been modified or installed by the recipient, or for
|
| 614 |
+
the User Product in which it has been modified or installed. Access to a
|
| 615 |
+
network may be denied when the modification itself materially and
|
| 616 |
+
adversely affects the operation of the network or violates the rules and
|
| 617 |
+
protocols for communication across the network.
|
| 618 |
+
|
| 619 |
+
Corresponding Source conveyed, and Installation Information provided,
|
| 620 |
+
in accord with this section must be in a format that is publicly
|
| 621 |
+
documented (and with an implementation available to the public in
|
| 622 |
+
source code form), and must require no special password or key for
|
| 623 |
+
unpacking, reading or copying.
|
| 624 |
+
|
| 625 |
+
7. Additional Terms.
|
| 626 |
+
|
| 627 |
+
"Additional permissions" are terms that supplement the terms of this
|
| 628 |
+
License by making exceptions from one or more of its conditions.
|
| 629 |
+
Additional permissions that are applicable to the entire Program shall
|
| 630 |
+
be treated as though they were included in this License, to the extent
|
| 631 |
+
that they are valid under applicable law. If additional permissions
|
| 632 |
+
apply only to part of the Program, that part may be used separately
|
| 633 |
+
under those permissions, but the entire Program remains governed by
|
| 634 |
+
this License without regard to the additional permissions.
|
| 635 |
+
|
| 636 |
+
When you convey a copy of a covered work, you may at your option
|
| 637 |
+
remove any additional permissions from that copy, or from any part of
|
| 638 |
+
it. (Additional permissions may be written to require their own
|
| 639 |
+
removal in certain cases when you modify the work.) You may place
|
| 640 |
+
additional permissions on material, added by you to a covered work,
|
| 641 |
+
for which you have or can give appropriate copyright permission.
|
| 642 |
+
|
| 643 |
+
Notwithstanding any other provision of this License, for material you
|
| 644 |
+
add to a covered work, you may (if authorized by the copyright holders of
|
| 645 |
+
that material) supplement the terms of this License with terms:
|
| 646 |
+
|
| 647 |
+
a) Disclaiming warranty or limiting liability differently from the
|
| 648 |
+
terms of sections 15 and 16 of this License; or
|
| 649 |
+
|
| 650 |
+
b) Requiring preservation of specified reasonable legal notices or
|
| 651 |
+
author attributions in that material or in the Appropriate Legal
|
| 652 |
+
Notices displayed by works containing it; or
|
| 653 |
+
|
| 654 |
+
c) Prohibiting misrepresentation of the origin of that material, or
|
| 655 |
+
requiring that modified versions of such material be marked in
|
| 656 |
+
reasonable ways as different from the original version; or
|
| 657 |
+
|
| 658 |
+
d) Limiting the use for publicity purposes of names of licensors or
|
| 659 |
+
authors of the material; or
|
| 660 |
+
|
| 661 |
+
e) Declining to grant rights under trademark law for use of some
|
| 662 |
+
trade names, trademarks, or service marks; or
|
| 663 |
+
|
| 664 |
+
f) Requiring indemnification of licensors and authors of that
|
| 665 |
+
material by anyone who conveys the material (or modified versions of
|
| 666 |
+
it) with contractual assumptions of liability to the recipient, for
|
| 667 |
+
any liability that these contractual assumptions directly impose on
|
| 668 |
+
those licensors and authors.
|
| 669 |
+
|
| 670 |
+
All other non-permissive additional terms are considered "further
|
| 671 |
+
restrictions" within the meaning of section 10. If the Program as you
|
| 672 |
+
received it, or any part of it, contains a notice stating that it is
|
| 673 |
+
governed by this License along with a term that is a further
|
| 674 |
+
restriction, you may remove that term. If a license document contains
|
| 675 |
+
a further restriction but permits relicensing or conveying under this
|
| 676 |
+
License, you may add to a covered work material governed by the terms
|
| 677 |
+
of that license document, provided that the further restriction does
|
| 678 |
+
not survive such relicensing or conveying.
|
| 679 |
+
|
| 680 |
+
If you add terms to a covered work in accord with this section, you
|
| 681 |
+
must place, in the relevant source files, a statement of the
|
| 682 |
+
additional terms that apply to those files, or a notice indicating
|
| 683 |
+
where to find the applicable terms.
|
| 684 |
+
|
| 685 |
+
Additional terms, permissive or non-permissive, may be stated in the
|
| 686 |
+
form of a separately written license, or stated as exceptions;
|
| 687 |
+
the above requirements apply either way.
|
| 688 |
+
|
| 689 |
+
8. Termination.
|
| 690 |
+
|
| 691 |
+
You may not propagate or modify a covered work except as expressly
|
| 692 |
+
provided under this License. Any attempt otherwise to propagate or
|
| 693 |
+
modify it is void, and will automatically terminate your rights under
|
| 694 |
+
this License (including any patent licenses granted under the third
|
| 695 |
+
paragraph of section 11).
|
| 696 |
+
|
| 697 |
+
However, if you cease all violation of this License, then your
|
| 698 |
+
license from a particular copyright holder is reinstated (a)
|
| 699 |
+
provisionally, unless and until the copyright holder explicitly and
|
| 700 |
+
finally terminates your license, and (b) permanently, if the copyright
|
| 701 |
+
holder fails to notify you of the violation by some reasonable means
|
| 702 |
+
prior to 60 days after the cessation.
|
| 703 |
+
|
| 704 |
+
Moreover, your license from a particular copyright holder is
|
| 705 |
+
reinstated permanently if the copyright holder notifies you of the
|
| 706 |
+
violation by some reasonable means, this is the first time you have
|
| 707 |
+
received notice of violation of this License (for any work) from that
|
| 708 |
+
copyright holder, and you cure the violation prior to 30 days after
|
| 709 |
+
your receipt of the notice.
|
| 710 |
+
|
| 711 |
+
Termination of your rights under this section does not terminate the
|
| 712 |
+
licenses of parties who have received copies or rights from you under
|
| 713 |
+
this License. If your rights have been terminated and not permanently
|
| 714 |
+
reinstated, you do not qualify to receive new licenses for the same
|
| 715 |
+
material under section 10.
|
| 716 |
+
|
| 717 |
+
9. Acceptance Not Required for Having Copies.
|
| 718 |
+
|
| 719 |
+
You are not required to accept this License in order to receive or
|
| 720 |
+
run a copy of the Program. Ancillary propagation of a covered work
|
| 721 |
+
occurring solely as a consequence of using peer-to-peer transmission
|
| 722 |
+
to receive a copy likewise does not require acceptance. However,
|
| 723 |
+
nothing other than this License grants you permission to propagate or
|
| 724 |
+
modify any covered work. These actions infringe copyright if you do
|
| 725 |
+
not accept this License. Therefore, by modifying or propagating a
|
| 726 |
+
covered work, you indicate your acceptance of this License to do so.
|
| 727 |
+
|
| 728 |
+
10. Automatic Licensing of Downstream Recipients.
|
| 729 |
+
|
| 730 |
+
Each time you convey a covered work, the recipient automatically
|
| 731 |
+
receives a license from the original licensors, to run, modify and
|
| 732 |
+
propagate that work, subject to this License. You are not responsible
|
| 733 |
+
for enforcing compliance by third parties with this License.
|
| 734 |
+
|
| 735 |
+
An "entity transaction" is a transaction transferring control of an
|
| 736 |
+
organization, or substantially all assets of one, or subdividing an
|
| 737 |
+
organization, or merging organizations. If propagation of a covered
|
| 738 |
+
work results from an entity transaction, each party to that
|
| 739 |
+
transaction who receives a copy of the work also receives whatever
|
| 740 |
+
licenses to the work the party's predecessor in interest had or could
|
| 741 |
+
give under the previous paragraph, plus a right to possession of the
|
| 742 |
+
Corresponding Source of the work from the predecessor in interest, if
|
| 743 |
+
the predecessor has it or can get it with reasonable efforts.
|
| 744 |
+
|
| 745 |
+
You may not impose any further restrictions on the exercise of the
|
| 746 |
+
rights granted or affirmed under this License. For example, you may
|
| 747 |
+
not impose a license fee, royalty, or other charge for exercise of
|
| 748 |
+
rights granted under this License, and you may not initiate litigation
|
| 749 |
+
(including a cross-claim or counterclaim in a lawsuit) alleging that
|
| 750 |
+
any patent claim is infringed by making, using, selling, offering for
|
| 751 |
+
sale, or importing the Program or any portion of it.
|
| 752 |
+
|
| 753 |
+
11. Patents.
|
| 754 |
+
|
| 755 |
+
A "contributor" is a copyright holder who authorizes use under this
|
| 756 |
+
License of the Program or a work on which the Program is based. The
|
| 757 |
+
work thus licensed is called the contributor's "contributor version".
|
| 758 |
+
|
| 759 |
+
A contributor's "essential patent claims" are all patent claims
|
| 760 |
+
owned or controlled by the contributor, whether already acquired or
|
| 761 |
+
hereafter acquired, that would be infringed by some manner, permitted
|
| 762 |
+
by this License, of making, using, or selling its contributor version,
|
| 763 |
+
but do not include claims that would be infringed only as a
|
| 764 |
+
consequence of further modification of the contributor version. For
|
| 765 |
+
purposes of this definition, "control" includes the right to grant
|
| 766 |
+
patent sublicenses in a manner consistent with the requirements of
|
| 767 |
+
this License.
|
| 768 |
+
|
| 769 |
+
Each contributor grants you a non-exclusive, worldwide, royalty-free
|
| 770 |
+
patent license under the contributor's essential patent claims, to
|
| 771 |
+
make, use, sell, offer for sale, import and otherwise run, modify and
|
| 772 |
+
propagate the contents of its contributor version.
|
| 773 |
+
|
| 774 |
+
In the following three paragraphs, a "patent license" is any express
|
| 775 |
+
agreement or commitment, however denominated, not to enforce a patent
|
| 776 |
+
(such as an express permission to practice a patent or covenant not to
|
| 777 |
+
sue for patent infringement). To "grant" such a patent license to a
|
| 778 |
+
party means to make such an agreement or commitment not to enforce a
|
| 779 |
+
patent against the party.
|
| 780 |
+
|
| 781 |
+
If you convey a covered work, knowingly relying on a patent license,
|
| 782 |
+
and the Corresponding Source of the work is not available for anyone
|
| 783 |
+
to copy, free of charge and under the terms of this License, through a
|
| 784 |
+
publicly available network server or other readily accessible means,
|
| 785 |
+
then you must either (1) cause the Corresponding Source to be so
|
| 786 |
+
available, or (2) arrange to deprive yourself of the benefit of the
|
| 787 |
+
patent license for this particular work, or (3) arrange, in a manner
|
| 788 |
+
consistent with the requirements of this License, to extend the patent
|
| 789 |
+
license to downstream recipients. "Knowingly relying" means you have
|
| 790 |
+
actual knowledge that, but for the patent license, your conveying the
|
| 791 |
+
covered work in a country, or your recipient's use of the covered work
|
| 792 |
+
in a country, would infringe one or more identifiable patents in that
|
| 793 |
+
country that you have reason to believe are valid.
|
| 794 |
+
|
| 795 |
+
If, pursuant to or in connection with a single transaction or
|
| 796 |
+
arrangement, you convey, or propagate by procuring conveyance of, a
|
| 797 |
+
covered work, and grant a patent license to some of the parties
|
| 798 |
+
receiving the covered work authorizing them to use, propagate, modify
|
| 799 |
+
or convey a specific copy of the covered work, then the patent license
|
| 800 |
+
you grant is automatically extended to all recipients of the covered
|
| 801 |
+
work and works based on it.
|
| 802 |
+
|
| 803 |
+
A patent license is "discriminatory" if it does not include within
|
| 804 |
+
the scope of its coverage, prohibits the exercise of, or is
|
| 805 |
+
conditioned on the non-exercise of one or more of the rights that are
|
| 806 |
+
specifically granted under this License. You may not convey a covered
|
| 807 |
+
work if you are a party to an arrangement with a third party that is
|
| 808 |
+
in the business of distributing software, under which you make payment
|
| 809 |
+
to the third party based on the extent of your activity of conveying
|
| 810 |
+
the work, and under which the third party grants, to any of the
|
| 811 |
+
parties who would receive the covered work from you, a discriminatory
|
| 812 |
+
patent license (a) in connection with copies of the covered work
|
| 813 |
+
conveyed by you (or copies made from those copies), or (b) primarily
|
| 814 |
+
for and in connection with specific products or compilations that
|
| 815 |
+
contain the covered work, unless you entered into that arrangement,
|
| 816 |
+
or that patent license was granted, prior to 28 March 2007.
|
| 817 |
+
|
| 818 |
+
Nothing in this License shall be construed as excluding or limiting
|
| 819 |
+
any implied license or other defenses to infringement that may
|
| 820 |
+
otherwise be available to you under applicable patent law.
|
| 821 |
+
|
| 822 |
+
12. No Surrender of Others' Freedom.
|
| 823 |
+
|
| 824 |
+
If conditions are imposed on you (whether by court order, agreement or
|
| 825 |
+
otherwise) that contradict the conditions of this License, they do not
|
| 826 |
+
excuse you from the conditions of this License. If you cannot convey a
|
| 827 |
+
covered work so as to satisfy simultaneously your obligations under this
|
| 828 |
+
License and any other pertinent obligations, then as a consequence you may
|
| 829 |
+
not convey it at all. For example, if you agree to terms that obligate you
|
| 830 |
+
to collect a royalty for further conveying from those to whom you convey
|
| 831 |
+
the Program, the only way you could satisfy both those terms and this
|
| 832 |
+
License would be to refrain entirely from conveying the Program.
|
| 833 |
+
|
| 834 |
+
13. Use with the GNU Affero General Public License.
|
| 835 |
+
|
| 836 |
+
Notwithstanding any other provision of this License, you have
|
| 837 |
+
permission to link or combine any covered work with a work licensed
|
| 838 |
+
under version 3 of the GNU Affero General Public License into a single
|
| 839 |
+
combined work, and to convey the resulting work. The terms of this
|
| 840 |
+
License will continue to apply to the part which is the covered work,
|
| 841 |
+
but the special requirements of the GNU Affero General Public License,
|
| 842 |
+
section 13, concerning interaction through a network will apply to the
|
| 843 |
+
combination as such.
|
| 844 |
+
|
| 845 |
+
14. Revised Versions of this License.
|
| 846 |
+
|
| 847 |
+
The Free Software Foundation may publish revised and/or new versions of
|
| 848 |
+
the GNU General Public License from time to time. Such new versions will
|
| 849 |
+
be similar in spirit to the present version, but may differ in detail to
|
| 850 |
+
address new problems or concerns.
|
| 851 |
+
|
| 852 |
+
Each version is given a distinguishing version number. If the
|
| 853 |
+
Program specifies that a certain numbered version of the GNU General
|
| 854 |
+
Public License "or any later version" applies to it, you have the
|
| 855 |
+
option of following the terms and conditions either of that numbered
|
| 856 |
+
version or of any later version published by the Free Software
|
| 857 |
+
Foundation. If the Program does not specify a version number of the
|
| 858 |
+
GNU General Public License, you may choose any version ever published
|
| 859 |
+
by the Free Software Foundation.
|
| 860 |
+
|
| 861 |
+
If the Program specifies that a proxy can decide which future
|
| 862 |
+
versions of the GNU General Public License can be used, that proxy's
|
| 863 |
+
public statement of acceptance of a version permanently authorizes you
|
| 864 |
+
to choose that version for the Program.
|
| 865 |
+
|
| 866 |
+
Later license versions may give you additional or different
|
| 867 |
+
permissions. However, no additional obligations are imposed on any
|
| 868 |
+
author or copyright holder as a result of your choosing to follow a
|
| 869 |
+
later version.
|
| 870 |
+
|
| 871 |
+
15. Disclaimer of Warranty.
|
| 872 |
+
|
| 873 |
+
THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY
|
| 874 |
+
APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT
|
| 875 |
+
HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY
|
| 876 |
+
OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO,
|
| 877 |
+
THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
| 878 |
+
PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM
|
| 879 |
+
IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF
|
| 880 |
+
ALL NECESSARY SERVICING, REPAIR OR CORRECTION.
|
| 881 |
+
|
| 882 |
+
16. Limitation of Liability.
|
| 883 |
+
|
| 884 |
+
IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
|
| 885 |
+
WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS
|
| 886 |
+
THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY
|
| 887 |
+
GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE
|
| 888 |
+
USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF
|
| 889 |
+
DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD
|
| 890 |
+
PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS),
|
| 891 |
+
EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF
|
| 892 |
+
SUCH DAMAGES.
|
| 893 |
+
|
| 894 |
+
17. Interpretation of Sections 15 and 16.
|
| 895 |
+
|
| 896 |
+
If the disclaimer of warranty and limitation of liability provided
|
| 897 |
+
above cannot be given local legal effect according to their terms,
|
| 898 |
+
reviewing courts shall apply local law that most closely approximates
|
| 899 |
+
an absolute waiver of all civil liability in connection with the
|
| 900 |
+
Program, unless a warranty or assumption of liability accompanies a
|
| 901 |
+
copy of the Program in return for a fee.
|
| 902 |
+
|
| 903 |
+
END OF TERMS AND CONDITIONS
|
| 904 |
+
|
| 905 |
+
How to Apply These Terms to Your New Programs
|
| 906 |
+
|
| 907 |
+
If you develop a new program, and you want it to be of the greatest
|
| 908 |
+
possible use to the public, the best way to achieve this is to make it
|
| 909 |
+
free software which everyone can redistribute and change under these terms.
|
| 910 |
+
|
| 911 |
+
To do so, attach the following notices to the program. It is safest
|
| 912 |
+
to attach them to the start of each source file to most effectively
|
| 913 |
+
state the exclusion of warranty; and each file should have at least
|
| 914 |
+
the "copyright" line and a pointer to where the full notice is found.
|
| 915 |
+
|
| 916 |
+
<one line to give the program's name and a brief idea of what it does.>
|
| 917 |
+
Copyright (C) <year> <name of author>
|
| 918 |
+
|
| 919 |
+
This program is free software: you can redistribute it and/or modify
|
| 920 |
+
it under the terms of the GNU General Public License as published by
|
| 921 |
+
the Free Software Foundation, either version 3 of the License, or
|
| 922 |
+
(at your option) any later version.
|
| 923 |
+
|
| 924 |
+
This program is distributed in the hope that it will be useful,
|
| 925 |
+
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
| 926 |
+
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
| 927 |
+
GNU General Public License for more details.
|
| 928 |
+
|
| 929 |
+
You should have received a copy of the GNU General Public License
|
| 930 |
+
along with this program. If not, see <http://www.gnu.org/licenses/>.
|
| 931 |
+
|
| 932 |
+
Also add information on how to contact you by electronic and paper mail.
|
| 933 |
+
|
| 934 |
+
If the program does terminal interaction, make it output a short
|
| 935 |
+
notice like this when it starts in an interactive mode:
|
| 936 |
+
|
| 937 |
+
<program> Copyright (C) <year> <name of author>
|
| 938 |
+
This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
|
| 939 |
+
This is free software, and you are welcome to redistribute it
|
| 940 |
+
under certain conditions; type `show c' for details.
|
| 941 |
+
|
| 942 |
+
The hypothetical commands `show w' and `show c' should show the appropriate
|
| 943 |
+
parts of the General Public License. Of course, your program's commands
|
| 944 |
+
might be different; for a GUI interface, you would use an "about box".
|
| 945 |
+
|
| 946 |
+
You should also get your employer (if you work as a programmer) or school,
|
| 947 |
+
if any, to sign a "copyright disclaimer" for the program, if necessary.
|
| 948 |
+
For more information on this, and how to apply and follow the GNU GPL, see
|
| 949 |
+
<http://www.gnu.org/licenses/>.
|
| 950 |
+
|
| 951 |
+
The GNU General Public License does not permit incorporating your program
|
| 952 |
+
into proprietary programs. If your program is a subroutine library, you
|
| 953 |
+
may consider it more useful to permit linking proprietary applications with
|
| 954 |
+
the library. If this is what you want to do, use the GNU Lesser General
|
| 955 |
+
Public License instead of this License. But first, please read
|
| 956 |
+
<http://www.gnu.org/philosophy/why-not-lgpl.html>.
|
| 957 |
+
|
| 958 |
+
Name: libquadmath
|
| 959 |
+
Files: numpy.libs/libquadmath*.so
|
| 960 |
+
Description: dynamically linked to files compiled with gcc
|
| 961 |
+
Availability: https://gcc.gnu.org/git/?p=gcc.git;a=tree;f=libquadmath
|
| 962 |
+
License: LGPL-2.1-or-later
|
| 963 |
+
|
| 964 |
+
GCC Quad-Precision Math Library
|
| 965 |
+
Copyright (C) 2010-2019 Free Software Foundation, Inc.
|
| 966 |
+
Written by Francois-Xavier Coudert <fxcoudert@gcc.gnu.org>
|
| 967 |
+
|
| 968 |
+
This file is part of the libquadmath library.
|
| 969 |
+
Libquadmath is free software; you can redistribute it and/or
|
| 970 |
+
modify it under the terms of the GNU Library General Public
|
| 971 |
+
License as published by the Free Software Foundation; either
|
| 972 |
+
version 2.1 of the License, or (at your option) any later version.
|
| 973 |
+
|
| 974 |
+
Libquadmath is distributed in the hope that it will be useful,
|
| 975 |
+
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
| 976 |
+
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
| 977 |
+
Lesser General Public License for more details.
|
| 978 |
+
https://www.gnu.org/licenses/old-licenses/lgpl-2.1.html
|
| 979 |
+
Classifier: Development Status :: 5 - Production/Stable
|
| 980 |
+
Classifier: Intended Audience :: Science/Research
|
| 981 |
+
Classifier: Intended Audience :: Developers
|
| 982 |
+
Classifier: License :: OSI Approved :: BSD License
|
| 983 |
+
Classifier: Programming Language :: C
|
| 984 |
+
Classifier: Programming Language :: Python
|
| 985 |
+
Classifier: Programming Language :: Python :: 3
|
| 986 |
+
Classifier: Programming Language :: Python :: 3.9
|
| 987 |
+
Classifier: Programming Language :: Python :: 3.10
|
| 988 |
+
Classifier: Programming Language :: Python :: 3.11
|
| 989 |
+
Classifier: Programming Language :: Python :: 3.12
|
| 990 |
+
Classifier: Programming Language :: Python :: 3 :: Only
|
| 991 |
+
Classifier: Programming Language :: Python :: Implementation :: CPython
|
| 992 |
+
Classifier: Topic :: Software Development
|
| 993 |
+
Classifier: Topic :: Scientific/Engineering
|
| 994 |
+
Classifier: Typing :: Typed
|
| 995 |
+
Classifier: Operating System :: Microsoft :: Windows
|
| 996 |
+
Classifier: Operating System :: POSIX
|
| 997 |
+
Classifier: Operating System :: Unix
|
| 998 |
+
Classifier: Operating System :: MacOS
|
| 999 |
+
Project-URL: Homepage, https://numpy.org
|
| 1000 |
+
Project-URL: Documentation, https://numpy.org/doc/
|
| 1001 |
+
Project-URL: Source, https://github.com/numpy/numpy
|
| 1002 |
+
Project-URL: Download, https://pypi.org/project/numpy/#files
|
| 1003 |
+
Project-URL: Tracker, https://github.com/numpy/numpy/issues
|
| 1004 |
+
Project-URL: Release notes, https://numpy.org/doc/stable/release
|
| 1005 |
+
Requires-Python: >=3.9
|
| 1006 |
+
Description-Content-Type: text/markdown
|
| 1007 |
+
|
| 1008 |
+
<h1 align="center">
|
| 1009 |
+
<img src="https://raw.githubusercontent.com/numpy/numpy/main/branding/logo/primary/numpylogo.svg" width="300">
|
| 1010 |
+
</h1><br>
|
| 1011 |
+
|
| 1012 |
+
|
| 1013 |
+
[](
|
| 1014 |
+
https://numfocus.org)
|
| 1015 |
+
[](
|
| 1016 |
+
https://pypi.org/project/numpy/)
|
| 1017 |
+
[](
|
| 1018 |
+
https://anaconda.org/conda-forge/numpy)
|
| 1019 |
+
[](
|
| 1020 |
+
https://stackoverflow.com/questions/tagged/numpy)
|
| 1021 |
+
[](
|
| 1022 |
+
https://doi.org/10.1038/s41586-020-2649-2)
|
| 1023 |
+
[](https://api.securityscorecards.dev/projects/github.com/numpy/numpy)
|
| 1024 |
+
|
| 1025 |
+
|
| 1026 |
+
NumPy is the fundamental package for scientific computing with Python.
|
| 1027 |
+
|
| 1028 |
+
- **Website:** https://www.numpy.org
|
| 1029 |
+
- **Documentation:** https://numpy.org/doc
|
| 1030 |
+
- **Mailing list:** https://mail.python.org/mailman/listinfo/numpy-discussion
|
| 1031 |
+
- **Source code:** https://github.com/numpy/numpy
|
| 1032 |
+
- **Contributing:** https://www.numpy.org/devdocs/dev/index.html
|
| 1033 |
+
- **Bug reports:** https://github.com/numpy/numpy/issues
|
| 1034 |
+
- **Report a security vulnerability:** https://tidelift.com/docs/security
|
| 1035 |
+
|
| 1036 |
+
It provides:
|
| 1037 |
+
|
| 1038 |
+
- a powerful N-dimensional array object
|
| 1039 |
+
- sophisticated (broadcasting) functions
|
| 1040 |
+
- tools for integrating C/C++ and Fortran code
|
| 1041 |
+
- useful linear algebra, Fourier transform, and random number capabilities
|
| 1042 |
+
|
| 1043 |
+
Testing:
|
| 1044 |
+
|
| 1045 |
+
NumPy requires `pytest` and `hypothesis`. Tests can then be run after installation with:
|
| 1046 |
+
|
| 1047 |
+
python -c "import numpy, sys; sys.exit(numpy.test() is False)"
|
| 1048 |
+
|
| 1049 |
+
Code of Conduct
|
| 1050 |
+
----------------------
|
| 1051 |
+
|
| 1052 |
+
NumPy is a community-driven open source project developed by a diverse group of
|
| 1053 |
+
[contributors](https://numpy.org/teams/). The NumPy leadership has made a strong
|
| 1054 |
+
commitment to creating an open, inclusive, and positive community. Please read the
|
| 1055 |
+
[NumPy Code of Conduct](https://numpy.org/code-of-conduct/) for guidance on how to interact
|
| 1056 |
+
with others in a way that makes our community thrive.
|
| 1057 |
+
|
| 1058 |
+
Call for Contributions
|
| 1059 |
+
----------------------
|
| 1060 |
+
|
| 1061 |
+
The NumPy project welcomes your expertise and enthusiasm!
|
| 1062 |
+
|
| 1063 |
+
Small improvements or fixes are always appreciated. If you are considering larger contributions
|
| 1064 |
+
to the source code, please contact us through the [mailing
|
| 1065 |
+
list](https://mail.python.org/mailman/listinfo/numpy-discussion) first.
|
| 1066 |
+
|
| 1067 |
+
Writing code isn’t the only way to contribute to NumPy. You can also:
|
| 1068 |
+
- review pull requests
|
| 1069 |
+
- help us stay on top of new and old issues
|
| 1070 |
+
- develop tutorials, presentations, and other educational materials
|
| 1071 |
+
- maintain and improve [our website](https://github.com/numpy/numpy.org)
|
| 1072 |
+
- develop graphic design for our brand assets and promotional materials
|
| 1073 |
+
- translate website content
|
| 1074 |
+
- help with outreach and onboard new contributors
|
| 1075 |
+
- write grant proposals and help with other fundraising efforts
|
| 1076 |
+
|
| 1077 |
+
For more information about the ways you can contribute to NumPy, visit [our website](https://numpy.org/contribute/).
|
| 1078 |
+
If you’re unsure where to start or how your skills fit in, reach out! You can
|
| 1079 |
+
ask on the mailing list or here, on GitHub, by opening a new issue or leaving a
|
| 1080 |
+
comment on a relevant issue that is already open.
|
| 1081 |
+
|
| 1082 |
+
Our preferred channels of communication are all public, but if you’d like to
|
| 1083 |
+
speak to us in private first, contact our community coordinators at
|
| 1084 |
+
numpy-team@googlegroups.com or on Slack (write numpy-team@googlegroups.com for
|
| 1085 |
+
an invitation).
|
| 1086 |
+
|
| 1087 |
+
We also have a biweekly community call, details of which are announced on the
|
| 1088 |
+
mailing list. You are very welcome to join.
|
| 1089 |
+
|
| 1090 |
+
If you are new to contributing to open source, [this
|
| 1091 |
+
guide](https://opensource.guide/how-to-contribute/) helps explain why, what,
|
| 1092 |
+
and how to successfully get involved.
|
LTA_openwebtext_dualt/mini_owt_logdirichlet/.venv_qwen35_uv/lib/python3.12/site-packages/numpy-1.26.4.dist-info/RECORD
ADDED
|
@@ -0,0 +1,792 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
../../../bin/f2py,sha256=ScMFUWEA5j-JCSbPSeQ-eEGgHxanaIyHWKHzNTwpC6A,398
|
| 2 |
+
numpy-1.26.4.dist-info/INSTALLER,sha256=5hhM4Q4mYTT9z6QB6PGpUAW81PGNFrYrdXMj4oM_6ak,2
|
| 3 |
+
numpy-1.26.4.dist-info/LICENSE.txt,sha256=EQewyDHpGNTx28KKMxkMdyFe8njUpMQAlXIIh3DUM0o,47721
|
| 4 |
+
numpy-1.26.4.dist-info/METADATA,sha256=sJc0p_7UToS0yBYZNM5TLf8ed57Ggi1BVkTRF_Y4EHA,61041
|
| 5 |
+
numpy-1.26.4.dist-info/RECORD,,
|
| 6 |
+
numpy-1.26.4.dist-info/REQUESTED,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
| 7 |
+
numpy-1.26.4.dist-info/WHEEL,sha256=3qIDcXCk577AXiK3pDifO-gE9U_MYWYGgtD78gLa2_U,137
|
| 8 |
+
numpy-1.26.4.dist-info/entry_points.txt,sha256=zddyYJuUw9Uud7LeLfynXk62_ry0lGihDwCIgugBdZM,144
|
| 9 |
+
numpy.libs/libgfortran-040039e1.so.5.0.0,sha256=FK-zEpsai1C8QKOwggx_EVLqm8EBIaqxUpQ_cFdHKIY,2686065
|
| 10 |
+
numpy.libs/libopenblas64_p-r0-0cf96a72.3.23.dev.so,sha256=klTQhU3XYV4R3ijXca5AiHjKgSOnrCBPIeTMejdswuU,35123345
|
| 11 |
+
numpy.libs/libquadmath-96973f99.so.0.0.0,sha256=k0wi3tDn0WnE1GeIdslgUa3z2UVF2pYvYLQWWbB12js,247609
|
| 12 |
+
numpy/__config__.py,sha256=z0NFqd9D20ShQlKyPTlbfAPWIJFDEJ7aVp3TQ5_vTxU,4902
|
| 13 |
+
numpy/__init__.cython-30.pxd,sha256=yk2a3etxRNlBgj5uLfIho2RYDYDzhRW8oagAG-wzbPI,36690
|
| 14 |
+
numpy/__init__.pxd,sha256=Pa0VYRSeQRSFepQ6ROgZrNtGY5TzBXIddWsMHtK0OkM,35066
|
| 15 |
+
numpy/__init__.py,sha256=Is0VNfoU10729FfMoUn_3ICHX0YL4xO4-JUnP3i8QC4,17005
|
| 16 |
+
numpy/__init__.pyi,sha256=9kK465XL9oS_X3fJLv0Na29NEYnWvtdMhXPtrnF_cG8,154080
|
| 17 |
+
numpy/_core/__init__.py,sha256=C8_7wbHqUkB35JouY_XKsas1KLpRZ7JHWuZ7VGOPVpU,136
|
| 18 |
+
numpy/_core/__init__.pyi,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
| 19 |
+
numpy/_core/_dtype.py,sha256=vE16-yiwUSYsAIbq7FlEY1GbXZAp8wjADDxJg3eBX-U,126
|
| 20 |
+
numpy/_core/_dtype_ctypes.py,sha256=i5EhoWPUhu4kla3Xu4ZvXF1lVLPiI6Zg4h6o8jaiamo,147
|
| 21 |
+
numpy/_core/_internal.py,sha256=g5ugmqDgUhSlie5-onOctcm4p0gcMHSIRLHVYtFTk1M,135
|
| 22 |
+
numpy/_core/_multiarray_umath.py,sha256=VPtoT2uHnyU3rKL0G27CgmNmB1WRHM0mtc7Y9L85C3U,159
|
| 23 |
+
numpy/_core/multiarray.py,sha256=kZxC_7P3Jwz1RApzQU2QGmqSq4MAEvKmaJEYnAsbSOs,138
|
| 24 |
+
numpy/_core/umath.py,sha256=YcV0cdbGcem6D5P3yX7cR9HGYBrT8VMoAgCBzGwPhgg,123
|
| 25 |
+
numpy/_distributor_init.py,sha256=IKy2THwmu5UgBjtVbwbD9H-Ap8uaUJoPJ2btQ4Jatdo,407
|
| 26 |
+
numpy/_globals.py,sha256=neEdcfLZoHLwber_1Xyrn26LcXy0MrSta03Ze7aKa6g,3094
|
| 27 |
+
numpy/_pyinstaller/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
| 28 |
+
numpy/_pyinstaller/hook-numpy.py,sha256=PUQ-mNWje6bFALB-mLVFRPkvbM4JpLXunB6sjBbTy5g,1409
|
| 29 |
+
numpy/_pyinstaller/pyinstaller-smoke.py,sha256=6iL-eHMQaG3rxnS5EgcvrCqElm9aKL07Cjr1FZJSXls,1143
|
| 30 |
+
numpy/_pyinstaller/test_pyinstaller.py,sha256=8K-7QxmfoXCG0NwR0bhIgCNrDjGlrTzWnrR1sR8btgU,1135
|
| 31 |
+
numpy/_pytesttester.py,sha256=lQUTvKVz6kT8b4yiMV-uW-vG9KSv9UzqAmxaEMezTd8,6731
|
| 32 |
+
numpy/_pytesttester.pyi,sha256=OtyXSiuSy8o_78w3QNQRjMLpvvNyEdC0aMsx6T-vRxU,489
|
| 33 |
+
numpy/_typing/__init__.py,sha256=6w9E9V9VaT7vTM-veua8XcySv50Je5qSPJzK9HTocIg,7003
|
| 34 |
+
numpy/_typing/_add_docstring.py,sha256=xQhQX372aN_m3XN95CneMxOST2FdPcovR-MXM-9ep58,3922
|
| 35 |
+
numpy/_typing/_array_like.py,sha256=L4gnx2KWG8yYcouz5b9boJIkkFNtOJV6QjcnGCrbnRY,4298
|
| 36 |
+
numpy/_typing/_callable.pyi,sha256=Mf57BwohRn9ye6ixJqjNEnK0gKqnVPE9Gy8vK-6_zxo,11121
|
| 37 |
+
numpy/_typing/_char_codes.py,sha256=LR51O5AUBDbCmJvlMoxyUvsfvb1p7WHrexgtTGtuWTc,5916
|
| 38 |
+
numpy/_typing/_dtype_like.py,sha256=21Uxy0UgIawGM82xjDF_ifMq-nP-Bkhn_LpiK_HvWC4,5661
|
| 39 |
+
numpy/_typing/_extended_precision.py,sha256=dGios-1k-QBGew7YFzONZTzVWxz-aYAaqlccl2_h5Bo,777
|
| 40 |
+
numpy/_typing/_nbit.py,sha256=-EQOShHpB3r30b4RVEcruQRTcTaFAZwtqCJ4BsvpEzA,345
|
| 41 |
+
numpy/_typing/_nested_sequence.py,sha256=5eNaVZAV9tZQLFWHYOuVs336JjoiaWxyZQ7cMKb6m1I,2566
|
| 42 |
+
numpy/_typing/_scalars.py,sha256=eVP8PjlcTIlY7v0fRI3tFXPogWtpLJZ8nFvRRrLjDqs,980
|
| 43 |
+
numpy/_typing/_shape.py,sha256=JPy7jJMkISGFTnkgiEifYM-4xTcjb7JMRkLIIjZLw08,211
|
| 44 |
+
numpy/_typing/_ufunc.pyi,sha256=e74LtOP9e8kkRhvrIJ_RXz9Ua_L43Pd9IixwNwermnM,12638
|
| 45 |
+
numpy/_typing/setup.py,sha256=SE0Q6HPqDjWUfceA4yXgkII8y3z7EiSF0Z-MNwOIyG4,337
|
| 46 |
+
numpy/_utils/__init__.py,sha256=Hhetwsi3eTBe8HdWbG51zXmcrX1DiPLxkYSrslMLYcc,723
|
| 47 |
+
numpy/_utils/_convertions.py,sha256=0xMxdeLOziDmHsRM_8luEh4S-kQdMoMg6GxNDDas69k,329
|
| 48 |
+
numpy/_utils/_inspect.py,sha256=8Ma7QBRwfSWKeK1ShJpFNc7CDhE6fkIE_wr1FxrG1A8,7447
|
| 49 |
+
numpy/_utils/_pep440.py,sha256=Vr7B3QsijR5p6h8YAz2LjNGUyzHUJ5gZ4v26NpZAKDc,14069
|
| 50 |
+
numpy/array_api/__init__.py,sha256=XtttWbDf6Yh0_m4zp-L_us4HKnV3oGwdlB6n-01Q9M8,10375
|
| 51 |
+
numpy/array_api/_array_object.py,sha256=rfCBzE6vUjk4HElQGTVwe6Tw2vxiUx7tmBpQEmm1iBk,43794
|
| 52 |
+
numpy/array_api/_constants.py,sha256=AYayN2jf1Dp5rXZ7WPBdUhtPBo_JMCi-pD9oW5zmFkI,87
|
| 53 |
+
numpy/array_api/_creation_functions.py,sha256=6SqHdzZqHOJFEyWFtqnj6KIKRivrGXxROlgnez_3Mt0,10050
|
| 54 |
+
numpy/array_api/_data_type_functions.py,sha256=P57FOsNdXahNUriVtdldonbvBQrrZkVzxZbcqkR_8AA,6288
|
| 55 |
+
numpy/array_api/_dtypes.py,sha256=kDU1NLvEQN-W2HPmJ2wGPx8jiNkFbrvTCD1T1RT8Pwo,4823
|
| 56 |
+
numpy/array_api/_elementwise_functions.py,sha256=0kGuDX3Ur_Qp6tBMBWTO7LPUxzXNGAlA2SSJhdAp4DU,25992
|
| 57 |
+
numpy/array_api/_indexing_functions.py,sha256=d-gzqzyvR45FQerRYJrbBzCWFnDsZWSI9pggA5QWRO4,715
|
| 58 |
+
numpy/array_api/_manipulation_functions.py,sha256=qCoW5B5FXcFOWKPU9D9MXHdMeXIuzvnHUUvprNlwfjc,3317
|
| 59 |
+
numpy/array_api/_searching_functions.py,sha256=mGZiqheYXGWiDK9rqXFiDKX0_B0mJ1OjdA-9FC2o5lA,1715
|
| 60 |
+
numpy/array_api/_set_functions.py,sha256=ULpfK1zznW9joX1DXSiP0R3ahcDB_po7mZlpsRqi7Fs,2948
|
| 61 |
+
numpy/array_api/_sorting_functions.py,sha256=7pszlxNN7-DNqEZlonGLFQrlXPP7evVA8jN31NShg00,2031
|
| 62 |
+
numpy/array_api/_statistical_functions.py,sha256=HspfYteZWSa3InMs10KZz-sk3ZuW6teX6fNdo829T84,3584
|
| 63 |
+
numpy/array_api/_typing.py,sha256=uKidRp6nYxgHnEPaqXXZsDDZ6tw1LshpbwLvy-09eeM,1347
|
| 64 |
+
numpy/array_api/_utility_functions.py,sha256=HwycylbPAgRVz4nZvjvwqN3mQnJbqKA-NRMaAvIP-CE,824
|
| 65 |
+
numpy/array_api/linalg.py,sha256=QPpG2tG1pZgzjrtTjjOu2GDu3cI6UpSsLrsG_o1jXYk,18411
|
| 66 |
+
numpy/array_api/setup.py,sha256=Wx6qD7GU_APiqKolYPO0OHv4eHGYrjPZmDAgjWhOEhM,341
|
| 67 |
+
numpy/array_api/tests/__init__.py,sha256=t_2GZ3lKcsu4ec4GMKPUDYaeMUJyDquBlQAcPgj7kFE,282
|
| 68 |
+
numpy/array_api/tests/test_array_object.py,sha256=FQoAxP4CLDiv6iih8KKUDSLuYM6dtnDcB1f0pMHw4-M,17035
|
| 69 |
+
numpy/array_api/tests/test_creation_functions.py,sha256=s3A1COWmXIAJdhzd8v7VtL-jbiSspskTqwYy0BTpmpw,5023
|
| 70 |
+
numpy/array_api/tests/test_data_type_functions.py,sha256=qc8ktRlVXWC3PKhxPVWI_UF9f1zZtpmzHjdCtf3e16E,1018
|
| 71 |
+
numpy/array_api/tests/test_elementwise_functions.py,sha256=CTj4LLwtusI51HkpzD0JPohP1ffNxogAVFz8WLuWFzM,3800
|
| 72 |
+
numpy/array_api/tests/test_indexing_functions.py,sha256=AbuBGyEufEAf24b7fy8JQhdJtGPdP9XEIxPTJAfAFFo,627
|
| 73 |
+
numpy/array_api/tests/test_manipulation_functions.py,sha256=wce25dSJjubrGhFxmiatzR_IpmNYp9ICJ9PZBBnZTOQ,1087
|
| 74 |
+
numpy/array_api/tests/test_set_functions.py,sha256=D016G7v3ko49bND5sVERP8IqQXZiwr-2yrKbBPJ-oqg,546
|
| 75 |
+
numpy/array_api/tests/test_sorting_functions.py,sha256=INPiYnuGBcsmWtYqdTTX3ENHmM4iUx4zs9KdwDaSmdA,602
|
| 76 |
+
numpy/array_api/tests/test_validation.py,sha256=QUG9yWC3QhkPxNhbQeakwBbl-0Rr0iTuZ41_0sfVIGU,676
|
| 77 |
+
numpy/compat/__init__.py,sha256=iAHrmsZWzouOMSyD9bdSE0APWMlRpqW92MQgF8y6x3E,448
|
| 78 |
+
numpy/compat/py3k.py,sha256=Je74CVk_7qI_qX7pLbYcuQJsxlMq1poGIfRIrH99kZQ,3833
|
| 79 |
+
numpy/compat/setup.py,sha256=36X1kF0C_NVROXfJ7w3SQeBm5AIDBuJbM5qT7cvSDgU,335
|
| 80 |
+
numpy/compat/tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
| 81 |
+
numpy/compat/tests/test_compat.py,sha256=YqV67pSN8nXPbXaEdjhmyaoVetNyFupVv57OMEgCwKA,579
|
| 82 |
+
numpy/conftest.py,sha256=HZyWo_wJyrbgnyXxI8t05WOg_IrzNAMnEV7O8koHous,4623
|
| 83 |
+
numpy/core/__init__.py,sha256=CNsO-Ab4ywM2Wz3AbqWOH3ig1q5Bno9PsUMrCv-HNS4,5780
|
| 84 |
+
numpy/core/__init__.pyi,sha256=xtd9OFYza-ZG3jyEJrlzRPT-SkVoB_qYmVCe6FxRks0,126
|
| 85 |
+
numpy/core/_add_newdocs.py,sha256=39JFaeDPN2OQlSwfpY6_Jq9fO5vML8ZMF8J4ZTx_nrs,208972
|
| 86 |
+
numpy/core/_add_newdocs_scalars.py,sha256=PF9v8POcSNH6ELYltkx9e07DWgMmft6NJy9zER3Jk44,12106
|
| 87 |
+
numpy/core/_asarray.py,sha256=P2ddlZAsg1iGleRRfoQv_aKs2N7AGwpo5K4ZQv4Ujlk,3884
|
| 88 |
+
numpy/core/_asarray.pyi,sha256=gNNxUVhToNU_F1QpgeEvUYddpUFN-AKP0QWa4gqcTGw,1086
|
| 89 |
+
numpy/core/_dtype.py,sha256=SihUz41pHRB3Q2LiYYkug6LgMBKh6VV89MOpLxnXQdo,10606
|
| 90 |
+
numpy/core/_dtype_ctypes.py,sha256=Vug4i7xKhznK2tdIjmn4ebclClpaCJwSZUlvEoYl0Eg,3673
|
| 91 |
+
numpy/core/_exceptions.py,sha256=dZWKqfdLRvJvbAEG_fof_8ikEKxjakADMty1kLC_l_M,5379
|
| 92 |
+
numpy/core/_internal.py,sha256=f9kNDuT-FGxF1EtVOVIxXWnH9gM9n-J5V2zwHMv4HEk,28348
|
| 93 |
+
numpy/core/_internal.pyi,sha256=_mCTOX6Su8D4R9fV4HNeohPJx7515B-WOlv4uq6mry8,1032
|
| 94 |
+
numpy/core/_machar.py,sha256=G3a3TXu8VDW_1EMxKKLnGMbvUShEIUEve3ealBlJJ3E,11565
|
| 95 |
+
numpy/core/_methods.py,sha256=m31p0WjcFUGckbJiHnCpSaIQGqv-Lq5niIYkdd33YMo,8613
|
| 96 |
+
numpy/core/_multiarray_tests.cpython-312-x86_64-linux-gnu.so,sha256=Cyy7dBn_wvcSmHqrr1GKOx2d6EBgk_edyx1xKjSrYFc,175912
|
| 97 |
+
numpy/core/_multiarray_umath.cpython-312-x86_64-linux-gnu.so,sha256=amUIEKhzXL25iPdHKZc3QKM3ZF3RWF_vaW5z4tvGW-s,7463681
|
| 98 |
+
numpy/core/_operand_flag_tests.cpython-312-x86_64-linux-gnu.so,sha256=VPbGfwOkzwWoNNVSh3jahuBTI8LrKbN_dCaMcOtDfQE,16856
|
| 99 |
+
numpy/core/_rational_tests.cpython-312-x86_64-linux-gnu.so,sha256=0JmPpR0Ej5eZ4vrHN_6fvrKVCeUVuQam83AxViSkN2k,59776
|
| 100 |
+
numpy/core/_simd.cpython-312-x86_64-linux-gnu.so,sha256=lAK8a8uKjaYoFqMQZBWnVvjeUm-KDsnZzyH_RThl9do,3535232
|
| 101 |
+
numpy/core/_string_helpers.py,sha256=-fQM8z5s8_yX440PmgNEH3SUjEoXMPpPSysZwWZNbuo,2852
|
| 102 |
+
numpy/core/_struct_ufunc_tests.cpython-312-x86_64-linux-gnu.so,sha256=PB6RqEbim2Ezi96GVTzyqi9IuqNcVGCKcgPxwHBVCAM,16960
|
| 103 |
+
numpy/core/_type_aliases.py,sha256=qV6AZlsUWHMWTydmZya73xuBkKXiUKq_WXLj7q2CbZ0,7534
|
| 104 |
+
numpy/core/_type_aliases.pyi,sha256=lguMSqMwvqAFHuRtm8YZSdKbikVz985BdKo_lo7GQCg,404
|
| 105 |
+
numpy/core/_ufunc_config.py,sha256=-Twpe8dnd45ccXH-w-B9nvU8yCOd1E0e3Wpsts3g_bQ,13944
|
| 106 |
+
numpy/core/_ufunc_config.pyi,sha256=-615enOVQMBhVx7Pln7DY_s4H6JjSgSnBy89YkpvuLg,1066
|
| 107 |
+
numpy/core/_umath_tests.cpython-312-x86_64-linux-gnu.so,sha256=kT7z3gJc2t_GgamgqAf3MNRWeVo8KrSWPZVh3mLs_t8,42272
|
| 108 |
+
numpy/core/arrayprint.py,sha256=ySZj4TZFFVCa5yhMmJKFYQYhuQTabZTRBb1YoiCD-ac,63608
|
| 109 |
+
numpy/core/arrayprint.pyi,sha256=21pOWjTSfJOBaKgOOPzRox1ERb3c9ydufqL0b11_P_Q,4428
|
| 110 |
+
numpy/core/cversions.py,sha256=H_iNIpx9-hY1cQNxqjT2d_5SXZhJbMo_caq4_q6LB7I,347
|
| 111 |
+
numpy/core/defchararray.py,sha256=G1LExk-dMeVTYRhtYgcCZEsHk5tkawk7giXcK4Q5KVM,73617
|
| 112 |
+
numpy/core/defchararray.pyi,sha256=ib3aWFcM7F4KooU57mWUNi4GlosNjdfgrLKBVSIKDvU,9216
|
| 113 |
+
numpy/core/einsumfunc.py,sha256=TrL6t79F0H0AQH0y5Cj7Tq0_pzk4fVFi-4q4jJmujYQ,51868
|
| 114 |
+
numpy/core/einsumfunc.pyi,sha256=IJZNdHHG_soig8XvCbXZl43gMr3MMKl9dckTYWecqLs,4860
|
| 115 |
+
numpy/core/fromnumeric.py,sha256=YMtxOBg51VMem39AHXFs-4_vOb1p48ei7njXdYTRJ_Q,128821
|
| 116 |
+
numpy/core/fromnumeric.pyi,sha256=KATMFeFxUJ8YNRaC-jd_dTOt3opz2ng6lHgke5u5COk,23726
|
| 117 |
+
numpy/core/function_base.py,sha256=tHg1qSHTz1eO_wHXNFRt3Q40uqVtPT2eyQdrWbIi4wQ,19836
|
| 118 |
+
numpy/core/function_base.pyi,sha256=3ZYad3cdaGwNEyP8VwK97IYMqk2PDoVjpjQzhIYHjk0,4725
|
| 119 |
+
numpy/core/getlimits.py,sha256=AopcTZDCUXMPcEKIZE1botc3mEhmLb2p1_ejlq1CLqY,25865
|
| 120 |
+
numpy/core/getlimits.pyi,sha256=qeIXUEtognTHr_T-tv-VcZI7n8Z2VzAyIpIgKXzsLkc,82
|
| 121 |
+
numpy/core/include/numpy/__multiarray_api.c,sha256=nPRzTez_Wy3YXy3zZNJNPMspAzxbLOdohqhXwouwMLM,12116
|
| 122 |
+
numpy/core/include/numpy/__multiarray_api.h,sha256=ZM--FKMhIaSQS39cPW0hj5dx8ngNMmbcy6SbgXZBd8U,61450
|
| 123 |
+
numpy/core/include/numpy/__ufunc_api.c,sha256=670Gcz-vhkF4taBDmktCpFRBrZ9CHJnPRx7ag7Z6HsI,1714
|
| 124 |
+
numpy/core/include/numpy/__ufunc_api.h,sha256=0MBOl7dgO3ldqdDi-SdciEOuqGv1UNsmk7mp7tEy4AY,12456
|
| 125 |
+
numpy/core/include/numpy/_dtype_api.h,sha256=4veCexGvx9KNWMIUuEUAVOfcsei9GqugohDY5ud16pA,16697
|
| 126 |
+
numpy/core/include/numpy/_neighborhood_iterator_imp.h,sha256=s-Hw_l5WRwKtYvsiIghF0bg-mA_CgWnzFFOYVFJ-q4k,1857
|
| 127 |
+
numpy/core/include/numpy/_numpyconfig.h,sha256=o0fV_jb-wgVtRxnVIWvUttiZafyrWYFm2ab9Uixz1Cw,855
|
| 128 |
+
numpy/core/include/numpy/arrayobject.h,sha256=-BlWQ7kfVbzCqzHn0qaeMe0_08AbwliuG98XWG57lT8,282
|
| 129 |
+
numpy/core/include/numpy/arrayscalars.h,sha256=C3vDRndZTZRbppiDyV5jp8sV3dRKsrwBIZcNlh9gSTA,3944
|
| 130 |
+
numpy/core/include/numpy/experimental_dtype_api.h,sha256=tlehD5r_pYhHbGzIrUea6vtOgf6IQ8Txblnhx7455h8,15532
|
| 131 |
+
numpy/core/include/numpy/halffloat.h,sha256=TRZfXgipa-dFppX2uNgkrjrPli-1BfJtadWjAembJ4s,1959
|
| 132 |
+
numpy/core/include/numpy/ndarrayobject.h,sha256=PhY4NjRZDoU5Zbc8MW0swPEm81hwgWZ63gAU93bLVVI,10183
|
| 133 |
+
numpy/core/include/numpy/ndarraytypes.h,sha256=EjWXv-J8C5JET4AlIbJRdctycL7-dyJZcnoWgnlCPc8,68009
|
| 134 |
+
numpy/core/include/numpy/noprefix.h,sha256=d83l1QpCCVqMV2k29NMkL3Ld1qNjiC6hzOPWZAivEjQ,6830
|
| 135 |
+
numpy/core/include/numpy/npy_1_7_deprecated_api.h,sha256=y0MJ8Qw7Bkt4H_4VxIzHzpkw5JqAdj5ECgtn08fZFrI,4327
|
| 136 |
+
numpy/core/include/numpy/npy_3kcompat.h,sha256=SvN9yRA3i02O4JFMXxZz0Uq_vJ5ZpvC-pC2sfF56A5I,15883
|
| 137 |
+
numpy/core/include/numpy/npy_common.h,sha256=apWBsCJeP8P5T0exgzhFcGohbASsUF8vtFdS2jc1VfU,37746
|
| 138 |
+
numpy/core/include/numpy/npy_cpu.h,sha256=pcVRtj-Y6120C5kWB1VAiAjZoxkTPDEg0gGm5IAt3jM,4629
|
| 139 |
+
numpy/core/include/numpy/npy_endian.h,sha256=we7X9fPeWzNpo_YTh09MPGDwdE0Rw_WDM4c9y4nBj5I,2786
|
| 140 |
+
numpy/core/include/numpy/npy_interrupt.h,sha256=DQZIxi6FycLXD8drdHn2SSmLoRhIpo6osvPv13vowUA,1948
|
| 141 |
+
numpy/core/include/numpy/npy_math.h,sha256=SbKRoc7O3gVuDl7HOZjk424O049I0zn-7i9GwBwNmmk,18945
|
| 142 |
+
numpy/core/include/numpy/npy_no_deprecated_api.h,sha256=0yZrJcQEJ6MCHJInQk5TP9_qZ4t7EfBuoLOJ34IlJd4,678
|
| 143 |
+
numpy/core/include/numpy/npy_os.h,sha256=hlQsg_7-RkvS3s8OM8KXy99xxyJbCm-W1AYVcdnO1cw,1256
|
| 144 |
+
numpy/core/include/numpy/numpyconfig.h,sha256=Nr59kE3cXmen6y0UymIBaU7F1BSIuPwgKZ4gdV5Q5JU,5308
|
| 145 |
+
numpy/core/include/numpy/old_defines.h,sha256=xuYQDDlMywu0Zsqm57hkgGwLsOFx6IvxzN2eiNF-gJY,6405
|
| 146 |
+
numpy/core/include/numpy/random/LICENSE.txt,sha256=-8U59H0M-DvGE3gID7hz1cFGMBJsrL_nVANcOSbapew,1018
|
| 147 |
+
numpy/core/include/numpy/random/bitgen.h,sha256=49AwKOR552r-NkhuSOF1usb_URiMSRMvD22JF5pKIng,488
|
| 148 |
+
numpy/core/include/numpy/random/distributions.h,sha256=W5tOyETd0m1W0GdaZ5dJP8fKlBtsTpG23V2Zlmrlqpg,9861
|
| 149 |
+
numpy/core/include/numpy/random/libdivide.h,sha256=ew9MNhPQd1LsCZiWiFmj9IZ7yOnA3HKOXffDeR9X1jw,80138
|
| 150 |
+
numpy/core/include/numpy/ufuncobject.h,sha256=Xmnny_ulZo9VwxkfkXF-1HCTKDavIp9PV_H7XWhi0Z8,12070
|
| 151 |
+
numpy/core/include/numpy/utils.h,sha256=wMNomSH3Dfj0q78PrjLVtFtN-FPo7UJ4o0ifCUO-6Es,1185
|
| 152 |
+
numpy/core/lib/libnpymath.a,sha256=mb8EluEp8SLpEeCTQJ0VshL-CqeZfWxSbS5ItM-9POc,93960
|
| 153 |
+
numpy/core/lib/npy-pkg-config/mlib.ini,sha256=_LsWV1eStNqwhdiYPa2538GL46dnfVwT4MrI1zbsoFw,147
|
| 154 |
+
numpy/core/lib/npy-pkg-config/npymath.ini,sha256=kamUNrYKAmXqQa8BcNv7D5sLqHh6bnChM0_5rZCsTfY,360
|
| 155 |
+
numpy/core/memmap.py,sha256=yWBJLeVClHsD8BYusnf9bdqypOMPrj3_zoO_lQ2zVMc,11771
|
| 156 |
+
numpy/core/memmap.pyi,sha256=sxIQ7T5hPLG-RBNndAc8JPvrsKEX1amBSH2HGg48Obo,55
|
| 157 |
+
numpy/core/multiarray.py,sha256=zXaWf_DSkFEWjUQqVRCGeevwsI6kjQ3x6_MUwA1Y8fk,56097
|
| 158 |
+
numpy/core/multiarray.pyi,sha256=_0X4W90U5ZiKt2n-9OscK-pcQyV6oGK-8jwGy5k1qxA,24768
|
| 159 |
+
numpy/core/numeric.py,sha256=DgajaCDXiiQR-zuW_rrx_QhApSsa5k5FONK3Uk9mfTs,77014
|
| 160 |
+
numpy/core/numeric.pyi,sha256=oVQkI4ABayFl_ZzCiGH4DxfYASL-3aETi-3B93THnEQ,14315
|
| 161 |
+
numpy/core/numerictypes.py,sha256=qIf9v1OpNjjVQzXnKpD-3V01y5Bj9huw5F-U5Wa4glc,18098
|
| 162 |
+
numpy/core/numerictypes.pyi,sha256=dEqtq9MLrGaqqeAF1sdXBgnEwDWOzlK02A6MTg1PS5g,3267
|
| 163 |
+
numpy/core/overrides.py,sha256=YUZFS8RCBvOJ27sH-jDRcyMjOCn9VigMyuQY4J21JBI,7093
|
| 164 |
+
numpy/core/records.py,sha256=4mpIjUp2XtZxY5cD2S8mgfn8GCzQGGrrkqLBqAJwM-Q,37533
|
| 165 |
+
numpy/core/records.pyi,sha256=uYwE6cAoGKgN6U4ryfGZx_3m-3sY006jytjWLrDRRy0,5692
|
| 166 |
+
numpy/core/shape_base.py,sha256=RPMKxA7_FCAgg_CruExl0LehnczSTFaxA6hrcfrUzns,29743
|
| 167 |
+
numpy/core/shape_base.pyi,sha256=Ilb4joJmbjkIZLzKww7NJeaxg2FP3AfFib3HtfOsrC0,2774
|
| 168 |
+
numpy/core/tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
| 169 |
+
numpy/core/tests/_locales.py,sha256=S4x5soqF0oxpBYOE8J9Iky72O9J25IiZ8349m93pWC4,2206
|
| 170 |
+
numpy/core/tests/data/astype_copy.pkl,sha256=lWSzCcvzRB_wpuRGj92spGIw-rNPFcd9hwJaRVvfWdk,716
|
| 171 |
+
numpy/core/tests/data/generate_umath_validation_data.cpp,sha256=fyhQPNhIX9hzjeXujn6mhi1MVc133zELSV_hlSQ7BQU,5842
|
| 172 |
+
numpy/core/tests/data/numpy_2_0_array.pkl,sha256=Vh02tdyCypa8Nb4QzdVhnDAiXEO2WQrcwcvOdDDFF5w,718
|
| 173 |
+
numpy/core/tests/data/recarray_from_file.fits,sha256=NA0kliz31FlLnYxv3ppzeruONqNYkuEvts5wzXEeIc4,8640
|
| 174 |
+
numpy/core/tests/data/umath-validation-set-README.txt,sha256=pxWwOaGGahaRd-AlAidDfocLyrAiDp0whf5hC7hYwqM,967
|
| 175 |
+
numpy/core/tests/data/umath-validation-set-arccos.csv,sha256=W_aL99bjzVjlVyd5omfDUORag8jHzx6uctedPVZgOHQ,61365
|
| 176 |
+
numpy/core/tests/data/umath-validation-set-arccosh.csv,sha256=Uko_d0kDXr1YlN-6Ii-fQQxUvbXAhRfC7Un4gJ23GJk,61365
|
| 177 |
+
numpy/core/tests/data/umath-validation-set-arcsin.csv,sha256=15Aenze4WD2a2dF2aOBXpv9B7u3wwAeUVJdEm4TjOkQ,61339
|
| 178 |
+
numpy/core/tests/data/umath-validation-set-arcsinh.csv,sha256=uDwx4PStpfV21IaPF8pmzQpul6i72g7zDwlfcynWaVQ,60289
|
| 179 |
+
numpy/core/tests/data/umath-validation-set-arctan.csv,sha256=mw5tYze_BMs6ugGEZfg5mcXoInGYdn7fvSCYSUi9Bqw,60305
|
| 180 |
+
numpy/core/tests/data/umath-validation-set-arctanh.csv,sha256=95l4Uu5RmZajljabfqlv5U34RVrifCMhhkop6iLeNBo,61339
|
| 181 |
+
numpy/core/tests/data/umath-validation-set-cbrt.csv,sha256=v855MTZih-fZp_GuEDst2qaIsxU4a7vlAbeIJy2xKpc,60846
|
| 182 |
+
numpy/core/tests/data/umath-validation-set-cos.csv,sha256=0PNnDqKkokZ7ERVDgbes8KNZc-ISJrZUlVZc5LkW18E,59122
|
| 183 |
+
numpy/core/tests/data/umath-validation-set-cosh.csv,sha256=FGCNeUSUTAeASsb_j18iRSsCxXLxmzF-_C7tq1elVrQ,60869
|
| 184 |
+
numpy/core/tests/data/umath-validation-set-exp.csv,sha256=BKg1_cyrKD2GXYMX_EB0DnXua8DI2O1KWODXf_BRhrk,17491
|
| 185 |
+
numpy/core/tests/data/umath-validation-set-exp2.csv,sha256=f1b05MRXPOXihC9M-yi52udKBzVXalhbTuIcqoDAk-g,58624
|
| 186 |
+
numpy/core/tests/data/umath-validation-set-expm1.csv,sha256=_ghc1xiUECNsBGrKCFUAy2lvu01_lkpeYJN0zDtCYWk,60299
|
| 187 |
+
numpy/core/tests/data/umath-validation-set-log.csv,sha256=z9ej1ykKUoMRqYMUIJENWXbYi_A_x_RKs7K_GuXZJus,11692
|
| 188 |
+
numpy/core/tests/data/umath-validation-set-log10.csv,sha256=RJgpruL16FVPgUT3-3xW4eppS_tn6o5yEW79KnITn48,68922
|
| 189 |
+
numpy/core/tests/data/umath-validation-set-log1p.csv,sha256=IZZI-hi55HGCOvBat3vSBVha_8Nt-5alf2fqz6QeTG0,60303
|
| 190 |
+
numpy/core/tests/data/umath-validation-set-log2.csv,sha256=HL2rOCsrEi378rNrbsXHPqlWlEGkXQq8R4e63YeTksU,68917
|
| 191 |
+
numpy/core/tests/data/umath-validation-set-sin.csv,sha256=8PUjnQ_YfmxFb42XJrvpvmkeSpEOlEXSmNvIK4VgfAM,58611
|
| 192 |
+
numpy/core/tests/data/umath-validation-set-sinh.csv,sha256=CYiibE8aX7MQnBatl__5k_PWc_9vHUifwS-sFZzzKk0,60293
|
| 193 |
+
numpy/core/tests/data/umath-validation-set-tan.csv,sha256=Oq7gxMvblRVBrQ23kMxc8iT0bHnCWKg9EE4ZqzbJbOA,60299
|
| 194 |
+
numpy/core/tests/data/umath-validation-set-tanh.csv,sha256=iolZF_MOyWRgYSa-SsD4df5mnyFK18zrICI740SWoTc,60299
|
| 195 |
+
numpy/core/tests/examples/cython/checks.pyx,sha256=rKAhPSGHJ9oPK9Q_85YoUQyRTftEP1jcYOR5lSPB6oQ,662
|
| 196 |
+
numpy/core/tests/examples/cython/meson.build,sha256=Qk4Q6OkpZ0xsLUkcGQVVrYkzb0ozoyL6YlSZ8_5tH1I,1088
|
| 197 |
+
numpy/core/tests/examples/cython/setup.py,sha256=aAR-TvQabUabnCzuB6UdWdmRXaaPfIG7MzTIfMF-0tk,496
|
| 198 |
+
numpy/core/tests/examples/limited_api/limited_api.c,sha256=mncE8TjjXmYpkwli433G0jB2zGQO_5NqWmGKdzRJZug,344
|
| 199 |
+
numpy/core/tests/examples/limited_api/setup.py,sha256=p2w7F1ardi_GRXSrnNIR8W1oeH_pgmw_1P2wS0A2I6M,435
|
| 200 |
+
numpy/core/tests/test__exceptions.py,sha256=QqxQSLXboPXEVwHz-TyE2JeIl_TC-rPugzfo25nbcns,2846
|
| 201 |
+
numpy/core/tests/test_abc.py,sha256=FfgYA_HjYAi8XWGK_oOh6Zw86chB_KG_XoW_7ZlFp4c,2220
|
| 202 |
+
numpy/core/tests/test_api.py,sha256=UMc7SvczAQ5ngHxE-NoXVvNpVzYRrn8oMwFNta1yMS0,22995
|
| 203 |
+
numpy/core/tests/test_argparse.py,sha256=C0zBbwQ9xzzymXe_hHpWnnWQPwOi2ZdQB78gBAgJHvU,1969
|
| 204 |
+
numpy/core/tests/test_array_coercion.py,sha256=zY4Pjlt4QZ0w71WxWGLHcrPnnhEF51yXYVLg5HMIy5c,34379
|
| 205 |
+
numpy/core/tests/test_array_interface.py,sha256=8tGgj1Nzi76H_WF5GULkxqWL7Yu_Xf0lvTJZOwOBKsI,7774
|
| 206 |
+
numpy/core/tests/test_arraymethod.py,sha256=VpjDYTmoMDTZcY7CsGzinBh0R_OICuwOykWCbmCRQZU,3244
|
| 207 |
+
numpy/core/tests/test_arrayprint.py,sha256=cKaIoD9ZvsjJH0PHwZyOxmcRcBt1kN1WfFneqVqs0b8,40462
|
| 208 |
+
numpy/core/tests/test_casting_floatingpoint_errors.py,sha256=W3Fgk0oKtXFv684fEZ7POwj6DHTYK0Jj_oGRLZ8UdyA,5063
|
| 209 |
+
numpy/core/tests/test_casting_unittests.py,sha256=9-vkR0oXczQz8ED8DxGVPmalC8IZXe2jKgOCMGr8hIg,34298
|
| 210 |
+
numpy/core/tests/test_conversion_utils.py,sha256=jNhbNNI-T8qtQnsIMEax7KFN30kjh0ICntLMwTyxJ5Q,6559
|
| 211 |
+
numpy/core/tests/test_cpu_dispatcher.py,sha256=v_SlhUpENuoe7QYXizzYITLGXa7WfZ7jqcqmbSBg7JU,1542
|
| 212 |
+
numpy/core/tests/test_cpu_features.py,sha256=mieGx7dxXFiyTYatbcCCjIjR67Un2hVcbJx4GEf2yFo,14892
|
| 213 |
+
numpy/core/tests/test_custom_dtypes.py,sha256=JogRmttDLwfQ3PTbewEnGLKco9zV2Nu3yIfrMeCsx_I,9401
|
| 214 |
+
numpy/core/tests/test_cython.py,sha256=t5-h4XSIFNLyw_9BIAQDYl8_80t_pH0SCfEa1Vf_3aI,3755
|
| 215 |
+
numpy/core/tests/test_datetime.py,sha256=2vAGbrCQmsrWNXCVXOMZqUGZn2c-cQT-eZ1wTprYbcM,116211
|
| 216 |
+
numpy/core/tests/test_defchararray.py,sha256=F88HUkByEP4H6cJ_ITvIe0a_T1BH2JOdRysMCu1XIn0,24997
|
| 217 |
+
numpy/core/tests/test_deprecations.py,sha256=w2lhHb-W8hh7RoE_0Ftg8thpG86jvbFAJgior22DY2Q,31076
|
| 218 |
+
numpy/core/tests/test_dlpack.py,sha256=cDlwFmTombb2rDeB8RHEAJ4eVMUiDbw8Oz5Jo1NQwk0,3522
|
| 219 |
+
numpy/core/tests/test_dtype.py,sha256=J09pJF59v7UO6iNuJFISKP2DLPgdkQ_df5OAMDRLikU,75702
|
| 220 |
+
numpy/core/tests/test_einsum.py,sha256=QzQAPIC-IjTV3Dxz97hBnvLBCmF8kpsBTBckThhgRjQ,53712
|
| 221 |
+
numpy/core/tests/test_errstate.py,sha256=U3GT9I058jkF725mx4GdWUr9RoceCkGDV7Go79VA4wY,2219
|
| 222 |
+
numpy/core/tests/test_extint128.py,sha256=gCZfAwPOb-F1TLsEEeDI0amQYwHk-60-OXi0ccZrrZ8,5643
|
| 223 |
+
numpy/core/tests/test_function_base.py,sha256=Ibs6-WXZE5hsRx4VCnX-cZOWYKU-5PFXjouwAQzgnqQ,15595
|
| 224 |
+
numpy/core/tests/test_getlimits.py,sha256=apdxr0zKkxaVHIUpLrqAvO39q54JKN14sV4xSbK2Ifs,6718
|
| 225 |
+
numpy/core/tests/test_half.py,sha256=VYPyap9GYOWZuphsfFofcIRl-oa5Ufrtv83OTp6azdU,24593
|
| 226 |
+
numpy/core/tests/test_hashtable.py,sha256=ZV8HL8NkDnoQZfnje7BP0fyIp4fSFqjKsQc40PaTggc,1011
|
| 227 |
+
numpy/core/tests/test_indexerrors.py,sha256=kN9xLl6FVTzmI7fumn_cuZ3k0omXnTetgtCnPY44cvw,5130
|
| 228 |
+
numpy/core/tests/test_indexing.py,sha256=x0ojWuhOwWD5MZuiJ9Ncim3CgkwI-GldWxrSCmjmFJM,54314
|
| 229 |
+
numpy/core/tests/test_item_selection.py,sha256=kI30kiX8mIrZYPn0jw3lGGw1ruZF4PpE9zw-aai9EPA,6458
|
| 230 |
+
numpy/core/tests/test_limited_api.py,sha256=5yO0nGmCKZ9b3S66QP7vY-HIgAoyOtHZmp8mvzKuOHI,1172
|
| 231 |
+
numpy/core/tests/test_longdouble.py,sha256=jO8YMm_Hsz-XPKbmv6iMcOdHgTlIFkKTwAtxpy3Q1pE,13905
|
| 232 |
+
numpy/core/tests/test_machar.py,sha256=_5_TDUVtAJvJI5jBfEFKpCZtAfKCsCFt7tXlWSkWzzc,1067
|
| 233 |
+
numpy/core/tests/test_mem_overlap.py,sha256=QJ0unWD_LOoAGAo4ra0IvYenj56IYUtiz1fEJEmTY9Q,29086
|
| 234 |
+
numpy/core/tests/test_mem_policy.py,sha256=CXa10FQw2Qj6MqJuaC8Fm4slsoipKFjCIpYF6c5IIAU,16801
|
| 235 |
+
numpy/core/tests/test_memmap.py,sha256=tZ5lJs_4ZFsJmg392ZQ33fX0m8tdfZ8ZtY9Lq41LNtk,7477
|
| 236 |
+
numpy/core/tests/test_multiarray.py,sha256=GPv4IJR9dijNG-icUsQsX2tBD2RdP3EhUehY4cxvVQU,380106
|
| 237 |
+
numpy/core/tests/test_nditer.py,sha256=nVQ00aNxPHqf4ZcFs3e9AVDK64TCqlO0TzfocTAACZQ,130818
|
| 238 |
+
numpy/core/tests/test_nep50_promotions.py,sha256=2TwtFvj1LBpYTtdR6NFe1RAAGXIJltLqwpA1vhQCVY4,8840
|
| 239 |
+
numpy/core/tests/test_numeric.py,sha256=ZGNW5NKgShEjZC_TcPOtTuRaTM_GbuM21u82D205UPs,137294
|
| 240 |
+
numpy/core/tests/test_numerictypes.py,sha256=f_xMjZJnyDwlc6XCrd71b6x1_6dAWOv-kZ3-NEq37hU,21687
|
| 241 |
+
numpy/core/tests/test_numpy_2_0_compat.py,sha256=kVCTAXska7Xi5w_TYduWhid0nlCqI6Nvmt-gDnYsuKI,1630
|
| 242 |
+
numpy/core/tests/test_overrides.py,sha256=t0gOZOzu7pevE58HA-npFYJqnInHR-LLBklnzKJWHqo,26080
|
| 243 |
+
numpy/core/tests/test_print.py,sha256=ErZAWd88b0ygSEoYpd0BL2tFjkerMtn1vZ7dWvaNqTc,6837
|
| 244 |
+
numpy/core/tests/test_protocols.py,sha256=fEXE9K9s22oiVWkX92BY-g00-uXCK-HxjZhZxxYAKFc,1168
|
| 245 |
+
numpy/core/tests/test_records.py,sha256=pluit5x6jkWoPEIrHXM13L3xZuuSSiaxoXFsOdkakCU,20269
|
| 246 |
+
numpy/core/tests/test_regression.py,sha256=SJo9cPTVr2SNjhgtW7boUMyNQlXxygsZ5g0oyqC8Eks,91595
|
| 247 |
+
numpy/core/tests/test_scalar_ctors.py,sha256=qDIZV-tBukwAxNDhUmGtH3CemDXlS3xd_q3L52touuA,6115
|
| 248 |
+
numpy/core/tests/test_scalar_methods.py,sha256=Uj-zU0zzzKAjMBdpkzsWZ3nSFj5gJkUlqi_euhOYdnU,7541
|
| 249 |
+
numpy/core/tests/test_scalarbuffer.py,sha256=FSL94hriWX1_uV6Z33wB3ZXUrpmmX2-x87kNjIxUeBk,5580
|
| 250 |
+
numpy/core/tests/test_scalarinherit.py,sha256=fMInDGKsiH3IS_2ejZtIcmJZ0Ry8c7kVsHx7wp5XDoM,2368
|
| 251 |
+
numpy/core/tests/test_scalarmath.py,sha256=XZj_m2I2TLktJdFD1SWj2XtV8hT26VIxasDz3cAFvgA,43247
|
| 252 |
+
numpy/core/tests/test_scalarprint.py,sha256=1599W5X0tjGhBnSQjalXkg6AY8eHXnr6PMqs4vYZQqs,18771
|
| 253 |
+
numpy/core/tests/test_shape_base.py,sha256=D9haeuUVx3x3pOLmFQ9vUz7iU4T2bFTsPoI8HgSncFU,29723
|
| 254 |
+
numpy/core/tests/test_simd.py,sha256=-L1UhIn9Eu_euLwaSU7bPRfYpWWOTb43qovoJS7Ws7w,48696
|
| 255 |
+
numpy/core/tests/test_simd_module.py,sha256=OSpYhH_3QDxItyQcaW6SjXW57k2m-weRwpYOnJjCqN0,3902
|
| 256 |
+
numpy/core/tests/test_strings.py,sha256=A9t1B65lFrYRLXgDJSg3mMDAe_hypIPcTMVOdAYIbU0,3835
|
| 257 |
+
numpy/core/tests/test_ufunc.py,sha256=5pS2x3LACHn8GogYYad8LRAjByK7Gg9xTD9ik3d0Fm0,124907
|
| 258 |
+
numpy/core/tests/test_umath.py,sha256=huHpclJqkO32k7BTflRHj8nImzg3p6yyryeS9LyHKWU,186482
|
| 259 |
+
numpy/core/tests/test_umath_accuracy.py,sha256=mFcVdzXhhD9mqhzLDJVZsWfCHbjbFQ6XeEl5G8l-PTc,3897
|
| 260 |
+
numpy/core/tests/test_umath_complex.py,sha256=WvZZZWeijo52RiOfx-G83bxzQOp_IJ3i9fEnUDVukLQ,23247
|
| 261 |
+
numpy/core/tests/test_unicode.py,sha256=hUXIwMmoq89y_KXWzuXVyQaXvRwGjfY4TvKJsCbygEI,12775
|
| 262 |
+
numpy/core/umath.py,sha256=JbT_SxnZ_3MEmjOI9UtX3CcAzX5Q-4RDlnnhDAEJ5Vo,2040
|
| 263 |
+
numpy/core/umath_tests.py,sha256=TIzaDfrEHHgSc2J5kxFEibq8MOPhwSuyOZOUBsZNVSM,389
|
| 264 |
+
numpy/ctypeslib.py,sha256=Po4XCWfxhwFQ1Q8x8DeayGiMCJLxREaCDkVyeladxBU,17247
|
| 265 |
+
numpy/ctypeslib.pyi,sha256=A9te473aRO920iDVuyKypeVIQp-ueZK6EiI-qLSwJNg,7972
|
| 266 |
+
numpy/doc/__init__.py,sha256=OYmE-F6x0CD05PCDY2MiW1HLlwB6i9vhDpk-a3r4lHY,508
|
| 267 |
+
numpy/doc/constants.py,sha256=PlXoj7b4A8Aa9nADbg83uzTBRJaX8dvJmEdbn4FDPPo,9155
|
| 268 |
+
numpy/doc/ufuncs.py,sha256=i1alLg19mNyCFZ2LYSOZGm--RsRN1x63U_UYU-N3x60,5357
|
| 269 |
+
numpy/dtypes.py,sha256=BuBztrPQRasUmVZhXr2_NgJujdUTNhNwd59pZZHk3lA,2229
|
| 270 |
+
numpy/dtypes.pyi,sha256=tIHniAYP7ALg2iT7NgSXO67jvE-zRlDod3MazEmD4M8,1315
|
| 271 |
+
numpy/exceptions.py,sha256=7j7tv8cwXGZYgldyMisGmnAxAl2s4YU0vexME81yYlA,7339
|
| 272 |
+
numpy/exceptions.pyi,sha256=KsZqWNvyPUEXUGR9EhZCUQF2f9EVSpBRlJUlGqRT02k,600
|
| 273 |
+
numpy/f2py/__init__.py,sha256=m-ty_WiJZ4GVfV5--kJ3MFJaLXestz5Eo-4H0FPscK4,5565
|
| 274 |
+
numpy/f2py/__init__.pyi,sha256=eA7uYXZr0p0aaz5rBW-EypLx9RchrvqDYtSnkEJQsYw,1087
|
| 275 |
+
numpy/f2py/__main__.py,sha256=6i2jVH2fPriV1aocTY_dUFvWK18qa-zjpnISA-OpF3w,130
|
| 276 |
+
numpy/f2py/__version__.py,sha256=7HHdjR82FCBmftwMRyrlhcEj-8mGQb6oCH-wlUPH4Nw,34
|
| 277 |
+
numpy/f2py/_backends/__init__.py,sha256=7_bA7c_xDpLc4_8vPfH32-Lxn9fcUTgjQ25srdvwvAM,299
|
| 278 |
+
numpy/f2py/_backends/_backend.py,sha256=GKb9-UaFszT045vUgVukPs1n97iyyjqahrWKxLOKNYo,1187
|
| 279 |
+
numpy/f2py/_backends/_distutils.py,sha256=pxh2YURFYYSykIOvBFwVvhoNX1oSk-c30IPPhzlko-0,2383
|
| 280 |
+
numpy/f2py/_backends/_meson.py,sha256=gi-nbnPFDC38sumfAjg-Q5FPu6nNkyQXTjEuVf9W9Cc,6916
|
| 281 |
+
numpy/f2py/_backends/meson.build.template,sha256=oTPNMAQzS4CJ_lfEzYv-oBeJTtQuThUYVN5R6ROWpNU,1579
|
| 282 |
+
numpy/f2py/_isocbind.py,sha256=zaBgpfPNRmxVG3doUIlbZIiyB990MsXiwDabrSj9HnQ,2360
|
| 283 |
+
numpy/f2py/_src_pyf.py,sha256=4t6TN4ZKWciC4f1z6fwaGrpIGhHKRiwHfcrNj4FIzCg,7654
|
| 284 |
+
numpy/f2py/auxfuncs.py,sha256=dNs4b2KDIcG4M1hPBvD09-Vh7CDzlPIrFscOdvL3p1o,26539
|
| 285 |
+
numpy/f2py/capi_maps.py,sha256=ENjYyeZ3CCJcLwJJgmKOSYrD1KPuhpwauXqeizdV55o,30563
|
| 286 |
+
numpy/f2py/cb_rules.py,sha256=5TuHbJWGjsF6yVNzKuV2tAnwdLyhcWlmdsjYlDOZOv4,24992
|
| 287 |
+
numpy/f2py/cfuncs.py,sha256=KJyW7mdjmFSmxssfeegGJs5NZyF3mZMgNvOxN9-vYHQ,51913
|
| 288 |
+
numpy/f2py/common_rules.py,sha256=gHB76WypbkVmhaD_RWhy8Od4zDTgj8cbDOdUdIp6PIQ,5131
|
| 289 |
+
numpy/f2py/crackfortran.py,sha256=ErLdkWP8MxeyW5vVPGXwyvrxZAwymlvIBC0th2rvK74,148553
|
| 290 |
+
numpy/f2py/diagnose.py,sha256=0SRXBE2hJgKJN_Rf4Zn00oKXC_Tka3efPWM47zg6BoY,5197
|
| 291 |
+
numpy/f2py/f2py2e.py,sha256=5t093ZQ4xs0_0UbyaYVd2yA2EVOaOAcuU29JI-IU2Ag,27717
|
| 292 |
+
numpy/f2py/f90mod_rules.py,sha256=otm3_dmVIna0eBVHLu_693s3a_82lU3pqeqDacWI37s,9594
|
| 293 |
+
numpy/f2py/func2subr.py,sha256=6d2R5awuHRT4xzgfUfwS7JHTqhhAieSXcENlssD_2c4,10298
|
| 294 |
+
numpy/f2py/rules.py,sha256=B4FxSYEfZ_1j_z9GulQNZ1BNrPrUvlU3ybxwTkrIxjI,62727
|
| 295 |
+
numpy/f2py/setup.cfg,sha256=Fpn4sjqTl5OT5sp8haqKIRnUcTPZNM6MIvUJBU7BIhg,48
|
| 296 |
+
numpy/f2py/setup.py,sha256=MmAVspT8DDTqDuL8ZJhxK62g0lcso4vqI6QNQ9CsfoQ,2422
|
| 297 |
+
numpy/f2py/src/fortranobject.c,sha256=g4BKDO1_9pCu6hithKXD2oH_Mt-HH1NTnP6leCqJrzc,46017
|
| 298 |
+
numpy/f2py/src/fortranobject.h,sha256=neMKotYWbHvrhW9KXz4QzQ8fzPkiQXLHHjy82vLSeog,5835
|
| 299 |
+
numpy/f2py/symbolic.py,sha256=jWBoAwECCxRdWczR9r7O6UERcYmH_GbdcAReNp7cmJY,53270
|
| 300 |
+
numpy/f2py/tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
| 301 |
+
numpy/f2py/tests/src/abstract_interface/foo.f90,sha256=JFU2w98cB_XNwfrqNtI0yDTmpEdxYO_UEl2pgI_rnt8,658
|
| 302 |
+
numpy/f2py/tests/src/abstract_interface/gh18403_mod.f90,sha256=gvQJIzNtvacWE0dhysxn30-iUeI65Hpq7DiE9oRauz8,105
|
| 303 |
+
numpy/f2py/tests/src/array_from_pyobj/wrapmodule.c,sha256=Ff5wHYV9-OJnZuelfFWcjAibRvDkEIlbTVczTyv6TG8,7299
|
| 304 |
+
numpy/f2py/tests/src/assumed_shape/.f2py_f2cmap,sha256=But9r9m4iL7EGq_haMW8IiQ4VivH0TgUozxX4pPvdpE,29
|
| 305 |
+
numpy/f2py/tests/src/assumed_shape/foo_free.f90,sha256=oBwbGSlbr9MkFyhVO2aldjc01dr9GHrMrSiRQek8U64,460
|
| 306 |
+
numpy/f2py/tests/src/assumed_shape/foo_mod.f90,sha256=rfzw3QdI-eaDSl-hslCgGpd5tHftJOVhXvb21Y9Gf6M,499
|
| 307 |
+
numpy/f2py/tests/src/assumed_shape/foo_use.f90,sha256=rmT9k4jP9Ru1PLcGqepw9Jc6P9XNXM0axY7o4hi9lUw,269
|
| 308 |
+
numpy/f2py/tests/src/assumed_shape/precision.f90,sha256=r08JeTVmTTExA-hYZ6HzaxVwBn1GMbPAuuwBhBDtJUk,130
|
| 309 |
+
numpy/f2py/tests/src/block_docstring/foo.f,sha256=y7lPCPu7_Fhs_Tf2hfdpDQo1bhtvNSKRaZAOpM_l3dg,97
|
| 310 |
+
numpy/f2py/tests/src/callback/foo.f,sha256=C1hjfpRCQWiOVVzIHqnsYcnLrqQcixrnHCn8hd9GhVk,1254
|
| 311 |
+
numpy/f2py/tests/src/callback/gh17797.f90,sha256=_Nrl0a2HgUbtymGU0twaJ--7rMa1Uco2A3swbWvHoMo,148
|
| 312 |
+
numpy/f2py/tests/src/callback/gh18335.f90,sha256=NraOyKIXyvv_Y-3xGnmTjtNjW2Znsnlk8AViI8zfovc,506
|
| 313 |
+
numpy/f2py/tests/src/callback/gh25211.f,sha256=a2sxlQhtDVbYn8KOKHUYqwc-aCFt7sDPSnJsXFG35uI,179
|
| 314 |
+
numpy/f2py/tests/src/callback/gh25211.pyf,sha256=FWxo0JWQlw519BpZV8PoYeI_FZ_K6C-3Wk6gLrfBPlw,447
|
| 315 |
+
numpy/f2py/tests/src/cli/gh_22819.pyf,sha256=5rvOfCv-wSosB354LC9pExJmMoSHnbGZGl_rtA2fogA,142
|
| 316 |
+
numpy/f2py/tests/src/cli/hi77.f,sha256=ttyI6vAP3qLnDqy82V04XmoqrXNM6uhMvvLri2p0dq0,71
|
| 317 |
+
numpy/f2py/tests/src/cli/hiworld.f90,sha256=QWOLPrTxYQu1yrEtyQMbM0fE9M2RmXe7c185KnD5x3o,51
|
| 318 |
+
numpy/f2py/tests/src/common/block.f,sha256=GQ0Pd-VMX3H3a-__f2SuosSdwNXHpBqoGnQDjf8aG9g,224
|
| 319 |
+
numpy/f2py/tests/src/common/gh19161.f90,sha256=BUejyhqpNVfHZHQ-QC7o7ZSo7lQ6YHyX08lSmQqs6YM,193
|
| 320 |
+
numpy/f2py/tests/src/crackfortran/accesstype.f90,sha256=-5Din7YlY1TU7tUHD2p-_DSTxGBpDsWYNeT9WOwGhno,208
|
| 321 |
+
numpy/f2py/tests/src/crackfortran/data_common.f,sha256=ZSUAh3uhn9CCF-cYqK5TNmosBGPfsuHBIEfudgysun4,193
|
| 322 |
+
numpy/f2py/tests/src/crackfortran/data_multiplier.f,sha256=jYrJKZWF_59JF9EMOSALUjn0UupWvp1teuGpcL5s1Sc,197
|
| 323 |
+
numpy/f2py/tests/src/crackfortran/data_stmts.f90,sha256=19YO7OGj0IksyBlmMLZGRBQLjoE3erfkR4tFvhznvvE,693
|
| 324 |
+
numpy/f2py/tests/src/crackfortran/data_with_comments.f,sha256=hoyXw330VHh8duMVmAQZjr1lgLVF4zFCIuEaUIrupv0,175
|
| 325 |
+
numpy/f2py/tests/src/crackfortran/foo_deps.f90,sha256=CaH7mnWTG7FcnJe2vXN_0zDbMadw6NCqK-JJ2HmDjK8,128
|
| 326 |
+
numpy/f2py/tests/src/crackfortran/gh15035.f,sha256=jJly1AzF5L9VxbVQ0vr-sf4LaUo4eQzJguhuemFxnvg,375
|
| 327 |
+
numpy/f2py/tests/src/crackfortran/gh17859.f,sha256=7K5dtOXGuBDAENPNCt-tAGJqTfNKz5OsqVSk16_e7Es,340
|
| 328 |
+
numpy/f2py/tests/src/crackfortran/gh22648.pyf,sha256=qZHPRNQljIeYNwbqPLxREnOrSdVV14f3fnaHqB1M7c0,241
|
| 329 |
+
numpy/f2py/tests/src/crackfortran/gh23533.f,sha256=w3tr_KcY3s7oSWGDmjfMHv5h0RYVGUpyXquNdNFOJQg,126
|
| 330 |
+
numpy/f2py/tests/src/crackfortran/gh23598.f90,sha256=41W6Ire-5wjJTTg6oAo7O1WZfd1Ug9vvNtNgHS5MhEU,101
|
| 331 |
+
numpy/f2py/tests/src/crackfortran/gh23598Warn.f90,sha256=1v-hMCT_K7prhhamoM20nMU9zILam84Hr-imck_dYYk,205
|
| 332 |
+
numpy/f2py/tests/src/crackfortran/gh23879.f90,sha256=LWDJTYR3t9h1IsrKC8dVXZlBfWX7clLeU006X6Ow8oI,332
|
| 333 |
+
numpy/f2py/tests/src/crackfortran/gh2848.f90,sha256=gPNasx98SIf7Z9ibk_DHiGKCvl7ERtsfoGXiFDT7FbM,282
|
| 334 |
+
numpy/f2py/tests/src/crackfortran/operators.f90,sha256=-Fc-qjW1wBr3Dkvdd5dMTrt0hnjnV-1AYo-NFWcwFSo,1184
|
| 335 |
+
numpy/f2py/tests/src/crackfortran/privatemod.f90,sha256=7bubZGMIn7iD31wDkjF1TlXCUM7naCIK69M9d0e3y-U,174
|
| 336 |
+
numpy/f2py/tests/src/crackfortran/publicmod.f90,sha256=Pnwyf56Qd6W3FUH-ZMgnXEYkb7gn18ptNTdwmGan0Jo,167
|
| 337 |
+
numpy/f2py/tests/src/crackfortran/pubprivmod.f90,sha256=eYpJwBYLKGOxVbKgEqfny1znib-b7uYhxcRXIf7uwXg,165
|
| 338 |
+
numpy/f2py/tests/src/crackfortran/unicode_comment.f90,sha256=aINLh6GlfTwFewxvDoqnMqwuCNb4XAqi5Nj5vXguXYs,98
|
| 339 |
+
numpy/f2py/tests/src/f2cmap/.f2py_f2cmap,sha256=iUOtfHd3OuT1Rz2-yiSgt4uPKGvCt5AzQ1iygJt_yjg,82
|
| 340 |
+
numpy/f2py/tests/src/f2cmap/isoFortranEnvMap.f90,sha256=iJCD8a8MUTmuPuedbcmxW54Nr4alYuLhksBe1sHS4K0,298
|
| 341 |
+
numpy/f2py/tests/src/isocintrin/isoCtests.f90,sha256=jcw-fzrFh0w5U66uJYfeUW4gv94L5MnWQ_NpsV9y0oI,998
|
| 342 |
+
numpy/f2py/tests/src/kind/foo.f90,sha256=zIHpw1KdkWbTzbXb73hPbCg4N2Htj3XL8DIwM7seXpo,347
|
| 343 |
+
numpy/f2py/tests/src/mixed/foo.f,sha256=90zmbSHloY1XQYcPb8B5d9bv9mCZx8Z8AMTtgDwJDz8,85
|
| 344 |
+
numpy/f2py/tests/src/mixed/foo_fixed.f90,sha256=pxKuPzxF3Kn5khyFq9ayCsQiolxB3SaNtcWaK5j6Rv4,179
|
| 345 |
+
numpy/f2py/tests/src/mixed/foo_free.f90,sha256=fIQ71wrBc00JUAVUj_r3QF9SdeNniBiMw6Ly7CGgPWU,139
|
| 346 |
+
numpy/f2py/tests/src/module_data/mod.mod,sha256=EkjrU7NTZrOH68yKrz6C_eyJMSFSxGgC2yMQT9Zscek,412
|
| 347 |
+
numpy/f2py/tests/src/module_data/module_data_docstring.f90,sha256=tDZ3fUlazLL8ThJm3VwNGJ75QIlLcW70NnMFv-JA4W0,224
|
| 348 |
+
numpy/f2py/tests/src/negative_bounds/issue_20853.f90,sha256=fdOPhRi7ipygwYCXcda7p_dlrws5Hd2GlpF9EZ-qnck,157
|
| 349 |
+
numpy/f2py/tests/src/parameter/constant_both.f90,sha256=-bBf2eqHb-uFxgo6Q7iAtVUUQzrGFqzhHDNaxwSICfQ,1939
|
| 350 |
+
numpy/f2py/tests/src/parameter/constant_compound.f90,sha256=re7pfzcuaquiOia53UT7qNNrTYu2euGKOF4IhoLmT6g,469
|
| 351 |
+
numpy/f2py/tests/src/parameter/constant_integer.f90,sha256=nEmMLitKoSAG7gBBEQLWumogN-KS3DBZOAZJWcSDnFw,612
|
| 352 |
+
numpy/f2py/tests/src/parameter/constant_non_compound.f90,sha256=IcxESVLKJUZ1k9uYKoSb8Hfm9-O_4rVnlkiUU2diy8Q,609
|
| 353 |
+
numpy/f2py/tests/src/parameter/constant_real.f90,sha256=quNbDsM1Ts2rN4WtPO67S9Xi_8l2cXabWRO00CPQSSQ,610
|
| 354 |
+
numpy/f2py/tests/src/quoted_character/foo.f,sha256=WjC9D9171fe2f7rkUAZUvik9bkIf9adByfRGzh6V0cM,482
|
| 355 |
+
numpy/f2py/tests/src/regression/gh25337/data.f90,sha256=9Uz8CHB9i3_mjC3cTOmkTgPAF5tWSwYacG3MUrU-SY0,180
|
| 356 |
+
numpy/f2py/tests/src/regression/gh25337/use_data.f90,sha256=WATiDGAoCKnGgMzm_iMgmfVU0UKOQlk5Fm0iXCmPAkE,179
|
| 357 |
+
numpy/f2py/tests/src/regression/inout.f90,sha256=CpHpgMrf0bqA1W3Ozo3vInDz0RP904S7LkpdAH6ODck,277
|
| 358 |
+
numpy/f2py/tests/src/return_character/foo77.f,sha256=WzDNF3d_hUDSSZjtxd3DtE-bSx1ilOMEviGyYHbcFgM,980
|
| 359 |
+
numpy/f2py/tests/src/return_character/foo90.f90,sha256=ULcETDEt7gXHRzmsMhPsGG4o3lGrcx-FEFaJsPGFKyA,1248
|
| 360 |
+
numpy/f2py/tests/src/return_complex/foo77.f,sha256=8ECRJkfX82oFvGWKbIrCvKjf5QQQClx4sSEvsbkB6A8,973
|
| 361 |
+
numpy/f2py/tests/src/return_complex/foo90.f90,sha256=c1BnrtWwL2dkrTr7wvlEqNDg59SeNMo3gyJuGdRwcDw,1238
|
| 362 |
+
numpy/f2py/tests/src/return_integer/foo77.f,sha256=_8k1evlzBwvgZ047ofpdcbwKdF8Bm3eQ7VYl2Y8b5kA,1178
|
| 363 |
+
numpy/f2py/tests/src/return_integer/foo90.f90,sha256=bzxbYtofivGRYH35Ang9ScnbNsVERN8-6ub5-eI-LGQ,1531
|
| 364 |
+
numpy/f2py/tests/src/return_logical/foo77.f,sha256=FxiF_X0HkyXHzJM2rLyTubZJu4JB-ObLnVqfZwAQFl8,1188
|
| 365 |
+
numpy/f2py/tests/src/return_logical/foo90.f90,sha256=9KmCe7yJYpi4ftkKOM3BCDnPOdBPTbUNrKxY3p37O14,1531
|
| 366 |
+
numpy/f2py/tests/src/return_real/foo77.f,sha256=ZTrzb6oDrIDPlrVWP3Bmtkbz3ffHaaSQoXkfTGtCuFE,933
|
| 367 |
+
numpy/f2py/tests/src/return_real/foo90.f90,sha256=gZuH5lj2lG6gqHlH766KQ3J4-Ero-G4WpOOo2MG3ohU,1194
|
| 368 |
+
numpy/f2py/tests/src/size/foo.f90,sha256=IlFAQazwBRr3zyT7v36-tV0-fXtB1d7WFp6S1JVMstg,815
|
| 369 |
+
numpy/f2py/tests/src/string/char.f90,sha256=ihr_BH9lY7eXcQpHHDQhFoKcbu7VMOX5QP2Tlr7xlaM,618
|
| 370 |
+
numpy/f2py/tests/src/string/fixed_string.f90,sha256=5n6IkuASFKgYICXY9foCVoqndfAY0AQZFEK8L8ARBGM,695
|
| 371 |
+
numpy/f2py/tests/src/string/gh24008.f,sha256=UA8Pr-_yplfOFmc6m4v9ryFQ8W9OulaglulefkFWD68,217
|
| 372 |
+
numpy/f2py/tests/src/string/gh24662.f90,sha256=-Tp9Kd1avvM7AIr8ZukFA9RVr-wusziAnE8AvG9QQI4,197
|
| 373 |
+
numpy/f2py/tests/src/string/gh25286.f90,sha256=2EpxvC-0_dA58MBfGQcLyHzpZgKcMf_W9c73C_Mqnok,304
|
| 374 |
+
numpy/f2py/tests/src/string/gh25286.pyf,sha256=GjgWKh1fHNdPGRiX5ek60i1XSeZsfFalydWqjISPVV8,381
|
| 375 |
+
numpy/f2py/tests/src/string/gh25286_bc.pyf,sha256=6Y9zU66NfcGhTXlFOdFjCSMSwKXpq5ZfAe3FwpkAsm4,384
|
| 376 |
+
numpy/f2py/tests/src/string/scalar_string.f90,sha256=ACxV2i6iPDk-a6L_Bs4jryVKYJMEGUTitEIYTjbJes4,176
|
| 377 |
+
numpy/f2py/tests/src/string/string.f,sha256=shr3fLVZaa6SyUJFYIF1OZuhff8v5lCwsVNBU2B-3pk,248
|
| 378 |
+
numpy/f2py/tests/src/value_attrspec/gh21665.f90,sha256=JC0FfVXsnB2lZHb-nGbySnxv_9VHAyD0mKaLDowczFU,190
|
| 379 |
+
numpy/f2py/tests/test_abstract_interface.py,sha256=C8-ly0_TqkmpQNZmwPHwo2IV2MBH0jQEjAhpqHrg8Y4,832
|
| 380 |
+
numpy/f2py/tests/test_array_from_pyobj.py,sha256=Txff89VUeEhWqUCRVybIqsqH4YQvpk4Uyjmh_XjyMi0,24049
|
| 381 |
+
numpy/f2py/tests/test_assumed_shape.py,sha256=FeaqtrWyBf5uyArcmI0D2e_f763aSMpgU3QmdDXe-tA,1466
|
| 382 |
+
numpy/f2py/tests/test_block_docstring.py,sha256=SEpuq73T9oVtHhRVilFf1xF7nb683d4-Kv7V0kfL4AA,564
|
| 383 |
+
numpy/f2py/tests/test_callback.py,sha256=cReSlVjgnoT74wmtNn-oEIZiJUTfRX7ljjlqJi716IQ,6494
|
| 384 |
+
numpy/f2py/tests/test_character.py,sha256=3ugjM1liymMRbY8wub1eiap-jdyNYVHxlNZBqNoRLe4,21868
|
| 385 |
+
numpy/f2py/tests/test_common.py,sha256=m7TTSJt5zUZKJF-MQUeTtCyxW7YwRBSETINXGPFu8S4,896
|
| 386 |
+
numpy/f2py/tests/test_compile_function.py,sha256=9d_FZ8P2wbIlQ2qPDRrsFqPb4nMH8tiWqYZN-P_shCs,4186
|
| 387 |
+
numpy/f2py/tests/test_crackfortran.py,sha256=y1x3U-jlQWD5rmTXz1I2RlTz7LEfbI6qxCDkR5fzPwY,13441
|
| 388 |
+
numpy/f2py/tests/test_data.py,sha256=HFcmPYbiveKa-swJ8x8XlRR9sM0ESB9FEN-txZnHTok,2876
|
| 389 |
+
numpy/f2py/tests/test_docs.py,sha256=jqtuHE5ZjxP4D8Of3Fkzz36F8_0qKbeS040_m0ac4v4,1662
|
| 390 |
+
numpy/f2py/tests/test_f2cmap.py,sha256=p-Sylbr3ctdKT3UQV9FzpCuYPH5U7Vyn8weXFAjiI9o,391
|
| 391 |
+
numpy/f2py/tests/test_f2py2e.py,sha256=eoswH-daMEBlueoVpxXrDloahCpr0RLzHbr3zBHOsjk,25423
|
| 392 |
+
numpy/f2py/tests/test_isoc.py,sha256=_nPTPxNEEagiKriZBeFNesOattIlHDzaNKmj35xxDBY,1406
|
| 393 |
+
numpy/f2py/tests/test_kind.py,sha256=aOMQSBoD_dw49acKN25_abEvQBLI27DsnWIb9CNpSAE,1671
|
| 394 |
+
numpy/f2py/tests/test_mixed.py,sha256=Ctuw-H7DxhPjSt7wZdJ2xffawIoEBCPWc5F7PSkY4HY,848
|
| 395 |
+
numpy/f2py/tests/test_module_doc.py,sha256=sjCXWIKrqMD1NQ1DUAzgQqkjS5w9h9gvM_Lj29Rdcrg,863
|
| 396 |
+
numpy/f2py/tests/test_parameter.py,sha256=ADI7EV_CM4ztICpqHqeq8LI-WdB6cX0ttatdRdjbsUA,3941
|
| 397 |
+
numpy/f2py/tests/test_pyf_src.py,sha256=eD0bZu_GWfoCq--wWqEKRf-F2h5AwoTyO6GMA9wJPr4,1135
|
| 398 |
+
numpy/f2py/tests/test_quoted_character.py,sha256=cpjMdrHwimnkoJkXd_W_FSlh43oWytY5VHySW9oskO4,454
|
| 399 |
+
numpy/f2py/tests/test_regression.py,sha256=v_6RDQr6IcMmbCMElfzRSLPgZhHnH5l99uztrbJAzqE,2532
|
| 400 |
+
numpy/f2py/tests/test_return_character.py,sha256=18HJtiRwQ7a_2mdPUonD5forKWZJEapD-Vi1DsbTjVs,1493
|
| 401 |
+
numpy/f2py/tests/test_return_complex.py,sha256=BZIIqQ1abdiPLgVmu03_q37yCtND0ijxGSMhGz2Wf-o,2397
|
| 402 |
+
numpy/f2py/tests/test_return_integer.py,sha256=t--9UsdLF9flLTQv7a0KTSVoBuoDtTnmOG2QIFPINVc,1758
|
| 403 |
+
numpy/f2py/tests/test_return_logical.py,sha256=XCmp8E8I6BOeNYF59HjSFAdv1hM9WaDvl8UDS10_05o,2017
|
| 404 |
+
numpy/f2py/tests/test_return_real.py,sha256=ATek5AM7dCCPeIvoMOQIt5yFNFzKrFb1Kno8B4M0rn4,3235
|
| 405 |
+
numpy/f2py/tests/test_semicolon_split.py,sha256=_Mdsi84lES18pPjl9J-QsbGttV4tPFFjZvJvejNcqPc,1635
|
| 406 |
+
numpy/f2py/tests/test_size.py,sha256=q6YqQvcyqdXJeWbGijTiCbxyEG3EkPcvT8AlAW6RCMo,1164
|
| 407 |
+
numpy/f2py/tests/test_string.py,sha256=5xZOfdReoHnId0950XfmtfduPPfBbtMkzBoXMtygvMk,2962
|
| 408 |
+
numpy/f2py/tests/test_symbolic.py,sha256=28quk2kTKfWhKe56n4vINJ8G9weKBfc7HysMlE9J3_g,18341
|
| 409 |
+
numpy/f2py/tests/test_value_attrspec.py,sha256=rWwJBfE2qGzqilZZurJ-7ucNoJDICye6lLetQSLFees,323
|
| 410 |
+
numpy/f2py/tests/util.py,sha256=bEhG699c4bLVPR2WR8fV67avgX6kH5I74SicGb7Z7T4,11167
|
| 411 |
+
numpy/f2py/use_rules.py,sha256=3pTDOPur6gbPHPtwuMJPQvpnUMw39Law1KFSH0coB_0,3527
|
| 412 |
+
numpy/fft/__init__.py,sha256=HqjmF6s_dh0Ri4UZzUDtOKbNUyfAfJAWew3e3EL_KUk,8175
|
| 413 |
+
numpy/fft/__init__.pyi,sha256=vD9Xzz5r13caF4AVL87Y4U9KOj9ic25Vci_wb3dmgpk,550
|
| 414 |
+
numpy/fft/_pocketfft.py,sha256=Xkm8wcP4JyBNMbp0ZoHIWhNDlgliX24RzrDuo29uRks,52897
|
| 415 |
+
numpy/fft/_pocketfft.pyi,sha256=S6-ylUuHbgm8vNbh7tLru6K2R5SJzE81BC_Sllm6QrQ,2371
|
| 416 |
+
numpy/fft/_pocketfft_internal.cpython-312-x86_64-linux-gnu.so,sha256=ONIiSfNRsdUOkmnFloif_GOGOevBUMHnX8n1Wg8zGrU,97008
|
| 417 |
+
numpy/fft/helper.py,sha256=aNj1AcLvtfoX26RiLOwcR-k2QSMuBZkGj2Fu0CeFPJs,6154
|
| 418 |
+
numpy/fft/helper.pyi,sha256=NLTEjy2Gz1aAMDZwCgssIyUne0ubjJqukfYkpsL3gXM,1176
|
| 419 |
+
numpy/fft/tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
| 420 |
+
numpy/fft/tests/test_helper.py,sha256=whgeaQ8PzFf3B1wkbXobGZ5sF4WxPp4gf1UPUVZest8,6148
|
| 421 |
+
numpy/fft/tests/test_pocketfft.py,sha256=RdeCCvUQmJYVvccOJwToobTKDg9yzUL06o9MkPmRfmI,12895
|
| 422 |
+
numpy/lib/__init__.py,sha256=XMPNJkG_mQ__xuvbf0OcpotgMbA9owt10ZHYVnYHq8E,2713
|
| 423 |
+
numpy/lib/__init__.pyi,sha256=y5ANokFm7EkrlNoHdeQm1FsUhLFxkYtLuanCbsWrGio,5596
|
| 424 |
+
numpy/lib/_datasource.py,sha256=CDF3im6IxdY3Mu6fwRQmkSEBmXS3kQVInQ4plXsoX9c,22631
|
| 425 |
+
numpy/lib/_iotools.py,sha256=Yg9HCfPg4tbhbdgLPcxSMiZXq1xDprvJKLebLwhDszY,30868
|
| 426 |
+
numpy/lib/_version.py,sha256=6vK7czNSB_KrWx2rZJzJ1pyOc73Q07hAgfLB5ItUCnU,4855
|
| 427 |
+
numpy/lib/_version.pyi,sha256=B572hyWrUWG-TAAAXrNNAT4AgyUAmJ4lvgpwMkDzunk,633
|
| 428 |
+
numpy/lib/arraypad.py,sha256=bKP7ZS9NYFYzqSk8OnpFLFrMsua4m_hcqFsi7cGkrJE,31803
|
| 429 |
+
numpy/lib/arraypad.pyi,sha256=ADXphtAORYl3EqvE5qs_u32B_TALKSOtF43jOLmoxRw,1728
|
| 430 |
+
numpy/lib/arraysetops.py,sha256=GJ2RhkzIJmIbwyG6h3LOFTPXg62kM9tcV1a-7tdbVuU,33655
|
| 431 |
+
numpy/lib/arraysetops.pyi,sha256=6X-5l5Yss_9y10LYyIsDLbGX77vt7PtVLDqxOlSRPfY,8372
|
| 432 |
+
numpy/lib/arrayterator.py,sha256=BQ97S00zvfURUZfes0GZo-5hydYNRuvwX1I1bLzeRik,7063
|
| 433 |
+
numpy/lib/arrayterator.pyi,sha256=f7Pwp83_6DiMYmJGUsffncM-FRAynB1iYGvhmHM_SZE,1537
|
| 434 |
+
numpy/lib/format.py,sha256=T8qJMyG2DDVjjYNNpUvBgfA9tCo23IS0w9byRB6twwQ,34769
|
| 435 |
+
numpy/lib/format.pyi,sha256=YWBxC3GdsZ7SKBN8I7nMwWeVuFD1aT9d-VJ8zE4-P-o,748
|
| 436 |
+
numpy/lib/function_base.py,sha256=IhhgfSmYJE-dHoUOMXHPiGYXso-NdXPpLXF9y0gEA6I,189172
|
| 437 |
+
numpy/lib/function_base.pyi,sha256=KWaC5UOBANU4hiIoN2eptE4HYsm4vgp_8BMFV1Y3JX4,16585
|
| 438 |
+
numpy/lib/histograms.py,sha256=xsj_qpaZoI2Bv1FBpY8mIMPJrYRiuIBszn_6kO7YFRA,37778
|
| 439 |
+
numpy/lib/histograms.pyi,sha256=hNwR2xYWkgJCP-nfRGxc-EgHLTD3qm4zmWXthZLt08M,995
|
| 440 |
+
numpy/lib/index_tricks.py,sha256=4PEvXk6VFTkttMViYBVC4yDhyOiKIon6JpIm0d_CmNg,31346
|
| 441 |
+
numpy/lib/index_tricks.pyi,sha256=D2nkNXOB9Vea1PfMaTn94OGBGayjTaQ-bKMsjDmYpak,4251
|
| 442 |
+
numpy/lib/mixins.py,sha256=y6_MzQuiNjv-1EFVROqv2y2cAJi5X4rQYzbZCyUyXgw,7071
|
| 443 |
+
numpy/lib/mixins.pyi,sha256=h9N1kbZsUntF0zjOxPYeD_rCB2dMiG35TYYPl9ymkI4,3117
|
| 444 |
+
numpy/lib/nanfunctions.py,sha256=6EjzydZlugIzfiENKtC4ycZ2Nckt8ZQg5v6D6tX1SiU,65775
|
| 445 |
+
numpy/lib/nanfunctions.pyi,sha256=oPqAfCinmBL85Ji7ko4QlzAzLAK9nZL0t2_CllEbCEU,606
|
| 446 |
+
numpy/lib/npyio.py,sha256=NUjtFvAmPdTjwJQ-ia-xbCr849M_M6NilP5IHfkKaRg,97316
|
| 447 |
+
numpy/lib/npyio.pyi,sha256=SUFWJh90vWZCdd6GCSGbfYeXKlWut0XY_SHvZJc8yqY,9728
|
| 448 |
+
numpy/lib/polynomial.py,sha256=6Aw3_2vdbh4urERQ6NaPhf9a_T1o1o6cjm3fb5Z3_YE,44133
|
| 449 |
+
numpy/lib/polynomial.pyi,sha256=GerIpQnf5LdtFMOy9AxhOTqUyfn57k4MxqEYrfdckWE,6958
|
| 450 |
+
numpy/lib/recfunctions.py,sha256=-90AbWWvVFOqVUPLh9K9NYdKUHYIgSEyg2Y35MnOVUA,59423
|
| 451 |
+
numpy/lib/scimath.py,sha256=T4ITysZgqhY1J8IxyXCtioHjMTg2ci-4i3mr9TBF2UA,15037
|
| 452 |
+
numpy/lib/scimath.pyi,sha256=E2roKJzMFwWSyhLu8UPUr54WOpxF8jp_pyXYBgsUSQ8,2883
|
| 453 |
+
numpy/lib/setup.py,sha256=0K5NJKuvKvNEWp-EX7j0ODi3ZQQgIMHobzSFJq3G7yM,405
|
| 454 |
+
numpy/lib/shape_base.py,sha256=AhCO9DEyysE-P-QJF9ryUtJ1ghU4_0mORhAJ59poObU,38947
|
| 455 |
+
numpy/lib/shape_base.pyi,sha256=bGJhLA_RvUpVTiDFgCV-1rUjV8e1qCh0gK_3PLgXA_U,5341
|
| 456 |
+
numpy/lib/stride_tricks.py,sha256=brY5b-0YQJuIH2CavfpIinMolyTUv5k9DUvLoZ-imis,17911
|
| 457 |
+
numpy/lib/stride_tricks.pyi,sha256=0pQ4DP9l6g21q2Ajv6dJFRWMr9auPGTNV9BmZUbogPY,1747
|
| 458 |
+
numpy/lib/tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
| 459 |
+
numpy/lib/tests/data/py2-objarr.npy,sha256=F4cyUC-_TB9QSFLAo2c7c44rC6NUYIgrfGx9PqWPSKk,258
|
| 460 |
+
numpy/lib/tests/data/py2-objarr.npz,sha256=xo13HBT0FbFZ2qvZz0LWGDb3SuQASSaXh7rKfVcJjx4,366
|
| 461 |
+
numpy/lib/tests/data/py3-objarr.npy,sha256=pTTVh8ezp-lwAK3fkgvdKU8Arp5NMKznVD-M6Ex_uA0,341
|
| 462 |
+
numpy/lib/tests/data/py3-objarr.npz,sha256=qQR0gS57e9ta16d_vCQjaaKM74gPdlwCPkp55P-qrdw,449
|
| 463 |
+
numpy/lib/tests/data/python3.npy,sha256=X0ad3hAaLGXig9LtSHAo-BgOvLlFfPYMnZuVIxRmj-0,96
|
| 464 |
+
numpy/lib/tests/data/win64python2.npy,sha256=agOcgHVYFJrV-nrRJDbGnUnF4ZTPYXuSeF-Mtg7GMpc,96
|
| 465 |
+
numpy/lib/tests/test__datasource.py,sha256=65KXfUUvp8wXSqgQisuYlkhg-qHjBV5FXYetL8Ba-rc,10571
|
| 466 |
+
numpy/lib/tests/test__iotools.py,sha256=HerCqvDE07JxjFQlWEfpZO7lC9z0Sbr3z20GSutoCPs,13743
|
| 467 |
+
numpy/lib/tests/test__version.py,sha256=aO3YgkAohLsLzCNQ7vjIwdpFUMz0cPLbcuuxIkjuN74,1999
|
| 468 |
+
numpy/lib/tests/test_arraypad.py,sha256=obohHbyM0gPYPUkd7iJSOSiDqyqtJsjDNtQX68NC4lM,54830
|
| 469 |
+
numpy/lib/tests/test_arraysetops.py,sha256=5-T1MVhfIMivat8Z47GZw0ZaR811W_FskM1bAXnFyLU,35912
|
| 470 |
+
numpy/lib/tests/test_arrayterator.py,sha256=AYs2SwV5ankgwnvKI9RSO1jZck118nu3SyZ4ngzZNso,1291
|
| 471 |
+
numpy/lib/tests/test_financial_expired.py,sha256=yq5mqGMvqpkiiw9CuZhJgrYa7Squj1mXr_G-IvAFgwI,247
|
| 472 |
+
numpy/lib/tests/test_format.py,sha256=xV0oi1eoRnVwAAhSOcPFQHQWF7TfsROtDYShQLPtdaA,41028
|
| 473 |
+
numpy/lib/tests/test_function_base.py,sha256=DBKugIUEFTMP7g6iL1bk986E6ldCrcNdBCWOJbQla_Y,157830
|
| 474 |
+
numpy/lib/tests/test_histograms.py,sha256=16_XJp-eFgsuM8B4mDQpQ4w_Ib29Hg0EPO-WFsdaFWA,32815
|
| 475 |
+
numpy/lib/tests/test_index_tricks.py,sha256=Vjz25Y6H_ih0iEE2AG0kaxO9U8PwcXSrofzqnN4XBwI,20256
|
| 476 |
+
numpy/lib/tests/test_io.py,sha256=3Tow1pucrQ7z7osNN4a2grBYUoBGNkQEhjmCjXT6Vag,107891
|
| 477 |
+
numpy/lib/tests/test_loadtxt.py,sha256=gwcDJDJmLJRMLpg322yjQ1IzI505w9EqJoq4DmDPCdI,38560
|
| 478 |
+
numpy/lib/tests/test_mixins.py,sha256=Wivwz3XBWsEozGzrzsyyvL3qAuE14t1BHk2LPm9Z9Zc,7030
|
| 479 |
+
numpy/lib/tests/test_nanfunctions.py,sha256=01r_mmTCvKVdZuOGTEHNDZXrMS724us_jwZANzCd74A,47609
|
| 480 |
+
numpy/lib/tests/test_packbits.py,sha256=OWGAd5g5GG0gl7WHqNfwkZ7G-2rrtLt2sI854PG4nnw,17546
|
| 481 |
+
numpy/lib/tests/test_polynomial.py,sha256=URouxJpr8FQ5hiKybqhtOcLA7e-3hj4kWzjLBROByyA,11395
|
| 482 |
+
numpy/lib/tests/test_recfunctions.py,sha256=6jzouPEQ7Uhtj8_-W5yTI6ymNp2nLgmdHzxdd74jVuM,44001
|
| 483 |
+
numpy/lib/tests/test_regression.py,sha256=KzGFkhTcvEG97mymoOQ2hP2CEr2nPZou0Ztf4-WaXCs,8257
|
| 484 |
+
numpy/lib/tests/test_shape_base.py,sha256=2iQCEFR6evVpF8woaenxUOzooHkfuMYkBaUj8ecyJ-E,26817
|
| 485 |
+
numpy/lib/tests/test_stride_tricks.py,sha256=wprpWWH5eq07DY7rzG0WDv5fMtLxzRQz6fm6TZWlScQ,22849
|
| 486 |
+
numpy/lib/tests/test_twodim_base.py,sha256=ll-72RhqCItIPB97nOWhH7H292h4nVIX_w1toKTPMUg,18841
|
| 487 |
+
numpy/lib/tests/test_type_check.py,sha256=lxCH5aApWVYhhSoDQSLDTCHLVHuK2c-jBbnfnZUrOaA,15114
|
| 488 |
+
numpy/lib/tests/test_ufunclike.py,sha256=4hSnXGlSC8HE-_pRRMzD8-HI4hGHqsAWu1pD0o2kPI0,2982
|
| 489 |
+
numpy/lib/tests/test_utils.py,sha256=RVAxrzSFu6N3C4_jIgAlTDOWF_B7wr2v1Y20dX5upYM,6218
|
| 490 |
+
numpy/lib/twodim_base.py,sha256=Mvzn_PyShIb9m7nJjJ4IetdxwmLYEsCPHvJoK7n2viU,32947
|
| 491 |
+
numpy/lib/twodim_base.pyi,sha256=xFRcEVJdDj4mrXW_6iVP1lTMoJx4QJjYRD3o2_9f2eY,5370
|
| 492 |
+
numpy/lib/type_check.py,sha256=_EOtB296nFYlNT7ztBYoC_yK9aycIb0KTmRjvzVdZNg,19954
|
| 493 |
+
numpy/lib/type_check.pyi,sha256=LPvAvIxU-p5i_Qe-ic7hEvo4OTfSrNpplxMG7OAZe8Q,5571
|
| 494 |
+
numpy/lib/ufunclike.py,sha256=_ceBGbGCMOd3u_h2UVzyaRK6ZY7ryoJ0GJB7zqcJG3w,6325
|
| 495 |
+
numpy/lib/ufunclike.pyi,sha256=hLxcYfQprh1tTY_UO2QscA3Hd9Zd7cVGXIINZLhMFqY,1293
|
| 496 |
+
numpy/lib/user_array.py,sha256=LE958--CMkBI2r3l1SQxmCHdCSw6HY6-RhWCnduzGA4,7721
|
| 497 |
+
numpy/lib/utils.py,sha256=6NdleaELZiqARdj-ECZjxtwLf1bqklOcK43m9yoZefs,37804
|
| 498 |
+
numpy/lib/utils.pyi,sha256=mVHVzWuc2-M3Oz60lFsbok0v8LH_HRHMjZpXwrtzF_c,2360
|
| 499 |
+
numpy/linalg/__init__.py,sha256=mpdlEXWtTvpF7In776ONLwp6RIyo4U_GLPT1L1eIJnw,1813
|
| 500 |
+
numpy/linalg/__init__.pyi,sha256=XBy4ocuypsRVflw_mbSTUhR4N5Roemu6w5SfeVwbkAc,620
|
| 501 |
+
numpy/linalg/_umath_linalg.cpython-312-x86_64-linux-gnu.so,sha256=iCLnctdD1AWYPxucazS3BN0pd4CJDcJFRU8Qga31Ckw,216793
|
| 502 |
+
numpy/linalg/lapack_lite.cpython-312-x86_64-linux-gnu.so,sha256=UAZPuN2wY1u7YCi4990o-QwErZqxw_rd0RF8K7fcj_0,29849
|
| 503 |
+
numpy/linalg/linalg.py,sha256=kDVK1GBxbUjlRgxXCoEfkRJm8yrNr1Iu7hMn2rKK8RE,90923
|
| 504 |
+
numpy/linalg/linalg.pyi,sha256=zD9U5BUCB1uQggSxfZaTGX_uB2Hkp75sttGmZbCGgBI,7505
|
| 505 |
+
numpy/linalg/tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
| 506 |
+
numpy/linalg/tests/test_deprecations.py,sha256=9p_SRmtxj2zc1doY9Ie3dyy5JzWy-tCQWFoajcAJUmM,640
|
| 507 |
+
numpy/linalg/tests/test_linalg.py,sha256=rgvmK6Or70u8mN04puetL3FgSxZ8fJrOlI5ptTgCU5k,78085
|
| 508 |
+
numpy/linalg/tests/test_regression.py,sha256=qbugUmrENybkEaM1GhfA01RXQUy8AkzalbrfzSIgUmM,5434
|
| 509 |
+
numpy/ma/API_CHANGES.txt,sha256=F_4jW8X5cYBbzpcwteymkonTmvzgKKY2kGrHF1AtnrI,3405
|
| 510 |
+
numpy/ma/LICENSE,sha256=BfO4g1GYjs-tEKvpLAxQ5YdcZFLVAJoAhMwpFVH_zKY,1593
|
| 511 |
+
numpy/ma/README.rst,sha256=q-gCsZ4Cw_gUGGvEjog556sJUHIm8WTAwkFK5Qnz9XA,9872
|
| 512 |
+
numpy/ma/__init__.py,sha256=dgP0WdnOpph28Fd6UiqoyDKhfrct0H6QWqbCcETsk6M,1404
|
| 513 |
+
numpy/ma/__init__.pyi,sha256=ppCg_TS0POutNB3moJE4kBabWURnc0WGXyYPquXZxS4,6063
|
| 514 |
+
numpy/ma/core.py,sha256=4MglVRJtmQ9_iIVaQ2b-_Vmw1TjAhEsMJdtKOhyBFXQ,278213
|
| 515 |
+
numpy/ma/core.pyi,sha256=YfgyuBuKxZ5v4I2JxZDvCLhnztOCRgzTeDg-JGTon_M,14305
|
| 516 |
+
numpy/ma/extras.py,sha256=MC7QPS34PC4wxNbOp7pTy57dqF9B-L6L1KMI6rrfe2w,64383
|
| 517 |
+
numpy/ma/extras.pyi,sha256=BBsiCZbaPpGCY506fkmqZdBkJNCXcglc3wcSBuAACNk,2646
|
| 518 |
+
numpy/ma/mrecords.py,sha256=degd6dLaDEvEWNHmvSnUZXos1csIzaqjR_jAutm8JfI,27232
|
| 519 |
+
numpy/ma/mrecords.pyi,sha256=r1a2I662ywnhGS6zvfcyK-9RHVvb4sHxiCx9Dhf5AE4,1934
|
| 520 |
+
numpy/ma/setup.py,sha256=MqmMicr_xHkAGoG-T7NJ4YdUZIJLO4ZFp6AmEJDlyhw,418
|
| 521 |
+
numpy/ma/tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
| 522 |
+
numpy/ma/tests/test_core.py,sha256=xd5S3oa0jObo8jnsJk0-o46d-KNC3RtgNRKinJeY_kE,215100
|
| 523 |
+
numpy/ma/tests/test_deprecations.py,sha256=nq_wFVt2EBHcT3AHxattfKXx2JDf1K5D-QBzUU0_15A,2566
|
| 524 |
+
numpy/ma/tests/test_extras.py,sha256=lX4cbdGDEXaBHzA3q8hJxve4635XCJw4AP7FO7zhOfk,74858
|
| 525 |
+
numpy/ma/tests/test_mrecords.py,sha256=PsJhUlABgdpSsPUeijonfyFNqz5AfNSGQTtJUte7yts,19890
|
| 526 |
+
numpy/ma/tests/test_old_ma.py,sha256=h4BncexBcBigqvZMA6RjDjpHPurWtt99A7KTag2rmOs,32690
|
| 527 |
+
numpy/ma/tests/test_regression.py,sha256=foMpI0luAvwkkRpAfPDV_810h1URISXDZhmaNhxb50k,3287
|
| 528 |
+
numpy/ma/tests/test_subclassing.py,sha256=HeTIE_n1I8atwzF8tpvNtGHp-0dmM8PT8AS4IDWbcso,16967
|
| 529 |
+
numpy/ma/testutils.py,sha256=RQw0RyS7hOSVTk4KrCGleq0VHlnDqzwwaLtuZbRE4_I,10235
|
| 530 |
+
numpy/ma/timer_comparison.py,sha256=pIGSZG-qYYYlRWSTgzPlyCAINbGKhXrZrDZBBjiM080,15658
|
| 531 |
+
numpy/matlib.py,sha256=-54vTuGIgeTMg9ZUmElRPZ4Hr-XZ-om9xLzAsSoTvnc,10465
|
| 532 |
+
numpy/matrixlib/__init__.py,sha256=BHBpQKoQv4EjT0UpWBA-Ck4L5OsMqTI2IuY24p-ucXk,242
|
| 533 |
+
numpy/matrixlib/__init__.pyi,sha256=-t3ZuvbzRuRwWfZOeN4xlNWdm7gQEprhUsWzu8MRvUE,252
|
| 534 |
+
numpy/matrixlib/defmatrix.py,sha256=JXdJGm1LayOOXfKpp7OVZfb0pzzP4Lwh45sTJrleALc,30656
|
| 535 |
+
numpy/matrixlib/defmatrix.pyi,sha256=lmBMRahKcMOl2PHDo79J67VRAZOkI54BzfDaTLpE0LI,451
|
| 536 |
+
numpy/matrixlib/setup.py,sha256=1r7JRkSM4HyVorgtjoKJGWLcOcPO3wmvivpeEsVtAEg,426
|
| 537 |
+
numpy/matrixlib/tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
| 538 |
+
numpy/matrixlib/tests/test_defmatrix.py,sha256=8E_-y7VD2vsq1y8CcI8km37pp5qcAtkciO16xqf2UIs,14982
|
| 539 |
+
numpy/matrixlib/tests/test_interaction.py,sha256=PpjmgjEKighDXvt38labKE6L7f2jP74UEmp3JRb_iOY,11875
|
| 540 |
+
numpy/matrixlib/tests/test_masked_matrix.py,sha256=7YO_LCO8DOhW3CuXJuxH93rnmttfvHnU7El-MBzxzFw,8932
|
| 541 |
+
numpy/matrixlib/tests/test_matrix_linalg.py,sha256=ObbSUXU4R2pWajH__xAdizADrU2kBKDDCxkDV-oVBXc,2059
|
| 542 |
+
numpy/matrixlib/tests/test_multiarray.py,sha256=jB3XCBmAtcqf-Wb9PwBW6uIykPpMPthuXLJ0giTKzZE,554
|
| 543 |
+
numpy/matrixlib/tests/test_numeric.py,sha256=MP70qUwgshTtThKZaZDp7_6U-Z66NIV1geVhasGXejQ,441
|
| 544 |
+
numpy/matrixlib/tests/test_regression.py,sha256=8sHDtO8Zi8p3a1eQKEWxtCmKrXmHoD3qxlIokg2AIAU,927
|
| 545 |
+
numpy/polynomial/__init__.py,sha256=braLh6zP2QwuNKRKAaZGdC_qKWZ-tJlc3BN83LeuE_0,6781
|
| 546 |
+
numpy/polynomial/__init__.pyi,sha256=W8szYtVUy0RUi83jmFLK58BN8CKVSoHA2CW7IcdUl1c,701
|
| 547 |
+
numpy/polynomial/_polybase.py,sha256=YEnnQwlTgbn3dyD89ueraUx5nxx3x_pH6K6mmyEmhi8,39271
|
| 548 |
+
numpy/polynomial/_polybase.pyi,sha256=J7yU9PPZW4W8mkqAltDfnL4ZNwljuM-bDEj4DPTJZpY,2321
|
| 549 |
+
numpy/polynomial/chebyshev.py,sha256=NZCKjIblcX99foqZyp51i0_r8p0r1VKVGZFmQ1__kEk,62796
|
| 550 |
+
numpy/polynomial/chebyshev.pyi,sha256=035CNdOas4dnb6lFLzRiBrYT_VnWh2T1-A3ibm_HYkI,1387
|
| 551 |
+
numpy/polynomial/hermite.py,sha256=t5CFM-qE4tszYJiQZ301VcMn7IM67y2rUZPFPtnVRAc,52514
|
| 552 |
+
numpy/polynomial/hermite.pyi,sha256=hdsvTULow8bIjnATudf0i6brpLHV7vbOoHzaMvbjMy0,1217
|
| 553 |
+
numpy/polynomial/hermite_e.py,sha256=jRR3f8Oth8poV2Ix8c0eLEQR3UZary-2RupOrEAEUMY,52642
|
| 554 |
+
numpy/polynomial/hermite_e.pyi,sha256=zV7msb9v9rV0iv_rnD3SjP-TGyc6pd3maCqiPCj3PbA,1238
|
| 555 |
+
numpy/polynomial/laguerre.py,sha256=mcVw0ckWVX-kzJ1QIhdcuuxzPjuFmA3plQLkloQMOYM,50858
|
| 556 |
+
numpy/polynomial/laguerre.pyi,sha256=Gxc9SLISNKMWrKdsVJ9fKFFFwfxxZzfF-Yc-2r__z5M,1178
|
| 557 |
+
numpy/polynomial/legendre.py,sha256=wjtgFajmKEbYkSUk3vWSCveMHDP6UymK28bNUk4Ov0s,51550
|
| 558 |
+
numpy/polynomial/legendre.pyi,sha256=9dmANwkxf7EbOHV3XQBPoaDtc56cCkf75Wo7FG9Zfj4,1178
|
| 559 |
+
numpy/polynomial/polynomial.py,sha256=XsaZPHmLGJFqpJs7rPvO5E0loWQ1L3YHLIUybVu4dU8,49112
|
| 560 |
+
numpy/polynomial/polynomial.pyi,sha256=bOPRnub4xXxsUwNGeiQLTT4PCfN1ysSrf6LBZIcAN2Y,1132
|
| 561 |
+
numpy/polynomial/polyutils.py,sha256=Xy5qjdrjnRaqSlClG1ROmwWccLkAPC7IcHaNJLvhCf4,23237
|
| 562 |
+
numpy/polynomial/polyutils.pyi,sha256=cFAyZ9Xzuw8Huhn9FEz4bhyD00m2Dp-2DiUSyogJwSo,264
|
| 563 |
+
numpy/polynomial/setup.py,sha256=dXQfzVUMP9OcB6iKv5yo1GLEwFB3gJ48phIgo4N-eM0,373
|
| 564 |
+
numpy/polynomial/tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
| 565 |
+
numpy/polynomial/tests/test_chebyshev.py,sha256=6tMsFP1h7K8Zf72mNOta6Tv52_fVTlXknseuffj080c,20522
|
| 566 |
+
numpy/polynomial/tests/test_classes.py,sha256=DFyY2IQBj3r2GZkvbRIeZO2EEY466xbuwc4PShAl4Sw,18331
|
| 567 |
+
numpy/polynomial/tests/test_hermite.py,sha256=N9b2dx2UWPyja5v02dSoWYPnKvb6H-Ozgtrx-xjWz2k,18577
|
| 568 |
+
numpy/polynomial/tests/test_hermite_e.py,sha256=_A3ohAWS4HXrQG06S8L47dImdZGTwYosCXnoyw7L45o,18911
|
| 569 |
+
numpy/polynomial/tests/test_laguerre.py,sha256=BZOgs49VBXOFBepHopxuEDkIROHEvFBfWe4X73UZhn8,17511
|
| 570 |
+
numpy/polynomial/tests/test_legendre.py,sha256=b_bblHs0F_BWw9ESuSq52ZsLKcQKFR5eqPf_SppWFqo,18673
|
| 571 |
+
numpy/polynomial/tests/test_polynomial.py,sha256=4cuO8-5wdIxcz5CrucB5Ix7ySuMROokUF12F7ogQ_hc,20529
|
| 572 |
+
numpy/polynomial/tests/test_polyutils.py,sha256=IxkbVfpcBqe5lOZluHFUPbLATLu1rwVg7ghLASpfYrY,3579
|
| 573 |
+
numpy/polynomial/tests/test_printing.py,sha256=rfP4MaQbjGcO52faHmYrgsaarkm3Ndi3onwr6DDuapE,20525
|
| 574 |
+
numpy/polynomial/tests/test_symbol.py,sha256=msTPv7B1niaKujU33kuZmdxJvLYvOjfl1oykmlL0dXo,5371
|
| 575 |
+
numpy/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
| 576 |
+
numpy/random/LICENSE.md,sha256=EDFmtiuARDr7nrNIjgUuoGvgz_VmuQjxmeVh_eSa8Z8,3511
|
| 577 |
+
numpy/random/__init__.pxd,sha256=9JbnX540aJNSothGs-7e23ozhilG6U8tINOUEp08M_k,431
|
| 578 |
+
numpy/random/__init__.py,sha256=81Thnexg5umN5WZwD5TRyzNc2Yp-d14B6UC7NBgVKh8,7506
|
| 579 |
+
numpy/random/__init__.pyi,sha256=RfW8mco48UaWDL1UC5ROv9vXiFZ9EGho62avhgEAHPc,2143
|
| 580 |
+
numpy/random/_bounded_integers.cpython-312-x86_64-linux-gnu.so,sha256=s59-K0zP1pBK5g_hUX9r2ovng1tb9p1U3sDWk8Xot5M,348704
|
| 581 |
+
numpy/random/_bounded_integers.pxd,sha256=hcoucPH5hkFEM2nm12zYO-5O_Rt8RujEXT5YWuAzl1Q,1669
|
| 582 |
+
numpy/random/_common.cpython-312-x86_64-linux-gnu.so,sha256=q9iMqPRH8ixPUfImc000cylmuuYe3SqiX3S_7JVL7ig,258888
|
| 583 |
+
numpy/random/_common.pxd,sha256=s2_IdIQ0MhNbogamulvXe-b93wbx882onmYkxqswwpo,4939
|
| 584 |
+
numpy/random/_examples/cffi/extending.py,sha256=xSla3zWqxi6Hj48EvnYfD3WHfE189VvC4XsKu4_T_Iw,880
|
| 585 |
+
numpy/random/_examples/cffi/parse.py,sha256=Bnb7t_6S_c5-3dZrQ-XX9EazOKhftUfcCejXXWyd1EU,1771
|
| 586 |
+
numpy/random/_examples/cython/extending.pyx,sha256=4IE692pq1V53UhPZqQiQGcIHXDoNyqTx62x5a36puVg,2290
|
| 587 |
+
numpy/random/_examples/cython/extending_distributions.pyx,sha256=oazFVWeemfE0eDzax7r7MMHNL1_Yofws2m-c_KT2Hbo,3870
|
| 588 |
+
numpy/random/_examples/cython/meson.build,sha256=rXtugURMEo-ef4bPE1QIv4mzvWbeGjmcTdKCBvjxjtw,1443
|
| 589 |
+
numpy/random/_examples/numba/extending.py,sha256=Ipyzel_h5iU_DMJ_vnXUgQC38uMDMn7adUpWSeEQLFE,1957
|
| 590 |
+
numpy/random/_examples/numba/extending_distributions.py,sha256=Jnr9aWkHyIWygNbdae32GVURK-5T9BTGhuExRpvve98,2034
|
| 591 |
+
numpy/random/_generator.cpython-312-x86_64-linux-gnu.so,sha256=Wz7yrIt4qoO8hptw4w4qcPvTqzc8UlPtbrqZgqVf1-I,946872
|
| 592 |
+
numpy/random/_generator.pyi,sha256=zRvo_y6g0pWkE4fO1M9jLYUkxDfGdA6Enreb3U2AADM,22442
|
| 593 |
+
numpy/random/_mt19937.cpython-312-x86_64-linux-gnu.so,sha256=Nhn3-Rue5xl8KQLA4Zfmmy5d1F-xHNIuVy6bC4hlFKk,119488
|
| 594 |
+
numpy/random/_mt19937.pyi,sha256=_iZKaAmuKBQ4itSggfQvYYj_KjktcN4rt-YpE6bqFAM,724
|
| 595 |
+
numpy/random/_pcg64.cpython-312-x86_64-linux-gnu.so,sha256=V3wUaPT7QLsjGEND4sG2RaF9HUk2QeqSwFLyhtxutVY,125040
|
| 596 |
+
numpy/random/_pcg64.pyi,sha256=uxr5CbEJetN6lv9vBG21jlRhuzOK8SQnXrwqAQBxj_c,1091
|
| 597 |
+
numpy/random/_philox.cpython-312-x86_64-linux-gnu.so,sha256=LJsf5T7xGePtKstzyALPKZZQKw_VHUkm1AR1ds6ldRQ,106712
|
| 598 |
+
numpy/random/_philox.pyi,sha256=OKlaiIU-hj72Bp04zjNifwusOD_3-mYxIfvyuys8c_o,978
|
| 599 |
+
numpy/random/_pickle.py,sha256=4NhdT-yk7C0m3tyZWmouYAs3ZGNPdPVNGfUIyuh8HDY,2318
|
| 600 |
+
numpy/random/_sfc64.cpython-312-x86_64-linux-gnu.so,sha256=WIMwLOM6_VTbZjGtv14AApe460LA7IlLvsMYteaxQmg,76224
|
| 601 |
+
numpy/random/_sfc64.pyi,sha256=09afHTedVW-519493ZXtGcl-H-_zluj-B_yfEJG8MMs,709
|
| 602 |
+
numpy/random/bit_generator.cpython-312-x86_64-linux-gnu.so,sha256=h8XHMIh5Q8YDsWxTzyEzmDNu5BDvRllVGAk5d6_VsMs,234016
|
| 603 |
+
numpy/random/bit_generator.pxd,sha256=lArpIXSgTwVnJMYc4XX0NGxegXq3h_QsUDK6qeZKbNc,1007
|
| 604 |
+
numpy/random/bit_generator.pyi,sha256=aXv7a_hwa0nkjY8P2YENslwWp89UcFRn09woXh7Uoc0,3510
|
| 605 |
+
numpy/random/c_distributions.pxd,sha256=7DE-mV3H_Dihk4OK4gMHHkyD4tPX1cAi4570zi5CI30,6344
|
| 606 |
+
numpy/random/lib/libnpyrandom.a,sha256=xUcvOvieju5PThPQ8q0-uGJ5fjsCd5umnjIerIc85Sg,71926
|
| 607 |
+
numpy/random/mtrand.cpython-312-x86_64-linux-gnu.so,sha256=jYZrS2EHQBq5VGBEVkII4KWJnDkD2gknfRxIddmLzw8,749040
|
| 608 |
+
numpy/random/mtrand.pyi,sha256=3vAGOXsvyFFv0yZl34pVVPP7Dgt22COyfn4tUoi_hEQ,19753
|
| 609 |
+
numpy/random/tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
| 610 |
+
numpy/random/tests/data/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
| 611 |
+
numpy/random/tests/data/mt19937-testset-1.csv,sha256=Xkef402AVB-eZgYQkVtoxERHkxffCA9Jyt_oMbtJGwY,15844
|
| 612 |
+
numpy/random/tests/data/mt19937-testset-2.csv,sha256=nsBEQNnff-aFjHYK4thjvUK4xSXDSfv5aTbcE59pOkE,15825
|
| 613 |
+
numpy/random/tests/data/pcg64-testset-1.csv,sha256=xB00DpknGUTTCxDr9L6aNo9Hs-sfzEMbUSS4t11TTfE,23839
|
| 614 |
+
numpy/random/tests/data/pcg64-testset-2.csv,sha256=NTdzTKvG2U7_WyU_IoQUtMzU3kEvDH39CgnR6VzhTkw,23845
|
| 615 |
+
numpy/random/tests/data/pcg64dxsm-testset-1.csv,sha256=vNSUT-gXS_oEw_awR3O30ziVO4seNPUv1UIZ01SfVnI,23833
|
| 616 |
+
numpy/random/tests/data/pcg64dxsm-testset-2.csv,sha256=uylS8PU2AIKZ185OC04RBr_OePweGRtvn-dE4YN0yYA,23839
|
| 617 |
+
numpy/random/tests/data/philox-testset-1.csv,sha256=SedRaIy5zFadmk71nKrGxCFZ6BwKz8g1A9-OZp3IkkY,23852
|
| 618 |
+
numpy/random/tests/data/philox-testset-2.csv,sha256=dWECt-sbfvaSiK8-Ygp5AqyjoN5i26VEOrXqg01rk3g,23838
|
| 619 |
+
numpy/random/tests/data/sfc64-testset-1.csv,sha256=iHs6iX6KR8bxGwKk-3tedAdMPz6ZW8slDSUECkAqC8Q,23840
|
| 620 |
+
numpy/random/tests/data/sfc64-testset-2.csv,sha256=FIDIDFCaPZfWUSxsJMAe58hPNmMrU27kCd9FhCEYt_k,23833
|
| 621 |
+
numpy/random/tests/test_direct.py,sha256=6vLpCyeKnAWFEZei7l2YihVLQ0rSewO1hJBWt7A5fyQ,17779
|
| 622 |
+
numpy/random/tests/test_extending.py,sha256=S3Wrzu3di4uBhr-Pxnx5dOPvlBY0FRdZqVX6CC1IN6s,4038
|
| 623 |
+
numpy/random/tests/test_generator_mt19937.py,sha256=35LBwV6TtWPnxhefutxTQmhLzAQ5Ee4YiY8ziDXM-eQ,115477
|
| 624 |
+
numpy/random/tests/test_generator_mt19937_regressions.py,sha256=xGkdz76BMX1EK0QPfabVxpNx9qQ9OC-1ZStWOs6N_M8,6387
|
| 625 |
+
numpy/random/tests/test_random.py,sha256=kEkQs3i7zcpm9MozIRIz1FIx5B6fmXk0QqX0l6l-u_Y,70087
|
| 626 |
+
numpy/random/tests/test_randomstate.py,sha256=DxF7rMUSxaAlL4h1qC3onHcHR7T_6rKWPbr0nJH84nE,85031
|
| 627 |
+
numpy/random/tests/test_randomstate_regression.py,sha256=VucYWIjA7sAquWsalvZMnfkmYLM1O6ysyWnLl931-lA,7917
|
| 628 |
+
numpy/random/tests/test_regression.py,sha256=trntK51UvajOVELiluEO85l64CKSw5nvBSc5SqYyr9w,5439
|
| 629 |
+
numpy/random/tests/test_seed_sequence.py,sha256=GNRJ4jyzrtfolOND3gUWamnbvK6-b_p1bBK_RIG0sfU,3311
|
| 630 |
+
numpy/random/tests/test_smoke.py,sha256=jjNz0aEGD1_oQl9a9UWt6Mz_298alG7KryLT1pgHljw,28183
|
| 631 |
+
numpy/testing/__init__.py,sha256=InpVKoDAzMKO_l_HNcatziW_u1k9_JZze__t2nybrL0,595
|
| 632 |
+
numpy/testing/__init__.pyi,sha256=AhK5NuOpdD-JjIzXOlssE8_iSLyFAAHzyGV_w1BT7vA,1674
|
| 633 |
+
numpy/testing/_private/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
| 634 |
+
numpy/testing/_private/extbuild.py,sha256=nG2dwP4nUmQS3e5eIRinxt0s_f4sxxA1YfohCg-navo,8017
|
| 635 |
+
numpy/testing/_private/utils.py,sha256=3FrSTMi0OdpDODBDoncgiDQzdo5NKA6YVfQ3uKRSQnc,85242
|
| 636 |
+
numpy/testing/_private/utils.pyi,sha256=MMNrvwEeSTYzZFWawSSzHnTFYG-cSAIiID-1FuJ1f8U,10123
|
| 637 |
+
numpy/testing/overrides.py,sha256=u6fcKSBC8HIzMPWKAbdyowU71h2Fx2ekDQxpG5NhIr8,2123
|
| 638 |
+
numpy/testing/print_coercion_tables.py,sha256=ndxOsS4XfrZ4UY_9nqRTCnxhkzgdqcuUHL8nezd7Op4,6180
|
| 639 |
+
numpy/testing/setup.py,sha256=GPKAtTTBRsNW4kmR7NjP6mmBR_GTdpaTvkTm10_VcLg,709
|
| 640 |
+
numpy/testing/tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
| 641 |
+
numpy/testing/tests/test_utils.py,sha256=IDOr-GXuNGlrsb-XzGSYUHXEqcGYJ78p60jOpBqyPM4,55740
|
| 642 |
+
numpy/tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
| 643 |
+
numpy/tests/test__all__.py,sha256=L3mCnYPTpzAgNfedVuq9g7xPWbc0c1Pot94k9jZ9NpI,221
|
| 644 |
+
numpy/tests/test_ctypeslib.py,sha256=B06QKeFRgDIEbkEPBy_zYA1H5E2exuhTi7IDkzV8gfo,12257
|
| 645 |
+
numpy/tests/test_lazyloading.py,sha256=YETrYiDLAqLX04K_u5_3NVxAfxDoeguxwkIRfz6qKcY,1162
|
| 646 |
+
numpy/tests/test_matlib.py,sha256=gwhIXrJJo9DiecaGLCHLJBjhx2nVGl6yHq80AOUQSRM,1852
|
| 647 |
+
numpy/tests/test_numpy_config.py,sha256=qHvepgi9oyAbQuZD06k7hpcCC2MYhdzcY6D1iQDPNMI,1241
|
| 648 |
+
numpy/tests/test_numpy_version.py,sha256=A8cXFzp4k-p6J5zkOxlDfDvkoFMxDW2hpTFVXcaQRVo,1479
|
| 649 |
+
numpy/tests/test_public_api.py,sha256=DTq7SO84uBjC2tKPoqX17xazc-SLkTAbQ2fLZwGM2jc,18170
|
| 650 |
+
numpy/tests/test_reloading.py,sha256=QuVaPQulcNLg4Fl31Lw-O89L42KclYCK68n5GVy0PNQ,2354
|
| 651 |
+
numpy/tests/test_scripts.py,sha256=jluCLfG94VM1cuX-5RcLFBli_yaJZpIvmVuMxRKRJrc,1645
|
| 652 |
+
numpy/tests/test_warnings.py,sha256=ZEtXqHI1iyeVeLfVxDcMfN5qw67Ti2u54709hvBG4eY,2284
|
| 653 |
+
numpy/typing/__init__.py,sha256=VoTILNDrUWvZx0LK9_97lBLQFKtSGmDt4QLOH8zYvlo,5234
|
| 654 |
+
numpy/typing/mypy_plugin.py,sha256=24zVk4Ei3qH4Hc3SSz3v0XtIsycTo8HKoY6ilhB_7AQ,6376
|
| 655 |
+
numpy/typing/setup.py,sha256=Cnz9q53w-vJNyE6vYxqYvQXx0pJbrG9quHyz9sqxfek,374
|
| 656 |
+
numpy/typing/tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
| 657 |
+
numpy/typing/tests/data/fail/arithmetic.pyi,sha256=4rY_ASCERAl8WCus1RakOe0Aw-8vvjilL29mgdD4lv0,3850
|
| 658 |
+
numpy/typing/tests/data/fail/array_constructors.pyi,sha256=X9y_jUYS17WfYmXW5NwkVudyiR6ouUaAwEh0JRte42o,1089
|
| 659 |
+
numpy/typing/tests/data/fail/array_like.pyi,sha256=OVAlEJZ5k8ZRKt0aGpZQwIjlUGpy0PzOOYqfI-IMqBQ,455
|
| 660 |
+
numpy/typing/tests/data/fail/array_pad.pyi,sha256=57oK0Yp53rtKjjIrRFYLcxa-IfIGhtI-bEem7ggJKwI,132
|
| 661 |
+
numpy/typing/tests/data/fail/arrayprint.pyi,sha256=-Fs9VnQfxyfak008Hq8kJWfB0snA6jGDXZz8ljQnwGE,549
|
| 662 |
+
numpy/typing/tests/data/fail/arrayterator.pyi,sha256=FoU4ahHkJZ67dwWXer5FXLjjjesKKg-w2Jq1X1bHymA,480
|
| 663 |
+
numpy/typing/tests/data/fail/bitwise_ops.pyi,sha256=GN9dVqk4_HFXn7zbRrHzJq_UGRFBccoYVUG1UuE7bXs,515
|
| 664 |
+
numpy/typing/tests/data/fail/char.pyi,sha256=-vgN6EmfQ8VaA4SOZ5Ol9u4-Z7Q5I7G78LmaxZOuZ90,2615
|
| 665 |
+
numpy/typing/tests/data/fail/chararray.pyi,sha256=jrNryZFpr8nxG2IHb9e0x3ranpvJpBy_RDex-WpT5rU,2296
|
| 666 |
+
numpy/typing/tests/data/fail/comparisons.pyi,sha256=U4neWzwwtxG6QXsKlNGJuKXHBtwzYBQOa47_7SKF5Wg,888
|
| 667 |
+
numpy/typing/tests/data/fail/constants.pyi,sha256=YSqNbXdhbdMmYbs7ntH0FCKbnm8IFeqsDlZBqcU43iw,286
|
| 668 |
+
numpy/typing/tests/data/fail/datasource.pyi,sha256=PRT2hixR-mVxr2UILvHa99Dr54EF2h3snJXE-v3rWcc,395
|
| 669 |
+
numpy/typing/tests/data/fail/dtype.pyi,sha256=OAGABqdXNB8gClJFEGMckoycuZcIasMaAlS2RkiKROI,334
|
| 670 |
+
numpy/typing/tests/data/fail/einsumfunc.pyi,sha256=RS7GZqUCT_vEFJoyUx4gZlPO8GNFFNFWidxl-wLyRv0,539
|
| 671 |
+
numpy/typing/tests/data/fail/false_positives.pyi,sha256=Q61qMsSsNCtmO0EMRxHj5Z7RYTyrELVpkzfJY5eK8Z0,366
|
| 672 |
+
numpy/typing/tests/data/fail/flatiter.pyi,sha256=qLM4qm7gvJtEZ0rTHcyasUzoP5JbX4FREtqV3g1w6Lo,843
|
| 673 |
+
numpy/typing/tests/data/fail/fromnumeric.pyi,sha256=FH2mjkgtCbA9soqlJRhYN7IIfRRrUL1i9mwqcbYKZSc,5591
|
| 674 |
+
numpy/typing/tests/data/fail/histograms.pyi,sha256=yAPVt0rYTwtxnigoGT-u7hhKCE9iYxsXc24x2HGBrmA,367
|
| 675 |
+
numpy/typing/tests/data/fail/index_tricks.pyi,sha256=moINir9iQoi6Q1ZuVg5BuSB9hSBtbg_uzv-Qm_lLYZk,509
|
| 676 |
+
numpy/typing/tests/data/fail/lib_function_base.pyi,sha256=6y9T773CBLX-jUry1sCQGVuKVKM2wMuQ56Ni5V5j4Dw,2081
|
| 677 |
+
numpy/typing/tests/data/fail/lib_polynomial.pyi,sha256=Ur7Y4iZX6WmoH5SDm0ePi8C8LPsuPs2Yr7g7P5O613g,899
|
| 678 |
+
numpy/typing/tests/data/fail/lib_utils.pyi,sha256=VFpE6_DisvlDByyp1PiNPJEe5IcZp8cH0FlAJyoZipo,276
|
| 679 |
+
numpy/typing/tests/data/fail/lib_version.pyi,sha256=7-ZJDZwDcB-wzpMN8TeYtZAgaqc7xnQ8Dnx2ISiX2Ts,158
|
| 680 |
+
numpy/typing/tests/data/fail/linalg.pyi,sha256=yDd05aK1dI37RPt3pD2eJYo4dZFaT2yB1PEu3K0y9Tg,1322
|
| 681 |
+
numpy/typing/tests/data/fail/memmap.pyi,sha256=HSTCQYNuW1Y6X1Woj361pN4rusSPs4oDCXywqk20yUo,159
|
| 682 |
+
numpy/typing/tests/data/fail/modules.pyi,sha256=_ek4zKcdP-sIh_f-IDY0tP-RbLORKCSWelM9AOYxsyA,670
|
| 683 |
+
numpy/typing/tests/data/fail/multiarray.pyi,sha256=XCdBxufNhR8ZtG8UMzk8nt9_NC5gJTKP9-xTqKO_K9I,1693
|
| 684 |
+
numpy/typing/tests/data/fail/ndarray.pyi,sha256=YnjXy16RHs_esKelMjB07865CQ7gLyQnXhnitq5Kv5c,405
|
| 685 |
+
numpy/typing/tests/data/fail/ndarray_misc.pyi,sha256=w-10xTDDWoff9Lq0dBO-jBeiBR-XjCz2qmes0dLx238,1372
|
| 686 |
+
numpy/typing/tests/data/fail/nditer.pyi,sha256=w7emjnOxnf3NcvLktNLlke6Cuivn2gU3sVmGCfbG6rw,325
|
| 687 |
+
numpy/typing/tests/data/fail/nested_sequence.pyi,sha256=em4GZwLDFE0QSxxg081wVwhh-Dmtkn8f7wThI0DiXVs,427
|
| 688 |
+
numpy/typing/tests/data/fail/npyio.pyi,sha256=56QuHo9SvVR3Uhzl6gQZncCpX575Gy5wugjMICh20m0,620
|
| 689 |
+
numpy/typing/tests/data/fail/numerictypes.pyi,sha256=fevH9x80CafYkiyBJ7LMLVl6GyTvQrZ34trBu6O8TtM,276
|
| 690 |
+
numpy/typing/tests/data/fail/random.pyi,sha256=p5WsUGyOL-MGIeALh9Y0dVhYSRQLaUwMdjXc3G6C_7Q,2830
|
| 691 |
+
numpy/typing/tests/data/fail/rec.pyi,sha256=Ws3TyesnoQjt7Q0wwtpShRDJmZCs2jjP17buFMomVGA,704
|
| 692 |
+
numpy/typing/tests/data/fail/scalars.pyi,sha256=o91BwSfzPTczYVtbXsirqQUoUoYP1C_msGjc2GYsV04,2952
|
| 693 |
+
numpy/typing/tests/data/fail/shape_base.pyi,sha256=Y_f4buHtX2Q2ZA4kaDTyR8LErlPXTzCB_-jBoScGh_Q,152
|
| 694 |
+
numpy/typing/tests/data/fail/stride_tricks.pyi,sha256=IjA0Xrnx0lG3m07d1Hjbhtyo1Te5cXgjgr5fLUo4LYQ,315
|
| 695 |
+
numpy/typing/tests/data/fail/testing.pyi,sha256=e7b5GKTWCtKGoB8z2a8edsW0Xjl1rMheALsvzEJjlCw,1370
|
| 696 |
+
numpy/typing/tests/data/fail/twodim_base.pyi,sha256=ZqbRJfy5S_pW3fFLuomy4L5SBNqj6Nklexg9KDTo65c,899
|
| 697 |
+
numpy/typing/tests/data/fail/type_check.pyi,sha256=CIyI0j0Buxv0QgCvNG2urjaKpoIZ-ZNawC2m6NzGlbo,379
|
| 698 |
+
numpy/typing/tests/data/fail/ufunc_config.pyi,sha256=ukA0xwfJHLoGfoOIpWIN-91wj-DG8oaIjYbO72ymjg4,733
|
| 699 |
+
numpy/typing/tests/data/fail/ufunclike.pyi,sha256=lbxjJyfARmt_QK1HxhxFxvwQTqCEZwJ9I53Wp8X3KIY,679
|
| 700 |
+
numpy/typing/tests/data/fail/ufuncs.pyi,sha256=YaDTL7QLmGSUxE6JVMzpOlZTjHWrgbOo0UIlkX-6ZQk,1347
|
| 701 |
+
numpy/typing/tests/data/fail/warnings_and_errors.pyi,sha256=PrbYDFI7IGN3Gf0OPBkVfefzQs4AXHwDQ495pvrX3RY,174
|
| 702 |
+
numpy/typing/tests/data/misc/extended_precision.pyi,sha256=bS8bBeCFqjgtOiy-8_y39wfa7rwhdjLz2Vmo-RXAYD4,884
|
| 703 |
+
numpy/typing/tests/data/mypy.ini,sha256=Ynv1VSx_kXTD2mFC3ZpgEFuCOg1F2VJXxPk0dxUnF2M,108
|
| 704 |
+
numpy/typing/tests/data/pass/arithmetic.py,sha256=2z3dmuysQQmiPz8x0bg8SOOKW62mVJn97uMa9T0L7Vk,7455
|
| 705 |
+
numpy/typing/tests/data/pass/array_constructors.py,sha256=3GrhfBcmWX53pJHD0NvhXjwr2-uNKREbR1I9WCcZ7rI,2419
|
| 706 |
+
numpy/typing/tests/data/pass/array_like.py,sha256=ce_IVubBd7J6FkSpJmD7qMlRLuwmiidhOqhYfZb16Wo,916
|
| 707 |
+
numpy/typing/tests/data/pass/arrayprint.py,sha256=y_KkuLz1uM7pv53qfq7GQOuud4LoXE3apK1wtARdVyM,766
|
| 708 |
+
numpy/typing/tests/data/pass/arrayterator.py,sha256=FqcpKdUQBQ0FazHFxr9MsLEZG-jnJVGKWZX2owRr4DQ,393
|
| 709 |
+
numpy/typing/tests/data/pass/bitwise_ops.py,sha256=UnmxVr9HwI8ifdrutGm_u3EZU4iOOPQhrOku7hTaH0c,970
|
| 710 |
+
numpy/typing/tests/data/pass/comparisons.py,sha256=nTE-fvraLK6xTZcP4uPV02wOShzYKWDaoapx35AeDOY,2992
|
| 711 |
+
numpy/typing/tests/data/pass/dtype.py,sha256=MqDKC6Ywv6jNkWsR8rdLuabzHUco5w1OylDHEdxve_I,1069
|
| 712 |
+
numpy/typing/tests/data/pass/einsumfunc.py,sha256=eXj5L5MWPtQHgrHPsJ36qqrmBHqct9UoujjJCvHnF1k,1370
|
| 713 |
+
numpy/typing/tests/data/pass/flatiter.py,sha256=0BnbuLMBC7MQlprNZ0QhNSscfYwPhEhXOhWoyiRACWU,174
|
| 714 |
+
numpy/typing/tests/data/pass/fromnumeric.py,sha256=Xd_nJVVDoONdztUX8ddgo7EXJ2FD8AX51MO_Yujnmog,3742
|
| 715 |
+
numpy/typing/tests/data/pass/index_tricks.py,sha256=oaFD9vY01_RI5OkrXt-xTk1n_dd-SpuPp-eZ58XR3c8,1492
|
| 716 |
+
numpy/typing/tests/data/pass/lib_utils.py,sha256=sDQCjHVGUwct0RQqAtH5_16y241siSY4bXKZRsuJ8xA,434
|
| 717 |
+
numpy/typing/tests/data/pass/lib_version.py,sha256=HnuGOx7tQA_bcxFIJ3dRoMAR0fockxg4lGqQ4g7LGIw,299
|
| 718 |
+
numpy/typing/tests/data/pass/literal.py,sha256=DLzdWHD6ttW4S0NEvGQbsH_UEJjhZyhvO4OXJjoyvZQ,1331
|
| 719 |
+
numpy/typing/tests/data/pass/mod.py,sha256=HB9aK4_wGJbc44tomaoroNy0foIL5cI9KIjknvMTbkk,1578
|
| 720 |
+
numpy/typing/tests/data/pass/modules.py,sha256=t0KJxYWbrWd7HbbgIDFb3LAhJBiNNb6QPjjFDAgC2mU,576
|
| 721 |
+
numpy/typing/tests/data/pass/multiarray.py,sha256=MxHax6l94yqlTVZleAqG77ILEbW6wU5osPcHzxJ85ns,1331
|
| 722 |
+
numpy/typing/tests/data/pass/ndarray_conversion.py,sha256=yPgzXG6paY1uF_z-QyHYrcmrZvhX7qtvTUh7ANLseCA,1626
|
| 723 |
+
numpy/typing/tests/data/pass/ndarray_misc.py,sha256=z3mucbn9fLM1gxmbUhWlp2lcrOv4zFjqZFze0caE2EA,2715
|
| 724 |
+
numpy/typing/tests/data/pass/ndarray_shape_manipulation.py,sha256=37eYwMNqMLwanIW9-63hrokacnSz2K_qtPUlkdpsTjo,640
|
| 725 |
+
numpy/typing/tests/data/pass/numeric.py,sha256=SdnsD5zv0wm8T2hnIylyS14ig2McSz6rG9YslckbNQ4,1490
|
| 726 |
+
numpy/typing/tests/data/pass/numerictypes.py,sha256=r0_s-a0-H2MdWIn4U4P6W9RQO0V1xrDusgodHNZeIYM,750
|
| 727 |
+
numpy/typing/tests/data/pass/random.py,sha256=uJCnzlsOn9hr_G1TpHLdsweJI4EdhUSEQ4dxROPjqAs,61881
|
| 728 |
+
numpy/typing/tests/data/pass/scalars.py,sha256=En0adCZAwEigZrzdQ0JQwDEmrS0b-DMd1vvjkFcvwo8,3479
|
| 729 |
+
numpy/typing/tests/data/pass/simple.py,sha256=HmAfCOdZBWQF211YaZFrIGisMgu5FzTELApKny08n3Y,2676
|
| 730 |
+
numpy/typing/tests/data/pass/simple_py3.py,sha256=HuLrc5aphThQkLjU2_19KgGFaXwKOfSzXe0p2xMm8ZI,96
|
| 731 |
+
numpy/typing/tests/data/pass/ufunc_config.py,sha256=_M8v-QWAeT1-2MkfSeAbNl_ZwyPvYfPTsLl6c1X8d_w,1204
|
| 732 |
+
numpy/typing/tests/data/pass/ufunclike.py,sha256=Gve6cJ2AT3TAwOjUOQQDIUnqsRCGYq70_tv_sgODiiA,1039
|
| 733 |
+
numpy/typing/tests/data/pass/ufuncs.py,sha256=xGuKuqPetUTS4io5YDHaki5nbYRu-wC29SGU32tzVIg,462
|
| 734 |
+
numpy/typing/tests/data/pass/warnings_and_errors.py,sha256=Pcg-QWfY4PAhTKyehae8q6LhtbUABxa2Ye63-3h1f4w,150
|
| 735 |
+
numpy/typing/tests/data/reveal/arithmetic.pyi,sha256=Ndmi_IFAl8z28RHsYTbOouf-B5FH91x_9ky-JwsdXVg,19765
|
| 736 |
+
numpy/typing/tests/data/reveal/array_constructors.pyi,sha256=DcT8Z2rEpqYfjXySBejk8cGOUidUmizZGE5ZEy7r14E,10600
|
| 737 |
+
numpy/typing/tests/data/reveal/arraypad.pyi,sha256=Q1pcU4B3eRsw5jsv-S0MsEfNUbp_4aMdO_o3n0rtA2A,776
|
| 738 |
+
numpy/typing/tests/data/reveal/arrayprint.pyi,sha256=YyzzkL-wj4Rs-fdo3brpoaWtb5g3yk4Vn2HKu5KRo4w,876
|
| 739 |
+
numpy/typing/tests/data/reveal/arraysetops.pyi,sha256=ApCFQcZzQ08zV32SJ86Xyv_7jazl3XKMmJmULtNquJ8,4155
|
| 740 |
+
numpy/typing/tests/data/reveal/arrayterator.pyi,sha256=TF_1eneHoT0v9HqS9dKc5Xiv3iY3E330GR1RNcJ7s2Q,1111
|
| 741 |
+
numpy/typing/tests/data/reveal/bitwise_ops.pyi,sha256=nRkyUGrBB_Es7TKyDxS_s3u2dFgBfzjocInI9Ea-J10,3919
|
| 742 |
+
numpy/typing/tests/data/reveal/char.pyi,sha256=M_iTa9Pn8F7jQ1k6RN9KvbhEn00g7UYJZ5PV57ikcZM,7289
|
| 743 |
+
numpy/typing/tests/data/reveal/chararray.pyi,sha256=O0EfwnKc3W1Fnx1c7Yotb1O84kVMuqJLlMBXd2duvjI,6093
|
| 744 |
+
numpy/typing/tests/data/reveal/comparisons.pyi,sha256=huaf-seaF5ndTqfoaBfPtMMkOYovq7ibJl5-CRoQW7s,7468
|
| 745 |
+
numpy/typing/tests/data/reveal/constants.pyi,sha256=P9vFEMkPpJ5KeUnzqPOuyHlh3zAFl9lzB4WxyB2od7A,1949
|
| 746 |
+
numpy/typing/tests/data/reveal/ctypeslib.pyi,sha256=-Pk2rLEGCzz3B_y8Mu10JSVA8gPFztl5fV1dspPzqig,4727
|
| 747 |
+
numpy/typing/tests/data/reveal/datasource.pyi,sha256=e8wjn60tO5EdnkBF34JrZT5XvdyW7kRWD2abtgr6qUg,671
|
| 748 |
+
numpy/typing/tests/data/reveal/dtype.pyi,sha256=TKrYyxMu5IGobs0SDTIRcPuWsZ5X7zMYB4pmUlTTJxA,2872
|
| 749 |
+
numpy/typing/tests/data/reveal/einsumfunc.pyi,sha256=pbtSfzIWUJRkDpe2riHBlvFlNSC3CqVM-SbYtBgX9H0,2044
|
| 750 |
+
numpy/typing/tests/data/reveal/emath.pyi,sha256=-muNpWOv_niIn-zS3gUnFO4qBZAouNlVGue2x1L5Ris,2423
|
| 751 |
+
numpy/typing/tests/data/reveal/false_positives.pyi,sha256=AplTmZV7TS7nivU8vegbstMN5MdMv4U0JJdZ4IeeA5M,482
|
| 752 |
+
numpy/typing/tests/data/reveal/fft.pyi,sha256=ReQ9qn5frvJEy-g0RWpUGlPBntUS1cFSIu6WfPotHzE,1749
|
| 753 |
+
numpy/typing/tests/data/reveal/flatiter.pyi,sha256=e1OQsVxQpgyfqMNw2puUTATl-w3swvdknlctAiWxf_E,882
|
| 754 |
+
numpy/typing/tests/data/reveal/fromnumeric.pyi,sha256=PNtGQR1VmGk_xNbd0eP7k7B2oNCMBz2XOJ17-_SdE5M,12101
|
| 755 |
+
numpy/typing/tests/data/reveal/getlimits.pyi,sha256=nUGOMFpWj3pMgqLy6ZbR7A4G2q7iLIl5zEFBGf-Qcfw,1592
|
| 756 |
+
numpy/typing/tests/data/reveal/histograms.pyi,sha256=MxKWoa7UoJRRLim53H6OoyYfz87P3_9YUXGYPTknGVQ,1303
|
| 757 |
+
numpy/typing/tests/data/reveal/index_tricks.pyi,sha256=HpD7lU7hcyDoLdZbeqskPXnX7KYwPtll7uJKYUzrlE8,3177
|
| 758 |
+
numpy/typing/tests/data/reveal/lib_function_base.pyi,sha256=eSiSZUlmPXqVPKknM7GcEv76BDgj0IJRu3FXcZXpmqc,8318
|
| 759 |
+
numpy/typing/tests/data/reveal/lib_polynomial.pyi,sha256=TOzOdMPDqveDv3vDKSjtq6RRvN-j_s2J7aud2ySDAB0,5986
|
| 760 |
+
numpy/typing/tests/data/reveal/lib_utils.pyi,sha256=_zj7WGYGYMFXAHLK-F11aeFfDvjRvFARUjoXhbXn8V0,1049
|
| 761 |
+
numpy/typing/tests/data/reveal/lib_version.pyi,sha256=UCioUeykot8-nWL6goKxZnKZxtgB4lFEi9wdN_xyF1U,672
|
| 762 |
+
numpy/typing/tests/data/reveal/linalg.pyi,sha256=LPaY-RyYL7Xt3djCgNaWEgI8beI9Eo_XnvOwi6Y7-eo,4877
|
| 763 |
+
numpy/typing/tests/data/reveal/matrix.pyi,sha256=ciJXsn5v2O1IZ3VEn5Ilp8-40NTQokfrOOgVXMFsvLo,2922
|
| 764 |
+
numpy/typing/tests/data/reveal/memmap.pyi,sha256=A5PovMzjRp2zslF1vw3TdTQjj4Y0dIEJ__HDBV_svGM,842
|
| 765 |
+
numpy/typing/tests/data/reveal/mod.pyi,sha256=-CNWft2jQGSdrO8dYRgwbl7OhL3a78Zo60JVmiY-gQI,5666
|
| 766 |
+
numpy/typing/tests/data/reveal/modules.pyi,sha256=0WPq7A-aqWkJsV-IA1_7dFNCcxBacj1AWExaXbXErG4,1958
|
| 767 |
+
numpy/typing/tests/data/reveal/multiarray.pyi,sha256=6MvfNKihK-oN6QwG9HFNelgheo4lnL0FCrmIF_qxdoA,5326
|
| 768 |
+
numpy/typing/tests/data/reveal/nbit_base_example.pyi,sha256=DRUMGatQvQXTuovKEMF4dzazIU6it6FU53LkOEo2vNo,657
|
| 769 |
+
numpy/typing/tests/data/reveal/ndarray_conversion.pyi,sha256=BfjQD8U756l4gOfY0LD47HhDRxbq0yCFfEFKvbXs7Rs,1791
|
| 770 |
+
numpy/typing/tests/data/reveal/ndarray_misc.pyi,sha256=0EN-a47Msn4pZgKVdD-GrXCCmt-oxjlov5rszchBmOI,7126
|
| 771 |
+
numpy/typing/tests/data/reveal/ndarray_shape_manipulation.pyi,sha256=QDQ9g6l-e73pTJp-Dosiynb-okbqi91D4KirjhIjcv4,1233
|
| 772 |
+
numpy/typing/tests/data/reveal/nditer.pyi,sha256=VFXnT75BgWSUpb-dD-q5cZkfeOqsk-x9cH626g9FWT4,2021
|
| 773 |
+
numpy/typing/tests/data/reveal/nested_sequence.pyi,sha256=IQyRlXduk-ZEakOtoliMLCqNgGbeg0mzZf-a-a3Gq_0,734
|
| 774 |
+
numpy/typing/tests/data/reveal/npyio.pyi,sha256=YXagt2J-1suu5WXZ_si5NuJf7sHj_7NlaSLqQkam1Po,4209
|
| 775 |
+
numpy/typing/tests/data/reveal/numeric.pyi,sha256=aJKnav-X45tjSFfgGD4iCetwEFcJXdNgU7valktjiCg,6160
|
| 776 |
+
numpy/typing/tests/data/reveal/numerictypes.pyi,sha256=-YQRhwjBjsFJHjpGCRqzafNnKDdsmbBHbmPwccP0pLI,2487
|
| 777 |
+
numpy/typing/tests/data/reveal/random.pyi,sha256=s6T074ZIpGAUqHnA-yAlozTLvt7PNBjCBqd-nGMqWGg,104091
|
| 778 |
+
numpy/typing/tests/data/reveal/rec.pyi,sha256=DbRVk6lc7-3qPe-7Q26tUWpdaH9B4UVoQSYrRGJUo1Q,3858
|
| 779 |
+
numpy/typing/tests/data/reveal/scalars.pyi,sha256=Qn3B3rsqSN397Jh25xs4odt2pfCQtWkoJe-e0-oX8d4,4790
|
| 780 |
+
numpy/typing/tests/data/reveal/shape_base.pyi,sha256=YjiVukrK6OOydvopOaOmeAIIa0YQ2hn9_I_-FyYkHVU,2427
|
| 781 |
+
numpy/typing/tests/data/reveal/stride_tricks.pyi,sha256=EBZR8gSP385nhotwJ3GH9DOUD2q5nUEYbXfhLo5xrPo,1542
|
| 782 |
+
numpy/typing/tests/data/reveal/testing.pyi,sha256=_WOAj_t5SWYiqN0KG26Mza8RvaD3WAa7rFUlgksjLms,8611
|
| 783 |
+
numpy/typing/tests/data/reveal/twodim_base.pyi,sha256=ZdNVo2HIJcx8iF9PA-z5W3Bs0hWM2nlVdbhLuAQlljM,3132
|
| 784 |
+
numpy/typing/tests/data/reveal/type_check.pyi,sha256=yZSp50TtvPqv_PN7zmVcNOVUTUXMNYFGcguMNj25E9Y,3044
|
| 785 |
+
numpy/typing/tests/data/reveal/ufunc_config.pyi,sha256=buwSvat3SVFAFl5k8TL6Mgpi32o6hHZYZ2Lpn6AHdEU,1327
|
| 786 |
+
numpy/typing/tests/data/reveal/ufunclike.pyi,sha256=V_gLcZVrTXJ21VkUMwA0HyxUgA1r6OzjsdJegaKL2GE,1329
|
| 787 |
+
numpy/typing/tests/data/reveal/ufuncs.pyi,sha256=VnwYr5KT_FLKfc0wV7dtNz7bNtaC9VIQt-oz56Hb5EE,2798
|
| 788 |
+
numpy/typing/tests/data/reveal/warnings_and_errors.pyi,sha256=ImMlPt2PQBtX8Qf1EZFmLjNWm8fPE6IWQ_deaq_-85s,538
|
| 789 |
+
numpy/typing/tests/test_isfile.py,sha256=BhKZs4-LrhFUfKjcG0yelySjE6ZITMxGIBYWGDHMRb8,864
|
| 790 |
+
numpy/typing/tests/test_runtime.py,sha256=2qu8JEliITnZCBJ_QJpohacj_OQ08o73ixS2w2ooNXI,3275
|
| 791 |
+
numpy/typing/tests/test_typing.py,sha256=Da1ZOFjtPh_Mvb5whpI-okBJdgLOAfJtJNyG6leGFoQ,8743
|
| 792 |
+
numpy/version.py,sha256=OTLnSh0NGfWyL8VrnIj0Ndt_KZOTl1Z-kD9Cf-jRMmY,216
|
LTA_openwebtext_dualt/mini_owt_logdirichlet/.venv_qwen35_uv/lib/python3.12/site-packages/numpy-1.26.4.dist-info/REQUESTED
ADDED
|
File without changes
|