JinghuiLuAstronaut commited on
Commit
36dad47
·
verified ·
1 Parent(s): edff6fa

Add files using upload-large-folder tool

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. LTA_openwebtext_dualt/logs/ar_lm1b_flmpack_bert_small_len128_gbs512_4gpu_1m_20260504_195806.log +305 -0
  2. LTA_openwebtext_dualt/logs/lta_lm1b_classic_dirichlet_len256_gbs512_4gpu_10k_save1k_20260523.train.pid +1 -0
  3. LTA_openwebtext_dualt/logs/lta_owt_dirichlet_categorical_fullvocab_c1024_fullycoupled_shufchunks_len128_gbs512_8gpu_1m.log +0 -0
  4. LTA_openwebtext_dualt/logs/lta_owt_gpt2cached_len1024_rollout1_p1_bench4gpu_20260513_152806.log +103 -0
  5. LTA_openwebtext_dualt/logs/lta_owt_t5_absrope_adaln_dirichlet_len1024_Cv_to_2v_mask0p1_1p0_sameT_gbs512_b32_8gpu_1m_save10k_20260525_watcher.log +458 -0
  6. LTA_openwebtext_dualt/logs/owt_candidate_catdualt_step246k_64_c1024_t1p2_blend_n64.log +3 -0
  7. LTA_openwebtext_dualt/logs/softendpoint_mn_pilot_4gpu/smoke_gpt2_softendpoint_mn_n128_onehot.log +94 -0
  8. LTA_openwebtext_dualt/logs/softendpoint_mn_pilot_4gpu/train8_combo_len256_dirichlet_unigram_shared_highC_20260517_170456.log +197 -0
  9. LTA_openwebtext_dualt/logs/softendpoint_mn_pilot_4gpu/train8_combo_len256_logistic_unigram_shared_highC_seqrand_20260517_170456.log +395 -0
  10. LTA_openwebtext_dualt/logs/softendpoint_mn_pilot_4gpu/train8_ctx1024_p50_path4_unif0_0p125_outwdm1_ctx1024_sampledpath_20260517_223933.log +229 -0
  11. LTA_openwebtext_dualt/logs/softendpoint_mn_pilot_4gpu/train8_ctx1024_p50_path4_unif0_0p125_outwdm1_ctx1024_sampledpath_true_20260517_224139.log +609 -0
  12. LTA_openwebtext_dualt/logs/softendpoint_mn_pilot_4gpu/train8_ctx1024_p75_s4_i32_outwdm1_ctx1024_sweep_selected_20260517_210705.log +0 -0
  13. LTA_openwebtext_dualt/logs/softendpoint_mn_pilot_4gpu/train8_ctx1024_t5tok_p35_rand0_3_unif0_0p25_outwdm1_t5tok_ctx1024_k03_20260518_022728.log +1034 -0
  14. LTA_openwebtext_dualt/logs/softendpoint_mn_pilot_4gpu/train8_ctx1024_t5tok_p50_rand0_4_unif0_0p25_outwdm1_t5tok_ctx1024_randk_20260518_014620.log +196 -0
  15. LTA_openwebtext_dualt/logs/softendpoint_mn_pilot_4gpu/train8_ctx1024_tradeoff_p35_unif0_0p25_outwdm1_ctx1024_tradeoff_dual_20260517_225705.log +1024 -0
  16. LTA_openwebtext_dualt/logs/softendpoint_mn_pilot_4gpu/train8_n1024_compactv2664_3l_bs512_hard_ce_allcorrupt.log +0 -0
  17. LTA_openwebtext_dualt/logs/softendpoint_mn_pilot_4gpu/train8_n1024_compactv2664_3l_bs512_hard_ce_onehot.log +0 -0
  18. LTA_openwebtext_dualt/logs/softendpoint_mn_pilot_4gpu/train8_n512_compactv1635_3l_bs512_hard_ce_allcorrupt.log +0 -0
  19. LTA_openwebtext_dualt/logs/softendpoint_mn_pilot_4gpu/train8_n64_compactv335_3l_bs512_hard_ce_allcorrupt.log +0 -0
  20. LTA_openwebtext_dualt/logs/softendpoint_mn_pilot_4gpu/train8_n8_compactv47_3l_hard_ce_onehot.log +0 -0
  21. LTA_openwebtext_dualt/logs/softendpoint_mn_pilot_4gpu/train8_n8_linear_soft_kl_onehot_20260517_train8ctx8_overfit.log +326 -0
  22. LTA_openwebtext_dualt/logs/softendpoint_mn_pilot_4gpu/train8_noisegeo_len256_allcorrupt_highC64_4096_20260517_163805.log +987 -0
  23. LTA_openwebtext_dualt/logs/softendpoint_mn_pilot_4gpu/train8_noisegeo_len256_allcorrupt_logistic_sig0p05_0p5_20260517_163805.log +791 -0
  24. LTA_openwebtext_dualt/logs/softendpoint_mn_pilot_4gpu/train8_noisegeo_len256_allcorrupt_unigram_shared0p5_20260517_163805.log +634 -0
  25. LTA_openwebtext_dualt/logs/softendpoint_mn_pilot_4gpu/train8_rollin_focused_len256_rollin_p100_s4_i32_20260517_1733focused.log +193 -0
  26. LTA_openwebtext_dualt/logs/softendpoint_mn_pilot_4gpu/train8_rollin_len1024_rollin_p50_s4_i32_20260517_1840ctx1024.log +397 -0
  27. LTA_openwebtext_dualt/logs/softendpoint_mn_pilot_4gpu/train8_rollin_len1024_rollin_p50_s4_i32_20260517_1855ctx1024bs128.log +0 -0
  28. LTA_openwebtext_dualt/logs/softendpoint_mn_pilot_4gpu/train8_rollin_synct_len256_synct_p50_s8_i64_20260517_1800synct.log +224 -0
  29. LTA_openwebtext_dualt/logs/softendpoint_mn_pilot_4gpu/train8_wrongfloor_len256_wrongfloor0p3_20260517_1815wrongfloor.log +199 -0
  30. LTA_openwebtext_dualt/mini_owt_logdirichlet/.venv_qwen35_uv/lib/python3.12/site-packages/hf_xet-1.5.0.dist-info/RECORD +9 -0
  31. LTA_openwebtext_dualt/mini_owt_logdirichlet/.venv_qwen35_uv/lib/python3.12/site-packages/hf_xet-1.5.0.dist-info/WHEEL +5 -0
  32. LTA_openwebtext_dualt/mini_owt_logdirichlet/.venv_qwen35_uv/lib/python3.12/site-packages/httpcore/_backends/__init__.py +0 -0
  33. LTA_openwebtext_dualt/mini_owt_logdirichlet/.venv_qwen35_uv/lib/python3.12/site-packages/httpcore/_backends/anyio.py +146 -0
  34. LTA_openwebtext_dualt/mini_owt_logdirichlet/.venv_qwen35_uv/lib/python3.12/site-packages/httpcore/_backends/auto.py +52 -0
  35. LTA_openwebtext_dualt/mini_owt_logdirichlet/.venv_qwen35_uv/lib/python3.12/site-packages/httpcore/_backends/base.py +101 -0
  36. LTA_openwebtext_dualt/mini_owt_logdirichlet/.venv_qwen35_uv/lib/python3.12/site-packages/httpcore/_backends/mock.py +143 -0
  37. LTA_openwebtext_dualt/mini_owt_logdirichlet/.venv_qwen35_uv/lib/python3.12/site-packages/httpcore/_backends/sync.py +241 -0
  38. LTA_openwebtext_dualt/mini_owt_logdirichlet/.venv_qwen35_uv/lib/python3.12/site-packages/httpcore/_backends/trio.py +159 -0
  39. LTA_openwebtext_dualt/mini_owt_logdirichlet/.venv_qwen35_uv/lib/python3.12/site-packages/mdurl/__init__.py +18 -0
  40. LTA_openwebtext_dualt/mini_owt_logdirichlet/.venv_qwen35_uv/lib/python3.12/site-packages/mdurl/_decode.py +104 -0
  41. LTA_openwebtext_dualt/mini_owt_logdirichlet/.venv_qwen35_uv/lib/python3.12/site-packages/mdurl/_encode.py +85 -0
  42. LTA_openwebtext_dualt/mini_owt_logdirichlet/.venv_qwen35_uv/lib/python3.12/site-packages/mdurl/_format.py +27 -0
  43. LTA_openwebtext_dualt/mini_owt_logdirichlet/.venv_qwen35_uv/lib/python3.12/site-packages/mdurl/_parse.py +304 -0
  44. LTA_openwebtext_dualt/mini_owt_logdirichlet/.venv_qwen35_uv/lib/python3.12/site-packages/mdurl/_url.py +14 -0
  45. LTA_openwebtext_dualt/mini_owt_logdirichlet/.venv_qwen35_uv/lib/python3.12/site-packages/mdurl/py.typed +1 -0
  46. LTA_openwebtext_dualt/mini_owt_logdirichlet/.venv_qwen35_uv/lib/python3.12/site-packages/numpy-1.26.4.dist-info/INSTALLER +1 -0
  47. LTA_openwebtext_dualt/mini_owt_logdirichlet/.venv_qwen35_uv/lib/python3.12/site-packages/numpy-1.26.4.dist-info/LICENSE.txt +971 -0
  48. LTA_openwebtext_dualt/mini_owt_logdirichlet/.venv_qwen35_uv/lib/python3.12/site-packages/numpy-1.26.4.dist-info/METADATA +1092 -0
  49. LTA_openwebtext_dualt/mini_owt_logdirichlet/.venv_qwen35_uv/lib/python3.12/site-packages/numpy-1.26.4.dist-info/RECORD +792 -0
  50. LTA_openwebtext_dualt/mini_owt_logdirichlet/.venv_qwen35_uv/lib/python3.12/site-packages/numpy-1.26.4.dist-info/REQUESTED +0 -0
LTA_openwebtext_dualt/logs/ar_lm1b_flmpack_bert_small_len128_gbs512_4gpu_1m_20260504_195806.log ADDED
@@ -0,0 +1,305 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ *****************************************
3
+ Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed.
4
+ *****************************************
5
+ NCCL version 2.25.1+cuda12.8
6
+ {
7
+ "task": "ar_lm",
8
+ "device": "cuda:0",
9
+ "rank": 0,
10
+ "world_size": 4,
11
+ "samples": "wrapped_streaming",
12
+ "vocab_size": 30522,
13
+ "bos_id": 101,
14
+ "eos_id": 102,
15
+ "save_dir": "runs/ar_lm1b_flmpack_bert_small_len128_gbs512_4gpu_1m_20260504_195806",
16
+ "params": 108440832,
17
+ "batch_size": 64,
18
+ "grad_accum": 2,
19
+ "effective_batch_size": 512,
20
+ "global_batch_size": 512,
21
+ "max_len": 128,
22
+ "wrap": true,
23
+ "text_detokenizer": "lm1b",
24
+ "openwebtext_split": "all",
25
+ "torch_compile": false
26
+ }
27
+ step=20 micro_steps=40 elapsed=4.3s lr=2.520000e-06 loss=10.3798 ppl=32478.6943 acc=0.0027 tokens=8128.0000
28
+ step=40 micro_steps=80 elapsed=3.6s lr=4.920000e-06 loss=9.6982 ppl=16677.9630 acc=0.0420 tokens=8128.0000
29
+ step=60 micro_steps=120 elapsed=3.6s lr=7.320000e-06 loss=9.1128 ppl=9132.2660 acc=0.0511 tokens=8128.0000
30
+ step=80 micro_steps=160 elapsed=3.6s lr=9.720000e-06 loss=8.8143 ppl=6750.1859 acc=0.0650 tokens=8128.0000
31
+ step=100 micro_steps=200 elapsed=3.6s lr=1.212000e-05 loss=8.5221 ppl=5041.4905 acc=0.0799 tokens=8128.0000
32
+ step=120 micro_steps=240 elapsed=3.6s lr=1.452000e-05 loss=8.2581 ppl=3870.1153 acc=0.0867 tokens=8128.0000
33
+ step=140 micro_steps=280 elapsed=3.6s lr=1.692000e-05 loss=7.9724 ppl=2911.8598 acc=0.0919 tokens=8128.0000
34
+ step=160 micro_steps=320 elapsed=3.6s lr=1.932000e-05 loss=7.6568 ppl=2123.6313 acc=0.1067 tokens=8128.0000
35
+ step=180 micro_steps=360 elapsed=3.6s lr=2.172000e-05 loss=7.3502 ppl=1563.0248 acc=0.1163 tokens=8128.0000
36
+ step=200 micro_steps=400 elapsed=3.6s lr=2.412000e-05 loss=7.0837 ppl=1195.9628 acc=0.1251 tokens=8128.0000
37
+ step=220 micro_steps=440 elapsed=3.6s lr=2.652000e-05 loss=6.8824 ppl=976.4170 acc=0.1333 tokens=8128.0000
38
+ step=240 micro_steps=480 elapsed=3.6s lr=2.892000e-05 loss=6.7489 ppl=854.0937 acc=0.1375 tokens=8128.0000
39
+ step=260 micro_steps=520 elapsed=3.6s lr=3.132000e-05 loss=6.6435 ppl=768.6732 acc=0.1441 tokens=8128.0000
40
+ step=280 micro_steps=560 elapsed=3.6s lr=3.372000e-05 loss=6.5504 ppl=700.1528 acc=0.1519 tokens=8128.0000
41
+ step=300 micro_steps=600 elapsed=3.6s lr=3.612000e-05 loss=6.4669 ppl=644.4497 acc=0.1587 tokens=8128.0000
42
+ step=320 micro_steps=640 elapsed=3.6s lr=3.852000e-05 loss=6.3869 ppl=594.5753 acc=0.1655 tokens=8128.0000
43
+ step=340 micro_steps=680 elapsed=3.6s lr=4.092000e-05 loss=6.2992 ppl=544.6934 acc=0.1715 tokens=8128.0000
44
+ step=360 micro_steps=720 elapsed=3.6s lr=4.332000e-05 loss=6.2257 ppl=506.2021 acc=0.1749 tokens=8128.0000
45
+ step=380 micro_steps=760 elapsed=3.6s lr=4.572000e-05 loss=6.1530 ppl=470.6410 acc=0.1788 tokens=8128.0000
46
+ step=400 micro_steps=800 elapsed=3.6s lr=4.812000e-05 loss=6.0747 ppl=435.1815 acc=0.1824 tokens=8128.0000
47
+ step=420 micro_steps=840 elapsed=3.6s lr=5.052000e-05 loss=5.9999 ppl=403.7083 acc=0.1866 tokens=8128.0000
48
+ step=440 micro_steps=880 elapsed=3.6s lr=5.292000e-05 loss=5.9622 ppl=388.8123 acc=0.1894 tokens=8128.0000
49
+ step=460 micro_steps=920 elapsed=3.6s lr=5.532000e-05 loss=5.9080 ppl=368.1609 acc=0.1923 tokens=8128.0000
50
+ step=480 micro_steps=960 elapsed=3.6s lr=5.772000e-05 loss=5.8534 ppl=348.7774 acc=0.1953 tokens=8128.0000
51
+ step=500 micro_steps=1000 elapsed=3.6s lr=6.012000e-05 loss=5.8124 ppl=334.7449 acc=0.1972 tokens=8128.0000
52
+ step=520 micro_steps=1040 elapsed=3.6s lr=6.252000e-05 loss=5.7628 ppl=318.4989 acc=0.1996 tokens=8128.0000
53
+ step=540 micro_steps=1080 elapsed=3.6s lr=6.492000e-05 loss=5.7351 ppl=309.8943 acc=0.2010 tokens=8128.0000
54
+ step=560 micro_steps=1120 elapsed=3.6s lr=6.732000e-05 loss=5.6905 ppl=296.3728 acc=0.2039 tokens=8128.0000
55
+ step=580 micro_steps=1160 elapsed=3.6s lr=6.972000e-05 loss=5.6509 ppl=284.9865 acc=0.2069 tokens=8128.0000
56
+ step=600 micro_steps=1200 elapsed=3.6s lr=7.212000e-05 loss=5.6142 ppl=274.5673 acc=0.2090 tokens=8128.0000
57
+ step=620 micro_steps=1240 elapsed=3.6s lr=7.452000e-05 loss=5.5644 ppl=261.2549 acc=0.2113 tokens=8128.0000
58
+ step=640 micro_steps=1280 elapsed=3.6s lr=7.692000e-05 loss=5.5398 ppl=254.9341 acc=0.2136 tokens=8128.0000
59
+ step=660 micro_steps=1320 elapsed=3.6s lr=7.932000e-05 loss=5.5199 ppl=249.8425 acc=0.2146 tokens=8128.0000
60
+ step=680 micro_steps=1360 elapsed=3.6s lr=8.172000e-05 loss=5.4786 ppl=239.7250 acc=0.2176 tokens=8128.0000
61
+ step=700 micro_steps=1400 elapsed=3.6s lr=8.412000e-05 loss=5.4594 ppl=235.2477 acc=0.2177 tokens=8128.0000
62
+ step=720 micro_steps=1440 elapsed=3.6s lr=8.652000e-05 loss=5.4230 ppl=226.7282 acc=0.2195 tokens=8128.0000
63
+ step=740 micro_steps=1480 elapsed=3.6s lr=8.892000e-05 loss=5.3881 ppl=219.0196 acc=0.2224 tokens=8128.0000
64
+ step=760 micro_steps=1520 elapsed=3.6s lr=9.132000e-05 loss=5.3681 ppl=214.6676 acc=0.2242 tokens=8128.0000
65
+ step=780 micro_steps=1560 elapsed=3.6s lr=9.372000e-05 loss=5.3561 ppl=212.1429 acc=0.2236 tokens=8128.0000
66
+ step=800 micro_steps=1600 elapsed=3.6s lr=9.612000e-05 loss=5.3154 ppl=203.6174 acc=0.2266 tokens=8128.0000
67
+ step=820 micro_steps=1640 elapsed=3.6s lr=9.852000e-05 loss=5.2852 ppl=197.5881 acc=0.2281 tokens=8128.0000
68
+ step=840 micro_steps=1680 elapsed=3.6s lr=1.009200e-04 loss=5.2622 ppl=193.1013 acc=0.2307 tokens=8128.0000
69
+ step=860 micro_steps=1720 elapsed=3.6s lr=1.033200e-04 loss=5.2473 ppl=190.2556 acc=0.2309 tokens=8128.0000
70
+ step=880 micro_steps=1760 elapsed=3.6s lr=1.057200e-04 loss=5.2202 ppl=185.1611 acc=0.2324 tokens=8128.0000
71
+ step=900 micro_steps=1800 elapsed=3.6s lr=1.081200e-04 loss=5.1907 ppl=179.7384 acc=0.2351 tokens=8128.0000
72
+ step=920 micro_steps=1840 elapsed=3.6s lr=1.105200e-04 loss=5.1668 ppl=175.4561 acc=0.2370 tokens=8128.0000
73
+ step=940 micro_steps=1880 elapsed=3.6s lr=1.129200e-04 loss=5.1512 ppl=172.8847 acc=0.2380 tokens=8128.0000
74
+ step=960 micro_steps=1920 elapsed=3.6s lr=1.153200e-04 loss=5.1224 ppl=167.9127 acc=0.2385 tokens=8128.0000
75
+ step=980 micro_steps=1960 elapsed=3.6s lr=1.177200e-04 loss=5.1084 ppl=165.6339 acc=0.2393 tokens=8128.0000
76
+ step=1000 micro_steps=2000 elapsed=3.6s lr=1.201200e-04 loss=5.0897 ppl=162.5289 acc=0.2409 tokens=8128.0000
77
+ [sample step=1000] [CLS] bash turned on and had five - day splash design. [SEP] the rails of proficiency were eastern, with the drum - smashed races of tif hoc sabbath. [SEP] good could be in liechtenstein, but not 314 of the same floor is imminent. [SEP] kristin fox expired 24 - 15 on beth gregor marsh's celebration and he was volleyed with his being sacked as top - completing practice in oakland. [SEP] passion for poverty and cuts: chief executive the company's office reported that markedly of last year'slight and the administration's potentially derivatives force indicates to become careful friday. [SEP] overall, then the economic compensation program has acquired [SEP]
78
+ step=1020 micro_steps=2040 elapsed=6.4s lr=1.225200e-04 loss=5.0651 ppl=158.5900 acc=0.2426 tokens=8128.0000
79
+ step=1040 micro_steps=2080 elapsed=3.6s lr=1.249200e-04 loss=5.0462 ppl=155.6245 acc=0.2436 tokens=8128.0000
80
+ step=1060 micro_steps=2120 elapsed=3.6s lr=1.273200e-04 loss=5.0489 ppl=156.0155 acc=0.2436 tokens=8128.0000
81
+ step=1080 micro_steps=2160 elapsed=3.6s lr=1.297200e-04 loss=5.0148 ppl=150.8239 acc=0.2466 tokens=8128.0000
82
+ step=1100 micro_steps=2200 elapsed=3.6s lr=1.321200e-04 loss=4.9821 ppl=145.9810 acc=0.2484 tokens=8128.0000
83
+ step=1120 micro_steps=2240 elapsed=3.6s lr=1.345200e-04 loss=4.9698 ppl=144.2456 acc=0.2497 tokens=8128.0000
84
+ step=1140 micro_steps=2280 elapsed=3.6s lr=1.369200e-04 loss=4.9559 ppl=142.2155 acc=0.2496 tokens=8128.0000
85
+ step=1160 micro_steps=2320 elapsed=3.6s lr=1.393200e-04 loss=4.9270 ppl=138.0649 acc=0.2513 tokens=8128.0000
86
+ step=1180 micro_steps=2360 elapsed=3.6s lr=1.417200e-04 loss=4.9200 ppl=137.1738 acc=0.2521 tokens=8128.0000
87
+ step=1200 micro_steps=2400 elapsed=3.6s lr=1.441200e-04 loss=4.9104 ppl=135.8119 acc=0.2530 tokens=8128.0000
88
+ step=1220 micro_steps=2440 elapsed=3.6s lr=1.465200e-04 loss=4.8784 ppl=131.6055 acc=0.2556 tokens=8128.0000
89
+ step=1240 micro_steps=2480 elapsed=3.6s lr=1.489200e-04 loss=4.8727 ppl=130.8100 acc=0.2568 tokens=8128.0000
90
+ step=1260 micro_steps=2520 elapsed=3.6s lr=1.513200e-04 loss=4.8468 ppl=127.4586 acc=0.2564 tokens=8128.0000
91
+ step=1280 micro_steps=2560 elapsed=3.6s lr=1.537200e-04 loss=4.8371 ppl=126.2536 acc=0.2580 tokens=8128.0000
92
+ step=1300 micro_steps=2600 elapsed=3.6s lr=1.561200e-04 loss=4.8055 ppl=122.3074 acc=0.2601 tokens=8128.0000
93
+ step=1320 micro_steps=2640 elapsed=3.6s lr=1.585200e-04 loss=4.8018 ppl=121.8846 acc=0.2605 tokens=8128.0000
94
+ step=1340 micro_steps=2680 elapsed=3.6s lr=1.609200e-04 loss=4.7835 ppl=119.6142 acc=0.2619 tokens=8128.0000
95
+ step=1360 micro_steps=2720 elapsed=3.6s lr=1.633200e-04 loss=4.7571 ppl=116.5595 acc=0.2649 tokens=8128.0000
96
+ step=1380 micro_steps=2760 elapsed=3.6s lr=1.657200e-04 loss=4.7526 ppl=116.0234 acc=0.2641 tokens=8128.0000
97
+ step=1400 micro_steps=2800 elapsed=3.6s lr=1.681200e-04 loss=4.7498 ppl=115.6697 acc=0.2641 tokens=8128.0000
98
+ step=1420 micro_steps=2840 elapsed=3.6s lr=1.705200e-04 loss=4.7416 ppl=114.7103 acc=0.2650 tokens=8128.0000
99
+ step=1440 micro_steps=2880 elapsed=3.6s lr=1.729200e-04 loss=4.7126 ppl=111.4812 acc=0.2663 tokens=8128.0000
100
+ step=1460 micro_steps=2920 elapsed=3.6s lr=1.753200e-04 loss=4.7078 ppl=110.9196 acc=0.2674 tokens=8128.0000
101
+ step=1480 micro_steps=2960 elapsed=3.6s lr=1.777200e-04 loss=4.6839 ppl=108.3107 acc=0.2680 tokens=8128.0000
102
+ step=1500 micro_steps=3000 elapsed=3.6s lr=1.801200e-04 loss=4.6728 ppl=107.1491 acc=0.2698 tokens=8128.0000
103
+ step=1520 micro_steps=3040 elapsed=3.6s lr=1.825200e-04 loss=4.6493 ppl=104.6331 acc=0.2710 tokens=8128.0000
104
+ step=1540 micro_steps=3080 elapsed=3.6s lr=1.849200e-04 loss=4.6501 ppl=104.7323 acc=0.2707 tokens=8128.0000
105
+ step=1560 micro_steps=3120 elapsed=3.6s lr=1.873200e-04 loss=4.6381 ppl=103.4611 acc=0.2718 tokens=8128.0000
106
+ step=1580 micro_steps=3160 elapsed=3.6s lr=1.897200e-04 loss=4.6088 ppl=100.5290 acc=0.2751 tokens=8128.0000
107
+ step=1600 micro_steps=3200 elapsed=3.6s lr=1.921200e-04 loss=4.6027 ppl=99.8427 acc=0.2747 tokens=8128.0000
108
+ step=1620 micro_steps=3240 elapsed=3.6s lr=1.945200e-04 loss=4.5819 ppl=97.7759 acc=0.2764 tokens=8128.0000
109
+ step=1640 micro_steps=3280 elapsed=3.6s lr=1.969200e-04 loss=4.5748 ppl=97.1006 acc=0.2762 tokens=8128.0000
110
+ step=1660 micro_steps=3320 elapsed=3.6s lr=1.993200e-04 loss=4.5683 ppl=96.5347 acc=0.2770 tokens=8128.0000
111
+ step=1680 micro_steps=3360 elapsed=3.6s lr=2.017200e-04 loss=4.5531 ppl=95.0791 acc=0.2777 tokens=8128.0000
112
+ step=1700 micro_steps=3400 elapsed=3.6s lr=2.041200e-04 loss=4.5314 ppl=92.9927 acc=0.2800 tokens=8128.0000
113
+ step=1720 micro_steps=3440 elapsed=3.6s lr=2.065200e-04 loss=4.5209 ppl=92.0741 acc=0.2796 tokens=8128.0000
114
+ step=1740 micro_steps=3480 elapsed=3.6s lr=2.089200e-04 loss=4.5257 ppl=92.4572 acc=0.2809 tokens=8128.0000
115
+ step=1760 micro_steps=3520 elapsed=3.6s lr=2.113200e-04 loss=4.5192 ppl=91.9197 acc=0.2803 tokens=8128.0000
116
+ step=1780 micro_steps=3560 elapsed=3.6s lr=2.137200e-04 loss=4.4929 ppl=89.4739 acc=0.2837 tokens=8128.0000
117
+ step=1800 micro_steps=3600 elapsed=3.6s lr=2.161200e-04 loss=4.4789 ppl=88.3597 acc=0.2846 tokens=8128.0000
118
+ step=1820 micro_steps=3640 elapsed=3.6s lr=2.185200e-04 loss=4.4777 ppl=88.1819 acc=0.2846 tokens=8128.0000
119
+ step=1840 micro_steps=3680 elapsed=3.6s lr=2.209200e-04 loss=4.4645 ppl=86.9802 acc=0.2859 tokens=8128.0000
120
+ step=1860 micro_steps=3720 elapsed=3.6s lr=2.233200e-04 loss=4.4604 ppl=86.6318 acc=0.2861 tokens=8128.0000
121
+ step=1880 micro_steps=3760 elapsed=3.6s lr=2.257200e-04 loss=4.4447 ppl=85.2599 acc=0.2866 tokens=8128.0000
122
+ step=1900 micro_steps=3800 elapsed=3.6s lr=2.281200e-04 loss=4.4276 ppl=83.8217 acc=0.2876 tokens=8128.0000
123
+ step=1920 micro_steps=3840 elapsed=3.6s lr=2.305200e-04 loss=4.4316 ppl=84.1305 acc=0.2883 tokens=8128.0000
124
+ step=1940 micro_steps=3880 elapsed=3.6s lr=2.329200e-04 loss=4.4196 ppl=83.1516 acc=0.2876 tokens=8128.0000
125
+ step=1960 micro_steps=3920 elapsed=3.6s lr=2.353200e-04 loss=4.4023 ppl=81.7102 acc=0.2905 tokens=8128.0000
126
+ step=1980 micro_steps=3960 elapsed=3.6s lr=2.377200e-04 loss=4.4063 ppl=82.0973 acc=0.2905 tokens=8128.0000
127
+ step=2000 micro_steps=4000 elapsed=3.6s lr=2.401200e-04 loss=4.3846 ppl=80.3016 acc=0.2917 tokens=8128.0000
128
+ [sample step=2000] [CLS]t ), and at the moment - - you really don't know the same the best proportions for getting things attention this is going to get hurt. [SEP] mr justice debashev had a positive interest in most of the country. [SEP] it wants to be a useful change. [SEP] the principal names of veteran officers, groups of fans committees and members of congress, because they while not horse - goers will play an important role in obtaining substantial invoking over his federal support, there are no help for the united states to preserve iranian community capacity. [SEP] the pendulum of dieting out is underscore the size of gas [SEP]
129
+ step=2020 micro_steps=4040 elapsed=5.8s lr=2.425200e-04 loss=4.3824 ppl=80.1633 acc=0.2902 tokens=8128.0000
130
+ step=2040 micro_steps=4080 elapsed=3.6s lr=2.449200e-04 loss=4.3680 ppl=78.9561 acc=0.2932 tokens=8128.0000
131
+ step=2060 micro_steps=4120 elapsed=3.6s lr=2.473200e-04 loss=4.3571 ppl=78.1244 acc=0.2932 tokens=8128.0000
132
+ step=2080 micro_steps=4160 elapsed=3.6s lr=2.497200e-04 loss=4.3752 ppl=79.5609 acc=0.2914 tokens=8128.0000
133
+ step=2100 micro_steps=4200 elapsed=3.6s lr=2.521200e-04 loss=4.3523 ppl=77.7967 acc=0.2944 tokens=8128.0000
134
+ step=2120 micro_steps=4240 elapsed=3.6s lr=2.545200e-04 loss=4.3403 ppl=76.8172 acc=0.2947 tokens=8128.0000
135
+ step=2140 micro_steps=4280 elapsed=3.6s lr=2.569200e-04 loss=4.3307 ppl=76.0678 acc=0.2953 tokens=8128.0000
136
+ step=2160 micro_steps=4320 elapsed=3.6s lr=2.593200e-04 loss=4.3392 ppl=76.7835 acc=0.2948 tokens=8128.0000
137
+ step=2180 micro_steps=4360 elapsed=3.6s lr=2.617200e-04 loss=4.3338 ppl=76.3271 acc=0.2947 tokens=8128.0000
138
+ step=2200 micro_steps=4400 elapsed=3.6s lr=2.641200e-04 loss=4.3230 ppl=75.4672 acc=0.2964 tokens=8128.0000
139
+ step=2220 micro_steps=4440 elapsed=3.6s lr=2.665200e-04 loss=4.3048 ppl=74.1440 acc=0.2987 tokens=8128.0000
140
+ step=2240 micro_steps=4480 elapsed=3.6s lr=2.689200e-04 loss=4.2885 ppl=72.9240 acc=0.2989 tokens=8128.0000
141
+ step=2260 micro_steps=4520 elapsed=3.6s lr=2.713200e-04 loss=4.3005 ppl=73.8235 acc=0.2969 tokens=8128.0000
142
+ step=2280 micro_steps=4560 elapsed=3.6s lr=2.737200e-04 loss=4.2877 ppl=72.8882 acc=0.2994 tokens=8128.0000
143
+ step=2300 micro_steps=4600 elapsed=3.6s lr=2.761200e-04 loss=4.2833 ppl=72.5995 acc=0.2992 tokens=8128.0000
144
+ step=2320 micro_steps=4640 elapsed=3.6s lr=2.785200e-04 loss=4.2780 ppl=72.1578 acc=0.2993 tokens=8128.0000
145
+ step=2340 micro_steps=4680 elapsed=3.6s lr=2.809200e-04 loss=4.2667 ppl=71.3834 acc=0.2998 tokens=8128.0000
146
+ step=2360 micro_steps=4720 elapsed=3.6s lr=2.833200e-04 loss=4.2594 ppl=70.8318 acc=0.3021 tokens=8128.0000
147
+ step=2380 micro_steps=4760 elapsed=3.6s lr=2.857200e-04 loss=4.2395 ppl=69.4661 acc=0.3038 tokens=8128.0000
148
+ step=2400 micro_steps=4800 elapsed=3.6s lr=2.881200e-04 loss=4.2583 ppl=70.7645 acc=0.3008 tokens=8128.0000
149
+ step=2420 micro_steps=4840 elapsed=3.6s lr=2.905200e-04 loss=4.2262 ppl=68.5519 acc=0.3055 tokens=8128.0000
150
+ step=2440 micro_steps=4880 elapsed=3.6s lr=2.929200e-04 loss=4.2411 ppl=69.5565 acc=0.3025 tokens=8128.0000
151
+ step=2460 micro_steps=4920 elapsed=3.6s lr=2.953200e-04 loss=4.2159 ppl=67.8834 acc=0.3051 tokens=8128.0000
152
+ step=2480 micro_steps=4960 elapsed=3.6s lr=2.977200e-04 loss=4.2350 ppl=69.1677 acc=0.3034 tokens=8128.0000
153
+ step=2500 micro_steps=5000 elapsed=3.6s lr=3.000000e-04 loss=4.2098 ppl=67.4287 acc=0.3069 tokens=8128.0000
154
+ step=2520 micro_steps=5040 elapsed=3.6s lr=3.000000e-04 loss=4.1991 ppl=66.6870 acc=0.3070 tokens=8128.0000
155
+ step=2540 micro_steps=5080 elapsed=3.6s lr=3.000000e-04 loss=4.2085 ppl=67.3566 acc=0.3055 tokens=8128.0000
156
+ step=2560 micro_steps=5120 elapsed=3.6s lr=3.000000e-04 loss=4.1952 ppl=66.4172 acc=0.3073 tokens=8128.0000
157
+ step=2580 micro_steps=5160 elapsed=3.6s lr=3.000000e-04 loss=4.1903 ppl=66.1340 acc=0.3071 tokens=8128.0000
158
+ step=2600 micro_steps=5200 elapsed=3.6s lr=3.000000e-04 loss=4.1832 ppl=65.6817 acc=0.3084 tokens=8128.0000
159
+ step=2620 micro_steps=5240 elapsed=3.6s lr=3.000000e-04 loss=4.1843 ppl=65.7183 acc=0.3079 tokens=8128.0000
160
+ step=2640 micro_steps=5280 elapsed=3.6s lr=3.000000e-04 loss=4.1928 ppl=66.2759 acc=0.3070 tokens=8128.0000
161
+ step=2660 micro_steps=5320 elapsed=3.6s lr=3.000000e-04 loss=4.1723 ppl=64.9454 acc=0.3093 tokens=8128.0000
162
+ step=2680 micro_steps=5360 elapsed=3.6s lr=3.000000e-04 loss=4.1549 ppl=63.8198 acc=0.3101 tokens=8128.0000
163
+ step=2700 micro_steps=5400 elapsed=3.6s lr=3.000000e-04 loss=4.1537 ppl=63.7657 acc=0.3095 tokens=8128.0000
164
+ step=2720 micro_steps=5440 elapsed=3.6s lr=3.000000e-04 loss=4.1471 ppl=63.3684 acc=0.3103 tokens=8128.0000
165
+ step=2740 micro_steps=5480 elapsed=3.6s lr=3.000000e-04 loss=4.1499 ppl=63.5364 acc=0.3112 tokens=8128.0000
166
+ step=2760 micro_steps=5520 elapsed=3.6s lr=3.000000e-04 loss=4.1403 ppl=62.9134 acc=0.3120 tokens=8128.0000
167
+ step=2780 micro_steps=5560 elapsed=3.6s lr=3.000000e-04 loss=4.1390 ppl=62.8143 acc=0.3123 tokens=8128.0000
168
+ step=2800 micro_steps=5600 elapsed=3.6s lr=3.000000e-04 loss=4.1324 ppl=62.4175 acc=0.3124 tokens=8128.0000
169
+ step=2820 micro_steps=5640 elapsed=3.6s lr=3.000000e-04 loss=4.1207 ppl=61.6718 acc=0.3127 tokens=8128.0000
170
+ step=2840 micro_steps=5680 elapsed=3.6s lr=3.000000e-04 loss=4.1225 ppl=61.7597 acc=0.3127 tokens=8128.0000
171
+ step=2860 micro_steps=5720 elapsed=3.6s lr=3.000000e-04 loss=4.1187 ppl=61.5916 acc=0.3140 tokens=8128.0000
172
+ step=2880 micro_steps=5760 elapsed=3.6s lr=3.000000e-04 loss=4.1156 ppl=61.3453 acc=0.3132 tokens=8128.0000
173
+ step=2900 micro_steps=5800 elapsed=3.6s lr=3.000000e-04 loss=4.0986 ppl=60.3072 acc=0.3161 tokens=8128.0000
174
+ step=2920 micro_steps=5840 elapsed=3.6s lr=3.000000e-04 loss=4.1038 ppl=60.6632 acc=0.3147 tokens=8128.0000
175
+ step=2940 micro_steps=5880 elapsed=3.6s lr=3.000000e-04 loss=4.1071 ppl=60.8467 acc=0.3137 tokens=8128.0000
176
+ step=2960 micro_steps=5920 elapsed=3.6s lr=3.000000e-04 loss=4.0904 ppl=59.8346 acc=0.3163 tokens=8128.0000
177
+ step=2980 micro_steps=5960 elapsed=3.6s lr=3.000000e-04 loss=4.1033 ppl=60.6203 acc=0.3137 tokens=8128.0000
178
+ step=3000 micro_steps=6000 elapsed=3.6s lr=3.000000e-04 loss=4.0854 ppl=59.5390 acc=0.3165 tokens=8128.0000
179
+ [sample step=3000] [CLS] hapless crude, trading from $38. 26 a barrel on reports on immeas roderick lee - - who had seen the big 12 experience when people arrived to check the stock. [SEP] attempted asylum ringleader s. b. son - in - law also linked col. umar with gen. rauci radan, who has until september 11 to attend the celebre jury hijacker in mount wootton, north yorkshire, and recently flown to new york, cuba with an apology from 18 self - proclaimed owners of a home where 13 - year - old gareth culbert abused his job as one of the first [SEP]
180
+ step=3020 micro_steps=6040 elapsed=5.8s lr=3.000000e-04 loss=4.0828 ppl=59.3799 acc=0.3161 tokens=8128.0000
181
+ step=3040 micro_steps=6080 elapsed=3.6s lr=3.000000e-04 loss=4.0786 ppl=59.1482 acc=0.3171 tokens=8128.0000
182
+ step=3060 micro_steps=6120 elapsed=3.6s lr=3.000000e-04 loss=4.0716 ppl=58.7107 acc=0.3180 tokens=8128.0000
183
+ step=3080 micro_steps=6160 elapsed=3.6s lr=3.000000e-04 loss=4.0767 ppl=59.0115 acc=0.3177 tokens=8128.0000
184
+ step=3100 micro_steps=6200 elapsed=3.6s lr=3.000000e-04 loss=4.0632 ppl=58.2278 acc=0.3177 tokens=8128.0000
185
+ step=3120 micro_steps=6240 elapsed=3.6s lr=3.000000e-04 loss=4.0738 ppl=58.8580 acc=0.3168 tokens=8128.0000
186
+ step=3140 micro_steps=6280 elapsed=3.6s lr=3.000000e-04 loss=4.0672 ppl=58.4934 acc=0.3178 tokens=8128.0000
187
+ step=3160 micro_steps=6320 elapsed=3.6s lr=3.000000e-04 loss=4.0560 ppl=57.8210 acc=0.3183 tokens=8128.0000
188
+ step=3180 micro_steps=6360 elapsed=3.6s lr=3.000000e-04 loss=4.0523 ppl=57.6100 acc=0.3202 tokens=8128.0000
189
+ step=3200 micro_steps=6400 elapsed=3.6s lr=3.000000e-04 loss=4.0588 ppl=57.9823 acc=0.3185 tokens=8128.0000
190
+ step=3220 micro_steps=6440 elapsed=3.6s lr=3.000000e-04 loss=4.0476 ppl=57.3425 acc=0.3196 tokens=8128.0000
191
+ step=3240 micro_steps=6480 elapsed=3.6s lr=3.000000e-04 loss=4.0504 ppl=57.5045 acc=0.3200 tokens=8128.0000
192
+ step=3260 micro_steps=6520 elapsed=3.6s lr=3.000000e-04 loss=4.0352 ppl=56.6460 acc=0.3207 tokens=8128.0000
193
+ step=3280 micro_steps=6560 elapsed=3.6s lr=3.000000e-04 loss=4.0359 ppl=56.6621 acc=0.3211 tokens=8128.0000
194
+ step=3300 micro_steps=6600 elapsed=3.6s lr=3.000000e-04 loss=4.0452 ppl=57.1848 acc=0.3194 tokens=8128.0000
195
+ step=3320 micro_steps=6640 elapsed=3.6s lr=3.000000e-04 loss=4.0308 ppl=56.3376 acc=0.3218 tokens=8128.0000
196
+ step=3340 micro_steps=6680 elapsed=3.6s lr=3.000000e-04 loss=4.0391 ppl=56.8609 acc=0.3209 tokens=8128.0000
197
+ step=3360 micro_steps=6720 elapsed=3.6s lr=3.000000e-04 loss=4.0293 ppl=56.2732 acc=0.3222 tokens=8128.0000
198
+ step=3380 micro_steps=6760 elapsed=3.6s lr=3.000000e-04 loss=4.0174 ppl=55.5969 acc=0.3221 tokens=8128.0000
199
+ step=3400 micro_steps=6800 elapsed=3.6s lr=3.000000e-04 loss=4.0322 ppl=56.4517 acc=0.3218 tokens=8128.0000
200
+ step=3420 micro_steps=6840 elapsed=3.6s lr=3.000000e-04 loss=4.0103 ppl=55.2042 acc=0.3222 tokens=8128.0000
201
+ step=3440 micro_steps=6880 elapsed=3.6s lr=3.000000e-04 loss=4.0159 ppl=55.5478 acc=0.3234 tokens=8128.0000
202
+ step=3460 micro_steps=6920 elapsed=3.6s lr=3.000000e-04 loss=4.0053 ppl=54.9606 acc=0.3242 tokens=8128.0000
203
+ step=3480 micro_steps=6960 elapsed=3.6s lr=3.000000e-04 loss=4.0200 ppl=55.7635 acc=0.3221 tokens=8128.0000
204
+ step=3500 micro_steps=7000 elapsed=3.6s lr=3.000000e-04 loss=3.9826 ppl=53.7106 acc=0.3250 tokens=8128.0000
205
+ step=3520 micro_steps=7040 elapsed=3.6s lr=3.000000e-04 loss=4.0046 ppl=54.9312 acc=0.3248 tokens=8128.0000
206
+ step=3540 micro_steps=7080 elapsed=3.6s lr=3.000000e-04 loss=3.9995 ppl=54.6437 acc=0.3252 tokens=8128.0000
207
+ step=3560 micro_steps=7120 elapsed=3.6s lr=3.000000e-04 loss=3.9930 ppl=54.2816 acc=0.3244 tokens=8128.0000
208
+ step=3580 micro_steps=7160 elapsed=3.6s lr=3.000000e-04 loss=3.9871 ppl=53.9546 acc=0.3248 tokens=8128.0000
209
+ step=3600 micro_steps=7200 elapsed=3.6s lr=3.000000e-04 loss=3.9826 ppl=53.7258 acc=0.3260 tokens=8128.0000
210
+ step=3620 micro_steps=7240 elapsed=3.6s lr=3.000000e-04 loss=3.9888 ppl=54.0497 acc=0.3256 tokens=8128.0000
211
+ step=3640 micro_steps=7280 elapsed=3.6s lr=3.000000e-04 loss=3.9819 ppl=53.6996 acc=0.3260 tokens=8128.0000
212
+ step=3660 micro_steps=7320 elapsed=3.6s lr=3.000000e-04 loss=3.9797 ppl=53.5662 acc=0.3270 tokens=8128.0000
213
+ step=3680 micro_steps=7360 elapsed=3.6s lr=3.000000e-04 loss=3.9860 ppl=53.8878 acc=0.3255 tokens=8128.0000
214
+ step=3700 micro_steps=7400 elapsed=3.6s lr=3.000000e-04 loss=3.9782 ppl=53.4761 acc=0.3250 tokens=8128.0000
215
+ step=3720 micro_steps=7440 elapsed=3.6s lr=3.000000e-04 loss=3.9659 ppl=52.8435 acc=0.3281 tokens=8128.0000
216
+ step=3740 micro_steps=7480 elapsed=3.6s lr=3.000000e-04 loss=3.9718 ppl=53.1304 acc=0.3275 tokens=8128.0000
217
+ step=3760 micro_steps=7520 elapsed=3.6s lr=3.000000e-04 loss=3.9885 ppl=54.0326 acc=0.3251 tokens=8128.0000
218
+ step=3780 micro_steps=7560 elapsed=3.6s lr=3.000000e-04 loss=3.9540 ppl=52.2093 acc=0.3283 tokens=8128.0000
219
+ step=3800 micro_steps=7600 elapsed=3.6s lr=3.000000e-04 loss=3.9595 ppl=52.4764 acc=0.3275 tokens=8128.0000
220
+ step=3820 micro_steps=7640 elapsed=3.6s lr=3.000000e-04 loss=3.9700 ppl=53.0521 acc=0.3263 tokens=8128.0000
221
+ step=3840 micro_steps=7680 elapsed=3.6s lr=3.000000e-04 loss=3.9764 ppl=53.4071 acc=0.3256 tokens=8128.0000
222
+ step=3860 micro_steps=7720 elapsed=3.6s lr=3.000000e-04 loss=3.9489 ppl=51.9420 acc=0.3287 tokens=8128.0000
223
+ step=3880 micro_steps=7760 elapsed=3.6s lr=3.000000e-04 loss=3.9492 ppl=51.9533 acc=0.3290 tokens=8128.0000
224
+ step=3900 micro_steps=7800 elapsed=3.6s lr=3.000000e-04 loss=3.9630 ppl=52.6557 acc=0.3273 tokens=8128.0000
225
+ step=3920 micro_steps=7840 elapsed=3.6s lr=3.000000e-04 loss=3.9416 ppl=51.5544 acc=0.3302 tokens=8128.0000
226
+ step=3940 micro_steps=7880 elapsed=3.6s lr=3.000000e-04 loss=3.9534 ppl=52.1683 acc=0.3286 tokens=8128.0000
227
+ step=3960 micro_steps=7920 elapsed=3.6s lr=3.000000e-04 loss=3.9490 ppl=51.9456 acc=0.3284 tokens=8128.0000
228
+ step=3980 micro_steps=7960 elapsed=3.6s lr=3.000000e-04 loss=3.9503 ppl=52.0025 acc=0.3275 tokens=8128.0000
229
+ step=4000 micro_steps=8000 elapsed=3.6s lr=3.000000e-04 loss=3.9315 ppl=51.0298 acc=0.3307 tokens=8128.0000
230
+ [sample step=4000] [CLS] such a showdown, which ended some years later with a trip to the united states. [SEP] worsening the pain in americans - - and hopefully inapturating it - - is hard on those people. [SEP] at one point, it would rise near the corset airport in post - communist north africa, drawing viewers who struggle to cope with tourist hikes. [SEP] he had sheltered some $40, 000 for a farmer from working his way to a nearby park instead of labeling a panda or watching the wild boar (30 feet) instead of the man. [SEP] is affluensky's demand definitely not worth? [SEP] [SEP]
231
+ step=4020 micro_steps=8040 elapsed=5.8s lr=3.000000e-04 loss=3.9312 ppl=51.0517 acc=0.3308 tokens=8128.0000
232
+ step=4040 micro_steps=8080 elapsed=3.6s lr=3.000000e-04 loss=3.9361 ppl=51.2798 acc=0.3303 tokens=8128.0000
233
+ step=4060 micro_steps=8120 elapsed=3.6s lr=3.000000e-04 loss=3.9501 ppl=52.0138 acc=0.3285 tokens=8128.0000
234
+ step=4080 micro_steps=8160 elapsed=3.6s lr=3.000000e-04 loss=3.9394 ppl=51.4709 acc=0.3286 tokens=8128.0000
235
+ step=4100 micro_steps=8200 elapsed=3.6s lr=3.000000e-04 loss=3.9253 ppl=50.7103 acc=0.3307 tokens=8128.0000
236
+ step=4120 micro_steps=8240 elapsed=3.6s lr=3.000000e-04 loss=3.9292 ppl=50.9211 acc=0.3316 tokens=8128.0000
237
+ step=4140 micro_steps=8280 elapsed=3.6s lr=3.000000e-04 loss=3.9309 ppl=51.0085 acc=0.3309 tokens=8128.0000
238
+ step=4160 micro_steps=8320 elapsed=3.6s lr=3.000000e-04 loss=3.9245 ppl=50.7222 acc=0.3310 tokens=8128.0000
239
+ step=4180 micro_steps=8360 elapsed=3.6s lr=3.000000e-04 loss=3.9219 ppl=50.5655 acc=0.3307 tokens=8128.0000
240
+ step=4200 micro_steps=8400 elapsed=3.6s lr=3.000000e-04 loss=3.9287 ppl=50.9071 acc=0.3310 tokens=8128.0000
241
+ step=4220 micro_steps=8440 elapsed=3.6s lr=3.000000e-04 loss=3.9173 ppl=50.3085 acc=0.3305 tokens=8128.0000
242
+ step=4240 micro_steps=8480 elapsed=3.6s lr=3.000000e-04 loss=3.9112 ppl=50.0233 acc=0.3331 tokens=8128.0000
243
+ step=4260 micro_steps=8520 elapsed=3.6s lr=3.000000e-04 loss=3.9256 ppl=50.7239 acc=0.3311 tokens=8128.0000
244
+ step=4280 micro_steps=8560 elapsed=3.6s lr=3.000000e-04 loss=3.9058 ppl=49.7594 acc=0.3336 tokens=8128.0000
245
+ step=4300 micro_steps=8600 elapsed=3.6s lr=3.000000e-04 loss=3.8825 ppl=48.6190 acc=0.3352 tokens=8128.0000
246
+ step=4320 micro_steps=8640 elapsed=3.6s lr=3.000000e-04 loss=3.9073 ppl=49.8234 acc=0.3320 tokens=8128.0000
247
+ step=4340 micro_steps=8680 elapsed=3.6s lr=3.000000e-04 loss=3.9016 ppl=49.5402 acc=0.3324 tokens=8128.0000
248
+ step=4360 micro_steps=8720 elapsed=3.6s lr=3.000000e-04 loss=3.9019 ppl=49.5451 acc=0.3337 tokens=8128.0000
249
+ step=4380 micro_steps=8760 elapsed=3.6s lr=3.000000e-04 loss=3.8976 ppl=49.3407 acc=0.3327 tokens=8128.0000
250
+ step=4400 micro_steps=8800 elapsed=3.6s lr=3.000000e-04 loss=3.8919 ppl=49.0610 acc=0.3336 tokens=8128.0000
251
+ step=4420 micro_steps=8840 elapsed=3.6s lr=3.000000e-04 loss=3.8915 ppl=49.0252 acc=0.3344 tokens=8128.0000
252
+ step=4440 micro_steps=8880 elapsed=3.6s lr=3.000000e-04 loss=3.8883 ppl=48.8853 acc=0.3333 tokens=8128.0000
253
+ step=4460 micro_steps=8920 elapsed=3.6s lr=3.000000e-04 loss=3.9000 ppl=49.4839 acc=0.3336 tokens=8128.0000
254
+ step=4480 micro_steps=8960 elapsed=3.6s lr=3.000000e-04 loss=3.8926 ppl=49.0913 acc=0.3341 tokens=8128.0000
255
+ step=4500 micro_steps=9000 elapsed=3.6s lr=3.000000e-04 loss=3.8436 ppl=47.3114 acc=0.3418 tokens=8128.0000
256
+ step=4520 micro_steps=9040 elapsed=3.6s lr=3.000000e-04 loss=3.8916 ppl=49.0472 acc=0.3343 tokens=8128.0000
257
+ step=4540 micro_steps=9080 elapsed=3.6s lr=3.000000e-04 loss=3.8798 ppl=48.4476 acc=0.3358 tokens=8128.0000
258
+ step=4560 micro_steps=9120 elapsed=3.6s lr=3.000000e-04 loss=3.8811 ppl=48.5576 acc=0.3355 tokens=8128.0000
259
+ step=4580 micro_steps=9160 elapsed=3.6s lr=3.000000e-04 loss=3.8873 ppl=48.8260 acc=0.3343 tokens=8128.0000
260
+ step=4600 micro_steps=9200 elapsed=3.6s lr=3.000000e-04 loss=3.8902 ppl=48.9647 acc=0.3335 tokens=8128.0000
261
+ step=4620 micro_steps=9240 elapsed=3.6s lr=3.000000e-04 loss=3.8757 ppl=48.2724 acc=0.3353 tokens=8128.0000
262
+ step=4640 micro_steps=9280 elapsed=3.6s lr=3.000000e-04 loss=3.8801 ppl=48.5090 acc=0.3355 tokens=8128.0000
263
+ step=4660 micro_steps=9320 elapsed=3.6s lr=3.000000e-04 loss=3.8660 ppl=47.8119 acc=0.3357 tokens=8128.0000
264
+ step=4680 micro_steps=9360 elapsed=3.6s lr=3.000000e-04 loss=3.8870 ppl=48.8011 acc=0.3352 tokens=8128.0000
265
+ step=4700 micro_steps=9400 elapsed=3.6s lr=3.000000e-04 loss=3.8677 ppl=47.9033 acc=0.3370 tokens=8128.0000
266
+ step=4720 micro_steps=9440 elapsed=3.6s lr=3.000000e-04 loss=3.8797 ppl=48.4573 acc=0.3350 tokens=8128.0000
267
+ step=4740 micro_steps=9480 elapsed=3.6s lr=3.000000e-04 loss=3.8766 ppl=48.3031 acc=0.3353 tokens=8128.0000
268
+ step=4760 micro_steps=9520 elapsed=3.6s lr=3.000000e-04 loss=3.8714 ppl=48.0545 acc=0.3365 tokens=8128.0000
269
+ step=4780 micro_steps=9560 elapsed=3.6s lr=3.000000e-04 loss=3.8526 ppl=47.1621 acc=0.3381 tokens=8128.0000
270
+ step=4800 micro_steps=9600 elapsed=3.6s lr=3.000000e-04 loss=3.8639 ppl=47.6841 acc=0.3374 tokens=8128.0000
271
+ step=4820 micro_steps=9640 elapsed=3.6s lr=3.000000e-04 loss=3.8566 ppl=47.3693 acc=0.3369 tokens=8128.0000
272
+ step=4840 micro_steps=9680 elapsed=3.6s lr=3.000000e-04 loss=3.8527 ppl=47.1919 acc=0.3392 tokens=8128.0000
273
+ step=4860 micro_steps=9720 elapsed=3.6s lr=3.000000e-04 loss=3.8581 ppl=47.4503 acc=0.3370 tokens=8128.0000
274
+ step=4880 micro_steps=9760 elapsed=3.6s lr=3.000000e-04 loss=3.8667 ppl=47.8499 acc=0.3356 tokens=8128.0000
275
+ step=4900 micro_steps=9800 elapsed=3.6s lr=3.000000e-04 loss=3.8576 ppl=47.4053 acc=0.3376 tokens=8128.0000
276
+ step=4920 micro_steps=9840 elapsed=3.6s lr=3.000000e-04 loss=3.8391 ppl=46.5728 acc=0.3389 tokens=8128.0000
277
+ step=4940 micro_steps=9880 elapsed=3.6s lr=3.000000e-04 loss=3.8671 ppl=47.8644 acc=0.3354 tokens=8128.0000
278
+ step=4960 micro_steps=9920 elapsed=3.6s lr=3.000000e-04 loss=3.8590 ppl=47.4656 acc=0.3372 tokens=8128.0000
279
+ step=4980 micro_steps=9960 elapsed=3.6s lr=3.000000e-04 loss=3.8433 ppl=46.7323 acc=0.3388 tokens=8128.0000
280
+ step=5000 micro_steps=10000 elapsed=3.6s lr=3.000000e-04 loss=3.8529 ppl=47.1858 acc=0.3383 tokens=8128.0000
281
+ [sample step=5000] [CLS] siddiqui, said he felt that mr. haidl's decision had been influenced at best by popular experiences, his long - time opponent in power and friendship among iraqis, his family and friends. [SEP] the oft found the us consumer credit agency in a weak showing in the early 1990s as well as a resurgence by the htc of the u. s. - based marketing firm roche. [SEP] tidal energy's developers will also have to acquire time warner network for up to $1 billion. [SEP] 0428: will clorox and epert 8 sell it in jars? [SEP] until now, ebay ' [SEP]
282
+ step=5020 micro_steps=10040 elapsed=5.8s lr=3.000000e-04 loss=3.8548 ppl=47.2749 acc=0.3385 tokens=8128.0000
283
+ step=5040 micro_steps=10080 elapsed=3.6s lr=3.000000e-04 loss=3.8402 ppl=46.5920 acc=0.3385 tokens=8128.0000
284
+ step=5060 micro_steps=10120 elapsed=3.6s lr=3.000000e-04 loss=3.8436 ppl=46.7635 acc=0.3386 tokens=8128.0000
285
+ step=5080 micro_steps=10160 elapsed=3.6s lr=3.000000e-04 loss=3.8557 ppl=47.3276 acc=0.3375 tokens=8128.0000
286
+ step=5100 micro_steps=10200 elapsed=3.6s lr=3.000000e-04 loss=3.8403 ppl=46.5893 acc=0.3387 tokens=8128.0000
287
+ step=5120 micro_steps=10240 elapsed=3.6s lr=3.000000e-04 loss=3.8308 ppl=46.1426 acc=0.3396 tokens=8128.0000
288
+ step=5140 micro_steps=10280 elapsed=3.6s lr=3.000000e-04 loss=3.8295 ppl=46.0875 acc=0.3423 tokens=8128.0000
289
+ step=5160 micro_steps=10320 elapsed=3.6s lr=3.000000e-04 loss=3.8385 ppl=46.4952 acc=0.3402 tokens=8128.0000
290
+ step=5180 micro_steps=10360 elapsed=3.6s lr=3.000000e-04 loss=3.8380 ppl=46.4819 acc=0.3401 tokens=8128.0000
291
+ step=5200 micro_steps=10400 elapsed=3.6s lr=3.000000e-04 loss=3.8301 ppl=46.1601 acc=0.3395 tokens=8128.0000
292
+ step=5220 micro_steps=10440 elapsed=3.6s lr=3.000000e-04 loss=3.8380 ppl=46.5034 acc=0.3395 tokens=8128.0000
293
+ step=5240 micro_steps=10480 elapsed=3.6s lr=3.000000e-04 loss=3.8326 ppl=46.2529 acc=0.3408 tokens=8128.0000
294
+ step=5260 micro_steps=10520 elapsed=3.6s lr=3.000000e-04 loss=3.8225 ppl=45.7632 acc=0.3410 tokens=8128.0000
295
+ step=5280 micro_steps=10560 elapsed=3.6s lr=3.000000e-04 loss=3.8151 ppl=45.4480 acc=0.3414 tokens=8128.0000
296
+ step=5300 micro_steps=10600 elapsed=3.6s lr=3.000000e-04 loss=3.8284 ppl=46.0495 acc=0.3399 tokens=8128.0000
297
+ step=5320 micro_steps=10640 elapsed=3.6s lr=3.000000e-04 loss=3.8339 ppl=46.2933 acc=0.3391 tokens=8128.0000
298
+ step=5340 micro_steps=10680 elapsed=3.6s lr=3.000000e-04 loss=3.8308 ppl=46.1532 acc=0.3395 tokens=8128.0000
299
+ step=5360 micro_steps=10720 elapsed=3.6s lr=3.000000e-04 loss=3.8233 ppl=45.8144 acc=0.3416 tokens=8128.0000
300
+ step=5380 micro_steps=10760 elapsed=3.6s lr=3.000000e-04 loss=3.8281 ppl=46.0230 acc=0.3399 tokens=8128.0000
301
+ step=5400 micro_steps=10800 elapsed=3.6s lr=3.000000e-04 loss=3.8238 ppl=45.8492 acc=0.3411 tokens=8128.0000
302
+ step=5420 micro_steps=10840 elapsed=3.6s lr=3.000000e-04 loss=3.8261 ppl=45.9348 acc=0.3401 tokens=8128.0000
303
+ step=5440 micro_steps=10880 elapsed=3.6s lr=3.000000e-04 loss=3.8114 ppl=45.2710 acc=0.3412 tokens=8128.0000
304
+ step=5460 micro_steps=10920 elapsed=3.6s lr=3.000000e-04 loss=3.8170 ppl=45.5219 acc=0.3415 tokens=8128.0000
305
+ step=5480 micro_steps=10960 elapsed=3.6s lr=3.000000e-04 loss=3.8110 ppl=45.2390 acc=0.3422 tokens=8128.0000
LTA_openwebtext_dualt/logs/lta_lm1b_classic_dirichlet_len256_gbs512_4gpu_10k_save1k_20260523.train.pid ADDED
@@ -0,0 +1 @@
 
 
1
+ 994417
LTA_openwebtext_dualt/logs/lta_owt_dirichlet_categorical_fullvocab_c1024_fullycoupled_shufchunks_len128_gbs512_8gpu_1m.log ADDED
The diff for this file is too large to render. See raw diff
 
LTA_openwebtext_dualt/logs/lta_owt_gpt2cached_len1024_rollout1_p1_bench4gpu_20260513_152806.log ADDED
@@ -0,0 +1,103 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ *****************************************
3
+ Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed.
4
+ *****************************************
5
+ NCCL version 2.25.1+cuda12.8
6
+ {
7
+ "device": "cuda:0",
8
+ "rank": 0,
9
+ "world_size": 4,
10
+ "samples": "owt_cached_chunks:8734897",
11
+ "vocab_size": 50257,
12
+ "tokenizer_vocab_size": 50257,
13
+ "save_dir": "runs/lta_owt_gpt2cached_len1024_rollout1_p1_bench4gpu_20260513_152806",
14
+ "batch_size": 32,
15
+ "grad_accum": 4,
16
+ "effective_batch_size": 512,
17
+ "global_batch_size": 512,
18
+ "lr_schedule": "cosine",
19
+ "optimizer": "adamw",
20
+ "warmup_steps": 5,
21
+ "min_lr": 6e-05,
22
+ "weight_decay": 0.1,
23
+ "adamw_param_groups": "nanogpt",
24
+ "adam_beta1": 0.9,
25
+ "adam_beta2": 0.95,
26
+ "adam_eps": 1e-08,
27
+ "muon_momentum": 0.95,
28
+ "muon_ns_steps": 5,
29
+ "muon_update_scale": 1.0,
30
+ "ema_decay": 0.0,
31
+ "ema_start_step": 0,
32
+ "model_type": "ddit",
33
+ "dual_t": true,
34
+ "corrupt_t_mode": "same",
35
+ "corrupt_min_t": 0.0,
36
+ "corrupt_max_t": 1.0,
37
+ "prefix_block_prob": 0.0,
38
+ "prefix_block_len": 128,
39
+ "dirichlet_endpoint_mode": "categorical_dual_t",
40
+ "dirichlet_semantic_t_mode": "same",
41
+ "dirichlet_semantic_t_value": 0.0,
42
+ "categorical_wrong_from_full_vocab": true,
43
+ "categorical_wrong_from_batch_valid_tokens": false,
44
+ "mask_mixture_original_prob": 0.0,
45
+ "mask_mixture_lowk_prob": 0.0,
46
+ "mask_mixture_lowcorrupt_prob": 0.0,
47
+ "mask_mixture_block_prob": 0.0,
48
+ "mask_mixture_all_prob": 0.0,
49
+ "mask_mixture_lowk_clean_tokens": "1,2,4,8,16,32,64",
50
+ "mask_mixture_lowcorrupt_tokens": "1,2,4,8,16,32,64",
51
+ "mask_mixture_block_tokens": "64,128",
52
+ "simplex_bridge_sampler": "dirichlet",
53
+ "logistic_normal_sigma_min": 0.18,
54
+ "logistic_normal_sigma_max": 2.2,
55
+ "logistic_normal_tau_min": 0.65,
56
+ "logistic_normal_tau_max": 1.15,
57
+ "torch_compile": false,
58
+ "compile_mode": "max-autotune",
59
+ "state_format": "prob",
60
+ "target_loss": "hard_ce",
61
+ "meanflow_weight": 0.0,
62
+ "rollout_train_prob": 1.0,
63
+ "rollout_train_steps": 1,
64
+ "rollout_train_infer_steps": 64,
65
+ "rollout_train_temp": 1.45,
66
+ "rollout_train_max_gamma": 1.0,
67
+ "rollout_train_corrupt_only": true,
68
+ "rollout_train_samplewise": false,
69
+ "rollout_train_compute_always": false,
70
+ "bridge_noise_init": "logistic_normal",
71
+ "noise_sigma": -1.0,
72
+ "allow_tf32": true,
73
+ "activation_checkpointing": false,
74
+ "activation_checkpoint_interval": 1,
75
+ "ddp_static_graph": false,
76
+ "ddp_gradient_as_bucket_view": true,
77
+ "blocking_data_transfer": false,
78
+ "dataloader_prefetch_factor": 4,
79
+ "full_train_stats": false,
80
+ "record_pad_truncate": false,
81
+ "record_add_eos": false,
82
+ "record_add_special_tokens": false,
83
+ "record_pad_token": "pad",
84
+ "record_shuffle_buffer": 10000,
85
+ "wrap": true,
86
+ "wrap_mode": "stream",
87
+ "wrap_record_buffer_size": 200,
88
+ "owt_cached_chunks": true,
89
+ "owt_chunk_cache_dir": "/e2e-data/evad-tech-vla/wanghan58/data/small_benchmarks/langflow_2604_11748/openwebtext_lta_cached_chunks/gpt2_len1024_train_minus_100k",
90
+ "owt_chunk_cache_rebuild": false,
91
+ "owt_chunk_cache_write_batch": 4096,
92
+ "owt_exact_repeat_per_chunk": 0,
93
+ "online_chunk_shuffle": false,
94
+ "online_chunk_shuffle_buffer": 10000,
95
+ "openwebtext_split": "train_minus_100k",
96
+ "detokenizer": "auto",
97
+ "resolved_detokenizer": null,
98
+ "num_workers": 8,
99
+ "latest_every": 100000,
100
+ "resume_path": ""
101
+ }
102
+ step=5 micro_steps=20 elapsed=17.2s lr=6.000000e-04 loss=10.8125 loss_recon=10.8125 loss_meanflow=0.0000 mean_model_t=0.5068 mean_corrupt_t=0.5068 mean_loss_t_weight=1.0000 prior_center_loss_beta=0.0000 rollout_train_applied=1.0000 acc_all=0.0005 acc_corrupt=0.0005 corrupt_frac=0.5206 loss_all=10.8125 loss_corrupt=10.8125 acc_corrupt_t_0p0_0p2=0.0011 corrupt_frac_t_0p0_0p2=0.1067 acc_corrupt_t_0p2_0p4=0.0004 corrupt_frac_t_0p2_0p4=0.1344 acc_corrupt_t_0p4_0p6=0.0000 corrupt_frac_t_0p4_0p6=0.0925 acc_corrupt_t_0p6_0p8=0.0007 corrupt_frac_t_0p6_0p8=0.4272 acc_corrupt_t_0p8_1p0=0.0000 corrupt_frac_t_0p8_1p0=0.2392 wrong_frac=0.3916 init_acc_corrupt=0.5891 init_gold_top10=0.6052 init_gold_top100=0.6225
103
+ step=10 micro_steps=40 elapsed=19.8s lr=6.000000e-05 loss=10.8125 loss_recon=10.8125 loss_meanflow=0.0000 mean_model_t=0.4822 mean_corrupt_t=0.4822 mean_loss_t_weight=1.0000 prior_center_loss_beta=0.0000 rollout_train_applied=1.0000 acc_all=0.0007 acc_corrupt=0.0006 corrupt_frac=0.6584 loss_all=10.8125 loss_corrupt=10.8125 acc_corrupt_t_0p0_0p2=0.0010 corrupt_frac_t_0p0_0p2=0.3402 acc_corrupt_t_0p2_0p4=0.0003 corrupt_frac_t_0p2_0p4=0.1348 acc_corrupt_t_0p4_0p6=0.0005 corrupt_frac_t_0p4_0p6=0.1821 acc_corrupt_t_0p6_0p8=0.0000 corrupt_frac_t_0p6_0p8=0.1046 acc_corrupt_t_0p8_1p0=0.0004 corrupt_frac_t_0p8_1p0=0.2383 wrong_frac=0.5652 init_acc_corrupt=0.4016 init_gold_top10=0.4274 init_gold_top100=0.4691
LTA_openwebtext_dualt/logs/lta_owt_t5_absrope_adaln_dirichlet_len1024_Cv_to_2v_mask0p1_1p0_sameT_gbs512_b32_8gpu_1m_save10k_20260525_watcher.log ADDED
@@ -0,0 +1,458 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [watch-gumbel] run_dir=runs/lta_owt_t5_absrope_adaln_dirichlet_len1024_Cv_to_2v_mask0p1_1p0_sameT_gbs512_b32_8gpu_1m_save10k_20260525
2
+ [watch-gumbel] out_base=docs/lta_samples/metrics_20260525/owt_t5_absrope_adaln_Cv_to_2v_mask0p1_1p0_sameT_sde_gumbel_topp0.95_tau1.0_to_0.2_blend_c32100_64200_n128/lta_owt_t5_absrope_adaln_dirichlet_len1024_Cv_to_2v_mask0p1_1p0_sameT_gbs512_b32_8gpu_1m_save10k_20260525
3
+ [watch-gumbel] interval=10000 max_len=1024 steps=128 c=32100->64200 temp=1.45 top_p=0.95 tau=1.0->0.2 n=128
4
+ [watch-gumbel] 2026-05-25_17:58:57 no ckpt yet
5
+ [watch-gumbel] 2026-05-25_17:59:27 no ckpt yet
6
+ [watch-gumbel] 2026-05-25_17:59:57 no ckpt yet
7
+ [watch-gumbel] 2026-05-25_18:00:27 no ckpt yet
8
+ [watch-gumbel] 2026-05-25_18:00:57 no ckpt yet
9
+ [watch-gumbel] 2026-05-25_18:01:27 no ckpt yet
10
+ [watch-gumbel] 2026-05-25_18:01:57 no ckpt yet
11
+ [watch-gumbel] 2026-05-25_18:02:27 no ckpt yet
12
+ [watch-gumbel] 2026-05-25_18:02:57 no ckpt yet
13
+ [watch-gumbel] 2026-05-25_18:03:27 no ckpt yet
14
+ [watch-gumbel] 2026-05-25_18:03:57 no ckpt yet
15
+ [watch-gumbel] 2026-05-25_18:04:27 no ckpt yet
16
+ [watch-gumbel] 2026-05-25_18:04:57 no ckpt yet
17
+ [watch-gumbel] 2026-05-25_18:05:27 no ckpt yet
18
+ [watch-gumbel] 2026-05-25_18:05:57 no ckpt yet
19
+ [watch-gumbel] 2026-05-25_18:06:27 no ckpt yet
20
+ [watch-gumbel] 2026-05-25_18:06:57 no ckpt yet
21
+ [watch-gumbel] 2026-05-25_18:07:27 no ckpt yet
22
+ [watch-gumbel] 2026-05-25_18:07:57 no ckpt yet
23
+ [watch-gumbel] 2026-05-25_18:08:27 no ckpt yet
24
+ [watch-gumbel] 2026-05-25_18:08:57 no ckpt yet
25
+ [watch-gumbel] 2026-05-25_18:09:27 no ckpt yet
26
+ [watch-gumbel] 2026-05-25_18:09:57 no ckpt yet
27
+ [watch-gumbel] 2026-05-25_18:10:27 no ckpt yet
28
+ [watch-gumbel] 2026-05-25_18:10:57 no ckpt yet
29
+ [watch-gumbel] 2026-05-25_18:11:27 no ckpt yet
30
+ [watch-gumbel] 2026-05-25_18:11:57 no ckpt yet
31
+ [watch-gumbel] 2026-05-25_18:12:27 no ckpt yet
32
+ [watch-gumbel] 2026-05-25_18:12:57 no ckpt yet
33
+ [watch-gumbel] 2026-05-25_18:13:27 no ckpt yet
34
+ [watch-gumbel] 2026-05-25_18:13:57 no ckpt yet
35
+ [watch-gumbel] 2026-05-25_18:14:27 no ckpt yet
36
+ [watch-gumbel] 2026-05-25_18:14:57 no ckpt yet
37
+ [watch-gumbel] 2026-05-25_18:15:27 no ckpt yet
38
+ [watch-gumbel] 2026-05-25_18:15:57 no ckpt yet
39
+ [watch-gumbel] 2026-05-25_18:16:27 no ckpt yet
40
+ [watch-gumbel] 2026-05-25_18:16:57 no ckpt yet
41
+ [watch-gumbel] 2026-05-25_18:17:27 no ckpt yet
42
+ [watch-gumbel] 2026-05-25_18:17:57 no ckpt yet
43
+ [watch-gumbel] 2026-05-25_18:18:27 no ckpt yet
44
+ [watch-gumbel] 2026-05-25_18:18:57 no ckpt yet
45
+ [watch-gumbel] 2026-05-25_18:19:27 no ckpt yet
46
+ [watch-gumbel] 2026-05-25_18:19:57 no ckpt yet
47
+ [watch-gumbel] 2026-05-25_18:20:27 no ckpt yet
48
+ [watch-gumbel] 2026-05-25_18:20:57 no ckpt yet
49
+ [watch-gumbel] 2026-05-25_18:21:27 no ckpt yet
50
+ [watch-gumbel] 2026-05-25_18:21:57 no ckpt yet
51
+ [watch-gumbel] 2026-05-25_18:22:27 no ckpt yet
52
+ [watch-gumbel] 2026-05-25_18:22:57 no ckpt yet
53
+ [watch-gumbel] 2026-05-25_18:23:27 no ckpt yet
54
+ [watch-gumbel] 2026-05-25_18:23:57 no ckpt yet
55
+ [watch-gumbel] 2026-05-25_18:24:27 no ckpt yet
56
+ [watch-gumbel] 2026-05-25_18:24:57 no ckpt yet
57
+ [watch-gumbel] 2026-05-25_18:25:27 no ckpt yet
58
+ [watch-gumbel] 2026-05-25_18:25:57 no ckpt yet
59
+ [watch-gumbel] 2026-05-25_18:26:27 no ckpt yet
60
+ [watch-gumbel] 2026-05-25_18:26:57 no ckpt yet
61
+ [watch-gumbel] 2026-05-25_18:27:27 no ckpt yet
62
+ [watch-gumbel] 2026-05-25_18:27:57 no ckpt yet
63
+ [watch-gumbel] 2026-05-25_18:28:27 no ckpt yet
64
+ [watch-gumbel] 2026-05-25_18:28:57 no ckpt yet
65
+ [watch-gumbel] 2026-05-25_18:29:27 no ckpt yet
66
+ [watch-gumbel] 2026-05-25_18:29:57 no ckpt yet
67
+ [watch-gumbel] 2026-05-25_18:30:27 no ckpt yet
68
+ [watch-gumbel] 2026-05-25_18:30:57 no ckpt yet
69
+ [watch-gumbel] 2026-05-25_18:31:27 no ckpt yet
70
+ [watch-gumbel] 2026-05-25_18:31:57 no ckpt yet
71
+ [watch-gumbel] 2026-05-25_18:32:27 no ckpt yet
72
+ [watch-gumbel] 2026-05-25_18:32:57 no ckpt yet
73
+ [watch-gumbel] 2026-05-25_18:33:27 no ckpt yet
74
+ [watch-gumbel] 2026-05-25_18:33:57 no ckpt yet
75
+ [watch-gumbel] 2026-05-25_18:34:27 no ckpt yet
76
+ [watch-gumbel] 2026-05-25_18:34:57 no ckpt yet
77
+ [watch-gumbel] 2026-05-25_18:35:27 no ckpt yet
78
+ [watch-gumbel] 2026-05-25_18:35:57 no ckpt yet
79
+ [watch-gumbel] 2026-05-25_18:36:27 no ckpt yet
80
+ [watch-gumbel] 2026-05-25_18:36:57 no ckpt yet
81
+ [watch-gumbel] 2026-05-25_18:37:27 no ckpt yet
82
+ [watch-gumbel] 2026-05-25_18:37:57 no ckpt yet
83
+ [watch-gumbel] 2026-05-25_18:38:27 no ckpt yet
84
+ [watch-gumbel] 2026-05-25_18:38:57 no ckpt yet
85
+ [watch-gumbel] 2026-05-25_18:39:27 no ckpt yet
86
+ [watch-gumbel] 2026-05-25_18:39:57 no ckpt yet
87
+ [watch-gumbel] 2026-05-25_18:40:27 no ckpt yet
88
+ [watch-gumbel] 2026-05-25_18:40:57 no ckpt yet
89
+ [watch-gumbel] 2026-05-25_18:41:27 no ckpt yet
90
+ [watch-gumbel] 2026-05-25_18:41:57 no ckpt yet
91
+ [watch-gumbel] 2026-05-25_18:42:27 no ckpt yet
92
+ [watch-gumbel] 2026-05-25_18:42:57 no ckpt yet
93
+ [watch-gumbel] 2026-05-25_18:43:27 no ckpt yet
94
+ [watch-gumbel] 2026-05-25_18:43:57 no ckpt yet
95
+ [watch-gumbel] 2026-05-25_18:44:27 no ckpt yet
96
+ [watch-gumbel] 2026-05-25_18:44:57 no ckpt yet
97
+ [watch-gumbel] 2026-05-25_18:45:27 no ckpt yet
98
+ [watch-gumbel] 2026-05-25_18:45:57 no ckpt yet
99
+ [watch-gumbel] 2026-05-25_18:46:27 no ckpt yet
100
+ [watch-gumbel] 2026-05-25_18:46:57 no ckpt yet
101
+ [watch-gumbel] 2026-05-25_18:47:27 no ckpt yet
102
+ [watch-gumbel] 2026-05-25_18:47:57 no ckpt yet
103
+ [watch-gumbel] 2026-05-25_18:48:27 no ckpt yet
104
+ [watch-gumbel] 2026-05-25_18:48:57 no ckpt yet
105
+ [watch-gumbel] 2026-05-25_18:49:27 no ckpt yet
106
+ [watch-gumbel] 2026-05-25_18:49:57 no ckpt yet
107
+ [watch-gumbel] 2026-05-25_18:50:27 no ckpt yet
108
+ [watch-gumbel] 2026-05-25_18:50:57 no ckpt yet
109
+ [watch-gumbel] 2026-05-25_18:51:27 no ckpt yet
110
+ [watch-gumbel] 2026-05-25_18:51:57 no ckpt yet
111
+ [watch-gumbel] 2026-05-25_18:52:27 no ckpt yet
112
+ [watch-gumbel] 2026-05-25_18:52:57 no ckpt yet
113
+ [watch-gumbel] 2026-05-25_18:53:27 no ckpt yet
114
+ [watch-gumbel] 2026-05-25_18:53:57 no ckpt yet
115
+ [watch-gumbel] 2026-05-25_18:54:27 no ckpt yet
116
+ [watch-gumbel] 2026-05-25_18:54:57 no ckpt yet
117
+ [watch-gumbel] 2026-05-25_18:55:27 no ckpt yet
118
+ [watch-gumbel] 2026-05-25_18:55:57 no ckpt yet
119
+ [watch-gumbel] 2026-05-25_18:56:27 no ckpt yet
120
+ [watch-gumbel] 2026-05-25_18:56:57 no ckpt yet
121
+ [watch-gumbel] 2026-05-25_18:57:27 no ckpt yet
122
+ [watch-gumbel] 2026-05-25_18:57:57 no ckpt yet
123
+ [watch-gumbel] 2026-05-25_18:58:27 no ckpt yet
124
+ [watch-gumbel] 2026-05-25_18:58:57 no ckpt yet
125
+ [watch-gumbel] 2026-05-25_18:59:27 no ckpt yet
126
+ [watch-gumbel] 2026-05-25_18:59:57 no ckpt yet
127
+ [watch-gumbel] 2026-05-25_19:00:27 no ckpt yet
128
+ [watch-gumbel] 2026-05-25_19:00:57 no ckpt yet
129
+ [watch-gumbel] 2026-05-25_19:01:27 no ckpt yet
130
+ [watch-gumbel] 2026-05-25_19:01:57 no ckpt yet
131
+ [watch-gumbel] 2026-05-25_19:02:27 no ckpt yet
132
+ [watch-gumbel] 2026-05-25_19:02:57 no ckpt yet
133
+ [watch-gumbel] 2026-05-25_19:03:27 no ckpt yet
134
+ [watch-gumbel] 2026-05-25_19:03:57 no ckpt yet
135
+ [watch-gumbel] 2026-05-25_19:04:27 no ckpt yet
136
+ [watch-gumbel] 2026-05-25_19:04:57 no ckpt yet
137
+ [watch-gumbel] 2026-05-25_19:05:27 no ckpt yet
138
+ [watch-gumbel] 2026-05-25_19:05:57 no ckpt yet
139
+ [watch-gumbel] 2026-05-25_19:06:27 no ckpt yet
140
+ [watch-gumbel] 2026-05-25_19:06:57 no ckpt yet
141
+ [watch-gumbel] 2026-05-25_19:07:27 no ckpt yet
142
+ [watch-gumbel] 2026-05-25_19:07:57 no ckpt yet
143
+ [watch-gumbel] 2026-05-25_19:08:27 no ckpt yet
144
+ [watch-gumbel] 2026-05-25_19:08:57 no ckpt yet
145
+ [watch-gumbel] 2026-05-25_19:09:27 no ckpt yet
146
+ [watch-gumbel] 2026-05-25_19:09:57 no ckpt yet
147
+ [watch-gumbel] 2026-05-25_19:10:27 no ckpt yet
148
+ [watch-gumbel] 2026-05-25_19:10:57 no ckpt yet
149
+ [watch-gumbel] 2026-05-25_19:11:27 no ckpt yet
150
+ [watch-gumbel] 2026-05-25_19:11:57 no ckpt yet
151
+ [watch-gumbel] 2026-05-25_19:12:27 no ckpt yet
152
+ [watch-gumbel] 2026-05-25_19:12:57 no ckpt yet
153
+ [watch-gumbel] 2026-05-25_19:13:27 no ckpt yet
154
+ [watch-gumbel] 2026-05-25_19:13:57 no ckpt yet
155
+ [watch-gumbel] 2026-05-25_19:14:27 no ckpt yet
156
+ [watch-gumbel] 2026-05-25_19:14:57 no ckpt yet
157
+ [watch-gumbel] 2026-05-25_19:15:27 no ckpt yet
158
+ [watch-gumbel] 2026-05-25_19:15:57 no ckpt yet
159
+ [watch-gumbel] 2026-05-25_19:16:27 no ckpt yet
160
+ [watch-gumbel] 2026-05-25_19:16:57 no ckpt yet
161
+ [watch-gumbel] 2026-05-25_19:17:27 no ckpt yet
162
+ [watch-gumbel] 2026-05-25_19:17:57 no ckpt yet
163
+ [watch-gumbel] 2026-05-25_19:18:27 no ckpt yet
164
+ [watch-gumbel] 2026-05-25_19:18:57 no ckpt yet
165
+ [watch-gumbel] 2026-05-25_19:19:27 no ckpt yet
166
+ [watch-gumbel] 2026-05-25_19:19:57 no ckpt yet
167
+ [watch-gumbel] 2026-05-25_19:20:27 no ckpt yet
168
+ [watch-gumbel] 2026-05-25_19:20:57 no ckpt yet
169
+ [watch-gumbel] 2026-05-25_19:21:27 no ckpt yet
170
+ [watch-gumbel] 2026-05-25_19:21:57 no ckpt yet
171
+ [watch-gumbel] 2026-05-25_19:22:27 no ckpt yet
172
+ [watch-gumbel] 2026-05-25_19:22:57 no ckpt yet
173
+ [watch-gumbel] 2026-05-25_19:23:27 no ckpt yet
174
+ [watch-gumbel] 2026-05-25_19:23:57 no ckpt yet
175
+ [watch-gumbel] 2026-05-25_19:24:27 no ckpt yet
176
+ [watch-gumbel] 2026-05-25_19:24:57 no ckpt yet
177
+ [watch-gumbel] 2026-05-25_19:25:27 no ckpt yet
178
+ [watch-gumbel] 2026-05-25_19:25:57 no ckpt yet
179
+ [watch-gumbel] 2026-05-25_19:26:27 no ckpt yet
180
+ [watch-gumbel] 2026-05-25_19:26:57 no ckpt yet
181
+ [watch-gumbel] 2026-05-25_19:27:27 no ckpt yet
182
+ [watch-gumbel] 2026-05-25_19:27:57 no ckpt yet
183
+ [watch-gumbel] 2026-05-25_19:28:27 no ckpt yet
184
+ [watch-gumbel] 2026-05-25_19:28:57 no ckpt yet
185
+ [watch-gumbel] 2026-05-25_19:29:27 no ckpt yet
186
+ [watch-gumbel] 2026-05-25_19:29:57 no ckpt yet
187
+ [watch-gumbel] 2026-05-25_19:30:27 infer runs/lta_owt_t5_absrope_adaln_dirichlet_len1024_Cv_to_2v_mask0p1_1p0_sameT_gbs512_b32_8gpu_1m_save10k_20260525/step_0010000.pt -> docs/lta_samples/metrics_20260525/owt_t5_absrope_adaln_Cv_to_2v_mask0p1_1p0_sameT_sde_gumbel_topp0.95_tau1.0_to_0.2_blend_c32100_64200_n128/lta_owt_t5_absrope_adaln_dirichlet_len1024_Cv_to_2v_mask0p1_1p0_sameT_gbs512_b32_8gpu_1m_save10k_20260525/step_0010000
188
+ [load] runs/lta_owt_t5_absrope_adaln_dirichlet_len1024_Cv_to_2v_mask0p1_1p0_sameT_gbs512_b32_8gpu_1m_save10k_20260525/step_0010000.pt
189
+ [ckpt] step=10000
190
+ [sde] generated 2/128
191
+ [sde] generated 4/128
192
+ [sde] generated 6/128
193
+ [sde] generated 8/128
194
+ [sde] generated 10/128
195
+ [sde] generated 12/128
196
+ [sde] generated 14/128
197
+ [sde] generated 16/128
198
+ [sde] generated 18/128
199
+ [sde] generated 20/128
200
+ [sde] generated 22/128
201
+ [sde] generated 24/128
202
+ [sde] generated 26/128
203
+ [sde] generated 28/128
204
+ [sde] generated 30/128
205
+ [sde] generated 32/128
206
+ [sde] generated 34/128
207
+ [sde] generated 36/128
208
+ [sde] generated 38/128
209
+ [sde] generated 40/128
210
+ [sde] generated 42/128
211
+ [sde] generated 44/128
212
+ [sde] generated 46/128
213
+ [sde] generated 48/128
214
+ [sde] generated 50/128
215
+ [sde] generated 52/128
216
+ [sde] generated 54/128
217
+ [sde] generated 56/128
218
+ [sde] generated 58/128
219
+ [sde] generated 60/128
220
+ [sde] generated 62/128
221
+ [sde] generated 64/128
222
+ [sde] generated 66/128
223
+ [sde] generated 68/128
224
+ [sde] generated 70/128
225
+ [sde] generated 72/128
226
+ [sde] generated 74/128
227
+ [sde] generated 76/128
228
+ [sde] generated 78/128
229
+ [sde] generated 80/128
230
+ [sde] generated 82/128
231
+ [sde] generated 84/128
232
+ [sde] generated 86/128
233
+ [sde] generated 88/128
234
+ [sde] generated 90/128
235
+ [sde] generated 92/128
236
+ [sde] generated 94/128
237
+ [sde] generated 96/128
238
+ [sde] generated 98/128
239
+ [sde] generated 100/128
240
+ [sde] generated 102/128
241
+ [sde] generated 104/128
242
+ [sde] generated 106/128
243
+ [sde] generated 108/128
244
+ [sde] generated 110/128
245
+ [sde] generated 112/128
246
+ [sde] generated 114/128
247
+ [sde] generated 116/128
248
+ [sde] generated 118/128
249
+ [sde] generated 120/128
250
+ [sde] generated 122/128
251
+ [sde] generated 124/128
252
+ [sde] generated 126/128
253
+ [sde] generated 128/128
254
+ [score] loading scorer: /e2e-data/evad-tech-vla/wanghan58/models/flowtext_scorers/gpt2-large-standard
255
+ [summary] {
256
+ "type": "summary",
257
+ "checkpoint": "runs/lta_owt_t5_absrope_adaln_dirichlet_len1024_Cv_to_2v_mask0p1_1p0_sameT_gbs512_b32_8gpu_1m_save10k_20260525/step_0010000.pt",
258
+ "step": 10000,
259
+ "decode": {
260
+ "decode_rule": "dirichlet_resample_sde",
261
+ "steps": 128,
262
+ "model_t_mode": "support_t",
263
+ "mean_mode": "endpoint_only",
264
+ "anchor_gamma": 1.0,
265
+ "endpoint_floor": 0.0,
266
+ "concentration_min": 32100.0,
267
+ "concentration_max": 64200.0,
268
+ "endpoint_temp": 1.45,
269
+ "endpoint_temp_start": null,
270
+ "endpoint_temp_end": null,
271
+ "endpoint_projection": "gumbel_softmax",
272
+ "endpoint_top_k": 0,
273
+ "endpoint_top_p": 0.95,
274
+ "gumbel_tau_start": 1.0,
275
+ "gumbel_tau_end": 0.2,
276
+ "gumbel_noise_scale_start": 1.0,
277
+ "gumbel_noise_scale_end": 1.0,
278
+ "ban_special_tokens": false,
279
+ "banned_endpoint_ids": [],
280
+ "support_power": 1.0,
281
+ "semantic_power": 1.0,
282
+ "noise_init": "dirichlet",
283
+ "noise_sigma": -1.0,
284
+ "noise_dirichlet_concentration": 32100.0,
285
+ "sde_resample": "dirichlet",
286
+ "logistic_normal_sigma_min": 0.18,
287
+ "logistic_normal_sigma_max": 3.0,
288
+ "logistic_normal_tau_min": 0.65,
289
+ "logistic_normal_tau_max": 1.0,
290
+ "final_from": "blend_0.5",
291
+ "n_samples": 128,
292
+ "seed": 20260524
293
+ },
294
+ "raw_genppl": {
295
+ "ppl": 2.098817389847081,
296
+ "nll_per_token": 0.7413740384102933,
297
+ "tokens": 129915,
298
+ "kept_samples": 128,
299
+ "total_samples": 128,
300
+ "empty_rate": 0.0,
301
+ "skipped_samples": 0
302
+ },
303
+ "stripped_genppl": {
304
+ "ppl": 2.091704636784785,
305
+ "nll_per_token": 0.737979349229492,
306
+ "tokens": 129874,
307
+ "kept_samples": 128,
308
+ "total_samples": 128,
309
+ "empty_rate": 0.0,
310
+ "skipped_samples": 0
311
+ },
312
+ "diversity": {
313
+ "sample_entropy": 1.1927971824809422,
314
+ "unique_tokens": 450,
315
+ "token_count": 131072,
316
+ "distinct_1": 0.0034332275390625,
317
+ "distinct_2": 0.02113880742913001,
318
+ "top_token_mass": 0.6524658203125
319
+ }
320
+ }
321
+ [done] docs/lta_samples/metrics_20260525/owt_t5_absrope_adaln_Cv_to_2v_mask0p1_1p0_sameT_sde_gumbel_topp0.95_tau1.0_to_0.2_blend_c32100_64200_n128/lta_owt_t5_absrope_adaln_dirichlet_len1024_Cv_to_2v_mask0p1_1p0_sameT_gbs512_b32_8gpu_1m_save10k_20260525/step_0010000/sde_steps128_samples128_scored.jsonl
322
+ [watch-gumbel] 2026-05-25_19:37:00 done step_0010000
323
+ [watch-gumbel] 2026-05-25_21:02:31 infer runs/lta_owt_t5_absrope_adaln_dirichlet_len1024_Cv_to_2v_mask0p1_1p0_sameT_gbs512_b32_8gpu_1m_save10k_20260525/step_0020000.pt -> docs/lta_samples/metrics_20260525/owt_t5_absrope_adaln_Cv_to_2v_mask0p1_1p0_sameT_sde_gumbel_topp0.95_tau1.0_to_0.2_blend_c32100_64200_n128/lta_owt_t5_absrope_adaln_dirichlet_len1024_Cv_to_2v_mask0p1_1p0_sameT_gbs512_b32_8gpu_1m_save10k_20260525/step_0020000
324
+ [load] runs/lta_owt_t5_absrope_adaln_dirichlet_len1024_Cv_to_2v_mask0p1_1p0_sameT_gbs512_b32_8gpu_1m_save10k_20260525/step_0020000.pt
325
+ [ckpt] step=20000
326
+ [sde] generated 2/128
327
+ [sde] generated 4/128
328
+ [sde] generated 6/128
329
+ [sde] generated 8/128
330
+ [sde] generated 10/128
331
+ [sde] generated 12/128
332
+ [sde] generated 14/128
333
+ [sde] generated 16/128
334
+ [sde] generated 18/128
335
+ [sde] generated 20/128
336
+ [sde] generated 22/128
337
+ [sde] generated 24/128
338
+ [sde] generated 26/128
339
+ [sde] generated 28/128
340
+ [sde] generated 30/128
341
+ [sde] generated 32/128
342
+ [sde] generated 34/128
343
+ [sde] generated 36/128
344
+ [sde] generated 38/128
345
+ [sde] generated 40/128
346
+ [sde] generated 42/128
347
+ [sde] generated 44/128
348
+ [sde] generated 46/128
349
+ [sde] generated 48/128
350
+ [sde] generated 50/128
351
+ [sde] generated 52/128
352
+ [sde] generated 54/128
353
+ [sde] generated 56/128
354
+ [sde] generated 58/128
355
+ [sde] generated 60/128
356
+ [sde] generated 62/128
357
+ [sde] generated 64/128
358
+ [sde] generated 66/128
359
+ [sde] generated 68/128
360
+ [sde] generated 70/128
361
+ [sde] generated 72/128
362
+ [sde] generated 74/128
363
+ [sde] generated 76/128
364
+ [sde] generated 78/128
365
+ [sde] generated 80/128
366
+ [sde] generated 82/128
367
+ [sde] generated 84/128
368
+ [sde] generated 86/128
369
+ [sde] generated 88/128
370
+ [sde] generated 90/128
371
+ [sde] generated 92/128
372
+ [sde] generated 94/128
373
+ [sde] generated 96/128
374
+ [sde] generated 98/128
375
+ [sde] generated 100/128
376
+ [sde] generated 102/128
377
+ [sde] generated 104/128
378
+ [sde] generated 106/128
379
+ [sde] generated 108/128
380
+ [sde] generated 110/128
381
+ [sde] generated 112/128
382
+ [sde] generated 114/128
383
+ [sde] generated 116/128
384
+ [sde] generated 118/128
385
+ [sde] generated 120/128
386
+ [sde] generated 122/128
387
+ [sde] generated 124/128
388
+ [sde] generated 126/128
389
+ [sde] generated 128/128
390
+ [score] loading scorer: /e2e-data/evad-tech-vla/wanghan58/models/flowtext_scorers/gpt2-large-standard
391
+ [summary] {
392
+ "type": "summary",
393
+ "checkpoint": "runs/lta_owt_t5_absrope_adaln_dirichlet_len1024_Cv_to_2v_mask0p1_1p0_sameT_gbs512_b32_8gpu_1m_save10k_20260525/step_0020000.pt",
394
+ "step": 20000,
395
+ "decode": {
396
+ "decode_rule": "dirichlet_resample_sde",
397
+ "steps": 128,
398
+ "model_t_mode": "support_t",
399
+ "mean_mode": "endpoint_only",
400
+ "anchor_gamma": 1.0,
401
+ "endpoint_floor": 0.0,
402
+ "concentration_min": 32100.0,
403
+ "concentration_max": 64200.0,
404
+ "endpoint_temp": 1.45,
405
+ "endpoint_temp_start": null,
406
+ "endpoint_temp_end": null,
407
+ "endpoint_projection": "gumbel_softmax",
408
+ "endpoint_top_k": 0,
409
+ "endpoint_top_p": 0.95,
410
+ "gumbel_tau_start": 1.0,
411
+ "gumbel_tau_end": 0.2,
412
+ "gumbel_noise_scale_start": 1.0,
413
+ "gumbel_noise_scale_end": 1.0,
414
+ "ban_special_tokens": false,
415
+ "banned_endpoint_ids": [],
416
+ "support_power": 1.0,
417
+ "semantic_power": 1.0,
418
+ "noise_init": "dirichlet",
419
+ "noise_sigma": -1.0,
420
+ "noise_dirichlet_concentration": 32100.0,
421
+ "sde_resample": "dirichlet",
422
+ "logistic_normal_sigma_min": 0.18,
423
+ "logistic_normal_sigma_max": 3.0,
424
+ "logistic_normal_tau_min": 0.65,
425
+ "logistic_normal_tau_max": 1.0,
426
+ "final_from": "blend_0.5",
427
+ "n_samples": 128,
428
+ "seed": 20260524
429
+ },
430
+ "raw_genppl": {
431
+ "ppl": 3.4360435319768396,
432
+ "nll_per_token": 1.234320673418016,
433
+ "tokens": 60733,
434
+ "kept_samples": 128,
435
+ "total_samples": 128,
436
+ "empty_rate": 0.0,
437
+ "skipped_samples": 0
438
+ },
439
+ "stripped_genppl": {
440
+ "ppl": 3.399305871374786,
441
+ "nll_per_token": 1.2235712553015452,
442
+ "tokens": 60637,
443
+ "kept_samples": 128,
444
+ "total_samples": 128,
445
+ "empty_rate": 0.0,
446
+ "skipped_samples": 0
447
+ },
448
+ "diversity": {
449
+ "sample_entropy": 0.8467485464533029,
450
+ "unique_tokens": 298,
451
+ "token_count": 131072,
452
+ "distinct_1": 0.0022735595703125,
453
+ "distinct_2": 0.017037817693059627,
454
+ "top_token_mass": 0.5230484008789062
455
+ }
456
+ }
457
+ [done] docs/lta_samples/metrics_20260525/owt_t5_absrope_adaln_Cv_to_2v_mask0p1_1p0_sameT_sde_gumbel_topp0.95_tau1.0_to_0.2_blend_c32100_64200_n128/lta_owt_t5_absrope_adaln_dirichlet_len1024_Cv_to_2v_mask0p1_1p0_sameT_gbs512_b32_8gpu_1m_save10k_20260525/step_0020000/sde_steps128_samples128_scored.jsonl
458
+ [watch-gumbel] 2026-05-25_21:08:53 done step_0020000
LTA_openwebtext_dualt/logs/owt_candidate_catdualt_step246k_64_c1024_t1p2_blend_n64.log ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ [forbid_endpoint_ids] n=352 first=[94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125]
2
+ [decode] steps64_c1024_mtpost_t1p2_tpow1p0_noise0_blend_anchored
3
+ [summary] {"name": "steps64_c1024_mtpost_t1p2_tpow1p0_noise0_blend_anchored", "step": 246000, "n_samples": 64, "steps": 64, "concentration_max": 1024.0, "temp_start": 1.2, "temp_end": 1.2, "temp_schedule": "const", "t_power": 1.0, "eta0": 0.0, "eta_schedule": "none", "noise_conc": 1.0, "final_from": "blend", "final_decode": "argmax", "final_temp": 1.0, "final_top_k": 0, "update_rule": "anchored", "model_t_mode": "post", "lock_bos": true, "lock_final_eos": false, "detok_genppl": 153.70967053904752, "sample_entropy": 4.642808948434547, "distinct_1": 0.1154937744140625, "distinct_2": 0.5061858504398827, "top_token_mass": 0.1103973388671875, "tokens_scored": 59240, "readability_score": 4.950578398852453, "mean_chars": 3752.34375, "replacement_chars": 0.0}
LTA_openwebtext_dualt/logs/softendpoint_mn_pilot_4gpu/smoke_gpt2_softendpoint_mn_n128_onehot.log ADDED
@@ -0,0 +1,94 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [rank0]: Traceback (most recent call last):
2
+ [rank0]: File "/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/train.py", line 2600, in <module>
3
+ [rank0]: main()
4
+ [rank0]: File "/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/train.py", line 1866, in main
5
+ [rank0]: dataset = CachedWrappedTextSequenceDataset(
6
+ [rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
7
+ [rank0]: File "/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/flowtext_lab/data.py", line 704, in __init__
8
+ [rank0]: raise ValueError(f"cache max_len={cache_max_len} does not match requested max_len={self.max_len}")
9
+ [rank0]: ValueError: cache max_len=1024 does not match requested max_len=128
10
+ [rank0]:[W516 22:06:53.301173242 ProcessGroupNCCL.cpp:1487] Warning: WARNING: destroy_process_group() was not called before program exit, which can leak resources. For more info, please see https://pytorch.org/docs/stable/distributed.html#shutdown (function operator())
11
+ W0516 22:06:53.771000 470531 torch/distributed/elastic/multiprocessing/api.py:898] Sending process 470536 closing signal SIGTERM
12
+ W0516 22:06:53.772000 470531 torch/distributed/elastic/multiprocessing/api.py:898] Sending process 470537 closing signal SIGTERM
13
+ W0516 22:06:53.773000 470531 torch/distributed/elastic/multiprocessing/api.py:898] Sending process 470538 closing signal SIGTERM
14
+ E0516 22:06:53.950000 470531 torch/distributed/elastic/multiprocessing/api.py:870] failed (exitcode: 1) local_rank: 0 (pid: 470535) of binary: /usr/bin/python
15
+ Traceback (most recent call last):
16
+ File "<frozen runpy>", line 198, in _run_module_as_main
17
+ File "<frozen runpy>", line 88, in _run_code
18
+ File "/usr/local/lib/python3.12/dist-packages/torch/distributed/run.py", line 922, in <module>
19
+ main()
20
+ File "/usr/local/lib/python3.12/dist-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 355, in wrapper
21
+ return f(*args, **kwargs)
22
+ ^^^^^^^^^^^^^^^^^^
23
+ File "/usr/local/lib/python3.12/dist-packages/torch/distributed/run.py", line 918, in main
24
+ run(args)
25
+ File "/usr/local/lib/python3.12/dist-packages/torch/distributed/run.py", line 909, in run
26
+ elastic_launch(
27
+ File "/usr/local/lib/python3.12/dist-packages/torch/distributed/launcher/api.py", line 139, in __call__
28
+ return launch_agent(self._config, self._entrypoint, list(args))
29
+ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
30
+ File "/usr/local/lib/python3.12/dist-packages/torch/distributed/launcher/api.py", line 270, in launch_agent
31
+ raise ChildFailedError(
32
+ torch.distributed.elastic.multiprocessing.errors.ChildFailedError:
33
+ ============================================================
34
+ train.py FAILED
35
+ ------------------------------------------------------------
36
+ Failures:
37
+ <NO_OTHER_FAILURES>
38
+ ------------------------------------------------------------
39
+ Root Cause (first observed failure):
40
+ [0]:
41
+ time : 2026-05-16_22:06:53
42
+ host : localhost
43
+ rank : 0 (local_rank: 0)
44
+ exitcode : 1 (pid: 470535)
45
+ error_file: <N/A>
46
+ traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
47
+ ============================================================
48
+ [rank0]: Traceback (most recent call last):
49
+ [rank0]: File "/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/train.py", line 2600, in <module>
50
+ [rank0]: main()
51
+ [rank0]: File "/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/train.py", line 1866, in main
52
+ [rank0]: dataset = CachedWrappedTextSequenceDataset(
53
+ [rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
54
+ [rank0]: File "/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/flowtext_lab/data.py", line 695, in __init__
55
+ [rank0]: raise RuntimeError(
56
+ [rank0]: RuntimeError: cached OWT chunks not found under /e2e-data/evad-tech-vla/wanghan58/data/small_benchmarks/langflow_2604_11748/openwebtext_lta_cached_chunks/gpt2_len128_train_minus_100k; build them first or set --owt_chunk_cache_rebuild on rank 0
57
+ [rank0]:[W516 22:07:24.000487031 ProcessGroupNCCL.cpp:1487] Warning: WARNING: destroy_process_group() was not called before program exit, which can leak resources. For more info, please see https://pytorch.org/docs/stable/distributed.html#shutdown (function operator())
58
+ W0516 22:07:24.408000 470659 torch/distributed/elastic/multiprocessing/api.py:898] Sending process 470664 closing signal SIGTERM
59
+ W0516 22:07:24.409000 470659 torch/distributed/elastic/multiprocessing/api.py:898] Sending process 470665 closing signal SIGTERM
60
+ W0516 22:07:24.409000 470659 torch/distributed/elastic/multiprocessing/api.py:898] Sending process 470666 closing signal SIGTERM
61
+ E0516 22:07:24.587000 470659 torch/distributed/elastic/multiprocessing/api.py:870] failed (exitcode: 1) local_rank: 0 (pid: 470663) of binary: /usr/bin/python
62
+ Traceback (most recent call last):
63
+ File "<frozen runpy>", line 198, in _run_module_as_main
64
+ File "<frozen runpy>", line 88, in _run_code
65
+ File "/usr/local/lib/python3.12/dist-packages/torch/distributed/run.py", line 922, in <module>
66
+ main()
67
+ File "/usr/local/lib/python3.12/dist-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 355, in wrapper
68
+ return f(*args, **kwargs)
69
+ ^^^^^^^^^^^^^^^^^^
70
+ File "/usr/local/lib/python3.12/dist-packages/torch/distributed/run.py", line 918, in main
71
+ run(args)
72
+ File "/usr/local/lib/python3.12/dist-packages/torch/distributed/run.py", line 909, in run
73
+ elastic_launch(
74
+ File "/usr/local/lib/python3.12/dist-packages/torch/distributed/launcher/api.py", line 139, in __call__
75
+ return launch_agent(self._config, self._entrypoint, list(args))
76
+ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
77
+ File "/usr/local/lib/python3.12/dist-packages/torch/distributed/launcher/api.py", line 270, in launch_agent
78
+ raise ChildFailedError(
79
+ torch.distributed.elastic.multiprocessing.errors.ChildFailedError:
80
+ ============================================================
81
+ train.py FAILED
82
+ ------------------------------------------------------------
83
+ Failures:
84
+ <NO_OTHER_FAILURES>
85
+ ------------------------------------------------------------
86
+ Root Cause (first observed failure):
87
+ [0]:
88
+ time : 2026-05-16_22:07:24
89
+ host : localhost
90
+ rank : 0 (local_rank: 0)
91
+ exitcode : 1 (pid: 470663)
92
+ error_file: <N/A>
93
+ traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
94
+ ============================================================
LTA_openwebtext_dualt/logs/softendpoint_mn_pilot_4gpu/train8_combo_len256_dirichlet_unigram_shared_highC_20260517_170456.log ADDED
@@ -0,0 +1,197 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ NCCL version 2.25.1+cuda12.8
2
+ {
3
+ "device": "cuda:0",
4
+ "rank": 0,
5
+ "world_size": 4,
6
+ "samples": "owt_cached_chunks:8",
7
+ "vocab_size": 969,
8
+ "tokenizer_vocab_size": 50257,
9
+ "save_dir": "runs/train8_combo_len256_dirichlet_unigram_shared_highC_20260517_170456",
10
+ "batch_size": 128,
11
+ "grad_accum": 1,
12
+ "effective_batch_size": 512,
13
+ "global_batch_size": 512,
14
+ "lr_schedule": "constant_warmup",
15
+ "optimizer": "muon",
16
+ "epochs": 0.0,
17
+ "steps_per_epoch": 1,
18
+ "total_steps": 1000,
19
+ "warmup_steps": 10,
20
+ "warmup_epochs": -1.0,
21
+ "min_lr": 0.0,
22
+ "weight_decay": 0.1,
23
+ "output_weight_decay": -1.0,
24
+ "adamw_param_groups": "nanogpt",
25
+ "adam_beta1": 0.9,
26
+ "adam_beta2": 0.95,
27
+ "adam_eps": 1e-08,
28
+ "muon_impl": "legacy",
29
+ "muon_momentum": 0.95,
30
+ "muon_ns_steps": 5,
31
+ "muon_update_scale": 1.0,
32
+ "muon_nesterov": false,
33
+ "muon_width_scale": false,
34
+ "muon_grouping": "legacy_dim_ge_2",
35
+ "muon_param_count": 1965440,
36
+ "muon_adam_param_count": 8192,
37
+ "muon_param_names": [
38
+ "vocab_embed.embedding",
39
+ "sigma_map.net.0.weight",
40
+ "sigma_map.net.2.weight",
41
+ "blocks.0.attn_qkv.weight",
42
+ "blocks.0.attn_out.weight",
43
+ "blocks.0.mlp.0.weight",
44
+ "blocks.0.mlp.2.weight",
45
+ "blocks.0.adaLN_modulation.weight",
46
+ "blocks.1.attn_qkv.weight",
47
+ "blocks.1.attn_out.weight",
48
+ "blocks.1.mlp.0.weight",
49
+ "blocks.1.mlp.2.weight",
50
+ "blocks.1.adaLN_modulation.weight",
51
+ "blocks.2.attn_qkv.weight",
52
+ "blocks.2.attn_out.weight",
53
+ "blocks.2.mlp.0.weight",
54
+ "blocks.2.mlp.2.weight",
55
+ "blocks.2.adaLN_modulation.weight",
56
+ "output_layer.linear.weight",
57
+ "output_layer.adaLN_modulation.weight"
58
+ ],
59
+ "muon_adam_param_names": [
60
+ "sigma_map.net.0.bias",
61
+ "sigma_map.net.2.bias",
62
+ "blocks.0.norm1.weight",
63
+ "blocks.0.norm2.weight",
64
+ "blocks.0.mlp.0.bias",
65
+ "blocks.0.mlp.2.bias",
66
+ "blocks.0.adaLN_modulation.bias",
67
+ "blocks.1.norm1.weight",
68
+ "blocks.1.norm2.weight",
69
+ "blocks.1.mlp.0.bias",
70
+ "blocks.1.mlp.2.bias",
71
+ "blocks.1.adaLN_modulation.bias",
72
+ "blocks.2.norm1.weight",
73
+ "blocks.2.norm2.weight",
74
+ "blocks.2.mlp.0.bias",
75
+ "blocks.2.mlp.2.bias",
76
+ "blocks.2.adaLN_modulation.bias",
77
+ "output_layer.norm_final.weight",
78
+ "output_layer.adaLN_modulation.bias"
79
+ ],
80
+ "muon_effective_nesterov": false,
81
+ "muon_effective_width_scale": false,
82
+ "muon_effective_weight_decay": 0.1,
83
+ "muon_adam_fallback_nesterov": false,
84
+ "muon_adam_fallback_weight_decay": 0.1,
85
+ "ema_decay": 0.9999,
86
+ "ema_start_step": 0,
87
+ "model_type": "ddit",
88
+ "ddit_mlp_type": "gelu",
89
+ "elf_num_time_tokens": 4,
90
+ "elf_num_model_mode_tokens": 0,
91
+ "qk_norm": true,
92
+ "output_bias": false,
93
+ "output_init_std": -1.0,
94
+ "norm_type": "rmsnorm",
95
+ "target_loss": "hard_ce",
96
+ "linear_soft_target_power": 1.0,
97
+ "linear_soft_target_min_conf": 0.0,
98
+ "linear_soft_target_max_conf": 1.0,
99
+ "t_sampling_mode": "logit_normal",
100
+ "t_sampling_power": 1.0,
101
+ "t_sampling_eps": 0.0001,
102
+ "t_sampling_logit_mean": -1.5,
103
+ "t_sampling_logit_std": 0.8,
104
+ "dual_t": true,
105
+ "corrupt_t_mode": "same",
106
+ "corrupt_min_t": 0.0,
107
+ "corrupt_max_t": 1.0,
108
+ "prefix_block_prob": 0.0,
109
+ "prefix_block_len": 128,
110
+ "mask_ratio_floor_schedule": "none",
111
+ "dirichlet_endpoint_mode": "categorical_dual_t",
112
+ "dirichlet_semantic_t_mode": "same",
113
+ "dirichlet_semantic_t_value": 0.0,
114
+ "dirichlet_semantic_t_curve": "linear",
115
+ "dirichlet_semantic_t_power": 1.0,
116
+ "endpoint_sequence_random_prob_alpha": 0.0,
117
+ "categorical_wrong_from_full_vocab": true,
118
+ "categorical_wrong_from_batch_valid_tokens": false,
119
+ "categorical_wrong_basin_token_ids": "",
120
+ "categorical_wrong_basin_prob": 0.0,
121
+ "categorical_wrong_unigram_prob": 1.0,
122
+ "categorical_wrong_uniform_prob": 0.0,
123
+ "categorical_wrong_corpus_unigram_path": "",
124
+ "categorical_wrong_corpus_unigram_alpha": 1.0,
125
+ "categorical_wrong_basin_shared_prob": 0.0,
126
+ "categorical_wrong_unigram_shared_prob": 0.5,
127
+ "mask_mixture_original_prob": 0.0,
128
+ "mask_mixture_lowk_prob": 0.0,
129
+ "mask_mixture_lowcorrupt_prob": 0.0,
130
+ "mask_mixture_block_prob": 0.0,
131
+ "mask_mixture_all_prob": 1.0,
132
+ "mask_mixture_lowk_clean_tokens": "0",
133
+ "mask_mixture_lowcorrupt_tokens": "1,2,4,8,16,32,64",
134
+ "mask_mixture_block_tokens": "64,128",
135
+ "simplex_bridge_sampler": "dirichlet",
136
+ "logistic_normal_sigma_min": 0.1,
137
+ "logistic_normal_sigma_max": 1.0,
138
+ "logistic_normal_tau_min": 1.0,
139
+ "logistic_normal_tau_max": 1.0,
140
+ "torch_compile": false,
141
+ "compile_mode": "max-autotune",
142
+ "state_format": "prob",
143
+ "meanflow_weight": 0.0,
144
+ "rollout_train_prob": 0.0,
145
+ "rollout_train_steps": 1,
146
+ "rollout_train_infer_steps": 64,
147
+ "rollout_train_temp": 1.45,
148
+ "rollout_train_max_gamma": 1.0,
149
+ "rollout_train_corrupt_only": true,
150
+ "rollout_train_samplewise": false,
151
+ "rollout_train_compute_always": false,
152
+ "bridge_noise_init": "logistic_normal",
153
+ "noise_sigma": -1.0,
154
+ "allow_tf32": true,
155
+ "activation_checkpointing": false,
156
+ "activation_checkpoint_interval": 1,
157
+ "activation_checkpoint_scope": "block",
158
+ "ddp_static_graph": false,
159
+ "ddp_gradient_as_bucket_view": true,
160
+ "blocking_data_transfer": false,
161
+ "dataloader_prefetch_factor": 4,
162
+ "full_train_stats": false,
163
+ "tokenized_hf": false,
164
+ "tokenized_pad_token": "pad",
165
+ "elf_conditional_hf": false,
166
+ "record_pad_truncate": false,
167
+ "record_add_eos": false,
168
+ "record_add_special_tokens": false,
169
+ "record_pad_token": "pad",
170
+ "record_shuffle_buffer": 10000,
171
+ "wrap": true,
172
+ "wrap_mode": "stream",
173
+ "wrap_record_buffer_size": 200,
174
+ "owt_cached_chunks": true,
175
+ "owt_chunk_cache_dir": "/e2e-data/evad-tech-vla/wanghan58/data/small_benchmarks/langflow_2604_11748/openwebtext_lta_cached_chunks/gpt2_len256_train8_compact_overfit",
176
+ "owt_chunk_cache_rebuild": false,
177
+ "owt_chunk_cache_write_batch": 4096,
178
+ "owt_exact_repeat_per_chunk": 64,
179
+ "online_chunk_shuffle": false,
180
+ "online_chunk_shuffle_buffer": 10000,
181
+ "openwebtext_split": "train_minus_100k",
182
+ "detokenizer": "auto",
183
+ "resolved_detokenizer": null,
184
+ "num_workers": 0,
185
+ "latest_every": 1000,
186
+ "resume_path": ""
187
+ }
188
+ step=100 epoch=100/1000 epoch_step=1/1 micro_steps=100 elapsed=4.7s lr=2.000000e-03 loss=6.6629 loss_recon=6.6629 loss_meanflow=0.0000 mean_model_t=0.2078 mean_corrupt_t=0.2078 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.1701 corrupt_frac=1.0000 acc_corrupt=0.1701 loss_corrupt=6.6629 wrong_frac=0.7922 init_acc_corrupt=0.2054 acc_corrupt_t_0p0_0p2=0.0802 corrupt_frac_t_0p0_0p2=0.5620 acc_corrupt_t_0p2_0p4=0.2399 corrupt_frac_t_0p2_0p4=0.3533 acc_corrupt_t_0p4_0p6=0.4568 corrupt_frac_t_0p4_0p6=0.0755 acc_corrupt_t_0p6_0p8=0.6274 corrupt_frac_t_0p6_0p8=0.0117 out_w_norm=1.2395 out_g_norm=0.9648 acc_corrupt_t_0p8_1p0=0.9141 corrupt_frac_t_0p8_1p0=0.0078 loss_all=6.3374 init_gold_top10=0.2275 init_gold_top100=0.2997
189
+ step=200 epoch=200/1000 epoch_step=1/1 micro_steps=200 elapsed=4.0s lr=2.000000e-03 loss=5.9494 loss_recon=5.9494 loss_meanflow=0.0000 mean_model_t=0.2081 mean_corrupt_t=0.2081 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.1795 corrupt_frac=1.0000 acc_corrupt=0.1795 loss_corrupt=5.9494 wrong_frac=0.7916 init_acc_corrupt=0.2060 acc_corrupt_t_0p0_0p2=0.0927 corrupt_frac_t_0p0_0p2=0.5577 acc_corrupt_t_0p2_0p4=0.2447 corrupt_frac_t_0p2_0p4=0.3599 acc_corrupt_t_0p4_0p6=0.4631 corrupt_frac_t_0p4_0p6=0.0744 acc_corrupt_t_0p6_0p8=0.6568 corrupt_frac_t_0p6_0p8=0.0126 acc_corrupt_t_0p8_1p0=0.8105 corrupt_frac_t_0p8_1p0=0.0078 out_w_norm=4.1195 out_g_norm=1.3169 loss_all=5.5816 init_gold_top10=0.2191 init_gold_top100=0.2940
190
+ step=300 epoch=300/1000 epoch_step=1/1 micro_steps=300 elapsed=4.0s lr=2.000000e-03 loss=5.3088 loss_recon=5.3088 loss_meanflow=0.0000 mean_model_t=0.2105 mean_corrupt_t=0.2105 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.2092 corrupt_frac=1.0000 acc_corrupt=0.2092 loss_corrupt=5.3088 wrong_frac=0.7894 init_acc_corrupt=0.2082 acc_corrupt_t_0p0_0p2=0.1143 corrupt_frac_t_0p0_0p2=0.5542 acc_corrupt_t_0p2_0p4=0.2854 corrupt_frac_t_0p2_0p4=0.3559 acc_corrupt_t_0p4_0p6=0.4737 corrupt_frac_t_0p4_0p6=0.0813 acc_corrupt_t_0p6_0p8=0.6627 corrupt_frac_t_0p6_0p8=0.0125 out_w_norm=6.7925 out_g_norm=0.5991 acc_corrupt_t_0p8_1p0=0.8750 corrupt_frac_t_0p8_1p0=0.0078 loss_all=5.0754 init_gold_top10=0.2283 init_gold_top100=0.3011
191
+ step=400 epoch=400/1000 epoch_step=1/1 micro_steps=400 elapsed=4.0s lr=2.000000e-03 loss=5.0142 loss_recon=5.0142 loss_meanflow=0.0000 mean_model_t=0.2085 mean_corrupt_t=0.2085 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.2180 corrupt_frac=1.0000 acc_corrupt=0.2180 loss_corrupt=5.0142 wrong_frac=0.7915 init_acc_corrupt=0.2061 acc_corrupt_t_0p0_0p2=0.1264 corrupt_frac_t_0p0_0p2=0.5597 acc_corrupt_t_0p2_0p4=0.2952 corrupt_frac_t_0p2_0p4=0.3559 acc_corrupt_t_0p4_0p6=0.4799 corrupt_frac_t_0p4_0p6=0.0752 acc_corrupt_t_0p6_0p8=0.6613 corrupt_frac_t_0p6_0p8=0.0131 out_w_norm=8.4766 out_g_norm=0.3326 acc_corrupt_t_0p8_1p0=0.7930 corrupt_frac_t_0p8_1p0=0.0078 loss_all=4.7706 init_gold_top10=0.2298 init_gold_top100=0.3036
192
+ step=500 epoch=500/1000 epoch_step=1/1 micro_steps=500 elapsed=4.0s lr=2.000000e-03 loss=4.5487 loss_recon=4.5487 loss_meanflow=0.0000 mean_model_t=0.2084 mean_corrupt_t=0.2084 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.2257 corrupt_frac=1.0000 acc_corrupt=0.2257 loss_corrupt=4.5487 wrong_frac=0.7914 init_acc_corrupt=0.2062 acc_corrupt_t_0p0_0p2=0.1340 corrupt_frac_t_0p0_0p2=0.5616 acc_corrupt_t_0p2_0p4=0.3049 corrupt_frac_t_0p2_0p4=0.3550 acc_corrupt_t_0p4_0p6=0.4866 corrupt_frac_t_0p4_0p6=0.0741 acc_corrupt_t_0p6_0p8=0.6614 corrupt_frac_t_0p6_0p8=0.0143 out_w_norm=9.7624 out_g_norm=0.4423 loss_all=4.2801 init_gold_top10=0.2068 init_gold_top100=0.2830
193
+ step=600 epoch=600/1000 epoch_step=1/1 micro_steps=600 elapsed=4.0s lr=2.000000e-03 loss=3.8447 loss_recon=3.8447 loss_meanflow=0.0000 mean_model_t=0.2084 mean_corrupt_t=0.2084 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.2370 corrupt_frac=1.0000 acc_corrupt=0.2370 loss_corrupt=3.8447 wrong_frac=0.7917 init_acc_corrupt=0.2059 acc_corrupt_t_0p0_0p2=0.1444 corrupt_frac_t_0p0_0p2=0.5568 acc_corrupt_t_0p2_0p4=0.3165 corrupt_frac_t_0p2_0p4=0.3606 acc_corrupt_t_0p4_0p6=0.4950 corrupt_frac_t_0p4_0p6=0.0742 acc_corrupt_t_0p6_0p8=0.6750 corrupt_frac_t_0p6_0p8=0.0123 out_w_norm=10.1909 out_g_norm=0.4588 loss_all=3.5868 init_gold_top10=0.1960 init_gold_top100=0.2722
194
+ step=700 epoch=700/1000 epoch_step=1/1 micro_steps=700 elapsed=4.0s lr=2.000000e-03 loss=3.1110 loss_recon=3.1110 loss_meanflow=0.0000 mean_model_t=0.2079 mean_corrupt_t=0.2079 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.2652 corrupt_frac=1.0000 acc_corrupt=0.2652 loss_corrupt=3.1110 wrong_frac=0.7925 init_acc_corrupt=0.2051 acc_corrupt_t_0p0_0p2=0.1631 corrupt_frac_t_0p0_0p2=0.5607 acc_corrupt_t_0p2_0p4=0.3575 corrupt_frac_t_0p2_0p4=0.3568 acc_corrupt_t_0p4_0p6=0.5431 corrupt_frac_t_0p4_0p6=0.0739 acc_corrupt_t_0p6_0p8=0.7116 corrupt_frac_t_0p6_0p8=0.0132 out_w_norm=10.4454 out_g_norm=0.5395 loss_all=2.6425 init_gold_top10=0.2289 init_gold_top100=0.3008
195
+ step=800 epoch=800/1000 epoch_step=1/1 micro_steps=800 elapsed=4.0s lr=2.000000e-03 loss=2.2335 loss_recon=2.2335 loss_meanflow=0.0000 mean_model_t=0.2099 mean_corrupt_t=0.2099 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.3921 corrupt_frac=1.0000 acc_corrupt=0.3921 loss_corrupt=2.2335 wrong_frac=0.7904 init_acc_corrupt=0.2072 acc_corrupt_t_0p0_0p2=0.2394 corrupt_frac_t_0p0_0p2=0.5505 acc_corrupt_t_0p2_0p4=0.5444 corrupt_frac_t_0p2_0p4=0.3649 acc_corrupt_t_0p4_0p6=0.7185 corrupt_frac_t_0p4_0p6=0.0765 out_w_norm=10.7231 out_g_norm=0.6774 acc_corrupt_t_0p6_0p8=0.8264 corrupt_frac_t_0p6_0p8=0.0123 acc_corrupt_t_0p8_1p0=0.9297 corrupt_frac_t_0p8_1p0=0.0078 loss_all=1.8154 init_gold_top10=0.2095 init_gold_top100=0.2845
196
+ step=900 epoch=900/1000 epoch_step=1/1 micro_steps=900 elapsed=4.0s lr=2.000000e-03 loss=1.4273 loss_recon=1.4273 loss_meanflow=0.0000 mean_model_t=0.2102 mean_corrupt_t=0.2102 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.6031 corrupt_frac=1.0000 acc_corrupt=0.6031 loss_corrupt=1.4273 wrong_frac=0.7901 init_acc_corrupt=0.2076 acc_corrupt_t_0p0_0p2=0.4064 corrupt_frac_t_0p0_0p2=0.5513 acc_corrupt_t_0p2_0p4=0.8248 corrupt_frac_t_0p2_0p4=0.3645 acc_corrupt_t_0p4_0p6=0.9279 corrupt_frac_t_0p4_0p6=0.0755 out_w_norm=10.9722 out_g_norm=0.6506 acc_corrupt_t_0p6_0p8=0.9541 corrupt_frac_t_0p6_0p8=0.0127 acc_corrupt_t_0p8_1p0=0.9629 corrupt_frac_t_0p8_1p0=0.0078 loss_all=1.1917 init_gold_top10=0.2077 init_gold_top100=0.2817
197
+ step=1000 epoch=1000/1000 epoch_step=1/1 micro_steps=1000 elapsed=4.0s lr=2.000000e-03 loss=0.9388 loss_recon=0.9388 loss_meanflow=0.0000 mean_model_t=0.2101 mean_corrupt_t=0.2101 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.7497 corrupt_frac=1.0000 acc_corrupt=0.7497 loss_corrupt=0.9388 wrong_frac=0.7897 init_acc_corrupt=0.2080 acc_corrupt_t_0p0_0p2=0.5762 corrupt_frac_t_0p0_0p2=0.5591 acc_corrupt_t_0p2_0p4=0.9639 corrupt_frac_t_0p2_0p4=0.3526 acc_corrupt_t_0p4_0p6=0.9922 corrupt_frac_t_0p4_0p6=0.0791 acc_corrupt_t_0p8_1p0=0.9941 corrupt_frac_t_0p8_1p0=0.0078 out_w_norm=11.2383 out_g_norm=0.6494 acc_corrupt_t_0p6_0p8=0.9947 corrupt_frac_t_0p6_0p8=0.0133 loss_all=0.7538 init_gold_top10=0.2115 init_gold_top100=0.2852
LTA_openwebtext_dualt/logs/softendpoint_mn_pilot_4gpu/train8_combo_len256_logistic_unigram_shared_highC_seqrand_20260517_170456.log ADDED
@@ -0,0 +1,395 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ NCCL version 2.25.1+cuda12.8
2
+ {
3
+ "device": "cuda:0",
4
+ "rank": 0,
5
+ "world_size": 4,
6
+ "samples": "owt_cached_chunks:8",
7
+ "vocab_size": 969,
8
+ "tokenizer_vocab_size": 50257,
9
+ "save_dir": "runs/train8_combo_len256_logistic_unigram_shared_highC_seqrand_20260517_170456",
10
+ "batch_size": 128,
11
+ "grad_accum": 1,
12
+ "effective_batch_size": 512,
13
+ "global_batch_size": 512,
14
+ "lr_schedule": "constant_warmup",
15
+ "optimizer": "muon",
16
+ "epochs": 0.0,
17
+ "steps_per_epoch": 1,
18
+ "total_steps": 1000,
19
+ "warmup_steps": 10,
20
+ "warmup_epochs": -1.0,
21
+ "min_lr": 0.0,
22
+ "weight_decay": 0.1,
23
+ "output_weight_decay": -1.0,
24
+ "adamw_param_groups": "nanogpt",
25
+ "adam_beta1": 0.9,
26
+ "adam_beta2": 0.95,
27
+ "adam_eps": 1e-08,
28
+ "muon_impl": "legacy",
29
+ "muon_momentum": 0.95,
30
+ "muon_ns_steps": 5,
31
+ "muon_update_scale": 1.0,
32
+ "muon_nesterov": false,
33
+ "muon_width_scale": false,
34
+ "muon_grouping": "legacy_dim_ge_2",
35
+ "muon_param_count": 1965440,
36
+ "muon_adam_param_count": 8192,
37
+ "muon_param_names": [
38
+ "vocab_embed.embedding",
39
+ "sigma_map.net.0.weight",
40
+ "sigma_map.net.2.weight",
41
+ "blocks.0.attn_qkv.weight",
42
+ "blocks.0.attn_out.weight",
43
+ "blocks.0.mlp.0.weight",
44
+ "blocks.0.mlp.2.weight",
45
+ "blocks.0.adaLN_modulation.weight",
46
+ "blocks.1.attn_qkv.weight",
47
+ "blocks.1.attn_out.weight",
48
+ "blocks.1.mlp.0.weight",
49
+ "blocks.1.mlp.2.weight",
50
+ "blocks.1.adaLN_modulation.weight",
51
+ "blocks.2.attn_qkv.weight",
52
+ "blocks.2.attn_out.weight",
53
+ "blocks.2.mlp.0.weight",
54
+ "blocks.2.mlp.2.weight",
55
+ "blocks.2.adaLN_modulation.weight",
56
+ "output_layer.linear.weight",
57
+ "output_layer.adaLN_modulation.weight"
58
+ ],
59
+ "muon_adam_param_names": [
60
+ "sigma_map.net.0.bias",
61
+ "sigma_map.net.2.bias",
62
+ "blocks.0.norm1.weight",
63
+ "blocks.0.norm2.weight",
64
+ "blocks.0.mlp.0.bias",
65
+ "blocks.0.mlp.2.bias",
66
+ "blocks.0.adaLN_modulation.bias",
67
+ "blocks.1.norm1.weight",
68
+ "blocks.1.norm2.weight",
69
+ "blocks.1.mlp.0.bias",
70
+ "blocks.1.mlp.2.bias",
71
+ "blocks.1.adaLN_modulation.bias",
72
+ "blocks.2.norm1.weight",
73
+ "blocks.2.norm2.weight",
74
+ "blocks.2.mlp.0.bias",
75
+ "blocks.2.mlp.2.bias",
76
+ "blocks.2.adaLN_modulation.bias",
77
+ "output_layer.norm_final.weight",
78
+ "output_layer.adaLN_modulation.bias"
79
+ ],
80
+ "muon_effective_nesterov": false,
81
+ "muon_effective_width_scale": false,
82
+ "muon_effective_weight_decay": 0.1,
83
+ "muon_adam_fallback_nesterov": false,
84
+ "muon_adam_fallback_weight_decay": 0.1,
85
+ "ema_decay": 0.9999,
86
+ "ema_start_step": 0,
87
+ "model_type": "ddit",
88
+ "ddit_mlp_type": "gelu",
89
+ "elf_num_time_tokens": 4,
90
+ "elf_num_model_mode_tokens": 0,
91
+ "qk_norm": true,
92
+ "output_bias": false,
93
+ "output_init_std": -1.0,
94
+ "norm_type": "rmsnorm",
95
+ "target_loss": "hard_ce",
96
+ "linear_soft_target_power": 1.0,
97
+ "linear_soft_target_min_conf": 0.0,
98
+ "linear_soft_target_max_conf": 1.0,
99
+ "t_sampling_mode": "logit_normal",
100
+ "t_sampling_power": 1.0,
101
+ "t_sampling_eps": 0.0001,
102
+ "t_sampling_logit_mean": -1.5,
103
+ "t_sampling_logit_std": 0.8,
104
+ "dual_t": true,
105
+ "corrupt_t_mode": "same",
106
+ "corrupt_min_t": 0.0,
107
+ "corrupt_max_t": 1.0,
108
+ "prefix_block_prob": 0.0,
109
+ "prefix_block_len": 128,
110
+ "mask_ratio_floor_schedule": "none",
111
+ "dirichlet_endpoint_mode": "categorical_dual_t",
112
+ "dirichlet_semantic_t_mode": "same",
113
+ "dirichlet_semantic_t_value": 0.0,
114
+ "dirichlet_semantic_t_curve": "linear",
115
+ "dirichlet_semantic_t_power": 1.0,
116
+ "endpoint_sequence_random_prob_alpha": 0.5,
117
+ "categorical_wrong_from_full_vocab": true,
118
+ "categorical_wrong_from_batch_valid_tokens": false,
119
+ "categorical_wrong_basin_token_ids": "",
120
+ "categorical_wrong_basin_prob": 0.0,
121
+ "categorical_wrong_unigram_prob": 1.0,
122
+ "categorical_wrong_uniform_prob": 0.0,
123
+ "categorical_wrong_corpus_unigram_path": "",
124
+ "categorical_wrong_corpus_unigram_alpha": 1.0,
125
+ "categorical_wrong_basin_shared_prob": 0.0,
126
+ "categorical_wrong_unigram_shared_prob": 0.5,
127
+ "mask_mixture_original_prob": 0.0,
128
+ "mask_mixture_lowk_prob": 0.0,
129
+ "mask_mixture_lowcorrupt_prob": 0.0,
130
+ "mask_mixture_block_prob": 0.0,
131
+ "mask_mixture_all_prob": 1.0,
132
+ "mask_mixture_lowk_clean_tokens": "0",
133
+ "mask_mixture_lowcorrupt_tokens": "1,2,4,8,16,32,64",
134
+ "mask_mixture_block_tokens": "64,128",
135
+ "simplex_bridge_sampler": "logistic_normal_linear_mean",
136
+ "logistic_normal_sigma_min": 0.03,
137
+ "logistic_normal_sigma_max": 0.4,
138
+ "logistic_normal_tau_min": 1.0,
139
+ "logistic_normal_tau_max": 1.0,
140
+ "torch_compile": false,
141
+ "compile_mode": "max-autotune",
142
+ "state_format": "prob",
143
+ "meanflow_weight": 0.0,
144
+ "rollout_train_prob": 0.0,
145
+ "rollout_train_steps": 1,
146
+ "rollout_train_infer_steps": 64,
147
+ "rollout_train_temp": 1.45,
148
+ "rollout_train_max_gamma": 1.0,
149
+ "rollout_train_corrupt_only": true,
150
+ "rollout_train_samplewise": false,
151
+ "rollout_train_compute_always": false,
152
+ "bridge_noise_init": "logistic_normal",
153
+ "noise_sigma": -1.0,
154
+ "allow_tf32": true,
155
+ "activation_checkpointing": false,
156
+ "activation_checkpoint_interval": 1,
157
+ "activation_checkpoint_scope": "block",
158
+ "ddp_static_graph": false,
159
+ "ddp_gradient_as_bucket_view": true,
160
+ "blocking_data_transfer": false,
161
+ "dataloader_prefetch_factor": 4,
162
+ "full_train_stats": false,
163
+ "tokenized_hf": false,
164
+ "tokenized_pad_token": "pad",
165
+ "elf_conditional_hf": false,
166
+ "record_pad_truncate": false,
167
+ "record_add_eos": false,
168
+ "record_add_special_tokens": false,
169
+ "record_pad_token": "pad",
170
+ "record_shuffle_buffer": 10000,
171
+ "wrap": true,
172
+ "wrap_mode": "stream",
173
+ "wrap_record_buffer_size": 200,
174
+ "owt_cached_chunks": true,
175
+ "owt_chunk_cache_dir": "/e2e-data/evad-tech-vla/wanghan58/data/small_benchmarks/langflow_2604_11748/openwebtext_lta_cached_chunks/gpt2_len256_train8_compact_overfit",
176
+ "owt_chunk_cache_rebuild": false,
177
+ "owt_chunk_cache_write_batch": 4096,
178
+ "owt_exact_repeat_per_chunk": 64,
179
+ "online_chunk_shuffle": false,
180
+ "online_chunk_shuffle_buffer": 10000,
181
+ "openwebtext_split": "train_minus_100k",
182
+ "detokenizer": "auto",
183
+ "resolved_detokenizer": null,
184
+ "num_workers": 0,
185
+ "latest_every": 1000,
186
+ "resume_path": ""
187
+ }
188
+ step=100 epoch=100/1000 epoch_step=1/1 micro_steps=100 elapsed=4.5s lr=2.000000e-03 loss=6.7097 loss_recon=6.7097 loss_meanflow=0.0000 mean_model_t=0.2081 mean_corrupt_t=0.2081 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.1179 corrupt_frac=1.0000 acc_corrupt=0.1179 loss_corrupt=6.7097 wrong_frac=0.8656 init_acc_corrupt=0.1344 acc_corrupt_t_0p0_0p2=0.0569 corrupt_frac_t_0p0_0p2=0.5599 acc_corrupt_t_0p2_0p4=0.1555 corrupt_frac_t_0p2_0p4=0.3564 acc_corrupt_t_0p4_0p6=0.3428 corrupt_frac_t_0p4_0p6=0.0738 acc_corrupt_t_0p6_0p8=0.5361 corrupt_frac_t_0p6_0p8=0.0138 out_w_norm=1.2263 out_g_norm=1.0547 acc_corrupt_t_0p8_1p0=0.8340 corrupt_frac_t_0p8_1p0=0.0104 loss_all=6.4790 init_gold_top10=0.1401 init_gold_top100=0.2195
189
+ step=200 epoch=200/1000 epoch_step=1/1 micro_steps=200 elapsed=3.8s lr=2.000000e-03 loss=6.1757 loss_recon=6.1757 loss_meanflow=0.0000 mean_model_t=0.2069 mean_corrupt_t=0.2069 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.1118 corrupt_frac=1.0000 acc_corrupt=0.1118 loss_corrupt=6.1757 wrong_frac=0.8695 init_acc_corrupt=0.1305 acc_corrupt_t_0p0_0p2=0.0654 corrupt_frac_t_0p0_0p2=0.5616 acc_corrupt_t_0p2_0p4=0.1396 corrupt_frac_t_0p2_0p4=0.3552 acc_corrupt_t_0p4_0p6=0.2837 corrupt_frac_t_0p4_0p6=0.0749 out_w_norm=3.8017 out_g_norm=1.5027 acc_corrupt_t_0p6_0p8=0.5003 corrupt_frac_t_0p6_0p8=0.0126 acc_corrupt_t_0p8_1p0=0.7578 corrupt_frac_t_0p8_1p0=0.0078 loss_all=5.9357 init_gold_top10=0.1209 init_gold_top100=0.2034
190
+ step=300 epoch=300/1000 epoch_step=1/1 micro_steps=300 elapsed=3.8s lr=2.000000e-03 loss=5.6465 loss_recon=5.6465 loss_meanflow=0.0000 mean_model_t=0.2097 mean_corrupt_t=0.2097 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.1395 corrupt_frac=1.0000 acc_corrupt=0.1395 loss_corrupt=5.6465 wrong_frac=0.8662 init_acc_corrupt=0.1338 acc_corrupt_t_0p0_0p2=0.0788 corrupt_frac_t_0p0_0p2=0.5585 acc_corrupt_t_0p2_0p4=0.1795 corrupt_frac_t_0p2_0p4=0.3541 acc_corrupt_t_0p4_0p6=0.3465 corrupt_frac_t_0p4_0p6=0.0783 acc_corrupt_t_0p6_0p8=0.5308 corrupt_frac_t_0p6_0p8=0.0124 out_w_norm=6.0405 out_g_norm=0.6592 acc_corrupt_t_0p8_1p0=0.4082 corrupt_frac_t_0p8_1p0=0.0078 loss_all=5.4633 init_gold_top10=0.1470 init_gold_top100=0.2274
191
+ step=400 epoch=400/1000 epoch_step=1/1 micro_steps=400 elapsed=3.9s lr=2.000000e-03 loss=5.4440 loss_recon=5.4440 loss_meanflow=0.0000 mean_model_t=0.2082 mean_corrupt_t=0.2082 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.1499 corrupt_frac=1.0000 acc_corrupt=0.1499 loss_corrupt=5.4440 wrong_frac=0.8670 init_acc_corrupt=0.1330 acc_corrupt_t_0p0_0p2=0.0880 corrupt_frac_t_0p0_0p2=0.5540 acc_corrupt_t_0p2_0p4=0.1937 corrupt_frac_t_0p2_0p4=0.3638 acc_corrupt_t_0p4_0p6=0.3606 corrupt_frac_t_0p4_0p6=0.0743 acc_corrupt_t_0p6_0p8=0.4963 corrupt_frac_t_0p6_0p8=0.0125 out_w_norm=7.7267 out_g_norm=0.3063 acc_corrupt_t_0p8_1p0=0.7578 corrupt_frac_t_0p8_1p0=0.0078 loss_all=5.2757 init_gold_top10=0.1559 init_gold_top100=0.2377
192
+ step=500 epoch=500/1000 epoch_step=1/1 micro_steps=500 elapsed=3.8s lr=2.000000e-03 loss=5.1483 loss_recon=5.1483 loss_meanflow=0.0000 mean_model_t=0.2103 mean_corrupt_t=0.2103 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.1601 corrupt_frac=1.0000 acc_corrupt=0.1601 loss_corrupt=5.1483 wrong_frac=0.8648 init_acc_corrupt=0.1352 acc_corrupt_t_0p0_0p2=0.0953 corrupt_frac_t_0p0_0p2=0.5508 acc_corrupt_t_0p2_0p4=0.2020 corrupt_frac_t_0p2_0p4=0.3598 acc_corrupt_t_0p4_0p6=0.3747 corrupt_frac_t_0p4_0p6=0.0818 out_w_norm=8.9678 out_g_norm=0.4350 acc_corrupt_t_0p6_0p8=0.5439 corrupt_frac_t_0p6_0p8=0.0136 acc_corrupt_t_0p8_1p0=0.8184 corrupt_frac_t_0p8_1p0=0.0078 loss_all=5.0288 init_gold_top10=0.1219 init_gold_top100=0.2024
193
+ step=600 epoch=600/1000 epoch_step=1/1 micro_steps=600 elapsed=3.8s lr=2.000000e-03 loss=4.6281 loss_recon=4.6281 loss_meanflow=0.0000 mean_model_t=0.2080 mean_corrupt_t=0.2080 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.1673 corrupt_frac=1.0000 acc_corrupt=0.1673 loss_corrupt=4.6281 wrong_frac=0.8662 init_acc_corrupt=0.1338 acc_corrupt_t_0p0_0p2=0.1028 corrupt_frac_t_0p0_0p2=0.5616 acc_corrupt_t_0p2_0p4=0.2130 corrupt_frac_t_0p2_0p4=0.3509 acc_corrupt_t_0p4_0p6=0.3788 corrupt_frac_t_0p4_0p6=0.0791 acc_corrupt_t_0p6_0p8=0.5652 corrupt_frac_t_0p6_0p8=0.0131 out_w_norm=9.3953 out_g_norm=0.4892 acc_corrupt_t_0p8_1p0=0.8623 corrupt_frac_t_0p8_1p0=0.0078 loss_all=4.4779 init_gold_top10=0.1371 init_gold_top100=0.2186
194
+ step=700 epoch=700/1000 epoch_step=1/1 micro_steps=700 elapsed=3.8s lr=2.000000e-03 loss=4.0407 loss_recon=4.0407 loss_meanflow=0.0000 mean_model_t=0.2106 mean_corrupt_t=0.2106 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.1789 corrupt_frac=1.0000 acc_corrupt=0.1789 loss_corrupt=4.0407 wrong_frac=0.8641 init_acc_corrupt=0.1359 acc_corrupt_t_0p0_0p2=0.1138 corrupt_frac_t_0p0_0p2=0.5534 acc_corrupt_t_0p2_0p4=0.2271 corrupt_frac_t_0p2_0p4=0.3573 acc_corrupt_t_0p4_0p6=0.3743 corrupt_frac_t_0p4_0p6=0.0812 out_w_norm=9.6182 out_g_norm=0.5365 acc_corrupt_t_0p6_0p8=0.5301 corrupt_frac_t_0p6_0p8=0.0129 acc_corrupt_t_0p8_1p0=0.8301 corrupt_frac_t_0p8_1p0=0.0078 loss_all=3.7657 init_gold_top10=0.1297 init_gold_top100=0.2126
195
+ step=800 epoch=800/1000 epoch_step=1/1 micro_steps=800 elapsed=3.8s lr=2.000000e-03 loss=3.5950 loss_recon=3.5950 loss_meanflow=0.0000 mean_model_t=0.2079 mean_corrupt_t=0.2079 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.1916 corrupt_frac=1.0000 acc_corrupt=0.1916 loss_corrupt=3.5950 wrong_frac=0.8684 init_acc_corrupt=0.1316 acc_corrupt_t_0p0_0p2=0.1264 corrupt_frac_t_0p0_0p2=0.5564 acc_corrupt_t_0p2_0p4=0.2418 corrupt_frac_t_0p2_0p4=0.3614 acc_corrupt_t_0p4_0p6=0.3942 corrupt_frac_t_0p4_0p6=0.0745 acc_corrupt_t_0p6_0p8=0.5762 corrupt_frac_t_0p6_0p8=0.0126 out_w_norm=9.8701 out_g_norm=0.7488 acc_corrupt_t_0p8_1p0=0.8203 corrupt_frac_t_0p8_1p0=0.0078 loss_all=3.4492 init_gold_top10=0.1366 init_gold_top100=0.2184
196
+ step=900 epoch=900/1000 epoch_step=1/1 micro_steps=900 elapsed=3.8s lr=2.000000e-03 loss=3.0683 loss_recon=3.0683 loss_meanflow=0.0000 mean_model_t=0.2077 mean_corrupt_t=0.2077 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.2626 corrupt_frac=1.0000 acc_corrupt=0.2626 loss_corrupt=3.0683 wrong_frac=0.8669 init_acc_corrupt=0.1331 acc_corrupt_t_0p0_0p2=0.1752 corrupt_frac_t_0p0_0p2=0.5614 acc_corrupt_t_0p2_0p4=0.3404 corrupt_frac_t_0p2_0p4=0.3548 acc_corrupt_t_0p4_0p6=0.5038 corrupt_frac_t_0p4_0p6=0.0757 out_w_norm=10.0897 out_g_norm=1.0269 acc_corrupt_t_0p6_0p8=0.6538 corrupt_frac_t_0p6_0p8=0.0121 acc_corrupt_t_0p8_1p0=0.9082 corrupt_frac_t_0p8_1p0=0.0078 loss_all=2.4332 init_gold_top10=0.1616 init_gold_top100=0.2392
197
+ step=1000 epoch=1000/1000 epoch_step=1/1 micro_steps=1000 elapsed=3.8s lr=2.000000e-03 loss=2.4691 loss_recon=2.4691 loss_meanflow=0.0000 mean_model_t=0.2092 mean_corrupt_t=0.2092 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.4023 corrupt_frac=1.0000 acc_corrupt=0.4023 loss_corrupt=2.4691 wrong_frac=0.8652 init_acc_corrupt=0.1348 acc_corrupt_t_0p0_0p2=0.2939 corrupt_frac_t_0p0_0p2=0.5561 acc_corrupt_t_0p2_0p4=0.5031 corrupt_frac_t_0p2_0p4=0.3610 acc_corrupt_t_0p4_0p6=0.6783 corrupt_frac_t_0p4_0p6=0.0730 out_w_norm=10.3143 out_g_norm=1.0213 acc_corrupt_t_0p6_0p8=0.7848 corrupt_frac_t_0p6_0p8=0.0140 acc_corrupt_t_0p8_1p0=0.4844 corrupt_frac_t_0p8_1p0=0.0078 loss_all=2.1232 init_gold_top10=0.1450 init_gold_top100=0.2272
198
+ NCCL version 2.25.1+cuda12.8
199
+ resumed_from=runs/train8_combo_len256_logistic_unigram_shared_highC_seqrand_20260517_170456/latest.pt start_step=1001
200
+ {
201
+ "device": "cuda:0",
202
+ "rank": 0,
203
+ "world_size": 4,
204
+ "samples": "owt_cached_chunks:8",
205
+ "vocab_size": 969,
206
+ "tokenizer_vocab_size": 50257,
207
+ "save_dir": "runs/train8_combo_len256_logistic_unigram_shared_highC_seqrand_20260517_170456",
208
+ "batch_size": 128,
209
+ "grad_accum": 1,
210
+ "effective_batch_size": 512,
211
+ "global_batch_size": 512,
212
+ "lr_schedule": "constant_warmup",
213
+ "optimizer": "muon",
214
+ "epochs": 0.0,
215
+ "steps_per_epoch": 1,
216
+ "total_steps": 2000,
217
+ "warmup_steps": 10,
218
+ "warmup_epochs": -1.0,
219
+ "min_lr": 0.0,
220
+ "weight_decay": 0.1,
221
+ "output_weight_decay": -1.0,
222
+ "adamw_param_groups": "nanogpt",
223
+ "adam_beta1": 0.9,
224
+ "adam_beta2": 0.95,
225
+ "adam_eps": 1e-08,
226
+ "muon_impl": "legacy",
227
+ "muon_momentum": 0.95,
228
+ "muon_ns_steps": 5,
229
+ "muon_update_scale": 1.0,
230
+ "muon_nesterov": false,
231
+ "muon_width_scale": false,
232
+ "muon_grouping": "legacy_dim_ge_2",
233
+ "muon_param_count": 1965440,
234
+ "muon_adam_param_count": 8192,
235
+ "muon_param_names": [
236
+ "vocab_embed.embedding",
237
+ "sigma_map.net.0.weight",
238
+ "sigma_map.net.2.weight",
239
+ "blocks.0.attn_qkv.weight",
240
+ "blocks.0.attn_out.weight",
241
+ "blocks.0.mlp.0.weight",
242
+ "blocks.0.mlp.2.weight",
243
+ "blocks.0.adaLN_modulation.weight",
244
+ "blocks.1.attn_qkv.weight",
245
+ "blocks.1.attn_out.weight",
246
+ "blocks.1.mlp.0.weight",
247
+ "blocks.1.mlp.2.weight",
248
+ "blocks.1.adaLN_modulation.weight",
249
+ "blocks.2.attn_qkv.weight",
250
+ "blocks.2.attn_out.weight",
251
+ "blocks.2.mlp.0.weight",
252
+ "blocks.2.mlp.2.weight",
253
+ "blocks.2.adaLN_modulation.weight",
254
+ "output_layer.linear.weight",
255
+ "output_layer.adaLN_modulation.weight"
256
+ ],
257
+ "muon_adam_param_names": [
258
+ "sigma_map.net.0.bias",
259
+ "sigma_map.net.2.bias",
260
+ "blocks.0.norm1.weight",
261
+ "blocks.0.norm2.weight",
262
+ "blocks.0.mlp.0.bias",
263
+ "blocks.0.mlp.2.bias",
264
+ "blocks.0.adaLN_modulation.bias",
265
+ "blocks.1.norm1.weight",
266
+ "blocks.1.norm2.weight",
267
+ "blocks.1.mlp.0.bias",
268
+ "blocks.1.mlp.2.bias",
269
+ "blocks.1.adaLN_modulation.bias",
270
+ "blocks.2.norm1.weight",
271
+ "blocks.2.norm2.weight",
272
+ "blocks.2.mlp.0.bias",
273
+ "blocks.2.mlp.2.bias",
274
+ "blocks.2.adaLN_modulation.bias",
275
+ "output_layer.norm_final.weight",
276
+ "output_layer.adaLN_modulation.bias"
277
+ ],
278
+ "muon_effective_nesterov": false,
279
+ "muon_effective_width_scale": false,
280
+ "muon_effective_weight_decay": 0.1,
281
+ "muon_adam_fallback_nesterov": false,
282
+ "muon_adam_fallback_weight_decay": 0.1,
283
+ "ema_decay": 0.9999,
284
+ "ema_start_step": 0,
285
+ "model_type": "ddit",
286
+ "ddit_mlp_type": "gelu",
287
+ "elf_num_time_tokens": 4,
288
+ "elf_num_model_mode_tokens": 0,
289
+ "qk_norm": true,
290
+ "output_bias": false,
291
+ "output_init_std": -1.0,
292
+ "norm_type": "rmsnorm",
293
+ "target_loss": "hard_ce",
294
+ "linear_soft_target_power": 1.0,
295
+ "linear_soft_target_min_conf": 0.0,
296
+ "linear_soft_target_max_conf": 1.0,
297
+ "t_sampling_mode": "logit_normal",
298
+ "t_sampling_power": 1.0,
299
+ "t_sampling_eps": 0.0001,
300
+ "t_sampling_logit_mean": -1.5,
301
+ "t_sampling_logit_std": 0.8,
302
+ "dual_t": true,
303
+ "corrupt_t_mode": "same",
304
+ "corrupt_min_t": 0.0,
305
+ "corrupt_max_t": 1.0,
306
+ "prefix_block_prob": 0.0,
307
+ "prefix_block_len": 128,
308
+ "mask_ratio_floor_schedule": "none",
309
+ "dirichlet_endpoint_mode": "categorical_dual_t",
310
+ "dirichlet_semantic_t_mode": "same",
311
+ "dirichlet_semantic_t_value": 0.0,
312
+ "dirichlet_semantic_t_curve": "linear",
313
+ "dirichlet_semantic_t_power": 1.0,
314
+ "endpoint_sequence_random_prob_alpha": 0.5,
315
+ "categorical_wrong_from_full_vocab": true,
316
+ "categorical_wrong_from_batch_valid_tokens": false,
317
+ "categorical_wrong_basin_token_ids": "",
318
+ "categorical_wrong_basin_prob": 0.0,
319
+ "categorical_wrong_unigram_prob": 1.0,
320
+ "categorical_wrong_uniform_prob": 0.0,
321
+ "categorical_wrong_corpus_unigram_path": "",
322
+ "categorical_wrong_corpus_unigram_alpha": 1.0,
323
+ "categorical_wrong_basin_shared_prob": 0.0,
324
+ "categorical_wrong_unigram_shared_prob": 0.5,
325
+ "mask_mixture_original_prob": 0.0,
326
+ "mask_mixture_lowk_prob": 0.0,
327
+ "mask_mixture_lowcorrupt_prob": 0.0,
328
+ "mask_mixture_block_prob": 0.0,
329
+ "mask_mixture_all_prob": 1.0,
330
+ "mask_mixture_lowk_clean_tokens": "0",
331
+ "mask_mixture_lowcorrupt_tokens": "1,2,4,8,16,32,64",
332
+ "mask_mixture_block_tokens": "64,128",
333
+ "simplex_bridge_sampler": "logistic_normal_linear_mean",
334
+ "logistic_normal_sigma_min": 0.03,
335
+ "logistic_normal_sigma_max": 0.4,
336
+ "logistic_normal_tau_min": 1.0,
337
+ "logistic_normal_tau_max": 1.0,
338
+ "torch_compile": false,
339
+ "compile_mode": "max-autotune",
340
+ "state_format": "prob",
341
+ "meanflow_weight": 0.0,
342
+ "rollout_train_prob": 0.0,
343
+ "rollout_train_steps": 1,
344
+ "rollout_train_infer_steps": 64,
345
+ "rollout_train_temp": 1.45,
346
+ "rollout_train_max_gamma": 1.0,
347
+ "rollout_train_corrupt_only": true,
348
+ "rollout_train_samplewise": false,
349
+ "rollout_train_compute_always": false,
350
+ "bridge_noise_init": "logistic_normal",
351
+ "noise_sigma": -1.0,
352
+ "allow_tf32": true,
353
+ "activation_checkpointing": false,
354
+ "activation_checkpoint_interval": 1,
355
+ "activation_checkpoint_scope": "block",
356
+ "ddp_static_graph": false,
357
+ "ddp_gradient_as_bucket_view": true,
358
+ "blocking_data_transfer": false,
359
+ "dataloader_prefetch_factor": 4,
360
+ "full_train_stats": false,
361
+ "tokenized_hf": false,
362
+ "tokenized_pad_token": "pad",
363
+ "elf_conditional_hf": false,
364
+ "record_pad_truncate": false,
365
+ "record_add_eos": false,
366
+ "record_add_special_tokens": false,
367
+ "record_pad_token": "pad",
368
+ "record_shuffle_buffer": 10000,
369
+ "wrap": true,
370
+ "wrap_mode": "stream",
371
+ "wrap_record_buffer_size": 200,
372
+ "owt_cached_chunks": true,
373
+ "owt_chunk_cache_dir": "/e2e-data/evad-tech-vla/wanghan58/data/small_benchmarks/langflow_2604_11748/openwebtext_lta_cached_chunks/gpt2_len256_train8_compact_overfit",
374
+ "owt_chunk_cache_rebuild": false,
375
+ "owt_chunk_cache_write_batch": 4096,
376
+ "owt_exact_repeat_per_chunk": 64,
377
+ "online_chunk_shuffle": false,
378
+ "online_chunk_shuffle_buffer": 10000,
379
+ "openwebtext_split": "train_minus_100k",
380
+ "detokenizer": "auto",
381
+ "resolved_detokenizer": null,
382
+ "num_workers": 0,
383
+ "latest_every": 1000,
384
+ "resume_path": "runs/train8_combo_len256_logistic_unigram_shared_highC_seqrand_20260517_170456/latest.pt"
385
+ }
386
+ step=1100 epoch=1100/2000 epoch_step=1/1 micro_steps=1100 elapsed=4.7s lr=2.000000e-03 loss=2.0475 loss_recon=2.0475 loss_meanflow=0.0000 mean_model_t=0.2081 mean_corrupt_t=0.2081 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.5133 corrupt_frac=1.0000 acc_corrupt=0.5133 loss_corrupt=2.0475 wrong_frac=0.8656 init_acc_corrupt=0.1344 acc_corrupt_t_0p0_0p2=0.4118 corrupt_frac_t_0p0_0p2=0.5599 acc_corrupt_t_0p2_0p4=0.6173 corrupt_frac_t_0p2_0p4=0.3564 acc_corrupt_t_0p4_0p6=0.7417 corrupt_frac_t_0p4_0p6=0.0738 acc_corrupt_t_0p6_0p8=0.8054 corrupt_frac_t_0p6_0p8=0.0138 out_w_norm=10.5718 out_g_norm=1.1653 acc_corrupt_t_0p8_1p0=0.9883 corrupt_frac_t_0p8_1p0=0.0104 loss_all=1.8965 init_gold_top10=0.1401 init_gold_top100=0.2195
387
+ step=1200 epoch=1200/2000 epoch_step=1/1 micro_steps=1200 elapsed=4.0s lr=2.000000e-03 loss=1.8251 loss_recon=1.8251 loss_meanflow=0.0000 mean_model_t=0.2069 mean_corrupt_t=0.2069 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.5569 corrupt_frac=1.0000 acc_corrupt=0.5569 loss_corrupt=1.8251 wrong_frac=0.8695 init_acc_corrupt=0.1305 acc_corrupt_t_0p0_0p2=0.4760 corrupt_frac_t_0p0_0p2=0.5616 acc_corrupt_t_0p2_0p4=0.6412 corrupt_frac_t_0p2_0p4=0.3552 acc_corrupt_t_0p4_0p6=0.7336 corrupt_frac_t_0p4_0p6=0.0749 out_w_norm=10.7986 out_g_norm=1.1972 acc_corrupt_t_0p6_0p8=0.8191 corrupt_frac_t_0p6_0p8=0.0126 acc_corrupt_t_0p8_1p0=0.9961 corrupt_frac_t_0p8_1p0=0.0078 loss_all=1.8619 init_gold_top10=0.1209 init_gold_top100=0.2034
388
+ step=1300 epoch=1300/2000 epoch_step=1/1 micro_steps=1300 elapsed=4.0s lr=2.000000e-03 loss=1.6495 loss_recon=1.6495 loss_meanflow=0.0000 mean_model_t=0.2097 mean_corrupt_t=0.2097 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.5851 corrupt_frac=1.0000 acc_corrupt=0.5851 loss_corrupt=1.6495 wrong_frac=0.8662 init_acc_corrupt=0.1338 acc_corrupt_t_0p0_0p2=0.5137 corrupt_frac_t_0p0_0p2=0.5585 acc_corrupt_t_0p2_0p4=0.6553 corrupt_frac_t_0p2_0p4=0.3541 acc_corrupt_t_0p4_0p6=0.7488 corrupt_frac_t_0p4_0p6=0.0783 acc_corrupt_t_0p6_0p8=0.8325 corrupt_frac_t_0p6_0p8=0.0124 out_w_norm=10.9370 out_g_norm=1.2443 acc_corrupt_t_0p8_1p0=0.5098 corrupt_frac_t_0p8_1p0=0.0078 loss_all=1.5011 init_gold_top10=0.1470 init_gold_top100=0.2274
389
+ step=1400 epoch=1400/2000 epoch_step=1/1 micro_steps=1400 elapsed=4.0s lr=2.000000e-03 loss=1.4723 loss_recon=1.4723 loss_meanflow=0.0000 mean_model_t=0.2082 mean_corrupt_t=0.2082 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.6098 corrupt_frac=1.0000 acc_corrupt=0.6098 loss_corrupt=1.4723 wrong_frac=0.8670 init_acc_corrupt=0.1330 acc_corrupt_t_0p0_0p2=0.5484 corrupt_frac_t_0p0_0p2=0.5540 acc_corrupt_t_0p2_0p4=0.6697 corrupt_frac_t_0p2_0p4=0.3638 acc_corrupt_t_0p4_0p6=0.7569 corrupt_frac_t_0p4_0p6=0.0743 acc_corrupt_t_0p6_0p8=0.7629 corrupt_frac_t_0p6_0p8=0.0125 out_w_norm=11.0332 out_g_norm=1.2546 acc_corrupt_t_0p8_1p0=0.9980 corrupt_frac_t_0p8_1p0=0.0078 loss_all=1.0527 init_gold_top10=0.1559 init_gold_top100=0.2377
390
+ step=1500 epoch=1500/2000 epoch_step=1/1 micro_steps=1500 elapsed=4.0s lr=2.000000e-03 loss=1.3097 loss_recon=1.3097 loss_meanflow=0.0000 mean_model_t=0.2103 mean_corrupt_t=0.2103 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.6239 corrupt_frac=1.0000 acc_corrupt=0.6239 loss_corrupt=1.3097 wrong_frac=0.8648 init_acc_corrupt=0.1352 acc_corrupt_t_0p0_0p2=0.5615 corrupt_frac_t_0p0_0p2=0.5508 acc_corrupt_t_0p2_0p4=0.6788 corrupt_frac_t_0p2_0p4=0.3598 acc_corrupt_t_0p4_0p6=0.7833 corrupt_frac_t_0p4_0p6=0.0818 out_w_norm=11.1027 out_g_norm=1.2651 acc_corrupt_t_0p6_0p8=0.8208 corrupt_frac_t_0p6_0p8=0.0136 acc_corrupt_t_0p8_1p0=0.9941 corrupt_frac_t_0p8_1p0=0.0078 loss_all=1.4609 init_gold_top10=0.1219 init_gold_top100=0.2024
391
+ step=1600 epoch=1600/2000 epoch_step=1/1 micro_steps=1600 elapsed=4.0s lr=2.000000e-03 loss=1.1768 loss_recon=1.1768 loss_meanflow=0.0000 mean_model_t=0.2080 mean_corrupt_t=0.2080 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.6383 corrupt_frac=1.0000 acc_corrupt=0.6383 loss_corrupt=1.1768 wrong_frac=0.8662 init_acc_corrupt=0.1338 acc_corrupt_t_0p0_0p2=0.5824 corrupt_frac_t_0p0_0p2=0.5616 acc_corrupt_t_0p2_0p4=0.6902 corrupt_frac_t_0p2_0p4=0.3509 acc_corrupt_t_0p4_0p6=0.7812 corrupt_frac_t_0p4_0p6=0.0791 acc_corrupt_t_0p6_0p8=0.8484 corrupt_frac_t_0p6_0p8=0.0131 out_w_norm=11.1630 out_g_norm=1.2914 acc_corrupt_t_0p8_1p0=0.9971 corrupt_frac_t_0p8_1p0=0.0078 loss_all=1.3553 init_gold_top10=0.1371 init_gold_top100=0.2186
392
+ step=1700 epoch=1700/2000 epoch_step=1/1 micro_steps=1700 elapsed=4.0s lr=2.000000e-03 loss=1.0551 loss_recon=1.0551 loss_meanflow=0.0000 mean_model_t=0.2106 mean_corrupt_t=0.2106 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.6547 corrupt_frac=1.0000 acc_corrupt=0.6547 loss_corrupt=1.0551 wrong_frac=0.8641 init_acc_corrupt=0.1359 acc_corrupt_t_0p0_0p2=0.6084 corrupt_frac_t_0p0_0p2=0.5534 acc_corrupt_t_0p2_0p4=0.6984 corrupt_frac_t_0p2_0p4=0.3573 acc_corrupt_t_0p4_0p6=0.7629 corrupt_frac_t_0p4_0p6=0.0812 out_w_norm=11.2082 out_g_norm=1.2603 acc_corrupt_t_0p6_0p8=0.7996 corrupt_frac_t_0p6_0p8=0.0129 acc_corrupt_t_0p8_1p0=0.9980 corrupt_frac_t_0p8_1p0=0.0078 loss_all=1.0073 init_gold_top10=0.1297 init_gold_top100=0.2126
393
+ step=1800 epoch=1800/2000 epoch_step=1/1 micro_steps=1800 elapsed=4.0s lr=2.000000e-03 loss=0.9843 loss_recon=0.9843 loss_meanflow=0.0000 mean_model_t=0.2079 mean_corrupt_t=0.2079 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.6577 corrupt_frac=1.0000 acc_corrupt=0.6577 loss_corrupt=0.9843 wrong_frac=0.8684 init_acc_corrupt=0.1316 acc_corrupt_t_0p0_0p2=0.6154 corrupt_frac_t_0p0_0p2=0.5564 acc_corrupt_t_0p2_0p4=0.6959 corrupt_frac_t_0p2_0p4=0.3614 acc_corrupt_t_0p4_0p6=0.7686 corrupt_frac_t_0p4_0p6=0.0745 acc_corrupt_t_0p6_0p8=0.8488 corrupt_frac_t_0p6_0p8=0.0126 out_w_norm=11.2253 out_g_norm=1.1781 acc_corrupt_t_0p8_1p0=1.0000 corrupt_frac_t_0p8_1p0=0.0078 loss_all=1.0908 init_gold_top10=0.1366 init_gold_top100=0.2184
394
+ step=1900 epoch=1900/2000 epoch_step=1/1 micro_steps=1900 elapsed=4.0s lr=2.000000e-03 loss=0.8950 loss_recon=0.8950 loss_meanflow=0.0000 mean_model_t=0.2077 mean_corrupt_t=0.2077 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.6766 corrupt_frac=1.0000 acc_corrupt=0.6766 loss_corrupt=0.8950 wrong_frac=0.8669 init_acc_corrupt=0.1331 acc_corrupt_t_0p0_0p2=0.6318 corrupt_frac_t_0p0_0p2=0.5614 acc_corrupt_t_0p2_0p4=0.7194 corrupt_frac_t_0p2_0p4=0.3548 acc_corrupt_t_0p4_0p6=0.7877 corrupt_frac_t_0p4_0p6=0.0757 out_w_norm=11.2363 out_g_norm=1.1321 acc_corrupt_t_0p6_0p8=0.8710 corrupt_frac_t_0p6_0p8=0.0121 acc_corrupt_t_0p8_1p0=1.0000 corrupt_frac_t_0p8_1p0=0.0078 loss_all=0.6929 init_gold_top10=0.1616 init_gold_top100=0.2392
395
+ step=2000 epoch=2000/2000 epoch_step=1/1 micro_steps=2000 elapsed=4.0s lr=2.000000e-03 loss=0.8544 loss_recon=0.8544 loss_meanflow=0.0000 mean_model_t=0.2092 mean_corrupt_t=0.2092 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.6830 corrupt_frac=1.0000 acc_corrupt=0.6830 loss_corrupt=0.8544 wrong_frac=0.8652 init_acc_corrupt=0.1348 acc_corrupt_t_0p0_0p2=0.6393 corrupt_frac_t_0p0_0p2=0.5561 acc_corrupt_t_0p2_0p4=0.7180 corrupt_frac_t_0p2_0p4=0.3610 acc_corrupt_t_0p4_0p6=0.8173 corrupt_frac_t_0p4_0p6=0.0730 out_w_norm=11.2470 out_g_norm=1.0160 acc_corrupt_t_0p6_0p8=0.8743 corrupt_frac_t_0p6_0p8=0.0140 acc_corrupt_t_0p8_1p0=0.5039 corrupt_frac_t_0p8_1p0=0.0078 loss_all=0.7242 init_gold_top10=0.1450 init_gold_top100=0.2272
LTA_openwebtext_dualt/logs/softendpoint_mn_pilot_4gpu/train8_ctx1024_p50_path4_unif0_0p125_outwdm1_ctx1024_sampledpath_20260517_223933.log ADDED
@@ -0,0 +1,229 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ NCCL version 2.25.1+cuda12.8
2
+ {
3
+ "device": "cuda:0",
4
+ "rank": 0,
5
+ "world_size": 4,
6
+ "samples": "owt_cached_chunks:8",
7
+ "vocab_size": 2664,
8
+ "tokenizer_vocab_size": 50257,
9
+ "save_dir": "runs/train8_ctx1024_p50_path4_unif0_0p125_outwdm1_ctx1024_sampledpath_20260517_223933",
10
+ "batch_size": 128,
11
+ "grad_accum": 1,
12
+ "effective_batch_size": 512,
13
+ "global_batch_size": 512,
14
+ "lr_schedule": "constant_warmup",
15
+ "optimizer": "muon",
16
+ "epochs": 0.0,
17
+ "steps_per_epoch": 1,
18
+ "total_steps": 1000,
19
+ "warmup_steps": 10,
20
+ "warmup_epochs": -1.0,
21
+ "min_lr": 0.0,
22
+ "weight_decay": 0.1,
23
+ "output_weight_decay": -1.0,
24
+ "adamw_param_groups": "nanogpt",
25
+ "adam_beta1": 0.9,
26
+ "adam_beta2": 0.95,
27
+ "adam_eps": 1e-08,
28
+ "muon_impl": "legacy",
29
+ "muon_momentum": 0.95,
30
+ "muon_ns_steps": 5,
31
+ "muon_update_scale": 1.0,
32
+ "muon_nesterov": false,
33
+ "muon_width_scale": false,
34
+ "muon_grouping": "legacy_dim_ge_2",
35
+ "muon_param_count": 2616320,
36
+ "muon_adam_param_count": 8192,
37
+ "muon_param_names": [
38
+ "vocab_embed.embedding",
39
+ "sigma_map.net.0.weight",
40
+ "sigma_map.net.2.weight",
41
+ "blocks.0.attn_qkv.weight",
42
+ "blocks.0.attn_out.weight",
43
+ "blocks.0.mlp.0.weight",
44
+ "blocks.0.mlp.2.weight",
45
+ "blocks.0.adaLN_modulation.weight",
46
+ "blocks.1.attn_qkv.weight",
47
+ "blocks.1.attn_out.weight",
48
+ "blocks.1.mlp.0.weight",
49
+ "blocks.1.mlp.2.weight",
50
+ "blocks.1.adaLN_modulation.weight",
51
+ "blocks.2.attn_qkv.weight",
52
+ "blocks.2.attn_out.weight",
53
+ "blocks.2.mlp.0.weight",
54
+ "blocks.2.mlp.2.weight",
55
+ "blocks.2.adaLN_modulation.weight",
56
+ "output_layer.linear.weight",
57
+ "output_layer.adaLN_modulation.weight"
58
+ ],
59
+ "muon_adam_param_names": [
60
+ "sigma_map.net.0.bias",
61
+ "sigma_map.net.2.bias",
62
+ "blocks.0.norm1.weight",
63
+ "blocks.0.norm2.weight",
64
+ "blocks.0.mlp.0.bias",
65
+ "blocks.0.mlp.2.bias",
66
+ "blocks.0.adaLN_modulation.bias",
67
+ "blocks.1.norm1.weight",
68
+ "blocks.1.norm2.weight",
69
+ "blocks.1.mlp.0.bias",
70
+ "blocks.1.mlp.2.bias",
71
+ "blocks.1.adaLN_modulation.bias",
72
+ "blocks.2.norm1.weight",
73
+ "blocks.2.norm2.weight",
74
+ "blocks.2.mlp.0.bias",
75
+ "blocks.2.mlp.2.bias",
76
+ "blocks.2.adaLN_modulation.bias",
77
+ "output_layer.norm_final.weight",
78
+ "output_layer.adaLN_modulation.bias"
79
+ ],
80
+ "muon_effective_nesterov": false,
81
+ "muon_effective_width_scale": false,
82
+ "muon_effective_weight_decay": 0.1,
83
+ "muon_adam_fallback_nesterov": false,
84
+ "muon_adam_fallback_weight_decay": 0.1,
85
+ "ema_decay": 0.9999,
86
+ "ema_start_step": 0,
87
+ "model_type": "ddit",
88
+ "ddit_mlp_type": "gelu",
89
+ "elf_num_time_tokens": 4,
90
+ "elf_num_model_mode_tokens": 0,
91
+ "qk_norm": true,
92
+ "output_bias": false,
93
+ "output_init_std": -1.0,
94
+ "norm_type": "rmsnorm",
95
+ "target_loss": "hard_ce",
96
+ "linear_soft_target_power": 1.0,
97
+ "linear_soft_target_min_conf": 0.0,
98
+ "linear_soft_target_max_conf": 1.0,
99
+ "t_sampling_mode": "logit_normal",
100
+ "t_sampling_power": 1.0,
101
+ "t_sampling_eps": 0.0001,
102
+ "t_sampling_logit_mean": -1.5,
103
+ "t_sampling_logit_std": 0.8,
104
+ "dual_t": true,
105
+ "corrupt_t_mode": "same",
106
+ "corrupt_min_t": 0.0,
107
+ "corrupt_max_t": 1.0,
108
+ "prefix_block_prob": 0.0,
109
+ "prefix_block_len": 128,
110
+ "mask_ratio_floor_schedule": "none",
111
+ "dirichlet_endpoint_mode": "categorical_dual_t",
112
+ "dirichlet_semantic_t_mode": "same",
113
+ "dirichlet_semantic_t_value": 0.0,
114
+ "dirichlet_semantic_t_curve": "linear",
115
+ "dirichlet_semantic_t_power": 1.0,
116
+ "endpoint_sequence_random_prob_alpha": 0.0,
117
+ "categorical_wrong_from_full_vocab": true,
118
+ "categorical_wrong_from_batch_valid_tokens": false,
119
+ "categorical_wrong_basin_token_ids": "",
120
+ "categorical_wrong_basin_prob": 0.0,
121
+ "categorical_wrong_unigram_prob": 0.0,
122
+ "categorical_wrong_uniform_prob": 0.0,
123
+ "categorical_wrong_prob_floor": 0.0,
124
+ "categorical_wrong_corpus_unigram_path": "",
125
+ "categorical_wrong_corpus_unigram_alpha": 1.0,
126
+ "categorical_wrong_basin_shared_prob": 0.0,
127
+ "categorical_wrong_unigram_shared_prob": 0.0,
128
+ "mask_mixture_original_prob": 0.0,
129
+ "mask_mixture_lowk_prob": 0.0,
130
+ "mask_mixture_lowcorrupt_prob": 0.0,
131
+ "mask_mixture_block_prob": 0.0,
132
+ "mask_mixture_all_prob": 1.0,
133
+ "mask_mixture_lowk_clean_tokens": "0",
134
+ "mask_mixture_lowcorrupt_tokens": "1,2,4,8,16,32,64",
135
+ "mask_mixture_block_tokens": "64,128",
136
+ "simplex_bridge_sampler": "dirichlet",
137
+ "logistic_normal_sigma_min": 0.1,
138
+ "logistic_normal_sigma_max": 1.0,
139
+ "logistic_normal_tau_min": 1.0,
140
+ "logistic_normal_tau_max": 1.0,
141
+ "torch_compile": false,
142
+ "compile_mode": "max-autotune",
143
+ "state_format": "prob",
144
+ "meanflow_weight": 0.0,
145
+ "rollout_train_prob": 0.5,
146
+ "rollout_train_steps": 1,
147
+ "rollout_train_infer_steps": 1,
148
+ "rollout_train_time_mode": "sampled_s",
149
+ "rollout_train_s_dist": "uniform",
150
+ "rollout_train_s_min_frac": 0.0,
151
+ "rollout_train_s_max_frac": 0.125,
152
+ "rollout_train_s_beta_alpha": 2.0,
153
+ "rollout_train_s_beta_beta": 6.0,
154
+ "rollout_train_temp": 1.45,
155
+ "rollout_train_max_gamma": 1.0,
156
+ "rollout_train_corrupt_only": true,
157
+ "rollout_train_samplewise": true,
158
+ "rollout_train_compute_always": false,
159
+ "rollout_train_sync_t": true,
160
+ "bridge_noise_init": "logistic_normal",
161
+ "noise_sigma": -1.0,
162
+ "allow_tf32": true,
163
+ "activation_checkpointing": false,
164
+ "activation_checkpoint_interval": 1,
165
+ "activation_checkpoint_scope": "block",
166
+ "ddp_static_graph": false,
167
+ "ddp_gradient_as_bucket_view": true,
168
+ "blocking_data_transfer": false,
169
+ "dataloader_prefetch_factor": 4,
170
+ "full_train_stats": false,
171
+ "tokenized_hf": false,
172
+ "tokenized_pad_token": "pad",
173
+ "elf_conditional_hf": false,
174
+ "record_pad_truncate": false,
175
+ "record_add_eos": false,
176
+ "record_add_special_tokens": false,
177
+ "record_pad_token": "pad",
178
+ "record_shuffle_buffer": 10000,
179
+ "wrap": true,
180
+ "wrap_mode": "stream",
181
+ "wrap_record_buffer_size": 200,
182
+ "owt_cached_chunks": true,
183
+ "owt_chunk_cache_dir": "/e2e-data/evad-tech-vla/wanghan58/data/small_benchmarks/langflow_2604_11748/openwebtext_lta_cached_chunks/gpt2_len1024_train8_compact_overfit",
184
+ "owt_chunk_cache_rebuild": false,
185
+ "owt_chunk_cache_write_batch": 4096,
186
+ "owt_exact_repeat_per_chunk": 64,
187
+ "online_chunk_shuffle": false,
188
+ "online_chunk_shuffle_buffer": 10000,
189
+ "openwebtext_split": "train_minus_100k",
190
+ "detokenizer": "auto",
191
+ "resolved_detokenizer": null,
192
+ "num_workers": 0,
193
+ "latest_every": 1000,
194
+ "resume_path": ""
195
+ }
196
+ W0517 22:40:01.897000 386925 torch/distributed/elastic/agent/server/api.py:719] Received 15 death signal, shutting down workers
197
+ W0517 22:40:01.899000 386925 torch/distributed/elastic/multiprocessing/api.py:898] Sending process 386929 closing signal SIGTERM
198
+ W0517 22:40:01.900000 386925 torch/distributed/elastic/multiprocessing/api.py:898] Sending process 386930 closing signal SIGTERM
199
+ W0517 22:40:01.900000 386925 torch/distributed/elastic/multiprocessing/api.py:898] Sending process 386931 closing signal SIGTERM
200
+ W0517 22:40:01.901000 386925 torch/distributed/elastic/multiprocessing/api.py:898] Sending process 386932 closing signal SIGTERM
201
+ Traceback (most recent call last):
202
+ File "<frozen runpy>", line 198, in _run_module_as_main
203
+ File "<frozen runpy>", line 88, in _run_code
204
+ File "/usr/local/lib/python3.12/dist-packages/torch/distributed/run.py", line 922, in <module>
205
+ main()
206
+ File "/usr/local/lib/python3.12/dist-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 355, in wrapper
207
+ return f(*args, **kwargs)
208
+ ^^^^^^^^^^^^^^^^^^
209
+ File "/usr/local/lib/python3.12/dist-packages/torch/distributed/run.py", line 918, in main
210
+ run(args)
211
+ File "/usr/local/lib/python3.12/dist-packages/torch/distributed/run.py", line 909, in run
212
+ elastic_launch(
213
+ File "/usr/local/lib/python3.12/dist-packages/torch/distributed/launcher/api.py", line 139, in __call__
214
+ return launch_agent(self._config, self._entrypoint, list(args))
215
+ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
216
+ File "/usr/local/lib/python3.12/dist-packages/torch/distributed/launcher/api.py", line 261, in launch_agent
217
+ result = agent.run()
218
+ ^^^^^^^^^^^
219
+ File "/usr/local/lib/python3.12/dist-packages/torch/distributed/elastic/metrics/api.py", line 137, in wrapper
220
+ result = f(*args, **kwargs)
221
+ ^^^^^^^^^^^^^^^^^^
222
+ File "/usr/local/lib/python3.12/dist-packages/torch/distributed/elastic/agent/server/api.py", line 711, in run
223
+ result = self._invoke_run(role)
224
+ ^^^^^^^^^^^^^^^^^^^^^^
225
+ File "/usr/local/lib/python3.12/dist-packages/torch/distributed/elastic/agent/server/api.py", line 870, in _invoke_run
226
+ time.sleep(monitor_interval)
227
+ File "/usr/local/lib/python3.12/dist-packages/torch/distributed/elastic/multiprocessing/api.py", line 84, in _terminate_process_handler
228
+ raise SignalException(f"Process {os.getpid()} got signal: {sigval}", sigval=sigval)
229
+ torch.distributed.elastic.multiprocessing.api.SignalException: Process 386925 got signal: 15
LTA_openwebtext_dualt/logs/softendpoint_mn_pilot_4gpu/train8_ctx1024_p50_path4_unif0_0p125_outwdm1_ctx1024_sampledpath_true_20260517_224139.log ADDED
@@ -0,0 +1,609 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ NCCL version 2.25.1+cuda12.8
2
+ {
3
+ "device": "cuda:0",
4
+ "rank": 0,
5
+ "world_size": 4,
6
+ "samples": "owt_cached_chunks:8",
7
+ "vocab_size": 2664,
8
+ "tokenizer_vocab_size": 50257,
9
+ "save_dir": "runs/train8_ctx1024_p50_path4_unif0_0p125_outwdm1_ctx1024_sampledpath_true_20260517_224139",
10
+ "batch_size": 128,
11
+ "grad_accum": 1,
12
+ "effective_batch_size": 512,
13
+ "global_batch_size": 512,
14
+ "lr_schedule": "constant_warmup",
15
+ "optimizer": "muon",
16
+ "epochs": 0.0,
17
+ "steps_per_epoch": 1,
18
+ "total_steps": 1000,
19
+ "warmup_steps": 10,
20
+ "warmup_epochs": -1.0,
21
+ "min_lr": 0.0,
22
+ "weight_decay": 0.1,
23
+ "output_weight_decay": -1.0,
24
+ "adamw_param_groups": "nanogpt",
25
+ "adam_beta1": 0.9,
26
+ "adam_beta2": 0.95,
27
+ "adam_eps": 1e-08,
28
+ "muon_impl": "legacy",
29
+ "muon_momentum": 0.95,
30
+ "muon_ns_steps": 5,
31
+ "muon_update_scale": 1.0,
32
+ "muon_nesterov": false,
33
+ "muon_width_scale": false,
34
+ "muon_grouping": "legacy_dim_ge_2",
35
+ "muon_param_count": 2616320,
36
+ "muon_adam_param_count": 8192,
37
+ "muon_param_names": [
38
+ "vocab_embed.embedding",
39
+ "sigma_map.net.0.weight",
40
+ "sigma_map.net.2.weight",
41
+ "blocks.0.attn_qkv.weight",
42
+ "blocks.0.attn_out.weight",
43
+ "blocks.0.mlp.0.weight",
44
+ "blocks.0.mlp.2.weight",
45
+ "blocks.0.adaLN_modulation.weight",
46
+ "blocks.1.attn_qkv.weight",
47
+ "blocks.1.attn_out.weight",
48
+ "blocks.1.mlp.0.weight",
49
+ "blocks.1.mlp.2.weight",
50
+ "blocks.1.adaLN_modulation.weight",
51
+ "blocks.2.attn_qkv.weight",
52
+ "blocks.2.attn_out.weight",
53
+ "blocks.2.mlp.0.weight",
54
+ "blocks.2.mlp.2.weight",
55
+ "blocks.2.adaLN_modulation.weight",
56
+ "output_layer.linear.weight",
57
+ "output_layer.adaLN_modulation.weight"
58
+ ],
59
+ "muon_adam_param_names": [
60
+ "sigma_map.net.0.bias",
61
+ "sigma_map.net.2.bias",
62
+ "blocks.0.norm1.weight",
63
+ "blocks.0.norm2.weight",
64
+ "blocks.0.mlp.0.bias",
65
+ "blocks.0.mlp.2.bias",
66
+ "blocks.0.adaLN_modulation.bias",
67
+ "blocks.1.norm1.weight",
68
+ "blocks.1.norm2.weight",
69
+ "blocks.1.mlp.0.bias",
70
+ "blocks.1.mlp.2.bias",
71
+ "blocks.1.adaLN_modulation.bias",
72
+ "blocks.2.norm1.weight",
73
+ "blocks.2.norm2.weight",
74
+ "blocks.2.mlp.0.bias",
75
+ "blocks.2.mlp.2.bias",
76
+ "blocks.2.adaLN_modulation.bias",
77
+ "output_layer.norm_final.weight",
78
+ "output_layer.adaLN_modulation.bias"
79
+ ],
80
+ "muon_effective_nesterov": false,
81
+ "muon_effective_width_scale": false,
82
+ "muon_effective_weight_decay": 0.1,
83
+ "muon_adam_fallback_nesterov": false,
84
+ "muon_adam_fallback_weight_decay": 0.1,
85
+ "ema_decay": 0.9999,
86
+ "ema_start_step": 0,
87
+ "model_type": "ddit",
88
+ "ddit_mlp_type": "gelu",
89
+ "elf_num_time_tokens": 4,
90
+ "elf_num_model_mode_tokens": 0,
91
+ "qk_norm": true,
92
+ "output_bias": false,
93
+ "output_init_std": -1.0,
94
+ "norm_type": "rmsnorm",
95
+ "target_loss": "hard_ce",
96
+ "linear_soft_target_power": 1.0,
97
+ "linear_soft_target_min_conf": 0.0,
98
+ "linear_soft_target_max_conf": 1.0,
99
+ "t_sampling_mode": "logit_normal",
100
+ "t_sampling_power": 1.0,
101
+ "t_sampling_eps": 0.0001,
102
+ "t_sampling_logit_mean": -1.5,
103
+ "t_sampling_logit_std": 0.8,
104
+ "dual_t": true,
105
+ "corrupt_t_mode": "same",
106
+ "corrupt_min_t": 0.0,
107
+ "corrupt_max_t": 1.0,
108
+ "prefix_block_prob": 0.0,
109
+ "prefix_block_len": 128,
110
+ "mask_ratio_floor_schedule": "none",
111
+ "dirichlet_endpoint_mode": "categorical_dual_t",
112
+ "dirichlet_semantic_t_mode": "same",
113
+ "dirichlet_semantic_t_value": 0.0,
114
+ "dirichlet_semantic_t_curve": "linear",
115
+ "dirichlet_semantic_t_power": 1.0,
116
+ "endpoint_sequence_random_prob_alpha": 0.0,
117
+ "categorical_wrong_from_full_vocab": true,
118
+ "categorical_wrong_from_batch_valid_tokens": false,
119
+ "categorical_wrong_basin_token_ids": "",
120
+ "categorical_wrong_basin_prob": 0.0,
121
+ "categorical_wrong_unigram_prob": 0.0,
122
+ "categorical_wrong_uniform_prob": 0.0,
123
+ "categorical_wrong_prob_floor": 0.0,
124
+ "categorical_wrong_corpus_unigram_path": "",
125
+ "categorical_wrong_corpus_unigram_alpha": 1.0,
126
+ "categorical_wrong_basin_shared_prob": 0.0,
127
+ "categorical_wrong_unigram_shared_prob": 0.0,
128
+ "mask_mixture_original_prob": 0.0,
129
+ "mask_mixture_lowk_prob": 0.0,
130
+ "mask_mixture_lowcorrupt_prob": 0.0,
131
+ "mask_mixture_block_prob": 0.0,
132
+ "mask_mixture_all_prob": 1.0,
133
+ "mask_mixture_lowk_clean_tokens": "0",
134
+ "mask_mixture_lowcorrupt_tokens": "1,2,4,8,16,32,64",
135
+ "mask_mixture_block_tokens": "64,128",
136
+ "simplex_bridge_sampler": "dirichlet",
137
+ "logistic_normal_sigma_min": 0.1,
138
+ "logistic_normal_sigma_max": 1.0,
139
+ "logistic_normal_tau_min": 1.0,
140
+ "logistic_normal_tau_max": 1.0,
141
+ "torch_compile": false,
142
+ "compile_mode": "max-autotune",
143
+ "state_format": "prob",
144
+ "meanflow_weight": 0.0,
145
+ "rollout_train_prob": 0.5,
146
+ "rollout_train_steps": 4,
147
+ "rollout_train_infer_steps": 1,
148
+ "rollout_train_time_mode": "sampled_path",
149
+ "rollout_train_s_dist": "uniform",
150
+ "rollout_train_s_min_frac": 0.0,
151
+ "rollout_train_s_max_frac": 0.125,
152
+ "rollout_train_s_beta_alpha": 2.0,
153
+ "rollout_train_s_beta_beta": 6.0,
154
+ "rollout_train_temp": 1.45,
155
+ "rollout_train_max_gamma": 1.0,
156
+ "rollout_train_corrupt_only": true,
157
+ "rollout_train_samplewise": true,
158
+ "rollout_train_compute_always": false,
159
+ "rollout_train_sync_t": true,
160
+ "bridge_noise_init": "logistic_normal",
161
+ "noise_sigma": -1.0,
162
+ "allow_tf32": true,
163
+ "activation_checkpointing": false,
164
+ "activation_checkpoint_interval": 1,
165
+ "activation_checkpoint_scope": "block",
166
+ "ddp_static_graph": false,
167
+ "ddp_gradient_as_bucket_view": true,
168
+ "blocking_data_transfer": false,
169
+ "dataloader_prefetch_factor": 4,
170
+ "full_train_stats": false,
171
+ "tokenized_hf": false,
172
+ "tokenized_pad_token": "pad",
173
+ "elf_conditional_hf": false,
174
+ "record_pad_truncate": false,
175
+ "record_add_eos": false,
176
+ "record_add_special_tokens": false,
177
+ "record_pad_token": "pad",
178
+ "record_shuffle_buffer": 10000,
179
+ "wrap": true,
180
+ "wrap_mode": "stream",
181
+ "wrap_record_buffer_size": 200,
182
+ "owt_cached_chunks": true,
183
+ "owt_chunk_cache_dir": "/e2e-data/evad-tech-vla/wanghan58/data/small_benchmarks/langflow_2604_11748/openwebtext_lta_cached_chunks/gpt2_len1024_train8_compact_overfit",
184
+ "owt_chunk_cache_rebuild": false,
185
+ "owt_chunk_cache_write_batch": 4096,
186
+ "owt_exact_repeat_per_chunk": 64,
187
+ "online_chunk_shuffle": false,
188
+ "online_chunk_shuffle_buffer": 10000,
189
+ "openwebtext_split": "train_minus_100k",
190
+ "detokenizer": "auto",
191
+ "resolved_detokenizer": null,
192
+ "num_workers": 0,
193
+ "latest_every": 1000,
194
+ "resume_path": ""
195
+ }
196
+ step=100 epoch=100/1000 epoch_step=1/1 micro_steps=100 elapsed=24.8s lr=2.000000e-03 loss=7.7206 loss_recon=7.7206 loss_meanflow=0.0000 mean_model_t=0.2085 mean_corrupt_t=0.2085 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.5077 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.0958 corrupt_frac=1.0000 acc_corrupt=0.0958 loss_corrupt=7.7206 wrong_frac=0.7915 init_acc_corrupt=0.1164 acc_corrupt_t_0p0_0p2=0.0500 corrupt_frac_t_0p0_0p2=0.5640 acc_corrupt_t_0p2_0p4=0.1270 corrupt_frac_t_0p2_0p4=0.3466 acc_corrupt_t_0p4_0p6=0.2493 corrupt_frac_t_0p4_0p6=0.0791 acc_corrupt_t_0p6_0p8=0.3719 corrupt_frac_t_0p6_0p8=0.0136 out_w_norm=1.0047 out_g_norm=1.0928 acc_corrupt_t_0p8_1p0=0.4936 corrupt_frac_t_0p8_1p0=0.0078 loss_all=7.4724 init_gold_top10=0.2003 init_gold_top100=0.4085 rollout_applied_pos_frac=0.4453 init_acc_rollout_applied=0.1056 init_acc_rollout_kept=0.1192 logit_acc_rollout_applied=0.0969 logit_acc_rollout_kept=0.0996
197
+ step=200 epoch=200/1000 epoch_step=1/1 micro_steps=200 elapsed=23.9s lr=2.000000e-03 loss=7.0874 loss_recon=7.0874 loss_meanflow=0.0000 mean_model_t=0.2096 mean_corrupt_t=0.2096 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.4995 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.1036 corrupt_frac=1.0000 acc_corrupt=0.1036 loss_corrupt=7.0874 wrong_frac=0.7905 init_acc_corrupt=0.1172 acc_corrupt_t_0p0_0p2=0.0560 corrupt_frac_t_0p0_0p2=0.5557 acc_corrupt_t_0p2_0p4=0.1392 corrupt_frac_t_0p2_0p4=0.3595 acc_corrupt_t_0p4_0p6=0.2552 corrupt_frac_t_0p4_0p6=0.0762 acc_corrupt_t_0p6_0p8=0.3485 corrupt_frac_t_0p6_0p8=0.0129 out_w_norm=2.8612 out_g_norm=1.7761 acc_corrupt_t_0p8_1p0=0.4243 corrupt_frac_t_0p8_1p0=0.0078 loss_all=6.6891 init_gold_top10=0.2090 init_gold_top100=0.4276 rollout_applied_pos_frac=0.4688 init_acc_rollout_applied=0.1378 init_acc_rollout_kept=0.1215 logit_acc_rollout_applied=0.1143 logit_acc_rollout_kept=0.1146
198
+ step=300 epoch=300/1000 epoch_step=1/1 micro_steps=300 elapsed=24.0s lr=2.000000e-03 loss=6.4546 loss_recon=6.4546 loss_meanflow=0.0000 mean_model_t=0.2098 mean_corrupt_t=0.2098 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.5023 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.1137 corrupt_frac=1.0000 acc_corrupt=0.1137 loss_corrupt=6.4546 wrong_frac=0.7902 init_acc_corrupt=0.1187 acc_corrupt_t_0p0_0p2=0.0592 corrupt_frac_t_0p0_0p2=0.5544 acc_corrupt_t_0p2_0p4=0.1548 corrupt_frac_t_0p2_0p4=0.3617 acc_corrupt_t_0p4_0p6=0.2839 corrupt_frac_t_0p4_0p6=0.0743 out_w_norm=4.3408 out_g_norm=1.3199 acc_corrupt_t_0p6_0p8=0.3901 corrupt_frac_t_0p6_0p8=0.0139 acc_corrupt_t_0p8_1p0=0.5415 corrupt_frac_t_0p8_1p0=0.0078 loss_all=6.2279 init_gold_top10=0.2020 init_gold_top100=0.4348 rollout_applied_pos_frac=0.4375 init_acc_rollout_applied=0.0878 init_acc_rollout_kept=0.1243 logit_acc_rollout_applied=0.1032 logit_acc_rollout_kept=0.1241
199
+ step=400 epoch=400/1000 epoch_step=1/1 micro_steps=400 elapsed=23.9s lr=2.000000e-03 loss=5.9837 loss_recon=5.9837 loss_meanflow=0.0000 mean_model_t=0.2072 mean_corrupt_t=0.2072 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.5030 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.1233 corrupt_frac=1.0000 acc_corrupt=0.1233 loss_corrupt=5.9837 wrong_frac=0.7929 init_acc_corrupt=0.1160 acc_corrupt_t_0p0_0p2=0.0639 corrupt_frac_t_0p0_0p2=0.5638 acc_corrupt_t_0p2_0p4=0.1708 corrupt_frac_t_0p2_0p4=0.3526 acc_corrupt_t_0p4_0p6=0.3106 corrupt_frac_t_0p4_0p6=0.0753 out_w_norm=5.4789 out_g_norm=0.5031 acc_corrupt_t_0p6_0p8=0.4367 corrupt_frac_t_0p6_0p8=0.0128 acc_corrupt_t_0p8_1p0=0.5306 corrupt_frac_t_0p8_1p0=0.0117 loss_all=5.7599 init_gold_top10=0.2017 init_gold_top100=0.4699 rollout_applied_pos_frac=0.4844 init_acc_rollout_applied=0.1182 init_acc_rollout_kept=0.1042 logit_acc_rollout_applied=0.1310 logit_acc_rollout_kept=0.1227
200
+ step=500 epoch=500/1000 epoch_step=1/1 micro_steps=500 elapsed=23.9s lr=2.000000e-03 loss=5.4774 loss_recon=5.4774 loss_meanflow=0.0000 mean_model_t=0.2101 mean_corrupt_t=0.2101 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.4994 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.1378 corrupt_frac=1.0000 acc_corrupt=0.1378 loss_corrupt=5.4774 wrong_frac=0.7898 init_acc_corrupt=0.1197 acc_corrupt_t_0p0_0p2=0.0677 corrupt_frac_t_0p0_0p2=0.5506 acc_corrupt_t_0p2_0p4=0.1918 corrupt_frac_t_0p2_0p4=0.3660 acc_corrupt_t_0p4_0p6=0.3501 corrupt_frac_t_0p4_0p6=0.0752 acc_corrupt_t_0p6_0p8=0.4913 corrupt_frac_t_0p6_0p8=0.0128 out_w_norm=6.7180 out_g_norm=0.4070 acc_corrupt_t_0p8_1p0=0.6104 corrupt_frac_t_0p8_1p0=0.0078 loss_all=5.1003 init_gold_top10=0.2273 init_gold_top100=0.5161 rollout_applied_pos_frac=0.4922 init_acc_rollout_applied=0.1306 init_acc_rollout_kept=0.1486 logit_acc_rollout_applied=0.1487 logit_acc_rollout_kept=0.1648
201
+ step=600 epoch=600/1000 epoch_step=1/1 micro_steps=600 elapsed=23.9s lr=2.000000e-03 loss=4.8712 loss_recon=4.8712 loss_meanflow=0.0000 mean_model_t=0.2082 mean_corrupt_t=0.2082 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.5037 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.1521 corrupt_frac=1.0000 acc_corrupt=0.1521 loss_corrupt=4.8712 wrong_frac=0.7918 init_acc_corrupt=0.1187 acc_corrupt_t_0p0_0p2=0.0719 corrupt_frac_t_0p0_0p2=0.5629 acc_corrupt_t_0p2_0p4=0.2147 corrupt_frac_t_0p2_0p4=0.3502 acc_corrupt_t_0p4_0p6=0.4001 corrupt_frac_t_0p4_0p6=0.0768 out_w_norm=7.9804 out_g_norm=0.4280 acc_corrupt_t_0p6_0p8=0.5625 corrupt_frac_t_0p6_0p8=0.0133 loss_all=4.5656 init_gold_top10=0.2029 init_gold_top100=0.5925 rollout_applied_pos_frac=0.4922 init_acc_rollout_applied=0.1124 init_acc_rollout_kept=0.1078 logit_acc_rollout_applied=0.1600 logit_acc_rollout_kept=0.1533
202
+ step=700 epoch=700/1000 epoch_step=1/1 micro_steps=700 elapsed=24.1s lr=2.000000e-03 loss=4.2343 loss_recon=4.2343 loss_meanflow=0.0000 mean_model_t=0.2085 mean_corrupt_t=0.2085 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.5123 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.1770 corrupt_frac=1.0000 acc_corrupt=0.1770 loss_corrupt=4.2343 wrong_frac=0.7915 init_acc_corrupt=0.1192 acc_corrupt_t_0p0_0p2=0.0787 corrupt_frac_t_0p0_0p2=0.5591 acc_corrupt_t_0p2_0p4=0.2531 corrupt_frac_t_0p2_0p4=0.3563 acc_corrupt_t_0p4_0p6=0.4880 corrupt_frac_t_0p4_0p6=0.0764 out_w_norm=9.1564 out_g_norm=0.4726 acc_corrupt_t_0p6_0p8=0.6622 corrupt_frac_t_0p6_0p8=0.0125 acc_corrupt_t_0p8_1p0=0.8376 corrupt_frac_t_0p8_1p0=0.0078 loss_all=3.9427 init_gold_top10=0.2206 init_gold_top100=0.6230 rollout_applied_pos_frac=0.5156 init_acc_rollout_applied=0.1283 init_acc_rollout_kept=0.1214 logit_acc_rollout_applied=0.1920 logit_acc_rollout_kept=0.1924
203
+ step=800 epoch=800/1000 epoch_step=1/1 micro_steps=800 elapsed=23.9s lr=2.000000e-03 loss=3.7296 loss_recon=3.7296 loss_meanflow=0.0000 mean_model_t=0.2092 mean_corrupt_t=0.2092 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.5020 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.2008 corrupt_frac=1.0000 acc_corrupt=0.2008 loss_corrupt=3.7296 wrong_frac=0.7911 init_acc_corrupt=0.1208 acc_corrupt_t_0p0_0p2=0.0884 corrupt_frac_t_0p0_0p2=0.5509 acc_corrupt_t_0p2_0p4=0.2944 corrupt_frac_t_0p2_0p4=0.3674 acc_corrupt_t_0p4_0p6=0.5237 corrupt_frac_t_0p4_0p6=0.0748 acc_corrupt_t_0p6_0p8=0.6942 corrupt_frac_t_0p6_0p8=0.0126 out_w_norm=9.9997 out_g_norm=0.5989 acc_corrupt_t_0p8_1p0=0.8555 corrupt_frac_t_0p8_1p0=0.0078 loss_all=3.4499 init_gold_top10=0.2440 init_gold_top100=0.5899 rollout_applied_pos_frac=0.4531 init_acc_rollout_applied=0.1209 init_acc_rollout_kept=0.1210 logit_acc_rollout_applied=0.2120 logit_acc_rollout_kept=0.2282
204
+ step=900 epoch=900/1000 epoch_step=1/1 micro_steps=900 elapsed=24.0s lr=2.000000e-03 loss=3.2983 loss_recon=3.2983 loss_meanflow=0.0000 mean_model_t=0.2095 mean_corrupt_t=0.2095 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.5052 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.2379 corrupt_frac=1.0000 acc_corrupt=0.2379 loss_corrupt=3.2983 wrong_frac=0.7905 init_acc_corrupt=0.1230 acc_corrupt_t_0p0_0p2=0.1028 corrupt_frac_t_0p0_0p2=0.5516 acc_corrupt_t_0p2_0p4=0.3571 corrupt_frac_t_0p2_0p4=0.3645 acc_corrupt_t_0p4_0p6=0.5909 corrupt_frac_t_0p4_0p6=0.0748 acc_corrupt_t_0p6_0p8=0.7411 corrupt_frac_t_0p6_0p8=0.0123 out_w_norm=10.4993 out_g_norm=0.8729 acc_corrupt_t_0p8_1p0=0.8477 corrupt_frac_t_0p8_1p0=0.0078 loss_all=3.0155 init_gold_top10=0.2915 init_gold_top100=0.6315 rollout_applied_pos_frac=0.4766 init_acc_rollout_applied=0.1272 init_acc_rollout_kept=0.1170 logit_acc_rollout_applied=0.2742 logit_acc_rollout_kept=0.2656
205
+ step=1000 epoch=1000/1000 epoch_step=1/1 micro_steps=1000 elapsed=23.8s lr=2.000000e-03 loss=2.8778 loss_recon=2.8778 loss_meanflow=0.0000 mean_model_t=0.2077 mean_corrupt_t=0.2077 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.4952 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.2973 corrupt_frac=1.0000 acc_corrupt=0.2973 loss_corrupt=2.8778 wrong_frac=0.7924 init_acc_corrupt=0.1228 acc_corrupt_t_0p0_0p2=0.1297 corrupt_frac_t_0p0_0p2=0.5567 acc_corrupt_t_0p2_0p4=0.4607 corrupt_frac_t_0p2_0p4=0.3598 acc_corrupt_t_0p4_0p6=0.6989 corrupt_frac_t_0p4_0p6=0.0751 acc_corrupt_t_0p6_0p8=0.8142 corrupt_frac_t_0p6_0p8=0.0128 out_w_norm=10.8390 out_g_norm=1.0522 loss_all=2.7461 init_gold_top10=0.3628 init_gold_top100=0.6665 rollout_applied_pos_frac=0.5234 init_acc_rollout_applied=0.1368 init_acc_rollout_kept=0.0965 logit_acc_rollout_applied=0.3638 logit_acc_rollout_kept=0.2733
206
+ NCCL version 2.25.1+cuda12.8
207
+ resumed_from=runs/train8_ctx1024_p50_path4_unif0_0p125_outwdm1_ctx1024_sampledpath_true_20260517_224139/latest.pt start_step=1001
208
+ {
209
+ "device": "cuda:0",
210
+ "rank": 0,
211
+ "world_size": 4,
212
+ "samples": "owt_cached_chunks:8",
213
+ "vocab_size": 2664,
214
+ "tokenizer_vocab_size": 50257,
215
+ "save_dir": "runs/train8_ctx1024_p50_path4_unif0_0p125_outwdm1_ctx1024_sampledpath_true_20260517_224139",
216
+ "batch_size": 128,
217
+ "grad_accum": 1,
218
+ "effective_batch_size": 512,
219
+ "global_batch_size": 512,
220
+ "lr_schedule": "constant_warmup",
221
+ "optimizer": "muon",
222
+ "epochs": 0.0,
223
+ "steps_per_epoch": 1,
224
+ "total_steps": 2000,
225
+ "warmup_steps": 10,
226
+ "warmup_epochs": -1.0,
227
+ "min_lr": 0.0,
228
+ "weight_decay": 0.1,
229
+ "output_weight_decay": -1.0,
230
+ "adamw_param_groups": "nanogpt",
231
+ "adam_beta1": 0.9,
232
+ "adam_beta2": 0.95,
233
+ "adam_eps": 1e-08,
234
+ "muon_impl": "legacy",
235
+ "muon_momentum": 0.95,
236
+ "muon_ns_steps": 5,
237
+ "muon_update_scale": 1.0,
238
+ "muon_nesterov": false,
239
+ "muon_width_scale": false,
240
+ "muon_grouping": "legacy_dim_ge_2",
241
+ "muon_param_count": 2616320,
242
+ "muon_adam_param_count": 8192,
243
+ "muon_param_names": [
244
+ "vocab_embed.embedding",
245
+ "sigma_map.net.0.weight",
246
+ "sigma_map.net.2.weight",
247
+ "blocks.0.attn_qkv.weight",
248
+ "blocks.0.attn_out.weight",
249
+ "blocks.0.mlp.0.weight",
250
+ "blocks.0.mlp.2.weight",
251
+ "blocks.0.adaLN_modulation.weight",
252
+ "blocks.1.attn_qkv.weight",
253
+ "blocks.1.attn_out.weight",
254
+ "blocks.1.mlp.0.weight",
255
+ "blocks.1.mlp.2.weight",
256
+ "blocks.1.adaLN_modulation.weight",
257
+ "blocks.2.attn_qkv.weight",
258
+ "blocks.2.attn_out.weight",
259
+ "blocks.2.mlp.0.weight",
260
+ "blocks.2.mlp.2.weight",
261
+ "blocks.2.adaLN_modulation.weight",
262
+ "output_layer.linear.weight",
263
+ "output_layer.adaLN_modulation.weight"
264
+ ],
265
+ "muon_adam_param_names": [
266
+ "sigma_map.net.0.bias",
267
+ "sigma_map.net.2.bias",
268
+ "blocks.0.norm1.weight",
269
+ "blocks.0.norm2.weight",
270
+ "blocks.0.mlp.0.bias",
271
+ "blocks.0.mlp.2.bias",
272
+ "blocks.0.adaLN_modulation.bias",
273
+ "blocks.1.norm1.weight",
274
+ "blocks.1.norm2.weight",
275
+ "blocks.1.mlp.0.bias",
276
+ "blocks.1.mlp.2.bias",
277
+ "blocks.1.adaLN_modulation.bias",
278
+ "blocks.2.norm1.weight",
279
+ "blocks.2.norm2.weight",
280
+ "blocks.2.mlp.0.bias",
281
+ "blocks.2.mlp.2.bias",
282
+ "blocks.2.adaLN_modulation.bias",
283
+ "output_layer.norm_final.weight",
284
+ "output_layer.adaLN_modulation.bias"
285
+ ],
286
+ "muon_effective_nesterov": false,
287
+ "muon_effective_width_scale": false,
288
+ "muon_effective_weight_decay": 0.1,
289
+ "muon_adam_fallback_nesterov": false,
290
+ "muon_adam_fallback_weight_decay": 0.1,
291
+ "ema_decay": 0.9999,
292
+ "ema_start_step": 0,
293
+ "model_type": "ddit",
294
+ "ddit_mlp_type": "gelu",
295
+ "elf_num_time_tokens": 4,
296
+ "elf_num_model_mode_tokens": 0,
297
+ "qk_norm": true,
298
+ "output_bias": false,
299
+ "output_init_std": -1.0,
300
+ "norm_type": "rmsnorm",
301
+ "target_loss": "hard_ce",
302
+ "linear_soft_target_power": 1.0,
303
+ "linear_soft_target_min_conf": 0.0,
304
+ "linear_soft_target_max_conf": 1.0,
305
+ "t_sampling_mode": "logit_normal",
306
+ "t_sampling_power": 1.0,
307
+ "t_sampling_eps": 0.0001,
308
+ "t_sampling_logit_mean": -1.5,
309
+ "t_sampling_logit_std": 0.8,
310
+ "dual_t": true,
311
+ "corrupt_t_mode": "same",
312
+ "corrupt_min_t": 0.0,
313
+ "corrupt_max_t": 1.0,
314
+ "prefix_block_prob": 0.0,
315
+ "prefix_block_len": 128,
316
+ "mask_ratio_floor_schedule": "none",
317
+ "dirichlet_endpoint_mode": "categorical_dual_t",
318
+ "dirichlet_semantic_t_mode": "same",
319
+ "dirichlet_semantic_t_value": 0.0,
320
+ "dirichlet_semantic_t_curve": "linear",
321
+ "dirichlet_semantic_t_power": 1.0,
322
+ "endpoint_sequence_random_prob_alpha": 0.0,
323
+ "categorical_wrong_from_full_vocab": true,
324
+ "categorical_wrong_from_batch_valid_tokens": false,
325
+ "categorical_wrong_basin_token_ids": "",
326
+ "categorical_wrong_basin_prob": 0.0,
327
+ "categorical_wrong_unigram_prob": 0.0,
328
+ "categorical_wrong_uniform_prob": 0.0,
329
+ "categorical_wrong_prob_floor": 0.0,
330
+ "categorical_wrong_corpus_unigram_path": "",
331
+ "categorical_wrong_corpus_unigram_alpha": 1.0,
332
+ "categorical_wrong_basin_shared_prob": 0.0,
333
+ "categorical_wrong_unigram_shared_prob": 0.0,
334
+ "mask_mixture_original_prob": 0.0,
335
+ "mask_mixture_lowk_prob": 0.0,
336
+ "mask_mixture_lowcorrupt_prob": 0.0,
337
+ "mask_mixture_block_prob": 0.0,
338
+ "mask_mixture_all_prob": 1.0,
339
+ "mask_mixture_lowk_clean_tokens": "0",
340
+ "mask_mixture_lowcorrupt_tokens": "1,2,4,8,16,32,64",
341
+ "mask_mixture_block_tokens": "64,128",
342
+ "simplex_bridge_sampler": "dirichlet",
343
+ "logistic_normal_sigma_min": 0.1,
344
+ "logistic_normal_sigma_max": 1.0,
345
+ "logistic_normal_tau_min": 1.0,
346
+ "logistic_normal_tau_max": 1.0,
347
+ "torch_compile": false,
348
+ "compile_mode": "max-autotune",
349
+ "state_format": "prob",
350
+ "meanflow_weight": 0.0,
351
+ "rollout_train_prob": 0.5,
352
+ "rollout_train_steps": 4,
353
+ "rollout_train_infer_steps": 1,
354
+ "rollout_train_time_mode": "sampled_path",
355
+ "rollout_train_s_dist": "uniform",
356
+ "rollout_train_s_min_frac": 0.0,
357
+ "rollout_train_s_max_frac": 0.125,
358
+ "rollout_train_s_beta_alpha": 2.0,
359
+ "rollout_train_s_beta_beta": 6.0,
360
+ "rollout_train_temp": 1.45,
361
+ "rollout_train_max_gamma": 1.0,
362
+ "rollout_train_corrupt_only": true,
363
+ "rollout_train_samplewise": true,
364
+ "rollout_train_compute_always": false,
365
+ "rollout_train_sync_t": true,
366
+ "bridge_noise_init": "logistic_normal",
367
+ "noise_sigma": -1.0,
368
+ "allow_tf32": true,
369
+ "activation_checkpointing": false,
370
+ "activation_checkpoint_interval": 1,
371
+ "activation_checkpoint_scope": "block",
372
+ "ddp_static_graph": false,
373
+ "ddp_gradient_as_bucket_view": true,
374
+ "blocking_data_transfer": false,
375
+ "dataloader_prefetch_factor": 4,
376
+ "full_train_stats": false,
377
+ "tokenized_hf": false,
378
+ "tokenized_pad_token": "pad",
379
+ "elf_conditional_hf": false,
380
+ "record_pad_truncate": false,
381
+ "record_add_eos": false,
382
+ "record_add_special_tokens": false,
383
+ "record_pad_token": "pad",
384
+ "record_shuffle_buffer": 10000,
385
+ "wrap": true,
386
+ "wrap_mode": "stream",
387
+ "wrap_record_buffer_size": 200,
388
+ "owt_cached_chunks": true,
389
+ "owt_chunk_cache_dir": "/e2e-data/evad-tech-vla/wanghan58/data/small_benchmarks/langflow_2604_11748/openwebtext_lta_cached_chunks/gpt2_len1024_train8_compact_overfit",
390
+ "owt_chunk_cache_rebuild": false,
391
+ "owt_chunk_cache_write_batch": 4096,
392
+ "owt_exact_repeat_per_chunk": 64,
393
+ "online_chunk_shuffle": false,
394
+ "online_chunk_shuffle_buffer": 10000,
395
+ "openwebtext_split": "train_minus_100k",
396
+ "detokenizer": "auto",
397
+ "resolved_detokenizer": null,
398
+ "num_workers": 0,
399
+ "latest_every": 1000,
400
+ "resume_path": "runs/train8_ctx1024_p50_path4_unif0_0p125_outwdm1_ctx1024_sampledpath_true_20260517_224139/latest.pt"
401
+ }
402
+ step=1100 epoch=1100/2000 epoch_step=1/1 micro_steps=1100 elapsed=24.6s lr=2.000000e-03 loss=2.4561 loss_recon=2.4561 loss_meanflow=0.0000 mean_model_t=0.2085 mean_corrupt_t=0.2085 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.5077 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.3788 corrupt_frac=1.0000 acc_corrupt=0.3788 loss_corrupt=2.4561 wrong_frac=0.7915 init_acc_corrupt=0.1281 acc_corrupt_t_0p0_0p2=0.1728 corrupt_frac_t_0p0_0p2=0.5640 acc_corrupt_t_0p2_0p4=0.6004 corrupt_frac_t_0p2_0p4=0.3466 acc_corrupt_t_0p4_0p6=0.8111 corrupt_frac_t_0p4_0p6=0.0791 acc_corrupt_t_0p6_0p8=0.8828 corrupt_frac_t_0p6_0p8=0.0136 out_w_norm=11.1283 out_g_norm=1.2622 acc_corrupt_t_0p8_1p0=0.9307 corrupt_frac_t_0p8_1p0=0.0078 loss_all=2.1052 init_gold_top10=0.4033 init_gold_top100=0.6222 rollout_applied_pos_frac=0.4453 init_acc_rollout_applied=0.1418 init_acc_rollout_kept=0.1192 logit_acc_rollout_applied=0.4907 logit_acc_rollout_kept=0.4183
403
+ step=1200 epoch=1200/2000 epoch_step=1/1 micro_steps=1200 elapsed=23.7s lr=2.000000e-03 loss=2.0694 loss_recon=2.0694 loss_meanflow=0.0000 mean_model_t=0.2096 mean_corrupt_t=0.2096 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.4995 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.4665 corrupt_frac=1.0000 acc_corrupt=0.4665 loss_corrupt=2.0694 wrong_frac=0.7905 init_acc_corrupt=0.1362 acc_corrupt_t_0p0_0p2=0.2250 corrupt_frac_t_0p0_0p2=0.5557 acc_corrupt_t_0p2_0p4=0.7365 corrupt_frac_t_0p2_0p4=0.3595 acc_corrupt_t_0p4_0p6=0.9014 corrupt_frac_t_0p4_0p6=0.0762 acc_corrupt_t_0p6_0p8=0.9335 corrupt_frac_t_0p6_0p8=0.0129 out_w_norm=11.3914 out_g_norm=1.4567 acc_corrupt_t_0p8_1p0=0.9663 corrupt_frac_t_0p8_1p0=0.0078 loss_all=1.9323 init_gold_top10=0.4233 init_gold_top100=0.6392 rollout_applied_pos_frac=0.4688 init_acc_rollout_applied=0.1810 init_acc_rollout_kept=0.1215 logit_acc_rollout_applied=0.5332 logit_acc_rollout_kept=0.4872
404
+ step=1300 epoch=1300/2000 epoch_step=1/1 micro_steps=1300 elapsed=23.7s lr=2.000000e-03 loss=1.7590 loss_recon=1.7590 loss_meanflow=0.0000 mean_model_t=0.2098 mean_corrupt_t=0.2098 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.5023 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.5421 corrupt_frac=1.0000 acc_corrupt=0.5421 loss_corrupt=1.7590 wrong_frac=0.7902 init_acc_corrupt=0.1490 acc_corrupt_t_0p0_0p2=0.2864 corrupt_frac_t_0p0_0p2=0.5544 acc_corrupt_t_0p2_0p4=0.8375 corrupt_frac_t_0p2_0p4=0.3617 acc_corrupt_t_0p4_0p6=0.9568 corrupt_frac_t_0p4_0p6=0.0743 out_w_norm=11.5756 out_g_norm=1.5131 acc_corrupt_t_0p6_0p8=0.9688 corrupt_frac_t_0p6_0p8=0.0139 acc_corrupt_t_0p8_1p0=0.9624 corrupt_frac_t_0p8_1p0=0.0078 loss_all=1.4657 init_gold_top10=0.4631 init_gold_top100=0.6198 rollout_applied_pos_frac=0.4375 init_acc_rollout_applied=0.1669 init_acc_rollout_kept=0.1243 logit_acc_rollout_applied=0.6838 logit_acc_rollout_kept=0.5458
405
+ step=1400 epoch=1400/2000 epoch_step=1/1 micro_steps=1400 elapsed=23.7s lr=2.000000e-03 loss=1.5170 loss_recon=1.5170 loss_meanflow=0.0000 mean_model_t=0.2072 mean_corrupt_t=0.2072 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.5030 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.6018 corrupt_frac=1.0000 acc_corrupt=0.6018 loss_corrupt=1.5170 wrong_frac=0.7929 init_acc_corrupt=0.1570 acc_corrupt_t_0p0_0p2=0.3584 corrupt_frac_t_0p0_0p2=0.5638 acc_corrupt_t_0p2_0p4=0.9009 corrupt_frac_t_0p2_0p4=0.3526 acc_corrupt_t_0p4_0p6=0.9811 corrupt_frac_t_0p4_0p6=0.0753 out_w_norm=11.6982 out_g_norm=1.5191 acc_corrupt_t_0p6_0p8=0.9840 corrupt_frac_t_0p6_0p8=0.0128 acc_corrupt_t_0p8_1p0=0.9805 corrupt_frac_t_0p8_1p0=0.0117 loss_all=1.3672 init_gold_top10=0.4951 init_gold_top100=0.6454 rollout_applied_pos_frac=0.4844 init_acc_rollout_applied=0.2290 init_acc_rollout_kept=0.1042 logit_acc_rollout_applied=0.7189 logit_acc_rollout_kept=0.5401
406
+ step=1500 epoch=1500/2000 epoch_step=1/1 micro_steps=1500 elapsed=23.7s lr=2.000000e-03 loss=1.2915 loss_recon=1.2915 loss_meanflow=0.0000 mean_model_t=0.2101 mean_corrupt_t=0.2101 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.4994 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.6540 corrupt_frac=1.0000 acc_corrupt=0.6540 loss_corrupt=1.2915 wrong_frac=0.7898 init_acc_corrupt=0.1734 acc_corrupt_t_0p0_0p2=0.4134 corrupt_frac_t_0p0_0p2=0.5506 acc_corrupt_t_0p2_0p4=0.9392 corrupt_frac_t_0p2_0p4=0.3660 acc_corrupt_t_0p4_0p6=0.9911 corrupt_frac_t_0p4_0p6=0.0752 acc_corrupt_t_0p6_0p8=0.9905 corrupt_frac_t_0p6_0p8=0.0128 out_w_norm=11.7810 out_g_norm=1.4652 acc_corrupt_t_0p8_1p0=0.9917 corrupt_frac_t_0p8_1p0=0.0078 loss_all=1.0025 init_gold_top10=0.5435 init_gold_top100=0.6617 rollout_applied_pos_frac=0.4922 init_acc_rollout_applied=0.2334 init_acc_rollout_kept=0.1486 logit_acc_rollout_applied=0.7939 logit_acc_rollout_kept=0.6810
407
+ step=1600 epoch=1600/2000 epoch_step=1/1 micro_steps=1600 elapsed=23.7s lr=2.000000e-03 loss=1.1774 loss_recon=1.1774 loss_meanflow=0.0000 mean_model_t=0.2082 mean_corrupt_t=0.2082 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.5037 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.6826 corrupt_frac=1.0000 acc_corrupt=0.6826 loss_corrupt=1.1774 wrong_frac=0.7918 init_acc_corrupt=0.1812 acc_corrupt_t_0p0_0p2=0.4607 corrupt_frac_t_0p0_0p2=0.5629 acc_corrupt_t_0p2_0p4=0.9616 corrupt_frac_t_0p2_0p4=0.3502 acc_corrupt_t_0p4_0p6=0.9956 corrupt_frac_t_0p4_0p6=0.0768 out_w_norm=11.8241 out_g_norm=1.3619 acc_corrupt_t_0p6_0p8=0.9946 corrupt_frac_t_0p6_0p8=0.0133 loss_all=1.1961 init_gold_top10=0.5235 init_gold_top100=0.6535 rollout_applied_pos_frac=0.4922 init_acc_rollout_applied=0.2526 init_acc_rollout_kept=0.1078 logit_acc_rollout_applied=0.8093 logit_acc_rollout_kept=0.5727
408
+ step=1700 epoch=1700/2000 epoch_step=1/1 micro_steps=1700 elapsed=23.8s lr=2.000000e-03 loss=1.0426 loss_recon=1.0426 loss_meanflow=0.0000 mean_model_t=0.2085 mean_corrupt_t=0.2085 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.5123 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.7140 corrupt_frac=1.0000 acc_corrupt=0.7140 loss_corrupt=1.0426 wrong_frac=0.7915 init_acc_corrupt=0.1897 acc_corrupt_t_0p0_0p2=0.5042 corrupt_frac_t_0p0_0p2=0.5591 acc_corrupt_t_0p2_0p4=0.9759 corrupt_frac_t_0p2_0p4=0.3563 acc_corrupt_t_0p4_0p6=0.9978 corrupt_frac_t_0p4_0p6=0.0764 out_w_norm=11.8438 out_g_norm=1.2704 acc_corrupt_t_0p6_0p8=0.9969 corrupt_frac_t_0p6_0p8=0.0125 acc_corrupt_t_0p8_1p0=0.9912 corrupt_frac_t_0p8_1p0=0.0078 loss_all=1.0589 init_gold_top10=0.5409 init_gold_top100=0.6740 rollout_applied_pos_frac=0.5156 init_acc_rollout_applied=0.2758 init_acc_rollout_kept=0.1214 logit_acc_rollout_applied=0.7361 logit_acc_rollout_kept=0.6505
409
+ step=1800 epoch=1800/2000 epoch_step=1/1 micro_steps=1800 elapsed=23.7s lr=2.000000e-03 loss=0.9326 loss_recon=0.9326 loss_meanflow=0.0000 mean_model_t=0.2092 mean_corrupt_t=0.2092 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.5020 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.7394 corrupt_frac=1.0000 acc_corrupt=0.7394 loss_corrupt=0.9326 wrong_frac=0.7911 init_acc_corrupt=0.1967 acc_corrupt_t_0p0_0p2=0.5387 corrupt_frac_t_0p0_0p2=0.5509 acc_corrupt_t_0p2_0p4=0.9829 corrupt_frac_t_0p2_0p4=0.3674 acc_corrupt_t_0p4_0p6=0.9988 corrupt_frac_t_0p4_0p6=0.0748 acc_corrupt_t_0p6_0p8=0.9967 corrupt_frac_t_0p6_0p8=0.0126 out_w_norm=11.8643 out_g_norm=1.1264 acc_corrupt_t_0p8_1p0=0.9866 corrupt_frac_t_0p8_1p0=0.0078 loss_all=0.7271 init_gold_top10=0.5306 init_gold_top100=0.6286 rollout_applied_pos_frac=0.4531 init_acc_rollout_applied=0.2711 init_acc_rollout_kept=0.1210 logit_acc_rollout_applied=0.8536 logit_acc_rollout_kept=0.7328
410
+ step=1900 epoch=1900/2000 epoch_step=1/1 micro_steps=1900 elapsed=23.8s lr=2.000000e-03 loss=0.8706 loss_recon=0.8706 loss_meanflow=0.0000 mean_model_t=0.2095 mean_corrupt_t=0.2095 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.5052 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.7491 corrupt_frac=1.0000 acc_corrupt=0.7491 loss_corrupt=0.8706 wrong_frac=0.7905 init_acc_corrupt=0.2009 acc_corrupt_t_0p0_0p2=0.5529 corrupt_frac_t_0p0_0p2=0.5516 acc_corrupt_t_0p2_0p4=0.9885 corrupt_frac_t_0p2_0p4=0.3645 acc_corrupt_t_0p4_0p6=0.9988 corrupt_frac_t_0p4_0p6=0.0748 acc_corrupt_t_0p6_0p8=0.9971 corrupt_frac_t_0p6_0p8=0.0123 out_w_norm=11.8558 out_g_norm=1.0997 acc_corrupt_t_0p8_1p0=0.9964 corrupt_frac_t_0p8_1p0=0.0078 loss_all=0.7654 init_gold_top10=0.5507 init_gold_top100=0.6459 rollout_applied_pos_frac=0.4766 init_acc_rollout_applied=0.2787 init_acc_rollout_kept=0.1170 logit_acc_rollout_applied=0.8564 logit_acc_rollout_kept=0.7075
411
+ step=2000 epoch=2000/2000 epoch_step=1/1 micro_steps=2000 elapsed=23.7s lr=2.000000e-03 loss=0.7684 loss_recon=0.7684 loss_meanflow=0.0000 mean_model_t=0.2077 mean_corrupt_t=0.2077 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.4952 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.7720 corrupt_frac=1.0000 acc_corrupt=0.7720 loss_corrupt=0.7684 wrong_frac=0.7924 init_acc_corrupt=0.1993 acc_corrupt_t_0p0_0p2=0.5971 corrupt_frac_t_0p0_0p2=0.5567 acc_corrupt_t_0p2_0p4=0.9900 corrupt_frac_t_0p2_0p4=0.3598 acc_corrupt_t_0p4_0p6=0.9990 corrupt_frac_t_0p4_0p6=0.0751 acc_corrupt_t_0p6_0p8=0.9969 corrupt_frac_t_0p6_0p8=0.0128 out_w_norm=11.8523 out_g_norm=1.1152 loss_all=0.7562 init_gold_top10=0.5871 init_gold_top100=0.6723 rollout_applied_pos_frac=0.5234 init_acc_rollout_applied=0.3057 init_acc_rollout_kept=0.0965 logit_acc_rollout_applied=0.8930 logit_acc_rollout_kept=0.6352
412
+ NCCL version 2.25.1+cuda12.8
413
+ resumed_from=runs/train8_ctx1024_p50_path4_unif0_0p125_outwdm1_ctx1024_sampledpath_true_20260517_224139/latest.pt start_step=2001
414
+ {
415
+ "device": "cuda:0",
416
+ "rank": 0,
417
+ "world_size": 4,
418
+ "samples": "owt_cached_chunks:8",
419
+ "vocab_size": 2664,
420
+ "tokenizer_vocab_size": 50257,
421
+ "save_dir": "runs/train8_ctx1024_p50_path4_unif0_0p125_outwdm1_ctx1024_sampledpath_true_20260517_224139",
422
+ "batch_size": 128,
423
+ "grad_accum": 1,
424
+ "effective_batch_size": 512,
425
+ "global_batch_size": 512,
426
+ "lr_schedule": "constant_warmup",
427
+ "optimizer": "muon",
428
+ "epochs": 0.0,
429
+ "steps_per_epoch": 1,
430
+ "total_steps": 3000,
431
+ "warmup_steps": 10,
432
+ "warmup_epochs": -1.0,
433
+ "min_lr": 0.0,
434
+ "weight_decay": 0.1,
435
+ "output_weight_decay": -1.0,
436
+ "adamw_param_groups": "nanogpt",
437
+ "adam_beta1": 0.9,
438
+ "adam_beta2": 0.95,
439
+ "adam_eps": 1e-08,
440
+ "muon_impl": "legacy",
441
+ "muon_momentum": 0.95,
442
+ "muon_ns_steps": 5,
443
+ "muon_update_scale": 1.0,
444
+ "muon_nesterov": false,
445
+ "muon_width_scale": false,
446
+ "muon_grouping": "legacy_dim_ge_2",
447
+ "muon_param_count": 2616320,
448
+ "muon_adam_param_count": 8192,
449
+ "muon_param_names": [
450
+ "vocab_embed.embedding",
451
+ "sigma_map.net.0.weight",
452
+ "sigma_map.net.2.weight",
453
+ "blocks.0.attn_qkv.weight",
454
+ "blocks.0.attn_out.weight",
455
+ "blocks.0.mlp.0.weight",
456
+ "blocks.0.mlp.2.weight",
457
+ "blocks.0.adaLN_modulation.weight",
458
+ "blocks.1.attn_qkv.weight",
459
+ "blocks.1.attn_out.weight",
460
+ "blocks.1.mlp.0.weight",
461
+ "blocks.1.mlp.2.weight",
462
+ "blocks.1.adaLN_modulation.weight",
463
+ "blocks.2.attn_qkv.weight",
464
+ "blocks.2.attn_out.weight",
465
+ "blocks.2.mlp.0.weight",
466
+ "blocks.2.mlp.2.weight",
467
+ "blocks.2.adaLN_modulation.weight",
468
+ "output_layer.linear.weight",
469
+ "output_layer.adaLN_modulation.weight"
470
+ ],
471
+ "muon_adam_param_names": [
472
+ "sigma_map.net.0.bias",
473
+ "sigma_map.net.2.bias",
474
+ "blocks.0.norm1.weight",
475
+ "blocks.0.norm2.weight",
476
+ "blocks.0.mlp.0.bias",
477
+ "blocks.0.mlp.2.bias",
478
+ "blocks.0.adaLN_modulation.bias",
479
+ "blocks.1.norm1.weight",
480
+ "blocks.1.norm2.weight",
481
+ "blocks.1.mlp.0.bias",
482
+ "blocks.1.mlp.2.bias",
483
+ "blocks.1.adaLN_modulation.bias",
484
+ "blocks.2.norm1.weight",
485
+ "blocks.2.norm2.weight",
486
+ "blocks.2.mlp.0.bias",
487
+ "blocks.2.mlp.2.bias",
488
+ "blocks.2.adaLN_modulation.bias",
489
+ "output_layer.norm_final.weight",
490
+ "output_layer.adaLN_modulation.bias"
491
+ ],
492
+ "muon_effective_nesterov": false,
493
+ "muon_effective_width_scale": false,
494
+ "muon_effective_weight_decay": 0.1,
495
+ "muon_adam_fallback_nesterov": false,
496
+ "muon_adam_fallback_weight_decay": 0.1,
497
+ "ema_decay": 0.9999,
498
+ "ema_start_step": 0,
499
+ "model_type": "ddit",
500
+ "ddit_mlp_type": "gelu",
501
+ "elf_num_time_tokens": 4,
502
+ "elf_num_model_mode_tokens": 0,
503
+ "qk_norm": true,
504
+ "output_bias": false,
505
+ "output_init_std": -1.0,
506
+ "norm_type": "rmsnorm",
507
+ "target_loss": "hard_ce",
508
+ "linear_soft_target_power": 1.0,
509
+ "linear_soft_target_min_conf": 0.0,
510
+ "linear_soft_target_max_conf": 1.0,
511
+ "t_sampling_mode": "logit_normal",
512
+ "t_sampling_power": 1.0,
513
+ "t_sampling_eps": 0.0001,
514
+ "t_sampling_logit_mean": -1.5,
515
+ "t_sampling_logit_std": 0.8,
516
+ "dual_t": true,
517
+ "corrupt_t_mode": "same",
518
+ "corrupt_min_t": 0.0,
519
+ "corrupt_max_t": 1.0,
520
+ "prefix_block_prob": 0.0,
521
+ "prefix_block_len": 128,
522
+ "mask_ratio_floor_schedule": "none",
523
+ "dirichlet_endpoint_mode": "categorical_dual_t",
524
+ "dirichlet_semantic_t_mode": "same",
525
+ "dirichlet_semantic_t_value": 0.0,
526
+ "dirichlet_semantic_t_curve": "linear",
527
+ "dirichlet_semantic_t_power": 1.0,
528
+ "endpoint_sequence_random_prob_alpha": 0.0,
529
+ "categorical_wrong_from_full_vocab": true,
530
+ "categorical_wrong_from_batch_valid_tokens": false,
531
+ "categorical_wrong_basin_token_ids": "",
532
+ "categorical_wrong_basin_prob": 0.0,
533
+ "categorical_wrong_unigram_prob": 0.0,
534
+ "categorical_wrong_uniform_prob": 0.0,
535
+ "categorical_wrong_prob_floor": 0.0,
536
+ "categorical_wrong_corpus_unigram_path": "",
537
+ "categorical_wrong_corpus_unigram_alpha": 1.0,
538
+ "categorical_wrong_basin_shared_prob": 0.0,
539
+ "categorical_wrong_unigram_shared_prob": 0.0,
540
+ "mask_mixture_original_prob": 0.0,
541
+ "mask_mixture_lowk_prob": 0.0,
542
+ "mask_mixture_lowcorrupt_prob": 0.0,
543
+ "mask_mixture_block_prob": 0.0,
544
+ "mask_mixture_all_prob": 1.0,
545
+ "mask_mixture_lowk_clean_tokens": "0",
546
+ "mask_mixture_lowcorrupt_tokens": "1,2,4,8,16,32,64",
547
+ "mask_mixture_block_tokens": "64,128",
548
+ "simplex_bridge_sampler": "dirichlet",
549
+ "logistic_normal_sigma_min": 0.1,
550
+ "logistic_normal_sigma_max": 1.0,
551
+ "logistic_normal_tau_min": 1.0,
552
+ "logistic_normal_tau_max": 1.0,
553
+ "torch_compile": false,
554
+ "compile_mode": "max-autotune",
555
+ "state_format": "prob",
556
+ "meanflow_weight": 0.0,
557
+ "rollout_train_prob": 0.5,
558
+ "rollout_train_steps": 4,
559
+ "rollout_train_infer_steps": 1,
560
+ "rollout_train_time_mode": "sampled_path",
561
+ "rollout_train_s_dist": "uniform",
562
+ "rollout_train_s_min_frac": 0.0,
563
+ "rollout_train_s_max_frac": 0.125,
564
+ "rollout_train_s_beta_alpha": 2.0,
565
+ "rollout_train_s_beta_beta": 6.0,
566
+ "rollout_train_temp": 1.45,
567
+ "rollout_train_max_gamma": 1.0,
568
+ "rollout_train_corrupt_only": true,
569
+ "rollout_train_samplewise": true,
570
+ "rollout_train_compute_always": false,
571
+ "rollout_train_sync_t": true,
572
+ "bridge_noise_init": "logistic_normal",
573
+ "noise_sigma": -1.0,
574
+ "allow_tf32": true,
575
+ "activation_checkpointing": false,
576
+ "activation_checkpoint_interval": 1,
577
+ "activation_checkpoint_scope": "block",
578
+ "ddp_static_graph": false,
579
+ "ddp_gradient_as_bucket_view": true,
580
+ "blocking_data_transfer": false,
581
+ "dataloader_prefetch_factor": 4,
582
+ "full_train_stats": false,
583
+ "tokenized_hf": false,
584
+ "tokenized_pad_token": "pad",
585
+ "elf_conditional_hf": false,
586
+ "record_pad_truncate": false,
587
+ "record_add_eos": false,
588
+ "record_add_special_tokens": false,
589
+ "record_pad_token": "pad",
590
+ "record_shuffle_buffer": 10000,
591
+ "wrap": true,
592
+ "wrap_mode": "stream",
593
+ "wrap_record_buffer_size": 200,
594
+ "owt_cached_chunks": true,
595
+ "owt_chunk_cache_dir": "/e2e-data/evad-tech-vla/wanghan58/data/small_benchmarks/langflow_2604_11748/openwebtext_lta_cached_chunks/gpt2_len1024_train8_compact_overfit",
596
+ "owt_chunk_cache_rebuild": false,
597
+ "owt_chunk_cache_write_batch": 4096,
598
+ "owt_exact_repeat_per_chunk": 64,
599
+ "online_chunk_shuffle": false,
600
+ "online_chunk_shuffle_buffer": 10000,
601
+ "openwebtext_split": "train_minus_100k",
602
+ "detokenizer": "auto",
603
+ "resolved_detokenizer": null,
604
+ "num_workers": 0,
605
+ "latest_every": 1000,
606
+ "resume_path": "runs/train8_ctx1024_p50_path4_unif0_0p125_outwdm1_ctx1024_sampledpath_true_20260517_224139/latest.pt"
607
+ }
608
+ step=2100 epoch=2100/3000 epoch_step=1/1 micro_steps=2100 elapsed=24.6s lr=2.000000e-03 loss=0.6727 loss_recon=0.6727 loss_meanflow=0.0000 mean_model_t=0.2085 mean_corrupt_t=0.2085 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.5077 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.7942 corrupt_frac=1.0000 acc_corrupt=0.7942 loss_corrupt=0.6727 wrong_frac=0.7915 init_acc_corrupt=0.2065 acc_corrupt_t_0p0_0p2=0.6391 corrupt_frac_t_0p0_0p2=0.5640 acc_corrupt_t_0p2_0p4=0.9938 corrupt_frac_t_0p2_0p4=0.3466 acc_corrupt_t_0p4_0p6=0.9993 corrupt_frac_t_0p4_0p6=0.0791 acc_corrupt_t_0p6_0p8=0.9977 corrupt_frac_t_0p6_0p8=0.0136 out_w_norm=11.8640 out_g_norm=1.0068 acc_corrupt_t_0p8_1p0=0.9895 corrupt_frac_t_0p8_1p0=0.0078 loss_all=0.5473 init_gold_top10=0.5368 init_gold_top100=0.6265 rollout_applied_pos_frac=0.4453 init_acc_rollout_applied=0.3061 init_acc_rollout_kept=0.1192 logit_acc_rollout_applied=0.8278 logit_acc_rollout_kept=0.8275
609
+ step=2200 epoch=2200/3000 epoch_step=1/1 micro_steps=2200 elapsed=23.6s lr=2.000000e-03 loss=0.5787 loss_recon=0.5787 loss_meanflow=0.0000 mean_model_t=0.2096 mean_corrupt_t=0.2096 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.4995 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.8214 corrupt_frac=1.0000 acc_corrupt=0.8214 loss_corrupt=0.5787 wrong_frac=0.7905 init_acc_corrupt=0.2095 acc_corrupt_t_0p0_0p2=0.6810 corrupt_frac_t_0p0_0p2=0.5557 acc_corrupt_t_0p2_0p4=0.9966 corrupt_frac_t_0p2_0p4=0.3595 acc_corrupt_t_0p4_0p6=0.9994 corrupt_frac_t_0p4_0p6=0.0762 acc_corrupt_t_0p6_0p8=0.9984 corrupt_frac_t_0p6_0p8=0.0129 out_w_norm=11.8582 out_g_norm=0.9122 acc_corrupt_t_0p8_1p0=0.9941 corrupt_frac_t_0p8_1p0=0.0078 loss_all=0.5341 init_gold_top10=0.5514 init_gold_top100=0.6421 rollout_applied_pos_frac=0.4688 init_acc_rollout_applied=0.2945 init_acc_rollout_kept=0.1215 logit_acc_rollout_applied=0.8562 logit_acc_rollout_kept=0.8293
LTA_openwebtext_dualt/logs/softendpoint_mn_pilot_4gpu/train8_ctx1024_p75_s4_i32_outwdm1_ctx1024_sweep_selected_20260517_210705.log ADDED
The diff for this file is too large to render. See raw diff
 
LTA_openwebtext_dualt/logs/softendpoint_mn_pilot_4gpu/train8_ctx1024_t5tok_p35_rand0_3_unif0_0p25_outwdm1_t5tok_ctx1024_k03_20260518_022728.log ADDED
@@ -0,0 +1,1034 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ NCCL version 2.25.1+cuda12.8
2
+ {
3
+ "device": "cuda:0",
4
+ "rank": 0,
5
+ "world_size": 4,
6
+ "samples": "owt_cached_chunks:8",
7
+ "vocab_size": 2423,
8
+ "tokenizer_vocab_size": 32100,
9
+ "save_dir": "runs/train8_ctx1024_t5tok_p35_rand0_3_unif0_0p25_outwdm1_t5tok_ctx1024_k03_20260518_022728",
10
+ "batch_size": 128,
11
+ "grad_accum": 1,
12
+ "effective_batch_size": 512,
13
+ "global_batch_size": 512,
14
+ "lr_schedule": "constant_warmup",
15
+ "optimizer": "muon",
16
+ "epochs": 0.0,
17
+ "steps_per_epoch": 1,
18
+ "total_steps": 1000,
19
+ "warmup_steps": 10,
20
+ "warmup_epochs": -1.0,
21
+ "min_lr": 0.0,
22
+ "weight_decay": 0.1,
23
+ "output_weight_decay": -1.0,
24
+ "adamw_param_groups": "nanogpt",
25
+ "adam_beta1": 0.9,
26
+ "adam_beta2": 0.95,
27
+ "adam_eps": 1e-08,
28
+ "muon_impl": "legacy",
29
+ "muon_momentum": 0.95,
30
+ "muon_ns_steps": 5,
31
+ "muon_update_scale": 1.0,
32
+ "muon_nesterov": false,
33
+ "muon_width_scale": false,
34
+ "muon_grouping": "legacy_dim_ge_2",
35
+ "muon_param_count": 2523776,
36
+ "muon_adam_param_count": 8192,
37
+ "muon_param_names": [
38
+ "vocab_embed.embedding",
39
+ "sigma_map.net.0.weight",
40
+ "sigma_map.net.2.weight",
41
+ "blocks.0.attn_qkv.weight",
42
+ "blocks.0.attn_out.weight",
43
+ "blocks.0.mlp.0.weight",
44
+ "blocks.0.mlp.2.weight",
45
+ "blocks.0.adaLN_modulation.weight",
46
+ "blocks.1.attn_qkv.weight",
47
+ "blocks.1.attn_out.weight",
48
+ "blocks.1.mlp.0.weight",
49
+ "blocks.1.mlp.2.weight",
50
+ "blocks.1.adaLN_modulation.weight",
51
+ "blocks.2.attn_qkv.weight",
52
+ "blocks.2.attn_out.weight",
53
+ "blocks.2.mlp.0.weight",
54
+ "blocks.2.mlp.2.weight",
55
+ "blocks.2.adaLN_modulation.weight",
56
+ "output_layer.linear.weight",
57
+ "output_layer.adaLN_modulation.weight"
58
+ ],
59
+ "muon_adam_param_names": [
60
+ "sigma_map.net.0.bias",
61
+ "sigma_map.net.2.bias",
62
+ "blocks.0.norm1.weight",
63
+ "blocks.0.norm2.weight",
64
+ "blocks.0.mlp.0.bias",
65
+ "blocks.0.mlp.2.bias",
66
+ "blocks.0.adaLN_modulation.bias",
67
+ "blocks.1.norm1.weight",
68
+ "blocks.1.norm2.weight",
69
+ "blocks.1.mlp.0.bias",
70
+ "blocks.1.mlp.2.bias",
71
+ "blocks.1.adaLN_modulation.bias",
72
+ "blocks.2.norm1.weight",
73
+ "blocks.2.norm2.weight",
74
+ "blocks.2.mlp.0.bias",
75
+ "blocks.2.mlp.2.bias",
76
+ "blocks.2.adaLN_modulation.bias",
77
+ "output_layer.norm_final.weight",
78
+ "output_layer.adaLN_modulation.bias"
79
+ ],
80
+ "muon_effective_nesterov": false,
81
+ "muon_effective_width_scale": false,
82
+ "muon_effective_weight_decay": 0.1,
83
+ "muon_adam_fallback_nesterov": false,
84
+ "muon_adam_fallback_weight_decay": 0.1,
85
+ "ema_decay": 0.9999,
86
+ "ema_start_step": 0,
87
+ "model_type": "ddit",
88
+ "ddit_mlp_type": "gelu",
89
+ "elf_num_time_tokens": 4,
90
+ "elf_num_model_mode_tokens": 0,
91
+ "qk_norm": true,
92
+ "output_bias": false,
93
+ "output_init_std": -1.0,
94
+ "norm_type": "rmsnorm",
95
+ "target_loss": "hard_ce",
96
+ "linear_soft_target_power": 1.0,
97
+ "linear_soft_target_min_conf": 0.0,
98
+ "linear_soft_target_max_conf": 1.0,
99
+ "t_sampling_mode": "uniform",
100
+ "t_sampling_power": 1.0,
101
+ "t_sampling_eps": 0.0001,
102
+ "t_sampling_logit_mean": -1.5,
103
+ "t_sampling_logit_std": 0.8,
104
+ "dual_t": true,
105
+ "corrupt_t_mode": "same",
106
+ "corrupt_min_t": 0.0,
107
+ "corrupt_max_t": 1.0,
108
+ "prefix_block_prob": 0.0,
109
+ "prefix_block_len": 128,
110
+ "mask_ratio_floor_schedule": "none",
111
+ "dirichlet_endpoint_mode": "categorical_dual_t",
112
+ "dirichlet_semantic_t_mode": "same",
113
+ "dirichlet_semantic_t_value": 0.0,
114
+ "dirichlet_semantic_t_curve": "linear",
115
+ "dirichlet_semantic_t_power": 1.0,
116
+ "endpoint_sequence_random_prob_alpha": 0.0,
117
+ "categorical_wrong_from_full_vocab": true,
118
+ "categorical_wrong_from_batch_valid_tokens": false,
119
+ "categorical_wrong_basin_token_ids": "",
120
+ "categorical_wrong_basin_prob": 0.0,
121
+ "categorical_wrong_unigram_prob": 0.0,
122
+ "categorical_wrong_uniform_prob": 0.0,
123
+ "categorical_wrong_prob_floor": 0.0,
124
+ "categorical_wrong_corpus_unigram_path": "",
125
+ "categorical_wrong_corpus_unigram_alpha": 1.0,
126
+ "categorical_wrong_basin_shared_prob": 0.0,
127
+ "categorical_wrong_unigram_shared_prob": 0.0,
128
+ "mask_mixture_original_prob": 0.0,
129
+ "mask_mixture_lowk_prob": 0.0,
130
+ "mask_mixture_lowcorrupt_prob": 0.0,
131
+ "mask_mixture_block_prob": 0.0,
132
+ "mask_mixture_all_prob": 1.0,
133
+ "mask_mixture_lowk_clean_tokens": "0",
134
+ "mask_mixture_lowcorrupt_tokens": "1,2,4,8,16,32,64",
135
+ "mask_mixture_block_tokens": "64,128",
136
+ "simplex_bridge_sampler": "dirichlet",
137
+ "logistic_normal_sigma_min": 0.1,
138
+ "logistic_normal_sigma_max": 1.0,
139
+ "logistic_normal_tau_min": 1.0,
140
+ "logistic_normal_tau_max": 1.0,
141
+ "torch_compile": false,
142
+ "compile_mode": "max-autotune",
143
+ "state_format": "prob",
144
+ "meanflow_weight": 0.0,
145
+ "rollout_train_prob": 0.35,
146
+ "rollout_train_steps": 3,
147
+ "rollout_train_steps_min": 0,
148
+ "rollout_train_infer_steps": 1,
149
+ "rollout_train_time_mode": "sampled_path",
150
+ "rollout_train_s_dist": "uniform",
151
+ "rollout_train_s_min_frac": 0.0,
152
+ "rollout_train_s_max_frac": 0.25,
153
+ "rollout_train_s_beta_alpha": 2.0,
154
+ "rollout_train_s_beta_beta": 6.0,
155
+ "rollout_train_temp": 1.0,
156
+ "rollout_train_max_gamma": 1.0,
157
+ "rollout_train_corrupt_only": true,
158
+ "rollout_train_samplewise": true,
159
+ "rollout_train_compute_always": false,
160
+ "rollout_train_sync_t": true,
161
+ "bridge_noise_init": "logistic_normal",
162
+ "noise_sigma": -1.0,
163
+ "allow_tf32": true,
164
+ "activation_checkpointing": false,
165
+ "activation_checkpoint_interval": 1,
166
+ "activation_checkpoint_scope": "block",
167
+ "ddp_static_graph": false,
168
+ "ddp_gradient_as_bucket_view": true,
169
+ "blocking_data_transfer": false,
170
+ "dataloader_prefetch_factor": 4,
171
+ "full_train_stats": false,
172
+ "tokenized_hf": false,
173
+ "tokenized_pad_token": "pad",
174
+ "elf_conditional_hf": false,
175
+ "record_pad_truncate": false,
176
+ "record_add_eos": false,
177
+ "record_add_special_tokens": false,
178
+ "record_pad_token": "pad",
179
+ "record_shuffle_buffer": 10000,
180
+ "wrap": true,
181
+ "wrap_mode": "stream",
182
+ "wrap_record_buffer_size": 200,
183
+ "owt_cached_chunks": true,
184
+ "owt_chunk_cache_dir": "/e2e-data/evad-tech-vla/wanghan58/data/small_benchmarks/langflow_2604_11748/openwebtext_lta_cached_chunks/t5_len1024_train8_compact_overfit",
185
+ "owt_chunk_cache_rebuild": false,
186
+ "owt_chunk_cache_write_batch": 4096,
187
+ "owt_exact_repeat_per_chunk": 64,
188
+ "online_chunk_shuffle": false,
189
+ "online_chunk_shuffle_buffer": 10000,
190
+ "openwebtext_split": "train_minus_100k",
191
+ "detokenizer": "auto",
192
+ "resolved_detokenizer": null,
193
+ "num_workers": 0,
194
+ "latest_every": 1000,
195
+ "resume_path": ""
196
+ }
197
+ step=100 epoch=100/1000 epoch_step=1/1 micro_steps=100 elapsed=19.8s lr=2.000000e-03 loss=7.3417 loss_recon=7.3417 loss_meanflow=0.0000 mean_model_t=0.4971 mean_corrupt_t=0.4971 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.3513 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.3284 corrupt_frac=1.0000 acc_corrupt=0.3284 loss_corrupt=7.3417 wrong_frac=0.5028 init_acc_corrupt=0.4627 acc_corrupt_t_0p0_0p2=0.0466 corrupt_frac_t_0p0_0p2=0.2071 acc_corrupt_t_0p2_0p4=0.1614 corrupt_frac_t_0p2_0p4=0.1988 acc_corrupt_t_0p4_0p6=0.3279 corrupt_frac_t_0p4_0p6=0.1976 acc_corrupt_t_0p6_0p8=0.4813 corrupt_frac_t_0p6_0p8=0.1951 acc_corrupt_t_0p8_1p0=0.6352 corrupt_frac_t_0p8_1p0=0.2015 out_w_norm=1.0922 out_g_norm=1.0102 loss_all=6.7794 init_gold_top10=0.4656 init_gold_top100=0.6040 rollout_applied_pos_frac=0.3516 init_acc_rollout_applied=0.3928 init_acc_rollout_kept=0.4393 logit_acc_rollout_applied=0.2685 logit_acc_rollout_kept=0.2938
198
+ step=200 epoch=200/1000 epoch_step=1/1 micro_steps=200 elapsed=19.0s lr=2.000000e-03 loss=5.8034 loss_recon=5.8034 loss_meanflow=0.0000 mean_model_t=0.5015 mean_corrupt_t=0.5015 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.3505 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.3299 corrupt_frac=1.0000 acc_corrupt=0.3299 loss_corrupt=5.8034 wrong_frac=0.4984 init_acc_corrupt=0.4676 acc_corrupt_t_0p0_0p2=0.0524 corrupt_frac_t_0p0_0p2=0.2005 acc_corrupt_t_0p2_0p4=0.1630 corrupt_frac_t_0p2_0p4=0.1979 acc_corrupt_t_0p4_0p6=0.3295 corrupt_frac_t_0p4_0p6=0.1994 acc_corrupt_t_0p6_0p8=0.4754 corrupt_frac_t_0p6_0p8=0.2007 acc_corrupt_t_0p8_1p0=0.6253 corrupt_frac_t_0p8_1p0=0.2015 out_w_norm=3.4952 out_g_norm=1.3311 loss_all=5.0183 init_gold_top10=0.5079 init_gold_top100=0.6301 rollout_applied_pos_frac=0.3594 init_acc_rollout_applied=0.4755 init_acc_rollout_kept=0.4824 logit_acc_rollout_applied=0.3469 logit_acc_rollout_kept=0.3570
199
+ step=300 epoch=300/1000 epoch_step=1/1 micro_steps=300 elapsed=19.0s lr=2.000000e-03 loss=4.7182 loss_recon=4.7182 loss_meanflow=0.0000 mean_model_t=0.5015 mean_corrupt_t=0.5015 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.3535 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.3676 corrupt_frac=1.0000 acc_corrupt=0.3676 loss_corrupt=4.7182 wrong_frac=0.4985 init_acc_corrupt=0.4685 acc_corrupt_t_0p0_0p2=0.0557 corrupt_frac_t_0p0_0p2=0.1947 acc_corrupt_t_0p2_0p4=0.1898 corrupt_frac_t_0p2_0p4=0.2030 acc_corrupt_t_0p4_0p6=0.3639 corrupt_frac_t_0p4_0p6=0.1991 acc_corrupt_t_0p6_0p8=0.5242 corrupt_frac_t_0p6_0p8=0.2045 acc_corrupt_t_0p8_1p0=0.6974 corrupt_frac_t_0p8_1p0=0.1988 out_w_norm=5.5797 out_g_norm=0.5500 loss_all=4.3883 init_gold_top10=0.5102 init_gold_top100=0.6393 rollout_applied_pos_frac=0.4062 init_acc_rollout_applied=0.5089 init_acc_rollout_kept=0.4471 logit_acc_rollout_applied=0.4133 logit_acc_rollout_kept=0.3762
200
+ step=400 epoch=400/1000 epoch_step=1/1 micro_steps=400 elapsed=19.0s lr=2.000000e-03 loss=4.1225 loss_recon=4.1225 loss_meanflow=0.0000 mean_model_t=0.4986 mean_corrupt_t=0.4986 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.3515 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.4225 corrupt_frac=1.0000 acc_corrupt=0.4225 loss_corrupt=4.1225 wrong_frac=0.5016 init_acc_corrupt=0.4649 acc_corrupt_t_0p0_0p2=0.0583 corrupt_frac_t_0p0_0p2=0.2005 acc_corrupt_t_0p2_0p4=0.2095 corrupt_frac_t_0p2_0p4=0.2001 acc_corrupt_t_0p4_0p6=0.4200 corrupt_frac_t_0p4_0p6=0.2023 acc_corrupt_t_0p6_0p8=0.6151 corrupt_frac_t_0p6_0p8=0.1992 acc_corrupt_t_0p8_1p0=0.8157 corrupt_frac_t_0p8_1p0=0.1979 out_w_norm=7.1167 out_g_norm=0.2749 loss_all=3.9532 init_gold_top10=0.4850 init_gold_top100=0.6328 rollout_applied_pos_frac=0.3281 init_acc_rollout_applied=0.4657 init_acc_rollout_kept=0.4320 logit_acc_rollout_applied=0.4583 logit_acc_rollout_kept=0.4408
201
+ step=500 epoch=500/1000 epoch_step=1/1 micro_steps=500 elapsed=19.0s lr=2.000000e-03 loss=3.5466 loss_recon=3.5466 loss_meanflow=0.0000 mean_model_t=0.4977 mean_corrupt_t=0.4977 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.3470 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.4823 corrupt_frac=1.0000 acc_corrupt=0.4823 loss_corrupt=3.5466 wrong_frac=0.5023 init_acc_corrupt=0.4642 acc_corrupt_t_0p0_0p2=0.0594 corrupt_frac_t_0p0_0p2=0.2059 acc_corrupt_t_0p2_0p4=0.2412 corrupt_frac_t_0p2_0p4=0.1985 acc_corrupt_t_0p4_0p6=0.5048 corrupt_frac_t_0p4_0p6=0.1969 acc_corrupt_t_0p6_0p8=0.7094 corrupt_frac_t_0p6_0p8=0.1927 acc_corrupt_t_0p8_1p0=0.9033 corrupt_frac_t_0p8_1p0=0.2059 out_w_norm=8.4596 out_g_norm=0.2351 loss_all=3.3523 init_gold_top10=0.4858 init_gold_top100=0.6220 rollout_applied_pos_frac=0.3438 init_acc_rollout_applied=0.4617 init_acc_rollout_kept=0.4433 logit_acc_rollout_applied=0.4844 logit_acc_rollout_kept=0.4697
202
+ step=600 epoch=600/1000 epoch_step=1/1 micro_steps=600 elapsed=19.1s lr=2.000000e-03 loss=3.0821 loss_recon=3.0821 loss_meanflow=0.0000 mean_model_t=0.5012 mean_corrupt_t=0.5012 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.3562 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.4963 corrupt_frac=1.0000 acc_corrupt=0.4963 loss_corrupt=3.0821 wrong_frac=0.4987 init_acc_corrupt=0.4682 acc_corrupt_t_0p0_0p2=0.0622 corrupt_frac_t_0p0_0p2=0.1972 acc_corrupt_t_0p2_0p4=0.2700 corrupt_frac_t_0p2_0p4=0.2024 acc_corrupt_t_0p4_0p6=0.5248 corrupt_frac_t_0p4_0p6=0.1990 acc_corrupt_t_0p6_0p8=0.7152 corrupt_frac_t_0p6_0p8=0.2017 acc_corrupt_t_0p8_1p0=0.9049 corrupt_frac_t_0p8_1p0=0.1997 out_w_norm=9.7303 out_g_norm=0.2534 loss_all=2.4394 init_gold_top10=0.5858 init_gold_top100=0.7010 rollout_applied_pos_frac=0.3438 init_acc_rollout_applied=0.5050 init_acc_rollout_kept=0.5727 logit_acc_rollout_applied=0.5326 logit_acc_rollout_kept=0.6010
203
+ step=700 epoch=700/1000 epoch_step=1/1 micro_steps=700 elapsed=18.9s lr=2.000000e-03 loss=2.7681 loss_recon=2.7681 loss_meanflow=0.0000 mean_model_t=0.5008 mean_corrupt_t=0.5008 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.3412 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.5034 corrupt_frac=1.0000 acc_corrupt=0.5034 loss_corrupt=2.7681 wrong_frac=0.4994 init_acc_corrupt=0.4679 acc_corrupt_t_0p0_0p2=0.0635 corrupt_frac_t_0p0_0p2=0.2030 acc_corrupt_t_0p2_0p4=0.2807 corrupt_frac_t_0p2_0p4=0.1915 acc_corrupt_t_0p4_0p6=0.5331 corrupt_frac_t_0p4_0p6=0.2023 acc_corrupt_t_0p6_0p8=0.7234 corrupt_frac_t_0p6_0p8=0.2001 acc_corrupt_t_0p8_1p0=0.9066 corrupt_frac_t_0p8_1p0=0.2032 out_w_norm=10.7428 out_g_norm=0.2922 loss_all=2.5966 init_gold_top10=0.5189 init_gold_top100=0.6915 rollout_applied_pos_frac=0.3828 init_acc_rollout_applied=0.4203 init_acc_rollout_kept=0.5121 logit_acc_rollout_applied=0.4614 logit_acc_rollout_kept=0.5496
204
+ step=800 epoch=800/1000 epoch_step=1/1 micro_steps=800 elapsed=19.0s lr=2.000000e-03 loss=2.3453 loss_recon=2.3453 loss_meanflow=0.0000 mean_model_t=0.4962 mean_corrupt_t=0.4962 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.3430 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.5308 corrupt_frac=1.0000 acc_corrupt=0.5308 loss_corrupt=2.3453 wrong_frac=0.5037 init_acc_corrupt=0.4633 acc_corrupt_t_0p0_0p2=0.0631 corrupt_frac_t_0p0_0p2=0.2026 acc_corrupt_t_0p2_0p4=0.3065 corrupt_frac_t_0p2_0p4=0.2020 acc_corrupt_t_0p4_0p6=0.5891 corrupt_frac_t_0p4_0p6=0.2005 acc_corrupt_t_0p6_0p8=0.7800 corrupt_frac_t_0p6_0p8=0.1965 acc_corrupt_t_0p8_1p0=0.9311 corrupt_frac_t_0p8_1p0=0.1984 out_w_norm=11.2908 out_g_norm=0.3703 loss_all=2.2007 init_gold_top10=0.5154 init_gold_top100=0.6697 rollout_applied_pos_frac=0.3047 init_acc_rollout_applied=0.4141 init_acc_rollout_kept=0.4622 logit_acc_rollout_applied=0.5173 logit_acc_rollout_kept=0.5575
205
+ step=900 epoch=900/1000 epoch_step=1/1 micro_steps=900 elapsed=19.1s lr=2.000000e-03 loss=1.8132 loss_recon=1.8132 loss_meanflow=0.0000 mean_model_t=0.5040 mean_corrupt_t=0.5040 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.3546 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.6146 corrupt_frac=1.0000 acc_corrupt=0.6146 loss_corrupt=1.8132 wrong_frac=0.4960 init_acc_corrupt=0.4740 acc_corrupt_t_0p0_0p2=0.0648 corrupt_frac_t_0p0_0p2=0.1931 acc_corrupt_t_0p2_0p4=0.3780 corrupt_frac_t_0p2_0p4=0.2013 acc_corrupt_t_0p4_0p6=0.7364 corrupt_frac_t_0p4_0p6=0.2019 acc_corrupt_t_0p6_0p8=0.8940 corrupt_frac_t_0p6_0p8=0.1981 acc_corrupt_t_0p8_1p0=0.9738 corrupt_frac_t_0p8_1p0=0.2056 out_w_norm=11.7769 out_g_norm=0.4581 loss_all=1.5519 init_gold_top10=0.5837 init_gold_top100=0.7187 rollout_applied_pos_frac=0.3125 init_acc_rollout_applied=0.4619 init_acc_rollout_kept=0.5155 logit_acc_rollout_applied=0.6272 logit_acc_rollout_kept=0.6871
206
+ step=1000 epoch=1000/1000 epoch_step=1/1 micro_steps=1000 elapsed=19.0s lr=2.000000e-03 loss=1.5254 loss_recon=1.5254 loss_meanflow=0.0000 mean_model_t=0.5008 mean_corrupt_t=0.5008 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.3534 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.6756 corrupt_frac=1.0000 acc_corrupt=0.6756 loss_corrupt=1.5254 wrong_frac=0.4993 init_acc_corrupt=0.4741 acc_corrupt_t_0p0_0p2=0.0694 corrupt_frac_t_0p0_0p2=0.2034 acc_corrupt_t_0p2_0p4=0.4889 corrupt_frac_t_0p2_0p4=0.1967 acc_corrupt_t_0p4_0p6=0.8656 corrupt_frac_t_0p4_0p6=0.2010 acc_corrupt_t_0p6_0p8=0.9668 corrupt_frac_t_0p6_0p8=0.1933 acc_corrupt_t_0p8_1p0=0.9947 corrupt_frac_t_0p8_1p0=0.2055 out_w_norm=12.1609 out_g_norm=0.5459 loss_all=1.5778 init_gold_top10=0.5685 init_gold_top100=0.7045 rollout_applied_pos_frac=0.4141 init_acc_rollout_applied=0.5418 init_acc_rollout_kept=0.4381 logit_acc_rollout_applied=0.7332 logit_acc_rollout_kept=0.6367
207
+ NCCL version 2.25.1+cuda12.8
208
+ resumed_from=runs/train8_ctx1024_t5tok_p35_rand0_3_unif0_0p25_outwdm1_t5tok_ctx1024_k03_20260518_022728/latest.pt start_step=1001
209
+ {
210
+ "device": "cuda:0",
211
+ "rank": 0,
212
+ "world_size": 4,
213
+ "samples": "owt_cached_chunks:8",
214
+ "vocab_size": 2423,
215
+ "tokenizer_vocab_size": 32100,
216
+ "save_dir": "runs/train8_ctx1024_t5tok_p35_rand0_3_unif0_0p25_outwdm1_t5tok_ctx1024_k03_20260518_022728",
217
+ "batch_size": 128,
218
+ "grad_accum": 1,
219
+ "effective_batch_size": 512,
220
+ "global_batch_size": 512,
221
+ "lr_schedule": "constant_warmup",
222
+ "optimizer": "muon",
223
+ "epochs": 0.0,
224
+ "steps_per_epoch": 1,
225
+ "total_steps": 2000,
226
+ "warmup_steps": 10,
227
+ "warmup_epochs": -1.0,
228
+ "min_lr": 0.0,
229
+ "weight_decay": 0.1,
230
+ "output_weight_decay": -1.0,
231
+ "adamw_param_groups": "nanogpt",
232
+ "adam_beta1": 0.9,
233
+ "adam_beta2": 0.95,
234
+ "adam_eps": 1e-08,
235
+ "muon_impl": "legacy",
236
+ "muon_momentum": 0.95,
237
+ "muon_ns_steps": 5,
238
+ "muon_update_scale": 1.0,
239
+ "muon_nesterov": false,
240
+ "muon_width_scale": false,
241
+ "muon_grouping": "legacy_dim_ge_2",
242
+ "muon_param_count": 2523776,
243
+ "muon_adam_param_count": 8192,
244
+ "muon_param_names": [
245
+ "vocab_embed.embedding",
246
+ "sigma_map.net.0.weight",
247
+ "sigma_map.net.2.weight",
248
+ "blocks.0.attn_qkv.weight",
249
+ "blocks.0.attn_out.weight",
250
+ "blocks.0.mlp.0.weight",
251
+ "blocks.0.mlp.2.weight",
252
+ "blocks.0.adaLN_modulation.weight",
253
+ "blocks.1.attn_qkv.weight",
254
+ "blocks.1.attn_out.weight",
255
+ "blocks.1.mlp.0.weight",
256
+ "blocks.1.mlp.2.weight",
257
+ "blocks.1.adaLN_modulation.weight",
258
+ "blocks.2.attn_qkv.weight",
259
+ "blocks.2.attn_out.weight",
260
+ "blocks.2.mlp.0.weight",
261
+ "blocks.2.mlp.2.weight",
262
+ "blocks.2.adaLN_modulation.weight",
263
+ "output_layer.linear.weight",
264
+ "output_layer.adaLN_modulation.weight"
265
+ ],
266
+ "muon_adam_param_names": [
267
+ "sigma_map.net.0.bias",
268
+ "sigma_map.net.2.bias",
269
+ "blocks.0.norm1.weight",
270
+ "blocks.0.norm2.weight",
271
+ "blocks.0.mlp.0.bias",
272
+ "blocks.0.mlp.2.bias",
273
+ "blocks.0.adaLN_modulation.bias",
274
+ "blocks.1.norm1.weight",
275
+ "blocks.1.norm2.weight",
276
+ "blocks.1.mlp.0.bias",
277
+ "blocks.1.mlp.2.bias",
278
+ "blocks.1.adaLN_modulation.bias",
279
+ "blocks.2.norm1.weight",
280
+ "blocks.2.norm2.weight",
281
+ "blocks.2.mlp.0.bias",
282
+ "blocks.2.mlp.2.bias",
283
+ "blocks.2.adaLN_modulation.bias",
284
+ "output_layer.norm_final.weight",
285
+ "output_layer.adaLN_modulation.bias"
286
+ ],
287
+ "muon_effective_nesterov": false,
288
+ "muon_effective_width_scale": false,
289
+ "muon_effective_weight_decay": 0.1,
290
+ "muon_adam_fallback_nesterov": false,
291
+ "muon_adam_fallback_weight_decay": 0.1,
292
+ "ema_decay": 0.9999,
293
+ "ema_start_step": 0,
294
+ "model_type": "ddit",
295
+ "ddit_mlp_type": "gelu",
296
+ "elf_num_time_tokens": 4,
297
+ "elf_num_model_mode_tokens": 0,
298
+ "qk_norm": true,
299
+ "output_bias": false,
300
+ "output_init_std": -1.0,
301
+ "norm_type": "rmsnorm",
302
+ "target_loss": "hard_ce",
303
+ "linear_soft_target_power": 1.0,
304
+ "linear_soft_target_min_conf": 0.0,
305
+ "linear_soft_target_max_conf": 1.0,
306
+ "t_sampling_mode": "uniform",
307
+ "t_sampling_power": 1.0,
308
+ "t_sampling_eps": 0.0001,
309
+ "t_sampling_logit_mean": -1.5,
310
+ "t_sampling_logit_std": 0.8,
311
+ "dual_t": true,
312
+ "corrupt_t_mode": "same",
313
+ "corrupt_min_t": 0.0,
314
+ "corrupt_max_t": 1.0,
315
+ "prefix_block_prob": 0.0,
316
+ "prefix_block_len": 128,
317
+ "mask_ratio_floor_schedule": "none",
318
+ "dirichlet_endpoint_mode": "categorical_dual_t",
319
+ "dirichlet_semantic_t_mode": "same",
320
+ "dirichlet_semantic_t_value": 0.0,
321
+ "dirichlet_semantic_t_curve": "linear",
322
+ "dirichlet_semantic_t_power": 1.0,
323
+ "endpoint_sequence_random_prob_alpha": 0.0,
324
+ "categorical_wrong_from_full_vocab": true,
325
+ "categorical_wrong_from_batch_valid_tokens": false,
326
+ "categorical_wrong_basin_token_ids": "",
327
+ "categorical_wrong_basin_prob": 0.0,
328
+ "categorical_wrong_unigram_prob": 0.0,
329
+ "categorical_wrong_uniform_prob": 0.0,
330
+ "categorical_wrong_prob_floor": 0.0,
331
+ "categorical_wrong_corpus_unigram_path": "",
332
+ "categorical_wrong_corpus_unigram_alpha": 1.0,
333
+ "categorical_wrong_basin_shared_prob": 0.0,
334
+ "categorical_wrong_unigram_shared_prob": 0.0,
335
+ "mask_mixture_original_prob": 0.0,
336
+ "mask_mixture_lowk_prob": 0.0,
337
+ "mask_mixture_lowcorrupt_prob": 0.0,
338
+ "mask_mixture_block_prob": 0.0,
339
+ "mask_mixture_all_prob": 1.0,
340
+ "mask_mixture_lowk_clean_tokens": "0",
341
+ "mask_mixture_lowcorrupt_tokens": "1,2,4,8,16,32,64",
342
+ "mask_mixture_block_tokens": "64,128",
343
+ "simplex_bridge_sampler": "dirichlet",
344
+ "logistic_normal_sigma_min": 0.1,
345
+ "logistic_normal_sigma_max": 1.0,
346
+ "logistic_normal_tau_min": 1.0,
347
+ "logistic_normal_tau_max": 1.0,
348
+ "torch_compile": false,
349
+ "compile_mode": "max-autotune",
350
+ "state_format": "prob",
351
+ "meanflow_weight": 0.0,
352
+ "rollout_train_prob": 0.35,
353
+ "rollout_train_steps": 3,
354
+ "rollout_train_steps_min": 0,
355
+ "rollout_train_infer_steps": 1,
356
+ "rollout_train_time_mode": "sampled_path",
357
+ "rollout_train_s_dist": "uniform",
358
+ "rollout_train_s_min_frac": 0.0,
359
+ "rollout_train_s_max_frac": 0.25,
360
+ "rollout_train_s_beta_alpha": 2.0,
361
+ "rollout_train_s_beta_beta": 6.0,
362
+ "rollout_train_temp": 1.0,
363
+ "rollout_train_max_gamma": 1.0,
364
+ "rollout_train_corrupt_only": true,
365
+ "rollout_train_samplewise": true,
366
+ "rollout_train_compute_always": false,
367
+ "rollout_train_sync_t": true,
368
+ "bridge_noise_init": "logistic_normal",
369
+ "noise_sigma": -1.0,
370
+ "allow_tf32": true,
371
+ "activation_checkpointing": false,
372
+ "activation_checkpoint_interval": 1,
373
+ "activation_checkpoint_scope": "block",
374
+ "ddp_static_graph": false,
375
+ "ddp_gradient_as_bucket_view": true,
376
+ "blocking_data_transfer": false,
377
+ "dataloader_prefetch_factor": 4,
378
+ "full_train_stats": false,
379
+ "tokenized_hf": false,
380
+ "tokenized_pad_token": "pad",
381
+ "elf_conditional_hf": false,
382
+ "record_pad_truncate": false,
383
+ "record_add_eos": false,
384
+ "record_add_special_tokens": false,
385
+ "record_pad_token": "pad",
386
+ "record_shuffle_buffer": 10000,
387
+ "wrap": true,
388
+ "wrap_mode": "stream",
389
+ "wrap_record_buffer_size": 200,
390
+ "owt_cached_chunks": true,
391
+ "owt_chunk_cache_dir": "/e2e-data/evad-tech-vla/wanghan58/data/small_benchmarks/langflow_2604_11748/openwebtext_lta_cached_chunks/t5_len1024_train8_compact_overfit",
392
+ "owt_chunk_cache_rebuild": false,
393
+ "owt_chunk_cache_write_batch": 4096,
394
+ "owt_exact_repeat_per_chunk": 64,
395
+ "online_chunk_shuffle": false,
396
+ "online_chunk_shuffle_buffer": 10000,
397
+ "openwebtext_split": "train_minus_100k",
398
+ "detokenizer": "auto",
399
+ "resolved_detokenizer": null,
400
+ "num_workers": 0,
401
+ "latest_every": 1000,
402
+ "resume_path": "runs/train8_ctx1024_t5tok_p35_rand0_3_unif0_0p25_outwdm1_t5tok_ctx1024_k03_20260518_022728/latest.pt"
403
+ }
404
+ step=1100 epoch=1100/2000 epoch_step=1/1 micro_steps=1100 elapsed=20.2s lr=2.000000e-03 loss=1.3296 loss_recon=1.3296 loss_meanflow=0.0000 mean_model_t=0.4971 mean_corrupt_t=0.4971 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.3513 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.7155 corrupt_frac=1.0000 acc_corrupt=0.7155 loss_corrupt=1.3296 wrong_frac=0.5028 init_acc_corrupt=0.4747 acc_corrupt_t_0p0_0p2=0.0794 corrupt_frac_t_0p0_0p2=0.2071 acc_corrupt_t_0p2_0p4=0.6046 corrupt_frac_t_0p2_0p4=0.1988 acc_corrupt_t_0p4_0p6=0.9346 corrupt_frac_t_0p4_0p6=0.1976 acc_corrupt_t_0p6_0p8=0.9896 corrupt_frac_t_0p6_0p8=0.1951 acc_corrupt_t_0p8_1p0=0.9987 corrupt_frac_t_0p8_1p0=0.2015 out_w_norm=12.4307 out_g_norm=0.5976 loss_all=1.1705 init_gold_top10=0.5527 init_gold_top100=0.6892 rollout_applied_pos_frac=0.3516 init_acc_rollout_applied=0.4722 init_acc_rollout_kept=0.4393 logit_acc_rollout_applied=0.7318 logit_acc_rollout_kept=0.7314
405
+ step=1200 epoch=1200/2000 epoch_step=1/1 micro_steps=1200 elapsed=19.0s lr=2.000000e-03 loss=1.1354 loss_recon=1.1354 loss_meanflow=0.0000 mean_model_t=0.5015 mean_corrupt_t=0.5015 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.3505 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.7552 corrupt_frac=1.0000 acc_corrupt=0.7552 loss_corrupt=1.1354 wrong_frac=0.4984 init_acc_corrupt=0.4838 acc_corrupt_t_0p0_0p2=0.0990 corrupt_frac_t_0p0_0p2=0.2005 acc_corrupt_t_0p2_0p4=0.7128 corrupt_frac_t_0p2_0p4=0.1979 acc_corrupt_t_0p4_0p6=0.9679 corrupt_frac_t_0p4_0p6=0.1994 acc_corrupt_t_0p6_0p8=0.9960 corrupt_frac_t_0p6_0p8=0.2007 acc_corrupt_t_0p8_1p0=0.9995 corrupt_frac_t_0p8_1p0=0.2015 out_w_norm=12.6239 out_g_norm=0.6265 loss_all=1.0650 init_gold_top10=0.5828 init_gold_top100=0.7012 rollout_applied_pos_frac=0.3594 init_acc_rollout_applied=0.5340 init_acc_rollout_kept=0.4824 logit_acc_rollout_applied=0.7880 logit_acc_rollout_kept=0.7683
406
+ step=1300 epoch=1300/2000 epoch_step=1/1 micro_steps=1300 elapsed=19.0s lr=2.000000e-03 loss=0.9741 loss_recon=0.9741 loss_meanflow=0.0000 mean_model_t=0.5015 mean_corrupt_t=0.5015 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.3535 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.7882 corrupt_frac=1.0000 acc_corrupt=0.7882 loss_corrupt=0.9741 wrong_frac=0.4985 init_acc_corrupt=0.4873 acc_corrupt_t_0p0_0p2=0.1305 corrupt_frac_t_0p0_0p2=0.1947 acc_corrupt_t_0p2_0p4=0.8078 corrupt_frac_t_0p2_0p4=0.2030 acc_corrupt_t_0p4_0p6=0.9844 corrupt_frac_t_0p4_0p6=0.1991 acc_corrupt_t_0p6_0p8=0.9982 corrupt_frac_t_0p6_0p8=0.2045 acc_corrupt_t_0p8_1p0=0.9998 corrupt_frac_t_0p8_1p0=0.1988 out_w_norm=12.7840 out_g_norm=0.6228 loss_all=0.8968 init_gold_top10=0.5980 init_gold_top100=0.7052 rollout_applied_pos_frac=0.4062 init_acc_rollout_applied=0.5557 init_acc_rollout_kept=0.4471 logit_acc_rollout_applied=0.8387 logit_acc_rollout_kept=0.7854
407
+ step=1400 epoch=1400/2000 epoch_step=1/1 micro_steps=1400 elapsed=19.0s lr=2.000000e-03 loss=0.8769 loss_recon=0.8769 loss_meanflow=0.0000 mean_model_t=0.4986 mean_corrupt_t=0.4986 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.3515 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.8068 corrupt_frac=1.0000 acc_corrupt=0.8068 loss_corrupt=0.8769 wrong_frac=0.5016 init_acc_corrupt=0.4879 acc_corrupt_t_0p0_0p2=0.1718 corrupt_frac_t_0p0_0p2=0.2005 acc_corrupt_t_0p2_0p4=0.8725 corrupt_frac_t_0p2_0p4=0.2001 acc_corrupt_t_0p4_0p6=0.9927 corrupt_frac_t_0p4_0p6=0.2023 acc_corrupt_t_0p6_0p8=0.9991 corrupt_frac_t_0p6_0p8=0.1992 acc_corrupt_t_0p8_1p0=0.9998 corrupt_frac_t_0p8_1p0=0.1979 out_w_norm=12.9245 out_g_norm=0.6431 loss_all=0.7940 init_gold_top10=0.5796 init_gold_top100=0.7022 rollout_applied_pos_frac=0.3281 init_acc_rollout_applied=0.5728 init_acc_rollout_kept=0.4320 logit_acc_rollout_applied=0.8450 logit_acc_rollout_kept=0.8035
408
+ step=1500 epoch=1500/2000 epoch_step=1/1 micro_steps=1500 elapsed=19.0s lr=2.000000e-03 loss=0.8124 loss_recon=0.8124 loss_meanflow=0.0000 mean_model_t=0.4977 mean_corrupt_t=0.4977 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.3470 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.8199 corrupt_frac=1.0000 acc_corrupt=0.8199 loss_corrupt=0.8124 wrong_frac=0.5023 init_acc_corrupt=0.4897 acc_corrupt_t_0p0_0p2=0.2024 corrupt_frac_t_0p0_0p2=0.2059 acc_corrupt_t_0p2_0p4=0.9242 corrupt_frac_t_0p2_0p4=0.1985 acc_corrupt_t_0p4_0p6=0.9964 corrupt_frac_t_0p4_0p6=0.1969 acc_corrupt_t_0p6_0p8=0.9994 corrupt_frac_t_0p6_0p8=0.1927 acc_corrupt_t_0p8_1p0=0.9999 corrupt_frac_t_0p8_1p0=0.2059 out_w_norm=13.0274 out_g_norm=0.6095 loss_all=0.7331 init_gold_top10=0.5547 init_gold_top100=0.6604 rollout_applied_pos_frac=0.3438 init_acc_rollout_applied=0.5269 init_acc_rollout_kept=0.4433 logit_acc_rollout_applied=0.8746 logit_acc_rollout_kept=0.8114
409
+ step=1600 epoch=1600/2000 epoch_step=1/1 micro_steps=1600 elapsed=19.1s lr=2.000000e-03 loss=0.6964 loss_recon=0.6964 loss_meanflow=0.0000 mean_model_t=0.5012 mean_corrupt_t=0.5012 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.3562 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.8418 corrupt_frac=1.0000 acc_corrupt=0.8418 loss_corrupt=0.6964 wrong_frac=0.4987 init_acc_corrupt=0.4972 acc_corrupt_t_0p0_0p2=0.2464 corrupt_frac_t_0p0_0p2=0.1972 acc_corrupt_t_0p2_0p4=0.9545 corrupt_frac_t_0p2_0p4=0.2024 acc_corrupt_t_0p4_0p6=0.9982 corrupt_frac_t_0p4_0p6=0.1990 acc_corrupt_t_0p6_0p8=0.9997 corrupt_frac_t_0p6_0p8=0.2017 acc_corrupt_t_0p8_1p0=0.9999 corrupt_frac_t_0p8_1p0=0.1997 out_w_norm=13.1072 out_g_norm=0.5888 loss_all=0.5103 init_gold_top10=0.6593 init_gold_top100=0.7448 rollout_applied_pos_frac=0.3438 init_acc_rollout_applied=0.5605 init_acc_rollout_kept=0.5727 logit_acc_rollout_applied=0.8679 logit_acc_rollout_kept=0.8962
410
+ step=1700 epoch=1700/2000 epoch_step=1/1 micro_steps=1700 elapsed=18.9s lr=2.000000e-03 loss=0.6638 loss_recon=0.6638 loss_meanflow=0.0000 mean_model_t=0.5008 mean_corrupt_t=0.5008 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.3412 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.8463 corrupt_frac=1.0000 acc_corrupt=0.8463 loss_corrupt=0.6638 wrong_frac=0.4994 init_acc_corrupt=0.4951 acc_corrupt_t_0p0_0p2=0.2739 corrupt_frac_t_0p0_0p2=0.2030 acc_corrupt_t_0p2_0p4=0.9684 corrupt_frac_t_0p2_0p4=0.1915 acc_corrupt_t_0p4_0p6=0.9989 corrupt_frac_t_0p4_0p6=0.2023 acc_corrupt_t_0p6_0p8=0.9998 corrupt_frac_t_0p6_0p8=0.2001 acc_corrupt_t_0p8_1p0=0.9999 corrupt_frac_t_0p8_1p0=0.2032 out_w_norm=13.1660 out_g_norm=0.6050 loss_all=0.8292 init_gold_top10=0.5939 init_gold_top100=0.7235 rollout_applied_pos_frac=0.3828 init_acc_rollout_applied=0.4727 init_acc_rollout_kept=0.5121 logit_acc_rollout_applied=0.7681 logit_acc_rollout_kept=0.8222
411
+ step=1800 epoch=1800/2000 epoch_step=1/1 micro_steps=1800 elapsed=19.0s lr=2.000000e-03 loss=0.6150 loss_recon=0.6150 loss_meanflow=0.0000 mean_model_t=0.4962 mean_corrupt_t=0.4962 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.3430 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.8536 corrupt_frac=1.0000 acc_corrupt=0.8536 loss_corrupt=0.6150 wrong_frac=0.5037 init_acc_corrupt=0.4900 acc_corrupt_t_0p0_0p2=0.2985 corrupt_frac_t_0p0_0p2=0.2026 acc_corrupt_t_0p2_0p4=0.9798 corrupt_frac_t_0p2_0p4=0.2020 acc_corrupt_t_0p4_0p6=0.9994 corrupt_frac_t_0p4_0p6=0.2005 acc_corrupt_t_0p6_0p8=0.9998 corrupt_frac_t_0p6_0p8=0.1965 acc_corrupt_t_0p8_1p0=0.9999 corrupt_frac_t_0p8_1p0=0.1984 out_w_norm=13.2069 out_g_norm=0.5504 loss_all=0.6526 init_gold_top10=0.5530 init_gold_top100=0.6933 rollout_applied_pos_frac=0.3047 init_acc_rollout_applied=0.4390 init_acc_rollout_kept=0.4622 logit_acc_rollout_applied=0.8218 logit_acc_rollout_kept=0.8440
412
+ step=1900 epoch=1900/2000 epoch_step=1/1 micro_steps=1900 elapsed=19.1s lr=2.000000e-03 loss=0.5333 loss_recon=0.5333 loss_meanflow=0.0000 mean_model_t=0.5040 mean_corrupt_t=0.5040 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.3546 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.8671 corrupt_frac=1.0000 acc_corrupt=0.8671 loss_corrupt=0.5333 wrong_frac=0.4960 init_acc_corrupt=0.5035 acc_corrupt_t_0p0_0p2=0.3275 corrupt_frac_t_0p0_0p2=0.1931 acc_corrupt_t_0p2_0p4=0.9857 corrupt_frac_t_0p2_0p4=0.2013 acc_corrupt_t_0p4_0p6=0.9995 corrupt_frac_t_0p4_0p6=0.2019 acc_corrupt_t_0p6_0p8=0.9999 corrupt_frac_t_0p6_0p8=0.1981 acc_corrupt_t_0p8_1p0=0.9999 corrupt_frac_t_0p8_1p0=0.2056 out_w_norm=13.2327 out_g_norm=0.5815 loss_all=0.3862 init_gold_top10=0.6463 init_gold_top100=0.7374 rollout_applied_pos_frac=0.3125 init_acc_rollout_applied=0.5274 init_acc_rollout_kept=0.5155 logit_acc_rollout_applied=0.8912 logit_acc_rollout_kept=0.8996
413
+ step=2000 epoch=2000/2000 epoch_step=1/1 micro_steps=2000 elapsed=19.0s lr=2.000000e-03 loss=0.4955 loss_recon=0.4955 loss_meanflow=0.0000 mean_model_t=0.5008 mean_corrupt_t=0.5008 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.3534 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.8700 corrupt_frac=1.0000 acc_corrupt=0.8700 loss_corrupt=0.4955 wrong_frac=0.4993 init_acc_corrupt=0.4977 acc_corrupt_t_0p0_0p2=0.3724 corrupt_frac_t_0p0_0p2=0.2034 acc_corrupt_t_0p2_0p4=0.9888 corrupt_frac_t_0p2_0p4=0.1967 acc_corrupt_t_0p4_0p6=0.9996 corrupt_frac_t_0p4_0p6=0.2010 acc_corrupt_t_0p6_0p8=0.9999 corrupt_frac_t_0p6_0p8=0.1933 acc_corrupt_t_0p8_1p0=1.0000 corrupt_frac_t_0p8_1p0=0.2055 out_w_norm=13.2466 out_g_norm=0.5578 loss_all=0.6899 init_gold_top10=0.6007 init_gold_top100=0.7234 rollout_applied_pos_frac=0.4141 init_acc_rollout_applied=0.6060 init_acc_rollout_kept=0.4381 logit_acc_rollout_applied=0.8636 logit_acc_rollout_kept=0.7849
414
+ NCCL version 2.25.1+cuda12.8
415
+ resumed_from=runs/train8_ctx1024_t5tok_p35_rand0_3_unif0_0p25_outwdm1_t5tok_ctx1024_k03_20260518_022728/latest.pt start_step=2001
416
+ {
417
+ "device": "cuda:0",
418
+ "rank": 0,
419
+ "world_size": 4,
420
+ "samples": "owt_cached_chunks:8",
421
+ "vocab_size": 2423,
422
+ "tokenizer_vocab_size": 32100,
423
+ "save_dir": "runs/train8_ctx1024_t5tok_p35_rand0_3_unif0_0p25_outwdm1_t5tok_ctx1024_k03_20260518_022728",
424
+ "batch_size": 128,
425
+ "grad_accum": 1,
426
+ "effective_batch_size": 512,
427
+ "global_batch_size": 512,
428
+ "lr_schedule": "constant_warmup",
429
+ "optimizer": "muon",
430
+ "epochs": 0.0,
431
+ "steps_per_epoch": 1,
432
+ "total_steps": 3000,
433
+ "warmup_steps": 10,
434
+ "warmup_epochs": -1.0,
435
+ "min_lr": 0.0,
436
+ "weight_decay": 0.1,
437
+ "output_weight_decay": -1.0,
438
+ "adamw_param_groups": "nanogpt",
439
+ "adam_beta1": 0.9,
440
+ "adam_beta2": 0.95,
441
+ "adam_eps": 1e-08,
442
+ "muon_impl": "legacy",
443
+ "muon_momentum": 0.95,
444
+ "muon_ns_steps": 5,
445
+ "muon_update_scale": 1.0,
446
+ "muon_nesterov": false,
447
+ "muon_width_scale": false,
448
+ "muon_grouping": "legacy_dim_ge_2",
449
+ "muon_param_count": 2523776,
450
+ "muon_adam_param_count": 8192,
451
+ "muon_param_names": [
452
+ "vocab_embed.embedding",
453
+ "sigma_map.net.0.weight",
454
+ "sigma_map.net.2.weight",
455
+ "blocks.0.attn_qkv.weight",
456
+ "blocks.0.attn_out.weight",
457
+ "blocks.0.mlp.0.weight",
458
+ "blocks.0.mlp.2.weight",
459
+ "blocks.0.adaLN_modulation.weight",
460
+ "blocks.1.attn_qkv.weight",
461
+ "blocks.1.attn_out.weight",
462
+ "blocks.1.mlp.0.weight",
463
+ "blocks.1.mlp.2.weight",
464
+ "blocks.1.adaLN_modulation.weight",
465
+ "blocks.2.attn_qkv.weight",
466
+ "blocks.2.attn_out.weight",
467
+ "blocks.2.mlp.0.weight",
468
+ "blocks.2.mlp.2.weight",
469
+ "blocks.2.adaLN_modulation.weight",
470
+ "output_layer.linear.weight",
471
+ "output_layer.adaLN_modulation.weight"
472
+ ],
473
+ "muon_adam_param_names": [
474
+ "sigma_map.net.0.bias",
475
+ "sigma_map.net.2.bias",
476
+ "blocks.0.norm1.weight",
477
+ "blocks.0.norm2.weight",
478
+ "blocks.0.mlp.0.bias",
479
+ "blocks.0.mlp.2.bias",
480
+ "blocks.0.adaLN_modulation.bias",
481
+ "blocks.1.norm1.weight",
482
+ "blocks.1.norm2.weight",
483
+ "blocks.1.mlp.0.bias",
484
+ "blocks.1.mlp.2.bias",
485
+ "blocks.1.adaLN_modulation.bias",
486
+ "blocks.2.norm1.weight",
487
+ "blocks.2.norm2.weight",
488
+ "blocks.2.mlp.0.bias",
489
+ "blocks.2.mlp.2.bias",
490
+ "blocks.2.adaLN_modulation.bias",
491
+ "output_layer.norm_final.weight",
492
+ "output_layer.adaLN_modulation.bias"
493
+ ],
494
+ "muon_effective_nesterov": false,
495
+ "muon_effective_width_scale": false,
496
+ "muon_effective_weight_decay": 0.1,
497
+ "muon_adam_fallback_nesterov": false,
498
+ "muon_adam_fallback_weight_decay": 0.1,
499
+ "ema_decay": 0.9999,
500
+ "ema_start_step": 0,
501
+ "model_type": "ddit",
502
+ "ddit_mlp_type": "gelu",
503
+ "elf_num_time_tokens": 4,
504
+ "elf_num_model_mode_tokens": 0,
505
+ "qk_norm": true,
506
+ "output_bias": false,
507
+ "output_init_std": -1.0,
508
+ "norm_type": "rmsnorm",
509
+ "target_loss": "hard_ce",
510
+ "linear_soft_target_power": 1.0,
511
+ "linear_soft_target_min_conf": 0.0,
512
+ "linear_soft_target_max_conf": 1.0,
513
+ "t_sampling_mode": "uniform",
514
+ "t_sampling_power": 1.0,
515
+ "t_sampling_eps": 0.0001,
516
+ "t_sampling_logit_mean": -1.5,
517
+ "t_sampling_logit_std": 0.8,
518
+ "dual_t": true,
519
+ "corrupt_t_mode": "same",
520
+ "corrupt_min_t": 0.0,
521
+ "corrupt_max_t": 1.0,
522
+ "prefix_block_prob": 0.0,
523
+ "prefix_block_len": 128,
524
+ "mask_ratio_floor_schedule": "none",
525
+ "dirichlet_endpoint_mode": "categorical_dual_t",
526
+ "dirichlet_semantic_t_mode": "same",
527
+ "dirichlet_semantic_t_value": 0.0,
528
+ "dirichlet_semantic_t_curve": "linear",
529
+ "dirichlet_semantic_t_power": 1.0,
530
+ "endpoint_sequence_random_prob_alpha": 0.0,
531
+ "categorical_wrong_from_full_vocab": true,
532
+ "categorical_wrong_from_batch_valid_tokens": false,
533
+ "categorical_wrong_basin_token_ids": "",
534
+ "categorical_wrong_basin_prob": 0.0,
535
+ "categorical_wrong_unigram_prob": 0.0,
536
+ "categorical_wrong_uniform_prob": 0.0,
537
+ "categorical_wrong_prob_floor": 0.0,
538
+ "categorical_wrong_corpus_unigram_path": "",
539
+ "categorical_wrong_corpus_unigram_alpha": 1.0,
540
+ "categorical_wrong_basin_shared_prob": 0.0,
541
+ "categorical_wrong_unigram_shared_prob": 0.0,
542
+ "mask_mixture_original_prob": 0.0,
543
+ "mask_mixture_lowk_prob": 0.0,
544
+ "mask_mixture_lowcorrupt_prob": 0.0,
545
+ "mask_mixture_block_prob": 0.0,
546
+ "mask_mixture_all_prob": 1.0,
547
+ "mask_mixture_lowk_clean_tokens": "0",
548
+ "mask_mixture_lowcorrupt_tokens": "1,2,4,8,16,32,64",
549
+ "mask_mixture_block_tokens": "64,128",
550
+ "simplex_bridge_sampler": "dirichlet",
551
+ "logistic_normal_sigma_min": 0.1,
552
+ "logistic_normal_sigma_max": 1.0,
553
+ "logistic_normal_tau_min": 1.0,
554
+ "logistic_normal_tau_max": 1.0,
555
+ "torch_compile": false,
556
+ "compile_mode": "max-autotune",
557
+ "state_format": "prob",
558
+ "meanflow_weight": 0.0,
559
+ "rollout_train_prob": 0.35,
560
+ "rollout_train_steps": 3,
561
+ "rollout_train_steps_min": 0,
562
+ "rollout_train_infer_steps": 1,
563
+ "rollout_train_time_mode": "sampled_path",
564
+ "rollout_train_s_dist": "uniform",
565
+ "rollout_train_s_min_frac": 0.0,
566
+ "rollout_train_s_max_frac": 0.25,
567
+ "rollout_train_s_beta_alpha": 2.0,
568
+ "rollout_train_s_beta_beta": 6.0,
569
+ "rollout_train_temp": 1.0,
570
+ "rollout_train_max_gamma": 1.0,
571
+ "rollout_train_corrupt_only": true,
572
+ "rollout_train_samplewise": true,
573
+ "rollout_train_compute_always": false,
574
+ "rollout_train_sync_t": true,
575
+ "bridge_noise_init": "logistic_normal",
576
+ "noise_sigma": -1.0,
577
+ "allow_tf32": true,
578
+ "activation_checkpointing": false,
579
+ "activation_checkpoint_interval": 1,
580
+ "activation_checkpoint_scope": "block",
581
+ "ddp_static_graph": false,
582
+ "ddp_gradient_as_bucket_view": true,
583
+ "blocking_data_transfer": false,
584
+ "dataloader_prefetch_factor": 4,
585
+ "full_train_stats": false,
586
+ "tokenized_hf": false,
587
+ "tokenized_pad_token": "pad",
588
+ "elf_conditional_hf": false,
589
+ "record_pad_truncate": false,
590
+ "record_add_eos": false,
591
+ "record_add_special_tokens": false,
592
+ "record_pad_token": "pad",
593
+ "record_shuffle_buffer": 10000,
594
+ "wrap": true,
595
+ "wrap_mode": "stream",
596
+ "wrap_record_buffer_size": 200,
597
+ "owt_cached_chunks": true,
598
+ "owt_chunk_cache_dir": "/e2e-data/evad-tech-vla/wanghan58/data/small_benchmarks/langflow_2604_11748/openwebtext_lta_cached_chunks/t5_len1024_train8_compact_overfit",
599
+ "owt_chunk_cache_rebuild": false,
600
+ "owt_chunk_cache_write_batch": 4096,
601
+ "owt_exact_repeat_per_chunk": 64,
602
+ "online_chunk_shuffle": false,
603
+ "online_chunk_shuffle_buffer": 10000,
604
+ "openwebtext_split": "train_minus_100k",
605
+ "detokenizer": "auto",
606
+ "resolved_detokenizer": null,
607
+ "num_workers": 0,
608
+ "latest_every": 1000,
609
+ "resume_path": "runs/train8_ctx1024_t5tok_p35_rand0_3_unif0_0p25_outwdm1_t5tok_ctx1024_k03_20260518_022728/latest.pt"
610
+ }
611
+ step=2100 epoch=2100/3000 epoch_step=1/1 micro_steps=2100 elapsed=19.8s lr=2.000000e-03 loss=0.4583 loss_recon=0.4583 loss_meanflow=0.0000 mean_model_t=0.4971 mean_corrupt_t=0.4971 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.3513 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.8735 corrupt_frac=1.0000 acc_corrupt=0.8735 loss_corrupt=0.4583 wrong_frac=0.5028 init_acc_corrupt=0.4935 acc_corrupt_t_0p0_0p2=0.3975 corrupt_frac_t_0p0_0p2=0.2071 acc_corrupt_t_0p2_0p4=0.9918 corrupt_frac_t_0p2_0p4=0.1988 acc_corrupt_t_0p4_0p6=0.9997 corrupt_frac_t_0p4_0p6=0.1976 acc_corrupt_t_0p6_0p8=0.9999 corrupt_frac_t_0p6_0p8=0.1951 acc_corrupt_t_0p8_1p0=1.0000 corrupt_frac_t_0p8_1p0=0.2015 out_w_norm=13.2591 out_g_norm=0.5308 loss_all=0.2927 init_gold_top10=0.5962 init_gold_top100=0.6969 rollout_applied_pos_frac=0.3516 init_acc_rollout_applied=0.5366 init_acc_rollout_kept=0.4393 logit_acc_rollout_applied=0.9168 logit_acc_rollout_kept=0.9161
612
+ step=2200 epoch=2200/3000 epoch_step=1/1 micro_steps=2200 elapsed=19.0s lr=2.000000e-03 loss=0.4055 loss_recon=0.4055 loss_meanflow=0.0000 mean_model_t=0.5015 mean_corrupt_t=0.5015 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.3505 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.8848 corrupt_frac=1.0000 acc_corrupt=0.8848 loss_corrupt=0.4055 wrong_frac=0.4984 init_acc_corrupt=0.5000 acc_corrupt_t_0p0_0p2=0.4320 corrupt_frac_t_0p0_0p2=0.2005 acc_corrupt_t_0p2_0p4=0.9939 corrupt_frac_t_0p2_0p4=0.1979 acc_corrupt_t_0p4_0p6=0.9997 corrupt_frac_t_0p4_0p6=0.1994 acc_corrupt_t_0p6_0p8=0.9999 corrupt_frac_t_0p6_0p8=0.2007 acc_corrupt_t_0p8_1p0=1.0000 corrupt_frac_t_0p8_1p0=0.2015 out_w_norm=13.2548 out_g_norm=0.5596 loss_all=0.4308 init_gold_top10=0.6072 init_gold_top100=0.7113 rollout_applied_pos_frac=0.3594 init_acc_rollout_applied=0.5487 init_acc_rollout_kept=0.4824 logit_acc_rollout_applied=0.8832 logit_acc_rollout_kept=0.8671
613
+ step=2300 epoch=2300/3000 epoch_step=1/1 micro_steps=2300 elapsed=19.1s lr=2.000000e-03 loss=0.3642 loss_recon=0.3642 loss_meanflow=0.0000 mean_model_t=0.5015 mean_corrupt_t=0.5015 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.3535 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.8920 corrupt_frac=1.0000 acc_corrupt=0.8920 loss_corrupt=0.3642 wrong_frac=0.4985 init_acc_corrupt=0.4995 acc_corrupt_t_0p0_0p2=0.4506 corrupt_frac_t_0p0_0p2=0.1947 acc_corrupt_t_0p2_0p4=0.9954 corrupt_frac_t_0p2_0p4=0.2030 acc_corrupt_t_0p4_0p6=0.9998 corrupt_frac_t_0p4_0p6=0.1991 acc_corrupt_t_0p6_0p8=0.9999 corrupt_frac_t_0p6_0p8=0.2045 acc_corrupt_t_0p8_1p0=1.0000 corrupt_frac_t_0p8_1p0=0.1988 out_w_norm=13.2547 out_g_norm=0.5233 loss_all=0.3508 init_gold_top10=0.6184 init_gold_top100=0.7122 rollout_applied_pos_frac=0.4062 init_acc_rollout_applied=0.5912 init_acc_rollout_kept=0.4471 logit_acc_rollout_applied=0.9022 logit_acc_rollout_kept=0.8909
614
+ step=2400 epoch=2400/3000 epoch_step=1/1 micro_steps=2400 elapsed=19.0s lr=2.000000e-03 loss=0.3392 loss_recon=0.3392 loss_meanflow=0.0000 mean_model_t=0.4986 mean_corrupt_t=0.4986 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.3515 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.8964 corrupt_frac=1.0000 acc_corrupt=0.8964 loss_corrupt=0.3392 wrong_frac=0.5016 init_acc_corrupt=0.4981 acc_corrupt_t_0p0_0p2=0.4873 corrupt_frac_t_0p0_0p2=0.2005 acc_corrupt_t_0p2_0p4=0.9962 corrupt_frac_t_0p2_0p4=0.2001 acc_corrupt_t_0p4_0p6=0.9998 corrupt_frac_t_0p4_0p6=0.2023 acc_corrupt_t_0p6_0p8=0.9999 corrupt_frac_t_0p6_0p8=0.1992 acc_corrupt_t_0p8_1p0=1.0000 corrupt_frac_t_0p8_1p0=0.1979 out_w_norm=13.2416 out_g_norm=0.5315 loss_all=0.3230 init_gold_top10=0.6008 init_gold_top100=0.7086 rollout_applied_pos_frac=0.3281 init_acc_rollout_applied=0.6073 init_acc_rollout_kept=0.4320 logit_acc_rollout_applied=0.8929 logit_acc_rollout_kept=0.9091
615
+ step=2500 epoch=2500/3000 epoch_step=1/1 micro_steps=2500 elapsed=19.0s lr=2.000000e-03 loss=0.3342 loss_recon=0.3342 loss_meanflow=0.0000 mean_model_t=0.4977 mean_corrupt_t=0.4977 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.3470 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.8955 corrupt_frac=1.0000 acc_corrupt=0.8955 loss_corrupt=0.3342 wrong_frac=0.5023 init_acc_corrupt=0.4972 acc_corrupt_t_0p0_0p2=0.4951 corrupt_frac_t_0p0_0p2=0.2059 acc_corrupt_t_0p2_0p4=0.9976 corrupt_frac_t_0p2_0p4=0.1985 acc_corrupt_t_0p4_0p6=0.9998 corrupt_frac_t_0p4_0p6=0.1969 acc_corrupt_t_0p6_0p8=1.0000 corrupt_frac_t_0p6_0p8=0.1927 acc_corrupt_t_0p8_1p0=1.0000 corrupt_frac_t_0p8_1p0=0.2059 out_w_norm=13.2247 out_g_norm=0.4966 loss_all=0.2946 init_gold_top10=0.5647 init_gold_top100=0.6631 rollout_applied_pos_frac=0.3438 init_acc_rollout_applied=0.5336 init_acc_rollout_kept=0.4433 logit_acc_rollout_applied=0.9112 logit_acc_rollout_kept=0.9019
616
+ step=2600 epoch=2600/3000 epoch_step=1/1 micro_steps=2600 elapsed=19.1s lr=2.000000e-03 loss=0.2927 loss_recon=0.2927 loss_meanflow=0.0000 mean_model_t=0.5012 mean_corrupt_t=0.5012 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.3562 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.9071 corrupt_frac=1.0000 acc_corrupt=0.9071 loss_corrupt=0.2927 wrong_frac=0.4987 init_acc_corrupt=0.5030 acc_corrupt_t_0p0_0p2=0.5311 corrupt_frac_t_0p0_0p2=0.1972 acc_corrupt_t_0p2_0p4=0.9981 corrupt_frac_t_0p2_0p4=0.2024 acc_corrupt_t_0p4_0p6=0.9999 corrupt_frac_t_0p4_0p6=0.1990 acc_corrupt_t_0p6_0p8=1.0000 corrupt_frac_t_0p6_0p8=0.2017 acc_corrupt_t_0p8_1p0=1.0000 corrupt_frac_t_0p8_1p0=0.1997 out_w_norm=13.1977 out_g_norm=0.4920 loss_all=0.2278 init_gold_top10=0.6797 init_gold_top100=0.7468 rollout_applied_pos_frac=0.3438 init_acc_rollout_applied=0.5799 init_acc_rollout_kept=0.5727 logit_acc_rollout_applied=0.8991 logit_acc_rollout_kept=0.9380
617
+ step=2700 epoch=2700/3000 epoch_step=1/1 micro_steps=2700 elapsed=18.9s lr=2.000000e-03 loss=0.2925 loss_recon=0.2925 loss_meanflow=0.0000 mean_model_t=0.5008 mean_corrupt_t=0.5008 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.3412 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.9053 corrupt_frac=1.0000 acc_corrupt=0.9053 loss_corrupt=0.2925 wrong_frac=0.4994 init_acc_corrupt=0.5003 acc_corrupt_t_0p0_0p2=0.5350 corrupt_frac_t_0p0_0p2=0.2030 acc_corrupt_t_0p2_0p4=0.9984 corrupt_frac_t_0p2_0p4=0.1915 acc_corrupt_t_0p4_0p6=0.9999 corrupt_frac_t_0p4_0p6=0.2023 acc_corrupt_t_0p6_0p8=1.0000 corrupt_frac_t_0p6_0p8=0.2001 acc_corrupt_t_0p8_1p0=1.0000 corrupt_frac_t_0p8_1p0=0.2032 out_w_norm=13.1540 out_g_norm=0.4638 loss_all=0.4191 init_gold_top10=0.6214 init_gold_top100=0.7245 rollout_applied_pos_frac=0.3828 init_acc_rollout_applied=0.5020 init_acc_rollout_kept=0.5121 logit_acc_rollout_applied=0.8600 logit_acc_rollout_kept=0.8647
618
+ step=2800 epoch=2800/3000 epoch_step=1/1 micro_steps=2800 elapsed=19.0s lr=2.000000e-03 loss=0.2855 loss_recon=0.2855 loss_meanflow=0.0000 mean_model_t=0.4962 mean_corrupt_t=0.4962 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.3430 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.9054 corrupt_frac=1.0000 acc_corrupt=0.9054 loss_corrupt=0.2855 wrong_frac=0.5037 init_acc_corrupt=0.4940 acc_corrupt_t_0p0_0p2=0.5346 corrupt_frac_t_0p0_0p2=0.2026 acc_corrupt_t_0p2_0p4=0.9986 corrupt_frac_t_0p2_0p4=0.2020 acc_corrupt_t_0p4_0p6=0.9999 corrupt_frac_t_0p4_0p6=0.2005 acc_corrupt_t_0p6_0p8=1.0000 corrupt_frac_t_0p6_0p8=0.1965 acc_corrupt_t_0p8_1p0=1.0000 corrupt_frac_t_0p8_1p0=0.1984 out_w_norm=13.1064 out_g_norm=0.4428 loss_all=0.3016 init_gold_top10=0.5783 init_gold_top100=0.6934 rollout_applied_pos_frac=0.3047 init_acc_rollout_applied=0.4441 init_acc_rollout_kept=0.4622 logit_acc_rollout_applied=0.8869 logit_acc_rollout_kept=0.9013
619
+ step=2900 epoch=2900/3000 epoch_step=1/1 micro_steps=2900 elapsed=19.1s lr=2.000000e-03 loss=0.2654 loss_recon=0.2654 loss_meanflow=0.0000 mean_model_t=0.5040 mean_corrupt_t=0.5040 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.3546 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.9131 corrupt_frac=1.0000 acc_corrupt=0.9131 loss_corrupt=0.2654 wrong_frac=0.4960 init_acc_corrupt=0.5068 acc_corrupt_t_0p0_0p2=0.5513 corrupt_frac_t_0p0_0p2=0.1931 acc_corrupt_t_0p2_0p4=0.9990 corrupt_frac_t_0p2_0p4=0.2013 acc_corrupt_t_0p4_0p6=0.9999 corrupt_frac_t_0p4_0p6=0.2019 acc_corrupt_t_0p6_0p8=1.0000 corrupt_frac_t_0p6_0p8=0.1981 acc_corrupt_t_0p8_1p0=1.0000 corrupt_frac_t_0p8_1p0=0.2056 out_w_norm=13.0543 out_g_norm=0.4271 loss_all=0.1481 init_gold_top10=0.6606 init_gold_top100=0.7375 rollout_applied_pos_frac=0.3125 init_acc_rollout_applied=0.5364 init_acc_rollout_kept=0.5155 logit_acc_rollout_applied=0.9541 logit_acc_rollout_kept=0.9501
620
+ step=3000 epoch=3000/3000 epoch_step=1/1 micro_steps=3000 elapsed=19.0s lr=2.000000e-03 loss=0.2693 loss_recon=0.2693 loss_meanflow=0.0000 mean_model_t=0.5008 mean_corrupt_t=0.5008 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.3534 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.9110 corrupt_frac=1.0000 acc_corrupt=0.9110 loss_corrupt=0.2693 wrong_frac=0.4993 init_acc_corrupt=0.5007 acc_corrupt_t_0p0_0p2=0.5637 corrupt_frac_t_0p0_0p2=0.2034 acc_corrupt_t_0p2_0p4=0.9990 corrupt_frac_t_0p2_0p4=0.1967 acc_corrupt_t_0p4_0p6=0.9999 corrupt_frac_t_0p4_0p6=0.2010 acc_corrupt_t_0p6_0p8=1.0000 corrupt_frac_t_0p6_0p8=0.1933 acc_corrupt_t_0p8_1p0=1.0000 corrupt_frac_t_0p8_1p0=0.2055 out_w_norm=12.9986 out_g_norm=0.4138 loss_all=0.4644 init_gold_top10=0.6166 init_gold_top100=0.7234 rollout_applied_pos_frac=0.4141 init_acc_rollout_applied=0.6088 init_acc_rollout_kept=0.4381 logit_acc_rollout_applied=0.8838 logit_acc_rollout_kept=0.8375
621
+ NCCL version 2.25.1+cuda12.8
622
+ resumed_from=runs/train8_ctx1024_t5tok_p35_rand0_3_unif0_0p25_outwdm1_t5tok_ctx1024_k03_20260518_022728/latest.pt start_step=3001
623
+ {
624
+ "device": "cuda:0",
625
+ "rank": 0,
626
+ "world_size": 4,
627
+ "samples": "owt_cached_chunks:8",
628
+ "vocab_size": 2423,
629
+ "tokenizer_vocab_size": 32100,
630
+ "save_dir": "runs/train8_ctx1024_t5tok_p35_rand0_3_unif0_0p25_outwdm1_t5tok_ctx1024_k03_20260518_022728",
631
+ "batch_size": 128,
632
+ "grad_accum": 1,
633
+ "effective_batch_size": 512,
634
+ "global_batch_size": 512,
635
+ "lr_schedule": "constant_warmup",
636
+ "optimizer": "muon",
637
+ "epochs": 0.0,
638
+ "steps_per_epoch": 1,
639
+ "total_steps": 4000,
640
+ "warmup_steps": 10,
641
+ "warmup_epochs": -1.0,
642
+ "min_lr": 0.0,
643
+ "weight_decay": 0.1,
644
+ "output_weight_decay": -1.0,
645
+ "adamw_param_groups": "nanogpt",
646
+ "adam_beta1": 0.9,
647
+ "adam_beta2": 0.95,
648
+ "adam_eps": 1e-08,
649
+ "muon_impl": "legacy",
650
+ "muon_momentum": 0.95,
651
+ "muon_ns_steps": 5,
652
+ "muon_update_scale": 1.0,
653
+ "muon_nesterov": false,
654
+ "muon_width_scale": false,
655
+ "muon_grouping": "legacy_dim_ge_2",
656
+ "muon_param_count": 2523776,
657
+ "muon_adam_param_count": 8192,
658
+ "muon_param_names": [
659
+ "vocab_embed.embedding",
660
+ "sigma_map.net.0.weight",
661
+ "sigma_map.net.2.weight",
662
+ "blocks.0.attn_qkv.weight",
663
+ "blocks.0.attn_out.weight",
664
+ "blocks.0.mlp.0.weight",
665
+ "blocks.0.mlp.2.weight",
666
+ "blocks.0.adaLN_modulation.weight",
667
+ "blocks.1.attn_qkv.weight",
668
+ "blocks.1.attn_out.weight",
669
+ "blocks.1.mlp.0.weight",
670
+ "blocks.1.mlp.2.weight",
671
+ "blocks.1.adaLN_modulation.weight",
672
+ "blocks.2.attn_qkv.weight",
673
+ "blocks.2.attn_out.weight",
674
+ "blocks.2.mlp.0.weight",
675
+ "blocks.2.mlp.2.weight",
676
+ "blocks.2.adaLN_modulation.weight",
677
+ "output_layer.linear.weight",
678
+ "output_layer.adaLN_modulation.weight"
679
+ ],
680
+ "muon_adam_param_names": [
681
+ "sigma_map.net.0.bias",
682
+ "sigma_map.net.2.bias",
683
+ "blocks.0.norm1.weight",
684
+ "blocks.0.norm2.weight",
685
+ "blocks.0.mlp.0.bias",
686
+ "blocks.0.mlp.2.bias",
687
+ "blocks.0.adaLN_modulation.bias",
688
+ "blocks.1.norm1.weight",
689
+ "blocks.1.norm2.weight",
690
+ "blocks.1.mlp.0.bias",
691
+ "blocks.1.mlp.2.bias",
692
+ "blocks.1.adaLN_modulation.bias",
693
+ "blocks.2.norm1.weight",
694
+ "blocks.2.norm2.weight",
695
+ "blocks.2.mlp.0.bias",
696
+ "blocks.2.mlp.2.bias",
697
+ "blocks.2.adaLN_modulation.bias",
698
+ "output_layer.norm_final.weight",
699
+ "output_layer.adaLN_modulation.bias"
700
+ ],
701
+ "muon_effective_nesterov": false,
702
+ "muon_effective_width_scale": false,
703
+ "muon_effective_weight_decay": 0.1,
704
+ "muon_adam_fallback_nesterov": false,
705
+ "muon_adam_fallback_weight_decay": 0.1,
706
+ "ema_decay": 0.9999,
707
+ "ema_start_step": 0,
708
+ "model_type": "ddit",
709
+ "ddit_mlp_type": "gelu",
710
+ "elf_num_time_tokens": 4,
711
+ "elf_num_model_mode_tokens": 0,
712
+ "qk_norm": true,
713
+ "output_bias": false,
714
+ "output_init_std": -1.0,
715
+ "norm_type": "rmsnorm",
716
+ "target_loss": "hard_ce",
717
+ "linear_soft_target_power": 1.0,
718
+ "linear_soft_target_min_conf": 0.0,
719
+ "linear_soft_target_max_conf": 1.0,
720
+ "t_sampling_mode": "uniform",
721
+ "t_sampling_power": 1.0,
722
+ "t_sampling_eps": 0.0001,
723
+ "t_sampling_logit_mean": -1.5,
724
+ "t_sampling_logit_std": 0.8,
725
+ "dual_t": true,
726
+ "corrupt_t_mode": "same",
727
+ "corrupt_min_t": 0.0,
728
+ "corrupt_max_t": 1.0,
729
+ "prefix_block_prob": 0.0,
730
+ "prefix_block_len": 128,
731
+ "mask_ratio_floor_schedule": "none",
732
+ "dirichlet_endpoint_mode": "categorical_dual_t",
733
+ "dirichlet_semantic_t_mode": "same",
734
+ "dirichlet_semantic_t_value": 0.0,
735
+ "dirichlet_semantic_t_curve": "linear",
736
+ "dirichlet_semantic_t_power": 1.0,
737
+ "endpoint_sequence_random_prob_alpha": 0.0,
738
+ "categorical_wrong_from_full_vocab": true,
739
+ "categorical_wrong_from_batch_valid_tokens": false,
740
+ "categorical_wrong_basin_token_ids": "",
741
+ "categorical_wrong_basin_prob": 0.0,
742
+ "categorical_wrong_unigram_prob": 0.0,
743
+ "categorical_wrong_uniform_prob": 0.0,
744
+ "categorical_wrong_prob_floor": 0.0,
745
+ "categorical_wrong_corpus_unigram_path": "",
746
+ "categorical_wrong_corpus_unigram_alpha": 1.0,
747
+ "categorical_wrong_basin_shared_prob": 0.0,
748
+ "categorical_wrong_unigram_shared_prob": 0.0,
749
+ "mask_mixture_original_prob": 0.0,
750
+ "mask_mixture_lowk_prob": 0.0,
751
+ "mask_mixture_lowcorrupt_prob": 0.0,
752
+ "mask_mixture_block_prob": 0.0,
753
+ "mask_mixture_all_prob": 1.0,
754
+ "mask_mixture_lowk_clean_tokens": "0",
755
+ "mask_mixture_lowcorrupt_tokens": "1,2,4,8,16,32,64",
756
+ "mask_mixture_block_tokens": "64,128",
757
+ "simplex_bridge_sampler": "dirichlet",
758
+ "logistic_normal_sigma_min": 0.1,
759
+ "logistic_normal_sigma_max": 1.0,
760
+ "logistic_normal_tau_min": 1.0,
761
+ "logistic_normal_tau_max": 1.0,
762
+ "torch_compile": false,
763
+ "compile_mode": "max-autotune",
764
+ "state_format": "prob",
765
+ "meanflow_weight": 0.0,
766
+ "rollout_train_prob": 0.35,
767
+ "rollout_train_steps": 3,
768
+ "rollout_train_steps_min": 0,
769
+ "rollout_train_infer_steps": 1,
770
+ "rollout_train_time_mode": "sampled_path",
771
+ "rollout_train_s_dist": "uniform",
772
+ "rollout_train_s_min_frac": 0.0,
773
+ "rollout_train_s_max_frac": 0.25,
774
+ "rollout_train_s_beta_alpha": 2.0,
775
+ "rollout_train_s_beta_beta": 6.0,
776
+ "rollout_train_temp": 1.0,
777
+ "rollout_train_max_gamma": 1.0,
778
+ "rollout_train_corrupt_only": true,
779
+ "rollout_train_samplewise": true,
780
+ "rollout_train_compute_always": false,
781
+ "rollout_train_sync_t": true,
782
+ "bridge_noise_init": "logistic_normal",
783
+ "noise_sigma": -1.0,
784
+ "allow_tf32": true,
785
+ "activation_checkpointing": false,
786
+ "activation_checkpoint_interval": 1,
787
+ "activation_checkpoint_scope": "block",
788
+ "ddp_static_graph": false,
789
+ "ddp_gradient_as_bucket_view": true,
790
+ "blocking_data_transfer": false,
791
+ "dataloader_prefetch_factor": 4,
792
+ "full_train_stats": false,
793
+ "tokenized_hf": false,
794
+ "tokenized_pad_token": "pad",
795
+ "elf_conditional_hf": false,
796
+ "record_pad_truncate": false,
797
+ "record_add_eos": false,
798
+ "record_add_special_tokens": false,
799
+ "record_pad_token": "pad",
800
+ "record_shuffle_buffer": 10000,
801
+ "wrap": true,
802
+ "wrap_mode": "stream",
803
+ "wrap_record_buffer_size": 200,
804
+ "owt_cached_chunks": true,
805
+ "owt_chunk_cache_dir": "/e2e-data/evad-tech-vla/wanghan58/data/small_benchmarks/langflow_2604_11748/openwebtext_lta_cached_chunks/t5_len1024_train8_compact_overfit",
806
+ "owt_chunk_cache_rebuild": false,
807
+ "owt_chunk_cache_write_batch": 4096,
808
+ "owt_exact_repeat_per_chunk": 64,
809
+ "online_chunk_shuffle": false,
810
+ "online_chunk_shuffle_buffer": 10000,
811
+ "openwebtext_split": "train_minus_100k",
812
+ "detokenizer": "auto",
813
+ "resolved_detokenizer": null,
814
+ "num_workers": 0,
815
+ "latest_every": 1000,
816
+ "resume_path": "runs/train8_ctx1024_t5tok_p35_rand0_3_unif0_0p25_outwdm1_t5tok_ctx1024_k03_20260518_022728/latest.pt"
817
+ }
818
+ step=3100 epoch=3100/4000 epoch_step=1/1 micro_steps=3100 elapsed=19.8s lr=2.000000e-03 loss=0.2771 loss_recon=0.2771 loss_meanflow=0.0000 mean_model_t=0.4971 mean_corrupt_t=0.4971 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.3513 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.9082 corrupt_frac=1.0000 acc_corrupt=0.9082 loss_corrupt=0.2771 wrong_frac=0.5028 init_acc_corrupt=0.4959 acc_corrupt_t_0p0_0p2=0.5578 corrupt_frac_t_0p0_0p2=0.2071 acc_corrupt_t_0p2_0p4=0.9991 corrupt_frac_t_0p2_0p4=0.1988 acc_corrupt_t_0p4_0p6=0.9999 corrupt_frac_t_0p4_0p6=0.1976 acc_corrupt_t_0p6_0p8=1.0000 corrupt_frac_t_0p6_0p8=0.1951 acc_corrupt_t_0p8_1p0=1.0000 corrupt_frac_t_0p8_1p0=0.2015 out_w_norm=12.9438 out_g_norm=0.4133 loss_all=0.1770 init_gold_top10=0.6054 init_gold_top100=0.6969 rollout_applied_pos_frac=0.3516 init_acc_rollout_applied=0.5426 init_acc_rollout_kept=0.4393 logit_acc_rollout_applied=0.9321 logit_acc_rollout_kept=0.9395
819
+ step=3200 epoch=3200/4000 epoch_step=1/1 micro_steps=3200 elapsed=19.0s lr=2.000000e-03 loss=0.2579 loss_recon=0.2579 loss_meanflow=0.0000 mean_model_t=0.5015 mean_corrupt_t=0.5015 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.3505 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.9130 corrupt_frac=1.0000 acc_corrupt=0.9130 loss_corrupt=0.2579 wrong_frac=0.4984 init_acc_corrupt=0.5023 acc_corrupt_t_0p0_0p2=0.5668 corrupt_frac_t_0p0_0p2=0.2005 acc_corrupt_t_0p2_0p4=0.9993 corrupt_frac_t_0p2_0p4=0.1979 acc_corrupt_t_0p4_0p6=0.9999 corrupt_frac_t_0p4_0p6=0.1994 acc_corrupt_t_0p6_0p8=1.0000 corrupt_frac_t_0p6_0p8=0.2007 acc_corrupt_t_0p8_1p0=1.0000 corrupt_frac_t_0p8_1p0=0.2015 out_w_norm=12.8972 out_g_norm=0.4063 loss_all=0.3037 init_gold_top10=0.6150 init_gold_top100=0.7113 rollout_applied_pos_frac=0.3594 init_acc_rollout_applied=0.5494 init_acc_rollout_kept=0.4824 logit_acc_rollout_applied=0.8931 logit_acc_rollout_kept=0.8928
820
+ step=3300 epoch=3300/4000 epoch_step=1/1 micro_steps=3300 elapsed=19.0s lr=2.000000e-03 loss=0.2556 loss_recon=0.2556 loss_meanflow=0.0000 mean_model_t=0.5015 mean_corrupt_t=0.5015 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.3535 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.9136 corrupt_frac=1.0000 acc_corrupt=0.9136 loss_corrupt=0.2556 wrong_frac=0.4985 init_acc_corrupt=0.5011 acc_corrupt_t_0p0_0p2=0.5569 corrupt_frac_t_0p0_0p2=0.1947 acc_corrupt_t_0p2_0p4=0.9994 corrupt_frac_t_0p2_0p4=0.2030 acc_corrupt_t_0p4_0p6=0.9999 corrupt_frac_t_0p4_0p6=0.1991 acc_corrupt_t_0p6_0p8=1.0000 corrupt_frac_t_0p6_0p8=0.2045 acc_corrupt_t_0p8_1p0=1.0000 corrupt_frac_t_0p8_1p0=0.1988 out_w_norm=12.8626 out_g_norm=0.3628 loss_all=0.2428 init_gold_top10=0.6262 init_gold_top100=0.7122 rollout_applied_pos_frac=0.4062 init_acc_rollout_applied=0.5953 init_acc_rollout_kept=0.4471 logit_acc_rollout_applied=0.8962 logit_acc_rollout_kept=0.9286
821
+ step=3400 epoch=3400/4000 epoch_step=1/1 micro_steps=3400 elapsed=19.0s lr=2.000000e-03 loss=0.2476 loss_recon=0.2476 loss_meanflow=0.0000 mean_model_t=0.4986 mean_corrupt_t=0.4986 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.3515 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.9170 corrupt_frac=1.0000 acc_corrupt=0.9170 loss_corrupt=0.2476 wrong_frac=0.5016 init_acc_corrupt=0.4994 acc_corrupt_t_0p0_0p2=0.5868 corrupt_frac_t_0p0_0p2=0.2005 acc_corrupt_t_0p2_0p4=0.9994 corrupt_frac_t_0p2_0p4=0.2001 acc_corrupt_t_0p4_0p6=0.9999 corrupt_frac_t_0p4_0p6=0.2023 acc_corrupt_t_0p6_0p8=1.0000 corrupt_frac_t_0p6_0p8=0.1992 acc_corrupt_t_0p8_1p0=1.0000 corrupt_frac_t_0p8_1p0=0.1979 out_w_norm=12.8240 out_g_norm=0.3630 loss_all=0.2311 init_gold_top10=0.6073 init_gold_top100=0.7086 rollout_applied_pos_frac=0.3281 init_acc_rollout_applied=0.6076 init_acc_rollout_kept=0.4320 logit_acc_rollout_applied=0.8999 logit_acc_rollout_kept=0.9337
822
+ step=3500 epoch=3500/4000 epoch_step=1/1 micro_steps=3500 elapsed=18.9s lr=2.000000e-03 loss=0.2571 loss_recon=0.2571 loss_meanflow=0.0000 mean_model_t=0.4977 mean_corrupt_t=0.4977 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.3470 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.9129 corrupt_frac=1.0000 acc_corrupt=0.9129 loss_corrupt=0.2571 wrong_frac=0.5023 init_acc_corrupt=0.4981 acc_corrupt_t_0p0_0p2=0.5775 corrupt_frac_t_0p0_0p2=0.2059 acc_corrupt_t_0p2_0p4=0.9995 corrupt_frac_t_0p2_0p4=0.1985 acc_corrupt_t_0p4_0p6=0.9999 corrupt_frac_t_0p4_0p6=0.1969 acc_corrupt_t_0p6_0p8=1.0000 corrupt_frac_t_0p6_0p8=0.1927 acc_corrupt_t_0p8_1p0=1.0000 corrupt_frac_t_0p8_1p0=0.2059 out_w_norm=12.7945 out_g_norm=0.3526 loss_all=0.2127 init_gold_top10=0.5645 init_gold_top100=0.6631 rollout_applied_pos_frac=0.3438 init_acc_rollout_applied=0.5340 init_acc_rollout_kept=0.4433 logit_acc_rollout_applied=0.9209 logit_acc_rollout_kept=0.9339
823
+ step=3600 epoch=3600/4000 epoch_step=1/1 micro_steps=3600 elapsed=19.1s lr=2.000000e-03 loss=0.2311 loss_recon=0.2311 loss_meanflow=0.0000 mean_model_t=0.5012 mean_corrupt_t=0.5012 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.3562 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.9220 corrupt_frac=1.0000 acc_corrupt=0.9220 loss_corrupt=0.2311 wrong_frac=0.4987 init_acc_corrupt=0.5040 acc_corrupt_t_0p0_0p2=0.6049 corrupt_frac_t_0p0_0p2=0.1972 acc_corrupt_t_0p2_0p4=0.9995 corrupt_frac_t_0p2_0p4=0.2024 acc_corrupt_t_0p4_0p6=0.9999 corrupt_frac_t_0p4_0p6=0.1990 acc_corrupt_t_0p6_0p8=1.0000 corrupt_frac_t_0p6_0p8=0.2017 acc_corrupt_t_0p8_1p0=1.0000 corrupt_frac_t_0p8_1p0=0.1997 out_w_norm=12.7752 out_g_norm=0.3081 loss_all=0.1592 init_gold_top10=0.6825 init_gold_top100=0.7468 rollout_applied_pos_frac=0.3438 init_acc_rollout_applied=0.5806 init_acc_rollout_kept=0.5727 logit_acc_rollout_applied=0.9003 logit_acc_rollout_kept=0.9709
824
+ step=3700 epoch=3700/4000 epoch_step=1/1 micro_steps=3700 elapsed=18.9s lr=2.000000e-03 loss=0.2345 loss_recon=0.2345 loss_meanflow=0.0000 mean_model_t=0.5008 mean_corrupt_t=0.5008 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.3412 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.9207 corrupt_frac=1.0000 acc_corrupt=0.9207 loss_corrupt=0.2345 wrong_frac=0.4994 init_acc_corrupt=0.5010 acc_corrupt_t_0p0_0p2=0.6098 corrupt_frac_t_0p0_0p2=0.2030 acc_corrupt_t_0p2_0p4=0.9995 corrupt_frac_t_0p2_0p4=0.1915 acc_corrupt_t_0p4_0p6=0.9999 corrupt_frac_t_0p4_0p6=0.2023 acc_corrupt_t_0p6_0p8=1.0000 corrupt_frac_t_0p6_0p8=0.2001 acc_corrupt_t_0p8_1p0=1.0000 corrupt_frac_t_0p8_1p0=0.2032 out_w_norm=12.7645 out_g_norm=0.2989 loss_all=0.3539 init_gold_top10=0.6224 init_gold_top100=0.7245 rollout_applied_pos_frac=0.3828 init_acc_rollout_applied=0.5012 init_acc_rollout_kept=0.5121 logit_acc_rollout_applied=0.8579 logit_acc_rollout_kept=0.8977
825
+ step=3800 epoch=3800/4000 epoch_step=1/1 micro_steps=3800 elapsed=19.0s lr=2.000000e-03 loss=0.2347 loss_recon=0.2347 loss_meanflow=0.0000 mean_model_t=0.4962 mean_corrupt_t=0.4962 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.3430 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.9189 corrupt_frac=1.0000 acc_corrupt=0.9189 loss_corrupt=0.2347 wrong_frac=0.5037 init_acc_corrupt=0.4948 acc_corrupt_t_0p0_0p2=0.6001 corrupt_frac_t_0p0_0p2=0.2026 acc_corrupt_t_0p2_0p4=0.9995 corrupt_frac_t_0p2_0p4=0.2020 acc_corrupt_t_0p4_0p6=0.9999 corrupt_frac_t_0p4_0p6=0.2005 acc_corrupt_t_0p6_0p8=1.0000 corrupt_frac_t_0p6_0p8=0.1965 acc_corrupt_t_0p8_1p0=1.0000 corrupt_frac_t_0p8_1p0=0.1984 out_w_norm=12.7526 out_g_norm=0.3031 loss_all=0.2269 init_gold_top10=0.5784 init_gold_top100=0.6934 rollout_applied_pos_frac=0.3047 init_acc_rollout_applied=0.4589 init_acc_rollout_kept=0.4622 logit_acc_rollout_applied=0.9234 logit_acc_rollout_kept=0.9226
826
+ step=3900 epoch=3900/4000 epoch_step=1/1 micro_steps=3900 elapsed=19.1s lr=2.000000e-03 loss=0.2202 loss_recon=0.2202 loss_meanflow=0.0000 mean_model_t=0.5040 mean_corrupt_t=0.5040 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.3546 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.9242 corrupt_frac=1.0000 acc_corrupt=0.9242 loss_corrupt=0.2202 wrong_frac=0.4960 init_acc_corrupt=0.5074 acc_corrupt_t_0p0_0p2=0.6082 corrupt_frac_t_0p0_0p2=0.1931 acc_corrupt_t_0p2_0p4=0.9993 corrupt_frac_t_0p2_0p4=0.2013 acc_corrupt_t_0p4_0p6=0.9999 corrupt_frac_t_0p4_0p6=0.2019 acc_corrupt_t_0p6_0p8=1.0000 corrupt_frac_t_0p6_0p8=0.1981 acc_corrupt_t_0p8_1p0=1.0000 corrupt_frac_t_0p8_1p0=0.2056 out_w_norm=12.7499 out_g_norm=0.3059 loss_all=0.1200 init_gold_top10=0.6612 init_gold_top100=0.7375 rollout_applied_pos_frac=0.3125 init_acc_rollout_applied=0.5376 init_acc_rollout_kept=0.5155 logit_acc_rollout_applied=0.9567 logit_acc_rollout_kept=0.9620
827
+ step=4000 epoch=4000/4000 epoch_step=1/1 micro_steps=4000 elapsed=19.0s lr=2.000000e-03 loss=0.2281 loss_recon=0.2281 loss_meanflow=0.0000 mean_model_t=0.5008 mean_corrupt_t=0.5008 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.3534 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.9216 corrupt_frac=1.0000 acc_corrupt=0.9216 loss_corrupt=0.2281 wrong_frac=0.4993 init_acc_corrupt=0.5014 acc_corrupt_t_0p0_0p2=0.6154 corrupt_frac_t_0p0_0p2=0.2034 acc_corrupt_t_0p2_0p4=0.9995 corrupt_frac_t_0p2_0p4=0.1967 acc_corrupt_t_0p4_0p6=0.9999 corrupt_frac_t_0p4_0p6=0.2010 acc_corrupt_t_0p6_0p8=1.0000 corrupt_frac_t_0p6_0p8=0.1933 acc_corrupt_t_0p8_1p0=1.0000 corrupt_frac_t_0p8_1p0=0.2055 out_w_norm=12.7635 out_g_norm=0.2545 loss_all=0.3671 init_gold_top10=0.6215 init_gold_top100=0.7234 rollout_applied_pos_frac=0.4141 init_acc_rollout_applied=0.6094 init_acc_rollout_kept=0.4381 logit_acc_rollout_applied=0.8936 logit_acc_rollout_kept=0.8557
828
+ NCCL version 2.25.1+cuda12.8
829
+ resumed_from=runs/train8_ctx1024_t5tok_p35_rand0_3_unif0_0p25_outwdm1_t5tok_ctx1024_k03_20260518_022728/latest.pt start_step=4001
830
+ {
831
+ "device": "cuda:0",
832
+ "rank": 0,
833
+ "world_size": 4,
834
+ "samples": "owt_cached_chunks:8",
835
+ "vocab_size": 2423,
836
+ "tokenizer_vocab_size": 32100,
837
+ "save_dir": "runs/train8_ctx1024_t5tok_p35_rand0_3_unif0_0p25_outwdm1_t5tok_ctx1024_k03_20260518_022728",
838
+ "batch_size": 128,
839
+ "grad_accum": 1,
840
+ "effective_batch_size": 512,
841
+ "global_batch_size": 512,
842
+ "lr_schedule": "constant_warmup",
843
+ "optimizer": "muon",
844
+ "epochs": 0.0,
845
+ "steps_per_epoch": 1,
846
+ "total_steps": 5000,
847
+ "warmup_steps": 10,
848
+ "warmup_epochs": -1.0,
849
+ "min_lr": 0.0,
850
+ "weight_decay": 0.1,
851
+ "output_weight_decay": -1.0,
852
+ "adamw_param_groups": "nanogpt",
853
+ "adam_beta1": 0.9,
854
+ "adam_beta2": 0.95,
855
+ "adam_eps": 1e-08,
856
+ "muon_impl": "legacy",
857
+ "muon_momentum": 0.95,
858
+ "muon_ns_steps": 5,
859
+ "muon_update_scale": 1.0,
860
+ "muon_nesterov": false,
861
+ "muon_width_scale": false,
862
+ "muon_grouping": "legacy_dim_ge_2",
863
+ "muon_param_count": 2523776,
864
+ "muon_adam_param_count": 8192,
865
+ "muon_param_names": [
866
+ "vocab_embed.embedding",
867
+ "sigma_map.net.0.weight",
868
+ "sigma_map.net.2.weight",
869
+ "blocks.0.attn_qkv.weight",
870
+ "blocks.0.attn_out.weight",
871
+ "blocks.0.mlp.0.weight",
872
+ "blocks.0.mlp.2.weight",
873
+ "blocks.0.adaLN_modulation.weight",
874
+ "blocks.1.attn_qkv.weight",
875
+ "blocks.1.attn_out.weight",
876
+ "blocks.1.mlp.0.weight",
877
+ "blocks.1.mlp.2.weight",
878
+ "blocks.1.adaLN_modulation.weight",
879
+ "blocks.2.attn_qkv.weight",
880
+ "blocks.2.attn_out.weight",
881
+ "blocks.2.mlp.0.weight",
882
+ "blocks.2.mlp.2.weight",
883
+ "blocks.2.adaLN_modulation.weight",
884
+ "output_layer.linear.weight",
885
+ "output_layer.adaLN_modulation.weight"
886
+ ],
887
+ "muon_adam_param_names": [
888
+ "sigma_map.net.0.bias",
889
+ "sigma_map.net.2.bias",
890
+ "blocks.0.norm1.weight",
891
+ "blocks.0.norm2.weight",
892
+ "blocks.0.mlp.0.bias",
893
+ "blocks.0.mlp.2.bias",
894
+ "blocks.0.adaLN_modulation.bias",
895
+ "blocks.1.norm1.weight",
896
+ "blocks.1.norm2.weight",
897
+ "blocks.1.mlp.0.bias",
898
+ "blocks.1.mlp.2.bias",
899
+ "blocks.1.adaLN_modulation.bias",
900
+ "blocks.2.norm1.weight",
901
+ "blocks.2.norm2.weight",
902
+ "blocks.2.mlp.0.bias",
903
+ "blocks.2.mlp.2.bias",
904
+ "blocks.2.adaLN_modulation.bias",
905
+ "output_layer.norm_final.weight",
906
+ "output_layer.adaLN_modulation.bias"
907
+ ],
908
+ "muon_effective_nesterov": false,
909
+ "muon_effective_width_scale": false,
910
+ "muon_effective_weight_decay": 0.1,
911
+ "muon_adam_fallback_nesterov": false,
912
+ "muon_adam_fallback_weight_decay": 0.1,
913
+ "ema_decay": 0.9999,
914
+ "ema_start_step": 0,
915
+ "model_type": "ddit",
916
+ "ddit_mlp_type": "gelu",
917
+ "elf_num_time_tokens": 4,
918
+ "elf_num_model_mode_tokens": 0,
919
+ "qk_norm": true,
920
+ "output_bias": false,
921
+ "output_init_std": -1.0,
922
+ "norm_type": "rmsnorm",
923
+ "target_loss": "hard_ce",
924
+ "linear_soft_target_power": 1.0,
925
+ "linear_soft_target_min_conf": 0.0,
926
+ "linear_soft_target_max_conf": 1.0,
927
+ "t_sampling_mode": "uniform",
928
+ "t_sampling_power": 1.0,
929
+ "t_sampling_eps": 0.0001,
930
+ "t_sampling_logit_mean": -1.5,
931
+ "t_sampling_logit_std": 0.8,
932
+ "dual_t": true,
933
+ "corrupt_t_mode": "same",
934
+ "corrupt_min_t": 0.0,
935
+ "corrupt_max_t": 1.0,
936
+ "prefix_block_prob": 0.0,
937
+ "prefix_block_len": 128,
938
+ "mask_ratio_floor_schedule": "none",
939
+ "dirichlet_endpoint_mode": "categorical_dual_t",
940
+ "dirichlet_semantic_t_mode": "same",
941
+ "dirichlet_semantic_t_value": 0.0,
942
+ "dirichlet_semantic_t_curve": "linear",
943
+ "dirichlet_semantic_t_power": 1.0,
944
+ "endpoint_sequence_random_prob_alpha": 0.0,
945
+ "categorical_wrong_from_full_vocab": true,
946
+ "categorical_wrong_from_batch_valid_tokens": false,
947
+ "categorical_wrong_basin_token_ids": "",
948
+ "categorical_wrong_basin_prob": 0.0,
949
+ "categorical_wrong_unigram_prob": 0.0,
950
+ "categorical_wrong_uniform_prob": 0.0,
951
+ "categorical_wrong_prob_floor": 0.0,
952
+ "categorical_wrong_corpus_unigram_path": "",
953
+ "categorical_wrong_corpus_unigram_alpha": 1.0,
954
+ "categorical_wrong_basin_shared_prob": 0.0,
955
+ "categorical_wrong_unigram_shared_prob": 0.0,
956
+ "mask_mixture_original_prob": 0.0,
957
+ "mask_mixture_lowk_prob": 0.0,
958
+ "mask_mixture_lowcorrupt_prob": 0.0,
959
+ "mask_mixture_block_prob": 0.0,
960
+ "mask_mixture_all_prob": 1.0,
961
+ "mask_mixture_lowk_clean_tokens": "0",
962
+ "mask_mixture_lowcorrupt_tokens": "1,2,4,8,16,32,64",
963
+ "mask_mixture_block_tokens": "64,128",
964
+ "simplex_bridge_sampler": "dirichlet",
965
+ "logistic_normal_sigma_min": 0.1,
966
+ "logistic_normal_sigma_max": 1.0,
967
+ "logistic_normal_tau_min": 1.0,
968
+ "logistic_normal_tau_max": 1.0,
969
+ "torch_compile": false,
970
+ "compile_mode": "max-autotune",
971
+ "state_format": "prob",
972
+ "meanflow_weight": 0.0,
973
+ "rollout_train_prob": 0.35,
974
+ "rollout_train_steps": 3,
975
+ "rollout_train_steps_min": 0,
976
+ "rollout_train_infer_steps": 1,
977
+ "rollout_train_time_mode": "sampled_path",
978
+ "rollout_train_s_dist": "uniform",
979
+ "rollout_train_s_min_frac": 0.0,
980
+ "rollout_train_s_max_frac": 0.25,
981
+ "rollout_train_s_beta_alpha": 2.0,
982
+ "rollout_train_s_beta_beta": 6.0,
983
+ "rollout_train_temp": 1.0,
984
+ "rollout_train_max_gamma": 1.0,
985
+ "rollout_train_corrupt_only": true,
986
+ "rollout_train_samplewise": true,
987
+ "rollout_train_compute_always": false,
988
+ "rollout_train_sync_t": true,
989
+ "bridge_noise_init": "logistic_normal",
990
+ "noise_sigma": -1.0,
991
+ "allow_tf32": true,
992
+ "activation_checkpointing": false,
993
+ "activation_checkpoint_interval": 1,
994
+ "activation_checkpoint_scope": "block",
995
+ "ddp_static_graph": false,
996
+ "ddp_gradient_as_bucket_view": true,
997
+ "blocking_data_transfer": false,
998
+ "dataloader_prefetch_factor": 4,
999
+ "full_train_stats": false,
1000
+ "tokenized_hf": false,
1001
+ "tokenized_pad_token": "pad",
1002
+ "elf_conditional_hf": false,
1003
+ "record_pad_truncate": false,
1004
+ "record_add_eos": false,
1005
+ "record_add_special_tokens": false,
1006
+ "record_pad_token": "pad",
1007
+ "record_shuffle_buffer": 10000,
1008
+ "wrap": true,
1009
+ "wrap_mode": "stream",
1010
+ "wrap_record_buffer_size": 200,
1011
+ "owt_cached_chunks": true,
1012
+ "owt_chunk_cache_dir": "/e2e-data/evad-tech-vla/wanghan58/data/small_benchmarks/langflow_2604_11748/openwebtext_lta_cached_chunks/t5_len1024_train8_compact_overfit",
1013
+ "owt_chunk_cache_rebuild": false,
1014
+ "owt_chunk_cache_write_batch": 4096,
1015
+ "owt_exact_repeat_per_chunk": 64,
1016
+ "online_chunk_shuffle": false,
1017
+ "online_chunk_shuffle_buffer": 10000,
1018
+ "openwebtext_split": "train_minus_100k",
1019
+ "detokenizer": "auto",
1020
+ "resolved_detokenizer": null,
1021
+ "num_workers": 0,
1022
+ "latest_every": 1000,
1023
+ "resume_path": "runs/train8_ctx1024_t5tok_p35_rand0_3_unif0_0p25_outwdm1_t5tok_ctx1024_k03_20260518_022728/latest.pt"
1024
+ }
1025
+ step=4100 epoch=4100/5000 epoch_step=1/1 micro_steps=4100 elapsed=19.8s lr=2.000000e-03 loss=0.2388 loss_recon=0.2388 loss_meanflow=0.0000 mean_model_t=0.4971 mean_corrupt_t=0.4971 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.3513 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.9192 corrupt_frac=1.0000 acc_corrupt=0.9192 loss_corrupt=0.2388 wrong_frac=0.5028 init_acc_corrupt=0.4964 acc_corrupt_t_0p0_0p2=0.6102 corrupt_frac_t_0p0_0p2=0.2071 acc_corrupt_t_0p2_0p4=0.9996 corrupt_frac_t_0p2_0p4=0.1988 acc_corrupt_t_0p4_0p6=0.9999 corrupt_frac_t_0p4_0p6=0.1976 acc_corrupt_t_0p6_0p8=1.0000 corrupt_frac_t_0p6_0p8=0.1951 acc_corrupt_t_0p8_1p0=1.0000 corrupt_frac_t_0p8_1p0=0.2015 out_w_norm=12.7665 out_g_norm=0.2375 loss_all=0.1860 init_gold_top10=0.6049 init_gold_top100=0.6969 rollout_applied_pos_frac=0.3516 init_acc_rollout_applied=0.5464 init_acc_rollout_kept=0.4393 logit_acc_rollout_applied=0.9101 logit_acc_rollout_kept=0.9438
1026
+ step=4200 epoch=4200/5000 epoch_step=1/1 micro_steps=4200 elapsed=19.0s lr=2.000000e-03 loss=0.2291 loss_recon=0.2291 loss_meanflow=0.0000 mean_model_t=0.5015 mean_corrupt_t=0.5015 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.3505 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.9221 corrupt_frac=1.0000 acc_corrupt=0.9221 loss_corrupt=0.2291 wrong_frac=0.4984 init_acc_corrupt=0.5025 acc_corrupt_t_0p0_0p2=0.6120 corrupt_frac_t_0p0_0p2=0.2005 acc_corrupt_t_0p2_0p4=0.9995 corrupt_frac_t_0p2_0p4=0.1979 acc_corrupt_t_0p4_0p6=0.9999 corrupt_frac_t_0p4_0p6=0.1994 acc_corrupt_t_0p6_0p8=1.0000 corrupt_frac_t_0p6_0p8=0.2007 acc_corrupt_t_0p8_1p0=1.0000 corrupt_frac_t_0p8_1p0=0.2015 out_w_norm=12.7798 out_g_norm=0.2175 loss_all=0.2915 init_gold_top10=0.6130 init_gold_top100=0.7113 rollout_applied_pos_frac=0.3594 init_acc_rollout_applied=0.5497 init_acc_rollout_kept=0.4824 logit_acc_rollout_applied=0.8851 logit_acc_rollout_kept=0.8986
1027
+ step=4300 epoch=4300/5000 epoch_step=1/1 micro_steps=4300 elapsed=19.0s lr=2.000000e-03 loss=0.2208 loss_recon=0.2208 loss_meanflow=0.0000 mean_model_t=0.5015 mean_corrupt_t=0.5015 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.3535 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.9235 corrupt_frac=1.0000 acc_corrupt=0.9235 loss_corrupt=0.2208 wrong_frac=0.4985 init_acc_corrupt=0.5016 acc_corrupt_t_0p0_0p2=0.6076 corrupt_frac_t_0p0_0p2=0.1947 acc_corrupt_t_0p2_0p4=0.9997 corrupt_frac_t_0p2_0p4=0.2030 acc_corrupt_t_0p4_0p6=1.0000 corrupt_frac_t_0p4_0p6=0.1991 acc_corrupt_t_0p6_0p8=1.0000 corrupt_frac_t_0p6_0p8=0.2045 acc_corrupt_t_0p8_1p0=1.0000 corrupt_frac_t_0p8_1p0=0.1988 out_w_norm=12.7930 out_g_norm=0.2144 loss_all=0.2136 init_gold_top10=0.6264 init_gold_top100=0.7122 rollout_applied_pos_frac=0.4062 init_acc_rollout_applied=0.5958 init_acc_rollout_kept=0.4471 logit_acc_rollout_applied=0.9151 logit_acc_rollout_kept=0.9313
1028
+ step=4400 epoch=4400/5000 epoch_step=1/1 micro_steps=4400 elapsed=19.0s lr=2.000000e-03 loss=0.2217 loss_recon=0.2217 loss_meanflow=0.0000 mean_model_t=0.4986 mean_corrupt_t=0.4986 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.3515 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.9240 corrupt_frac=1.0000 acc_corrupt=0.9240 loss_corrupt=0.2217 wrong_frac=0.5016 init_acc_corrupt=0.4998 acc_corrupt_t_0p0_0p2=0.6214 corrupt_frac_t_0p0_0p2=0.2005 acc_corrupt_t_0p2_0p4=0.9996 corrupt_frac_t_0p2_0p4=0.2001 acc_corrupt_t_0p4_0p6=0.9999 corrupt_frac_t_0p4_0p6=0.2023 acc_corrupt_t_0p6_0p8=0.9999 corrupt_frac_t_0p6_0p8=0.1992 acc_corrupt_t_0p8_1p0=1.0000 corrupt_frac_t_0p8_1p0=0.1979 out_w_norm=12.8076 out_g_norm=0.2126 loss_all=0.1889 init_gold_top10=0.6095 init_gold_top100=0.7086 rollout_applied_pos_frac=0.3281 init_acc_rollout_applied=0.6095 init_acc_rollout_kept=0.4320 logit_acc_rollout_applied=0.9396 logit_acc_rollout_kept=0.9343
1029
+ step=4500 epoch=4500/5000 epoch_step=1/1 micro_steps=4500 elapsed=18.9s lr=2.000000e-03 loss=0.2272 loss_recon=0.2272 loss_meanflow=0.0000 mean_model_t=0.4977 mean_corrupt_t=0.4977 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.3470 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.9222 corrupt_frac=1.0000 acc_corrupt=0.9222 loss_corrupt=0.2272 wrong_frac=0.5023 init_acc_corrupt=0.4989 acc_corrupt_t_0p0_0p2=0.6227 corrupt_frac_t_0p0_0p2=0.2059 acc_corrupt_t_0p2_0p4=0.9996 corrupt_frac_t_0p2_0p4=0.1985 acc_corrupt_t_0p4_0p6=1.0000 corrupt_frac_t_0p4_0p6=0.1969 acc_corrupt_t_0p6_0p8=1.0000 corrupt_frac_t_0p6_0p8=0.1927 acc_corrupt_t_0p8_1p0=1.0000 corrupt_frac_t_0p8_1p0=0.2059 out_w_norm=12.8130 out_g_norm=0.1987 loss_all=0.1815 init_gold_top10=0.5693 init_gold_top100=0.6631 rollout_applied_pos_frac=0.3438 init_acc_rollout_applied=0.5343 init_acc_rollout_kept=0.4433 logit_acc_rollout_applied=0.9338 logit_acc_rollout_kept=0.9350
1030
+ step=4600 epoch=4600/5000 epoch_step=1/1 micro_steps=4600 elapsed=19.1s lr=2.000000e-03 loss=0.2009 loss_recon=0.2009 loss_meanflow=0.0000 mean_model_t=0.5012 mean_corrupt_t=0.5012 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.3562 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.9311 corrupt_frac=1.0000 acc_corrupt=0.9311 loss_corrupt=0.2009 wrong_frac=0.4987 init_acc_corrupt=0.5041 acc_corrupt_t_0p0_0p2=0.6509 corrupt_frac_t_0p0_0p2=0.1972 acc_corrupt_t_0p2_0p4=0.9998 corrupt_frac_t_0p2_0p4=0.2024 acc_corrupt_t_0p4_0p6=1.0000 corrupt_frac_t_0p4_0p6=0.1990 acc_corrupt_t_0p6_0p8=1.0000 corrupt_frac_t_0p6_0p8=0.2017 acc_corrupt_t_0p8_1p0=1.0000 corrupt_frac_t_0p8_1p0=0.1997 out_w_norm=12.8104 out_g_norm=0.1711 loss_all=0.1198 init_gold_top10=0.6808 init_gold_top100=0.7468 rollout_applied_pos_frac=0.3438 init_acc_rollout_applied=0.5819 init_acc_rollout_kept=0.5727 logit_acc_rollout_applied=0.9010 logit_acc_rollout_kept=0.9871
1031
+ step=4700 epoch=4700/5000 epoch_step=1/1 micro_steps=4700 elapsed=18.8s lr=2.000000e-03 loss=0.2080 loss_recon=0.2080 loss_meanflow=0.0000 mean_model_t=0.5008 mean_corrupt_t=0.5008 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.3412 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.9287 corrupt_frac=1.0000 acc_corrupt=0.9287 loss_corrupt=0.2080 wrong_frac=0.4994 init_acc_corrupt=0.5013 acc_corrupt_t_0p0_0p2=0.6493 corrupt_frac_t_0p0_0p2=0.2030 acc_corrupt_t_0p2_0p4=0.9995 corrupt_frac_t_0p2_0p4=0.1915 acc_corrupt_t_0p4_0p6=0.9999 corrupt_frac_t_0p4_0p6=0.2023 acc_corrupt_t_0p6_0p8=1.0000 corrupt_frac_t_0p6_0p8=0.2001 acc_corrupt_t_0p8_1p0=1.0000 corrupt_frac_t_0p8_1p0=0.2032 out_w_norm=12.8228 out_g_norm=0.1683 loss_all=0.2800 init_gold_top10=0.6261 init_gold_top100=0.7245 rollout_applied_pos_frac=0.3828 init_acc_rollout_applied=0.5064 init_acc_rollout_kept=0.5121 logit_acc_rollout_applied=0.8821 logit_acc_rollout_kept=0.9122
1032
+ step=4800 epoch=4800/5000 epoch_step=1/1 micro_steps=4800 elapsed=19.0s lr=2.000000e-03 loss=0.2172 loss_recon=0.2172 loss_meanflow=0.0000 mean_model_t=0.4962 mean_corrupt_t=0.4962 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.3430 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.9244 corrupt_frac=1.0000 acc_corrupt=0.9244 loss_corrupt=0.2172 wrong_frac=0.5037 init_acc_corrupt=0.4952 acc_corrupt_t_0p0_0p2=0.6273 corrupt_frac_t_0p0_0p2=0.2026 acc_corrupt_t_0p2_0p4=0.9996 corrupt_frac_t_0p2_0p4=0.2020 acc_corrupt_t_0p4_0p6=0.9998 corrupt_frac_t_0p4_0p6=0.2005 acc_corrupt_t_0p6_0p8=1.0000 corrupt_frac_t_0p6_0p8=0.1965 acc_corrupt_t_0p8_1p0=1.0000 corrupt_frac_t_0p8_1p0=0.1984 out_w_norm=12.8338 out_g_norm=0.1678 loss_all=0.2473 init_gold_top10=0.5767 init_gold_top100=0.6934 rollout_applied_pos_frac=0.3047 init_acc_rollout_applied=0.4607 init_acc_rollout_kept=0.4622 logit_acc_rollout_applied=0.9255 logit_acc_rollout_kept=0.9273
1033
+ step=4900 epoch=4900/5000 epoch_step=1/1 micro_steps=4900 elapsed=19.1s lr=2.000000e-03 loss=0.2005 loss_recon=0.2005 loss_meanflow=0.0000 mean_model_t=0.5040 mean_corrupt_t=0.5040 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.3546 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.9313 corrupt_frac=1.0000 acc_corrupt=0.9313 loss_corrupt=0.2005 wrong_frac=0.4960 init_acc_corrupt=0.5079 acc_corrupt_t_0p0_0p2=0.6448 corrupt_frac_t_0p0_0p2=0.1931 acc_corrupt_t_0p2_0p4=0.9996 corrupt_frac_t_0p2_0p4=0.2013 acc_corrupt_t_0p4_0p6=0.9999 corrupt_frac_t_0p4_0p6=0.2019 acc_corrupt_t_0p6_0p8=0.9999 corrupt_frac_t_0p6_0p8=0.1981 acc_corrupt_t_0p8_1p0=1.0000 corrupt_frac_t_0p8_1p0=0.2056 out_w_norm=12.8373 out_g_norm=0.1631 loss_all=0.0820 init_gold_top10=0.6659 init_gold_top100=0.7375 rollout_applied_pos_frac=0.3125 init_acc_rollout_applied=0.5385 init_acc_rollout_kept=0.5155 logit_acc_rollout_applied=0.9767 logit_acc_rollout_kept=0.9712
1034
+ step=5000 epoch=5000/5000 epoch_step=1/1 micro_steps=5000 elapsed=19.0s lr=2.000000e-03 loss=0.2097 loss_recon=0.2097 loss_meanflow=0.0000 mean_model_t=0.5008 mean_corrupt_t=0.5008 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.3534 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.9280 corrupt_frac=1.0000 acc_corrupt=0.9280 loss_corrupt=0.2097 wrong_frac=0.4993 init_acc_corrupt=0.5014 acc_corrupt_t_0p0_0p2=0.6470 corrupt_frac_t_0p0_0p2=0.2034 acc_corrupt_t_0p2_0p4=0.9993 corrupt_frac_t_0p2_0p4=0.1967 acc_corrupt_t_0p4_0p6=0.9999 corrupt_frac_t_0p4_0p6=0.2010 acc_corrupt_t_0p6_0p8=1.0000 corrupt_frac_t_0p6_0p8=0.1933 acc_corrupt_t_0p8_1p0=1.0000 corrupt_frac_t_0p8_1p0=0.2055 out_w_norm=12.8513 out_g_norm=0.1607 loss_all=0.3370 init_gold_top10=0.6228 init_gold_top100=0.7234 rollout_applied_pos_frac=0.4141 init_acc_rollout_applied=0.6155 init_acc_rollout_kept=0.4381 logit_acc_rollout_applied=0.9016 logit_acc_rollout_kept=0.8619
LTA_openwebtext_dualt/logs/softendpoint_mn_pilot_4gpu/train8_ctx1024_t5tok_p50_rand0_4_unif0_0p25_outwdm1_t5tok_ctx1024_randk_20260518_014620.log ADDED
@@ -0,0 +1,196 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ NCCL version 2.25.1+cuda12.8
2
+ {
3
+ "device": "cuda:0",
4
+ "rank": 0,
5
+ "world_size": 4,
6
+ "samples": "owt_cached_chunks:8",
7
+ "vocab_size": 2423,
8
+ "tokenizer_vocab_size": 32100,
9
+ "save_dir": "runs/train8_ctx1024_t5tok_p50_rand0_4_unif0_0p25_outwdm1_t5tok_ctx1024_randk_20260518_014620",
10
+ "batch_size": 128,
11
+ "grad_accum": 1,
12
+ "effective_batch_size": 512,
13
+ "global_batch_size": 512,
14
+ "lr_schedule": "constant_warmup",
15
+ "optimizer": "muon",
16
+ "epochs": 0.0,
17
+ "steps_per_epoch": 1,
18
+ "total_steps": 1000,
19
+ "warmup_steps": 10,
20
+ "warmup_epochs": -1.0,
21
+ "min_lr": 0.0,
22
+ "weight_decay": 0.1,
23
+ "output_weight_decay": -1.0,
24
+ "adamw_param_groups": "nanogpt",
25
+ "adam_beta1": 0.9,
26
+ "adam_beta2": 0.95,
27
+ "adam_eps": 1e-08,
28
+ "muon_impl": "legacy",
29
+ "muon_momentum": 0.95,
30
+ "muon_ns_steps": 5,
31
+ "muon_update_scale": 1.0,
32
+ "muon_nesterov": false,
33
+ "muon_width_scale": false,
34
+ "muon_grouping": "legacy_dim_ge_2",
35
+ "muon_param_count": 2523776,
36
+ "muon_adam_param_count": 8192,
37
+ "muon_param_names": [
38
+ "vocab_embed.embedding",
39
+ "sigma_map.net.0.weight",
40
+ "sigma_map.net.2.weight",
41
+ "blocks.0.attn_qkv.weight",
42
+ "blocks.0.attn_out.weight",
43
+ "blocks.0.mlp.0.weight",
44
+ "blocks.0.mlp.2.weight",
45
+ "blocks.0.adaLN_modulation.weight",
46
+ "blocks.1.attn_qkv.weight",
47
+ "blocks.1.attn_out.weight",
48
+ "blocks.1.mlp.0.weight",
49
+ "blocks.1.mlp.2.weight",
50
+ "blocks.1.adaLN_modulation.weight",
51
+ "blocks.2.attn_qkv.weight",
52
+ "blocks.2.attn_out.weight",
53
+ "blocks.2.mlp.0.weight",
54
+ "blocks.2.mlp.2.weight",
55
+ "blocks.2.adaLN_modulation.weight",
56
+ "output_layer.linear.weight",
57
+ "output_layer.adaLN_modulation.weight"
58
+ ],
59
+ "muon_adam_param_names": [
60
+ "sigma_map.net.0.bias",
61
+ "sigma_map.net.2.bias",
62
+ "blocks.0.norm1.weight",
63
+ "blocks.0.norm2.weight",
64
+ "blocks.0.mlp.0.bias",
65
+ "blocks.0.mlp.2.bias",
66
+ "blocks.0.adaLN_modulation.bias",
67
+ "blocks.1.norm1.weight",
68
+ "blocks.1.norm2.weight",
69
+ "blocks.1.mlp.0.bias",
70
+ "blocks.1.mlp.2.bias",
71
+ "blocks.1.adaLN_modulation.bias",
72
+ "blocks.2.norm1.weight",
73
+ "blocks.2.norm2.weight",
74
+ "blocks.2.mlp.0.bias",
75
+ "blocks.2.mlp.2.bias",
76
+ "blocks.2.adaLN_modulation.bias",
77
+ "output_layer.norm_final.weight",
78
+ "output_layer.adaLN_modulation.bias"
79
+ ],
80
+ "muon_effective_nesterov": false,
81
+ "muon_effective_width_scale": false,
82
+ "muon_effective_weight_decay": 0.1,
83
+ "muon_adam_fallback_nesterov": false,
84
+ "muon_adam_fallback_weight_decay": 0.1,
85
+ "ema_decay": 0.9999,
86
+ "ema_start_step": 0,
87
+ "model_type": "ddit",
88
+ "ddit_mlp_type": "gelu",
89
+ "elf_num_time_tokens": 4,
90
+ "elf_num_model_mode_tokens": 0,
91
+ "qk_norm": true,
92
+ "output_bias": false,
93
+ "output_init_std": -1.0,
94
+ "norm_type": "rmsnorm",
95
+ "target_loss": "hard_ce",
96
+ "linear_soft_target_power": 1.0,
97
+ "linear_soft_target_min_conf": 0.0,
98
+ "linear_soft_target_max_conf": 1.0,
99
+ "t_sampling_mode": "uniform",
100
+ "t_sampling_power": 1.0,
101
+ "t_sampling_eps": 0.0001,
102
+ "t_sampling_logit_mean": -1.5,
103
+ "t_sampling_logit_std": 0.8,
104
+ "dual_t": true,
105
+ "corrupt_t_mode": "same",
106
+ "corrupt_min_t": 0.0,
107
+ "corrupt_max_t": 1.0,
108
+ "prefix_block_prob": 0.0,
109
+ "prefix_block_len": 128,
110
+ "mask_ratio_floor_schedule": "none",
111
+ "dirichlet_endpoint_mode": "categorical_dual_t",
112
+ "dirichlet_semantic_t_mode": "same",
113
+ "dirichlet_semantic_t_value": 0.0,
114
+ "dirichlet_semantic_t_curve": "linear",
115
+ "dirichlet_semantic_t_power": 1.0,
116
+ "endpoint_sequence_random_prob_alpha": 0.0,
117
+ "categorical_wrong_from_full_vocab": true,
118
+ "categorical_wrong_from_batch_valid_tokens": false,
119
+ "categorical_wrong_basin_token_ids": "",
120
+ "categorical_wrong_basin_prob": 0.0,
121
+ "categorical_wrong_unigram_prob": 0.0,
122
+ "categorical_wrong_uniform_prob": 0.0,
123
+ "categorical_wrong_prob_floor": 0.0,
124
+ "categorical_wrong_corpus_unigram_path": "",
125
+ "categorical_wrong_corpus_unigram_alpha": 1.0,
126
+ "categorical_wrong_basin_shared_prob": 0.0,
127
+ "categorical_wrong_unigram_shared_prob": 0.0,
128
+ "mask_mixture_original_prob": 0.0,
129
+ "mask_mixture_lowk_prob": 0.0,
130
+ "mask_mixture_lowcorrupt_prob": 0.0,
131
+ "mask_mixture_block_prob": 0.0,
132
+ "mask_mixture_all_prob": 1.0,
133
+ "mask_mixture_lowk_clean_tokens": "0",
134
+ "mask_mixture_lowcorrupt_tokens": "1,2,4,8,16,32,64",
135
+ "mask_mixture_block_tokens": "64,128",
136
+ "simplex_bridge_sampler": "dirichlet",
137
+ "logistic_normal_sigma_min": 0.1,
138
+ "logistic_normal_sigma_max": 1.0,
139
+ "logistic_normal_tau_min": 1.0,
140
+ "logistic_normal_tau_max": 1.0,
141
+ "torch_compile": false,
142
+ "compile_mode": "max-autotune",
143
+ "state_format": "prob",
144
+ "meanflow_weight": 0.0,
145
+ "rollout_train_prob": 0.5,
146
+ "rollout_train_steps": 4,
147
+ "rollout_train_steps_min": 0,
148
+ "rollout_train_infer_steps": 1,
149
+ "rollout_train_time_mode": "sampled_path",
150
+ "rollout_train_s_dist": "uniform",
151
+ "rollout_train_s_min_frac": 0.0,
152
+ "rollout_train_s_max_frac": 0.25,
153
+ "rollout_train_s_beta_alpha": 2.0,
154
+ "rollout_train_s_beta_beta": 6.0,
155
+ "rollout_train_temp": 1.0,
156
+ "rollout_train_max_gamma": 1.0,
157
+ "rollout_train_corrupt_only": true,
158
+ "rollout_train_samplewise": true,
159
+ "rollout_train_compute_always": false,
160
+ "rollout_train_sync_t": true,
161
+ "bridge_noise_init": "logistic_normal",
162
+ "noise_sigma": -1.0,
163
+ "allow_tf32": true,
164
+ "activation_checkpointing": false,
165
+ "activation_checkpoint_interval": 1,
166
+ "activation_checkpoint_scope": "block",
167
+ "ddp_static_graph": false,
168
+ "ddp_gradient_as_bucket_view": true,
169
+ "blocking_data_transfer": false,
170
+ "dataloader_prefetch_factor": 4,
171
+ "full_train_stats": false,
172
+ "tokenized_hf": false,
173
+ "tokenized_pad_token": "pad",
174
+ "elf_conditional_hf": false,
175
+ "record_pad_truncate": false,
176
+ "record_add_eos": false,
177
+ "record_add_special_tokens": false,
178
+ "record_pad_token": "pad",
179
+ "record_shuffle_buffer": 10000,
180
+ "wrap": true,
181
+ "wrap_mode": "stream",
182
+ "wrap_record_buffer_size": 200,
183
+ "owt_cached_chunks": true,
184
+ "owt_chunk_cache_dir": "/e2e-data/evad-tech-vla/wanghan58/data/small_benchmarks/langflow_2604_11748/openwebtext_lta_cached_chunks/t5_len1024_train8_compact_overfit",
185
+ "owt_chunk_cache_rebuild": false,
186
+ "owt_chunk_cache_write_batch": 4096,
187
+ "owt_exact_repeat_per_chunk": 64,
188
+ "online_chunk_shuffle": false,
189
+ "online_chunk_shuffle_buffer": 10000,
190
+ "openwebtext_split": "train_minus_100k",
191
+ "detokenizer": "auto",
192
+ "resolved_detokenizer": null,
193
+ "num_workers": 0,
194
+ "latest_every": 1000,
195
+ "resume_path": ""
196
+ }
LTA_openwebtext_dualt/logs/softendpoint_mn_pilot_4gpu/train8_ctx1024_tradeoff_p35_unif0_0p25_outwdm1_ctx1024_tradeoff_dual_20260517_225705.log ADDED
@@ -0,0 +1,1024 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ NCCL version 2.25.1+cuda12.8
2
+ {
3
+ "device": "cuda:0",
4
+ "rank": 0,
5
+ "world_size": 4,
6
+ "samples": "owt_cached_chunks:8",
7
+ "vocab_size": 2664,
8
+ "tokenizer_vocab_size": 50257,
9
+ "save_dir": "runs/train8_ctx1024_tradeoff_p35_unif0_0p25_outwdm1_ctx1024_tradeoff_dual_20260517_225705",
10
+ "batch_size": 128,
11
+ "grad_accum": 1,
12
+ "effective_batch_size": 512,
13
+ "global_batch_size": 512,
14
+ "lr_schedule": "constant_warmup",
15
+ "optimizer": "muon",
16
+ "epochs": 0.0,
17
+ "steps_per_epoch": 1,
18
+ "total_steps": 1000,
19
+ "warmup_steps": 10,
20
+ "warmup_epochs": -1.0,
21
+ "min_lr": 0.0,
22
+ "weight_decay": 0.1,
23
+ "output_weight_decay": -1.0,
24
+ "adamw_param_groups": "nanogpt",
25
+ "adam_beta1": 0.9,
26
+ "adam_beta2": 0.95,
27
+ "adam_eps": 1e-08,
28
+ "muon_impl": "legacy",
29
+ "muon_momentum": 0.95,
30
+ "muon_ns_steps": 5,
31
+ "muon_update_scale": 1.0,
32
+ "muon_nesterov": false,
33
+ "muon_width_scale": false,
34
+ "muon_grouping": "legacy_dim_ge_2",
35
+ "muon_param_count": 2616320,
36
+ "muon_adam_param_count": 8192,
37
+ "muon_param_names": [
38
+ "vocab_embed.embedding",
39
+ "sigma_map.net.0.weight",
40
+ "sigma_map.net.2.weight",
41
+ "blocks.0.attn_qkv.weight",
42
+ "blocks.0.attn_out.weight",
43
+ "blocks.0.mlp.0.weight",
44
+ "blocks.0.mlp.2.weight",
45
+ "blocks.0.adaLN_modulation.weight",
46
+ "blocks.1.attn_qkv.weight",
47
+ "blocks.1.attn_out.weight",
48
+ "blocks.1.mlp.0.weight",
49
+ "blocks.1.mlp.2.weight",
50
+ "blocks.1.adaLN_modulation.weight",
51
+ "blocks.2.attn_qkv.weight",
52
+ "blocks.2.attn_out.weight",
53
+ "blocks.2.mlp.0.weight",
54
+ "blocks.2.mlp.2.weight",
55
+ "blocks.2.adaLN_modulation.weight",
56
+ "output_layer.linear.weight",
57
+ "output_layer.adaLN_modulation.weight"
58
+ ],
59
+ "muon_adam_param_names": [
60
+ "sigma_map.net.0.bias",
61
+ "sigma_map.net.2.bias",
62
+ "blocks.0.norm1.weight",
63
+ "blocks.0.norm2.weight",
64
+ "blocks.0.mlp.0.bias",
65
+ "blocks.0.mlp.2.bias",
66
+ "blocks.0.adaLN_modulation.bias",
67
+ "blocks.1.norm1.weight",
68
+ "blocks.1.norm2.weight",
69
+ "blocks.1.mlp.0.bias",
70
+ "blocks.1.mlp.2.bias",
71
+ "blocks.1.adaLN_modulation.bias",
72
+ "blocks.2.norm1.weight",
73
+ "blocks.2.norm2.weight",
74
+ "blocks.2.mlp.0.bias",
75
+ "blocks.2.mlp.2.bias",
76
+ "blocks.2.adaLN_modulation.bias",
77
+ "output_layer.norm_final.weight",
78
+ "output_layer.adaLN_modulation.bias"
79
+ ],
80
+ "muon_effective_nesterov": false,
81
+ "muon_effective_width_scale": false,
82
+ "muon_effective_weight_decay": 0.1,
83
+ "muon_adam_fallback_nesterov": false,
84
+ "muon_adam_fallback_weight_decay": 0.1,
85
+ "ema_decay": 0.9999,
86
+ "ema_start_step": 0,
87
+ "model_type": "ddit",
88
+ "ddit_mlp_type": "gelu",
89
+ "elf_num_time_tokens": 4,
90
+ "elf_num_model_mode_tokens": 0,
91
+ "qk_norm": true,
92
+ "output_bias": false,
93
+ "output_init_std": -1.0,
94
+ "norm_type": "rmsnorm",
95
+ "target_loss": "hard_ce",
96
+ "linear_soft_target_power": 1.0,
97
+ "linear_soft_target_min_conf": 0.0,
98
+ "linear_soft_target_max_conf": 1.0,
99
+ "t_sampling_mode": "logit_normal",
100
+ "t_sampling_power": 1.0,
101
+ "t_sampling_eps": 0.0001,
102
+ "t_sampling_logit_mean": -1.5,
103
+ "t_sampling_logit_std": 0.8,
104
+ "dual_t": true,
105
+ "corrupt_t_mode": "same",
106
+ "corrupt_min_t": 0.0,
107
+ "corrupt_max_t": 1.0,
108
+ "prefix_block_prob": 0.0,
109
+ "prefix_block_len": 128,
110
+ "mask_ratio_floor_schedule": "none",
111
+ "dirichlet_endpoint_mode": "categorical_dual_t",
112
+ "dirichlet_semantic_t_mode": "same",
113
+ "dirichlet_semantic_t_value": 0.0,
114
+ "dirichlet_semantic_t_curve": "linear",
115
+ "dirichlet_semantic_t_power": 1.0,
116
+ "endpoint_sequence_random_prob_alpha": 0.0,
117
+ "categorical_wrong_from_full_vocab": true,
118
+ "categorical_wrong_from_batch_valid_tokens": false,
119
+ "categorical_wrong_basin_token_ids": "",
120
+ "categorical_wrong_basin_prob": 0.0,
121
+ "categorical_wrong_unigram_prob": 0.0,
122
+ "categorical_wrong_uniform_prob": 0.0,
123
+ "categorical_wrong_prob_floor": 0.0,
124
+ "categorical_wrong_corpus_unigram_path": "",
125
+ "categorical_wrong_corpus_unigram_alpha": 1.0,
126
+ "categorical_wrong_basin_shared_prob": 0.0,
127
+ "categorical_wrong_unigram_shared_prob": 0.0,
128
+ "mask_mixture_original_prob": 0.0,
129
+ "mask_mixture_lowk_prob": 0.0,
130
+ "mask_mixture_lowcorrupt_prob": 0.0,
131
+ "mask_mixture_block_prob": 0.0,
132
+ "mask_mixture_all_prob": 1.0,
133
+ "mask_mixture_lowk_clean_tokens": "0",
134
+ "mask_mixture_lowcorrupt_tokens": "1,2,4,8,16,32,64",
135
+ "mask_mixture_block_tokens": "64,128",
136
+ "simplex_bridge_sampler": "dirichlet",
137
+ "logistic_normal_sigma_min": 0.1,
138
+ "logistic_normal_sigma_max": 1.0,
139
+ "logistic_normal_tau_min": 1.0,
140
+ "logistic_normal_tau_max": 1.0,
141
+ "torch_compile": false,
142
+ "compile_mode": "max-autotune",
143
+ "state_format": "prob",
144
+ "meanflow_weight": 0.0,
145
+ "rollout_train_prob": 0.35,
146
+ "rollout_train_steps": 1,
147
+ "rollout_train_infer_steps": 1,
148
+ "rollout_train_time_mode": "sampled_s",
149
+ "rollout_train_s_dist": "uniform",
150
+ "rollout_train_s_min_frac": 0.0,
151
+ "rollout_train_s_max_frac": 0.25,
152
+ "rollout_train_s_beta_alpha": 2.0,
153
+ "rollout_train_s_beta_beta": 6.0,
154
+ "rollout_train_temp": 1.45,
155
+ "rollout_train_max_gamma": 1.0,
156
+ "rollout_train_corrupt_only": true,
157
+ "rollout_train_samplewise": true,
158
+ "rollout_train_compute_always": false,
159
+ "rollout_train_sync_t": true,
160
+ "bridge_noise_init": "logistic_normal",
161
+ "noise_sigma": -1.0,
162
+ "allow_tf32": true,
163
+ "activation_checkpointing": false,
164
+ "activation_checkpoint_interval": 1,
165
+ "activation_checkpoint_scope": "block",
166
+ "ddp_static_graph": false,
167
+ "ddp_gradient_as_bucket_view": true,
168
+ "blocking_data_transfer": false,
169
+ "dataloader_prefetch_factor": 4,
170
+ "full_train_stats": false,
171
+ "tokenized_hf": false,
172
+ "tokenized_pad_token": "pad",
173
+ "elf_conditional_hf": false,
174
+ "record_pad_truncate": false,
175
+ "record_add_eos": false,
176
+ "record_add_special_tokens": false,
177
+ "record_pad_token": "pad",
178
+ "record_shuffle_buffer": 10000,
179
+ "wrap": true,
180
+ "wrap_mode": "stream",
181
+ "wrap_record_buffer_size": 200,
182
+ "owt_cached_chunks": true,
183
+ "owt_chunk_cache_dir": "/e2e-data/evad-tech-vla/wanghan58/data/small_benchmarks/langflow_2604_11748/openwebtext_lta_cached_chunks/gpt2_len1024_train8_compact_overfit",
184
+ "owt_chunk_cache_rebuild": false,
185
+ "owt_chunk_cache_write_batch": 4096,
186
+ "owt_exact_repeat_per_chunk": 64,
187
+ "online_chunk_shuffle": false,
188
+ "online_chunk_shuffle_buffer": 10000,
189
+ "openwebtext_split": "train_minus_100k",
190
+ "detokenizer": "auto",
191
+ "resolved_detokenizer": null,
192
+ "num_workers": 0,
193
+ "latest_every": 1000,
194
+ "resume_path": ""
195
+ }
196
+ step=100 epoch=100/1000 epoch_step=1/1 micro_steps=100 elapsed=14.5s lr=2.000000e-03 loss=7.7225 loss_recon=7.7225 loss_meanflow=0.0000 mean_model_t=0.2070 mean_corrupt_t=0.2070 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.3566 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.0944 corrupt_frac=1.0000 acc_corrupt=0.0944 loss_corrupt=7.7225 wrong_frac=0.7930 init_acc_corrupt=0.1137 acc_corrupt_t_0p0_0p2=0.0502 corrupt_frac_t_0p0_0p2=0.5646 acc_corrupt_t_0p2_0p4=0.1261 corrupt_frac_t_0p2_0p4=0.3537 acc_corrupt_t_0p4_0p6=0.2510 corrupt_frac_t_0p4_0p6=0.0731 acc_corrupt_t_0p6_0p8=0.3593 corrupt_frac_t_0p6_0p8=0.0131 out_w_norm=1.0070 out_g_norm=1.0880 acc_corrupt_t_0p8_1p0=0.4619 corrupt_frac_t_0p8_1p0=0.0078 loss_all=7.4683 init_gold_top10=0.1940 init_gold_top100=0.3863 rollout_applied_pos_frac=0.3750 init_acc_rollout_applied=0.1046 init_acc_rollout_kept=0.1284 logit_acc_rollout_applied=0.0924 logit_acc_rollout_kept=0.1034
197
+ step=200 epoch=200/1000 epoch_step=1/1 micro_steps=200 elapsed=13.5s lr=2.000000e-03 loss=7.0875 loss_recon=7.0875 loss_meanflow=0.0000 mean_model_t=0.2093 mean_corrupt_t=0.2093 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.3467 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.1033 corrupt_frac=1.0000 acc_corrupt=0.1033 loss_corrupt=7.0875 wrong_frac=0.7905 init_acc_corrupt=0.1168 acc_corrupt_t_0p0_0p2=0.0558 corrupt_frac_t_0p0_0p2=0.5550 acc_corrupt_t_0p2_0p4=0.1391 corrupt_frac_t_0p2_0p4=0.3600 acc_corrupt_t_0p4_0p6=0.2535 corrupt_frac_t_0p4_0p6=0.0760 acc_corrupt_t_0p6_0p8=0.3377 corrupt_frac_t_0p6_0p8=0.0131 out_w_norm=2.8606 out_g_norm=1.7741 acc_corrupt_t_0p8_1p0=0.2988 corrupt_frac_t_0p8_1p0=0.0078 loss_all=6.6883 init_gold_top10=0.2127 init_gold_top100=0.4090 rollout_applied_pos_frac=0.3828 init_acc_rollout_applied=0.1394 init_acc_rollout_kept=0.1165 logit_acc_rollout_applied=0.1224 logit_acc_rollout_kept=0.1105
198
+ step=300 epoch=300/1000 epoch_step=1/1 micro_steps=300 elapsed=13.5s lr=2.000000e-03 loss=6.4572 loss_recon=6.4572 loss_meanflow=0.0000 mean_model_t=0.2077 mean_corrupt_t=0.2077 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.3452 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.1128 corrupt_frac=1.0000 acc_corrupt=0.1128 loss_corrupt=6.4572 wrong_frac=0.7923 init_acc_corrupt=0.1157 acc_corrupt_t_0p0_0p2=0.0592 corrupt_frac_t_0p0_0p2=0.5677 acc_corrupt_t_0p2_0p4=0.1556 corrupt_frac_t_0p2_0p4=0.3483 acc_corrupt_t_0p4_0p6=0.2841 corrupt_frac_t_0p4_0p6=0.0743 acc_corrupt_t_0p6_0p8=0.3918 corrupt_frac_t_0p6_0p8=0.0139 out_w_norm=4.3503 out_g_norm=1.3203 acc_corrupt_t_0p8_1p0=0.5454 corrupt_frac_t_0p8_1p0=0.0078 loss_all=6.2518 init_gold_top10=0.1851 init_gold_top100=0.3919 rollout_applied_pos_frac=0.2891 init_acc_rollout_applied=0.0706 init_acc_rollout_kept=0.1125 logit_acc_rollout_applied=0.0955 logit_acc_rollout_kept=0.1151
199
+ step=400 epoch=400/1000 epoch_step=1/1 micro_steps=400 elapsed=13.6s lr=2.000000e-03 loss=5.9689 loss_recon=5.9689 loss_meanflow=0.0000 mean_model_t=0.2094 mean_corrupt_t=0.2094 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.3509 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.1251 corrupt_frac=1.0000 acc_corrupt=0.1251 loss_corrupt=5.9689 wrong_frac=0.7904 init_acc_corrupt=0.1177 acc_corrupt_t_0p0_0p2=0.0635 corrupt_frac_t_0p0_0p2=0.5561 acc_corrupt_t_0p2_0p4=0.1711 corrupt_frac_t_0p2_0p4=0.3568 acc_corrupt_t_0p4_0p6=0.3157 corrupt_frac_t_0p4_0p6=0.0770 acc_corrupt_t_0p6_0p8=0.4369 corrupt_frac_t_0p6_0p8=0.0138 out_w_norm=5.5056 out_g_norm=0.5242 acc_corrupt_t_0p8_1p0=0.5684 corrupt_frac_t_0p8_1p0=0.0078 loss_all=5.7489 init_gold_top10=0.1987 init_gold_top100=0.4110 rollout_applied_pos_frac=0.2891 init_acc_rollout_applied=0.1260 init_acc_rollout_kept=0.1193 logit_acc_rollout_applied=0.1379 logit_acc_rollout_kept=0.1298
200
+ step=500 epoch=500/1000 epoch_step=1/1 micro_steps=500 elapsed=13.6s lr=2.000000e-03 loss=5.4717 loss_recon=5.4717 loss_meanflow=0.0000 mean_model_t=0.2075 mean_corrupt_t=0.2075 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.3543 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.1360 corrupt_frac=1.0000 acc_corrupt=0.1360 loss_corrupt=5.4717 wrong_frac=0.7924 init_acc_corrupt=0.1157 acc_corrupt_t_0p0_0p2=0.0675 corrupt_frac_t_0p0_0p2=0.5574 acc_corrupt_t_0p2_0p4=0.1901 corrupt_frac_t_0p2_0p4=0.3599 acc_corrupt_t_0p4_0p6=0.3503 corrupt_frac_t_0p4_0p6=0.0752 acc_corrupt_t_0p6_0p8=0.4852 corrupt_frac_t_0p6_0p8=0.0118 out_w_norm=6.7726 out_g_norm=0.4126 acc_corrupt_t_0p8_1p0=0.6328 corrupt_frac_t_0p8_1p0=0.0078 loss_all=5.1699 init_gold_top10=0.2023 init_gold_top100=0.4501 rollout_applied_pos_frac=0.3594 init_acc_rollout_applied=0.1088 init_acc_rollout_kept=0.1267 logit_acc_rollout_applied=0.1348 logit_acc_rollout_kept=0.1538
201
+ step=600 epoch=600/1000 epoch_step=1/1 micro_steps=600 elapsed=13.5s lr=2.000000e-03 loss=4.8275 loss_recon=4.8275 loss_meanflow=0.0000 mean_model_t=0.2080 mean_corrupt_t=0.2080 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.3501 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.1514 corrupt_frac=1.0000 acc_corrupt=0.1514 loss_corrupt=4.8275 wrong_frac=0.7919 init_acc_corrupt=0.1154 acc_corrupt_t_0p0_0p2=0.0729 corrupt_frac_t_0p0_0p2=0.5587 acc_corrupt_t_0p2_0p4=0.2141 corrupt_frac_t_0p2_0p4=0.3601 acc_corrupt_t_0p4_0p6=0.3970 corrupt_frac_t_0p4_0p6=0.0734 acc_corrupt_t_0p6_0p8=0.5683 corrupt_frac_t_0p6_0p8=0.0131 out_w_norm=8.0690 out_g_norm=0.4267 acc_corrupt_t_0p8_1p0=0.6172 corrupt_frac_t_0p8_1p0=0.0078 loss_all=4.3866 init_gold_top10=0.2056 init_gold_top100=0.5031 rollout_applied_pos_frac=0.3438 init_acc_rollout_applied=0.1209 init_acc_rollout_kept=0.1135 logit_acc_rollout_applied=0.1647 logit_acc_rollout_kept=0.1675
202
+ step=700 epoch=700/1000 epoch_step=1/1 micro_steps=700 elapsed=13.5s lr=2.000000e-03 loss=4.2103 loss_recon=4.2103 loss_meanflow=0.0000 mean_model_t=0.2082 mean_corrupt_t=0.2082 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.3470 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.1766 corrupt_frac=1.0000 acc_corrupt=0.1766 loss_corrupt=4.2103 wrong_frac=0.7918 init_acc_corrupt=0.1171 acc_corrupt_t_0p0_0p2=0.0787 corrupt_frac_t_0p0_0p2=0.5605 acc_corrupt_t_0p2_0p4=0.2543 corrupt_frac_t_0p2_0p4=0.3566 acc_corrupt_t_0p4_0p6=0.4862 corrupt_frac_t_0p4_0p6=0.0748 acc_corrupt_t_0p6_0p8=0.6680 corrupt_frac_t_0p6_0p8=0.0125 out_w_norm=9.2577 out_g_norm=0.4974 acc_corrupt_t_0p8_1p0=0.8032 corrupt_frac_t_0p8_1p0=0.0078 loss_all=3.9837 init_gold_top10=0.2096 init_gold_top100=0.5203 rollout_applied_pos_frac=0.3516 init_acc_rollout_applied=0.1317 init_acc_rollout_kept=0.1155 logit_acc_rollout_applied=0.2035 logit_acc_rollout_kept=0.1879
203
+ step=800 epoch=800/1000 epoch_step=1/1 micro_steps=800 elapsed=13.5s lr=2.000000e-03 loss=3.7190 loss_recon=3.7190 loss_meanflow=0.0000 mean_model_t=0.2085 mean_corrupt_t=0.2085 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.3430 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.1987 corrupt_frac=1.0000 acc_corrupt=0.1987 loss_corrupt=3.7190 wrong_frac=0.7914 init_acc_corrupt=0.1172 acc_corrupt_t_0p0_0p2=0.0885 corrupt_frac_t_0p0_0p2=0.5616 acc_corrupt_t_0p2_0p4=0.2933 corrupt_frac_t_0p2_0p4=0.3564 acc_corrupt_t_0p4_0p6=0.5263 corrupt_frac_t_0p4_0p6=0.0741 acc_corrupt_t_0p6_0p8=0.6865 corrupt_frac_t_0p6_0p8=0.0125 out_w_norm=10.1103 out_g_norm=0.6562 loss_all=3.4861 init_gold_top10=0.2151 init_gold_top100=0.5456 rollout_applied_pos_frac=0.3594 init_acc_rollout_applied=0.1216 init_acc_rollout_kept=0.1331 logit_acc_rollout_applied=0.2087 logit_acc_rollout_kept=0.2242
204
+ step=900 epoch=900/1000 epoch_step=1/1 micro_steps=900 elapsed=13.6s lr=2.000000e-03 loss=3.3010 loss_recon=3.3010 loss_meanflow=0.0000 mean_model_t=0.2097 mean_corrupt_t=0.2097 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.3524 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.2325 corrupt_frac=1.0000 acc_corrupt=0.2325 loss_corrupt=3.3010 wrong_frac=0.7903 init_acc_corrupt=0.1189 acc_corrupt_t_0p0_0p2=0.1019 corrupt_frac_t_0p0_0p2=0.5486 acc_corrupt_t_0p2_0p4=0.3438 corrupt_frac_t_0p2_0p4=0.3675 acc_corrupt_t_0p4_0p6=0.5826 corrupt_frac_t_0p4_0p6=0.0754 out_w_norm=10.6246 out_g_norm=0.8460 acc_corrupt_t_0p6_0p8=0.7352 corrupt_frac_t_0p6_0p8=0.0131 acc_corrupt_t_0p8_1p0=0.8434 corrupt_frac_t_0p8_1p0=0.0078 loss_all=3.1073 init_gold_top10=0.2286 init_gold_top100=0.5535 rollout_applied_pos_frac=0.3594 init_acc_rollout_applied=0.1193 init_acc_rollout_kept=0.1251 logit_acc_rollout_applied=0.2425 logit_acc_rollout_kept=0.2695
205
+ step=1000 epoch=1000/1000 epoch_step=1/1 micro_steps=1000 elapsed=13.5s lr=2.000000e-03 loss=2.9096 loss_recon=2.9096 loss_meanflow=0.0000 mean_model_t=0.2090 mean_corrupt_t=0.2090 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.3507 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.2852 corrupt_frac=1.0000 acc_corrupt=0.2852 loss_corrupt=2.9096 wrong_frac=0.7909 init_acc_corrupt=0.1190 acc_corrupt_t_0p0_0p2=0.1272 corrupt_frac_t_0p0_0p2=0.5551 acc_corrupt_t_0p2_0p4=0.4349 corrupt_frac_t_0p2_0p4=0.3608 acc_corrupt_t_0p4_0p6=0.6706 corrupt_frac_t_0p4_0p6=0.0745 acc_corrupt_t_0p6_0p8=0.8008 corrupt_frac_t_0p6_0p8=0.0131 out_w_norm=10.9832 out_g_norm=1.0880 acc_corrupt_t_0p8_1p0=0.8379 corrupt_frac_t_0p8_1p0=0.0078 loss_all=2.7501 init_gold_top10=0.2276 init_gold_top100=0.5129 rollout_applied_pos_frac=0.3047 init_acc_rollout_applied=0.0947 init_acc_rollout_kept=0.1007 logit_acc_rollout_applied=0.2757 logit_acc_rollout_kept=0.3128
206
+ NCCL version 2.25.1+cuda12.8
207
+ resumed_from=runs/train8_ctx1024_tradeoff_p35_unif0_0p25_outwdm1_ctx1024_tradeoff_dual_20260517_225705/latest.pt start_step=1001
208
+ {
209
+ "device": "cuda:0",
210
+ "rank": 0,
211
+ "world_size": 4,
212
+ "samples": "owt_cached_chunks:8",
213
+ "vocab_size": 2664,
214
+ "tokenizer_vocab_size": 50257,
215
+ "save_dir": "runs/train8_ctx1024_tradeoff_p35_unif0_0p25_outwdm1_ctx1024_tradeoff_dual_20260517_225705",
216
+ "batch_size": 128,
217
+ "grad_accum": 1,
218
+ "effective_batch_size": 512,
219
+ "global_batch_size": 512,
220
+ "lr_schedule": "constant_warmup",
221
+ "optimizer": "muon",
222
+ "epochs": 0.0,
223
+ "steps_per_epoch": 1,
224
+ "total_steps": 2000,
225
+ "warmup_steps": 10,
226
+ "warmup_epochs": -1.0,
227
+ "min_lr": 0.0,
228
+ "weight_decay": 0.1,
229
+ "output_weight_decay": -1.0,
230
+ "adamw_param_groups": "nanogpt",
231
+ "adam_beta1": 0.9,
232
+ "adam_beta2": 0.95,
233
+ "adam_eps": 1e-08,
234
+ "muon_impl": "legacy",
235
+ "muon_momentum": 0.95,
236
+ "muon_ns_steps": 5,
237
+ "muon_update_scale": 1.0,
238
+ "muon_nesterov": false,
239
+ "muon_width_scale": false,
240
+ "muon_grouping": "legacy_dim_ge_2",
241
+ "muon_param_count": 2616320,
242
+ "muon_adam_param_count": 8192,
243
+ "muon_param_names": [
244
+ "vocab_embed.embedding",
245
+ "sigma_map.net.0.weight",
246
+ "sigma_map.net.2.weight",
247
+ "blocks.0.attn_qkv.weight",
248
+ "blocks.0.attn_out.weight",
249
+ "blocks.0.mlp.0.weight",
250
+ "blocks.0.mlp.2.weight",
251
+ "blocks.0.adaLN_modulation.weight",
252
+ "blocks.1.attn_qkv.weight",
253
+ "blocks.1.attn_out.weight",
254
+ "blocks.1.mlp.0.weight",
255
+ "blocks.1.mlp.2.weight",
256
+ "blocks.1.adaLN_modulation.weight",
257
+ "blocks.2.attn_qkv.weight",
258
+ "blocks.2.attn_out.weight",
259
+ "blocks.2.mlp.0.weight",
260
+ "blocks.2.mlp.2.weight",
261
+ "blocks.2.adaLN_modulation.weight",
262
+ "output_layer.linear.weight",
263
+ "output_layer.adaLN_modulation.weight"
264
+ ],
265
+ "muon_adam_param_names": [
266
+ "sigma_map.net.0.bias",
267
+ "sigma_map.net.2.bias",
268
+ "blocks.0.norm1.weight",
269
+ "blocks.0.norm2.weight",
270
+ "blocks.0.mlp.0.bias",
271
+ "blocks.0.mlp.2.bias",
272
+ "blocks.0.adaLN_modulation.bias",
273
+ "blocks.1.norm1.weight",
274
+ "blocks.1.norm2.weight",
275
+ "blocks.1.mlp.0.bias",
276
+ "blocks.1.mlp.2.bias",
277
+ "blocks.1.adaLN_modulation.bias",
278
+ "blocks.2.norm1.weight",
279
+ "blocks.2.norm2.weight",
280
+ "blocks.2.mlp.0.bias",
281
+ "blocks.2.mlp.2.bias",
282
+ "blocks.2.adaLN_modulation.bias",
283
+ "output_layer.norm_final.weight",
284
+ "output_layer.adaLN_modulation.bias"
285
+ ],
286
+ "muon_effective_nesterov": false,
287
+ "muon_effective_width_scale": false,
288
+ "muon_effective_weight_decay": 0.1,
289
+ "muon_adam_fallback_nesterov": false,
290
+ "muon_adam_fallback_weight_decay": 0.1,
291
+ "ema_decay": 0.9999,
292
+ "ema_start_step": 0,
293
+ "model_type": "ddit",
294
+ "ddit_mlp_type": "gelu",
295
+ "elf_num_time_tokens": 4,
296
+ "elf_num_model_mode_tokens": 0,
297
+ "qk_norm": true,
298
+ "output_bias": false,
299
+ "output_init_std": -1.0,
300
+ "norm_type": "rmsnorm",
301
+ "target_loss": "hard_ce",
302
+ "linear_soft_target_power": 1.0,
303
+ "linear_soft_target_min_conf": 0.0,
304
+ "linear_soft_target_max_conf": 1.0,
305
+ "t_sampling_mode": "logit_normal",
306
+ "t_sampling_power": 1.0,
307
+ "t_sampling_eps": 0.0001,
308
+ "t_sampling_logit_mean": -1.5,
309
+ "t_sampling_logit_std": 0.8,
310
+ "dual_t": true,
311
+ "corrupt_t_mode": "same",
312
+ "corrupt_min_t": 0.0,
313
+ "corrupt_max_t": 1.0,
314
+ "prefix_block_prob": 0.0,
315
+ "prefix_block_len": 128,
316
+ "mask_ratio_floor_schedule": "none",
317
+ "dirichlet_endpoint_mode": "categorical_dual_t",
318
+ "dirichlet_semantic_t_mode": "same",
319
+ "dirichlet_semantic_t_value": 0.0,
320
+ "dirichlet_semantic_t_curve": "linear",
321
+ "dirichlet_semantic_t_power": 1.0,
322
+ "endpoint_sequence_random_prob_alpha": 0.0,
323
+ "categorical_wrong_from_full_vocab": true,
324
+ "categorical_wrong_from_batch_valid_tokens": false,
325
+ "categorical_wrong_basin_token_ids": "",
326
+ "categorical_wrong_basin_prob": 0.0,
327
+ "categorical_wrong_unigram_prob": 0.0,
328
+ "categorical_wrong_uniform_prob": 0.0,
329
+ "categorical_wrong_prob_floor": 0.0,
330
+ "categorical_wrong_corpus_unigram_path": "",
331
+ "categorical_wrong_corpus_unigram_alpha": 1.0,
332
+ "categorical_wrong_basin_shared_prob": 0.0,
333
+ "categorical_wrong_unigram_shared_prob": 0.0,
334
+ "mask_mixture_original_prob": 0.0,
335
+ "mask_mixture_lowk_prob": 0.0,
336
+ "mask_mixture_lowcorrupt_prob": 0.0,
337
+ "mask_mixture_block_prob": 0.0,
338
+ "mask_mixture_all_prob": 1.0,
339
+ "mask_mixture_lowk_clean_tokens": "0",
340
+ "mask_mixture_lowcorrupt_tokens": "1,2,4,8,16,32,64",
341
+ "mask_mixture_block_tokens": "64,128",
342
+ "simplex_bridge_sampler": "dirichlet",
343
+ "logistic_normal_sigma_min": 0.1,
344
+ "logistic_normal_sigma_max": 1.0,
345
+ "logistic_normal_tau_min": 1.0,
346
+ "logistic_normal_tau_max": 1.0,
347
+ "torch_compile": false,
348
+ "compile_mode": "max-autotune",
349
+ "state_format": "prob",
350
+ "meanflow_weight": 0.0,
351
+ "rollout_train_prob": 0.35,
352
+ "rollout_train_steps": 1,
353
+ "rollout_train_infer_steps": 1,
354
+ "rollout_train_time_mode": "sampled_s",
355
+ "rollout_train_s_dist": "uniform",
356
+ "rollout_train_s_min_frac": 0.0,
357
+ "rollout_train_s_max_frac": 0.25,
358
+ "rollout_train_s_beta_alpha": 2.0,
359
+ "rollout_train_s_beta_beta": 6.0,
360
+ "rollout_train_temp": 1.45,
361
+ "rollout_train_max_gamma": 1.0,
362
+ "rollout_train_corrupt_only": true,
363
+ "rollout_train_samplewise": true,
364
+ "rollout_train_compute_always": false,
365
+ "rollout_train_sync_t": true,
366
+ "bridge_noise_init": "logistic_normal",
367
+ "noise_sigma": -1.0,
368
+ "allow_tf32": true,
369
+ "activation_checkpointing": false,
370
+ "activation_checkpoint_interval": 1,
371
+ "activation_checkpoint_scope": "block",
372
+ "ddp_static_graph": false,
373
+ "ddp_gradient_as_bucket_view": true,
374
+ "blocking_data_transfer": false,
375
+ "dataloader_prefetch_factor": 4,
376
+ "full_train_stats": false,
377
+ "tokenized_hf": false,
378
+ "tokenized_pad_token": "pad",
379
+ "elf_conditional_hf": false,
380
+ "record_pad_truncate": false,
381
+ "record_add_eos": false,
382
+ "record_add_special_tokens": false,
383
+ "record_pad_token": "pad",
384
+ "record_shuffle_buffer": 10000,
385
+ "wrap": true,
386
+ "wrap_mode": "stream",
387
+ "wrap_record_buffer_size": 200,
388
+ "owt_cached_chunks": true,
389
+ "owt_chunk_cache_dir": "/e2e-data/evad-tech-vla/wanghan58/data/small_benchmarks/langflow_2604_11748/openwebtext_lta_cached_chunks/gpt2_len1024_train8_compact_overfit",
390
+ "owt_chunk_cache_rebuild": false,
391
+ "owt_chunk_cache_write_batch": 4096,
392
+ "owt_exact_repeat_per_chunk": 64,
393
+ "online_chunk_shuffle": false,
394
+ "online_chunk_shuffle_buffer": 10000,
395
+ "openwebtext_split": "train_minus_100k",
396
+ "detokenizer": "auto",
397
+ "resolved_detokenizer": null,
398
+ "num_workers": 0,
399
+ "latest_every": 1000,
400
+ "resume_path": "runs/train8_ctx1024_tradeoff_p35_unif0_0p25_outwdm1_ctx1024_tradeoff_dual_20260517_225705/latest.pt"
401
+ }
402
+ step=1100 epoch=1100/2000 epoch_step=1/1 micro_steps=1100 elapsed=14.5s lr=2.000000e-03 loss=2.5661 loss_recon=2.5661 loss_meanflow=0.0000 mean_model_t=0.2070 mean_corrupt_t=0.2070 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.3566 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.3462 corrupt_frac=1.0000 acc_corrupt=0.3462 loss_corrupt=2.5661 wrong_frac=0.7930 init_acc_corrupt=0.1172 acc_corrupt_t_0p0_0p2=0.1600 corrupt_frac_t_0p0_0p2=0.5646 acc_corrupt_t_0p2_0p4=0.5433 corrupt_frac_t_0p2_0p4=0.3537 acc_corrupt_t_0p4_0p6=0.7710 corrupt_frac_t_0p4_0p6=0.0731 acc_corrupt_t_0p6_0p8=0.8578 corrupt_frac_t_0p6_0p8=0.0131 out_w_norm=11.2564 out_g_norm=1.2812 acc_corrupt_t_0p8_1p0=0.9385 corrupt_frac_t_0p8_1p0=0.0078 loss_all=2.4453 init_gold_top10=0.2908 init_gold_top100=0.5703 rollout_applied_pos_frac=0.3750 init_acc_rollout_applied=0.1161 init_acc_rollout_kept=0.1284 logit_acc_rollout_applied=0.3787 logit_acc_rollout_kept=0.3859
403
+ step=1200 epoch=1200/2000 epoch_step=1/1 micro_steps=1200 elapsed=13.5s lr=2.000000e-03 loss=2.2222 loss_recon=2.2222 loss_meanflow=0.0000 mean_model_t=0.2093 mean_corrupt_t=0.2093 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.3467 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.4209 corrupt_frac=1.0000 acc_corrupt=0.4209 loss_corrupt=2.2222 wrong_frac=0.7905 init_acc_corrupt=0.1209 acc_corrupt_t_0p0_0p2=0.1983 corrupt_frac_t_0p0_0p2=0.5550 acc_corrupt_t_0p2_0p4=0.6590 corrupt_frac_t_0p2_0p4=0.3600 acc_corrupt_t_0p4_0p6=0.8595 corrupt_frac_t_0p4_0p6=0.0760 acc_corrupt_t_0p6_0p8=0.9181 corrupt_frac_t_0p6_0p8=0.0131 out_w_norm=11.5179 out_g_norm=1.4931 acc_corrupt_t_0p8_1p0=0.9355 corrupt_frac_t_0p8_1p0=0.0078 loss_all=1.7875 init_gold_top10=0.3507 init_gold_top100=0.5781 rollout_applied_pos_frac=0.3828 init_acc_rollout_applied=0.1554 init_acc_rollout_kept=0.1165 logit_acc_rollout_applied=0.5391 logit_acc_rollout_kept=0.4889
404
+ step=1300 epoch=1300/2000 epoch_step=1/1 micro_steps=1300 elapsed=13.5s lr=2.000000e-03 loss=1.9908 loss_recon=1.9908 loss_meanflow=0.0000 mean_model_t=0.2077 mean_corrupt_t=0.2077 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.3452 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.4756 corrupt_frac=1.0000 acc_corrupt=0.4756 loss_corrupt=1.9908 wrong_frac=0.7923 init_acc_corrupt=0.1205 acc_corrupt_t_0p0_0p2=0.2363 corrupt_frac_t_0p0_0p2=0.5677 acc_corrupt_t_0p2_0p4=0.7568 corrupt_frac_t_0p2_0p4=0.3483 acc_corrupt_t_0p4_0p6=0.9233 corrupt_frac_t_0p4_0p6=0.0743 acc_corrupt_t_0p6_0p8=0.9486 corrupt_frac_t_0p6_0p8=0.0139 out_w_norm=11.7426 out_g_norm=1.6005 acc_corrupt_t_0p8_1p0=0.9565 corrupt_frac_t_0p8_1p0=0.0078 loss_all=1.8900 init_gold_top10=0.3031 init_gold_top100=0.5160 rollout_applied_pos_frac=0.2891 init_acc_rollout_applied=0.0895 init_acc_rollout_kept=0.1125 logit_acc_rollout_applied=0.5288 logit_acc_rollout_kept=0.4775
405
+ step=1400 epoch=1400/2000 epoch_step=1/1 micro_steps=1400 elapsed=13.6s lr=2.000000e-03 loss=1.7732 loss_recon=1.7732 loss_meanflow=0.0000 mean_model_t=0.2094 mean_corrupt_t=0.2094 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.3509 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.5297 corrupt_frac=1.0000 acc_corrupt=0.5297 loss_corrupt=1.7732 wrong_frac=0.7904 init_acc_corrupt=0.1238 acc_corrupt_t_0p0_0p2=0.2730 corrupt_frac_t_0p0_0p2=0.5561 acc_corrupt_t_0p2_0p4=0.8254 corrupt_frac_t_0p2_0p4=0.3568 acc_corrupt_t_0p4_0p6=0.9553 corrupt_frac_t_0p4_0p6=0.0770 acc_corrupt_t_0p6_0p8=0.9705 corrupt_frac_t_0p6_0p8=0.0138 out_w_norm=11.8904 out_g_norm=1.7736 acc_corrupt_t_0p8_1p0=0.9485 corrupt_frac_t_0p8_1p0=0.0078 loss_all=1.6616 init_gold_top10=0.3314 init_gold_top100=0.5167 rollout_applied_pos_frac=0.2891 init_acc_rollout_applied=0.1414 init_acc_rollout_kept=0.1193 logit_acc_rollout_applied=0.6786 logit_acc_rollout_kept=0.5277
406
+ step=1500 epoch=1500/2000 epoch_step=1/1 micro_steps=1500 elapsed=13.6s lr=2.000000e-03 loss=1.6078 loss_recon=1.6078 loss_meanflow=0.0000 mean_model_t=0.2075 mean_corrupt_t=0.2075 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.3543 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.5728 corrupt_frac=1.0000 acc_corrupt=0.5728 loss_corrupt=1.6078 wrong_frac=0.7924 init_acc_corrupt=0.1231 acc_corrupt_t_0p0_0p2=0.3100 corrupt_frac_t_0p0_0p2=0.5574 acc_corrupt_t_0p2_0p4=0.8863 corrupt_frac_t_0p2_0p4=0.3599 acc_corrupt_t_0p4_0p6=0.9791 corrupt_frac_t_0p4_0p6=0.0752 acc_corrupt_t_0p6_0p8=0.9870 corrupt_frac_t_0p6_0p8=0.0118 out_w_norm=12.0140 out_g_norm=1.7297 acc_corrupt_t_0p8_1p0=0.9795 corrupt_frac_t_0p8_1p0=0.0078 loss_all=1.6049 init_gold_top10=0.3586 init_gold_top100=0.5673 rollout_applied_pos_frac=0.3594 init_acc_rollout_applied=0.1354 init_acc_rollout_kept=0.1267 logit_acc_rollout_applied=0.5946 logit_acc_rollout_kept=0.5775
407
+ step=1600 epoch=1600/2000 epoch_step=1/1 micro_steps=1600 elapsed=13.6s lr=2.000000e-03 loss=1.4398 loss_recon=1.4398 loss_meanflow=0.0000 mean_model_t=0.2080 mean_corrupt_t=0.2080 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.3501 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.6148 corrupt_frac=1.0000 acc_corrupt=0.6148 loss_corrupt=1.4398 wrong_frac=0.7919 init_acc_corrupt=0.1239 acc_corrupt_t_0p0_0p2=0.3616 corrupt_frac_t_0p0_0p2=0.5587 acc_corrupt_t_0p2_0p4=0.9233 corrupt_frac_t_0p2_0p4=0.3601 acc_corrupt_t_0p4_0p6=0.9885 corrupt_frac_t_0p4_0p6=0.0734 acc_corrupt_t_0p6_0p8=0.9892 corrupt_frac_t_0p6_0p8=0.0131 out_w_norm=12.1155 out_g_norm=1.7083 acc_corrupt_t_0p8_1p0=0.9883 corrupt_frac_t_0p8_1p0=0.0078 loss_all=1.2419 init_gold_top10=0.3671 init_gold_top100=0.5536 rollout_applied_pos_frac=0.3438 init_acc_rollout_applied=0.1364 init_acc_rollout_kept=0.1135 logit_acc_rollout_applied=0.6601 logit_acc_rollout_kept=0.6458
408
+ step=1700 epoch=1700/2000 epoch_step=1/1 micro_steps=1700 elapsed=13.5s lr=2.000000e-03 loss=1.3160 loss_recon=1.3160 loss_meanflow=0.0000 mean_model_t=0.2082 mean_corrupt_t=0.2082 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.3470 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.6473 corrupt_frac=1.0000 acc_corrupt=0.6473 loss_corrupt=1.3160 wrong_frac=0.7918 init_acc_corrupt=0.1264 acc_corrupt_t_0p0_0p2=0.4046 corrupt_frac_t_0p0_0p2=0.5605 acc_corrupt_t_0p2_0p4=0.9485 corrupt_frac_t_0p2_0p4=0.3566 acc_corrupt_t_0p4_0p6=0.9930 corrupt_frac_t_0p4_0p6=0.0748 acc_corrupt_t_0p6_0p8=0.9935 corrupt_frac_t_0p6_0p8=0.0125 out_w_norm=12.1736 out_g_norm=1.7850 acc_corrupt_t_0p8_1p0=0.9946 corrupt_frac_t_0p8_1p0=0.0078 loss_all=1.3934 init_gold_top10=0.3819 init_gold_top100=0.5598 rollout_applied_pos_frac=0.3516 init_acc_rollout_applied=0.1794 init_acc_rollout_kept=0.1155 logit_acc_rollout_applied=0.7141 logit_acc_rollout_kept=0.6128
409
+ step=1800 epoch=1800/2000 epoch_step=1/1 micro_steps=1800 elapsed=13.5s lr=2.000000e-03 loss=1.1661 loss_recon=1.1661 loss_meanflow=0.0000 mean_model_t=0.2085 mean_corrupt_t=0.2085 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.3430 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.6826 corrupt_frac=1.0000 acc_corrupt=0.6826 loss_corrupt=1.1661 wrong_frac=0.7914 init_acc_corrupt=0.1274 acc_corrupt_t_0p0_0p2=0.4589 corrupt_frac_t_0p0_0p2=0.5616 acc_corrupt_t_0p2_0p4=0.9632 corrupt_frac_t_0p2_0p4=0.3564 acc_corrupt_t_0p4_0p6=0.9957 corrupt_frac_t_0p4_0p6=0.0741 acc_corrupt_t_0p6_0p8=0.9944 corrupt_frac_t_0p6_0p8=0.0125 out_w_norm=12.2279 out_g_norm=1.7180 loss_all=1.1713 init_gold_top10=0.3758 init_gold_top100=0.5692 rollout_applied_pos_frac=0.3594 init_acc_rollout_applied=0.1465 init_acc_rollout_kept=0.1331 logit_acc_rollout_applied=0.7110 logit_acc_rollout_kept=0.6753
410
+ step=1900 epoch=1900/2000 epoch_step=1/1 micro_steps=1900 elapsed=13.6s lr=2.000000e-03 loss=1.0032 loss_recon=1.0032 loss_meanflow=0.0000 mean_model_t=0.2097 mean_corrupt_t=0.2097 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.3524 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.7219 corrupt_frac=1.0000 acc_corrupt=0.7219 loss_corrupt=1.0032 wrong_frac=0.7903 init_acc_corrupt=0.1292 acc_corrupt_t_0p0_0p2=0.5132 corrupt_frac_t_0p0_0p2=0.5486 acc_corrupt_t_0p2_0p4=0.9708 corrupt_frac_t_0p2_0p4=0.3675 acc_corrupt_t_0p4_0p6=0.9962 corrupt_frac_t_0p4_0p6=0.0754 out_w_norm=12.2700 out_g_norm=1.8058 acc_corrupt_t_0p6_0p8=0.9950 corrupt_frac_t_0p6_0p8=0.0131 acc_corrupt_t_0p8_1p0=0.9883 corrupt_frac_t_0p8_1p0=0.0078 loss_all=0.8831 init_gold_top10=0.3894 init_gold_top100=0.5688 rollout_applied_pos_frac=0.3594 init_acc_rollout_applied=0.1433 init_acc_rollout_kept=0.1251 logit_acc_rollout_applied=0.7317 logit_acc_rollout_kept=0.7577
411
+ step=2000 epoch=2000/2000 epoch_step=1/1 micro_steps=2000 elapsed=13.6s lr=2.000000e-03 loss=0.7741 loss_recon=0.7741 loss_meanflow=0.0000 mean_model_t=0.2090 mean_corrupt_t=0.2090 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.3507 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.7780 corrupt_frac=1.0000 acc_corrupt=0.7780 loss_corrupt=0.7741 wrong_frac=0.7909 init_acc_corrupt=0.1300 acc_corrupt_t_0p0_0p2=0.6133 corrupt_frac_t_0p0_0p2=0.5551 acc_corrupt_t_0p2_0p4=0.9805 corrupt_frac_t_0p2_0p4=0.3608 acc_corrupt_t_0p4_0p6=0.9966 corrupt_frac_t_0p4_0p6=0.0745 acc_corrupt_t_0p6_0p8=0.9938 corrupt_frac_t_0p6_0p8=0.0131 out_w_norm=12.3161 out_g_norm=1.6323 acc_corrupt_t_0p8_1p0=0.9678 corrupt_frac_t_0p8_1p0=0.0078 loss_all=0.6070 init_gold_top10=0.3857 init_gold_top100=0.5204 rollout_applied_pos_frac=0.3047 init_acc_rollout_applied=0.1180 init_acc_rollout_kept=0.1007 logit_acc_rollout_applied=0.8249 logit_acc_rollout_kept=0.8256
412
+ NCCL version 2.25.1+cuda12.8
413
+ resumed_from=runs/train8_ctx1024_tradeoff_p35_unif0_0p25_outwdm1_ctx1024_tradeoff_dual_20260517_225705/latest.pt start_step=2001
414
+ {
415
+ "device": "cuda:0",
416
+ "rank": 0,
417
+ "world_size": 4,
418
+ "samples": "owt_cached_chunks:8",
419
+ "vocab_size": 2664,
420
+ "tokenizer_vocab_size": 50257,
421
+ "save_dir": "runs/train8_ctx1024_tradeoff_p35_unif0_0p25_outwdm1_ctx1024_tradeoff_dual_20260517_225705",
422
+ "batch_size": 128,
423
+ "grad_accum": 1,
424
+ "effective_batch_size": 512,
425
+ "global_batch_size": 512,
426
+ "lr_schedule": "constant_warmup",
427
+ "optimizer": "muon",
428
+ "epochs": 0.0,
429
+ "steps_per_epoch": 1,
430
+ "total_steps": 3000,
431
+ "warmup_steps": 10,
432
+ "warmup_epochs": -1.0,
433
+ "min_lr": 0.0,
434
+ "weight_decay": 0.1,
435
+ "output_weight_decay": -1.0,
436
+ "adamw_param_groups": "nanogpt",
437
+ "adam_beta1": 0.9,
438
+ "adam_beta2": 0.95,
439
+ "adam_eps": 1e-08,
440
+ "muon_impl": "legacy",
441
+ "muon_momentum": 0.95,
442
+ "muon_ns_steps": 5,
443
+ "muon_update_scale": 1.0,
444
+ "muon_nesterov": false,
445
+ "muon_width_scale": false,
446
+ "muon_grouping": "legacy_dim_ge_2",
447
+ "muon_param_count": 2616320,
448
+ "muon_adam_param_count": 8192,
449
+ "muon_param_names": [
450
+ "vocab_embed.embedding",
451
+ "sigma_map.net.0.weight",
452
+ "sigma_map.net.2.weight",
453
+ "blocks.0.attn_qkv.weight",
454
+ "blocks.0.attn_out.weight",
455
+ "blocks.0.mlp.0.weight",
456
+ "blocks.0.mlp.2.weight",
457
+ "blocks.0.adaLN_modulation.weight",
458
+ "blocks.1.attn_qkv.weight",
459
+ "blocks.1.attn_out.weight",
460
+ "blocks.1.mlp.0.weight",
461
+ "blocks.1.mlp.2.weight",
462
+ "blocks.1.adaLN_modulation.weight",
463
+ "blocks.2.attn_qkv.weight",
464
+ "blocks.2.attn_out.weight",
465
+ "blocks.2.mlp.0.weight",
466
+ "blocks.2.mlp.2.weight",
467
+ "blocks.2.adaLN_modulation.weight",
468
+ "output_layer.linear.weight",
469
+ "output_layer.adaLN_modulation.weight"
470
+ ],
471
+ "muon_adam_param_names": [
472
+ "sigma_map.net.0.bias",
473
+ "sigma_map.net.2.bias",
474
+ "blocks.0.norm1.weight",
475
+ "blocks.0.norm2.weight",
476
+ "blocks.0.mlp.0.bias",
477
+ "blocks.0.mlp.2.bias",
478
+ "blocks.0.adaLN_modulation.bias",
479
+ "blocks.1.norm1.weight",
480
+ "blocks.1.norm2.weight",
481
+ "blocks.1.mlp.0.bias",
482
+ "blocks.1.mlp.2.bias",
483
+ "blocks.1.adaLN_modulation.bias",
484
+ "blocks.2.norm1.weight",
485
+ "blocks.2.norm2.weight",
486
+ "blocks.2.mlp.0.bias",
487
+ "blocks.2.mlp.2.bias",
488
+ "blocks.2.adaLN_modulation.bias",
489
+ "output_layer.norm_final.weight",
490
+ "output_layer.adaLN_modulation.bias"
491
+ ],
492
+ "muon_effective_nesterov": false,
493
+ "muon_effective_width_scale": false,
494
+ "muon_effective_weight_decay": 0.1,
495
+ "muon_adam_fallback_nesterov": false,
496
+ "muon_adam_fallback_weight_decay": 0.1,
497
+ "ema_decay": 0.9999,
498
+ "ema_start_step": 0,
499
+ "model_type": "ddit",
500
+ "ddit_mlp_type": "gelu",
501
+ "elf_num_time_tokens": 4,
502
+ "elf_num_model_mode_tokens": 0,
503
+ "qk_norm": true,
504
+ "output_bias": false,
505
+ "output_init_std": -1.0,
506
+ "norm_type": "rmsnorm",
507
+ "target_loss": "hard_ce",
508
+ "linear_soft_target_power": 1.0,
509
+ "linear_soft_target_min_conf": 0.0,
510
+ "linear_soft_target_max_conf": 1.0,
511
+ "t_sampling_mode": "logit_normal",
512
+ "t_sampling_power": 1.0,
513
+ "t_sampling_eps": 0.0001,
514
+ "t_sampling_logit_mean": -1.5,
515
+ "t_sampling_logit_std": 0.8,
516
+ "dual_t": true,
517
+ "corrupt_t_mode": "same",
518
+ "corrupt_min_t": 0.0,
519
+ "corrupt_max_t": 1.0,
520
+ "prefix_block_prob": 0.0,
521
+ "prefix_block_len": 128,
522
+ "mask_ratio_floor_schedule": "none",
523
+ "dirichlet_endpoint_mode": "categorical_dual_t",
524
+ "dirichlet_semantic_t_mode": "same",
525
+ "dirichlet_semantic_t_value": 0.0,
526
+ "dirichlet_semantic_t_curve": "linear",
527
+ "dirichlet_semantic_t_power": 1.0,
528
+ "endpoint_sequence_random_prob_alpha": 0.0,
529
+ "categorical_wrong_from_full_vocab": true,
530
+ "categorical_wrong_from_batch_valid_tokens": false,
531
+ "categorical_wrong_basin_token_ids": "",
532
+ "categorical_wrong_basin_prob": 0.0,
533
+ "categorical_wrong_unigram_prob": 0.0,
534
+ "categorical_wrong_uniform_prob": 0.0,
535
+ "categorical_wrong_prob_floor": 0.0,
536
+ "categorical_wrong_corpus_unigram_path": "",
537
+ "categorical_wrong_corpus_unigram_alpha": 1.0,
538
+ "categorical_wrong_basin_shared_prob": 0.0,
539
+ "categorical_wrong_unigram_shared_prob": 0.0,
540
+ "mask_mixture_original_prob": 0.0,
541
+ "mask_mixture_lowk_prob": 0.0,
542
+ "mask_mixture_lowcorrupt_prob": 0.0,
543
+ "mask_mixture_block_prob": 0.0,
544
+ "mask_mixture_all_prob": 1.0,
545
+ "mask_mixture_lowk_clean_tokens": "0",
546
+ "mask_mixture_lowcorrupt_tokens": "1,2,4,8,16,32,64",
547
+ "mask_mixture_block_tokens": "64,128",
548
+ "simplex_bridge_sampler": "dirichlet",
549
+ "logistic_normal_sigma_min": 0.1,
550
+ "logistic_normal_sigma_max": 1.0,
551
+ "logistic_normal_tau_min": 1.0,
552
+ "logistic_normal_tau_max": 1.0,
553
+ "torch_compile": false,
554
+ "compile_mode": "max-autotune",
555
+ "state_format": "prob",
556
+ "meanflow_weight": 0.0,
557
+ "rollout_train_prob": 0.35,
558
+ "rollout_train_steps": 1,
559
+ "rollout_train_infer_steps": 1,
560
+ "rollout_train_time_mode": "sampled_s",
561
+ "rollout_train_s_dist": "uniform",
562
+ "rollout_train_s_min_frac": 0.0,
563
+ "rollout_train_s_max_frac": 0.25,
564
+ "rollout_train_s_beta_alpha": 2.0,
565
+ "rollout_train_s_beta_beta": 6.0,
566
+ "rollout_train_temp": 1.45,
567
+ "rollout_train_max_gamma": 1.0,
568
+ "rollout_train_corrupt_only": true,
569
+ "rollout_train_samplewise": true,
570
+ "rollout_train_compute_always": false,
571
+ "rollout_train_sync_t": true,
572
+ "bridge_noise_init": "logistic_normal",
573
+ "noise_sigma": -1.0,
574
+ "allow_tf32": true,
575
+ "activation_checkpointing": false,
576
+ "activation_checkpoint_interval": 1,
577
+ "activation_checkpoint_scope": "block",
578
+ "ddp_static_graph": false,
579
+ "ddp_gradient_as_bucket_view": true,
580
+ "blocking_data_transfer": false,
581
+ "dataloader_prefetch_factor": 4,
582
+ "full_train_stats": false,
583
+ "tokenized_hf": false,
584
+ "tokenized_pad_token": "pad",
585
+ "elf_conditional_hf": false,
586
+ "record_pad_truncate": false,
587
+ "record_add_eos": false,
588
+ "record_add_special_tokens": false,
589
+ "record_pad_token": "pad",
590
+ "record_shuffle_buffer": 10000,
591
+ "wrap": true,
592
+ "wrap_mode": "stream",
593
+ "wrap_record_buffer_size": 200,
594
+ "owt_cached_chunks": true,
595
+ "owt_chunk_cache_dir": "/e2e-data/evad-tech-vla/wanghan58/data/small_benchmarks/langflow_2604_11748/openwebtext_lta_cached_chunks/gpt2_len1024_train8_compact_overfit",
596
+ "owt_chunk_cache_rebuild": false,
597
+ "owt_chunk_cache_write_batch": 4096,
598
+ "owt_exact_repeat_per_chunk": 64,
599
+ "online_chunk_shuffle": false,
600
+ "online_chunk_shuffle_buffer": 10000,
601
+ "openwebtext_split": "train_minus_100k",
602
+ "detokenizer": "auto",
603
+ "resolved_detokenizer": null,
604
+ "num_workers": 0,
605
+ "latest_every": 1000,
606
+ "resume_path": "runs/train8_ctx1024_tradeoff_p35_unif0_0p25_outwdm1_ctx1024_tradeoff_dual_20260517_225705/latest.pt"
607
+ }
608
+ step=2100 epoch=2100/3000 epoch_step=1/1 micro_steps=2100 elapsed=14.7s lr=2.000000e-03 loss=0.6071 loss_recon=0.6071 loss_meanflow=0.0000 mean_model_t=0.2070 mean_corrupt_t=0.2070 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.3566 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.8207 corrupt_frac=1.0000 acc_corrupt=0.8207 loss_corrupt=0.6071 wrong_frac=0.7930 init_acc_corrupt=0.1295 acc_corrupt_t_0p0_0p2=0.6896 corrupt_frac_t_0p0_0p2=0.5646 acc_corrupt_t_0p2_0p4=0.9893 corrupt_frac_t_0p2_0p4=0.3537 acc_corrupt_t_0p4_0p6=0.9978 corrupt_frac_t_0p4_0p6=0.0731 acc_corrupt_t_0p6_0p8=0.9961 corrupt_frac_t_0p6_0p8=0.0131 out_w_norm=12.3759 out_g_norm=1.3563 acc_corrupt_t_0p8_1p0=0.9873 corrupt_frac_t_0p8_1p0=0.0078 loss_all=0.5944 init_gold_top10=0.4541 init_gold_top100=0.5788 rollout_applied_pos_frac=0.3750 init_acc_rollout_applied=0.1540 init_acc_rollout_kept=0.1284 logit_acc_rollout_applied=0.8264 logit_acc_rollout_kept=0.8117
609
+ step=2200 epoch=2200/3000 epoch_step=1/1 micro_steps=2200 elapsed=13.8s lr=2.000000e-03 loss=0.5238 loss_recon=0.5238 loss_meanflow=0.0000 mean_model_t=0.2093 mean_corrupt_t=0.2093 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.3467 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.8450 corrupt_frac=1.0000 acc_corrupt=0.8450 loss_corrupt=0.5238 wrong_frac=0.7905 init_acc_corrupt=0.1334 acc_corrupt_t_0p0_0p2=0.7252 corrupt_frac_t_0p0_0p2=0.5550 acc_corrupt_t_0p2_0p4=0.9936 corrupt_frac_t_0p2_0p4=0.3600 acc_corrupt_t_0p4_0p6=0.9980 corrupt_frac_t_0p4_0p6=0.0760 acc_corrupt_t_0p6_0p8=0.9972 corrupt_frac_t_0p6_0p8=0.0131 out_w_norm=12.3868 out_g_norm=1.3063 acc_corrupt_t_0p8_1p0=0.9883 corrupt_frac_t_0p8_1p0=0.0078 loss_all=0.3427 init_gold_top10=0.4658 init_gold_top100=0.5809 rollout_applied_pos_frac=0.3828 init_acc_rollout_applied=0.2087 init_acc_rollout_kept=0.1165 logit_acc_rollout_applied=0.8794 logit_acc_rollout_kept=0.9044
610
+ step=2300 epoch=2300/3000 epoch_step=1/1 micro_steps=2300 elapsed=13.8s lr=2.000000e-03 loss=0.4988 loss_recon=0.4988 loss_meanflow=0.0000 mean_model_t=0.2077 mean_corrupt_t=0.2077 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.3452 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.8515 corrupt_frac=1.0000 acc_corrupt=0.8515 loss_corrupt=0.4988 wrong_frac=0.7923 init_acc_corrupt=0.1331 acc_corrupt_t_0p0_0p2=0.7407 corrupt_frac_t_0p0_0p2=0.5677 acc_corrupt_t_0p2_0p4=0.9965 corrupt_frac_t_0p2_0p4=0.3483 acc_corrupt_t_0p4_0p6=0.9987 corrupt_frac_t_0p4_0p6=0.0743 acc_corrupt_t_0p6_0p8=0.9984 corrupt_frac_t_0p6_0p8=0.0139 out_w_norm=12.3557 out_g_norm=1.2158 acc_corrupt_t_0p8_1p0=0.9985 corrupt_frac_t_0p8_1p0=0.0078 loss_all=0.4199 init_gold_top10=0.3863 init_gold_top100=0.5180 rollout_applied_pos_frac=0.2891 init_acc_rollout_applied=0.1357 init_acc_rollout_kept=0.1125 logit_acc_rollout_applied=0.8559 logit_acc_rollout_kept=0.8958
611
+ step=2400 epoch=2400/3000 epoch_step=1/1 micro_steps=2400 elapsed=13.9s lr=2.000000e-03 loss=0.4452 loss_recon=0.4452 loss_meanflow=0.0000 mean_model_t=0.2094 mean_corrupt_t=0.2094 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.3509 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.8646 corrupt_frac=1.0000 acc_corrupt=0.8646 loss_corrupt=0.4452 wrong_frac=0.7904 init_acc_corrupt=0.1363 acc_corrupt_t_0p0_0p2=0.7581 corrupt_frac_t_0p0_0p2=0.5561 acc_corrupt_t_0p2_0p4=0.9976 corrupt_frac_t_0p2_0p4=0.3568 acc_corrupt_t_0p4_0p6=0.9989 corrupt_frac_t_0p4_0p6=0.0770 acc_corrupt_t_0p6_0p8=0.9987 corrupt_frac_t_0p6_0p8=0.0138 out_w_norm=12.3157 out_g_norm=1.0273 acc_corrupt_t_0p8_1p0=0.9968 corrupt_frac_t_0p8_1p0=0.0078 loss_all=0.5021 init_gold_top10=0.3939 init_gold_top100=0.5171 rollout_applied_pos_frac=0.2891 init_acc_rollout_applied=0.1624 init_acc_rollout_kept=0.1193 logit_acc_rollout_applied=0.9255 logit_acc_rollout_kept=0.8112
612
+ step=2500 epoch=2500/3000 epoch_step=1/1 micro_steps=2500 elapsed=13.9s lr=2.000000e-03 loss=0.4050 loss_recon=0.4050 loss_meanflow=0.0000 mean_model_t=0.2075 mean_corrupt_t=0.2075 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.3543 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.8737 corrupt_frac=1.0000 acc_corrupt=0.8737 loss_corrupt=0.4050 wrong_frac=0.7924 init_acc_corrupt=0.1349 acc_corrupt_t_0p0_0p2=0.7741 corrupt_frac_t_0p0_0p2=0.5574 acc_corrupt_t_0p2_0p4=0.9991 corrupt_frac_t_0p2_0p4=0.3599 acc_corrupt_t_0p4_0p6=0.9996 corrupt_frac_t_0p4_0p6=0.0752 acc_corrupt_t_0p6_0p8=0.9992 corrupt_frac_t_0p6_0p8=0.0118 out_w_norm=12.2846 out_g_norm=0.8842 acc_corrupt_t_0p8_1p0=0.9990 corrupt_frac_t_0p8_1p0=0.0078 loss_all=0.5535 init_gold_top10=0.4440 init_gold_top100=0.5682 rollout_applied_pos_frac=0.3594 init_acc_rollout_applied=0.1681 init_acc_rollout_kept=0.1267 logit_acc_rollout_applied=0.8005 logit_acc_rollout_kept=0.8542
613
+ step=2600 epoch=2600/3000 epoch_step=1/1 micro_steps=2600 elapsed=13.8s lr=2.000000e-03 loss=0.3667 loss_recon=0.3667 loss_meanflow=0.0000 mean_model_t=0.2080 mean_corrupt_t=0.2080 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.3501 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.8866 corrupt_frac=1.0000 acc_corrupt=0.8866 loss_corrupt=0.3667 wrong_frac=0.7919 init_acc_corrupt=0.1346 acc_corrupt_t_0p0_0p2=0.7975 corrupt_frac_t_0p0_0p2=0.5587 acc_corrupt_t_0p2_0p4=0.9993 corrupt_frac_t_0p2_0p4=0.3601 acc_corrupt_t_0p4_0p6=0.9996 corrupt_frac_t_0p4_0p6=0.0734 acc_corrupt_t_0p6_0p8=0.9992 corrupt_frac_t_0p6_0p8=0.0131 out_w_norm=12.2575 out_g_norm=0.8410 acc_corrupt_t_0p8_1p0=1.0000 corrupt_frac_t_0p8_1p0=0.0078 loss_all=0.2945 init_gold_top10=0.4462 init_gold_top100=0.5540 rollout_applied_pos_frac=0.3438 init_acc_rollout_applied=0.1552 init_acc_rollout_kept=0.1135 logit_acc_rollout_applied=0.8925 logit_acc_rollout_kept=0.9056
614
+ step=2700 epoch=2700/3000 epoch_step=1/1 micro_steps=2700 elapsed=13.8s lr=2.000000e-03 loss=0.3555 loss_recon=0.3555 loss_meanflow=0.0000 mean_model_t=0.2082 mean_corrupt_t=0.2082 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.3470 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.8894 corrupt_frac=1.0000 acc_corrupt=0.8894 loss_corrupt=0.3555 wrong_frac=0.7918 init_acc_corrupt=0.1361 acc_corrupt_t_0p0_0p2=0.8030 corrupt_frac_t_0p0_0p2=0.5605 acc_corrupt_t_0p2_0p4=0.9995 corrupt_frac_t_0p2_0p4=0.3566 acc_corrupt_t_0p4_0p6=0.9997 corrupt_frac_t_0p4_0p6=0.0748 acc_corrupt_t_0p6_0p8=0.9994 corrupt_frac_t_0p6_0p8=0.0125 out_w_norm=12.2373 out_g_norm=0.7415 acc_corrupt_t_0p8_1p0=1.0000 corrupt_frac_t_0p8_1p0=0.0078 loss_all=0.4078 init_gold_top10=0.4547 init_gold_top100=0.5600 rollout_applied_pos_frac=0.3516 init_acc_rollout_applied=0.2303 init_acc_rollout_kept=0.1155 logit_acc_rollout_applied=0.8906 logit_acc_rollout_kept=0.8987
615
+ step=2800 epoch=2800/3000 epoch_step=1/1 micro_steps=2800 elapsed=13.8s lr=2.000000e-03 loss=0.3279 loss_recon=0.3279 loss_meanflow=0.0000 mean_model_t=0.2085 mean_corrupt_t=0.2085 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.3430 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.8969 corrupt_frac=1.0000 acc_corrupt=0.8969 loss_corrupt=0.3279 wrong_frac=0.7914 init_acc_corrupt=0.1362 acc_corrupt_t_0p0_0p2=0.8168 corrupt_frac_t_0p0_0p2=0.5616 acc_corrupt_t_0p2_0p4=0.9994 corrupt_frac_t_0p2_0p4=0.3564 acc_corrupt_t_0p4_0p6=0.9997 corrupt_frac_t_0p4_0p6=0.0741 acc_corrupt_t_0p6_0p8=0.9994 corrupt_frac_t_0p6_0p8=0.0125 out_w_norm=12.2266 out_g_norm=0.7464 loss_all=0.3466 init_gold_top10=0.4644 init_gold_top100=0.5694 rollout_applied_pos_frac=0.3594 init_acc_rollout_applied=0.1603 init_acc_rollout_kept=0.1331 logit_acc_rollout_applied=0.8982 logit_acc_rollout_kept=0.8894
616
+ step=2900 epoch=2900/3000 epoch_step=1/1 micro_steps=2900 elapsed=13.8s lr=2.000000e-03 loss=0.3187 loss_recon=0.3187 loss_meanflow=0.0000 mean_model_t=0.2097 mean_corrupt_t=0.2097 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.3524 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.9008 corrupt_frac=1.0000 acc_corrupt=0.9008 loss_corrupt=0.3187 wrong_frac=0.7903 init_acc_corrupt=0.1369 acc_corrupt_t_0p0_0p2=0.8194 corrupt_frac_t_0p0_0p2=0.5486 acc_corrupt_t_0p2_0p4=0.9996 corrupt_frac_t_0p2_0p4=0.3675 acc_corrupt_t_0p4_0p6=0.9998 corrupt_frac_t_0p4_0p6=0.0754 out_w_norm=12.2105 out_g_norm=0.6139 acc_corrupt_t_0p6_0p8=0.9996 corrupt_frac_t_0p6_0p8=0.0131 acc_corrupt_t_0p8_1p0=0.9997 corrupt_frac_t_0p8_1p0=0.0078 loss_all=0.3607 init_gold_top10=0.4493 init_gold_top100=0.5688 rollout_applied_pos_frac=0.3594 init_acc_rollout_applied=0.1612 init_acc_rollout_kept=0.1251 logit_acc_rollout_applied=0.8823 logit_acc_rollout_kept=0.8900
617
+ step=3000 epoch=3000/3000 epoch_step=1/1 micro_steps=3000 elapsed=13.7s lr=2.000000e-03 loss=0.3103 loss_recon=0.3103 loss_meanflow=0.0000 mean_model_t=0.2090 mean_corrupt_t=0.2090 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.3507 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.9017 corrupt_frac=1.0000 acc_corrupt=0.9017 loss_corrupt=0.3103 wrong_frac=0.7909 init_acc_corrupt=0.1370 acc_corrupt_t_0p0_0p2=0.8231 corrupt_frac_t_0p0_0p2=0.5551 acc_corrupt_t_0p2_0p4=0.9997 corrupt_frac_t_0p2_0p4=0.3608 acc_corrupt_t_0p4_0p6=0.9998 corrupt_frac_t_0p4_0p6=0.0745 acc_corrupt_t_0p6_0p8=0.9993 corrupt_frac_t_0p6_0p8=0.0131 out_w_norm=12.2031 out_g_norm=0.6367 acc_corrupt_t_0p8_1p0=1.0000 corrupt_frac_t_0p8_1p0=0.0078 loss_all=0.2591 init_gold_top10=0.4073 init_gold_top100=0.5204 rollout_applied_pos_frac=0.3047 init_acc_rollout_applied=0.1303 init_acc_rollout_kept=0.1007 logit_acc_rollout_applied=0.8910 logit_acc_rollout_kept=0.9292
618
+ NCCL version 2.25.1+cuda12.8
619
+ resumed_from=runs/train8_ctx1024_tradeoff_p35_unif0_0p25_outwdm1_ctx1024_tradeoff_dual_20260517_225705/latest.pt start_step=3001
620
+ {
621
+ "device": "cuda:0",
622
+ "rank": 0,
623
+ "world_size": 4,
624
+ "samples": "owt_cached_chunks:8",
625
+ "vocab_size": 2664,
626
+ "tokenizer_vocab_size": 50257,
627
+ "save_dir": "runs/train8_ctx1024_tradeoff_p35_unif0_0p25_outwdm1_ctx1024_tradeoff_dual_20260517_225705",
628
+ "batch_size": 128,
629
+ "grad_accum": 1,
630
+ "effective_batch_size": 512,
631
+ "global_batch_size": 512,
632
+ "lr_schedule": "constant_warmup",
633
+ "optimizer": "muon",
634
+ "epochs": 0.0,
635
+ "steps_per_epoch": 1,
636
+ "total_steps": 4000,
637
+ "warmup_steps": 10,
638
+ "warmup_epochs": -1.0,
639
+ "min_lr": 0.0,
640
+ "weight_decay": 0.1,
641
+ "output_weight_decay": -1.0,
642
+ "adamw_param_groups": "nanogpt",
643
+ "adam_beta1": 0.9,
644
+ "adam_beta2": 0.95,
645
+ "adam_eps": 1e-08,
646
+ "muon_impl": "legacy",
647
+ "muon_momentum": 0.95,
648
+ "muon_ns_steps": 5,
649
+ "muon_update_scale": 1.0,
650
+ "muon_nesterov": false,
651
+ "muon_width_scale": false,
652
+ "muon_grouping": "legacy_dim_ge_2",
653
+ "muon_param_count": 2616320,
654
+ "muon_adam_param_count": 8192,
655
+ "muon_param_names": [
656
+ "vocab_embed.embedding",
657
+ "sigma_map.net.0.weight",
658
+ "sigma_map.net.2.weight",
659
+ "blocks.0.attn_qkv.weight",
660
+ "blocks.0.attn_out.weight",
661
+ "blocks.0.mlp.0.weight",
662
+ "blocks.0.mlp.2.weight",
663
+ "blocks.0.adaLN_modulation.weight",
664
+ "blocks.1.attn_qkv.weight",
665
+ "blocks.1.attn_out.weight",
666
+ "blocks.1.mlp.0.weight",
667
+ "blocks.1.mlp.2.weight",
668
+ "blocks.1.adaLN_modulation.weight",
669
+ "blocks.2.attn_qkv.weight",
670
+ "blocks.2.attn_out.weight",
671
+ "blocks.2.mlp.0.weight",
672
+ "blocks.2.mlp.2.weight",
673
+ "blocks.2.adaLN_modulation.weight",
674
+ "output_layer.linear.weight",
675
+ "output_layer.adaLN_modulation.weight"
676
+ ],
677
+ "muon_adam_param_names": [
678
+ "sigma_map.net.0.bias",
679
+ "sigma_map.net.2.bias",
680
+ "blocks.0.norm1.weight",
681
+ "blocks.0.norm2.weight",
682
+ "blocks.0.mlp.0.bias",
683
+ "blocks.0.mlp.2.bias",
684
+ "blocks.0.adaLN_modulation.bias",
685
+ "blocks.1.norm1.weight",
686
+ "blocks.1.norm2.weight",
687
+ "blocks.1.mlp.0.bias",
688
+ "blocks.1.mlp.2.bias",
689
+ "blocks.1.adaLN_modulation.bias",
690
+ "blocks.2.norm1.weight",
691
+ "blocks.2.norm2.weight",
692
+ "blocks.2.mlp.0.bias",
693
+ "blocks.2.mlp.2.bias",
694
+ "blocks.2.adaLN_modulation.bias",
695
+ "output_layer.norm_final.weight",
696
+ "output_layer.adaLN_modulation.bias"
697
+ ],
698
+ "muon_effective_nesterov": false,
699
+ "muon_effective_width_scale": false,
700
+ "muon_effective_weight_decay": 0.1,
701
+ "muon_adam_fallback_nesterov": false,
702
+ "muon_adam_fallback_weight_decay": 0.1,
703
+ "ema_decay": 0.9999,
704
+ "ema_start_step": 0,
705
+ "model_type": "ddit",
706
+ "ddit_mlp_type": "gelu",
707
+ "elf_num_time_tokens": 4,
708
+ "elf_num_model_mode_tokens": 0,
709
+ "qk_norm": true,
710
+ "output_bias": false,
711
+ "output_init_std": -1.0,
712
+ "norm_type": "rmsnorm",
713
+ "target_loss": "hard_ce",
714
+ "linear_soft_target_power": 1.0,
715
+ "linear_soft_target_min_conf": 0.0,
716
+ "linear_soft_target_max_conf": 1.0,
717
+ "t_sampling_mode": "logit_normal",
718
+ "t_sampling_power": 1.0,
719
+ "t_sampling_eps": 0.0001,
720
+ "t_sampling_logit_mean": -1.5,
721
+ "t_sampling_logit_std": 0.8,
722
+ "dual_t": true,
723
+ "corrupt_t_mode": "same",
724
+ "corrupt_min_t": 0.0,
725
+ "corrupt_max_t": 1.0,
726
+ "prefix_block_prob": 0.0,
727
+ "prefix_block_len": 128,
728
+ "mask_ratio_floor_schedule": "none",
729
+ "dirichlet_endpoint_mode": "categorical_dual_t",
730
+ "dirichlet_semantic_t_mode": "same",
731
+ "dirichlet_semantic_t_value": 0.0,
732
+ "dirichlet_semantic_t_curve": "linear",
733
+ "dirichlet_semantic_t_power": 1.0,
734
+ "endpoint_sequence_random_prob_alpha": 0.0,
735
+ "categorical_wrong_from_full_vocab": true,
736
+ "categorical_wrong_from_batch_valid_tokens": false,
737
+ "categorical_wrong_basin_token_ids": "",
738
+ "categorical_wrong_basin_prob": 0.0,
739
+ "categorical_wrong_unigram_prob": 0.0,
740
+ "categorical_wrong_uniform_prob": 0.0,
741
+ "categorical_wrong_prob_floor": 0.0,
742
+ "categorical_wrong_corpus_unigram_path": "",
743
+ "categorical_wrong_corpus_unigram_alpha": 1.0,
744
+ "categorical_wrong_basin_shared_prob": 0.0,
745
+ "categorical_wrong_unigram_shared_prob": 0.0,
746
+ "mask_mixture_original_prob": 0.0,
747
+ "mask_mixture_lowk_prob": 0.0,
748
+ "mask_mixture_lowcorrupt_prob": 0.0,
749
+ "mask_mixture_block_prob": 0.0,
750
+ "mask_mixture_all_prob": 1.0,
751
+ "mask_mixture_lowk_clean_tokens": "0",
752
+ "mask_mixture_lowcorrupt_tokens": "1,2,4,8,16,32,64",
753
+ "mask_mixture_block_tokens": "64,128",
754
+ "simplex_bridge_sampler": "dirichlet",
755
+ "logistic_normal_sigma_min": 0.1,
756
+ "logistic_normal_sigma_max": 1.0,
757
+ "logistic_normal_tau_min": 1.0,
758
+ "logistic_normal_tau_max": 1.0,
759
+ "torch_compile": false,
760
+ "compile_mode": "max-autotune",
761
+ "state_format": "prob",
762
+ "meanflow_weight": 0.0,
763
+ "rollout_train_prob": 0.35,
764
+ "rollout_train_steps": 1,
765
+ "rollout_train_infer_steps": 1,
766
+ "rollout_train_time_mode": "sampled_s",
767
+ "rollout_train_s_dist": "uniform",
768
+ "rollout_train_s_min_frac": 0.0,
769
+ "rollout_train_s_max_frac": 0.25,
770
+ "rollout_train_s_beta_alpha": 2.0,
771
+ "rollout_train_s_beta_beta": 6.0,
772
+ "rollout_train_temp": 1.45,
773
+ "rollout_train_max_gamma": 1.0,
774
+ "rollout_train_corrupt_only": true,
775
+ "rollout_train_samplewise": true,
776
+ "rollout_train_compute_always": false,
777
+ "rollout_train_sync_t": true,
778
+ "bridge_noise_init": "logistic_normal",
779
+ "noise_sigma": -1.0,
780
+ "allow_tf32": true,
781
+ "activation_checkpointing": false,
782
+ "activation_checkpoint_interval": 1,
783
+ "activation_checkpoint_scope": "block",
784
+ "ddp_static_graph": false,
785
+ "ddp_gradient_as_bucket_view": true,
786
+ "blocking_data_transfer": false,
787
+ "dataloader_prefetch_factor": 4,
788
+ "full_train_stats": false,
789
+ "tokenized_hf": false,
790
+ "tokenized_pad_token": "pad",
791
+ "elf_conditional_hf": false,
792
+ "record_pad_truncate": false,
793
+ "record_add_eos": false,
794
+ "record_add_special_tokens": false,
795
+ "record_pad_token": "pad",
796
+ "record_shuffle_buffer": 10000,
797
+ "wrap": true,
798
+ "wrap_mode": "stream",
799
+ "wrap_record_buffer_size": 200,
800
+ "owt_cached_chunks": true,
801
+ "owt_chunk_cache_dir": "/e2e-data/evad-tech-vla/wanghan58/data/small_benchmarks/langflow_2604_11748/openwebtext_lta_cached_chunks/gpt2_len1024_train8_compact_overfit",
802
+ "owt_chunk_cache_rebuild": false,
803
+ "owt_chunk_cache_write_batch": 4096,
804
+ "owt_exact_repeat_per_chunk": 64,
805
+ "online_chunk_shuffle": false,
806
+ "online_chunk_shuffle_buffer": 10000,
807
+ "openwebtext_split": "train_minus_100k",
808
+ "detokenizer": "auto",
809
+ "resolved_detokenizer": null,
810
+ "num_workers": 0,
811
+ "latest_every": 1000,
812
+ "resume_path": "runs/train8_ctx1024_tradeoff_p35_unif0_0p25_outwdm1_ctx1024_tradeoff_dual_20260517_225705/latest.pt"
813
+ }
814
+ step=3100 epoch=3100/4000 epoch_step=1/1 micro_steps=3100 elapsed=14.5s lr=2.000000e-03 loss=0.2988 loss_recon=0.2988 loss_meanflow=0.0000 mean_model_t=0.2070 mean_corrupt_t=0.2070 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.3566 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.9057 corrupt_frac=1.0000 acc_corrupt=0.9057 loss_corrupt=0.2988 wrong_frac=0.7930 init_acc_corrupt=0.1349 acc_corrupt_t_0p0_0p2=0.8331 corrupt_frac_t_0p0_0p2=0.5646 acc_corrupt_t_0p2_0p4=0.9999 corrupt_frac_t_0p2_0p4=0.3537 acc_corrupt_t_0p4_0p6=0.9999 corrupt_frac_t_0p4_0p6=0.0731 acc_corrupt_t_0p6_0p8=0.9998 corrupt_frac_t_0p6_0p8=0.0131 out_w_norm=12.2055 out_g_norm=0.5154 acc_corrupt_t_0p8_1p0=1.0000 corrupt_frac_t_0p8_1p0=0.0078 loss_all=0.4181 init_gold_top10=0.4698 init_gold_top100=0.5788 rollout_applied_pos_frac=0.3750 init_acc_rollout_applied=0.1684 init_acc_rollout_kept=0.1284 logit_acc_rollout_applied=0.8597 logit_acc_rollout_kept=0.8469
815
+ step=3200 epoch=3200/4000 epoch_step=1/1 micro_steps=3200 elapsed=13.5s lr=2.000000e-03 loss=0.2808 loss_recon=0.2808 loss_meanflow=0.0000 mean_model_t=0.2093 mean_corrupt_t=0.2093 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.3467 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.9119 corrupt_frac=1.0000 acc_corrupt=0.9119 loss_corrupt=0.2808 wrong_frac=0.7905 init_acc_corrupt=0.1374 acc_corrupt_t_0p0_0p2=0.8414 corrupt_frac_t_0p0_0p2=0.5550 acc_corrupt_t_0p2_0p4=0.9998 corrupt_frac_t_0p2_0p4=0.3600 acc_corrupt_t_0p4_0p6=0.9999 corrupt_frac_t_0p4_0p6=0.0760 acc_corrupt_t_0p6_0p8=0.9995 corrupt_frac_t_0p6_0p8=0.0131 out_w_norm=12.2055 out_g_norm=0.6080 acc_corrupt_t_0p8_1p0=0.9980 corrupt_frac_t_0p8_1p0=0.0078 loss_all=0.1874 init_gold_top10=0.4693 init_gold_top100=0.5809 rollout_applied_pos_frac=0.3828 init_acc_rollout_applied=0.2221 init_acc_rollout_kept=0.1165 logit_acc_rollout_applied=0.8865 logit_acc_rollout_kept=0.9701
816
+ step=3300 epoch=3300/4000 epoch_step=1/1 micro_steps=3300 elapsed=13.5s lr=2.000000e-03 loss=0.2906 loss_recon=0.2906 loss_meanflow=0.0000 mean_model_t=0.2077 mean_corrupt_t=0.2077 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.3452 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.9074 corrupt_frac=1.0000 acc_corrupt=0.9074 loss_corrupt=0.2906 wrong_frac=0.7923 init_acc_corrupt=0.1364 acc_corrupt_t_0p0_0p2=0.8369 corrupt_frac_t_0p0_0p2=0.5677 acc_corrupt_t_0p2_0p4=0.9999 corrupt_frac_t_0p2_0p4=0.3483 acc_corrupt_t_0p4_0p6=0.9999 corrupt_frac_t_0p4_0p6=0.0743 acc_corrupt_t_0p6_0p8=0.9998 corrupt_frac_t_0p6_0p8=0.0139 out_w_norm=12.2204 out_g_norm=0.4741 acc_corrupt_t_0p8_1p0=1.0000 corrupt_frac_t_0p8_1p0=0.0078 loss_all=0.3067 init_gold_top10=0.4022 init_gold_top100=0.5180 rollout_applied_pos_frac=0.2891 init_acc_rollout_applied=0.1471 init_acc_rollout_kept=0.1125 logit_acc_rollout_applied=0.9373 logit_acc_rollout_kept=0.8873
817
+ step=3400 epoch=3400/4000 epoch_step=1/1 micro_steps=3400 elapsed=13.6s lr=2.000000e-03 loss=0.2800 loss_recon=0.2800 loss_meanflow=0.0000 mean_model_t=0.2094 mean_corrupt_t=0.2094 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.3509 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.9107 corrupt_frac=1.0000 acc_corrupt=0.9107 loss_corrupt=0.2800 wrong_frac=0.7904 init_acc_corrupt=0.1393 acc_corrupt_t_0p0_0p2=0.8396 corrupt_frac_t_0p0_0p2=0.5561 acc_corrupt_t_0p2_0p4=0.9999 corrupt_frac_t_0p2_0p4=0.3568 acc_corrupt_t_0p4_0p6=0.9999 corrupt_frac_t_0p4_0p6=0.0770 acc_corrupt_t_0p6_0p8=0.9998 corrupt_frac_t_0p6_0p8=0.0138 out_w_norm=12.2315 out_g_norm=0.4718 acc_corrupt_t_0p8_1p0=0.9988 corrupt_frac_t_0p8_1p0=0.0078 loss_all=0.4144 init_gold_top10=0.3943 init_gold_top100=0.5171 rollout_applied_pos_frac=0.2891 init_acc_rollout_applied=0.1679 init_acc_rollout_kept=0.1193 logit_acc_rollout_applied=0.9457 logit_acc_rollout_kept=0.8529
818
+ step=3500 epoch=3500/4000 epoch_step=1/1 micro_steps=3500 elapsed=13.6s lr=2.000000e-03 loss=0.2717 loss_recon=0.2717 loss_meanflow=0.0000 mean_model_t=0.2075 mean_corrupt_t=0.2075 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.3543 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.9142 corrupt_frac=1.0000 acc_corrupt=0.9142 loss_corrupt=0.2717 wrong_frac=0.7924 init_acc_corrupt=0.1371 acc_corrupt_t_0p0_0p2=0.8463 corrupt_frac_t_0p0_0p2=0.5574 acc_corrupt_t_0p2_0p4=0.9999 corrupt_frac_t_0p2_0p4=0.3599 acc_corrupt_t_0p4_0p6=0.9999 corrupt_frac_t_0p4_0p6=0.0752 acc_corrupt_t_0p6_0p8=0.9997 corrupt_frac_t_0p6_0p8=0.0118 out_w_norm=12.2636 out_g_norm=0.4406 acc_corrupt_t_0p8_1p0=0.9990 corrupt_frac_t_0p8_1p0=0.0078 loss_all=0.3554 init_gold_top10=0.4580 init_gold_top100=0.5682 rollout_applied_pos_frac=0.3594 init_acc_rollout_applied=0.1740 init_acc_rollout_kept=0.1267 logit_acc_rollout_applied=0.8787 logit_acc_rollout_kept=0.8976
819
+ step=3600 epoch=3600/4000 epoch_step=1/1 micro_steps=3600 elapsed=13.5s lr=2.000000e-03 loss=0.2541 loss_recon=0.2541 loss_meanflow=0.0000 mean_model_t=0.2080 mean_corrupt_t=0.2080 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.3501 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.9199 corrupt_frac=1.0000 acc_corrupt=0.9199 loss_corrupt=0.2541 wrong_frac=0.7919 init_acc_corrupt=0.1365 acc_corrupt_t_0p0_0p2=0.8567 corrupt_frac_t_0p0_0p2=0.5587 acc_corrupt_t_0p2_0p4=0.9998 corrupt_frac_t_0p2_0p4=0.3601 acc_corrupt_t_0p4_0p6=0.9999 corrupt_frac_t_0p4_0p6=0.0734 acc_corrupt_t_0p6_0p8=0.9997 corrupt_frac_t_0p6_0p8=0.0131 out_w_norm=12.2850 out_g_norm=0.3934 acc_corrupt_t_0p8_1p0=1.0000 corrupt_frac_t_0p8_1p0=0.0078 loss_all=0.2026 init_gold_top10=0.4512 init_gold_top100=0.5540 rollout_applied_pos_frac=0.3438 init_acc_rollout_applied=0.1588 init_acc_rollout_kept=0.1135 logit_acc_rollout_applied=0.9273 logit_acc_rollout_kept=0.9316
820
+ step=3700 epoch=3700/4000 epoch_step=1/1 micro_steps=3700 elapsed=13.5s lr=2.000000e-03 loss=0.2640 loss_recon=0.2640 loss_meanflow=0.0000 mean_model_t=0.2082 mean_corrupt_t=0.2082 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.3470 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.9161 corrupt_frac=1.0000 acc_corrupt=0.9161 loss_corrupt=0.2640 wrong_frac=0.7918 init_acc_corrupt=0.1378 acc_corrupt_t_0p0_0p2=0.8504 corrupt_frac_t_0p0_0p2=0.5605 acc_corrupt_t_0p2_0p4=0.9999 corrupt_frac_t_0p2_0p4=0.3566 acc_corrupt_t_0p4_0p6=0.9999 corrupt_frac_t_0p4_0p6=0.0748 acc_corrupt_t_0p6_0p8=0.9998 corrupt_frac_t_0p6_0p8=0.0125 out_w_norm=12.3074 out_g_norm=0.3603 acc_corrupt_t_0p8_1p0=1.0000 corrupt_frac_t_0p8_1p0=0.0078 loss_all=0.2453 init_gold_top10=0.4635 init_gold_top100=0.5600 rollout_applied_pos_frac=0.3516 init_acc_rollout_applied=0.2373 init_acc_rollout_kept=0.1155 logit_acc_rollout_applied=0.9115 logit_acc_rollout_kept=0.9218
821
+ step=3800 epoch=3800/4000 epoch_step=1/1 micro_steps=3800 elapsed=13.5s lr=2.000000e-03 loss=0.2382 loss_recon=0.2382 loss_meanflow=0.0000 mean_model_t=0.2085 mean_corrupt_t=0.2085 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.3430 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.9229 corrupt_frac=1.0000 acc_corrupt=0.9229 loss_corrupt=0.2382 wrong_frac=0.7914 init_acc_corrupt=0.1377 acc_corrupt_t_0p0_0p2=0.8628 corrupt_frac_t_0p0_0p2=0.5616 acc_corrupt_t_0p2_0p4=0.9999 corrupt_frac_t_0p2_0p4=0.3564 acc_corrupt_t_0p4_0p6=0.9999 corrupt_frac_t_0p4_0p6=0.0741 acc_corrupt_t_0p6_0p8=0.9998 corrupt_frac_t_0p6_0p8=0.0125 out_w_norm=12.3346 out_g_norm=0.3186 loss_all=0.2397 init_gold_top10=0.4725 init_gold_top100=0.5694 rollout_applied_pos_frac=0.3594 init_acc_rollout_applied=0.1680 init_acc_rollout_kept=0.1331 logit_acc_rollout_applied=0.9598 logit_acc_rollout_kept=0.9054
822
+ step=3900 epoch=3900/4000 epoch_step=1/1 micro_steps=3900 elapsed=13.6s lr=2.000000e-03 loss=0.2536 loss_recon=0.2536 loss_meanflow=0.0000 mean_model_t=0.2097 mean_corrupt_t=0.2097 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.3524 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.9199 corrupt_frac=1.0000 acc_corrupt=0.9199 loss_corrupt=0.2536 wrong_frac=0.7903 init_acc_corrupt=0.1382 acc_corrupt_t_0p0_0p2=0.8541 corrupt_frac_t_0p0_0p2=0.5486 acc_corrupt_t_0p2_0p4=0.9999 corrupt_frac_t_0p2_0p4=0.3675 acc_corrupt_t_0p4_0p6=0.9999 corrupt_frac_t_0p4_0p6=0.0754 out_w_norm=12.3579 out_g_norm=0.3368 acc_corrupt_t_0p6_0p8=0.9998 corrupt_frac_t_0p6_0p8=0.0131 acc_corrupt_t_0p8_1p0=0.9993 corrupt_frac_t_0p8_1p0=0.0078 loss_all=0.2855 init_gold_top10=0.4527 init_gold_top100=0.5688 rollout_applied_pos_frac=0.3594 init_acc_rollout_applied=0.1643 init_acc_rollout_kept=0.1251 logit_acc_rollout_applied=0.8880 logit_acc_rollout_kept=0.9133
823
+ step=4000 epoch=4000/4000 epoch_step=1/1 micro_steps=4000 elapsed=13.5s lr=2.000000e-03 loss=0.2502 loss_recon=0.2502 loss_meanflow=0.0000 mean_model_t=0.2090 mean_corrupt_t=0.2090 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.3507 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.9205 corrupt_frac=1.0000 acc_corrupt=0.9205 loss_corrupt=0.2502 wrong_frac=0.7909 init_acc_corrupt=0.1380 acc_corrupt_t_0p0_0p2=0.8569 corrupt_frac_t_0p0_0p2=0.5551 acc_corrupt_t_0p2_0p4=0.9999 corrupt_frac_t_0p2_0p4=0.3608 acc_corrupt_t_0p4_0p6=0.9999 corrupt_frac_t_0p4_0p6=0.0745 acc_corrupt_t_0p6_0p8=0.9997 corrupt_frac_t_0p6_0p8=0.0131 out_w_norm=12.3747 out_g_norm=0.3521 acc_corrupt_t_0p8_1p0=0.9990 corrupt_frac_t_0p8_1p0=0.0078 loss_all=0.2211 init_gold_top10=0.4130 init_gold_top100=0.5204 rollout_applied_pos_frac=0.3047 init_acc_rollout_applied=0.1318 init_acc_rollout_kept=0.1007 logit_acc_rollout_applied=0.9164 logit_acc_rollout_kept=0.9363
824
+ NCCL version 2.25.1+cuda12.8
825
+ resumed_from=runs/train8_ctx1024_tradeoff_p35_unif0_0p25_outwdm1_ctx1024_tradeoff_dual_20260517_225705/latest.pt start_step=4001
826
+ {
827
+ "device": "cuda:0",
828
+ "rank": 0,
829
+ "world_size": 4,
830
+ "samples": "owt_cached_chunks:8",
831
+ "vocab_size": 2664,
832
+ "tokenizer_vocab_size": 50257,
833
+ "save_dir": "runs/train8_ctx1024_tradeoff_p35_unif0_0p25_outwdm1_ctx1024_tradeoff_dual_20260517_225705",
834
+ "batch_size": 128,
835
+ "grad_accum": 1,
836
+ "effective_batch_size": 512,
837
+ "global_batch_size": 512,
838
+ "lr_schedule": "constant_warmup",
839
+ "optimizer": "muon",
840
+ "epochs": 0.0,
841
+ "steps_per_epoch": 1,
842
+ "total_steps": 5000,
843
+ "warmup_steps": 10,
844
+ "warmup_epochs": -1.0,
845
+ "min_lr": 0.0,
846
+ "weight_decay": 0.1,
847
+ "output_weight_decay": -1.0,
848
+ "adamw_param_groups": "nanogpt",
849
+ "adam_beta1": 0.9,
850
+ "adam_beta2": 0.95,
851
+ "adam_eps": 1e-08,
852
+ "muon_impl": "legacy",
853
+ "muon_momentum": 0.95,
854
+ "muon_ns_steps": 5,
855
+ "muon_update_scale": 1.0,
856
+ "muon_nesterov": false,
857
+ "muon_width_scale": false,
858
+ "muon_grouping": "legacy_dim_ge_2",
859
+ "muon_param_count": 2616320,
860
+ "muon_adam_param_count": 8192,
861
+ "muon_param_names": [
862
+ "vocab_embed.embedding",
863
+ "sigma_map.net.0.weight",
864
+ "sigma_map.net.2.weight",
865
+ "blocks.0.attn_qkv.weight",
866
+ "blocks.0.attn_out.weight",
867
+ "blocks.0.mlp.0.weight",
868
+ "blocks.0.mlp.2.weight",
869
+ "blocks.0.adaLN_modulation.weight",
870
+ "blocks.1.attn_qkv.weight",
871
+ "blocks.1.attn_out.weight",
872
+ "blocks.1.mlp.0.weight",
873
+ "blocks.1.mlp.2.weight",
874
+ "blocks.1.adaLN_modulation.weight",
875
+ "blocks.2.attn_qkv.weight",
876
+ "blocks.2.attn_out.weight",
877
+ "blocks.2.mlp.0.weight",
878
+ "blocks.2.mlp.2.weight",
879
+ "blocks.2.adaLN_modulation.weight",
880
+ "output_layer.linear.weight",
881
+ "output_layer.adaLN_modulation.weight"
882
+ ],
883
+ "muon_adam_param_names": [
884
+ "sigma_map.net.0.bias",
885
+ "sigma_map.net.2.bias",
886
+ "blocks.0.norm1.weight",
887
+ "blocks.0.norm2.weight",
888
+ "blocks.0.mlp.0.bias",
889
+ "blocks.0.mlp.2.bias",
890
+ "blocks.0.adaLN_modulation.bias",
891
+ "blocks.1.norm1.weight",
892
+ "blocks.1.norm2.weight",
893
+ "blocks.1.mlp.0.bias",
894
+ "blocks.1.mlp.2.bias",
895
+ "blocks.1.adaLN_modulation.bias",
896
+ "blocks.2.norm1.weight",
897
+ "blocks.2.norm2.weight",
898
+ "blocks.2.mlp.0.bias",
899
+ "blocks.2.mlp.2.bias",
900
+ "blocks.2.adaLN_modulation.bias",
901
+ "output_layer.norm_final.weight",
902
+ "output_layer.adaLN_modulation.bias"
903
+ ],
904
+ "muon_effective_nesterov": false,
905
+ "muon_effective_width_scale": false,
906
+ "muon_effective_weight_decay": 0.1,
907
+ "muon_adam_fallback_nesterov": false,
908
+ "muon_adam_fallback_weight_decay": 0.1,
909
+ "ema_decay": 0.9999,
910
+ "ema_start_step": 0,
911
+ "model_type": "ddit",
912
+ "ddit_mlp_type": "gelu",
913
+ "elf_num_time_tokens": 4,
914
+ "elf_num_model_mode_tokens": 0,
915
+ "qk_norm": true,
916
+ "output_bias": false,
917
+ "output_init_std": -1.0,
918
+ "norm_type": "rmsnorm",
919
+ "target_loss": "hard_ce",
920
+ "linear_soft_target_power": 1.0,
921
+ "linear_soft_target_min_conf": 0.0,
922
+ "linear_soft_target_max_conf": 1.0,
923
+ "t_sampling_mode": "logit_normal",
924
+ "t_sampling_power": 1.0,
925
+ "t_sampling_eps": 0.0001,
926
+ "t_sampling_logit_mean": -1.5,
927
+ "t_sampling_logit_std": 0.8,
928
+ "dual_t": true,
929
+ "corrupt_t_mode": "same",
930
+ "corrupt_min_t": 0.0,
931
+ "corrupt_max_t": 1.0,
932
+ "prefix_block_prob": 0.0,
933
+ "prefix_block_len": 128,
934
+ "mask_ratio_floor_schedule": "none",
935
+ "dirichlet_endpoint_mode": "categorical_dual_t",
936
+ "dirichlet_semantic_t_mode": "same",
937
+ "dirichlet_semantic_t_value": 0.0,
938
+ "dirichlet_semantic_t_curve": "linear",
939
+ "dirichlet_semantic_t_power": 1.0,
940
+ "endpoint_sequence_random_prob_alpha": 0.0,
941
+ "categorical_wrong_from_full_vocab": true,
942
+ "categorical_wrong_from_batch_valid_tokens": false,
943
+ "categorical_wrong_basin_token_ids": "",
944
+ "categorical_wrong_basin_prob": 0.0,
945
+ "categorical_wrong_unigram_prob": 0.0,
946
+ "categorical_wrong_uniform_prob": 0.0,
947
+ "categorical_wrong_prob_floor": 0.0,
948
+ "categorical_wrong_corpus_unigram_path": "",
949
+ "categorical_wrong_corpus_unigram_alpha": 1.0,
950
+ "categorical_wrong_basin_shared_prob": 0.0,
951
+ "categorical_wrong_unigram_shared_prob": 0.0,
952
+ "mask_mixture_original_prob": 0.0,
953
+ "mask_mixture_lowk_prob": 0.0,
954
+ "mask_mixture_lowcorrupt_prob": 0.0,
955
+ "mask_mixture_block_prob": 0.0,
956
+ "mask_mixture_all_prob": 1.0,
957
+ "mask_mixture_lowk_clean_tokens": "0",
958
+ "mask_mixture_lowcorrupt_tokens": "1,2,4,8,16,32,64",
959
+ "mask_mixture_block_tokens": "64,128",
960
+ "simplex_bridge_sampler": "dirichlet",
961
+ "logistic_normal_sigma_min": 0.1,
962
+ "logistic_normal_sigma_max": 1.0,
963
+ "logistic_normal_tau_min": 1.0,
964
+ "logistic_normal_tau_max": 1.0,
965
+ "torch_compile": false,
966
+ "compile_mode": "max-autotune",
967
+ "state_format": "prob",
968
+ "meanflow_weight": 0.0,
969
+ "rollout_train_prob": 0.35,
970
+ "rollout_train_steps": 1,
971
+ "rollout_train_infer_steps": 1,
972
+ "rollout_train_time_mode": "sampled_s",
973
+ "rollout_train_s_dist": "uniform",
974
+ "rollout_train_s_min_frac": 0.0,
975
+ "rollout_train_s_max_frac": 0.25,
976
+ "rollout_train_s_beta_alpha": 2.0,
977
+ "rollout_train_s_beta_beta": 6.0,
978
+ "rollout_train_temp": 1.45,
979
+ "rollout_train_max_gamma": 1.0,
980
+ "rollout_train_corrupt_only": true,
981
+ "rollout_train_samplewise": true,
982
+ "rollout_train_compute_always": false,
983
+ "rollout_train_sync_t": true,
984
+ "bridge_noise_init": "logistic_normal",
985
+ "noise_sigma": -1.0,
986
+ "allow_tf32": true,
987
+ "activation_checkpointing": false,
988
+ "activation_checkpoint_interval": 1,
989
+ "activation_checkpoint_scope": "block",
990
+ "ddp_static_graph": false,
991
+ "ddp_gradient_as_bucket_view": true,
992
+ "blocking_data_transfer": false,
993
+ "dataloader_prefetch_factor": 4,
994
+ "full_train_stats": false,
995
+ "tokenized_hf": false,
996
+ "tokenized_pad_token": "pad",
997
+ "elf_conditional_hf": false,
998
+ "record_pad_truncate": false,
999
+ "record_add_eos": false,
1000
+ "record_add_special_tokens": false,
1001
+ "record_pad_token": "pad",
1002
+ "record_shuffle_buffer": 10000,
1003
+ "wrap": true,
1004
+ "wrap_mode": "stream",
1005
+ "wrap_record_buffer_size": 200,
1006
+ "owt_cached_chunks": true,
1007
+ "owt_chunk_cache_dir": "/e2e-data/evad-tech-vla/wanghan58/data/small_benchmarks/langflow_2604_11748/openwebtext_lta_cached_chunks/gpt2_len1024_train8_compact_overfit",
1008
+ "owt_chunk_cache_rebuild": false,
1009
+ "owt_chunk_cache_write_batch": 4096,
1010
+ "owt_exact_repeat_per_chunk": 64,
1011
+ "online_chunk_shuffle": false,
1012
+ "online_chunk_shuffle_buffer": 10000,
1013
+ "openwebtext_split": "train_minus_100k",
1014
+ "detokenizer": "auto",
1015
+ "resolved_detokenizer": null,
1016
+ "num_workers": 0,
1017
+ "latest_every": 1000,
1018
+ "resume_path": "runs/train8_ctx1024_tradeoff_p35_unif0_0p25_outwdm1_ctx1024_tradeoff_dual_20260517_225705/latest.pt"
1019
+ }
1020
+ step=4100 epoch=4100/5000 epoch_step=1/1 micro_steps=4100 elapsed=14.7s lr=2.000000e-03 loss=0.2475 loss_recon=0.2475 loss_meanflow=0.0000 mean_model_t=0.2070 mean_corrupt_t=0.2070 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.3566 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.9226 corrupt_frac=1.0000 acc_corrupt=0.9226 loss_corrupt=0.2475 wrong_frac=0.7930 init_acc_corrupt=0.1356 acc_corrupt_t_0p0_0p2=0.8630 corrupt_frac_t_0p0_0p2=0.5646 acc_corrupt_t_0p2_0p4=0.9999 corrupt_frac_t_0p2_0p4=0.3537 acc_corrupt_t_0p4_0p6=1.0000 corrupt_frac_t_0p4_0p6=0.0731 acc_corrupt_t_0p6_0p8=0.9998 corrupt_frac_t_0p6_0p8=0.0131 out_w_norm=12.3913 out_g_norm=0.2801 acc_corrupt_t_0p8_1p0=1.0000 corrupt_frac_t_0p8_1p0=0.0078 loss_all=0.3210 init_gold_top10=0.4741 init_gold_top100=0.5788 rollout_applied_pos_frac=0.3750 init_acc_rollout_applied=0.1707 init_acc_rollout_kept=0.1284 logit_acc_rollout_applied=0.8816 logit_acc_rollout_kept=0.8989
1021
+ step=4200 epoch=4200/5000 epoch_step=1/1 micro_steps=4200 elapsed=13.8s lr=2.000000e-03 loss=0.2207 loss_recon=0.2207 loss_meanflow=0.0000 mean_model_t=0.2093 mean_corrupt_t=0.2093 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.3467 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.9305 corrupt_frac=1.0000 acc_corrupt=0.9305 loss_corrupt=0.2207 wrong_frac=0.7905 init_acc_corrupt=0.1383 acc_corrupt_t_0p0_0p2=0.8749 corrupt_frac_t_0p0_0p2=0.5550 acc_corrupt_t_0p2_0p4=0.9999 corrupt_frac_t_0p2_0p4=0.3600 acc_corrupt_t_0p4_0p6=0.9999 corrupt_frac_t_0p4_0p6=0.0760 acc_corrupt_t_0p6_0p8=0.9996 corrupt_frac_t_0p6_0p8=0.0131 out_w_norm=12.4189 out_g_norm=0.2572 acc_corrupt_t_0p8_1p0=1.0000 corrupt_frac_t_0p8_1p0=0.0078 loss_all=0.1595 init_gold_top10=0.4790 init_gold_top100=0.5809 rollout_applied_pos_frac=0.3828 init_acc_rollout_applied=0.2249 init_acc_rollout_kept=0.1165 logit_acc_rollout_applied=0.9164 logit_acc_rollout_kept=0.9773
1022
+ step=4300 epoch=4300/5000 epoch_step=1/1 micro_steps=4300 elapsed=13.8s lr=2.000000e-03 loss=0.2405 loss_recon=0.2405 loss_meanflow=0.0000 mean_model_t=0.2077 mean_corrupt_t=0.2077 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.3452 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.9226 corrupt_frac=1.0000 acc_corrupt=0.9226 loss_corrupt=0.2405 wrong_frac=0.7923 init_acc_corrupt=0.1372 acc_corrupt_t_0p0_0p2=0.8638 corrupt_frac_t_0p0_0p2=0.5677 acc_corrupt_t_0p2_0p4=0.9999 corrupt_frac_t_0p2_0p4=0.3483 acc_corrupt_t_0p4_0p6=0.9999 corrupt_frac_t_0p4_0p6=0.0743 acc_corrupt_t_0p6_0p8=0.9999 corrupt_frac_t_0p6_0p8=0.0139 out_w_norm=12.4505 out_g_norm=0.2480 acc_corrupt_t_0p8_1p0=0.9995 corrupt_frac_t_0p8_1p0=0.0078 loss_all=0.2279 init_gold_top10=0.3981 init_gold_top100=0.5180 rollout_applied_pos_frac=0.2891 init_acc_rollout_applied=0.1497 init_acc_rollout_kept=0.1125 logit_acc_rollout_applied=0.9164 logit_acc_rollout_kept=0.9304
1023
+ step=4400 epoch=4400/5000 epoch_step=1/1 micro_steps=4400 elapsed=13.8s lr=2.000000e-03 loss=0.2299 loss_recon=0.2299 loss_meanflow=0.0000 mean_model_t=0.2094 mean_corrupt_t=0.2094 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.3509 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.9278 corrupt_frac=1.0000 acc_corrupt=0.9278 loss_corrupt=0.2299 wrong_frac=0.7904 init_acc_corrupt=0.1401 acc_corrupt_t_0p0_0p2=0.8702 corrupt_frac_t_0p0_0p2=0.5561 acc_corrupt_t_0p2_0p4=1.0000 corrupt_frac_t_0p2_0p4=0.3568 acc_corrupt_t_0p4_0p6=1.0000 corrupt_frac_t_0p4_0p6=0.0770 acc_corrupt_t_0p6_0p8=1.0000 corrupt_frac_t_0p6_0p8=0.0138 out_w_norm=12.4714 out_g_norm=0.2465 acc_corrupt_t_0p8_1p0=0.9983 corrupt_frac_t_0p8_1p0=0.0078 loss_all=0.5788 init_gold_top10=0.3953 init_gold_top100=0.5171 rollout_applied_pos_frac=0.2891 init_acc_rollout_applied=0.1687 init_acc_rollout_kept=0.1193 logit_acc_rollout_applied=0.9465 logit_acc_rollout_kept=0.8181
1024
+ step=4500 epoch=4500/5000 epoch_step=1/1 micro_steps=4500 elapsed=13.9s lr=2.000000e-03 loss=0.2353 loss_recon=0.2353 loss_meanflow=0.0000 mean_model_t=0.2075 mean_corrupt_t=0.2075 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.3543 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.9258 corrupt_frac=1.0000 acc_corrupt=0.9258 loss_corrupt=0.2353 wrong_frac=0.7924 init_acc_corrupt=0.1377 acc_corrupt_t_0p0_0p2=0.8669 corrupt_frac_t_0p0_0p2=0.5574 acc_corrupt_t_0p2_0p4=0.9999 corrupt_frac_t_0p2_0p4=0.3599 acc_corrupt_t_0p4_0p6=1.0000 corrupt_frac_t_0p4_0p6=0.0752 acc_corrupt_t_0p6_0p8=0.9999 corrupt_frac_t_0p6_0p8=0.0118 out_w_norm=12.4896 out_g_norm=0.2453 acc_corrupt_t_0p8_1p0=1.0000 corrupt_frac_t_0p8_1p0=0.0078 loss_all=0.3040 init_gold_top10=0.4599 init_gold_top100=0.5682 rollout_applied_pos_frac=0.3594 init_acc_rollout_applied=0.1761 init_acc_rollout_kept=0.1267 logit_acc_rollout_applied=0.8716 logit_acc_rollout_kept=0.9091
LTA_openwebtext_dualt/logs/softendpoint_mn_pilot_4gpu/train8_n1024_compactv2664_3l_bs512_hard_ce_allcorrupt.log ADDED
The diff for this file is too large to render. See raw diff
 
LTA_openwebtext_dualt/logs/softendpoint_mn_pilot_4gpu/train8_n1024_compactv2664_3l_bs512_hard_ce_onehot.log ADDED
The diff for this file is too large to render. See raw diff
 
LTA_openwebtext_dualt/logs/softendpoint_mn_pilot_4gpu/train8_n512_compactv1635_3l_bs512_hard_ce_allcorrupt.log ADDED
The diff for this file is too large to render. See raw diff
 
LTA_openwebtext_dualt/logs/softendpoint_mn_pilot_4gpu/train8_n64_compactv335_3l_bs512_hard_ce_allcorrupt.log ADDED
The diff for this file is too large to render. See raw diff
 
LTA_openwebtext_dualt/logs/softendpoint_mn_pilot_4gpu/train8_n8_compactv47_3l_hard_ce_onehot.log ADDED
The diff for this file is too large to render. See raw diff
 
LTA_openwebtext_dualt/logs/softendpoint_mn_pilot_4gpu/train8_n8_linear_soft_kl_onehot_20260517_train8ctx8_overfit.log ADDED
@@ -0,0 +1,326 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ NCCL version 2.25.1+cuda12.8
2
+ {
3
+ "device": "cuda:0",
4
+ "rank": 0,
5
+ "world_size": 4,
6
+ "samples": "owt_cached_chunks:8",
7
+ "vocab_size": 50257,
8
+ "tokenizer_vocab_size": 50257,
9
+ "save_dir": "runs/train8_n8_linear_soft_kl_onehot_20260517_train8ctx8_overfit",
10
+ "batch_size": 1,
11
+ "grad_accum": 1,
12
+ "effective_batch_size": 4,
13
+ "global_batch_size": 4,
14
+ "lr_schedule": "constant_warmup",
15
+ "optimizer": "muon",
16
+ "epochs": 0.0,
17
+ "steps_per_epoch": 2,
18
+ "total_steps": 500,
19
+ "warmup_steps": 10,
20
+ "warmup_epochs": -1.0,
21
+ "min_lr": 0.0,
22
+ "weight_decay": 0.1,
23
+ "output_weight_decay": -1.0,
24
+ "adamw_param_groups": "nanogpt",
25
+ "adam_beta1": 0.9,
26
+ "adam_beta2": 0.95,
27
+ "adam_eps": 1e-08,
28
+ "muon_impl": "legacy",
29
+ "muon_momentum": 0.95,
30
+ "muon_ns_steps": 5,
31
+ "muon_update_scale": 1.0,
32
+ "muon_nesterov": false,
33
+ "muon_width_scale": false,
34
+ "muon_grouping": "legacy_dim_ge_2",
35
+ "muon_param_count": 169453056,
36
+ "muon_adam_param_count": 122368,
37
+ "muon_param_names": [
38
+ "vocab_embed.embedding",
39
+ "sigma_map.net.0.weight",
40
+ "sigma_map.net.2.weight",
41
+ "blocks.0.attn_qkv.weight",
42
+ "blocks.0.attn_out.weight",
43
+ "blocks.0.mlp.0.weight",
44
+ "blocks.0.mlp.2.weight",
45
+ "blocks.0.adaLN_modulation.weight",
46
+ "blocks.1.attn_qkv.weight",
47
+ "blocks.1.attn_out.weight",
48
+ "blocks.1.mlp.0.weight",
49
+ "blocks.1.mlp.2.weight",
50
+ "blocks.1.adaLN_modulation.weight",
51
+ "blocks.2.attn_qkv.weight",
52
+ "blocks.2.attn_out.weight",
53
+ "blocks.2.mlp.0.weight",
54
+ "blocks.2.mlp.2.weight",
55
+ "blocks.2.adaLN_modulation.weight",
56
+ "blocks.3.attn_qkv.weight",
57
+ "blocks.3.attn_out.weight",
58
+ "blocks.3.mlp.0.weight",
59
+ "blocks.3.mlp.2.weight",
60
+ "blocks.3.adaLN_modulation.weight",
61
+ "blocks.4.attn_qkv.weight",
62
+ "blocks.4.attn_out.weight",
63
+ "blocks.4.mlp.0.weight",
64
+ "blocks.4.mlp.2.weight",
65
+ "blocks.4.adaLN_modulation.weight",
66
+ "blocks.5.attn_qkv.weight",
67
+ "blocks.5.attn_out.weight",
68
+ "blocks.5.mlp.0.weight",
69
+ "blocks.5.mlp.2.weight",
70
+ "blocks.5.adaLN_modulation.weight",
71
+ "blocks.6.attn_qkv.weight",
72
+ "blocks.6.attn_out.weight",
73
+ "blocks.6.mlp.0.weight",
74
+ "blocks.6.mlp.2.weight",
75
+ "blocks.6.adaLN_modulation.weight",
76
+ "blocks.7.attn_qkv.weight",
77
+ "blocks.7.attn_out.weight",
78
+ "blocks.7.mlp.0.weight",
79
+ "blocks.7.mlp.2.weight",
80
+ "blocks.7.adaLN_modulation.weight",
81
+ "blocks.8.attn_qkv.weight",
82
+ "blocks.8.attn_out.weight",
83
+ "blocks.8.mlp.0.weight",
84
+ "blocks.8.mlp.2.weight",
85
+ "blocks.8.adaLN_modulation.weight",
86
+ "blocks.9.attn_qkv.weight",
87
+ "blocks.9.attn_out.weight",
88
+ "blocks.9.mlp.0.weight",
89
+ "blocks.9.mlp.2.weight",
90
+ "blocks.9.adaLN_modulation.weight",
91
+ "blocks.10.attn_qkv.weight",
92
+ "blocks.10.attn_out.weight",
93
+ "blocks.10.mlp.0.weight",
94
+ "blocks.10.mlp.2.weight",
95
+ "blocks.10.adaLN_modulation.weight",
96
+ "blocks.11.attn_qkv.weight",
97
+ "blocks.11.attn_out.weight",
98
+ "blocks.11.mlp.0.weight",
99
+ "blocks.11.mlp.2.weight",
100
+ "blocks.11.adaLN_modulation.weight",
101
+ "output_layer.linear.weight",
102
+ "output_layer.adaLN_modulation.weight"
103
+ ],
104
+ "muon_adam_param_names": [
105
+ "sigma_map.net.0.bias",
106
+ "sigma_map.net.2.bias",
107
+ "blocks.0.norm1.weight",
108
+ "blocks.0.norm2.weight",
109
+ "blocks.0.mlp.0.bias",
110
+ "blocks.0.mlp.2.bias",
111
+ "blocks.0.adaLN_modulation.bias",
112
+ "blocks.1.norm1.weight",
113
+ "blocks.1.norm2.weight",
114
+ "blocks.1.mlp.0.bias",
115
+ "blocks.1.mlp.2.bias",
116
+ "blocks.1.adaLN_modulation.bias",
117
+ "blocks.2.norm1.weight",
118
+ "blocks.2.norm2.weight",
119
+ "blocks.2.mlp.0.bias",
120
+ "blocks.2.mlp.2.bias",
121
+ "blocks.2.adaLN_modulation.bias",
122
+ "blocks.3.norm1.weight",
123
+ "blocks.3.norm2.weight",
124
+ "blocks.3.mlp.0.bias",
125
+ "blocks.3.mlp.2.bias",
126
+ "blocks.3.adaLN_modulation.bias",
127
+ "blocks.4.norm1.weight",
128
+ "blocks.4.norm2.weight",
129
+ "blocks.4.mlp.0.bias",
130
+ "blocks.4.mlp.2.bias",
131
+ "blocks.4.adaLN_modulation.bias",
132
+ "blocks.5.norm1.weight",
133
+ "blocks.5.norm2.weight",
134
+ "blocks.5.mlp.0.bias",
135
+ "blocks.5.mlp.2.bias",
136
+ "blocks.5.adaLN_modulation.bias",
137
+ "blocks.6.norm1.weight",
138
+ "blocks.6.norm2.weight",
139
+ "blocks.6.mlp.0.bias",
140
+ "blocks.6.mlp.2.bias",
141
+ "blocks.6.adaLN_modulation.bias",
142
+ "blocks.7.norm1.weight",
143
+ "blocks.7.norm2.weight",
144
+ "blocks.7.mlp.0.bias",
145
+ "blocks.7.mlp.2.bias",
146
+ "blocks.7.adaLN_modulation.bias",
147
+ "blocks.8.norm1.weight",
148
+ "blocks.8.norm2.weight",
149
+ "blocks.8.mlp.0.bias",
150
+ "blocks.8.mlp.2.bias",
151
+ "blocks.8.adaLN_modulation.bias",
152
+ "blocks.9.norm1.weight",
153
+ "blocks.9.norm2.weight",
154
+ "blocks.9.mlp.0.bias",
155
+ "blocks.9.mlp.2.bias",
156
+ "blocks.9.adaLN_modulation.bias",
157
+ "blocks.10.norm1.weight",
158
+ "blocks.10.norm2.weight",
159
+ "blocks.10.mlp.0.bias",
160
+ "blocks.10.mlp.2.bias",
161
+ "blocks.10.adaLN_modulation.bias",
162
+ "blocks.11.norm1.weight",
163
+ "blocks.11.norm2.weight",
164
+ "blocks.11.mlp.0.bias",
165
+ "blocks.11.mlp.2.bias",
166
+ "blocks.11.adaLN_modulation.bias",
167
+ "output_layer.norm_final.weight",
168
+ "output_layer.adaLN_modulation.bias"
169
+ ],
170
+ "muon_effective_nesterov": false,
171
+ "muon_effective_width_scale": false,
172
+ "muon_effective_weight_decay": 0.1,
173
+ "muon_adam_fallback_nesterov": false,
174
+ "muon_adam_fallback_weight_decay": 0.1,
175
+ "ema_decay": 0.9999,
176
+ "ema_start_step": 0,
177
+ "model_type": "ddit",
178
+ "elf_num_time_tokens": 4,
179
+ "elf_num_model_mode_tokens": 0,
180
+ "qk_norm": true,
181
+ "output_bias": false,
182
+ "output_init_std": -1.0,
183
+ "norm_type": "rmsnorm",
184
+ "target_loss": "linear_soft_kl",
185
+ "linear_soft_target_power": 1.0,
186
+ "linear_soft_target_min_conf": 0.0,
187
+ "linear_soft_target_max_conf": 1.0,
188
+ "t_sampling_mode": "logit_normal",
189
+ "t_sampling_power": 1.0,
190
+ "t_sampling_eps": 0.0001,
191
+ "t_sampling_logit_mean": -1.5,
192
+ "t_sampling_logit_std": 0.8,
193
+ "dual_t": true,
194
+ "corrupt_t_mode": "same",
195
+ "corrupt_min_t": 0.0,
196
+ "corrupt_max_t": 1.0,
197
+ "prefix_block_prob": 0.0,
198
+ "prefix_block_len": 128,
199
+ "mask_ratio_floor_schedule": "none",
200
+ "dirichlet_endpoint_mode": "categorical_dual_t",
201
+ "dirichlet_semantic_t_mode": "same",
202
+ "dirichlet_semantic_t_value": 0.0,
203
+ "dirichlet_semantic_t_curve": "linear",
204
+ "dirichlet_semantic_t_power": 1.0,
205
+ "endpoint_sequence_random_prob_alpha": 0.0,
206
+ "categorical_wrong_from_full_vocab": true,
207
+ "categorical_wrong_from_batch_valid_tokens": false,
208
+ "categorical_wrong_basin_token_ids": "",
209
+ "categorical_wrong_basin_prob": 0.0,
210
+ "categorical_wrong_unigram_prob": 0.0,
211
+ "categorical_wrong_uniform_prob": 0.0,
212
+ "categorical_wrong_corpus_unigram_path": "",
213
+ "categorical_wrong_corpus_unigram_alpha": 1.0,
214
+ "categorical_wrong_basin_shared_prob": 0.0,
215
+ "categorical_wrong_unigram_shared_prob": 0.0,
216
+ "mask_mixture_original_prob": 0.0,
217
+ "mask_mixture_lowk_prob": 1.0,
218
+ "mask_mixture_lowcorrupt_prob": 0.0,
219
+ "mask_mixture_block_prob": 0.0,
220
+ "mask_mixture_all_prob": 0.0,
221
+ "mask_mixture_lowk_clean_tokens": "1,2,4",
222
+ "mask_mixture_lowcorrupt_tokens": "1,2,4,8,16,32,64",
223
+ "mask_mixture_block_tokens": "64,128",
224
+ "simplex_bridge_sampler": "dirichlet",
225
+ "logistic_normal_sigma_min": 0.18,
226
+ "logistic_normal_sigma_max": 2.2,
227
+ "logistic_normal_tau_min": 0.65,
228
+ "logistic_normal_tau_max": 1.15,
229
+ "torch_compile": false,
230
+ "compile_mode": "max-autotune",
231
+ "state_format": "prob",
232
+ "meanflow_weight": 0.0,
233
+ "rollout_train_prob": 0.0,
234
+ "rollout_train_steps": 1,
235
+ "rollout_train_infer_steps": 64,
236
+ "rollout_train_temp": 1.45,
237
+ "rollout_train_max_gamma": 1.0,
238
+ "rollout_train_corrupt_only": true,
239
+ "rollout_train_samplewise": false,
240
+ "rollout_train_compute_always": false,
241
+ "bridge_noise_init": "logistic_normal",
242
+ "noise_sigma": -1.0,
243
+ "allow_tf32": true,
244
+ "activation_checkpointing": false,
245
+ "activation_checkpoint_interval": 1,
246
+ "activation_checkpoint_scope": "block",
247
+ "ddp_static_graph": false,
248
+ "ddp_gradient_as_bucket_view": true,
249
+ "blocking_data_transfer": false,
250
+ "dataloader_prefetch_factor": 4,
251
+ "full_train_stats": false,
252
+ "tokenized_hf": false,
253
+ "tokenized_pad_token": "pad",
254
+ "elf_conditional_hf": false,
255
+ "record_pad_truncate": false,
256
+ "record_add_eos": false,
257
+ "record_add_special_tokens": false,
258
+ "record_pad_token": "pad",
259
+ "record_shuffle_buffer": 10000,
260
+ "wrap": true,
261
+ "wrap_mode": "stream",
262
+ "wrap_record_buffer_size": 200,
263
+ "owt_cached_chunks": true,
264
+ "owt_chunk_cache_dir": "/e2e-data/evad-tech-vla/wanghan58/data/small_benchmarks/langflow_2604_11748/openwebtext_lta_cached_chunks/gpt2_len8_train8_overfit",
265
+ "owt_chunk_cache_rebuild": false,
266
+ "owt_chunk_cache_write_batch": 4096,
267
+ "owt_exact_repeat_per_chunk": 0,
268
+ "online_chunk_shuffle": false,
269
+ "online_chunk_shuffle_buffer": 10000,
270
+ "openwebtext_split": "train_minus_100k",
271
+ "detokenizer": "auto",
272
+ "resolved_detokenizer": null,
273
+ "num_workers": 0,
274
+ "latest_every": 10,
275
+ "resume_path": ""
276
+ }
277
+ step=10 epoch=5/250 epoch_step=2/2 micro_steps=10 elapsed=2.1s lr=2.000000e-03 loss=1.4378 loss_recon=1.4378 loss_meanflow=0.0000 mean_model_t=0.1662 mean_corrupt_t=0.1662 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.1662 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.2500 corrupt_frac=0.7125 acc_corrupt=0.1228 loss_corrupt=2.1079 wrong_frac=0.7895 init_acc_corrupt=0.0877 acc_corrupt_t_0p0_0p2=0.0513 corrupt_frac_t_0p0_0p2=1.0000 out_w_norm=0.0168 out_g_norm=1.4755 acc_corrupt_t_0p2_0p4=0.2778 corrupt_frac_t_0p2_0p4=1.0000 loss_all=10.7422 init_gold_top10=0.5000 init_gold_top100=0.5000
278
+ step=20 epoch=10/250 epoch_step=2/2 micro_steps=20 elapsed=5.4s lr=2.000000e-03 loss=1.6261 loss_recon=1.6261 loss_meanflow=0.0000 mean_model_t=0.1939 mean_corrupt_t=0.1939 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.1939 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.3000 corrupt_frac=0.6375 acc_corrupt=0.1569 loss_corrupt=2.2662 wrong_frac=0.8824 init_acc_corrupt=0.0588 acc_corrupt_t_0p0_0p2=0.1538 corrupt_frac_t_0p0_0p2=1.0000 out_w_norm=0.1012 out_g_norm=1.7398 acc_corrupt_t_0p2_0p4=0.1579 corrupt_frac_t_0p2_0p4=1.0000 acc_corrupt_t_0p4_0p6=0.1667 corrupt_frac_t_0p4_0p6=1.0000 loss_all=10.6484 init_gold_top10=0.0000 init_gold_top100=0.0000
279
+ step=30 epoch=15/250 epoch_step=2/2 micro_steps=30 elapsed=5.0s lr=2.000000e-03 loss=1.4998 loss_recon=1.4998 loss_meanflow=0.0000 mean_model_t=0.1869 mean_corrupt_t=0.1869 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.1869 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.1875 corrupt_frac=0.7125 acc_corrupt=0.1579 loss_corrupt=2.0982 wrong_frac=0.8772 init_acc_corrupt=0.0175 acc_corrupt_t_0p0_0p2=0.1379 corrupt_frac_t_0p0_0p2=1.0000 out_w_norm=0.2113 out_g_norm=1.9543 acc_corrupt_t_0p2_0p4=0.1786 corrupt_frac_t_0p2_0p4=1.0000 loss_all=10.3594 init_gold_top10=0.0000 init_gold_top100=0.0000
280
+ step=40 epoch=20/250 epoch_step=2/2 micro_steps=40 elapsed=4.8s lr=2.000000e-03 loss=2.1429 loss_recon=2.1429 loss_meanflow=0.0000 mean_model_t=0.2433 mean_corrupt_t=0.2433 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.2433 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.2750 corrupt_frac=0.6625 acc_corrupt=0.1887 loss_corrupt=2.8585 wrong_frac=0.7925 init_acc_corrupt=0.1509 acc_corrupt_t_0p0_0p2=0.2222 corrupt_frac_t_0p0_0p2=1.0000 out_w_norm=0.3226 out_g_norm=1.9545 acc_corrupt_t_0p2_0p4=0.1724 corrupt_frac_t_0p2_0p4=1.0000 acc_corrupt_t_0p6_0p8=0.1667 corrupt_frac_t_0p6_0p8=1.0000 loss_all=10.1250 init_gold_top10=0.0000 init_gold_top100=0.0000
281
+ step=50 epoch=25/250 epoch_step=2/2 micro_steps=50 elapsed=5.3s lr=2.000000e-03 loss=2.0644 loss_recon=2.0644 loss_meanflow=0.0000 mean_model_t=0.2392 mean_corrupt_t=0.2392 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.2392 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.2500 corrupt_frac=0.7875 acc_corrupt=0.1111 loss_corrupt=3.0162 wrong_frac=0.7778 init_acc_corrupt=0.1111 acc_corrupt_t_0p0_0p2=0.0833 corrupt_frac_t_0p0_0p2=1.0000 out_w_norm=0.4197 out_g_norm=1.8317 acc_corrupt_t_0p4_0p6=0.1429 corrupt_frac_t_0p4_0p6=1.0000 acc_corrupt_t_0p2_0p4=0.1200 corrupt_frac_t_0p2_0p4=1.0000 loss_all=10.3516 init_gold_top10=0.2857 init_gold_top100=0.4286
282
+ step=60 epoch=30/250 epoch_step=2/2 micro_steps=60 elapsed=3.9s lr=2.000000e-03 loss=0.6981 loss_recon=0.6981 loss_meanflow=0.0000 mean_model_t=0.0956 mean_corrupt_t=0.0956 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0956 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.3125 corrupt_frac=0.7125 acc_corrupt=0.1404 loss_corrupt=1.7183 wrong_frac=0.8596 init_acc_corrupt=0.0175 acc_corrupt_t_0p0_0p2=0.1400 corrupt_frac_t_0p0_0p2=1.0000 out_w_norm=0.5080 out_g_norm=1.7802 acc_corrupt_t_0p2_0p4=0.1429 corrupt_frac_t_0p2_0p4=1.0000 loss_all=9.8359 init_gold_top10=0.0000 init_gold_top100=0.1667
283
+ step=70 epoch=35/250 epoch_step=2/2 micro_steps=70 elapsed=5.3s lr=2.000000e-03 loss=1.2525 loss_recon=1.2525 loss_meanflow=0.0000 mean_model_t=0.1739 mean_corrupt_t=0.1739 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.1739 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.4500 corrupt_frac=0.6875 acc_corrupt=0.2909 loss_corrupt=2.1904 wrong_frac=0.8000 init_acc_corrupt=0.1091 acc_corrupt_t_0p0_0p2=0.2051 corrupt_frac_t_0p0_0p2=1.0000 out_w_norm=0.5936 out_g_norm=2.0018 acc_corrupt_t_0p2_0p4=0.5000 corrupt_frac_t_0p2_0p4=1.0000 loss_all=9.6328 init_gold_top10=0.1667 init_gold_top100=0.1667
284
+ step=80 epoch=40/250 epoch_step=2/2 micro_steps=80 elapsed=4.3s lr=2.000000e-03 loss=1.0250 loss_recon=1.0250 loss_meanflow=0.0000 mean_model_t=0.1471 mean_corrupt_t=0.1471 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.1471 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.3375 corrupt_frac=0.7250 acc_corrupt=0.2414 loss_corrupt=1.5685 wrong_frac=0.8103 init_acc_corrupt=0.0517 acc_corrupt_t_0p0_0p2=0.2308 corrupt_frac_t_0p0_0p2=1.0000 out_w_norm=0.6759 out_g_norm=1.6317 acc_corrupt_t_0p2_0p4=0.3333 corrupt_frac_t_0p2_0p4=1.0000 loss_all=8.8125 init_gold_top10=0.2500 init_gold_top100=0.2500
285
+ step=90 epoch=45/250 epoch_step=2/2 micro_steps=90 elapsed=4.3s lr=2.000000e-03 loss=2.0293 loss_recon=2.0293 loss_meanflow=0.0000 mean_model_t=0.2610 mean_corrupt_t=0.2610 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.2610 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.4125 corrupt_frac=0.7125 acc_corrupt=0.2281 loss_corrupt=2.8313 wrong_frac=0.7719 init_acc_corrupt=0.2105 acc_corrupt_t_0p4_0p6=0.3333 corrupt_frac_t_0p4_0p6=1.0000 out_w_norm=0.7576 out_g_norm=2.3043 acc_corrupt_t_0p2_0p4=0.2083 corrupt_frac_t_0p2_0p4=1.0000 acc_corrupt_t_0p0_0p2=0.1905 corrupt_frac_t_0p0_0p2=1.0000 loss_all=9.1992 init_gold_top10=0.1667 init_gold_top100=0.1667
286
+ step=100 epoch=50/250 epoch_step=2/2 micro_steps=100 elapsed=5.3s lr=2.000000e-03 loss=1.3626 loss_recon=1.3626 loss_meanflow=0.0000 mean_model_t=0.2061 mean_corrupt_t=0.2061 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.2061 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.4375 corrupt_frac=0.7000 acc_corrupt=0.2321 loss_corrupt=1.9125 wrong_frac=0.8393 init_acc_corrupt=0.0893 acc_corrupt_t_0p0_0p2=0.2059 corrupt_frac_t_0p0_0p2=1.0000 out_w_norm=0.8440 out_g_norm=2.1251 acc_corrupt_t_0p2_0p4=0.2727 corrupt_frac_t_0p2_0p4=1.0000 loss_all=8.1016 init_gold_top10=0.2500 init_gold_top100=0.2500
287
+ step=110 epoch=55/250 epoch_step=2/2 micro_steps=110 elapsed=4.3s lr=2.000000e-03 loss=1.2112 loss_recon=1.2112 loss_meanflow=0.0000 mean_model_t=0.1964 mean_corrupt_t=0.1964 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.1964 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.5375 corrupt_frac=0.6625 acc_corrupt=0.3208 loss_corrupt=2.1580 wrong_frac=0.6981 init_acc_corrupt=0.1509 acc_corrupt_t_0p2_0p4=0.3182 corrupt_frac_t_0p2_0p4=1.0000 out_w_norm=0.9333 out_g_norm=2.1929 acc_corrupt_t_0p0_0p2=0.2000 corrupt_frac_t_0p0_0p2=1.0000 acc_corrupt_t_0p4_0p6=0.8333 corrupt_frac_t_0p4_0p6=1.0000 loss_all=7.6875 init_gold_top10=0.2857 init_gold_top100=0.2857
288
+ step=120 epoch=60/250 epoch_step=2/2 micro_steps=120 elapsed=3.9s lr=2.000000e-03 loss=1.3013 loss_recon=1.3013 loss_meanflow=0.0000 mean_model_t=0.2019 mean_corrupt_t=0.2019 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.2019 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.4750 corrupt_frac=0.7500 acc_corrupt=0.3000 loss_corrupt=1.7847 wrong_frac=0.7500 init_acc_corrupt=0.2167 acc_corrupt_t_0p0_0p2=0.1786 corrupt_frac_t_0p0_0p2=1.0000 out_w_norm=1.0233 out_g_norm=1.7968 acc_corrupt_t_0p2_0p4=0.3077 corrupt_frac_t_0p2_0p4=1.0000 acc_corrupt_t_0p4_0p6=0.8333 corrupt_frac_t_0p4_0p6=1.0000 loss_all=6.5352 init_gold_top10=0.6667 init_gold_top100=0.6667
289
+ step=130 epoch=65/250 epoch_step=2/2 micro_steps=130 elapsed=5.0s lr=2.000000e-03 loss=0.9233 loss_recon=0.9233 loss_meanflow=0.0000 mean_model_t=0.1694 mean_corrupt_t=0.1694 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.1694 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.4125 corrupt_frac=0.7125 acc_corrupt=0.2105 loss_corrupt=1.8001 wrong_frac=0.7895 init_acc_corrupt=0.0175 acc_corrupt_t_0p0_0p2=0.1944 corrupt_frac_t_0p0_0p2=1.0000 out_w_norm=1.1115 out_g_norm=2.2608 acc_corrupt_t_0p2_0p4=0.2381 corrupt_frac_t_0p2_0p4=1.0000 loss_all=7.4980 init_gold_top10=0.3333 init_gold_top100=0.3333
290
+ step=140 epoch=70/250 epoch_step=2/2 micro_steps=140 elapsed=4.3s lr=2.000000e-03 loss=1.2486 loss_recon=1.2486 loss_meanflow=0.0000 mean_model_t=0.2216 mean_corrupt_t=0.2216 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.2216 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.4000 corrupt_frac=0.8000 acc_corrupt=0.2500 loss_corrupt=1.7026 wrong_frac=0.7969 init_acc_corrupt=0.1094 acc_corrupt_t_0p2_0p4=0.1579 corrupt_frac_t_0p2_0p4=1.0000 out_w_norm=1.2048 out_g_norm=2.0386 acc_corrupt_t_0p4_0p6=0.7143 corrupt_frac_t_0p4_0p6=1.0000 acc_corrupt_t_0p0_0p2=0.2105 corrupt_frac_t_0p0_0p2=1.0000 loss_all=5.8379 init_gold_top10=0.3333 init_gold_top100=0.3333
291
+ step=150 epoch=75/250 epoch_step=2/2 micro_steps=150 elapsed=3.9s lr=2.000000e-03 loss=0.9679 loss_recon=0.9679 loss_meanflow=0.0000 mean_model_t=0.2049 mean_corrupt_t=0.2049 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.2049 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.5250 corrupt_frac=0.6875 acc_corrupt=0.3091 loss_corrupt=1.9291 wrong_frac=0.8727 init_acc_corrupt=0.0727 acc_corrupt_t_0p4_0p6=0.3333 corrupt_frac_t_0p4_0p6=1.0000 out_w_norm=1.3014 out_g_norm=2.1012 acc_corrupt_t_0p2_0p4=0.4286 corrupt_frac_t_0p2_0p4=1.0000 acc_corrupt_t_0p0_0p2=0.2143 corrupt_frac_t_0p0_0p2=1.0000 loss_all=7.2578 init_gold_top10=0.0000 init_gold_top100=0.2857
292
+ step=160 epoch=80/250 epoch_step=2/2 micro_steps=160 elapsed=4.4s lr=2.000000e-03 loss=0.9315 loss_recon=0.9315 loss_meanflow=0.0000 mean_model_t=0.2270 mean_corrupt_t=0.2270 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.2270 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.5625 corrupt_frac=0.7125 acc_corrupt=0.3860 loss_corrupt=1.5416 wrong_frac=0.7018 init_acc_corrupt=0.1754 acc_corrupt_t_0p0_0p2=0.3750 corrupt_frac_t_0p0_0p2=1.0000 out_w_norm=1.3993 out_g_norm=2.1721 acc_corrupt_t_0p2_0p4=0.2632 corrupt_frac_t_0p2_0p4=1.0000 acc_corrupt_t_0p6_0p8=0.8333 corrupt_frac_t_0p6_0p8=1.0000 loss_all=5.0039 init_gold_top10=0.1667 init_gold_top100=0.1667
293
+ step=170 epoch=85/250 epoch_step=2/2 micro_steps=170 elapsed=4.3s lr=2.000000e-03 loss=0.9990 loss_recon=0.9990 loss_meanflow=0.0000 mean_model_t=0.2447 mean_corrupt_t=0.2447 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.2447 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.5875 corrupt_frac=0.6500 acc_corrupt=0.3654 loss_corrupt=1.2837 wrong_frac=0.8654 init_acc_corrupt=0.0962 acc_corrupt_t_0p2_0p4=0.3793 corrupt_frac_t_0p2_0p4=1.0000 out_w_norm=1.4993 out_g_norm=2.1622 acc_corrupt_t_0p4_0p6=0.5714 corrupt_frac_t_0p4_0p6=1.0000 acc_corrupt_t_0p0_0p2=0.2500 corrupt_frac_t_0p0_0p2=1.0000 loss_all=2.9458 init_gold_top10=0.0000 init_gold_top100=0.2500
294
+ step=180 epoch=90/250 epoch_step=2/2 micro_steps=180 elapsed=3.9s lr=2.000000e-03 loss=0.7330 loss_recon=0.7330 loss_meanflow=0.0000 mean_model_t=0.2271 mean_corrupt_t=0.2271 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.2271 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.6000 corrupt_frac=0.6375 acc_corrupt=0.3725 loss_corrupt=1.1424 wrong_frac=0.7843 init_acc_corrupt=0.1569 acc_corrupt_t_0p2_0p4=0.4444 corrupt_frac_t_0p2_0p4=1.0000 out_w_norm=1.5973 out_g_norm=2.0392 acc_corrupt_t_0p0_0p2=0.2963 corrupt_frac_t_0p0_0p2=1.0000 acc_corrupt_t_0p4_0p6=0.5000 corrupt_frac_t_0p4_0p6=1.0000 loss_all=4.4272 init_gold_top10=0.2500 init_gold_top100=0.2500
295
+ step=190 epoch=95/250 epoch_step=2/2 micro_steps=190 elapsed=4.4s lr=2.000000e-03 loss=0.6392 loss_recon=0.6392 loss_meanflow=0.0000 mean_model_t=0.1793 mean_corrupt_t=0.1793 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.1793 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.4500 corrupt_frac=0.7125 acc_corrupt=0.2456 loss_corrupt=1.4030 wrong_frac=0.8070 init_acc_corrupt=0.0702 acc_corrupt_t_0p0_0p2=0.1429 corrupt_frac_t_0p0_0p2=1.0000 out_w_norm=1.6917 out_g_norm=2.2318 acc_corrupt_t_0p2_0p4=0.3636 corrupt_frac_t_0p2_0p4=1.0000 acc_corrupt_t_0p4_0p6=1.0000 corrupt_frac_t_0p4_0p6=1.0000 loss_all=6.1974 init_gold_top10=0.5000 init_gold_top100=0.5000
296
+ step=200 epoch=100/250 epoch_step=2/2 micro_steps=200 elapsed=4.3s lr=2.000000e-03 loss=0.6127 loss_recon=0.6127 loss_meanflow=0.0000 mean_model_t=0.1869 mean_corrupt_t=0.1869 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.1869 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.5125 corrupt_frac=0.7625 acc_corrupt=0.3607 loss_corrupt=0.9250 wrong_frac=0.8033 init_acc_corrupt=0.1148 acc_corrupt_t_0p2_0p4=0.5000 corrupt_frac_t_0p2_0p4=1.0000 out_w_norm=1.7771 out_g_norm=2.0441 acc_corrupt_t_0p0_0p2=0.2069 corrupt_frac_t_0p0_0p2=1.0000 loss_all=2.7065 init_gold_top10=0.0000 init_gold_top100=0.2500
297
+ step=210 epoch=105/250 epoch_step=2/2 micro_steps=210 elapsed=3.9s lr=2.000000e-03 loss=0.3842 loss_recon=0.3842 loss_meanflow=0.0000 mean_model_t=0.1656 mean_corrupt_t=0.1656 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.1656 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.4875 corrupt_frac=0.7625 acc_corrupt=0.3279 loss_corrupt=0.8346 wrong_frac=0.8689 init_acc_corrupt=0.0656 acc_corrupt_t_0p0_0p2=0.2391 corrupt_frac_t_0p0_0p2=1.0000 out_w_norm=1.8511 out_g_norm=1.9414 acc_corrupt_t_0p2_0p4=0.5455 corrupt_frac_t_0p2_0p4=1.0000 acc_corrupt_t_0p4_0p6=0.7500 corrupt_frac_t_0p4_0p6=1.0000 loss_all=3.7437 init_gold_top10=0.0000 init_gold_top100=0.0000
298
+ step=220 epoch=110/250 epoch_step=2/2 micro_steps=220 elapsed=4.4s lr=2.000000e-03 loss=0.5714 loss_recon=0.5714 loss_meanflow=0.0000 mean_model_t=0.2338 mean_corrupt_t=0.2338 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.2338 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.5750 corrupt_frac=0.7125 acc_corrupt=0.4035 loss_corrupt=0.9256 wrong_frac=0.7193 init_acc_corrupt=0.1754 acc_corrupt_t_0p2_0p4=0.3871 corrupt_frac_t_0p2_0p4=1.0000 out_w_norm=1.9223 out_g_norm=2.8620 acc_corrupt_t_0p0_0p2=0.3500 corrupt_frac_t_0p0_0p2=1.0000 acc_corrupt_t_0p4_0p6=0.6667 corrupt_frac_t_0p4_0p6=1.0000 loss_all=2.8745 init_gold_top10=0.4286 init_gold_top100=0.4286
299
+ step=230 epoch=115/250 epoch_step=2/2 micro_steps=230 elapsed=4.3s lr=2.000000e-03 loss=0.3316 loss_recon=0.3316 loss_meanflow=0.0000 mean_model_t=0.1464 mean_corrupt_t=0.1464 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.1464 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.4875 corrupt_frac=0.6875 acc_corrupt=0.2545 loss_corrupt=0.6993 wrong_frac=0.8545 init_acc_corrupt=0.0545 acc_corrupt_t_0p0_0p2=0.2500 corrupt_frac_t_0p0_0p2=1.0000 out_w_norm=1.9838 out_g_norm=1.9833 acc_corrupt_t_0p2_0p4=0.2632 corrupt_frac_t_0p2_0p4=1.0000 loss_all=3.0620 init_gold_top10=0.3333 init_gold_top100=0.3333
300
+ step=240 epoch=120/250 epoch_step=2/2 micro_steps=240 elapsed=3.9s lr=2.000000e-03 loss=0.5485 loss_recon=0.5485 loss_meanflow=0.0000 mean_model_t=0.1917 mean_corrupt_t=0.1917 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.1917 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.4250 corrupt_frac=0.8125 acc_corrupt=0.2923 loss_corrupt=1.0804 wrong_frac=0.8769 init_acc_corrupt=0.0462 acc_corrupt_t_0p2_0p4=0.1579 corrupt_frac_t_0p2_0p4=1.0000 out_w_norm=2.0305 out_g_norm=2.2724 acc_corrupt_t_0p4_0p6=0.8333 corrupt_frac_t_0p4_0p6=1.0000 acc_corrupt_t_0p0_0p2=0.2750 corrupt_frac_t_0p0_0p2=1.0000 loss_all=4.9873 init_gold_top10=0.0000 init_gold_top100=0.0000
301
+ step=250 epoch=125/250 epoch_step=2/2 micro_steps=250 elapsed=4.3s lr=2.000000e-03 loss=0.5894 loss_recon=0.5894 loss_meanflow=0.0000 mean_model_t=0.2324 mean_corrupt_t=0.2324 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.2324 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.5000 corrupt_frac=0.7375 acc_corrupt=0.3220 loss_corrupt=1.1833 wrong_frac=0.7966 init_acc_corrupt=0.1356 acc_corrupt_t_0p2_0p4=0.3571 corrupt_frac_t_0p2_0p4=1.0000 out_w_norm=2.0676 out_g_norm=2.6213 acc_corrupt_t_0p0_0p2=0.2353 corrupt_frac_t_0p0_0p2=1.0000 loss_all=5.5378 init_gold_top10=0.0000 init_gold_top100=0.0000
302
+ step=260 epoch=130/250 epoch_step=2/2 micro_steps=260 elapsed=4.3s lr=2.000000e-03 loss=0.4236 loss_recon=0.4236 loss_meanflow=0.0000 mean_model_t=0.2037 mean_corrupt_t=0.2037 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.2037 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.6125 corrupt_frac=0.7000 acc_corrupt=0.4464 loss_corrupt=0.6426 wrong_frac=0.7857 init_acc_corrupt=0.0893 acc_corrupt_t_0p0_0p2=0.2500 corrupt_frac_t_0p0_0p2=1.0000 out_w_norm=2.0902 out_g_norm=2.7820 acc_corrupt_t_0p2_0p4=0.6429 corrupt_frac_t_0p2_0p4=1.0000 loss_all=1.9424 init_gold_top10=0.0000 init_gold_top100=0.2500
303
+ step=270 epoch=135/250 epoch_step=2/2 micro_steps=270 elapsed=3.9s lr=2.000000e-03 loss=0.4216 loss_recon=0.4216 loss_meanflow=0.0000 mean_model_t=0.2494 mean_corrupt_t=0.2494 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.2494 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.6375 corrupt_frac=0.6375 acc_corrupt=0.4314 loss_corrupt=0.6980 wrong_frac=0.8235 init_acc_corrupt=0.1373 acc_corrupt_t_0p0_0p2=0.3103 corrupt_frac_t_0p0_0p2=1.0000 out_w_norm=2.0985 out_g_norm=2.1731 acc_corrupt_t_0p2_0p4=0.6667 corrupt_frac_t_0p2_0p4=1.0000 acc_corrupt_t_0p4_0p6=0.5000 corrupt_frac_t_0p4_0p6=1.0000 loss_all=2.8916 init_gold_top10=0.3333 init_gold_top100=0.3333
304
+ step=280 epoch=140/250 epoch_step=2/2 micro_steps=280 elapsed=4.4s lr=2.000000e-03 loss=0.4478 loss_recon=0.4478 loss_meanflow=0.0000 mean_model_t=0.2913 mean_corrupt_t=0.2913 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.2913 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.6750 corrupt_frac=0.6125 acc_corrupt=0.4694 loss_corrupt=0.6763 wrong_frac=0.7143 init_acc_corrupt=0.2041 acc_corrupt_t_0p0_0p2=0.3600 corrupt_frac_t_0p0_0p2=1.0000 out_w_norm=2.1126 out_g_norm=2.5709 acc_corrupt_t_0p6_0p8=0.7500 corrupt_frac_t_0p6_0p8=1.0000 acc_corrupt_t_0p2_0p4=0.5833 corrupt_frac_t_0p2_0p4=1.0000 acc_corrupt_t_0p4_0p6=0.5000 corrupt_frac_t_0p4_0p6=1.0000 loss_all=2.2793 init_gold_top10=0.0000 init_gold_top100=0.0000
305
+ step=290 epoch=145/250 epoch_step=2/2 micro_steps=290 elapsed=4.3s lr=2.000000e-03 loss=0.2915 loss_recon=0.2915 loss_meanflow=0.0000 mean_model_t=0.1777 mean_corrupt_t=0.1777 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.1777 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.6000 corrupt_frac=0.7125 acc_corrupt=0.4386 loss_corrupt=0.5437 wrong_frac=0.8947 init_acc_corrupt=0.0526 acc_corrupt_t_0p0_0p2=0.3333 corrupt_frac_t_0p0_0p2=1.0000 out_w_norm=2.1215 out_g_norm=2.7226 acc_corrupt_t_0p2_0p4=0.5833 corrupt_frac_t_0p2_0p4=1.0000 loss_all=2.3824 init_gold_top10=0.1667 init_gold_top100=0.3333
306
+ step=300 epoch=150/250 epoch_step=2/2 micro_steps=300 elapsed=3.9s lr=2.000000e-03 loss=0.4178 loss_recon=0.4178 loss_meanflow=0.0000 mean_model_t=0.2061 mean_corrupt_t=0.2061 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.2061 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.6125 corrupt_frac=0.7375 acc_corrupt=0.4746 loss_corrupt=0.7239 wrong_frac=0.7627 init_acc_corrupt=0.0678 acc_corrupt_t_0p0_0p2=0.6154 corrupt_frac_t_0p0_0p2=1.0000 out_w_norm=2.1224 out_g_norm=2.1706 acc_corrupt_t_0p2_0p4=0.3636 corrupt_frac_t_0p2_0p4=1.0000 loss_all=2.5565 init_gold_top10=0.1429 init_gold_top100=0.5714
307
+ step=310 epoch=155/250 epoch_step=2/2 micro_steps=310 elapsed=4.9s lr=2.000000e-03 loss=0.4163 loss_recon=0.4163 loss_meanflow=0.0000 mean_model_t=0.1841 mean_corrupt_t=0.1841 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.1841 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.6125 corrupt_frac=0.7000 acc_corrupt=0.4464 loss_corrupt=0.6763 wrong_frac=0.7857 init_acc_corrupt=0.1429 acc_corrupt_t_0p0_0p2=0.3889 corrupt_frac_t_0p0_0p2=1.0000 out_w_norm=2.1090 out_g_norm=2.5876 acc_corrupt_t_0p4_0p6=0.5714 corrupt_frac_t_0p4_0p6=1.0000 acc_corrupt_t_0p2_0p4=0.5385 corrupt_frac_t_0p2_0p4=1.0000 loss_all=2.2671 init_gold_top10=0.5714 init_gold_top100=0.5714
308
+ step=320 epoch=160/250 epoch_step=2/2 micro_steps=320 elapsed=4.2s lr=2.000000e-03 loss=0.4162 loss_recon=0.4162 loss_meanflow=0.0000 mean_model_t=0.2237 mean_corrupt_t=0.2237 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.2237 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.6500 corrupt_frac=0.7625 acc_corrupt=0.5410 loss_corrupt=0.6050 wrong_frac=0.7213 init_acc_corrupt=0.1803 acc_corrupt_t_0p0_0p2=0.5185 corrupt_frac_t_0p0_0p2=1.0000 out_w_norm=2.0998 out_g_norm=2.0437 acc_corrupt_t_0p2_0p4=0.5556 corrupt_frac_t_0p2_0p4=1.0000 acc_corrupt_t_0p4_0p6=0.5714 corrupt_frac_t_0p4_0p6=1.0000 loss_all=2.4873 init_gold_top10=0.2500 init_gold_top100=0.5000
309
+ step=330 epoch=165/250 epoch_step=2/2 micro_steps=330 elapsed=4.0s lr=2.000000e-03 loss=0.3596 loss_recon=0.3596 loss_meanflow=0.0000 mean_model_t=0.1733 mean_corrupt_t=0.1733 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.1733 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.4375 corrupt_frac=0.7750 acc_corrupt=0.2742 loss_corrupt=0.9846 wrong_frac=0.8710 init_acc_corrupt=0.0968 acc_corrupt_t_0p0_0p2=0.1277 corrupt_frac_t_0p0_0p2=1.0000 out_w_norm=2.0984 out_g_norm=2.1073 acc_corrupt_t_0p2_0p4=0.7333 corrupt_frac_t_0p2_0p4=1.0000 loss_all=5.2998 init_gold_top10=0.0000 init_gold_top100=0.0000
310
+ step=340 epoch=170/250 epoch_step=2/2 micro_steps=340 elapsed=4.4s lr=2.000000e-03 loss=0.2857 loss_recon=0.2857 loss_meanflow=0.0000 mean_model_t=0.1706 mean_corrupt_t=0.1706 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.1706 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.6625 corrupt_frac=0.6875 acc_corrupt=0.5273 loss_corrupt=0.4981 wrong_frac=0.8364 init_acc_corrupt=0.0545 acc_corrupt_t_0p4_0p6=0.7500 corrupt_frac_t_0p4_0p6=1.0000 out_w_norm=2.0925 out_g_norm=2.0339 acc_corrupt_t_0p0_0p2=0.5000 corrupt_frac_t_0p0_0p2=1.0000 acc_corrupt_t_0p2_0p4=0.5714 corrupt_frac_t_0p2_0p4=1.0000 loss_all=2.0654 init_gold_top10=0.0000 init_gold_top100=0.0000
311
+ step=350 epoch=175/250 epoch_step=2/2 micro_steps=350 elapsed=4.2s lr=2.000000e-03 loss=0.1813 loss_recon=0.1813 loss_meanflow=0.0000 mean_model_t=0.1718 mean_corrupt_t=0.1718 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.1718 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.7750 corrupt_frac=0.7000 acc_corrupt=0.6786 loss_corrupt=0.4732 wrong_frac=0.8214 init_acc_corrupt=0.1607 acc_corrupt_t_0p0_0p2=0.5500 corrupt_frac_t_0p0_0p2=1.0000 out_w_norm=2.0854 out_g_norm=2.3255 acc_corrupt_t_0p4_0p6=1.0000 corrupt_frac_t_0p4_0p6=1.0000 acc_corrupt_t_0p2_0p4=1.0000 corrupt_frac_t_0p2_0p4=1.0000 loss_all=2.5088 init_gold_top10=0.1667 init_gold_top100=0.3333
312
+ step=360 epoch=180/250 epoch_step=2/2 micro_steps=360 elapsed=3.9s lr=2.000000e-03 loss=0.1510 loss_recon=0.1510 loss_meanflow=0.0000 mean_model_t=0.2313 mean_corrupt_t=0.2313 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.2313 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.7875 corrupt_frac=0.7125 acc_corrupt=0.7018 loss_corrupt=0.3167 wrong_frac=0.7368 init_acc_corrupt=0.1930 acc_corrupt_t_0p2_0p4=0.8571 corrupt_frac_t_0p2_0p4=1.0000 out_w_norm=2.0792 out_g_norm=1.7478 acc_corrupt_t_0p0_0p2=0.4828 corrupt_frac_t_0p0_0p2=1.0000 acc_corrupt_t_0p4_0p6=1.0000 corrupt_frac_t_0p4_0p6=1.0000 loss_all=1.8887 init_gold_top10=0.0000 init_gold_top100=0.5000
313
+ step=370 epoch=185/250 epoch_step=2/2 micro_steps=370 elapsed=4.3s lr=2.000000e-03 loss=0.2450 loss_recon=0.2450 loss_meanflow=0.0000 mean_model_t=0.1875 mean_corrupt_t=0.1875 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.1875 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.8000 corrupt_frac=0.6250 acc_corrupt=0.6800 loss_corrupt=0.3616 wrong_frac=0.8000 init_acc_corrupt=0.1000 acc_corrupt_t_0p0_0p2=0.7586 corrupt_frac_t_0p0_0p2=1.0000 out_w_norm=2.0847 out_g_norm=2.1802 acc_corrupt_t_0p2_0p4=0.5714 corrupt_frac_t_0p2_0p4=1.0000 loss_all=1.4507 init_gold_top10=0.2500 init_gold_top100=0.2500
314
+ step=380 epoch=190/250 epoch_step=2/2 micro_steps=380 elapsed=4.3s lr=2.000000e-03 loss=0.1449 loss_recon=0.1449 loss_meanflow=0.0000 mean_model_t=0.1769 mean_corrupt_t=0.1769 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.1769 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.8000 corrupt_frac=0.6500 acc_corrupt=0.6923 loss_corrupt=0.3072 wrong_frac=0.7692 init_acc_corrupt=0.0962 acc_corrupt_t_0p0_0p2=0.6250 corrupt_frac_t_0p0_0p2=1.0000 out_w_norm=2.0835 out_g_norm=1.5933 acc_corrupt_t_0p2_0p4=0.9167 corrupt_frac_t_0p2_0p4=1.0000 loss_all=1.4590 init_gold_top10=0.5000 init_gold_top100=0.5000
315
+ step=390 epoch=195/250 epoch_step=2/2 micro_steps=390 elapsed=3.9s lr=2.000000e-03 loss=0.1706 loss_recon=0.1706 loss_meanflow=0.0000 mean_model_t=0.2492 mean_corrupt_t=0.2492 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.2492 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.8250 corrupt_frac=0.7750 acc_corrupt=0.7742 loss_corrupt=0.3738 wrong_frac=0.7097 init_acc_corrupt=0.0968 acc_corrupt_t_0p2_0p4=0.7843 corrupt_frac_t_0p2_0p4=1.0000 out_w_norm=2.0944 out_g_norm=2.0398 acc_corrupt_t_0p0_0p2=0.7273 corrupt_frac_t_0p0_0p2=1.0000 loss_all=1.9468 init_gold_top10=0.1429 init_gold_top100=0.1429
316
+ step=400 epoch=200/250 epoch_step=2/2 micro_steps=400 elapsed=4.3s lr=2.000000e-03 loss=0.1752 loss_recon=0.1752 loss_meanflow=0.0000 mean_model_t=0.2467 mean_corrupt_t=0.2467 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.2467 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.8000 corrupt_frac=0.7750 acc_corrupt=0.7419 loss_corrupt=0.3860 wrong_frac=0.7903 init_acc_corrupt=0.1613 acc_corrupt_t_0p0_0p2=0.5000 corrupt_frac_t_0p0_0p2=1.0000 out_w_norm=2.0922 out_g_norm=1.7354 acc_corrupt_t_0p4_0p6=1.0000 corrupt_frac_t_0p4_0p6=1.0000 acc_corrupt_t_0p2_0p4=0.8750 corrupt_frac_t_0p2_0p4=1.0000 loss_all=2.0479 init_gold_top10=0.4286 init_gold_top100=0.4286
317
+ step=410 epoch=205/250 epoch_step=2/2 micro_steps=410 elapsed=4.3s lr=2.000000e-03 loss=0.2606 loss_recon=0.2606 loss_meanflow=0.0000 mean_model_t=0.2461 mean_corrupt_t=0.2461 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.2461 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.7625 corrupt_frac=0.6250 acc_corrupt=0.6200 loss_corrupt=1.0780 wrong_frac=0.7600 init_acc_corrupt=0.1200 acc_corrupt_t_0p0_0p2=0.4286 corrupt_frac_t_0p0_0p2=1.0000 out_w_norm=2.0802 out_g_norm=2.0628 acc_corrupt_t_0p2_0p4=0.8000 corrupt_frac_t_0p2_0p4=1.0000 acc_corrupt_t_0p4_0p6=0.9167 corrupt_frac_t_0p4_0p6=1.0000 loss_all=5.3184 init_gold_top10=0.0000 init_gold_top100=0.2857
318
+ step=420 epoch=210/250 epoch_step=2/2 micro_steps=420 elapsed=3.9s lr=2.000000e-03 loss=0.2090 loss_recon=0.2090 loss_meanflow=0.0000 mean_model_t=0.2875 mean_corrupt_t=0.2875 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.2875 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.8750 corrupt_frac=0.7625 acc_corrupt=0.8361 loss_corrupt=0.3340 wrong_frac=0.7049 init_acc_corrupt=0.2459 acc_corrupt_t_0p0_0p2=0.7407 corrupt_frac_t_0p0_0p2=1.0000 out_w_norm=2.0755 out_g_norm=1.5891 acc_corrupt_t_0p4_0p6=0.9000 corrupt_frac_t_0p4_0p6=1.0000 acc_corrupt_t_0p2_0p4=0.9286 corrupt_frac_t_0p2_0p4=1.0000 loss_all=1.5884 init_gold_top10=0.5000 init_gold_top100=0.5000
319
+ step=430 epoch=215/250 epoch_step=2/2 micro_steps=430 elapsed=4.3s lr=2.000000e-03 loss=0.2835 loss_recon=0.2835 loss_meanflow=0.0000 mean_model_t=0.2302 mean_corrupt_t=0.2302 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.2302 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.8000 corrupt_frac=0.6875 acc_corrupt=0.7091 loss_corrupt=0.9789 wrong_frac=0.8727 init_acc_corrupt=0.0727 acc_corrupt_t_0p0_0p2=0.7778 corrupt_frac_t_0p0_0p2=1.0000 out_w_norm=2.0719 out_g_norm=1.7877 acc_corrupt_t_0p2_0p4=0.5833 corrupt_frac_t_0p2_0p4=1.0000 acc_corrupt_t_0p4_0p6=1.0000 corrupt_frac_t_0p4_0p6=1.0000 loss_all=5.6514 init_gold_top10=0.0000 init_gold_top100=0.2857
320
+ step=440 epoch=220/250 epoch_step=2/2 micro_steps=440 elapsed=4.3s lr=2.000000e-03 loss=0.1238 loss_recon=0.1238 loss_meanflow=0.0000 mean_model_t=0.2328 mean_corrupt_t=0.2328 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.2328 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.8625 corrupt_frac=0.6500 acc_corrupt=0.7885 loss_corrupt=0.6087 wrong_frac=0.7692 init_acc_corrupt=0.1731 acc_corrupt_t_0p0_0p2=0.6765 corrupt_frac_t_0p0_0p2=1.0000 out_w_norm=2.0703 out_g_norm=1.9463 acc_corrupt_t_0p2_0p4=1.0000 corrupt_frac_t_0p2_0p4=1.0000 acc_corrupt_t_0p6_0p8=1.0000 corrupt_frac_t_0p6_0p8=1.0000 loss_all=3.3848 init_gold_top10=0.0000 init_gold_top100=0.2857
321
+ step=450 epoch=225/250 epoch_step=2/2 micro_steps=450 elapsed=3.9s lr=2.000000e-03 loss=0.1397 loss_recon=0.1397 loss_meanflow=0.0000 mean_model_t=0.1586 mean_corrupt_t=0.1586 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.1586 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.8875 corrupt_frac=0.6750 acc_corrupt=0.8333 loss_corrupt=0.4628 wrong_frac=0.8333 init_acc_corrupt=0.0185 acc_corrupt_t_0p2_0p4=0.6471 corrupt_frac_t_0p2_0p4=1.0000 out_w_norm=2.0704 out_g_norm=1.4328 acc_corrupt_t_0p0_0p2=0.9189 corrupt_frac_t_0p0_0p2=1.0000 loss_all=2.4082 init_gold_top10=0.1429 init_gold_top100=0.1429
322
+ step=460 epoch=230/250 epoch_step=2/2 micro_steps=460 elapsed=4.4s lr=2.000000e-03 loss=0.1377 loss_recon=0.1377 loss_meanflow=0.0000 mean_model_t=0.2081 mean_corrupt_t=0.2081 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.2081 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.9125 corrupt_frac=0.6375 acc_corrupt=0.8627 loss_corrupt=0.2271 wrong_frac=0.7255 init_acc_corrupt=0.1765 acc_corrupt_t_0p0_0p2=0.8437 corrupt_frac_t_0p0_0p2=1.0000 out_w_norm=2.0668 out_g_norm=1.6259 acc_corrupt_t_0p2_0p4=0.9091 corrupt_frac_t_0p2_0p4=1.0000 acc_corrupt_t_0p4_0p6=0.8750 corrupt_frac_t_0p4_0p6=1.0000 loss_all=1.0952 init_gold_top10=0.5000 init_gold_top100=0.5000
323
+ step=470 epoch=235/250 epoch_step=2/2 micro_steps=470 elapsed=4.3s lr=2.000000e-03 loss=0.1759 loss_recon=0.1759 loss_meanflow=0.0000 mean_model_t=0.1753 mean_corrupt_t=0.1753 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.1753 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.8500 corrupt_frac=0.7000 acc_corrupt=0.7857 loss_corrupt=0.4325 wrong_frac=0.8393 init_acc_corrupt=0.0893 acc_corrupt_t_0p0_0p2=0.7436 corrupt_frac_t_0p0_0p2=1.0000 out_w_norm=2.0613 out_g_norm=1.8623 acc_corrupt_t_0p4_0p6=1.0000 corrupt_frac_t_0p4_0p6=1.0000 acc_corrupt_t_0p2_0p4=0.8182 corrupt_frac_t_0p2_0p4=1.0000 loss_all=1.9712 init_gold_top10=0.1429 init_gold_top100=0.1429
324
+ step=480 epoch=240/250 epoch_step=2/2 micro_steps=480 elapsed=3.9s lr=2.000000e-03 loss=0.1247 loss_recon=0.1247 loss_meanflow=0.0000 mean_model_t=0.2083 mean_corrupt_t=0.2083 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.2083 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.9375 corrupt_frac=0.6500 acc_corrupt=0.9038 loss_corrupt=0.2654 wrong_frac=0.7500 init_acc_corrupt=0.1731 acc_corrupt_t_0p0_0p2=0.8750 corrupt_frac_t_0p0_0p2=1.0000 out_w_norm=2.0751 out_g_norm=1.8762 acc_corrupt_t_0p2_0p4=0.9091 corrupt_frac_t_0p2_0p4=1.0000 acc_corrupt_t_0p4_0p6=1.0000 corrupt_frac_t_0p4_0p6=1.0000 loss_all=1.2505 init_gold_top10=0.0000 init_gold_top100=0.5000
325
+ step=490 epoch=245/250 epoch_step=2/2 micro_steps=490 elapsed=4.8s lr=2.000000e-03 loss=0.0926 loss_recon=0.0926 loss_meanflow=0.0000 mean_model_t=0.2059 mean_corrupt_t=0.2059 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.2059 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.9625 corrupt_frac=0.7125 acc_corrupt=0.9474 loss_corrupt=0.3796 wrong_frac=0.7895 init_acc_corrupt=0.0877 acc_corrupt_t_0p0_0p2=0.9302 corrupt_frac_t_0p0_0p2=1.0000 out_w_norm=2.0723 out_g_norm=1.4834 acc_corrupt_t_0p2_0p4=1.0000 corrupt_frac_t_0p2_0p4=1.0000 loss_all=2.5068 init_gold_top10=0.1667 init_gold_top100=0.1667
326
+ step=500 epoch=250/250 epoch_step=2/2 micro_steps=500 elapsed=4.2s lr=2.000000e-03 loss=0.1136 loss_recon=0.1136 loss_meanflow=0.0000 mean_model_t=0.2092 mean_corrupt_t=0.2092 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.2092 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.8250 corrupt_frac=0.8000 acc_corrupt=0.7813 loss_corrupt=0.2750 wrong_frac=0.7656 init_acc_corrupt=0.1563 acc_corrupt_t_0p2_0p4=0.8800 corrupt_frac_t_0p2_0p4=1.0000 out_w_norm=2.0694 out_g_norm=1.7062 acc_corrupt_t_0p0_0p2=0.6667 corrupt_frac_t_0p0_0p2=1.0000 acc_corrupt_t_0p4_0p6=1.0000 corrupt_frac_t_0p4_0p6=1.0000 loss_all=1.8398 init_gold_top10=0.3333 init_gold_top100=0.3333
LTA_openwebtext_dualt/logs/softendpoint_mn_pilot_4gpu/train8_noisegeo_len256_allcorrupt_highC64_4096_20260517_163805.log ADDED
@@ -0,0 +1,987 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ NCCL version 2.25.1+cuda12.8
2
+ {
3
+ "device": "cuda:0",
4
+ "rank": 0,
5
+ "world_size": 4,
6
+ "samples": "owt_cached_chunks:8",
7
+ "vocab_size": 969,
8
+ "tokenizer_vocab_size": 50257,
9
+ "save_dir": "runs/train8_noisegeo_len256_allcorrupt_highC64_4096_20260517_163805",
10
+ "batch_size": 128,
11
+ "grad_accum": 1,
12
+ "effective_batch_size": 512,
13
+ "global_batch_size": 512,
14
+ "lr_schedule": "constant_warmup",
15
+ "optimizer": "muon",
16
+ "epochs": 0.0,
17
+ "steps_per_epoch": 1,
18
+ "total_steps": 1000,
19
+ "warmup_steps": 10,
20
+ "warmup_epochs": -1.0,
21
+ "min_lr": 0.0,
22
+ "weight_decay": 0.1,
23
+ "output_weight_decay": -1.0,
24
+ "adamw_param_groups": "nanogpt",
25
+ "adam_beta1": 0.9,
26
+ "adam_beta2": 0.95,
27
+ "adam_eps": 1e-08,
28
+ "muon_impl": "legacy",
29
+ "muon_momentum": 0.95,
30
+ "muon_ns_steps": 5,
31
+ "muon_update_scale": 1.0,
32
+ "muon_nesterov": false,
33
+ "muon_width_scale": false,
34
+ "muon_grouping": "legacy_dim_ge_2",
35
+ "muon_param_count": 1965440,
36
+ "muon_adam_param_count": 8192,
37
+ "muon_param_names": [
38
+ "vocab_embed.embedding",
39
+ "sigma_map.net.0.weight",
40
+ "sigma_map.net.2.weight",
41
+ "blocks.0.attn_qkv.weight",
42
+ "blocks.0.attn_out.weight",
43
+ "blocks.0.mlp.0.weight",
44
+ "blocks.0.mlp.2.weight",
45
+ "blocks.0.adaLN_modulation.weight",
46
+ "blocks.1.attn_qkv.weight",
47
+ "blocks.1.attn_out.weight",
48
+ "blocks.1.mlp.0.weight",
49
+ "blocks.1.mlp.2.weight",
50
+ "blocks.1.adaLN_modulation.weight",
51
+ "blocks.2.attn_qkv.weight",
52
+ "blocks.2.attn_out.weight",
53
+ "blocks.2.mlp.0.weight",
54
+ "blocks.2.mlp.2.weight",
55
+ "blocks.2.adaLN_modulation.weight",
56
+ "output_layer.linear.weight",
57
+ "output_layer.adaLN_modulation.weight"
58
+ ],
59
+ "muon_adam_param_names": [
60
+ "sigma_map.net.0.bias",
61
+ "sigma_map.net.2.bias",
62
+ "blocks.0.norm1.weight",
63
+ "blocks.0.norm2.weight",
64
+ "blocks.0.mlp.0.bias",
65
+ "blocks.0.mlp.2.bias",
66
+ "blocks.0.adaLN_modulation.bias",
67
+ "blocks.1.norm1.weight",
68
+ "blocks.1.norm2.weight",
69
+ "blocks.1.mlp.0.bias",
70
+ "blocks.1.mlp.2.bias",
71
+ "blocks.1.adaLN_modulation.bias",
72
+ "blocks.2.norm1.weight",
73
+ "blocks.2.norm2.weight",
74
+ "blocks.2.mlp.0.bias",
75
+ "blocks.2.mlp.2.bias",
76
+ "blocks.2.adaLN_modulation.bias",
77
+ "output_layer.norm_final.weight",
78
+ "output_layer.adaLN_modulation.bias"
79
+ ],
80
+ "muon_effective_nesterov": false,
81
+ "muon_effective_width_scale": false,
82
+ "muon_effective_weight_decay": 0.1,
83
+ "muon_adam_fallback_nesterov": false,
84
+ "muon_adam_fallback_weight_decay": 0.1,
85
+ "ema_decay": 0.9999,
86
+ "ema_start_step": 0,
87
+ "model_type": "ddit",
88
+ "ddit_mlp_type": "gelu",
89
+ "elf_num_time_tokens": 4,
90
+ "elf_num_model_mode_tokens": 0,
91
+ "qk_norm": true,
92
+ "output_bias": false,
93
+ "output_init_std": -1.0,
94
+ "norm_type": "rmsnorm",
95
+ "target_loss": "hard_ce",
96
+ "linear_soft_target_power": 1.0,
97
+ "linear_soft_target_min_conf": 0.0,
98
+ "linear_soft_target_max_conf": 1.0,
99
+ "t_sampling_mode": "logit_normal",
100
+ "t_sampling_power": 1.0,
101
+ "t_sampling_eps": 0.0001,
102
+ "t_sampling_logit_mean": -1.5,
103
+ "t_sampling_logit_std": 0.8,
104
+ "dual_t": true,
105
+ "corrupt_t_mode": "same",
106
+ "corrupt_min_t": 0.0,
107
+ "corrupt_max_t": 1.0,
108
+ "prefix_block_prob": 0.0,
109
+ "prefix_block_len": 128,
110
+ "mask_ratio_floor_schedule": "none",
111
+ "dirichlet_endpoint_mode": "categorical_dual_t",
112
+ "dirichlet_semantic_t_mode": "same",
113
+ "dirichlet_semantic_t_value": 0.0,
114
+ "dirichlet_semantic_t_curve": "linear",
115
+ "dirichlet_semantic_t_power": 1.0,
116
+ "endpoint_sequence_random_prob_alpha": 0.0,
117
+ "categorical_wrong_from_full_vocab": true,
118
+ "categorical_wrong_from_batch_valid_tokens": false,
119
+ "categorical_wrong_basin_token_ids": "",
120
+ "categorical_wrong_basin_prob": 0.0,
121
+ "categorical_wrong_unigram_prob": 0.0,
122
+ "categorical_wrong_uniform_prob": 0.0,
123
+ "categorical_wrong_corpus_unigram_path": "",
124
+ "categorical_wrong_corpus_unigram_alpha": 1.0,
125
+ "categorical_wrong_basin_shared_prob": 0.0,
126
+ "categorical_wrong_unigram_shared_prob": 0.0,
127
+ "mask_mixture_original_prob": 0.0,
128
+ "mask_mixture_lowk_prob": 0.0,
129
+ "mask_mixture_lowcorrupt_prob": 0.0,
130
+ "mask_mixture_block_prob": 0.0,
131
+ "mask_mixture_all_prob": 1.0,
132
+ "mask_mixture_lowk_clean_tokens": "0",
133
+ "mask_mixture_lowcorrupt_tokens": "1,2,4,8,16,32,64",
134
+ "mask_mixture_block_tokens": "64,128",
135
+ "simplex_bridge_sampler": "dirichlet",
136
+ "logistic_normal_sigma_min": 0.1,
137
+ "logistic_normal_sigma_max": 1.0,
138
+ "logistic_normal_tau_min": 1.0,
139
+ "logistic_normal_tau_max": 1.0,
140
+ "torch_compile": false,
141
+ "compile_mode": "max-autotune",
142
+ "state_format": "prob",
143
+ "meanflow_weight": 0.0,
144
+ "rollout_train_prob": 0.0,
145
+ "rollout_train_steps": 1,
146
+ "rollout_train_infer_steps": 64,
147
+ "rollout_train_temp": 1.45,
148
+ "rollout_train_max_gamma": 1.0,
149
+ "rollout_train_corrupt_only": true,
150
+ "rollout_train_samplewise": false,
151
+ "rollout_train_compute_always": false,
152
+ "bridge_noise_init": "logistic_normal",
153
+ "noise_sigma": -1.0,
154
+ "allow_tf32": true,
155
+ "activation_checkpointing": false,
156
+ "activation_checkpoint_interval": 1,
157
+ "activation_checkpoint_scope": "block",
158
+ "ddp_static_graph": false,
159
+ "ddp_gradient_as_bucket_view": true,
160
+ "blocking_data_transfer": false,
161
+ "dataloader_prefetch_factor": 4,
162
+ "full_train_stats": false,
163
+ "tokenized_hf": false,
164
+ "tokenized_pad_token": "pad",
165
+ "elf_conditional_hf": false,
166
+ "record_pad_truncate": false,
167
+ "record_add_eos": false,
168
+ "record_add_special_tokens": false,
169
+ "record_pad_token": "pad",
170
+ "record_shuffle_buffer": 10000,
171
+ "wrap": true,
172
+ "wrap_mode": "stream",
173
+ "wrap_record_buffer_size": 200,
174
+ "owt_cached_chunks": true,
175
+ "owt_chunk_cache_dir": "/e2e-data/evad-tech-vla/wanghan58/data/small_benchmarks/langflow_2604_11748/openwebtext_lta_cached_chunks/gpt2_len256_train8_compact_overfit",
176
+ "owt_chunk_cache_rebuild": false,
177
+ "owt_chunk_cache_write_batch": 4096,
178
+ "owt_exact_repeat_per_chunk": 64,
179
+ "online_chunk_shuffle": false,
180
+ "online_chunk_shuffle_buffer": 10000,
181
+ "openwebtext_split": "train_minus_100k",
182
+ "detokenizer": "auto",
183
+ "resolved_detokenizer": null,
184
+ "num_workers": 0,
185
+ "latest_every": 1000,
186
+ "resume_path": ""
187
+ }
188
+ step=100 epoch=100/1000 epoch_step=1/1 micro_steps=100 elapsed=4.4s lr=2.000000e-03 loss=6.6495 loss_recon=6.6495 loss_meanflow=0.0000 mean_model_t=0.2082 mean_corrupt_t=0.2082 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.1702 corrupt_frac=1.0000 acc_corrupt=0.1702 loss_corrupt=6.6495 wrong_frac=0.7917 init_acc_corrupt=0.2067 acc_corrupt_t_0p0_0p2=0.0878 corrupt_frac_t_0p0_0p2=0.5588 acc_corrupt_t_0p2_0p4=0.2294 corrupt_frac_t_0p2_0p4=0.3579 acc_corrupt_t_0p4_0p6=0.4491 corrupt_frac_t_0p4_0p6=0.0752 acc_corrupt_t_0p6_0p8=0.6387 corrupt_frac_t_0p6_0p8=0.0125 out_w_norm=1.2350 out_g_norm=0.9734 acc_corrupt_t_0p8_1p0=0.8477 corrupt_frac_t_0p8_1p0=0.0078 loss_all=6.3486 init_gold_top10=0.2053 init_gold_top100=0.2800
189
+ step=200 epoch=200/1000 epoch_step=1/1 micro_steps=200 elapsed=3.7s lr=2.000000e-03 loss=5.8699 loss_recon=5.8699 loss_meanflow=0.0000 mean_model_t=0.2081 mean_corrupt_t=0.2081 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.1640 corrupt_frac=1.0000 acc_corrupt=0.1640 loss_corrupt=5.8699 wrong_frac=0.7925 init_acc_corrupt=0.2059 acc_corrupt_t_0p0_0p2=0.0976 corrupt_frac_t_0p0_0p2=0.5577 acc_corrupt_t_0p2_0p4=0.2106 corrupt_frac_t_0p2_0p4=0.3608 acc_corrupt_t_0p4_0p6=0.3899 corrupt_frac_t_0p4_0p6=0.0734 acc_corrupt_t_0p6_0p8=0.6051 corrupt_frac_t_0p6_0p8=0.0119 out_w_norm=4.0392 out_g_norm=1.3294 acc_corrupt_t_0p8_1p0=0.7979 corrupt_frac_t_0p8_1p0=0.0078 loss_all=5.5159 init_gold_top10=0.2043 init_gold_top100=0.2783
190
+ step=300 epoch=300/1000 epoch_step=1/1 micro_steps=300 elapsed=3.7s lr=2.000000e-03 loss=5.1215 loss_recon=5.1215 loss_meanflow=0.0000 mean_model_t=0.2117 mean_corrupt_t=0.2117 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.1844 corrupt_frac=1.0000 acc_corrupt=0.1844 loss_corrupt=5.1215 wrong_frac=0.7884 init_acc_corrupt=0.2102 acc_corrupt_t_0p0_0p2=0.1071 corrupt_frac_t_0p0_0p2=0.5484 acc_corrupt_t_0p2_0p4=0.2350 corrupt_frac_t_0p2_0p4=0.3611 acc_corrupt_t_0p4_0p6=0.4284 corrupt_frac_t_0p4_0p6=0.0803 acc_corrupt_t_0p6_0p8=0.6246 corrupt_frac_t_0p6_0p8=0.0139 out_w_norm=6.5345 out_g_norm=0.7810 acc_corrupt_t_0p8_1p0=0.7812 corrupt_frac_t_0p8_1p0=0.0078 loss_all=4.7567 init_gold_top10=0.2224 init_gold_top100=0.2981
191
+ step=400 epoch=400/1000 epoch_step=1/1 micro_steps=400 elapsed=3.7s lr=2.000000e-03 loss=4.3914 loss_recon=4.3914 loss_meanflow=0.0000 mean_model_t=0.2073 mean_corrupt_t=0.2073 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.2275 corrupt_frac=1.0000 acc_corrupt=0.2275 loss_corrupt=4.3914 wrong_frac=0.7927 init_acc_corrupt=0.2056 acc_corrupt_t_0p0_0p2=0.1322 corrupt_frac_t_0p0_0p2=0.5613 acc_corrupt_t_0p2_0p4=0.3118 corrupt_frac_t_0p2_0p4=0.3571 acc_corrupt_t_0p4_0p6=0.4954 corrupt_frac_t_0p4_0p6=0.0727 acc_corrupt_t_0p6_0p8=0.6688 corrupt_frac_t_0p6_0p8=0.0126 out_w_norm=8.2403 out_g_norm=0.4520 loss_all=3.9405 init_gold_top10=0.2197 init_gold_top100=0.2939
192
+ step=500 epoch=500/1000 epoch_step=1/1 micro_steps=500 elapsed=3.7s lr=2.000000e-03 loss=3.4904 loss_recon=3.4904 loss_meanflow=0.0000 mean_model_t=0.2097 mean_corrupt_t=0.2097 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.2580 corrupt_frac=1.0000 acc_corrupt=0.2580 loss_corrupt=3.4904 wrong_frac=0.7904 init_acc_corrupt=0.2080 acc_corrupt_t_0p0_0p2=0.1592 corrupt_frac_t_0p0_0p2=0.5540 acc_corrupt_t_0p2_0p4=0.3420 corrupt_frac_t_0p2_0p4=0.3605 acc_corrupt_t_0p4_0p6=0.5218 corrupt_frac_t_0p4_0p6=0.0745 out_w_norm=9.6795 out_g_norm=0.4054 acc_corrupt_t_0p6_0p8=0.6903 corrupt_frac_t_0p6_0p8=0.0139 acc_corrupt_t_0p8_1p0=0.8594 corrupt_frac_t_0p8_1p0=0.0078 loss_all=2.9689 init_gold_top10=0.2225 init_gold_top100=0.2952
193
+ step=600 epoch=600/1000 epoch_step=1/1 micro_steps=600 elapsed=3.7s lr=2.000000e-03 loss=2.5069 loss_recon=2.5069 loss_meanflow=0.0000 mean_model_t=0.2089 mean_corrupt_t=0.2089 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.3478 corrupt_frac=1.0000 acc_corrupt=0.3478 loss_corrupt=2.5069 wrong_frac=0.7910 init_acc_corrupt=0.2074 acc_corrupt_t_0p0_0p2=0.2274 corrupt_frac_t_0p0_0p2=0.5598 acc_corrupt_t_0p2_0p4=0.4660 corrupt_frac_t_0p2_0p4=0.3568 acc_corrupt_t_0p4_0p6=0.6365 corrupt_frac_t_0p4_0p6=0.0750 acc_corrupt_t_0p6_0p8=0.7711 corrupt_frac_t_0p6_0p8=0.0120 out_w_norm=10.4910 out_g_norm=0.4782 acc_corrupt_t_0p8_1p0=0.8952 corrupt_frac_t_0p8_1p0=0.0094 loss_all=2.0582 init_gold_top10=0.2072 init_gold_top100=0.2815
194
+ step=700 epoch=700/1000 epoch_step=1/1 micro_steps=700 elapsed=3.7s lr=2.000000e-03 loss=1.6159 loss_recon=1.6159 loss_meanflow=0.0000 mean_model_t=0.2095 mean_corrupt_t=0.2095 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.5261 corrupt_frac=1.0000 acc_corrupt=0.5261 loss_corrupt=1.6159 wrong_frac=0.7907 init_acc_corrupt=0.2077 acc_corrupt_t_0p0_0p2=0.3700 corrupt_frac_t_0p0_0p2=0.5552 acc_corrupt_t_0p2_0p4=0.6933 corrupt_frac_t_0p2_0p4=0.3584 acc_corrupt_t_0p4_0p6=0.8285 corrupt_frac_t_0p4_0p6=0.0764 out_w_norm=11.0190 out_g_norm=0.5898 acc_corrupt_t_0p6_0p8=0.8964 corrupt_frac_t_0p6_0p8=0.0132 acc_corrupt_t_0p8_1p0=0.9167 corrupt_frac_t_0p8_1p0=0.0078 loss_all=1.3305 init_gold_top10=0.2117 init_gold_top100=0.2832
195
+ step=800 epoch=800/1000 epoch_step=1/1 micro_steps=800 elapsed=3.7s lr=2.000000e-03 loss=0.9537 loss_recon=0.9537 loss_meanflow=0.0000 mean_model_t=0.2103 mean_corrupt_t=0.2103 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.7156 corrupt_frac=1.0000 acc_corrupt=0.7156 loss_corrupt=0.9537 wrong_frac=0.7897 init_acc_corrupt=0.2086 acc_corrupt_t_0p0_0p2=0.5634 corrupt_frac_t_0p0_0p2=0.5525 acc_corrupt_t_0p2_0p4=0.8899 corrupt_frac_t_0p2_0p4=0.3592 acc_corrupt_t_0p4_0p6=0.9571 corrupt_frac_t_0p4_0p6=0.0800 acc_corrupt_t_0p6_0p8=0.9757 corrupt_frac_t_0p6_0p8=0.0141 out_w_norm=11.3997 out_g_norm=0.6929 acc_corrupt_t_0p8_1p0=1.0000 corrupt_frac_t_0p8_1p0=0.0078 loss_all=0.5768 init_gold_top10=0.2208 init_gold_top100=0.2931
196
+ step=900 epoch=900/1000 epoch_step=1/1 micro_steps=900 elapsed=3.7s lr=2.000000e-03 loss=0.5253 loss_recon=0.5253 loss_meanflow=0.0000 mean_model_t=0.2094 mean_corrupt_t=0.2094 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.8455 corrupt_frac=1.0000 acc_corrupt=0.8455 loss_corrupt=0.5253 wrong_frac=0.7906 init_acc_corrupt=0.2078 acc_corrupt_t_0p0_0p2=0.7406 corrupt_frac_t_0p0_0p2=0.5584 acc_corrupt_t_0p2_0p4=0.9744 corrupt_frac_t_0p2_0p4=0.3530 acc_corrupt_t_0p4_0p6=0.9931 corrupt_frac_t_0p4_0p6=0.0801 acc_corrupt_t_0p6_0p8=0.9967 corrupt_frac_t_0p6_0p8=0.0125 out_w_norm=11.7285 out_g_norm=0.5915 acc_corrupt_t_0p8_1p0=0.9974 corrupt_frac_t_0p8_1p0=0.0078 loss_all=0.3668 init_gold_top10=0.2190 init_gold_top100=0.2921
197
+ step=1000 epoch=1000/1000 epoch_step=1/1 micro_steps=1000 elapsed=3.7s lr=2.000000e-03 loss=0.3128 loss_recon=0.3128 loss_meanflow=0.0000 mean_model_t=0.2094 mean_corrupt_t=0.2094 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.9050 corrupt_frac=1.0000 acc_corrupt=0.9050 loss_corrupt=0.3128 wrong_frac=0.7908 init_acc_corrupt=0.2077 acc_corrupt_t_0p0_0p2=0.8329 corrupt_frac_t_0p0_0p2=0.5561 acc_corrupt_t_0p2_0p4=0.9944 corrupt_frac_t_0p2_0p4=0.3595 acc_corrupt_t_0p4_0p6=0.9985 corrupt_frac_t_0p4_0p6=0.0759 acc_corrupt_t_0p6_0p8=0.9982 corrupt_frac_t_0p6_0p8=0.0129 out_w_norm=11.9476 out_g_norm=0.4867 acc_corrupt_t_0p8_1p0=0.9980 corrupt_frac_t_0p8_1p0=0.0078 loss_all=0.2410 init_gold_top10=0.2187 init_gold_top100=0.2915
198
+ NCCL version 2.25.1+cuda12.8
199
+ resumed_from=runs/train8_noisegeo_len256_allcorrupt_highC64_4096_20260517_163805/latest.pt start_step=1001
200
+ {
201
+ "device": "cuda:0",
202
+ "rank": 0,
203
+ "world_size": 4,
204
+ "samples": "owt_cached_chunks:8",
205
+ "vocab_size": 969,
206
+ "tokenizer_vocab_size": 50257,
207
+ "save_dir": "runs/train8_noisegeo_len256_allcorrupt_highC64_4096_20260517_163805",
208
+ "batch_size": 128,
209
+ "grad_accum": 1,
210
+ "effective_batch_size": 512,
211
+ "global_batch_size": 512,
212
+ "lr_schedule": "constant_warmup",
213
+ "optimizer": "muon",
214
+ "epochs": 0.0,
215
+ "steps_per_epoch": 1,
216
+ "total_steps": 2000,
217
+ "warmup_steps": 10,
218
+ "warmup_epochs": -1.0,
219
+ "min_lr": 0.0,
220
+ "weight_decay": 0.1,
221
+ "output_weight_decay": -1.0,
222
+ "adamw_param_groups": "nanogpt",
223
+ "adam_beta1": 0.9,
224
+ "adam_beta2": 0.95,
225
+ "adam_eps": 1e-08,
226
+ "muon_impl": "legacy",
227
+ "muon_momentum": 0.95,
228
+ "muon_ns_steps": 5,
229
+ "muon_update_scale": 1.0,
230
+ "muon_nesterov": false,
231
+ "muon_width_scale": false,
232
+ "muon_grouping": "legacy_dim_ge_2",
233
+ "muon_param_count": 1965440,
234
+ "muon_adam_param_count": 8192,
235
+ "muon_param_names": [
236
+ "vocab_embed.embedding",
237
+ "sigma_map.net.0.weight",
238
+ "sigma_map.net.2.weight",
239
+ "blocks.0.attn_qkv.weight",
240
+ "blocks.0.attn_out.weight",
241
+ "blocks.0.mlp.0.weight",
242
+ "blocks.0.mlp.2.weight",
243
+ "blocks.0.adaLN_modulation.weight",
244
+ "blocks.1.attn_qkv.weight",
245
+ "blocks.1.attn_out.weight",
246
+ "blocks.1.mlp.0.weight",
247
+ "blocks.1.mlp.2.weight",
248
+ "blocks.1.adaLN_modulation.weight",
249
+ "blocks.2.attn_qkv.weight",
250
+ "blocks.2.attn_out.weight",
251
+ "blocks.2.mlp.0.weight",
252
+ "blocks.2.mlp.2.weight",
253
+ "blocks.2.adaLN_modulation.weight",
254
+ "output_layer.linear.weight",
255
+ "output_layer.adaLN_modulation.weight"
256
+ ],
257
+ "muon_adam_param_names": [
258
+ "sigma_map.net.0.bias",
259
+ "sigma_map.net.2.bias",
260
+ "blocks.0.norm1.weight",
261
+ "blocks.0.norm2.weight",
262
+ "blocks.0.mlp.0.bias",
263
+ "blocks.0.mlp.2.bias",
264
+ "blocks.0.adaLN_modulation.bias",
265
+ "blocks.1.norm1.weight",
266
+ "blocks.1.norm2.weight",
267
+ "blocks.1.mlp.0.bias",
268
+ "blocks.1.mlp.2.bias",
269
+ "blocks.1.adaLN_modulation.bias",
270
+ "blocks.2.norm1.weight",
271
+ "blocks.2.norm2.weight",
272
+ "blocks.2.mlp.0.bias",
273
+ "blocks.2.mlp.2.bias",
274
+ "blocks.2.adaLN_modulation.bias",
275
+ "output_layer.norm_final.weight",
276
+ "output_layer.adaLN_modulation.bias"
277
+ ],
278
+ "muon_effective_nesterov": false,
279
+ "muon_effective_width_scale": false,
280
+ "muon_effective_weight_decay": 0.1,
281
+ "muon_adam_fallback_nesterov": false,
282
+ "muon_adam_fallback_weight_decay": 0.1,
283
+ "ema_decay": 0.9999,
284
+ "ema_start_step": 0,
285
+ "model_type": "ddit",
286
+ "ddit_mlp_type": "gelu",
287
+ "elf_num_time_tokens": 4,
288
+ "elf_num_model_mode_tokens": 0,
289
+ "qk_norm": true,
290
+ "output_bias": false,
291
+ "output_init_std": -1.0,
292
+ "norm_type": "rmsnorm",
293
+ "target_loss": "hard_ce",
294
+ "linear_soft_target_power": 1.0,
295
+ "linear_soft_target_min_conf": 0.0,
296
+ "linear_soft_target_max_conf": 1.0,
297
+ "t_sampling_mode": "logit_normal",
298
+ "t_sampling_power": 1.0,
299
+ "t_sampling_eps": 0.0001,
300
+ "t_sampling_logit_mean": -1.5,
301
+ "t_sampling_logit_std": 0.8,
302
+ "dual_t": true,
303
+ "corrupt_t_mode": "same",
304
+ "corrupt_min_t": 0.0,
305
+ "corrupt_max_t": 1.0,
306
+ "prefix_block_prob": 0.0,
307
+ "prefix_block_len": 128,
308
+ "mask_ratio_floor_schedule": "none",
309
+ "dirichlet_endpoint_mode": "categorical_dual_t",
310
+ "dirichlet_semantic_t_mode": "same",
311
+ "dirichlet_semantic_t_value": 0.0,
312
+ "dirichlet_semantic_t_curve": "linear",
313
+ "dirichlet_semantic_t_power": 1.0,
314
+ "endpoint_sequence_random_prob_alpha": 0.0,
315
+ "categorical_wrong_from_full_vocab": true,
316
+ "categorical_wrong_from_batch_valid_tokens": false,
317
+ "categorical_wrong_basin_token_ids": "",
318
+ "categorical_wrong_basin_prob": 0.0,
319
+ "categorical_wrong_unigram_prob": 0.0,
320
+ "categorical_wrong_uniform_prob": 0.0,
321
+ "categorical_wrong_corpus_unigram_path": "",
322
+ "categorical_wrong_corpus_unigram_alpha": 1.0,
323
+ "categorical_wrong_basin_shared_prob": 0.0,
324
+ "categorical_wrong_unigram_shared_prob": 0.0,
325
+ "mask_mixture_original_prob": 0.0,
326
+ "mask_mixture_lowk_prob": 0.0,
327
+ "mask_mixture_lowcorrupt_prob": 0.0,
328
+ "mask_mixture_block_prob": 0.0,
329
+ "mask_mixture_all_prob": 1.0,
330
+ "mask_mixture_lowk_clean_tokens": "0",
331
+ "mask_mixture_lowcorrupt_tokens": "1,2,4,8,16,32,64",
332
+ "mask_mixture_block_tokens": "64,128",
333
+ "simplex_bridge_sampler": "dirichlet",
334
+ "logistic_normal_sigma_min": 0.1,
335
+ "logistic_normal_sigma_max": 1.0,
336
+ "logistic_normal_tau_min": 1.0,
337
+ "logistic_normal_tau_max": 1.0,
338
+ "torch_compile": false,
339
+ "compile_mode": "max-autotune",
340
+ "state_format": "prob",
341
+ "meanflow_weight": 0.0,
342
+ "rollout_train_prob": 0.0,
343
+ "rollout_train_steps": 1,
344
+ "rollout_train_infer_steps": 64,
345
+ "rollout_train_temp": 1.45,
346
+ "rollout_train_max_gamma": 1.0,
347
+ "rollout_train_corrupt_only": true,
348
+ "rollout_train_samplewise": false,
349
+ "rollout_train_compute_always": false,
350
+ "bridge_noise_init": "logistic_normal",
351
+ "noise_sigma": -1.0,
352
+ "allow_tf32": true,
353
+ "activation_checkpointing": false,
354
+ "activation_checkpoint_interval": 1,
355
+ "activation_checkpoint_scope": "block",
356
+ "ddp_static_graph": false,
357
+ "ddp_gradient_as_bucket_view": true,
358
+ "blocking_data_transfer": false,
359
+ "dataloader_prefetch_factor": 4,
360
+ "full_train_stats": false,
361
+ "tokenized_hf": false,
362
+ "tokenized_pad_token": "pad",
363
+ "elf_conditional_hf": false,
364
+ "record_pad_truncate": false,
365
+ "record_add_eos": false,
366
+ "record_add_special_tokens": false,
367
+ "record_pad_token": "pad",
368
+ "record_shuffle_buffer": 10000,
369
+ "wrap": true,
370
+ "wrap_mode": "stream",
371
+ "wrap_record_buffer_size": 200,
372
+ "owt_cached_chunks": true,
373
+ "owt_chunk_cache_dir": "/e2e-data/evad-tech-vla/wanghan58/data/small_benchmarks/langflow_2604_11748/openwebtext_lta_cached_chunks/gpt2_len256_train8_compact_overfit",
374
+ "owt_chunk_cache_rebuild": false,
375
+ "owt_chunk_cache_write_batch": 4096,
376
+ "owt_exact_repeat_per_chunk": 64,
377
+ "online_chunk_shuffle": false,
378
+ "online_chunk_shuffle_buffer": 10000,
379
+ "openwebtext_split": "train_minus_100k",
380
+ "detokenizer": "auto",
381
+ "resolved_detokenizer": null,
382
+ "num_workers": 0,
383
+ "latest_every": 1000,
384
+ "resume_path": "runs/train8_noisegeo_len256_allcorrupt_highC64_4096_20260517_163805/latest.pt"
385
+ }
386
+ step=1100 epoch=1100/2000 epoch_step=1/1 micro_steps=1100 elapsed=4.4s lr=2.000000e-03 loss=0.2449 loss_recon=0.2449 loss_meanflow=0.0000 mean_model_t=0.2082 mean_corrupt_t=0.2082 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.9244 corrupt_frac=1.0000 acc_corrupt=0.9244 loss_corrupt=0.2449 wrong_frac=0.7917 init_acc_corrupt=0.2067 acc_corrupt_t_0p0_0p2=0.8659 corrupt_frac_t_0p0_0p2=0.5588 acc_corrupt_t_0p2_0p4=0.9983 corrupt_frac_t_0p2_0p4=0.3579 acc_corrupt_t_0p4_0p6=0.9992 corrupt_frac_t_0p4_0p6=0.0752 acc_corrupt_t_0p6_0p8=0.9987 corrupt_frac_t_0p6_0p8=0.0125 out_w_norm=12.0079 out_g_norm=0.3943 acc_corrupt_t_0p8_1p0=0.9961 corrupt_frac_t_0p8_1p0=0.0078 loss_all=0.2226 init_gold_top10=0.2053 init_gold_top100=0.2800
387
+ step=1200 epoch=1200/2000 epoch_step=1/1 micro_steps=1200 elapsed=3.9s lr=2.000000e-03 loss=0.2175 loss_recon=0.2175 loss_meanflow=0.0000 mean_model_t=0.2081 mean_corrupt_t=0.2081 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.9306 corrupt_frac=1.0000 acc_corrupt=0.9306 loss_corrupt=0.2175 wrong_frac=0.7925 init_acc_corrupt=0.2059 acc_corrupt_t_0p0_0p2=0.8761 corrupt_frac_t_0p0_0p2=0.5577 acc_corrupt_t_0p2_0p4=0.9994 corrupt_frac_t_0p2_0p4=0.3608 acc_corrupt_t_0p4_0p6=0.9996 corrupt_frac_t_0p4_0p6=0.0734 acc_corrupt_t_0p6_0p8=0.9987 corrupt_frac_t_0p6_0p8=0.0119 out_w_norm=12.0269 out_g_norm=0.3266 acc_corrupt_t_0p8_1p0=0.9980 corrupt_frac_t_0p8_1p0=0.0078 loss_all=0.2401 init_gold_top10=0.2043 init_gold_top100=0.2783
388
+ step=1300 epoch=1300/2000 epoch_step=1/1 micro_steps=1300 elapsed=3.9s lr=2.000000e-03 loss=0.1832 loss_recon=0.1832 loss_meanflow=0.0000 mean_model_t=0.2117 mean_corrupt_t=0.2117 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.9416 corrupt_frac=1.0000 acc_corrupt=0.9416 loss_corrupt=0.1832 wrong_frac=0.7884 init_acc_corrupt=0.2102 acc_corrupt_t_0p0_0p2=0.8938 corrupt_frac_t_0p0_0p2=0.5484 acc_corrupt_t_0p2_0p4=0.9996 corrupt_frac_t_0p2_0p4=0.3611 acc_corrupt_t_0p4_0p6=0.9998 corrupt_frac_t_0p4_0p6=0.0803 acc_corrupt_t_0p6_0p8=0.9995 corrupt_frac_t_0p6_0p8=0.0139 out_w_norm=12.0197 out_g_norm=0.3052 acc_corrupt_t_0p8_1p0=0.9961 corrupt_frac_t_0p8_1p0=0.0078 loss_all=0.1627 init_gold_top10=0.2224 init_gold_top100=0.2981
389
+ step=1400 epoch=1400/2000 epoch_step=1/1 micro_steps=1400 elapsed=3.9s lr=2.000000e-03 loss=0.1813 loss_recon=0.1813 loss_meanflow=0.0000 mean_model_t=0.2073 mean_corrupt_t=0.2073 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.9413 corrupt_frac=1.0000 acc_corrupt=0.9413 loss_corrupt=0.1813 wrong_frac=0.7927 init_acc_corrupt=0.2056 acc_corrupt_t_0p0_0p2=0.8956 corrupt_frac_t_0p0_0p2=0.5613 acc_corrupt_t_0p2_0p4=0.9997 corrupt_frac_t_0p2_0p4=0.3571 acc_corrupt_t_0p4_0p6=0.9998 corrupt_frac_t_0p4_0p6=0.0727 acc_corrupt_t_0p6_0p8=0.9996 corrupt_frac_t_0p6_0p8=0.0126 out_w_norm=12.0342 out_g_norm=0.2857 loss_all=0.0911 init_gold_top10=0.2197 init_gold_top100=0.2939
390
+ step=1500 epoch=1500/2000 epoch_step=1/1 micro_steps=1500 elapsed=3.9s lr=2.000000e-03 loss=0.1679 loss_recon=0.1679 loss_meanflow=0.0000 mean_model_t=0.2097 mean_corrupt_t=0.2097 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.9460 corrupt_frac=1.0000 acc_corrupt=0.9460 loss_corrupt=0.1679 wrong_frac=0.7904 init_acc_corrupt=0.2080 acc_corrupt_t_0p0_0p2=0.9026 corrupt_frac_t_0p0_0p2=0.5540 acc_corrupt_t_0p2_0p4=0.9999 corrupt_frac_t_0p2_0p4=0.3605 acc_corrupt_t_0p4_0p6=0.9999 corrupt_frac_t_0p4_0p6=0.0745 out_w_norm=12.0459 out_g_norm=0.2685 acc_corrupt_t_0p6_0p8=0.9998 corrupt_frac_t_0p6_0p8=0.0139 acc_corrupt_t_0p8_1p0=0.9987 corrupt_frac_t_0p8_1p0=0.0078 loss_all=0.1933 init_gold_top10=0.2225 init_gold_top100=0.2952
391
+ step=1600 epoch=1600/2000 epoch_step=1/1 micro_steps=1600 elapsed=3.9s lr=2.000000e-03 loss=0.1488 loss_recon=0.1488 loss_meanflow=0.0000 mean_model_t=0.2089 mean_corrupt_t=0.2089 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.9519 corrupt_frac=1.0000 acc_corrupt=0.9519 loss_corrupt=0.1488 wrong_frac=0.7910 init_acc_corrupt=0.2074 acc_corrupt_t_0p0_0p2=0.9142 corrupt_frac_t_0p0_0p2=0.5598 acc_corrupt_t_0p2_0p4=0.9999 corrupt_frac_t_0p2_0p4=0.3568 acc_corrupt_t_0p4_0p6=0.9999 corrupt_frac_t_0p4_0p6=0.0750 acc_corrupt_t_0p6_0p8=0.9996 corrupt_frac_t_0p6_0p8=0.0120 out_w_norm=12.0548 out_g_norm=0.2284 acc_corrupt_t_0p8_1p0=0.9987 corrupt_frac_t_0p8_1p0=0.0094 loss_all=0.1261 init_gold_top10=0.2072 init_gold_top100=0.2815
392
+ step=1700 epoch=1700/2000 epoch_step=1/1 micro_steps=1700 elapsed=3.9s lr=2.000000e-03 loss=0.1446 loss_recon=0.1446 loss_meanflow=0.0000 mean_model_t=0.2095 mean_corrupt_t=0.2095 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.9526 corrupt_frac=1.0000 acc_corrupt=0.9526 loss_corrupt=0.1446 wrong_frac=0.7907 init_acc_corrupt=0.2077 acc_corrupt_t_0p0_0p2=0.9147 corrupt_frac_t_0p0_0p2=0.5552 acc_corrupt_t_0p2_0p4=0.9999 corrupt_frac_t_0p2_0p4=0.3584 acc_corrupt_t_0p4_0p6=0.9999 corrupt_frac_t_0p4_0p6=0.0764 out_w_norm=12.0717 out_g_norm=0.1855 acc_corrupt_t_0p6_0p8=0.9999 corrupt_frac_t_0p6_0p8=0.0132 acc_corrupt_t_0p8_1p0=0.9987 corrupt_frac_t_0p8_1p0=0.0078 loss_all=0.1127 init_gold_top10=0.2117 init_gold_top100=0.2832
393
+ step=1800 epoch=1800/2000 epoch_step=1/1 micro_steps=1800 elapsed=3.9s lr=2.000000e-03 loss=0.1385 loss_recon=0.1385 loss_meanflow=0.0000 mean_model_t=0.2103 mean_corrupt_t=0.2103 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.9550 corrupt_frac=1.0000 acc_corrupt=0.9550 loss_corrupt=0.1385 wrong_frac=0.7897 init_acc_corrupt=0.2086 acc_corrupt_t_0p0_0p2=0.9186 corrupt_frac_t_0p0_0p2=0.5525 acc_corrupt_t_0p2_0p4=0.9999 corrupt_frac_t_0p2_0p4=0.3592 acc_corrupt_t_0p4_0p6=0.9999 corrupt_frac_t_0p4_0p6=0.0800 acc_corrupt_t_0p6_0p8=0.9999 corrupt_frac_t_0p6_0p8=0.0141 out_w_norm=12.0831 out_g_norm=0.1934 acc_corrupt_t_0p8_1p0=1.0000 corrupt_frac_t_0p8_1p0=0.0078 loss_all=0.0903 init_gold_top10=0.2208 init_gold_top100=0.2931
394
+ step=1900 epoch=1900/2000 epoch_step=1/1 micro_steps=1900 elapsed=3.9s lr=2.000000e-03 loss=0.1373 loss_recon=0.1373 loss_meanflow=0.0000 mean_model_t=0.2094 mean_corrupt_t=0.2094 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.9553 corrupt_frac=1.0000 acc_corrupt=0.9553 loss_corrupt=0.1373 wrong_frac=0.7906 init_acc_corrupt=0.2078 acc_corrupt_t_0p0_0p2=0.9199 corrupt_frac_t_0p0_0p2=0.5584 acc_corrupt_t_0p2_0p4=0.9999 corrupt_frac_t_0p2_0p4=0.3530 acc_corrupt_t_0p4_0p6=0.9999 corrupt_frac_t_0p4_0p6=0.0801 acc_corrupt_t_0p6_0p8=0.9999 corrupt_frac_t_0p6_0p8=0.0125 out_w_norm=12.0937 out_g_norm=0.1728 acc_corrupt_t_0p8_1p0=0.9961 corrupt_frac_t_0p8_1p0=0.0078 loss_all=0.1211 init_gold_top10=0.2190 init_gold_top100=0.2921
395
+ step=2000 epoch=2000/2000 epoch_step=1/1 micro_steps=2000 elapsed=3.9s lr=2.000000e-03 loss=0.1201 loss_recon=0.1201 loss_meanflow=0.0000 mean_model_t=0.2094 mean_corrupt_t=0.2094 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.9606 corrupt_frac=1.0000 acc_corrupt=0.9606 loss_corrupt=0.1201 wrong_frac=0.7908 init_acc_corrupt=0.2077 acc_corrupt_t_0p0_0p2=0.9292 corrupt_frac_t_0p0_0p2=0.5561 acc_corrupt_t_0p2_0p4=0.9999 corrupt_frac_t_0p2_0p4=0.3595 acc_corrupt_t_0p4_0p6=0.9999 corrupt_frac_t_0p4_0p6=0.0759 acc_corrupt_t_0p6_0p8=0.9999 corrupt_frac_t_0p6_0p8=0.0129 out_w_norm=12.1029 out_g_norm=0.1619 acc_corrupt_t_0p8_1p0=1.0000 corrupt_frac_t_0p8_1p0=0.0078 loss_all=0.0543 init_gold_top10=0.2187 init_gold_top100=0.2915
396
+ NCCL version 2.25.1+cuda12.8
397
+ resumed_from=runs/train8_noisegeo_len256_allcorrupt_highC64_4096_20260517_163805/latest.pt start_step=2001
398
+ {
399
+ "device": "cuda:0",
400
+ "rank": 0,
401
+ "world_size": 4,
402
+ "samples": "owt_cached_chunks:8",
403
+ "vocab_size": 969,
404
+ "tokenizer_vocab_size": 50257,
405
+ "save_dir": "runs/train8_noisegeo_len256_allcorrupt_highC64_4096_20260517_163805",
406
+ "batch_size": 128,
407
+ "grad_accum": 1,
408
+ "effective_batch_size": 512,
409
+ "global_batch_size": 512,
410
+ "lr_schedule": "constant_warmup",
411
+ "optimizer": "muon",
412
+ "epochs": 0.0,
413
+ "steps_per_epoch": 1,
414
+ "total_steps": 3000,
415
+ "warmup_steps": 10,
416
+ "warmup_epochs": -1.0,
417
+ "min_lr": 0.0,
418
+ "weight_decay": 0.1,
419
+ "output_weight_decay": -1.0,
420
+ "adamw_param_groups": "nanogpt",
421
+ "adam_beta1": 0.9,
422
+ "adam_beta2": 0.95,
423
+ "adam_eps": 1e-08,
424
+ "muon_impl": "legacy",
425
+ "muon_momentum": 0.95,
426
+ "muon_ns_steps": 5,
427
+ "muon_update_scale": 1.0,
428
+ "muon_nesterov": false,
429
+ "muon_width_scale": false,
430
+ "muon_grouping": "legacy_dim_ge_2",
431
+ "muon_param_count": 1965440,
432
+ "muon_adam_param_count": 8192,
433
+ "muon_param_names": [
434
+ "vocab_embed.embedding",
435
+ "sigma_map.net.0.weight",
436
+ "sigma_map.net.2.weight",
437
+ "blocks.0.attn_qkv.weight",
438
+ "blocks.0.attn_out.weight",
439
+ "blocks.0.mlp.0.weight",
440
+ "blocks.0.mlp.2.weight",
441
+ "blocks.0.adaLN_modulation.weight",
442
+ "blocks.1.attn_qkv.weight",
443
+ "blocks.1.attn_out.weight",
444
+ "blocks.1.mlp.0.weight",
445
+ "blocks.1.mlp.2.weight",
446
+ "blocks.1.adaLN_modulation.weight",
447
+ "blocks.2.attn_qkv.weight",
448
+ "blocks.2.attn_out.weight",
449
+ "blocks.2.mlp.0.weight",
450
+ "blocks.2.mlp.2.weight",
451
+ "blocks.2.adaLN_modulation.weight",
452
+ "output_layer.linear.weight",
453
+ "output_layer.adaLN_modulation.weight"
454
+ ],
455
+ "muon_adam_param_names": [
456
+ "sigma_map.net.0.bias",
457
+ "sigma_map.net.2.bias",
458
+ "blocks.0.norm1.weight",
459
+ "blocks.0.norm2.weight",
460
+ "blocks.0.mlp.0.bias",
461
+ "blocks.0.mlp.2.bias",
462
+ "blocks.0.adaLN_modulation.bias",
463
+ "blocks.1.norm1.weight",
464
+ "blocks.1.norm2.weight",
465
+ "blocks.1.mlp.0.bias",
466
+ "blocks.1.mlp.2.bias",
467
+ "blocks.1.adaLN_modulation.bias",
468
+ "blocks.2.norm1.weight",
469
+ "blocks.2.norm2.weight",
470
+ "blocks.2.mlp.0.bias",
471
+ "blocks.2.mlp.2.bias",
472
+ "blocks.2.adaLN_modulation.bias",
473
+ "output_layer.norm_final.weight",
474
+ "output_layer.adaLN_modulation.bias"
475
+ ],
476
+ "muon_effective_nesterov": false,
477
+ "muon_effective_width_scale": false,
478
+ "muon_effective_weight_decay": 0.1,
479
+ "muon_adam_fallback_nesterov": false,
480
+ "muon_adam_fallback_weight_decay": 0.1,
481
+ "ema_decay": 0.9999,
482
+ "ema_start_step": 0,
483
+ "model_type": "ddit",
484
+ "ddit_mlp_type": "gelu",
485
+ "elf_num_time_tokens": 4,
486
+ "elf_num_model_mode_tokens": 0,
487
+ "qk_norm": true,
488
+ "output_bias": false,
489
+ "output_init_std": -1.0,
490
+ "norm_type": "rmsnorm",
491
+ "target_loss": "hard_ce",
492
+ "linear_soft_target_power": 1.0,
493
+ "linear_soft_target_min_conf": 0.0,
494
+ "linear_soft_target_max_conf": 1.0,
495
+ "t_sampling_mode": "logit_normal",
496
+ "t_sampling_power": 1.0,
497
+ "t_sampling_eps": 0.0001,
498
+ "t_sampling_logit_mean": -1.5,
499
+ "t_sampling_logit_std": 0.8,
500
+ "dual_t": true,
501
+ "corrupt_t_mode": "same",
502
+ "corrupt_min_t": 0.0,
503
+ "corrupt_max_t": 1.0,
504
+ "prefix_block_prob": 0.0,
505
+ "prefix_block_len": 128,
506
+ "mask_ratio_floor_schedule": "none",
507
+ "dirichlet_endpoint_mode": "categorical_dual_t",
508
+ "dirichlet_semantic_t_mode": "same",
509
+ "dirichlet_semantic_t_value": 0.0,
510
+ "dirichlet_semantic_t_curve": "linear",
511
+ "dirichlet_semantic_t_power": 1.0,
512
+ "endpoint_sequence_random_prob_alpha": 0.0,
513
+ "categorical_wrong_from_full_vocab": true,
514
+ "categorical_wrong_from_batch_valid_tokens": false,
515
+ "categorical_wrong_basin_token_ids": "",
516
+ "categorical_wrong_basin_prob": 0.0,
517
+ "categorical_wrong_unigram_prob": 0.0,
518
+ "categorical_wrong_uniform_prob": 0.0,
519
+ "categorical_wrong_corpus_unigram_path": "",
520
+ "categorical_wrong_corpus_unigram_alpha": 1.0,
521
+ "categorical_wrong_basin_shared_prob": 0.0,
522
+ "categorical_wrong_unigram_shared_prob": 0.0,
523
+ "mask_mixture_original_prob": 0.0,
524
+ "mask_mixture_lowk_prob": 0.0,
525
+ "mask_mixture_lowcorrupt_prob": 0.0,
526
+ "mask_mixture_block_prob": 0.0,
527
+ "mask_mixture_all_prob": 1.0,
528
+ "mask_mixture_lowk_clean_tokens": "0",
529
+ "mask_mixture_lowcorrupt_tokens": "1,2,4,8,16,32,64",
530
+ "mask_mixture_block_tokens": "64,128",
531
+ "simplex_bridge_sampler": "dirichlet",
532
+ "logistic_normal_sigma_min": 0.1,
533
+ "logistic_normal_sigma_max": 1.0,
534
+ "logistic_normal_tau_min": 1.0,
535
+ "logistic_normal_tau_max": 1.0,
536
+ "torch_compile": false,
537
+ "compile_mode": "max-autotune",
538
+ "state_format": "prob",
539
+ "meanflow_weight": 0.0,
540
+ "rollout_train_prob": 0.0,
541
+ "rollout_train_steps": 1,
542
+ "rollout_train_infer_steps": 64,
543
+ "rollout_train_temp": 1.45,
544
+ "rollout_train_max_gamma": 1.0,
545
+ "rollout_train_corrupt_only": true,
546
+ "rollout_train_samplewise": false,
547
+ "rollout_train_compute_always": false,
548
+ "bridge_noise_init": "logistic_normal",
549
+ "noise_sigma": -1.0,
550
+ "allow_tf32": true,
551
+ "activation_checkpointing": false,
552
+ "activation_checkpoint_interval": 1,
553
+ "activation_checkpoint_scope": "block",
554
+ "ddp_static_graph": false,
555
+ "ddp_gradient_as_bucket_view": true,
556
+ "blocking_data_transfer": false,
557
+ "dataloader_prefetch_factor": 4,
558
+ "full_train_stats": false,
559
+ "tokenized_hf": false,
560
+ "tokenized_pad_token": "pad",
561
+ "elf_conditional_hf": false,
562
+ "record_pad_truncate": false,
563
+ "record_add_eos": false,
564
+ "record_add_special_tokens": false,
565
+ "record_pad_token": "pad",
566
+ "record_shuffle_buffer": 10000,
567
+ "wrap": true,
568
+ "wrap_mode": "stream",
569
+ "wrap_record_buffer_size": 200,
570
+ "owt_cached_chunks": true,
571
+ "owt_chunk_cache_dir": "/e2e-data/evad-tech-vla/wanghan58/data/small_benchmarks/langflow_2604_11748/openwebtext_lta_cached_chunks/gpt2_len256_train8_compact_overfit",
572
+ "owt_chunk_cache_rebuild": false,
573
+ "owt_chunk_cache_write_batch": 4096,
574
+ "owt_exact_repeat_per_chunk": 64,
575
+ "online_chunk_shuffle": false,
576
+ "online_chunk_shuffle_buffer": 10000,
577
+ "openwebtext_split": "train_minus_100k",
578
+ "detokenizer": "auto",
579
+ "resolved_detokenizer": null,
580
+ "num_workers": 0,
581
+ "latest_every": 1000,
582
+ "resume_path": "runs/train8_noisegeo_len256_allcorrupt_highC64_4096_20260517_163805/latest.pt"
583
+ }
584
+ step=2100 epoch=2100/3000 epoch_step=1/1 micro_steps=2100 elapsed=4.4s lr=2.000000e-03 loss=0.1151 loss_recon=0.1151 loss_meanflow=0.0000 mean_model_t=0.2082 mean_corrupt_t=0.2082 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.9627 corrupt_frac=1.0000 acc_corrupt=0.9627 loss_corrupt=0.1151 wrong_frac=0.7917 init_acc_corrupt=0.2067 acc_corrupt_t_0p0_0p2=0.9332 corrupt_frac_t_0p0_0p2=0.5588 acc_corrupt_t_0p2_0p4=0.9999 corrupt_frac_t_0p2_0p4=0.3579 acc_corrupt_t_0p4_0p6=0.9999 corrupt_frac_t_0p4_0p6=0.0752 acc_corrupt_t_0p6_0p8=0.9995 corrupt_frac_t_0p6_0p8=0.0125 out_w_norm=12.1280 out_g_norm=0.1673 acc_corrupt_t_0p8_1p0=1.0000 corrupt_frac_t_0p8_1p0=0.0078 loss_all=0.0847 init_gold_top10=0.2053 init_gold_top100=0.2800
585
+ step=2200 epoch=2200/3000 epoch_step=1/1 micro_steps=2200 elapsed=3.7s lr=2.000000e-03 loss=0.1195 loss_recon=0.1195 loss_meanflow=0.0000 mean_model_t=0.2081 mean_corrupt_t=0.2081 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.9618 corrupt_frac=1.0000 acc_corrupt=0.9618 loss_corrupt=0.1195 wrong_frac=0.7925 init_acc_corrupt=0.2059 acc_corrupt_t_0p0_0p2=0.9315 corrupt_frac_t_0p0_0p2=0.5577 acc_corrupt_t_0p2_0p4=1.0000 corrupt_frac_t_0p2_0p4=0.3608 acc_corrupt_t_0p4_0p6=1.0000 corrupt_frac_t_0p4_0p6=0.0734 acc_corrupt_t_0p6_0p8=1.0000 corrupt_frac_t_0p6_0p8=0.0119 out_w_norm=12.1464 out_g_norm=0.1466 acc_corrupt_t_0p8_1p0=1.0000 corrupt_frac_t_0p8_1p0=0.0078 loss_all=0.1061 init_gold_top10=0.2043 init_gold_top100=0.2783
586
+ step=2300 epoch=2300/3000 epoch_step=1/1 micro_steps=2300 elapsed=3.7s lr=2.000000e-03 loss=0.1005 loss_recon=0.1005 loss_meanflow=0.0000 mean_model_t=0.2117 mean_corrupt_t=0.2117 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.9673 corrupt_frac=1.0000 acc_corrupt=0.9673 loss_corrupt=0.1005 wrong_frac=0.7884 init_acc_corrupt=0.2102 acc_corrupt_t_0p0_0p2=0.9405 corrupt_frac_t_0p0_0p2=0.5484 acc_corrupt_t_0p2_0p4=1.0000 corrupt_frac_t_0p2_0p4=0.3611 acc_corrupt_t_0p4_0p6=1.0000 corrupt_frac_t_0p4_0p6=0.0803 acc_corrupt_t_0p6_0p8=0.9998 corrupt_frac_t_0p6_0p8=0.0139 out_w_norm=12.1477 out_g_norm=0.1316 acc_corrupt_t_0p8_1p0=1.0000 corrupt_frac_t_0p8_1p0=0.0078 loss_all=0.0869 init_gold_top10=0.2224 init_gold_top100=0.2981
587
+ step=2400 epoch=2400/3000 epoch_step=1/1 micro_steps=2400 elapsed=3.7s lr=2.000000e-03 loss=0.1115 loss_recon=0.1115 loss_meanflow=0.0000 mean_model_t=0.2073 mean_corrupt_t=0.2073 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.9634 corrupt_frac=1.0000 acc_corrupt=0.9634 loss_corrupt=0.1115 wrong_frac=0.7927 init_acc_corrupt=0.2056 acc_corrupt_t_0p0_0p2=0.9349 corrupt_frac_t_0p0_0p2=0.5613 acc_corrupt_t_0p2_0p4=1.0000 corrupt_frac_t_0p2_0p4=0.3571 acc_corrupt_t_0p4_0p6=0.9999 corrupt_frac_t_0p4_0p6=0.0727 acc_corrupt_t_0p6_0p8=0.9999 corrupt_frac_t_0p6_0p8=0.0126 out_w_norm=12.1573 out_g_norm=0.1343 loss_all=0.0942 init_gold_top10=0.2197 init_gold_top100=0.2939
588
+ step=2500 epoch=2500/3000 epoch_step=1/1 micro_steps=2500 elapsed=3.7s lr=2.000000e-03 loss=0.1057 loss_recon=0.1057 loss_meanflow=0.0000 mean_model_t=0.2097 mean_corrupt_t=0.2097 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.9654 corrupt_frac=1.0000 acc_corrupt=0.9654 loss_corrupt=0.1057 wrong_frac=0.7904 init_acc_corrupt=0.2080 acc_corrupt_t_0p0_0p2=0.9376 corrupt_frac_t_0p0_0p2=0.5540 acc_corrupt_t_0p2_0p4=1.0000 corrupt_frac_t_0p2_0p4=0.3605 acc_corrupt_t_0p4_0p6=0.9999 corrupt_frac_t_0p4_0p6=0.0745 out_w_norm=12.1754 out_g_norm=0.1418 acc_corrupt_t_0p6_0p8=0.9996 corrupt_frac_t_0p6_0p8=0.0139 acc_corrupt_t_0p8_1p0=1.0000 corrupt_frac_t_0p8_1p0=0.0078 loss_all=0.1002 init_gold_top10=0.2225 init_gold_top100=0.2952
589
+ step=2600 epoch=2600/3000 epoch_step=1/1 micro_steps=2600 elapsed=3.7s lr=2.000000e-03 loss=0.0998 loss_recon=0.0998 loss_meanflow=0.0000 mean_model_t=0.2089 mean_corrupt_t=0.2089 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.9678 corrupt_frac=1.0000 acc_corrupt=0.9678 loss_corrupt=0.0998 wrong_frac=0.7910 init_acc_corrupt=0.2074 acc_corrupt_t_0p0_0p2=0.9426 corrupt_frac_t_0p0_0p2=0.5598 acc_corrupt_t_0p2_0p4=1.0000 corrupt_frac_t_0p2_0p4=0.3568 acc_corrupt_t_0p4_0p6=1.0000 corrupt_frac_t_0p4_0p6=0.0750 acc_corrupt_t_0p6_0p8=1.0000 corrupt_frac_t_0p6_0p8=0.0120 out_w_norm=12.1861 out_g_norm=0.1271 acc_corrupt_t_0p8_1p0=0.9987 corrupt_frac_t_0p8_1p0=0.0094 loss_all=0.0707 init_gold_top10=0.2072 init_gold_top100=0.2815
590
+ step=2700 epoch=2700/3000 epoch_step=1/1 micro_steps=2700 elapsed=3.7s lr=2.000000e-03 loss=0.0970 loss_recon=0.0970 loss_meanflow=0.0000 mean_model_t=0.2095 mean_corrupt_t=0.2095 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.9686 corrupt_frac=1.0000 acc_corrupt=0.9686 loss_corrupt=0.0970 wrong_frac=0.7907 init_acc_corrupt=0.2077 acc_corrupt_t_0p0_0p2=0.9435 corrupt_frac_t_0p0_0p2=0.5552 acc_corrupt_t_0p2_0p4=1.0000 corrupt_frac_t_0p2_0p4=0.3584 acc_corrupt_t_0p4_0p6=1.0000 corrupt_frac_t_0p4_0p6=0.0764 out_w_norm=12.1979 out_g_norm=0.1102 acc_corrupt_t_0p6_0p8=0.9999 corrupt_frac_t_0p6_0p8=0.0132 acc_corrupt_t_0p8_1p0=1.0000 corrupt_frac_t_0p8_1p0=0.0078 loss_all=0.0570 init_gold_top10=0.2117 init_gold_top100=0.2832
591
+ step=2800 epoch=2800/3000 epoch_step=1/1 micro_steps=2800 elapsed=3.7s lr=2.000000e-03 loss=0.0949 loss_recon=0.0949 loss_meanflow=0.0000 mean_model_t=0.2103 mean_corrupt_t=0.2103 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.9698 corrupt_frac=1.0000 acc_corrupt=0.9698 loss_corrupt=0.0949 wrong_frac=0.7897 init_acc_corrupt=0.2086 acc_corrupt_t_0p0_0p2=0.9453 corrupt_frac_t_0p0_0p2=0.5525 acc_corrupt_t_0p2_0p4=0.9999 corrupt_frac_t_0p2_0p4=0.3592 acc_corrupt_t_0p4_0p6=0.9999 corrupt_frac_t_0p4_0p6=0.0800 acc_corrupt_t_0p6_0p8=0.9997 corrupt_frac_t_0p6_0p8=0.0141 out_w_norm=12.2015 out_g_norm=0.1196 acc_corrupt_t_0p8_1p0=1.0000 corrupt_frac_t_0p8_1p0=0.0078 loss_all=0.0685 init_gold_top10=0.2208 init_gold_top100=0.2931
592
+ step=2900 epoch=2900/3000 epoch_step=1/1 micro_steps=2900 elapsed=3.7s lr=2.000000e-03 loss=0.0920 loss_recon=0.0920 loss_meanflow=0.0000 mean_model_t=0.2094 mean_corrupt_t=0.2094 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.9697 corrupt_frac=1.0000 acc_corrupt=0.9697 loss_corrupt=0.0920 wrong_frac=0.7906 init_acc_corrupt=0.2078 acc_corrupt_t_0p0_0p2=0.9458 corrupt_frac_t_0p0_0p2=0.5584 acc_corrupt_t_0p2_0p4=1.0000 corrupt_frac_t_0p2_0p4=0.3530 acc_corrupt_t_0p4_0p6=1.0000 corrupt_frac_t_0p4_0p6=0.0801 acc_corrupt_t_0p6_0p8=0.9999 corrupt_frac_t_0p6_0p8=0.0125 out_w_norm=12.2185 out_g_norm=0.1084 acc_corrupt_t_0p8_1p0=0.9987 corrupt_frac_t_0p8_1p0=0.0078 loss_all=0.0760 init_gold_top10=0.2190 init_gold_top100=0.2921
593
+ step=3000 epoch=3000/3000 epoch_step=1/1 micro_steps=3000 elapsed=3.7s lr=2.000000e-03 loss=0.0899 loss_recon=0.0899 loss_meanflow=0.0000 mean_model_t=0.2094 mean_corrupt_t=0.2094 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.9708 corrupt_frac=1.0000 acc_corrupt=0.9708 loss_corrupt=0.0899 wrong_frac=0.7908 init_acc_corrupt=0.2077 acc_corrupt_t_0p0_0p2=0.9476 corrupt_frac_t_0p0_0p2=0.5561 acc_corrupt_t_0p2_0p4=1.0000 corrupt_frac_t_0p2_0p4=0.3595 acc_corrupt_t_0p4_0p6=1.0000 corrupt_frac_t_0p4_0p6=0.0759 acc_corrupt_t_0p6_0p8=0.9997 corrupt_frac_t_0p6_0p8=0.0129 out_w_norm=12.2416 out_g_norm=0.1068 acc_corrupt_t_0p8_1p0=1.0000 corrupt_frac_t_0p8_1p0=0.0078 loss_all=0.0347 init_gold_top10=0.2187 init_gold_top100=0.2915
594
+ NCCL version 2.25.1+cuda12.8
595
+ resumed_from=runs/train8_noisegeo_len256_allcorrupt_highC64_4096_20260517_163805/latest.pt start_step=3001
596
+ {
597
+ "device": "cuda:0",
598
+ "rank": 0,
599
+ "world_size": 4,
600
+ "samples": "owt_cached_chunks:8",
601
+ "vocab_size": 969,
602
+ "tokenizer_vocab_size": 50257,
603
+ "save_dir": "runs/train8_noisegeo_len256_allcorrupt_highC64_4096_20260517_163805",
604
+ "batch_size": 128,
605
+ "grad_accum": 1,
606
+ "effective_batch_size": 512,
607
+ "global_batch_size": 512,
608
+ "lr_schedule": "constant_warmup",
609
+ "optimizer": "muon",
610
+ "epochs": 0.0,
611
+ "steps_per_epoch": 1,
612
+ "total_steps": 4000,
613
+ "warmup_steps": 10,
614
+ "warmup_epochs": -1.0,
615
+ "min_lr": 0.0,
616
+ "weight_decay": 0.1,
617
+ "output_weight_decay": -1.0,
618
+ "adamw_param_groups": "nanogpt",
619
+ "adam_beta1": 0.9,
620
+ "adam_beta2": 0.95,
621
+ "adam_eps": 1e-08,
622
+ "muon_impl": "legacy",
623
+ "muon_momentum": 0.95,
624
+ "muon_ns_steps": 5,
625
+ "muon_update_scale": 1.0,
626
+ "muon_nesterov": false,
627
+ "muon_width_scale": false,
628
+ "muon_grouping": "legacy_dim_ge_2",
629
+ "muon_param_count": 1965440,
630
+ "muon_adam_param_count": 8192,
631
+ "muon_param_names": [
632
+ "vocab_embed.embedding",
633
+ "sigma_map.net.0.weight",
634
+ "sigma_map.net.2.weight",
635
+ "blocks.0.attn_qkv.weight",
636
+ "blocks.0.attn_out.weight",
637
+ "blocks.0.mlp.0.weight",
638
+ "blocks.0.mlp.2.weight",
639
+ "blocks.0.adaLN_modulation.weight",
640
+ "blocks.1.attn_qkv.weight",
641
+ "blocks.1.attn_out.weight",
642
+ "blocks.1.mlp.0.weight",
643
+ "blocks.1.mlp.2.weight",
644
+ "blocks.1.adaLN_modulation.weight",
645
+ "blocks.2.attn_qkv.weight",
646
+ "blocks.2.attn_out.weight",
647
+ "blocks.2.mlp.0.weight",
648
+ "blocks.2.mlp.2.weight",
649
+ "blocks.2.adaLN_modulation.weight",
650
+ "output_layer.linear.weight",
651
+ "output_layer.adaLN_modulation.weight"
652
+ ],
653
+ "muon_adam_param_names": [
654
+ "sigma_map.net.0.bias",
655
+ "sigma_map.net.2.bias",
656
+ "blocks.0.norm1.weight",
657
+ "blocks.0.norm2.weight",
658
+ "blocks.0.mlp.0.bias",
659
+ "blocks.0.mlp.2.bias",
660
+ "blocks.0.adaLN_modulation.bias",
661
+ "blocks.1.norm1.weight",
662
+ "blocks.1.norm2.weight",
663
+ "blocks.1.mlp.0.bias",
664
+ "blocks.1.mlp.2.bias",
665
+ "blocks.1.adaLN_modulation.bias",
666
+ "blocks.2.norm1.weight",
667
+ "blocks.2.norm2.weight",
668
+ "blocks.2.mlp.0.bias",
669
+ "blocks.2.mlp.2.bias",
670
+ "blocks.2.adaLN_modulation.bias",
671
+ "output_layer.norm_final.weight",
672
+ "output_layer.adaLN_modulation.bias"
673
+ ],
674
+ "muon_effective_nesterov": false,
675
+ "muon_effective_width_scale": false,
676
+ "muon_effective_weight_decay": 0.1,
677
+ "muon_adam_fallback_nesterov": false,
678
+ "muon_adam_fallback_weight_decay": 0.1,
679
+ "ema_decay": 0.9999,
680
+ "ema_start_step": 0,
681
+ "model_type": "ddit",
682
+ "ddit_mlp_type": "gelu",
683
+ "elf_num_time_tokens": 4,
684
+ "elf_num_model_mode_tokens": 0,
685
+ "qk_norm": true,
686
+ "output_bias": false,
687
+ "output_init_std": -1.0,
688
+ "norm_type": "rmsnorm",
689
+ "target_loss": "hard_ce",
690
+ "linear_soft_target_power": 1.0,
691
+ "linear_soft_target_min_conf": 0.0,
692
+ "linear_soft_target_max_conf": 1.0,
693
+ "t_sampling_mode": "logit_normal",
694
+ "t_sampling_power": 1.0,
695
+ "t_sampling_eps": 0.0001,
696
+ "t_sampling_logit_mean": -1.5,
697
+ "t_sampling_logit_std": 0.8,
698
+ "dual_t": true,
699
+ "corrupt_t_mode": "same",
700
+ "corrupt_min_t": 0.0,
701
+ "corrupt_max_t": 1.0,
702
+ "prefix_block_prob": 0.0,
703
+ "prefix_block_len": 128,
704
+ "mask_ratio_floor_schedule": "none",
705
+ "dirichlet_endpoint_mode": "categorical_dual_t",
706
+ "dirichlet_semantic_t_mode": "same",
707
+ "dirichlet_semantic_t_value": 0.0,
708
+ "dirichlet_semantic_t_curve": "linear",
709
+ "dirichlet_semantic_t_power": 1.0,
710
+ "endpoint_sequence_random_prob_alpha": 0.0,
711
+ "categorical_wrong_from_full_vocab": true,
712
+ "categorical_wrong_from_batch_valid_tokens": false,
713
+ "categorical_wrong_basin_token_ids": "",
714
+ "categorical_wrong_basin_prob": 0.0,
715
+ "categorical_wrong_unigram_prob": 0.0,
716
+ "categorical_wrong_uniform_prob": 0.0,
717
+ "categorical_wrong_corpus_unigram_path": "",
718
+ "categorical_wrong_corpus_unigram_alpha": 1.0,
719
+ "categorical_wrong_basin_shared_prob": 0.0,
720
+ "categorical_wrong_unigram_shared_prob": 0.0,
721
+ "mask_mixture_original_prob": 0.0,
722
+ "mask_mixture_lowk_prob": 0.0,
723
+ "mask_mixture_lowcorrupt_prob": 0.0,
724
+ "mask_mixture_block_prob": 0.0,
725
+ "mask_mixture_all_prob": 1.0,
726
+ "mask_mixture_lowk_clean_tokens": "0",
727
+ "mask_mixture_lowcorrupt_tokens": "1,2,4,8,16,32,64",
728
+ "mask_mixture_block_tokens": "64,128",
729
+ "simplex_bridge_sampler": "dirichlet",
730
+ "logistic_normal_sigma_min": 0.1,
731
+ "logistic_normal_sigma_max": 1.0,
732
+ "logistic_normal_tau_min": 1.0,
733
+ "logistic_normal_tau_max": 1.0,
734
+ "torch_compile": false,
735
+ "compile_mode": "max-autotune",
736
+ "state_format": "prob",
737
+ "meanflow_weight": 0.0,
738
+ "rollout_train_prob": 0.0,
739
+ "rollout_train_steps": 1,
740
+ "rollout_train_infer_steps": 64,
741
+ "rollout_train_temp": 1.45,
742
+ "rollout_train_max_gamma": 1.0,
743
+ "rollout_train_corrupt_only": true,
744
+ "rollout_train_samplewise": false,
745
+ "rollout_train_compute_always": false,
746
+ "bridge_noise_init": "logistic_normal",
747
+ "noise_sigma": -1.0,
748
+ "allow_tf32": true,
749
+ "activation_checkpointing": false,
750
+ "activation_checkpoint_interval": 1,
751
+ "activation_checkpoint_scope": "block",
752
+ "ddp_static_graph": false,
753
+ "ddp_gradient_as_bucket_view": true,
754
+ "blocking_data_transfer": false,
755
+ "dataloader_prefetch_factor": 4,
756
+ "full_train_stats": false,
757
+ "tokenized_hf": false,
758
+ "tokenized_pad_token": "pad",
759
+ "elf_conditional_hf": false,
760
+ "record_pad_truncate": false,
761
+ "record_add_eos": false,
762
+ "record_add_special_tokens": false,
763
+ "record_pad_token": "pad",
764
+ "record_shuffle_buffer": 10000,
765
+ "wrap": true,
766
+ "wrap_mode": "stream",
767
+ "wrap_record_buffer_size": 200,
768
+ "owt_cached_chunks": true,
769
+ "owt_chunk_cache_dir": "/e2e-data/evad-tech-vla/wanghan58/data/small_benchmarks/langflow_2604_11748/openwebtext_lta_cached_chunks/gpt2_len256_train8_compact_overfit",
770
+ "owt_chunk_cache_rebuild": false,
771
+ "owt_chunk_cache_write_batch": 4096,
772
+ "owt_exact_repeat_per_chunk": 64,
773
+ "online_chunk_shuffle": false,
774
+ "online_chunk_shuffle_buffer": 10000,
775
+ "openwebtext_split": "train_minus_100k",
776
+ "detokenizer": "auto",
777
+ "resolved_detokenizer": null,
778
+ "num_workers": 0,
779
+ "latest_every": 1000,
780
+ "resume_path": "runs/train8_noisegeo_len256_allcorrupt_highC64_4096_20260517_163805/latest.pt"
781
+ }
782
+ step=3100 epoch=3100/4000 epoch_step=1/1 micro_steps=3100 elapsed=4.4s lr=2.000000e-03 loss=0.0858 loss_recon=0.0858 loss_meanflow=0.0000 mean_model_t=0.2082 mean_corrupt_t=0.2082 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.9728 corrupt_frac=1.0000 acc_corrupt=0.9728 loss_corrupt=0.0858 wrong_frac=0.7917 init_acc_corrupt=0.2067 acc_corrupt_t_0p0_0p2=0.9514 corrupt_frac_t_0p0_0p2=0.5588 acc_corrupt_t_0p2_0p4=0.9999 corrupt_frac_t_0p2_0p4=0.3579 acc_corrupt_t_0p4_0p6=0.9999 corrupt_frac_t_0p4_0p6=0.0752 acc_corrupt_t_0p6_0p8=0.9996 corrupt_frac_t_0p6_0p8=0.0125 out_w_norm=12.2445 out_g_norm=0.1039 acc_corrupt_t_0p8_1p0=0.9980 corrupt_frac_t_0p8_1p0=0.0078 loss_all=0.0670 init_gold_top10=0.2053 init_gold_top100=0.2800
783
+ step=3200 epoch=3200/4000 epoch_step=1/1 micro_steps=3200 elapsed=3.7s lr=2.000000e-03 loss=0.0896 loss_recon=0.0896 loss_meanflow=0.0000 mean_model_t=0.2081 mean_corrupt_t=0.2081 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.9710 corrupt_frac=1.0000 acc_corrupt=0.9710 loss_corrupt=0.0896 wrong_frac=0.7925 init_acc_corrupt=0.2059 acc_corrupt_t_0p0_0p2=0.9480 corrupt_frac_t_0p0_0p2=0.5577 acc_corrupt_t_0p2_0p4=1.0000 corrupt_frac_t_0p2_0p4=0.3608 acc_corrupt_t_0p4_0p6=0.9999 corrupt_frac_t_0p4_0p6=0.0734 acc_corrupt_t_0p6_0p8=0.9998 corrupt_frac_t_0p6_0p8=0.0119 out_w_norm=12.2565 out_g_norm=0.1005 acc_corrupt_t_0p8_1p0=0.9980 corrupt_frac_t_0p8_1p0=0.0078 loss_all=0.0698 init_gold_top10=0.2043 init_gold_top100=0.2783
784
+ step=3300 epoch=3300/4000 epoch_step=1/1 micro_steps=3300 elapsed=3.7s lr=2.000000e-03 loss=0.0751 loss_recon=0.0751 loss_meanflow=0.0000 mean_model_t=0.2117 mean_corrupt_t=0.2117 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.9757 corrupt_frac=1.0000 acc_corrupt=0.9757 loss_corrupt=0.0751 wrong_frac=0.7884 init_acc_corrupt=0.2102 acc_corrupt_t_0p0_0p2=0.9558 corrupt_frac_t_0p0_0p2=0.5484 acc_corrupt_t_0p2_0p4=1.0000 corrupt_frac_t_0p2_0p4=0.3611 acc_corrupt_t_0p4_0p6=1.0000 corrupt_frac_t_0p4_0p6=0.0803 acc_corrupt_t_0p6_0p8=0.9999 corrupt_frac_t_0p6_0p8=0.0139 out_w_norm=12.2656 out_g_norm=0.0997 acc_corrupt_t_0p8_1p0=1.0000 corrupt_frac_t_0p8_1p0=0.0078 loss_all=0.0877 init_gold_top10=0.2224 init_gold_top100=0.2981
785
+ step=3400 epoch=3400/4000 epoch_step=1/1 micro_steps=3400 elapsed=3.7s lr=2.000000e-03 loss=0.0874 loss_recon=0.0874 loss_meanflow=0.0000 mean_model_t=0.2073 mean_corrupt_t=0.2073 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.9713 corrupt_frac=1.0000 acc_corrupt=0.9713 loss_corrupt=0.0874 wrong_frac=0.7927 init_acc_corrupt=0.2056 acc_corrupt_t_0p0_0p2=0.9490 corrupt_frac_t_0p0_0p2=0.5613 acc_corrupt_t_0p2_0p4=1.0000 corrupt_frac_t_0p2_0p4=0.3571 acc_corrupt_t_0p4_0p6=1.0000 corrupt_frac_t_0p4_0p6=0.0727 acc_corrupt_t_0p6_0p8=0.9999 corrupt_frac_t_0p6_0p8=0.0126 out_w_norm=12.2718 out_g_norm=0.0918 loss_all=0.0652 init_gold_top10=0.2197 init_gold_top100=0.2939
786
+ step=3500 epoch=3500/4000 epoch_step=1/1 micro_steps=3500 elapsed=3.7s lr=2.000000e-03 loss=0.0786 loss_recon=0.0786 loss_meanflow=0.0000 mean_model_t=0.2097 mean_corrupt_t=0.2097 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.9741 corrupt_frac=1.0000 acc_corrupt=0.9741 loss_corrupt=0.0786 wrong_frac=0.7904 init_acc_corrupt=0.2080 acc_corrupt_t_0p0_0p2=0.9533 corrupt_frac_t_0p0_0p2=0.5540 acc_corrupt_t_0p2_0p4=1.0000 corrupt_frac_t_0p2_0p4=0.3605 acc_corrupt_t_0p4_0p6=0.9999 corrupt_frac_t_0p4_0p6=0.0745 out_w_norm=12.2720 out_g_norm=0.0895 acc_corrupt_t_0p6_0p8=0.9999 corrupt_frac_t_0p6_0p8=0.0139 acc_corrupt_t_0p8_1p0=0.9987 corrupt_frac_t_0p8_1p0=0.0078 loss_all=0.0935 init_gold_top10=0.2225 init_gold_top100=0.2952
787
+ step=3600 epoch=3600/4000 epoch_step=1/1 micro_steps=3600 elapsed=3.7s lr=2.000000e-03 loss=0.0763 loss_recon=0.0763 loss_meanflow=0.0000 mean_model_t=0.2089 mean_corrupt_t=0.2089 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.9753 corrupt_frac=1.0000 acc_corrupt=0.9753 loss_corrupt=0.0763 wrong_frac=0.7910 init_acc_corrupt=0.2074 acc_corrupt_t_0p0_0p2=0.9560 corrupt_frac_t_0p0_0p2=0.5598 acc_corrupt_t_0p2_0p4=1.0000 corrupt_frac_t_0p2_0p4=0.3568 acc_corrupt_t_0p4_0p6=1.0000 corrupt_frac_t_0p4_0p6=0.0750 acc_corrupt_t_0p6_0p8=1.0000 corrupt_frac_t_0p6_0p8=0.0120 out_w_norm=12.2727 out_g_norm=0.0911 acc_corrupt_t_0p8_1p0=1.0000 corrupt_frac_t_0p8_1p0=0.0094 loss_all=0.0643 init_gold_top10=0.2072 init_gold_top100=0.2815
788
+ step=3700 epoch=3700/4000 epoch_step=1/1 micro_steps=3700 elapsed=3.7s lr=2.000000e-03 loss=0.0781 loss_recon=0.0781 loss_meanflow=0.0000 mean_model_t=0.2095 mean_corrupt_t=0.2095 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.9748 corrupt_frac=1.0000 acc_corrupt=0.9748 loss_corrupt=0.0781 wrong_frac=0.7907 init_acc_corrupt=0.2077 acc_corrupt_t_0p0_0p2=0.9546 corrupt_frac_t_0p0_0p2=0.5552 acc_corrupt_t_0p2_0p4=1.0000 corrupt_frac_t_0p2_0p4=0.3584 acc_corrupt_t_0p4_0p6=1.0000 corrupt_frac_t_0p4_0p6=0.0764 out_w_norm=12.2705 out_g_norm=0.0846 acc_corrupt_t_0p6_0p8=1.0000 corrupt_frac_t_0p6_0p8=0.0132 acc_corrupt_t_0p8_1p0=1.0000 corrupt_frac_t_0p8_1p0=0.0078 loss_all=0.0330 init_gold_top10=0.2117 init_gold_top100=0.2832
789
+ step=3800 epoch=3800/4000 epoch_step=1/1 micro_steps=3800 elapsed=3.7s lr=2.000000e-03 loss=0.0692 loss_recon=0.0692 loss_meanflow=0.0000 mean_model_t=0.2103 mean_corrupt_t=0.2103 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.9780 corrupt_frac=1.0000 acc_corrupt=0.9780 loss_corrupt=0.0692 wrong_frac=0.7897 init_acc_corrupt=0.2086 acc_corrupt_t_0p0_0p2=0.9602 corrupt_frac_t_0p0_0p2=0.5525 acc_corrupt_t_0p2_0p4=1.0000 corrupt_frac_t_0p2_0p4=0.3592 acc_corrupt_t_0p4_0p6=1.0000 corrupt_frac_t_0p4_0p6=0.0800 acc_corrupt_t_0p6_0p8=1.0000 corrupt_frac_t_0p6_0p8=0.0141 out_w_norm=12.2632 out_g_norm=0.0868 acc_corrupt_t_0p8_1p0=1.0000 corrupt_frac_t_0p8_1p0=0.0078 loss_all=0.0457 init_gold_top10=0.2208 init_gold_top100=0.2931
790
+ step=3900 epoch=3900/4000 epoch_step=1/1 micro_steps=3900 elapsed=3.7s lr=2.000000e-03 loss=0.0706 loss_recon=0.0706 loss_meanflow=0.0000 mean_model_t=0.2094 mean_corrupt_t=0.2094 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.9766 corrupt_frac=1.0000 acc_corrupt=0.9766 loss_corrupt=0.0706 wrong_frac=0.7906 init_acc_corrupt=0.2078 acc_corrupt_t_0p0_0p2=0.9581 corrupt_frac_t_0p0_0p2=0.5584 acc_corrupt_t_0p2_0p4=1.0000 corrupt_frac_t_0p2_0p4=0.3530 acc_corrupt_t_0p4_0p6=1.0000 corrupt_frac_t_0p4_0p6=0.0801 acc_corrupt_t_0p6_0p8=1.0000 corrupt_frac_t_0p6_0p8=0.0125 out_w_norm=12.2531 out_g_norm=0.0820 acc_corrupt_t_0p8_1p0=1.0000 corrupt_frac_t_0p8_1p0=0.0078 loss_all=0.0530 init_gold_top10=0.2190 init_gold_top100=0.2921
791
+ step=4000 epoch=4000/4000 epoch_step=1/1 micro_steps=4000 elapsed=3.7s lr=2.000000e-03 loss=0.0658 loss_recon=0.0658 loss_meanflow=0.0000 mean_model_t=0.2094 mean_corrupt_t=0.2094 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.9795 corrupt_frac=1.0000 acc_corrupt=0.9795 loss_corrupt=0.0658 wrong_frac=0.7908 init_acc_corrupt=0.2077 acc_corrupt_t_0p0_0p2=0.9631 corrupt_frac_t_0p0_0p2=0.5561 acc_corrupt_t_0p2_0p4=1.0000 corrupt_frac_t_0p2_0p4=0.3595 acc_corrupt_t_0p4_0p6=0.9999 corrupt_frac_t_0p4_0p6=0.0759 acc_corrupt_t_0p6_0p8=1.0000 corrupt_frac_t_0p6_0p8=0.0129 out_w_norm=12.2590 out_g_norm=0.0768 acc_corrupt_t_0p8_1p0=1.0000 corrupt_frac_t_0p8_1p0=0.0078 loss_all=0.0183 init_gold_top10=0.2187 init_gold_top100=0.2915
792
+ NCCL version 2.25.1+cuda12.8
793
+ resumed_from=runs/train8_noisegeo_len256_allcorrupt_highC64_4096_20260517_163805/latest.pt start_step=4001
794
+ {
795
+ "device": "cuda:0",
796
+ "rank": 0,
797
+ "world_size": 4,
798
+ "samples": "owt_cached_chunks:8",
799
+ "vocab_size": 969,
800
+ "tokenizer_vocab_size": 50257,
801
+ "save_dir": "runs/train8_noisegeo_len256_allcorrupt_highC64_4096_20260517_163805",
802
+ "batch_size": 128,
803
+ "grad_accum": 1,
804
+ "effective_batch_size": 512,
805
+ "global_batch_size": 512,
806
+ "lr_schedule": "constant_warmup",
807
+ "optimizer": "muon",
808
+ "epochs": 0.0,
809
+ "steps_per_epoch": 1,
810
+ "total_steps": 5000,
811
+ "warmup_steps": 10,
812
+ "warmup_epochs": -1.0,
813
+ "min_lr": 0.0,
814
+ "weight_decay": 0.1,
815
+ "output_weight_decay": -1.0,
816
+ "adamw_param_groups": "nanogpt",
817
+ "adam_beta1": 0.9,
818
+ "adam_beta2": 0.95,
819
+ "adam_eps": 1e-08,
820
+ "muon_impl": "legacy",
821
+ "muon_momentum": 0.95,
822
+ "muon_ns_steps": 5,
823
+ "muon_update_scale": 1.0,
824
+ "muon_nesterov": false,
825
+ "muon_width_scale": false,
826
+ "muon_grouping": "legacy_dim_ge_2",
827
+ "muon_param_count": 1965440,
828
+ "muon_adam_param_count": 8192,
829
+ "muon_param_names": [
830
+ "vocab_embed.embedding",
831
+ "sigma_map.net.0.weight",
832
+ "sigma_map.net.2.weight",
833
+ "blocks.0.attn_qkv.weight",
834
+ "blocks.0.attn_out.weight",
835
+ "blocks.0.mlp.0.weight",
836
+ "blocks.0.mlp.2.weight",
837
+ "blocks.0.adaLN_modulation.weight",
838
+ "blocks.1.attn_qkv.weight",
839
+ "blocks.1.attn_out.weight",
840
+ "blocks.1.mlp.0.weight",
841
+ "blocks.1.mlp.2.weight",
842
+ "blocks.1.adaLN_modulation.weight",
843
+ "blocks.2.attn_qkv.weight",
844
+ "blocks.2.attn_out.weight",
845
+ "blocks.2.mlp.0.weight",
846
+ "blocks.2.mlp.2.weight",
847
+ "blocks.2.adaLN_modulation.weight",
848
+ "output_layer.linear.weight",
849
+ "output_layer.adaLN_modulation.weight"
850
+ ],
851
+ "muon_adam_param_names": [
852
+ "sigma_map.net.0.bias",
853
+ "sigma_map.net.2.bias",
854
+ "blocks.0.norm1.weight",
855
+ "blocks.0.norm2.weight",
856
+ "blocks.0.mlp.0.bias",
857
+ "blocks.0.mlp.2.bias",
858
+ "blocks.0.adaLN_modulation.bias",
859
+ "blocks.1.norm1.weight",
860
+ "blocks.1.norm2.weight",
861
+ "blocks.1.mlp.0.bias",
862
+ "blocks.1.mlp.2.bias",
863
+ "blocks.1.adaLN_modulation.bias",
864
+ "blocks.2.norm1.weight",
865
+ "blocks.2.norm2.weight",
866
+ "blocks.2.mlp.0.bias",
867
+ "blocks.2.mlp.2.bias",
868
+ "blocks.2.adaLN_modulation.bias",
869
+ "output_layer.norm_final.weight",
870
+ "output_layer.adaLN_modulation.bias"
871
+ ],
872
+ "muon_effective_nesterov": false,
873
+ "muon_effective_width_scale": false,
874
+ "muon_effective_weight_decay": 0.1,
875
+ "muon_adam_fallback_nesterov": false,
876
+ "muon_adam_fallback_weight_decay": 0.1,
877
+ "ema_decay": 0.9999,
878
+ "ema_start_step": 0,
879
+ "model_type": "ddit",
880
+ "ddit_mlp_type": "gelu",
881
+ "elf_num_time_tokens": 4,
882
+ "elf_num_model_mode_tokens": 0,
883
+ "qk_norm": true,
884
+ "output_bias": false,
885
+ "output_init_std": -1.0,
886
+ "norm_type": "rmsnorm",
887
+ "target_loss": "hard_ce",
888
+ "linear_soft_target_power": 1.0,
889
+ "linear_soft_target_min_conf": 0.0,
890
+ "linear_soft_target_max_conf": 1.0,
891
+ "t_sampling_mode": "logit_normal",
892
+ "t_sampling_power": 1.0,
893
+ "t_sampling_eps": 0.0001,
894
+ "t_sampling_logit_mean": -1.5,
895
+ "t_sampling_logit_std": 0.8,
896
+ "dual_t": true,
897
+ "corrupt_t_mode": "same",
898
+ "corrupt_min_t": 0.0,
899
+ "corrupt_max_t": 1.0,
900
+ "prefix_block_prob": 0.0,
901
+ "prefix_block_len": 128,
902
+ "mask_ratio_floor_schedule": "none",
903
+ "dirichlet_endpoint_mode": "categorical_dual_t",
904
+ "dirichlet_semantic_t_mode": "same",
905
+ "dirichlet_semantic_t_value": 0.0,
906
+ "dirichlet_semantic_t_curve": "linear",
907
+ "dirichlet_semantic_t_power": 1.0,
908
+ "endpoint_sequence_random_prob_alpha": 0.0,
909
+ "categorical_wrong_from_full_vocab": true,
910
+ "categorical_wrong_from_batch_valid_tokens": false,
911
+ "categorical_wrong_basin_token_ids": "",
912
+ "categorical_wrong_basin_prob": 0.0,
913
+ "categorical_wrong_unigram_prob": 0.0,
914
+ "categorical_wrong_uniform_prob": 0.0,
915
+ "categorical_wrong_corpus_unigram_path": "",
916
+ "categorical_wrong_corpus_unigram_alpha": 1.0,
917
+ "categorical_wrong_basin_shared_prob": 0.0,
918
+ "categorical_wrong_unigram_shared_prob": 0.0,
919
+ "mask_mixture_original_prob": 0.0,
920
+ "mask_mixture_lowk_prob": 0.0,
921
+ "mask_mixture_lowcorrupt_prob": 0.0,
922
+ "mask_mixture_block_prob": 0.0,
923
+ "mask_mixture_all_prob": 1.0,
924
+ "mask_mixture_lowk_clean_tokens": "0",
925
+ "mask_mixture_lowcorrupt_tokens": "1,2,4,8,16,32,64",
926
+ "mask_mixture_block_tokens": "64,128",
927
+ "simplex_bridge_sampler": "dirichlet",
928
+ "logistic_normal_sigma_min": 0.1,
929
+ "logistic_normal_sigma_max": 1.0,
930
+ "logistic_normal_tau_min": 1.0,
931
+ "logistic_normal_tau_max": 1.0,
932
+ "torch_compile": false,
933
+ "compile_mode": "max-autotune",
934
+ "state_format": "prob",
935
+ "meanflow_weight": 0.0,
936
+ "rollout_train_prob": 0.0,
937
+ "rollout_train_steps": 1,
938
+ "rollout_train_infer_steps": 64,
939
+ "rollout_train_temp": 1.45,
940
+ "rollout_train_max_gamma": 1.0,
941
+ "rollout_train_corrupt_only": true,
942
+ "rollout_train_samplewise": false,
943
+ "rollout_train_compute_always": false,
944
+ "bridge_noise_init": "logistic_normal",
945
+ "noise_sigma": -1.0,
946
+ "allow_tf32": true,
947
+ "activation_checkpointing": false,
948
+ "activation_checkpoint_interval": 1,
949
+ "activation_checkpoint_scope": "block",
950
+ "ddp_static_graph": false,
951
+ "ddp_gradient_as_bucket_view": true,
952
+ "blocking_data_transfer": false,
953
+ "dataloader_prefetch_factor": 4,
954
+ "full_train_stats": false,
955
+ "tokenized_hf": false,
956
+ "tokenized_pad_token": "pad",
957
+ "elf_conditional_hf": false,
958
+ "record_pad_truncate": false,
959
+ "record_add_eos": false,
960
+ "record_add_special_tokens": false,
961
+ "record_pad_token": "pad",
962
+ "record_shuffle_buffer": 10000,
963
+ "wrap": true,
964
+ "wrap_mode": "stream",
965
+ "wrap_record_buffer_size": 200,
966
+ "owt_cached_chunks": true,
967
+ "owt_chunk_cache_dir": "/e2e-data/evad-tech-vla/wanghan58/data/small_benchmarks/langflow_2604_11748/openwebtext_lta_cached_chunks/gpt2_len256_train8_compact_overfit",
968
+ "owt_chunk_cache_rebuild": false,
969
+ "owt_chunk_cache_write_batch": 4096,
970
+ "owt_exact_repeat_per_chunk": 64,
971
+ "online_chunk_shuffle": false,
972
+ "online_chunk_shuffle_buffer": 10000,
973
+ "openwebtext_split": "train_minus_100k",
974
+ "detokenizer": "auto",
975
+ "resolved_detokenizer": null,
976
+ "num_workers": 0,
977
+ "latest_every": 1000,
978
+ "resume_path": "runs/train8_noisegeo_len256_allcorrupt_highC64_4096_20260517_163805/latest.pt"
979
+ }
980
+ step=4100 epoch=4100/5000 epoch_step=1/1 micro_steps=4100 elapsed=4.4s lr=2.000000e-03 loss=0.0738 loss_recon=0.0738 loss_meanflow=0.0000 mean_model_t=0.2082 mean_corrupt_t=0.2082 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.9772 corrupt_frac=1.0000 acc_corrupt=0.9772 loss_corrupt=0.0738 wrong_frac=0.7917 init_acc_corrupt=0.2067 acc_corrupt_t_0p0_0p2=0.9592 corrupt_frac_t_0p0_0p2=0.5588 acc_corrupt_t_0p2_0p4=1.0000 corrupt_frac_t_0p2_0p4=0.3579 acc_corrupt_t_0p4_0p6=0.9999 corrupt_frac_t_0p4_0p6=0.0752 acc_corrupt_t_0p6_0p8=0.9999 corrupt_frac_t_0p6_0p8=0.0125 out_w_norm=12.2596 out_g_norm=0.0802 acc_corrupt_t_0p8_1p0=1.0000 corrupt_frac_t_0p8_1p0=0.0078 loss_all=0.1209 init_gold_top10=0.2053 init_gold_top100=0.2800
981
+ step=4200 epoch=4200/5000 epoch_step=1/1 micro_steps=4200 elapsed=3.7s lr=2.000000e-03 loss=0.0741 loss_recon=0.0741 loss_meanflow=0.0000 mean_model_t=0.2081 mean_corrupt_t=0.2081 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.9760 corrupt_frac=1.0000 acc_corrupt=0.9760 loss_corrupt=0.0741 wrong_frac=0.7925 init_acc_corrupt=0.2059 acc_corrupt_t_0p0_0p2=0.9570 corrupt_frac_t_0p0_0p2=0.5577 acc_corrupt_t_0p2_0p4=1.0000 corrupt_frac_t_0p2_0p4=0.3608 acc_corrupt_t_0p4_0p6=0.9998 corrupt_frac_t_0p4_0p6=0.0734 acc_corrupt_t_0p6_0p8=1.0000 corrupt_frac_t_0p6_0p8=0.0119 out_w_norm=12.2460 out_g_norm=0.0796 acc_corrupt_t_0p8_1p0=1.0000 corrupt_frac_t_0p8_1p0=0.0078 loss_all=0.0588 init_gold_top10=0.2043 init_gold_top100=0.2783
982
+ step=4300 epoch=4300/5000 epoch_step=1/1 micro_steps=4300 elapsed=3.7s lr=2.000000e-03 loss=0.0609 loss_recon=0.0609 loss_meanflow=0.0000 mean_model_t=0.2117 mean_corrupt_t=0.2117 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.9804 corrupt_frac=1.0000 acc_corrupt=0.9804 loss_corrupt=0.0609 wrong_frac=0.7884 init_acc_corrupt=0.2102 acc_corrupt_t_0p0_0p2=0.9642 corrupt_frac_t_0p0_0p2=0.5484 acc_corrupt_t_0p2_0p4=1.0000 corrupt_frac_t_0p2_0p4=0.3611 acc_corrupt_t_0p4_0p6=1.0000 corrupt_frac_t_0p4_0p6=0.0803 acc_corrupt_t_0p6_0p8=0.9999 corrupt_frac_t_0p6_0p8=0.0139 out_w_norm=12.2396 out_g_norm=0.0777 acc_corrupt_t_0p8_1p0=1.0000 corrupt_frac_t_0p8_1p0=0.0078 loss_all=0.0815 init_gold_top10=0.2224 init_gold_top100=0.2981
983
+ step=4400 epoch=4400/5000 epoch_step=1/1 micro_steps=4400 elapsed=3.7s lr=2.000000e-03 loss=0.0665 loss_recon=0.0665 loss_meanflow=0.0000 mean_model_t=0.2073 mean_corrupt_t=0.2073 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.9787 corrupt_frac=1.0000 acc_corrupt=0.9787 loss_corrupt=0.0665 wrong_frac=0.7927 init_acc_corrupt=0.2056 acc_corrupt_t_0p0_0p2=0.9621 corrupt_frac_t_0p0_0p2=0.5613 acc_corrupt_t_0p2_0p4=1.0000 corrupt_frac_t_0p2_0p4=0.3571 acc_corrupt_t_0p4_0p6=1.0000 corrupt_frac_t_0p4_0p6=0.0727 acc_corrupt_t_0p6_0p8=1.0000 corrupt_frac_t_0p6_0p8=0.0126 out_w_norm=12.2321 out_g_norm=0.0750 loss_all=0.0665 init_gold_top10=0.2197 init_gold_top100=0.2939
984
+ step=4500 epoch=4500/5000 epoch_step=1/1 micro_steps=4500 elapsed=3.9s lr=2.000000e-03 loss=0.0639 loss_recon=0.0639 loss_meanflow=0.0000 mean_model_t=0.2097 mean_corrupt_t=0.2097 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.9799 corrupt_frac=1.0000 acc_corrupt=0.9799 loss_corrupt=0.0639 wrong_frac=0.7904 init_acc_corrupt=0.2080 acc_corrupt_t_0p0_0p2=0.9637 corrupt_frac_t_0p0_0p2=0.5540 acc_corrupt_t_0p2_0p4=1.0000 corrupt_frac_t_0p2_0p4=0.3605 acc_corrupt_t_0p4_0p6=1.0000 corrupt_frac_t_0p4_0p6=0.0745 out_w_norm=12.2223 out_g_norm=0.0766 acc_corrupt_t_0p6_0p8=0.9997 corrupt_frac_t_0p6_0p8=0.0139 acc_corrupt_t_0p8_1p0=1.0000 corrupt_frac_t_0p8_1p0=0.0078 loss_all=0.0581 init_gold_top10=0.2225 init_gold_top100=0.2952
985
+ step=4600 epoch=4600/5000 epoch_step=1/1 micro_steps=4600 elapsed=3.7s lr=2.000000e-03 loss=0.0625 loss_recon=0.0625 loss_meanflow=0.0000 mean_model_t=0.2089 mean_corrupt_t=0.2089 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.9799 corrupt_frac=1.0000 acc_corrupt=0.9799 loss_corrupt=0.0625 wrong_frac=0.7910 init_acc_corrupt=0.2074 acc_corrupt_t_0p0_0p2=0.9642 corrupt_frac_t_0p0_0p2=0.5598 acc_corrupt_t_0p2_0p4=1.0000 corrupt_frac_t_0p2_0p4=0.3568 acc_corrupt_t_0p4_0p6=1.0000 corrupt_frac_t_0p4_0p6=0.0750 acc_corrupt_t_0p6_0p8=0.9998 corrupt_frac_t_0p6_0p8=0.0120 out_w_norm=12.1981 out_g_norm=0.0736 acc_corrupt_t_0p8_1p0=0.9987 corrupt_frac_t_0p8_1p0=0.0094 loss_all=0.0510 init_gold_top10=0.2072 init_gold_top100=0.2815
986
+ step=4700 epoch=4700/5000 epoch_step=1/1 micro_steps=4700 elapsed=3.7s lr=2.000000e-03 loss=0.0641 loss_recon=0.0641 loss_meanflow=0.0000 mean_model_t=0.2095 mean_corrupt_t=0.2095 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.9791 corrupt_frac=1.0000 acc_corrupt=0.9791 loss_corrupt=0.0641 wrong_frac=0.7907 init_acc_corrupt=0.2077 acc_corrupt_t_0p0_0p2=0.9624 corrupt_frac_t_0p0_0p2=0.5552 acc_corrupt_t_0p2_0p4=1.0000 corrupt_frac_t_0p2_0p4=0.3584 acc_corrupt_t_0p4_0p6=1.0000 corrupt_frac_t_0p4_0p6=0.0764 out_w_norm=12.1963 out_g_norm=0.0704 acc_corrupt_t_0p6_0p8=0.9999 corrupt_frac_t_0p6_0p8=0.0132 acc_corrupt_t_0p8_1p0=1.0000 corrupt_frac_t_0p8_1p0=0.0078 loss_all=0.0676 init_gold_top10=0.2117 init_gold_top100=0.2832
987
+ step=4800 epoch=4800/5000 epoch_step=1/1 micro_steps=4800 elapsed=3.7s lr=2.000000e-03 loss=0.0633 loss_recon=0.0633 loss_meanflow=0.0000 mean_model_t=0.2103 mean_corrupt_t=0.2103 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.9797 corrupt_frac=1.0000 acc_corrupt=0.9797 loss_corrupt=0.0633 wrong_frac=0.7897 init_acc_corrupt=0.2086 acc_corrupt_t_0p0_0p2=0.9632 corrupt_frac_t_0p0_0p2=0.5525 acc_corrupt_t_0p2_0p4=0.9999 corrupt_frac_t_0p2_0p4=0.3592 acc_corrupt_t_0p4_0p6=1.0000 corrupt_frac_t_0p4_0p6=0.0800 acc_corrupt_t_0p6_0p8=0.9999 corrupt_frac_t_0p6_0p8=0.0141 out_w_norm=12.1704 out_g_norm=0.0736 acc_corrupt_t_0p8_1p0=1.0000 corrupt_frac_t_0p8_1p0=0.0078 loss_all=0.0173 init_gold_top10=0.2208 init_gold_top100=0.2931
LTA_openwebtext_dualt/logs/softendpoint_mn_pilot_4gpu/train8_noisegeo_len256_allcorrupt_logistic_sig0p05_0p5_20260517_163805.log ADDED
@@ -0,0 +1,791 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ NCCL version 2.25.1+cuda12.8
2
+ {
3
+ "device": "cuda:0",
4
+ "rank": 0,
5
+ "world_size": 4,
6
+ "samples": "owt_cached_chunks:8",
7
+ "vocab_size": 969,
8
+ "tokenizer_vocab_size": 50257,
9
+ "save_dir": "runs/train8_noisegeo_len256_allcorrupt_logistic_sig0p05_0p5_20260517_163805",
10
+ "batch_size": 128,
11
+ "grad_accum": 1,
12
+ "effective_batch_size": 512,
13
+ "global_batch_size": 512,
14
+ "lr_schedule": "constant_warmup",
15
+ "optimizer": "muon",
16
+ "epochs": 0.0,
17
+ "steps_per_epoch": 1,
18
+ "total_steps": 1000,
19
+ "warmup_steps": 10,
20
+ "warmup_epochs": -1.0,
21
+ "min_lr": 0.0,
22
+ "weight_decay": 0.1,
23
+ "output_weight_decay": -1.0,
24
+ "adamw_param_groups": "nanogpt",
25
+ "adam_beta1": 0.9,
26
+ "adam_beta2": 0.95,
27
+ "adam_eps": 1e-08,
28
+ "muon_impl": "legacy",
29
+ "muon_momentum": 0.95,
30
+ "muon_ns_steps": 5,
31
+ "muon_update_scale": 1.0,
32
+ "muon_nesterov": false,
33
+ "muon_width_scale": false,
34
+ "muon_grouping": "legacy_dim_ge_2",
35
+ "muon_param_count": 1965440,
36
+ "muon_adam_param_count": 8192,
37
+ "muon_param_names": [
38
+ "vocab_embed.embedding",
39
+ "sigma_map.net.0.weight",
40
+ "sigma_map.net.2.weight",
41
+ "blocks.0.attn_qkv.weight",
42
+ "blocks.0.attn_out.weight",
43
+ "blocks.0.mlp.0.weight",
44
+ "blocks.0.mlp.2.weight",
45
+ "blocks.0.adaLN_modulation.weight",
46
+ "blocks.1.attn_qkv.weight",
47
+ "blocks.1.attn_out.weight",
48
+ "blocks.1.mlp.0.weight",
49
+ "blocks.1.mlp.2.weight",
50
+ "blocks.1.adaLN_modulation.weight",
51
+ "blocks.2.attn_qkv.weight",
52
+ "blocks.2.attn_out.weight",
53
+ "blocks.2.mlp.0.weight",
54
+ "blocks.2.mlp.2.weight",
55
+ "blocks.2.adaLN_modulation.weight",
56
+ "output_layer.linear.weight",
57
+ "output_layer.adaLN_modulation.weight"
58
+ ],
59
+ "muon_adam_param_names": [
60
+ "sigma_map.net.0.bias",
61
+ "sigma_map.net.2.bias",
62
+ "blocks.0.norm1.weight",
63
+ "blocks.0.norm2.weight",
64
+ "blocks.0.mlp.0.bias",
65
+ "blocks.0.mlp.2.bias",
66
+ "blocks.0.adaLN_modulation.bias",
67
+ "blocks.1.norm1.weight",
68
+ "blocks.1.norm2.weight",
69
+ "blocks.1.mlp.0.bias",
70
+ "blocks.1.mlp.2.bias",
71
+ "blocks.1.adaLN_modulation.bias",
72
+ "blocks.2.norm1.weight",
73
+ "blocks.2.norm2.weight",
74
+ "blocks.2.mlp.0.bias",
75
+ "blocks.2.mlp.2.bias",
76
+ "blocks.2.adaLN_modulation.bias",
77
+ "output_layer.norm_final.weight",
78
+ "output_layer.adaLN_modulation.bias"
79
+ ],
80
+ "muon_effective_nesterov": false,
81
+ "muon_effective_width_scale": false,
82
+ "muon_effective_weight_decay": 0.1,
83
+ "muon_adam_fallback_nesterov": false,
84
+ "muon_adam_fallback_weight_decay": 0.1,
85
+ "ema_decay": 0.9999,
86
+ "ema_start_step": 0,
87
+ "model_type": "ddit",
88
+ "ddit_mlp_type": "gelu",
89
+ "elf_num_time_tokens": 4,
90
+ "elf_num_model_mode_tokens": 0,
91
+ "qk_norm": true,
92
+ "output_bias": false,
93
+ "output_init_std": -1.0,
94
+ "norm_type": "rmsnorm",
95
+ "target_loss": "hard_ce",
96
+ "linear_soft_target_power": 1.0,
97
+ "linear_soft_target_min_conf": 0.0,
98
+ "linear_soft_target_max_conf": 1.0,
99
+ "t_sampling_mode": "logit_normal",
100
+ "t_sampling_power": 1.0,
101
+ "t_sampling_eps": 0.0001,
102
+ "t_sampling_logit_mean": -1.5,
103
+ "t_sampling_logit_std": 0.8,
104
+ "dual_t": true,
105
+ "corrupt_t_mode": "same",
106
+ "corrupt_min_t": 0.0,
107
+ "corrupt_max_t": 1.0,
108
+ "prefix_block_prob": 0.0,
109
+ "prefix_block_len": 128,
110
+ "mask_ratio_floor_schedule": "none",
111
+ "dirichlet_endpoint_mode": "categorical_dual_t",
112
+ "dirichlet_semantic_t_mode": "same",
113
+ "dirichlet_semantic_t_value": 0.0,
114
+ "dirichlet_semantic_t_curve": "linear",
115
+ "dirichlet_semantic_t_power": 1.0,
116
+ "endpoint_sequence_random_prob_alpha": 0.0,
117
+ "categorical_wrong_from_full_vocab": true,
118
+ "categorical_wrong_from_batch_valid_tokens": false,
119
+ "categorical_wrong_basin_token_ids": "",
120
+ "categorical_wrong_basin_prob": 0.0,
121
+ "categorical_wrong_unigram_prob": 0.0,
122
+ "categorical_wrong_uniform_prob": 0.0,
123
+ "categorical_wrong_corpus_unigram_path": "",
124
+ "categorical_wrong_corpus_unigram_alpha": 1.0,
125
+ "categorical_wrong_basin_shared_prob": 0.0,
126
+ "categorical_wrong_unigram_shared_prob": 0.0,
127
+ "mask_mixture_original_prob": 0.0,
128
+ "mask_mixture_lowk_prob": 0.0,
129
+ "mask_mixture_lowcorrupt_prob": 0.0,
130
+ "mask_mixture_block_prob": 0.0,
131
+ "mask_mixture_all_prob": 1.0,
132
+ "mask_mixture_lowk_clean_tokens": "0",
133
+ "mask_mixture_lowcorrupt_tokens": "1,2,4,8,16,32,64",
134
+ "mask_mixture_block_tokens": "64,128",
135
+ "simplex_bridge_sampler": "logistic_normal_linear_mean",
136
+ "logistic_normal_sigma_min": 0.05,
137
+ "logistic_normal_sigma_max": 0.5,
138
+ "logistic_normal_tau_min": 1.0,
139
+ "logistic_normal_tau_max": 1.0,
140
+ "torch_compile": false,
141
+ "compile_mode": "max-autotune",
142
+ "state_format": "prob",
143
+ "meanflow_weight": 0.0,
144
+ "rollout_train_prob": 0.0,
145
+ "rollout_train_steps": 1,
146
+ "rollout_train_infer_steps": 64,
147
+ "rollout_train_temp": 1.45,
148
+ "rollout_train_max_gamma": 1.0,
149
+ "rollout_train_corrupt_only": true,
150
+ "rollout_train_samplewise": false,
151
+ "rollout_train_compute_always": false,
152
+ "bridge_noise_init": "logistic_normal",
153
+ "noise_sigma": -1.0,
154
+ "allow_tf32": true,
155
+ "activation_checkpointing": false,
156
+ "activation_checkpoint_interval": 1,
157
+ "activation_checkpoint_scope": "block",
158
+ "ddp_static_graph": false,
159
+ "ddp_gradient_as_bucket_view": true,
160
+ "blocking_data_transfer": false,
161
+ "dataloader_prefetch_factor": 4,
162
+ "full_train_stats": false,
163
+ "tokenized_hf": false,
164
+ "tokenized_pad_token": "pad",
165
+ "elf_conditional_hf": false,
166
+ "record_pad_truncate": false,
167
+ "record_add_eos": false,
168
+ "record_add_special_tokens": false,
169
+ "record_pad_token": "pad",
170
+ "record_shuffle_buffer": 10000,
171
+ "wrap": true,
172
+ "wrap_mode": "stream",
173
+ "wrap_record_buffer_size": 200,
174
+ "owt_cached_chunks": true,
175
+ "owt_chunk_cache_dir": "/e2e-data/evad-tech-vla/wanghan58/data/small_benchmarks/langflow_2604_11748/openwebtext_lta_cached_chunks/gpt2_len256_train8_compact_overfit",
176
+ "owt_chunk_cache_rebuild": false,
177
+ "owt_chunk_cache_write_batch": 4096,
178
+ "owt_exact_repeat_per_chunk": 64,
179
+ "online_chunk_shuffle": false,
180
+ "online_chunk_shuffle_buffer": 10000,
181
+ "openwebtext_split": "train_minus_100k",
182
+ "detokenizer": "auto",
183
+ "resolved_detokenizer": null,
184
+ "num_workers": 0,
185
+ "latest_every": 1000,
186
+ "resume_path": ""
187
+ }
188
+ step=100 epoch=100/1000 epoch_step=1/1 micro_steps=100 elapsed=4.5s lr=2.000000e-03 loss=6.6416 loss_recon=6.6416 loss_meanflow=0.0000 mean_model_t=0.2088 mean_corrupt_t=0.2088 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.1812 corrupt_frac=1.0000 acc_corrupt=0.1812 loss_corrupt=6.6416 wrong_frac=0.7916 init_acc_corrupt=0.2093 acc_corrupt_t_0p0_0p2=0.0970 corrupt_frac_t_0p0_0p2=0.5587 acc_corrupt_t_0p2_0p4=0.2433 corrupt_frac_t_0p2_0p4=0.3553 acc_corrupt_t_0p4_0p6=0.4524 corrupt_frac_t_0p4_0p6=0.0770 acc_corrupt_t_0p6_0p8=0.6374 corrupt_frac_t_0p6_0p8=0.0122 out_w_norm=1.2373 out_g_norm=0.9360 acc_corrupt_t_0p8_1p0=0.8027 corrupt_frac_t_0p8_1p0=0.0078 loss_all=6.3029 init_gold_top10=0.2177 init_gold_top100=0.2919
189
+ step=200 epoch=200/1000 epoch_step=1/1 micro_steps=200 elapsed=3.8s lr=2.000000e-03 loss=5.8451 loss_recon=5.8451 loss_meanflow=0.0000 mean_model_t=0.2067 mean_corrupt_t=0.2067 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.1702 corrupt_frac=1.0000 acc_corrupt=0.1702 loss_corrupt=5.8451 wrong_frac=0.7931 init_acc_corrupt=0.2077 acc_corrupt_t_0p0_0p2=0.1040 corrupt_frac_t_0p0_0p2=0.5635 acc_corrupt_t_0p2_0p4=0.2165 corrupt_frac_t_0p2_0p4=0.3538 acc_corrupt_t_0p4_0p6=0.4006 corrupt_frac_t_0p4_0p6=0.0742 acc_corrupt_t_0p6_0p8=0.6151 corrupt_frac_t_0p6_0p8=0.0127 out_w_norm=4.0542 out_g_norm=1.2907 acc_corrupt_t_0p8_1p0=0.7878 corrupt_frac_t_0p8_1p0=0.0078 loss_all=5.5064 init_gold_top10=0.1975 init_gold_top100=0.2742
190
+ step=300 epoch=300/1000 epoch_step=1/1 micro_steps=300 elapsed=3.8s lr=2.000000e-03 loss=5.0754 loss_recon=5.0754 loss_meanflow=0.0000 mean_model_t=0.2083 mean_corrupt_t=0.2083 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.1945 corrupt_frac=1.0000 acc_corrupt=0.1945 loss_corrupt=5.0754 wrong_frac=0.7916 init_acc_corrupt=0.2092 acc_corrupt_t_0p0_0p2=0.1171 corrupt_frac_t_0p0_0p2=0.5532 acc_corrupt_t_0p2_0p4=0.2511 corrupt_frac_t_0p2_0p4=0.3616 acc_corrupt_t_0p4_0p6=0.4385 corrupt_frac_t_0p4_0p6=0.0771 acc_corrupt_t_0p6_0p8=0.6358 corrupt_frac_t_0p6_0p8=0.0120 out_w_norm=6.6304 out_g_norm=0.7911 loss_all=4.7495 init_gold_top10=0.2067 init_gold_top100=0.2811
191
+ step=400 epoch=400/1000 epoch_step=1/1 micro_steps=400 elapsed=3.8s lr=2.000000e-03 loss=4.2417 loss_recon=4.2417 loss_meanflow=0.0000 mean_model_t=0.2074 mean_corrupt_t=0.2074 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.2396 corrupt_frac=1.0000 acc_corrupt=0.2396 loss_corrupt=4.2417 wrong_frac=0.7924 init_acc_corrupt=0.2084 acc_corrupt_t_0p0_0p2=0.1509 corrupt_frac_t_0p0_0p2=0.5611 acc_corrupt_t_0p2_0p4=0.3167 corrupt_frac_t_0p2_0p4=0.3568 acc_corrupt_t_0p4_0p6=0.4920 corrupt_frac_t_0p4_0p6=0.0734 out_w_norm=8.3769 out_g_norm=0.4660 acc_corrupt_t_0p6_0p8=0.6663 corrupt_frac_t_0p6_0p8=0.0122 loss_all=3.9329 init_gold_top10=0.1958 init_gold_top100=0.2700
192
+ step=500 epoch=500/1000 epoch_step=1/1 micro_steps=500 elapsed=3.8s lr=2.000000e-03 loss=3.2604 loss_recon=3.2604 loss_meanflow=0.0000 mean_model_t=0.2096 mean_corrupt_t=0.2096 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.2686 corrupt_frac=1.0000 acc_corrupt=0.2686 loss_corrupt=3.2604 wrong_frac=0.7902 init_acc_corrupt=0.2106 acc_corrupt_t_0p0_0p2=0.1795 corrupt_frac_t_0p0_0p2=0.5567 acc_corrupt_t_0p2_0p4=0.3418 corrupt_frac_t_0p2_0p4=0.3548 acc_corrupt_t_0p4_0p6=0.5201 corrupt_frac_t_0p4_0p6=0.0798 out_w_norm=9.7739 out_g_norm=0.4141 acc_corrupt_t_0p6_0p8=0.6828 corrupt_frac_t_0p6_0p8=0.0123 acc_corrupt_t_0p8_1p0=0.8711 corrupt_frac_t_0p8_1p0=0.0078 loss_all=2.7174 init_gold_top10=0.2355 init_gold_top100=0.3061
193
+ step=600 epoch=600/1000 epoch_step=1/1 micro_steps=600 elapsed=3.8s lr=2.000000e-03 loss=2.2324 loss_recon=2.2324 loss_meanflow=0.0000 mean_model_t=0.2084 mean_corrupt_t=0.2084 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.3783 corrupt_frac=1.0000 acc_corrupt=0.3783 loss_corrupt=2.2324 wrong_frac=0.7920 init_acc_corrupt=0.2088 acc_corrupt_t_0p0_0p2=0.2666 corrupt_frac_t_0p0_0p2=0.5574 acc_corrupt_t_0p2_0p4=0.4840 corrupt_frac_t_0p2_0p4=0.3566 acc_corrupt_t_0p4_0p6=0.6511 corrupt_frac_t_0p4_0p6=0.0777 acc_corrupt_t_0p6_0p8=0.7795 corrupt_frac_t_0p6_0p8=0.0123 out_w_norm=10.5161 out_g_norm=0.4345 acc_corrupt_t_0p8_1p0=0.8880 corrupt_frac_t_0p8_1p0=0.0078 loss_all=1.7136 init_gold_top10=0.2176 init_gold_top100=0.2910
194
+ step=700 epoch=700/1000 epoch_step=1/1 micro_steps=700 elapsed=3.8s lr=2.000000e-03 loss=1.1924 loss_recon=1.1924 loss_meanflow=0.0000 mean_model_t=0.2092 mean_corrupt_t=0.2092 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.6467 corrupt_frac=1.0000 acc_corrupt=0.6467 loss_corrupt=1.1924 wrong_frac=0.7910 init_acc_corrupt=0.2098 acc_corrupt_t_0p0_0p2=0.5041 corrupt_frac_t_0p0_0p2=0.5582 acc_corrupt_t_0p2_0p4=0.8081 corrupt_frac_t_0p2_0p4=0.3555 acc_corrupt_t_0p4_0p6=0.8997 corrupt_frac_t_0p4_0p6=0.0766 acc_corrupt_t_0p6_0p8=0.9358 corrupt_frac_t_0p6_0p8=0.0144 out_w_norm=11.0276 out_g_norm=0.5232 acc_corrupt_t_0p8_1p0=0.9854 corrupt_frac_t_0p8_1p0=0.0078 loss_all=0.7009 init_gold_top10=0.2223 init_gold_top100=0.2966
195
+ step=800 epoch=800/1000 epoch_step=1/1 micro_steps=800 elapsed=3.8s lr=2.000000e-03 loss=0.4993 loss_recon=0.4993 loss_meanflow=0.0000 mean_model_t=0.2108 mean_corrupt_t=0.2108 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.8635 corrupt_frac=1.0000 acc_corrupt=0.8635 loss_corrupt=0.4993 wrong_frac=0.7890 init_acc_corrupt=0.2118 acc_corrupt_t_0p0_0p2=0.7646 corrupt_frac_t_0p0_0p2=0.5512 acc_corrupt_t_0p2_0p4=0.9823 corrupt_frac_t_0p2_0p4=0.3605 acc_corrupt_t_0p4_0p6=0.9957 corrupt_frac_t_0p4_0p6=0.0805 out_w_norm=11.4562 out_g_norm=0.4135 acc_corrupt_t_0p6_0p8=0.9968 corrupt_frac_t_0p6_0p8=0.0126 acc_corrupt_t_0p8_1p0=1.0000 corrupt_frac_t_0p8_1p0=0.0078 loss_all=0.3302 init_gold_top10=0.2302 init_gold_top100=0.3015
196
+ step=900 epoch=900/1000 epoch_step=1/1 micro_steps=900 elapsed=3.8s lr=2.000000e-03 loss=0.2580 loss_recon=0.2580 loss_meanflow=0.0000 mean_model_t=0.2088 mean_corrupt_t=0.2088 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.9303 corrupt_frac=1.0000 acc_corrupt=0.9303 loss_corrupt=0.2580 wrong_frac=0.7914 init_acc_corrupt=0.2094 acc_corrupt_t_0p0_0p2=0.8760 corrupt_frac_t_0p0_0p2=0.5583 acc_corrupt_t_0p2_0p4=0.9987 corrupt_frac_t_0p2_0p4=0.3557 acc_corrupt_t_0p4_0p6=0.9997 corrupt_frac_t_0p4_0p6=0.0769 out_w_norm=11.7902 out_g_norm=0.3034 acc_corrupt_t_0p6_0p8=0.9991 corrupt_frac_t_0p6_0p8=0.0129 acc_corrupt_t_0p8_1p0=1.0000 corrupt_frac_t_0p8_1p0=0.0078 loss_all=0.2191 init_gold_top10=0.1997 init_gold_top100=0.2768
197
+ step=1000 epoch=1000/1000 epoch_step=1/1 micro_steps=1000 elapsed=3.8s lr=2.000000e-03 loss=0.1393 loss_recon=0.1393 loss_meanflow=0.0000 mean_model_t=0.2113 mean_corrupt_t=0.2113 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.9612 corrupt_frac=1.0000 acc_corrupt=0.9612 loss_corrupt=0.1393 wrong_frac=0.7887 init_acc_corrupt=0.2122 acc_corrupt_t_0p0_0p2=0.9298 corrupt_frac_t_0p0_0p2=0.5509 acc_corrupt_t_0p2_0p4=0.9997 corrupt_frac_t_0p2_0p4=0.3594 acc_corrupt_t_0p4_0p6=0.9999 corrupt_frac_t_0p4_0p6=0.0805 out_w_norm=11.9794 out_g_norm=0.2449 acc_corrupt_t_0p6_0p8=0.9995 corrupt_frac_t_0p6_0p8=0.0125 acc_corrupt_t_0p8_1p0=1.0000 corrupt_frac_t_0p8_1p0=0.0078 loss_all=0.0803 init_gold_top10=0.2300 init_gold_top100=0.3030
198
+ NCCL version 2.25.1+cuda12.8
199
+ resumed_from=runs/train8_noisegeo_len256_allcorrupt_logistic_sig0p05_0p5_20260517_163805/latest.pt start_step=1001
200
+ {
201
+ "device": "cuda:0",
202
+ "rank": 0,
203
+ "world_size": 4,
204
+ "samples": "owt_cached_chunks:8",
205
+ "vocab_size": 969,
206
+ "tokenizer_vocab_size": 50257,
207
+ "save_dir": "runs/train8_noisegeo_len256_allcorrupt_logistic_sig0p05_0p5_20260517_163805",
208
+ "batch_size": 128,
209
+ "grad_accum": 1,
210
+ "effective_batch_size": 512,
211
+ "global_batch_size": 512,
212
+ "lr_schedule": "constant_warmup",
213
+ "optimizer": "muon",
214
+ "epochs": 0.0,
215
+ "steps_per_epoch": 1,
216
+ "total_steps": 2000,
217
+ "warmup_steps": 10,
218
+ "warmup_epochs": -1.0,
219
+ "min_lr": 0.0,
220
+ "weight_decay": 0.1,
221
+ "output_weight_decay": -1.0,
222
+ "adamw_param_groups": "nanogpt",
223
+ "adam_beta1": 0.9,
224
+ "adam_beta2": 0.95,
225
+ "adam_eps": 1e-08,
226
+ "muon_impl": "legacy",
227
+ "muon_momentum": 0.95,
228
+ "muon_ns_steps": 5,
229
+ "muon_update_scale": 1.0,
230
+ "muon_nesterov": false,
231
+ "muon_width_scale": false,
232
+ "muon_grouping": "legacy_dim_ge_2",
233
+ "muon_param_count": 1965440,
234
+ "muon_adam_param_count": 8192,
235
+ "muon_param_names": [
236
+ "vocab_embed.embedding",
237
+ "sigma_map.net.0.weight",
238
+ "sigma_map.net.2.weight",
239
+ "blocks.0.attn_qkv.weight",
240
+ "blocks.0.attn_out.weight",
241
+ "blocks.0.mlp.0.weight",
242
+ "blocks.0.mlp.2.weight",
243
+ "blocks.0.adaLN_modulation.weight",
244
+ "blocks.1.attn_qkv.weight",
245
+ "blocks.1.attn_out.weight",
246
+ "blocks.1.mlp.0.weight",
247
+ "blocks.1.mlp.2.weight",
248
+ "blocks.1.adaLN_modulation.weight",
249
+ "blocks.2.attn_qkv.weight",
250
+ "blocks.2.attn_out.weight",
251
+ "blocks.2.mlp.0.weight",
252
+ "blocks.2.mlp.2.weight",
253
+ "blocks.2.adaLN_modulation.weight",
254
+ "output_layer.linear.weight",
255
+ "output_layer.adaLN_modulation.weight"
256
+ ],
257
+ "muon_adam_param_names": [
258
+ "sigma_map.net.0.bias",
259
+ "sigma_map.net.2.bias",
260
+ "blocks.0.norm1.weight",
261
+ "blocks.0.norm2.weight",
262
+ "blocks.0.mlp.0.bias",
263
+ "blocks.0.mlp.2.bias",
264
+ "blocks.0.adaLN_modulation.bias",
265
+ "blocks.1.norm1.weight",
266
+ "blocks.1.norm2.weight",
267
+ "blocks.1.mlp.0.bias",
268
+ "blocks.1.mlp.2.bias",
269
+ "blocks.1.adaLN_modulation.bias",
270
+ "blocks.2.norm1.weight",
271
+ "blocks.2.norm2.weight",
272
+ "blocks.2.mlp.0.bias",
273
+ "blocks.2.mlp.2.bias",
274
+ "blocks.2.adaLN_modulation.bias",
275
+ "output_layer.norm_final.weight",
276
+ "output_layer.adaLN_modulation.bias"
277
+ ],
278
+ "muon_effective_nesterov": false,
279
+ "muon_effective_width_scale": false,
280
+ "muon_effective_weight_decay": 0.1,
281
+ "muon_adam_fallback_nesterov": false,
282
+ "muon_adam_fallback_weight_decay": 0.1,
283
+ "ema_decay": 0.9999,
284
+ "ema_start_step": 0,
285
+ "model_type": "ddit",
286
+ "ddit_mlp_type": "gelu",
287
+ "elf_num_time_tokens": 4,
288
+ "elf_num_model_mode_tokens": 0,
289
+ "qk_norm": true,
290
+ "output_bias": false,
291
+ "output_init_std": -1.0,
292
+ "norm_type": "rmsnorm",
293
+ "target_loss": "hard_ce",
294
+ "linear_soft_target_power": 1.0,
295
+ "linear_soft_target_min_conf": 0.0,
296
+ "linear_soft_target_max_conf": 1.0,
297
+ "t_sampling_mode": "logit_normal",
298
+ "t_sampling_power": 1.0,
299
+ "t_sampling_eps": 0.0001,
300
+ "t_sampling_logit_mean": -1.5,
301
+ "t_sampling_logit_std": 0.8,
302
+ "dual_t": true,
303
+ "corrupt_t_mode": "same",
304
+ "corrupt_min_t": 0.0,
305
+ "corrupt_max_t": 1.0,
306
+ "prefix_block_prob": 0.0,
307
+ "prefix_block_len": 128,
308
+ "mask_ratio_floor_schedule": "none",
309
+ "dirichlet_endpoint_mode": "categorical_dual_t",
310
+ "dirichlet_semantic_t_mode": "same",
311
+ "dirichlet_semantic_t_value": 0.0,
312
+ "dirichlet_semantic_t_curve": "linear",
313
+ "dirichlet_semantic_t_power": 1.0,
314
+ "endpoint_sequence_random_prob_alpha": 0.0,
315
+ "categorical_wrong_from_full_vocab": true,
316
+ "categorical_wrong_from_batch_valid_tokens": false,
317
+ "categorical_wrong_basin_token_ids": "",
318
+ "categorical_wrong_basin_prob": 0.0,
319
+ "categorical_wrong_unigram_prob": 0.0,
320
+ "categorical_wrong_uniform_prob": 0.0,
321
+ "categorical_wrong_corpus_unigram_path": "",
322
+ "categorical_wrong_corpus_unigram_alpha": 1.0,
323
+ "categorical_wrong_basin_shared_prob": 0.0,
324
+ "categorical_wrong_unigram_shared_prob": 0.0,
325
+ "mask_mixture_original_prob": 0.0,
326
+ "mask_mixture_lowk_prob": 0.0,
327
+ "mask_mixture_lowcorrupt_prob": 0.0,
328
+ "mask_mixture_block_prob": 0.0,
329
+ "mask_mixture_all_prob": 1.0,
330
+ "mask_mixture_lowk_clean_tokens": "0",
331
+ "mask_mixture_lowcorrupt_tokens": "1,2,4,8,16,32,64",
332
+ "mask_mixture_block_tokens": "64,128",
333
+ "simplex_bridge_sampler": "logistic_normal_linear_mean",
334
+ "logistic_normal_sigma_min": 0.05,
335
+ "logistic_normal_sigma_max": 0.5,
336
+ "logistic_normal_tau_min": 1.0,
337
+ "logistic_normal_tau_max": 1.0,
338
+ "torch_compile": false,
339
+ "compile_mode": "max-autotune",
340
+ "state_format": "prob",
341
+ "meanflow_weight": 0.0,
342
+ "rollout_train_prob": 0.0,
343
+ "rollout_train_steps": 1,
344
+ "rollout_train_infer_steps": 64,
345
+ "rollout_train_temp": 1.45,
346
+ "rollout_train_max_gamma": 1.0,
347
+ "rollout_train_corrupt_only": true,
348
+ "rollout_train_samplewise": false,
349
+ "rollout_train_compute_always": false,
350
+ "bridge_noise_init": "logistic_normal",
351
+ "noise_sigma": -1.0,
352
+ "allow_tf32": true,
353
+ "activation_checkpointing": false,
354
+ "activation_checkpoint_interval": 1,
355
+ "activation_checkpoint_scope": "block",
356
+ "ddp_static_graph": false,
357
+ "ddp_gradient_as_bucket_view": true,
358
+ "blocking_data_transfer": false,
359
+ "dataloader_prefetch_factor": 4,
360
+ "full_train_stats": false,
361
+ "tokenized_hf": false,
362
+ "tokenized_pad_token": "pad",
363
+ "elf_conditional_hf": false,
364
+ "record_pad_truncate": false,
365
+ "record_add_eos": false,
366
+ "record_add_special_tokens": false,
367
+ "record_pad_token": "pad",
368
+ "record_shuffle_buffer": 10000,
369
+ "wrap": true,
370
+ "wrap_mode": "stream",
371
+ "wrap_record_buffer_size": 200,
372
+ "owt_cached_chunks": true,
373
+ "owt_chunk_cache_dir": "/e2e-data/evad-tech-vla/wanghan58/data/small_benchmarks/langflow_2604_11748/openwebtext_lta_cached_chunks/gpt2_len256_train8_compact_overfit",
374
+ "owt_chunk_cache_rebuild": false,
375
+ "owt_chunk_cache_write_batch": 4096,
376
+ "owt_exact_repeat_per_chunk": 64,
377
+ "online_chunk_shuffle": false,
378
+ "online_chunk_shuffle_buffer": 10000,
379
+ "openwebtext_split": "train_minus_100k",
380
+ "detokenizer": "auto",
381
+ "resolved_detokenizer": null,
382
+ "num_workers": 0,
383
+ "latest_every": 1000,
384
+ "resume_path": "runs/train8_noisegeo_len256_allcorrupt_logistic_sig0p05_0p5_20260517_163805/latest.pt"
385
+ }
386
+ step=1100 epoch=1100/2000 epoch_step=1/1 micro_steps=1100 elapsed=4.5s lr=2.000000e-03 loss=0.1004 loss_recon=0.1004 loss_meanflow=0.0000 mean_model_t=0.2088 mean_corrupt_t=0.2088 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.9707 corrupt_frac=1.0000 acc_corrupt=0.9707 loss_corrupt=0.1004 wrong_frac=0.7916 init_acc_corrupt=0.2093 acc_corrupt_t_0p0_0p2=0.9476 corrupt_frac_t_0p0_0p2=0.5587 acc_corrupt_t_0p2_0p4=0.9999 corrupt_frac_t_0p2_0p4=0.3553 acc_corrupt_t_0p4_0p6=1.0000 corrupt_frac_t_0p4_0p6=0.0770 acc_corrupt_t_0p6_0p8=0.9997 corrupt_frac_t_0p6_0p8=0.0122 out_w_norm=12.0726 out_g_norm=0.1836 acc_corrupt_t_0p8_1p0=0.9961 corrupt_frac_t_0p8_1p0=0.0078 loss_all=0.0791 init_gold_top10=0.2177 init_gold_top100=0.2919
387
+ step=1200 epoch=1200/2000 epoch_step=1/1 micro_steps=1200 elapsed=3.8s lr=2.000000e-03 loss=0.0856 loss_recon=0.0856 loss_meanflow=0.0000 mean_model_t=0.2067 mean_corrupt_t=0.2067 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.9749 corrupt_frac=1.0000 acc_corrupt=0.9749 loss_corrupt=0.0856 wrong_frac=0.7931 init_acc_corrupt=0.2077 acc_corrupt_t_0p0_0p2=0.9556 corrupt_frac_t_0p0_0p2=0.5635 acc_corrupt_t_0p2_0p4=0.9999 corrupt_frac_t_0p2_0p4=0.3538 acc_corrupt_t_0p4_0p6=0.9999 corrupt_frac_t_0p4_0p6=0.0742 acc_corrupt_t_0p6_0p8=0.9997 corrupt_frac_t_0p6_0p8=0.0127 out_w_norm=12.1000 out_g_norm=0.1557 acc_corrupt_t_0p8_1p0=1.0000 corrupt_frac_t_0p8_1p0=0.0078 loss_all=0.0377 init_gold_top10=0.1975 init_gold_top100=0.2742
388
+ step=1300 epoch=1300/2000 epoch_step=1/1 micro_steps=1300 elapsed=3.8s lr=2.000000e-03 loss=0.0679 loss_recon=0.0679 loss_meanflow=0.0000 mean_model_t=0.2083 mean_corrupt_t=0.2083 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.9793 corrupt_frac=1.0000 acc_corrupt=0.9793 loss_corrupt=0.0679 wrong_frac=0.7916 init_acc_corrupt=0.2092 acc_corrupt_t_0p0_0p2=0.9626 corrupt_frac_t_0p0_0p2=0.5532 acc_corrupt_t_0p2_0p4=1.0000 corrupt_frac_t_0p2_0p4=0.3616 acc_corrupt_t_0p4_0p6=0.9999 corrupt_frac_t_0p4_0p6=0.0771 acc_corrupt_t_0p6_0p8=0.9999 corrupt_frac_t_0p6_0p8=0.0120 out_w_norm=12.0992 out_g_norm=0.1191 loss_all=0.0899 init_gold_top10=0.2067 init_gold_top100=0.2811
389
+ step=1400 epoch=1400/2000 epoch_step=1/1 micro_steps=1400 elapsed=3.8s lr=2.000000e-03 loss=0.0679 loss_recon=0.0679 loss_meanflow=0.0000 mean_model_t=0.2074 mean_corrupt_t=0.2074 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.9794 corrupt_frac=1.0000 acc_corrupt=0.9794 loss_corrupt=0.0679 wrong_frac=0.7924 init_acc_corrupt=0.2084 acc_corrupt_t_0p0_0p2=0.9633 corrupt_frac_t_0p0_0p2=0.5611 acc_corrupt_t_0p2_0p4=1.0000 corrupt_frac_t_0p2_0p4=0.3568 acc_corrupt_t_0p4_0p6=1.0000 corrupt_frac_t_0p4_0p6=0.0734 out_w_norm=12.0845 out_g_norm=0.1117 acc_corrupt_t_0p6_0p8=0.9999 corrupt_frac_t_0p6_0p8=0.0122 loss_all=0.0877 init_gold_top10=0.1958 init_gold_top100=0.2700
390
+ step=1500 epoch=1500/2000 epoch_step=1/1 micro_steps=1500 elapsed=3.8s lr=2.000000e-03 loss=0.0575 loss_recon=0.0575 loss_meanflow=0.0000 mean_model_t=0.2096 mean_corrupt_t=0.2096 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.9828 corrupt_frac=1.0000 acc_corrupt=0.9828 loss_corrupt=0.0575 wrong_frac=0.7902 init_acc_corrupt=0.2106 acc_corrupt_t_0p0_0p2=0.9692 corrupt_frac_t_0p0_0p2=0.5567 acc_corrupt_t_0p2_0p4=1.0000 corrupt_frac_t_0p2_0p4=0.3548 acc_corrupt_t_0p4_0p6=1.0000 corrupt_frac_t_0p4_0p6=0.0798 out_w_norm=12.0797 out_g_norm=0.1056 acc_corrupt_t_0p6_0p8=0.9999 corrupt_frac_t_0p6_0p8=0.0123 acc_corrupt_t_0p8_1p0=1.0000 corrupt_frac_t_0p8_1p0=0.0078 loss_all=0.0339 init_gold_top10=0.2355 init_gold_top100=0.3061
391
+ step=1600 epoch=1600/2000 epoch_step=1/1 micro_steps=1600 elapsed=3.8s lr=2.000000e-03 loss=0.0564 loss_recon=0.0564 loss_meanflow=0.0000 mean_model_t=0.2084 mean_corrupt_t=0.2084 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.9829 corrupt_frac=1.0000 acc_corrupt=0.9829 loss_corrupt=0.0564 wrong_frac=0.7920 init_acc_corrupt=0.2088 acc_corrupt_t_0p0_0p2=0.9693 corrupt_frac_t_0p0_0p2=0.5574 acc_corrupt_t_0p2_0p4=1.0000 corrupt_frac_t_0p2_0p4=0.3566 acc_corrupt_t_0p4_0p6=1.0000 corrupt_frac_t_0p4_0p6=0.0777 acc_corrupt_t_0p6_0p8=0.9998 corrupt_frac_t_0p6_0p8=0.0123 out_w_norm=12.0764 out_g_norm=0.0942 acc_corrupt_t_0p8_1p0=1.0000 corrupt_frac_t_0p8_1p0=0.0078 loss_all=0.1145 init_gold_top10=0.2176 init_gold_top100=0.2910
392
+ step=1700 epoch=1700/2000 epoch_step=1/1 micro_steps=1700 elapsed=3.8s lr=2.000000e-03 loss=0.0459 loss_recon=0.0459 loss_meanflow=0.0000 mean_model_t=0.2092 mean_corrupt_t=0.2092 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.9860 corrupt_frac=1.0000 acc_corrupt=0.9860 loss_corrupt=0.0459 wrong_frac=0.7910 init_acc_corrupt=0.2098 acc_corrupt_t_0p0_0p2=0.9750 corrupt_frac_t_0p0_0p2=0.5582 acc_corrupt_t_0p2_0p4=1.0000 corrupt_frac_t_0p2_0p4=0.3555 acc_corrupt_t_0p4_0p6=1.0000 corrupt_frac_t_0p4_0p6=0.0766 acc_corrupt_t_0p6_0p8=0.9999 corrupt_frac_t_0p6_0p8=0.0144 out_w_norm=12.0780 out_g_norm=0.0834 acc_corrupt_t_0p8_1p0=1.0000 corrupt_frac_t_0p8_1p0=0.0078 loss_all=0.0271 init_gold_top10=0.2223 init_gold_top100=0.2966
393
+ step=1800 epoch=1800/2000 epoch_step=1/1 micro_steps=1800 elapsed=3.8s lr=2.000000e-03 loss=0.0491 loss_recon=0.0491 loss_meanflow=0.0000 mean_model_t=0.2108 mean_corrupt_t=0.2108 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.9850 corrupt_frac=1.0000 acc_corrupt=0.9850 loss_corrupt=0.0491 wrong_frac=0.7890 init_acc_corrupt=0.2118 acc_corrupt_t_0p0_0p2=0.9729 corrupt_frac_t_0p0_0p2=0.5512 acc_corrupt_t_0p2_0p4=1.0000 corrupt_frac_t_0p2_0p4=0.3605 acc_corrupt_t_0p4_0p6=1.0000 corrupt_frac_t_0p4_0p6=0.0805 out_w_norm=12.0707 out_g_norm=0.0727 acc_corrupt_t_0p6_0p8=1.0000 corrupt_frac_t_0p6_0p8=0.0126 acc_corrupt_t_0p8_1p0=1.0000 corrupt_frac_t_0p8_1p0=0.0078 loss_all=0.0427 init_gold_top10=0.2302 init_gold_top100=0.3015
394
+ step=1900 epoch=1900/2000 epoch_step=1/1 micro_steps=1900 elapsed=3.8s lr=2.000000e-03 loss=0.0457 loss_recon=0.0457 loss_meanflow=0.0000 mean_model_t=0.2088 mean_corrupt_t=0.2088 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.9862 corrupt_frac=1.0000 acc_corrupt=0.9862 loss_corrupt=0.0457 wrong_frac=0.7914 init_acc_corrupt=0.2094 acc_corrupt_t_0p0_0p2=0.9752 corrupt_frac_t_0p0_0p2=0.5583 acc_corrupt_t_0p2_0p4=1.0000 corrupt_frac_t_0p2_0p4=0.3557 acc_corrupt_t_0p4_0p6=1.0000 corrupt_frac_t_0p4_0p6=0.0769 out_w_norm=12.0792 out_g_norm=0.0672 acc_corrupt_t_0p6_0p8=0.9999 corrupt_frac_t_0p6_0p8=0.0129 acc_corrupt_t_0p8_1p0=1.0000 corrupt_frac_t_0p8_1p0=0.0078 loss_all=0.0452 init_gold_top10=0.1997 init_gold_top100=0.2768
395
+ step=2000 epoch=2000/2000 epoch_step=1/1 micro_steps=2000 elapsed=3.8s lr=2.000000e-03 loss=0.0360 loss_recon=0.0360 loss_meanflow=0.0000 mean_model_t=0.2113 mean_corrupt_t=0.2113 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.9890 corrupt_frac=1.0000 acc_corrupt=0.9890 loss_corrupt=0.0360 wrong_frac=0.7887 init_acc_corrupt=0.2122 acc_corrupt_t_0p0_0p2=0.9800 corrupt_frac_t_0p0_0p2=0.5509 acc_corrupt_t_0p2_0p4=1.0000 corrupt_frac_t_0p2_0p4=0.3594 acc_corrupt_t_0p4_0p6=1.0000 corrupt_frac_t_0p4_0p6=0.0805 out_w_norm=12.0823 out_g_norm=0.0653 acc_corrupt_t_0p6_0p8=1.0000 corrupt_frac_t_0p6_0p8=0.0125 acc_corrupt_t_0p8_1p0=0.9987 corrupt_frac_t_0p8_1p0=0.0078 loss_all=0.0224 init_gold_top10=0.2300 init_gold_top100=0.3030
396
+ NCCL version 2.25.1+cuda12.8
397
+ resumed_from=runs/train8_noisegeo_len256_allcorrupt_logistic_sig0p05_0p5_20260517_163805/latest.pt start_step=2001
398
+ {
399
+ "device": "cuda:0",
400
+ "rank": 0,
401
+ "world_size": 4,
402
+ "samples": "owt_cached_chunks:8",
403
+ "vocab_size": 969,
404
+ "tokenizer_vocab_size": 50257,
405
+ "save_dir": "runs/train8_noisegeo_len256_allcorrupt_logistic_sig0p05_0p5_20260517_163805",
406
+ "batch_size": 128,
407
+ "grad_accum": 1,
408
+ "effective_batch_size": 512,
409
+ "global_batch_size": 512,
410
+ "lr_schedule": "constant_warmup",
411
+ "optimizer": "muon",
412
+ "epochs": 0.0,
413
+ "steps_per_epoch": 1,
414
+ "total_steps": 3000,
415
+ "warmup_steps": 10,
416
+ "warmup_epochs": -1.0,
417
+ "min_lr": 0.0,
418
+ "weight_decay": 0.1,
419
+ "output_weight_decay": -1.0,
420
+ "adamw_param_groups": "nanogpt",
421
+ "adam_beta1": 0.9,
422
+ "adam_beta2": 0.95,
423
+ "adam_eps": 1e-08,
424
+ "muon_impl": "legacy",
425
+ "muon_momentum": 0.95,
426
+ "muon_ns_steps": 5,
427
+ "muon_update_scale": 1.0,
428
+ "muon_nesterov": false,
429
+ "muon_width_scale": false,
430
+ "muon_grouping": "legacy_dim_ge_2",
431
+ "muon_param_count": 1965440,
432
+ "muon_adam_param_count": 8192,
433
+ "muon_param_names": [
434
+ "vocab_embed.embedding",
435
+ "sigma_map.net.0.weight",
436
+ "sigma_map.net.2.weight",
437
+ "blocks.0.attn_qkv.weight",
438
+ "blocks.0.attn_out.weight",
439
+ "blocks.0.mlp.0.weight",
440
+ "blocks.0.mlp.2.weight",
441
+ "blocks.0.adaLN_modulation.weight",
442
+ "blocks.1.attn_qkv.weight",
443
+ "blocks.1.attn_out.weight",
444
+ "blocks.1.mlp.0.weight",
445
+ "blocks.1.mlp.2.weight",
446
+ "blocks.1.adaLN_modulation.weight",
447
+ "blocks.2.attn_qkv.weight",
448
+ "blocks.2.attn_out.weight",
449
+ "blocks.2.mlp.0.weight",
450
+ "blocks.2.mlp.2.weight",
451
+ "blocks.2.adaLN_modulation.weight",
452
+ "output_layer.linear.weight",
453
+ "output_layer.adaLN_modulation.weight"
454
+ ],
455
+ "muon_adam_param_names": [
456
+ "sigma_map.net.0.bias",
457
+ "sigma_map.net.2.bias",
458
+ "blocks.0.norm1.weight",
459
+ "blocks.0.norm2.weight",
460
+ "blocks.0.mlp.0.bias",
461
+ "blocks.0.mlp.2.bias",
462
+ "blocks.0.adaLN_modulation.bias",
463
+ "blocks.1.norm1.weight",
464
+ "blocks.1.norm2.weight",
465
+ "blocks.1.mlp.0.bias",
466
+ "blocks.1.mlp.2.bias",
467
+ "blocks.1.adaLN_modulation.bias",
468
+ "blocks.2.norm1.weight",
469
+ "blocks.2.norm2.weight",
470
+ "blocks.2.mlp.0.bias",
471
+ "blocks.2.mlp.2.bias",
472
+ "blocks.2.adaLN_modulation.bias",
473
+ "output_layer.norm_final.weight",
474
+ "output_layer.adaLN_modulation.bias"
475
+ ],
476
+ "muon_effective_nesterov": false,
477
+ "muon_effective_width_scale": false,
478
+ "muon_effective_weight_decay": 0.1,
479
+ "muon_adam_fallback_nesterov": false,
480
+ "muon_adam_fallback_weight_decay": 0.1,
481
+ "ema_decay": 0.9999,
482
+ "ema_start_step": 0,
483
+ "model_type": "ddit",
484
+ "ddit_mlp_type": "gelu",
485
+ "elf_num_time_tokens": 4,
486
+ "elf_num_model_mode_tokens": 0,
487
+ "qk_norm": true,
488
+ "output_bias": false,
489
+ "output_init_std": -1.0,
490
+ "norm_type": "rmsnorm",
491
+ "target_loss": "hard_ce",
492
+ "linear_soft_target_power": 1.0,
493
+ "linear_soft_target_min_conf": 0.0,
494
+ "linear_soft_target_max_conf": 1.0,
495
+ "t_sampling_mode": "logit_normal",
496
+ "t_sampling_power": 1.0,
497
+ "t_sampling_eps": 0.0001,
498
+ "t_sampling_logit_mean": -1.5,
499
+ "t_sampling_logit_std": 0.8,
500
+ "dual_t": true,
501
+ "corrupt_t_mode": "same",
502
+ "corrupt_min_t": 0.0,
503
+ "corrupt_max_t": 1.0,
504
+ "prefix_block_prob": 0.0,
505
+ "prefix_block_len": 128,
506
+ "mask_ratio_floor_schedule": "none",
507
+ "dirichlet_endpoint_mode": "categorical_dual_t",
508
+ "dirichlet_semantic_t_mode": "same",
509
+ "dirichlet_semantic_t_value": 0.0,
510
+ "dirichlet_semantic_t_curve": "linear",
511
+ "dirichlet_semantic_t_power": 1.0,
512
+ "endpoint_sequence_random_prob_alpha": 0.0,
513
+ "categorical_wrong_from_full_vocab": true,
514
+ "categorical_wrong_from_batch_valid_tokens": false,
515
+ "categorical_wrong_basin_token_ids": "",
516
+ "categorical_wrong_basin_prob": 0.0,
517
+ "categorical_wrong_unigram_prob": 0.0,
518
+ "categorical_wrong_uniform_prob": 0.0,
519
+ "categorical_wrong_corpus_unigram_path": "",
520
+ "categorical_wrong_corpus_unigram_alpha": 1.0,
521
+ "categorical_wrong_basin_shared_prob": 0.0,
522
+ "categorical_wrong_unigram_shared_prob": 0.0,
523
+ "mask_mixture_original_prob": 0.0,
524
+ "mask_mixture_lowk_prob": 0.0,
525
+ "mask_mixture_lowcorrupt_prob": 0.0,
526
+ "mask_mixture_block_prob": 0.0,
527
+ "mask_mixture_all_prob": 1.0,
528
+ "mask_mixture_lowk_clean_tokens": "0",
529
+ "mask_mixture_lowcorrupt_tokens": "1,2,4,8,16,32,64",
530
+ "mask_mixture_block_tokens": "64,128",
531
+ "simplex_bridge_sampler": "logistic_normal_linear_mean",
532
+ "logistic_normal_sigma_min": 0.05,
533
+ "logistic_normal_sigma_max": 0.5,
534
+ "logistic_normal_tau_min": 1.0,
535
+ "logistic_normal_tau_max": 1.0,
536
+ "torch_compile": false,
537
+ "compile_mode": "max-autotune",
538
+ "state_format": "prob",
539
+ "meanflow_weight": 0.0,
540
+ "rollout_train_prob": 0.0,
541
+ "rollout_train_steps": 1,
542
+ "rollout_train_infer_steps": 64,
543
+ "rollout_train_temp": 1.45,
544
+ "rollout_train_max_gamma": 1.0,
545
+ "rollout_train_corrupt_only": true,
546
+ "rollout_train_samplewise": false,
547
+ "rollout_train_compute_always": false,
548
+ "bridge_noise_init": "logistic_normal",
549
+ "noise_sigma": -1.0,
550
+ "allow_tf32": true,
551
+ "activation_checkpointing": false,
552
+ "activation_checkpoint_interval": 1,
553
+ "activation_checkpoint_scope": "block",
554
+ "ddp_static_graph": false,
555
+ "ddp_gradient_as_bucket_view": true,
556
+ "blocking_data_transfer": false,
557
+ "dataloader_prefetch_factor": 4,
558
+ "full_train_stats": false,
559
+ "tokenized_hf": false,
560
+ "tokenized_pad_token": "pad",
561
+ "elf_conditional_hf": false,
562
+ "record_pad_truncate": false,
563
+ "record_add_eos": false,
564
+ "record_add_special_tokens": false,
565
+ "record_pad_token": "pad",
566
+ "record_shuffle_buffer": 10000,
567
+ "wrap": true,
568
+ "wrap_mode": "stream",
569
+ "wrap_record_buffer_size": 200,
570
+ "owt_cached_chunks": true,
571
+ "owt_chunk_cache_dir": "/e2e-data/evad-tech-vla/wanghan58/data/small_benchmarks/langflow_2604_11748/openwebtext_lta_cached_chunks/gpt2_len256_train8_compact_overfit",
572
+ "owt_chunk_cache_rebuild": false,
573
+ "owt_chunk_cache_write_batch": 4096,
574
+ "owt_exact_repeat_per_chunk": 64,
575
+ "online_chunk_shuffle": false,
576
+ "online_chunk_shuffle_buffer": 10000,
577
+ "openwebtext_split": "train_minus_100k",
578
+ "detokenizer": "auto",
579
+ "resolved_detokenizer": null,
580
+ "num_workers": 0,
581
+ "latest_every": 1000,
582
+ "resume_path": "runs/train8_noisegeo_len256_allcorrupt_logistic_sig0p05_0p5_20260517_163805/latest.pt"
583
+ }
584
+ step=2100 epoch=2100/3000 epoch_step=1/1 micro_steps=2100 elapsed=4.5s lr=2.000000e-03 loss=0.0377 loss_recon=0.0377 loss_meanflow=0.0000 mean_model_t=0.2088 mean_corrupt_t=0.2088 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.9883 corrupt_frac=1.0000 acc_corrupt=0.9883 loss_corrupt=0.0377 wrong_frac=0.7916 init_acc_corrupt=0.2093 acc_corrupt_t_0p0_0p2=0.9791 corrupt_frac_t_0p0_0p2=0.5587 acc_corrupt_t_0p2_0p4=1.0000 corrupt_frac_t_0p2_0p4=0.3553 acc_corrupt_t_0p4_0p6=1.0000 corrupt_frac_t_0p4_0p6=0.0770 acc_corrupt_t_0p6_0p8=1.0000 corrupt_frac_t_0p6_0p8=0.0122 out_w_norm=12.0951 out_g_norm=0.0622 acc_corrupt_t_0p8_1p0=1.0000 corrupt_frac_t_0p8_1p0=0.0078 loss_all=0.0344 init_gold_top10=0.2177 init_gold_top100=0.2919
585
+ step=2200 epoch=2200/3000 epoch_step=1/1 micro_steps=2200 elapsed=4.0s lr=2.000000e-03 loss=0.0402 loss_recon=0.0402 loss_meanflow=0.0000 mean_model_t=0.2067 mean_corrupt_t=0.2067 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.9881 corrupt_frac=1.0000 acc_corrupt=0.9881 loss_corrupt=0.0402 wrong_frac=0.7931 init_acc_corrupt=0.2077 acc_corrupt_t_0p0_0p2=0.9788 corrupt_frac_t_0p0_0p2=0.5635 acc_corrupt_t_0p2_0p4=1.0000 corrupt_frac_t_0p2_0p4=0.3538 acc_corrupt_t_0p4_0p6=1.0000 corrupt_frac_t_0p4_0p6=0.0742 acc_corrupt_t_0p6_0p8=0.9999 corrupt_frac_t_0p6_0p8=0.0127 out_w_norm=12.1114 out_g_norm=0.0591 acc_corrupt_t_0p8_1p0=1.0000 corrupt_frac_t_0p8_1p0=0.0078 loss_all=0.0402 init_gold_top10=0.1975 init_gold_top100=0.2742
586
+ step=2300 epoch=2300/3000 epoch_step=1/1 micro_steps=2300 elapsed=3.9s lr=2.000000e-03 loss=0.0340 loss_recon=0.0340 loss_meanflow=0.0000 mean_model_t=0.2083 mean_corrupt_t=0.2083 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.9892 corrupt_frac=1.0000 acc_corrupt=0.9892 loss_corrupt=0.0340 wrong_frac=0.7916 init_acc_corrupt=0.2092 acc_corrupt_t_0p0_0p2=0.9805 corrupt_frac_t_0p0_0p2=0.5532 acc_corrupt_t_0p2_0p4=1.0000 corrupt_frac_t_0p2_0p4=0.3616 acc_corrupt_t_0p4_0p6=1.0000 corrupt_frac_t_0p4_0p6=0.0771 acc_corrupt_t_0p6_0p8=0.9999 corrupt_frac_t_0p6_0p8=0.0120 out_w_norm=12.1281 out_g_norm=0.0558 loss_all=0.0586 init_gold_top10=0.2067 init_gold_top100=0.2811
587
+ step=2400 epoch=2400/3000 epoch_step=1/1 micro_steps=2400 elapsed=3.9s lr=2.000000e-03 loss=0.0344 loss_recon=0.0344 loss_meanflow=0.0000 mean_model_t=0.2074 mean_corrupt_t=0.2074 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.9894 corrupt_frac=1.0000 acc_corrupt=0.9894 loss_corrupt=0.0344 wrong_frac=0.7924 init_acc_corrupt=0.2084 acc_corrupt_t_0p0_0p2=0.9811 corrupt_frac_t_0p0_0p2=0.5611 acc_corrupt_t_0p2_0p4=1.0000 corrupt_frac_t_0p2_0p4=0.3568 acc_corrupt_t_0p4_0p6=1.0000 corrupt_frac_t_0p4_0p6=0.0734 out_w_norm=12.1489 out_g_norm=0.0537 acc_corrupt_t_0p6_0p8=1.0000 corrupt_frac_t_0p6_0p8=0.0122 loss_all=0.0615 init_gold_top10=0.1958 init_gold_top100=0.2700
588
+ step=2500 epoch=2500/3000 epoch_step=1/1 micro_steps=2500 elapsed=3.9s lr=2.000000e-03 loss=0.0312 loss_recon=0.0312 loss_meanflow=0.0000 mean_model_t=0.2096 mean_corrupt_t=0.2096 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.9905 corrupt_frac=1.0000 acc_corrupt=0.9905 loss_corrupt=0.0312 wrong_frac=0.7902 init_acc_corrupt=0.2106 acc_corrupt_t_0p0_0p2=0.9830 corrupt_frac_t_0p0_0p2=0.5567 acc_corrupt_t_0p2_0p4=1.0000 corrupt_frac_t_0p2_0p4=0.3548 acc_corrupt_t_0p4_0p6=1.0000 corrupt_frac_t_0p4_0p6=0.0798 out_w_norm=12.1605 out_g_norm=0.0507 acc_corrupt_t_0p6_0p8=1.0000 corrupt_frac_t_0p6_0p8=0.0123 acc_corrupt_t_0p8_1p0=1.0000 corrupt_frac_t_0p8_1p0=0.0078 loss_all=0.0138 init_gold_top10=0.2355 init_gold_top100=0.3061
589
+ step=2600 epoch=2600/3000 epoch_step=1/1 micro_steps=2600 elapsed=3.9s lr=2.000000e-03 loss=0.0313 loss_recon=0.0313 loss_meanflow=0.0000 mean_model_t=0.2084 mean_corrupt_t=0.2084 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.9904 corrupt_frac=1.0000 acc_corrupt=0.9904 loss_corrupt=0.0313 wrong_frac=0.7920 init_acc_corrupt=0.2088 acc_corrupt_t_0p0_0p2=0.9828 corrupt_frac_t_0p0_0p2=0.5574 acc_corrupt_t_0p2_0p4=1.0000 corrupt_frac_t_0p2_0p4=0.3566 acc_corrupt_t_0p4_0p6=1.0000 corrupt_frac_t_0p4_0p6=0.0777 acc_corrupt_t_0p6_0p8=1.0000 corrupt_frac_t_0p6_0p8=0.0123 out_w_norm=12.1766 out_g_norm=0.0494 acc_corrupt_t_0p8_1p0=1.0000 corrupt_frac_t_0p8_1p0=0.0078 loss_all=0.0815 init_gold_top10=0.2176 init_gold_top100=0.2910
590
+ step=2700 epoch=2700/3000 epoch_step=1/1 micro_steps=2700 elapsed=3.9s lr=2.000000e-03 loss=0.0249 loss_recon=0.0249 loss_meanflow=0.0000 mean_model_t=0.2092 mean_corrupt_t=0.2092 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.9922 corrupt_frac=1.0000 acc_corrupt=0.9922 loss_corrupt=0.0249 wrong_frac=0.7910 init_acc_corrupt=0.2098 acc_corrupt_t_0p0_0p2=0.9860 corrupt_frac_t_0p0_0p2=0.5582 acc_corrupt_t_0p2_0p4=1.0000 corrupt_frac_t_0p2_0p4=0.3555 acc_corrupt_t_0p4_0p6=1.0000 corrupt_frac_t_0p4_0p6=0.0766 acc_corrupt_t_0p6_0p8=1.0000 corrupt_frac_t_0p6_0p8=0.0144 out_w_norm=12.1834 out_g_norm=0.0451 acc_corrupt_t_0p8_1p0=1.0000 corrupt_frac_t_0p8_1p0=0.0078 loss_all=0.0019 init_gold_top10=0.2223 init_gold_top100=0.2966
591
+ step=2800 epoch=2800/3000 epoch_step=1/1 micro_steps=2800 elapsed=3.9s lr=2.000000e-03 loss=0.0284 loss_recon=0.0284 loss_meanflow=0.0000 mean_model_t=0.2108 mean_corrupt_t=0.2108 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.9912 corrupt_frac=1.0000 acc_corrupt=0.9912 loss_corrupt=0.0284 wrong_frac=0.7890 init_acc_corrupt=0.2118 acc_corrupt_t_0p0_0p2=0.9841 corrupt_frac_t_0p0_0p2=0.5512 acc_corrupt_t_0p2_0p4=1.0000 corrupt_frac_t_0p2_0p4=0.3605 acc_corrupt_t_0p4_0p6=1.0000 corrupt_frac_t_0p4_0p6=0.0805 out_w_norm=12.2032 out_g_norm=0.0412 acc_corrupt_t_0p6_0p8=1.0000 corrupt_frac_t_0p6_0p8=0.0126 acc_corrupt_t_0p8_1p0=1.0000 corrupt_frac_t_0p8_1p0=0.0078 loss_all=0.0442 init_gold_top10=0.2302 init_gold_top100=0.3015
592
+ step=2900 epoch=2900/3000 epoch_step=1/1 micro_steps=2900 elapsed=3.9s lr=2.000000e-03 loss=0.0276 loss_recon=0.0276 loss_meanflow=0.0000 mean_model_t=0.2088 mean_corrupt_t=0.2088 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.9916 corrupt_frac=1.0000 acc_corrupt=0.9916 loss_corrupt=0.0276 wrong_frac=0.7914 init_acc_corrupt=0.2094 acc_corrupt_t_0p0_0p2=0.9849 corrupt_frac_t_0p0_0p2=0.5583 acc_corrupt_t_0p2_0p4=1.0000 corrupt_frac_t_0p2_0p4=0.3557 acc_corrupt_t_0p4_0p6=1.0000 corrupt_frac_t_0p4_0p6=0.0769 out_w_norm=12.2219 out_g_norm=0.0415 acc_corrupt_t_0p6_0p8=1.0000 corrupt_frac_t_0p6_0p8=0.0129 acc_corrupt_t_0p8_1p0=1.0000 corrupt_frac_t_0p8_1p0=0.0078 loss_all=0.0432 init_gold_top10=0.1997 init_gold_top100=0.2768
593
+ step=3000 epoch=3000/3000 epoch_step=1/1 micro_steps=3000 elapsed=3.9s lr=2.000000e-03 loss=0.0224 loss_recon=0.0224 loss_meanflow=0.0000 mean_model_t=0.2113 mean_corrupt_t=0.2113 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.9929 corrupt_frac=1.0000 acc_corrupt=0.9929 loss_corrupt=0.0224 wrong_frac=0.7887 init_acc_corrupt=0.2122 acc_corrupt_t_0p0_0p2=0.9872 corrupt_frac_t_0p0_0p2=0.5509 acc_corrupt_t_0p2_0p4=1.0000 corrupt_frac_t_0p2_0p4=0.3594 acc_corrupt_t_0p4_0p6=1.0000 corrupt_frac_t_0p4_0p6=0.0805 out_w_norm=12.2261 out_g_norm=0.0409 acc_corrupt_t_0p6_0p8=1.0000 corrupt_frac_t_0p6_0p8=0.0125 acc_corrupt_t_0p8_1p0=1.0000 corrupt_frac_t_0p8_1p0=0.0078 loss_all=0.0203 init_gold_top10=0.2300 init_gold_top100=0.3030
594
+ NCCL version 2.25.1+cuda12.8
595
+ resumed_from=runs/train8_noisegeo_len256_allcorrupt_logistic_sig0p05_0p5_20260517_163805/latest.pt start_step=3001
596
+ {
597
+ "device": "cuda:0",
598
+ "rank": 0,
599
+ "world_size": 4,
600
+ "samples": "owt_cached_chunks:8",
601
+ "vocab_size": 969,
602
+ "tokenizer_vocab_size": 50257,
603
+ "save_dir": "runs/train8_noisegeo_len256_allcorrupt_logistic_sig0p05_0p5_20260517_163805",
604
+ "batch_size": 128,
605
+ "grad_accum": 1,
606
+ "effective_batch_size": 512,
607
+ "global_batch_size": 512,
608
+ "lr_schedule": "constant_warmup",
609
+ "optimizer": "muon",
610
+ "epochs": 0.0,
611
+ "steps_per_epoch": 1,
612
+ "total_steps": 4000,
613
+ "warmup_steps": 10,
614
+ "warmup_epochs": -1.0,
615
+ "min_lr": 0.0,
616
+ "weight_decay": 0.1,
617
+ "output_weight_decay": -1.0,
618
+ "adamw_param_groups": "nanogpt",
619
+ "adam_beta1": 0.9,
620
+ "adam_beta2": 0.95,
621
+ "adam_eps": 1e-08,
622
+ "muon_impl": "legacy",
623
+ "muon_momentum": 0.95,
624
+ "muon_ns_steps": 5,
625
+ "muon_update_scale": 1.0,
626
+ "muon_nesterov": false,
627
+ "muon_width_scale": false,
628
+ "muon_grouping": "legacy_dim_ge_2",
629
+ "muon_param_count": 1965440,
630
+ "muon_adam_param_count": 8192,
631
+ "muon_param_names": [
632
+ "vocab_embed.embedding",
633
+ "sigma_map.net.0.weight",
634
+ "sigma_map.net.2.weight",
635
+ "blocks.0.attn_qkv.weight",
636
+ "blocks.0.attn_out.weight",
637
+ "blocks.0.mlp.0.weight",
638
+ "blocks.0.mlp.2.weight",
639
+ "blocks.0.adaLN_modulation.weight",
640
+ "blocks.1.attn_qkv.weight",
641
+ "blocks.1.attn_out.weight",
642
+ "blocks.1.mlp.0.weight",
643
+ "blocks.1.mlp.2.weight",
644
+ "blocks.1.adaLN_modulation.weight",
645
+ "blocks.2.attn_qkv.weight",
646
+ "blocks.2.attn_out.weight",
647
+ "blocks.2.mlp.0.weight",
648
+ "blocks.2.mlp.2.weight",
649
+ "blocks.2.adaLN_modulation.weight",
650
+ "output_layer.linear.weight",
651
+ "output_layer.adaLN_modulation.weight"
652
+ ],
653
+ "muon_adam_param_names": [
654
+ "sigma_map.net.0.bias",
655
+ "sigma_map.net.2.bias",
656
+ "blocks.0.norm1.weight",
657
+ "blocks.0.norm2.weight",
658
+ "blocks.0.mlp.0.bias",
659
+ "blocks.0.mlp.2.bias",
660
+ "blocks.0.adaLN_modulation.bias",
661
+ "blocks.1.norm1.weight",
662
+ "blocks.1.norm2.weight",
663
+ "blocks.1.mlp.0.bias",
664
+ "blocks.1.mlp.2.bias",
665
+ "blocks.1.adaLN_modulation.bias",
666
+ "blocks.2.norm1.weight",
667
+ "blocks.2.norm2.weight",
668
+ "blocks.2.mlp.0.bias",
669
+ "blocks.2.mlp.2.bias",
670
+ "blocks.2.adaLN_modulation.bias",
671
+ "output_layer.norm_final.weight",
672
+ "output_layer.adaLN_modulation.bias"
673
+ ],
674
+ "muon_effective_nesterov": false,
675
+ "muon_effective_width_scale": false,
676
+ "muon_effective_weight_decay": 0.1,
677
+ "muon_adam_fallback_nesterov": false,
678
+ "muon_adam_fallback_weight_decay": 0.1,
679
+ "ema_decay": 0.9999,
680
+ "ema_start_step": 0,
681
+ "model_type": "ddit",
682
+ "ddit_mlp_type": "gelu",
683
+ "elf_num_time_tokens": 4,
684
+ "elf_num_model_mode_tokens": 0,
685
+ "qk_norm": true,
686
+ "output_bias": false,
687
+ "output_init_std": -1.0,
688
+ "norm_type": "rmsnorm",
689
+ "target_loss": "hard_ce",
690
+ "linear_soft_target_power": 1.0,
691
+ "linear_soft_target_min_conf": 0.0,
692
+ "linear_soft_target_max_conf": 1.0,
693
+ "t_sampling_mode": "logit_normal",
694
+ "t_sampling_power": 1.0,
695
+ "t_sampling_eps": 0.0001,
696
+ "t_sampling_logit_mean": -1.5,
697
+ "t_sampling_logit_std": 0.8,
698
+ "dual_t": true,
699
+ "corrupt_t_mode": "same",
700
+ "corrupt_min_t": 0.0,
701
+ "corrupt_max_t": 1.0,
702
+ "prefix_block_prob": 0.0,
703
+ "prefix_block_len": 128,
704
+ "mask_ratio_floor_schedule": "none",
705
+ "dirichlet_endpoint_mode": "categorical_dual_t",
706
+ "dirichlet_semantic_t_mode": "same",
707
+ "dirichlet_semantic_t_value": 0.0,
708
+ "dirichlet_semantic_t_curve": "linear",
709
+ "dirichlet_semantic_t_power": 1.0,
710
+ "endpoint_sequence_random_prob_alpha": 0.0,
711
+ "categorical_wrong_from_full_vocab": true,
712
+ "categorical_wrong_from_batch_valid_tokens": false,
713
+ "categorical_wrong_basin_token_ids": "",
714
+ "categorical_wrong_basin_prob": 0.0,
715
+ "categorical_wrong_unigram_prob": 0.0,
716
+ "categorical_wrong_uniform_prob": 0.0,
717
+ "categorical_wrong_corpus_unigram_path": "",
718
+ "categorical_wrong_corpus_unigram_alpha": 1.0,
719
+ "categorical_wrong_basin_shared_prob": 0.0,
720
+ "categorical_wrong_unigram_shared_prob": 0.0,
721
+ "mask_mixture_original_prob": 0.0,
722
+ "mask_mixture_lowk_prob": 0.0,
723
+ "mask_mixture_lowcorrupt_prob": 0.0,
724
+ "mask_mixture_block_prob": 0.0,
725
+ "mask_mixture_all_prob": 1.0,
726
+ "mask_mixture_lowk_clean_tokens": "0",
727
+ "mask_mixture_lowcorrupt_tokens": "1,2,4,8,16,32,64",
728
+ "mask_mixture_block_tokens": "64,128",
729
+ "simplex_bridge_sampler": "logistic_normal_linear_mean",
730
+ "logistic_normal_sigma_min": 0.05,
731
+ "logistic_normal_sigma_max": 0.5,
732
+ "logistic_normal_tau_min": 1.0,
733
+ "logistic_normal_tau_max": 1.0,
734
+ "torch_compile": false,
735
+ "compile_mode": "max-autotune",
736
+ "state_format": "prob",
737
+ "meanflow_weight": 0.0,
738
+ "rollout_train_prob": 0.0,
739
+ "rollout_train_steps": 1,
740
+ "rollout_train_infer_steps": 64,
741
+ "rollout_train_temp": 1.45,
742
+ "rollout_train_max_gamma": 1.0,
743
+ "rollout_train_corrupt_only": true,
744
+ "rollout_train_samplewise": false,
745
+ "rollout_train_compute_always": false,
746
+ "bridge_noise_init": "logistic_normal",
747
+ "noise_sigma": -1.0,
748
+ "allow_tf32": true,
749
+ "activation_checkpointing": false,
750
+ "activation_checkpoint_interval": 1,
751
+ "activation_checkpoint_scope": "block",
752
+ "ddp_static_graph": false,
753
+ "ddp_gradient_as_bucket_view": true,
754
+ "blocking_data_transfer": false,
755
+ "dataloader_prefetch_factor": 4,
756
+ "full_train_stats": false,
757
+ "tokenized_hf": false,
758
+ "tokenized_pad_token": "pad",
759
+ "elf_conditional_hf": false,
760
+ "record_pad_truncate": false,
761
+ "record_add_eos": false,
762
+ "record_add_special_tokens": false,
763
+ "record_pad_token": "pad",
764
+ "record_shuffle_buffer": 10000,
765
+ "wrap": true,
766
+ "wrap_mode": "stream",
767
+ "wrap_record_buffer_size": 200,
768
+ "owt_cached_chunks": true,
769
+ "owt_chunk_cache_dir": "/e2e-data/evad-tech-vla/wanghan58/data/small_benchmarks/langflow_2604_11748/openwebtext_lta_cached_chunks/gpt2_len256_train8_compact_overfit",
770
+ "owt_chunk_cache_rebuild": false,
771
+ "owt_chunk_cache_write_batch": 4096,
772
+ "owt_exact_repeat_per_chunk": 64,
773
+ "online_chunk_shuffle": false,
774
+ "online_chunk_shuffle_buffer": 10000,
775
+ "openwebtext_split": "train_minus_100k",
776
+ "detokenizer": "auto",
777
+ "resolved_detokenizer": null,
778
+ "num_workers": 0,
779
+ "latest_every": 1000,
780
+ "resume_path": "runs/train8_noisegeo_len256_allcorrupt_logistic_sig0p05_0p5_20260517_163805/latest.pt"
781
+ }
782
+ step=3100 epoch=3100/4000 epoch_step=1/1 micro_steps=3100 elapsed=4.4s lr=2.000000e-03 loss=0.0224 loss_recon=0.0224 loss_meanflow=0.0000 mean_model_t=0.2088 mean_corrupt_t=0.2088 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.9929 corrupt_frac=1.0000 acc_corrupt=0.9929 loss_corrupt=0.0224 wrong_frac=0.7916 init_acc_corrupt=0.2093 acc_corrupt_t_0p0_0p2=0.9873 corrupt_frac_t_0p0_0p2=0.5587 acc_corrupt_t_0p2_0p4=1.0000 corrupt_frac_t_0p2_0p4=0.3553 acc_corrupt_t_0p4_0p6=1.0000 corrupt_frac_t_0p4_0p6=0.0770 acc_corrupt_t_0p6_0p8=1.0000 corrupt_frac_t_0p6_0p8=0.0122 out_w_norm=12.2369 out_g_norm=0.0382 acc_corrupt_t_0p8_1p0=1.0000 corrupt_frac_t_0p8_1p0=0.0078 loss_all=0.0017 init_gold_top10=0.2177 init_gold_top100=0.2919
783
+ step=3200 epoch=3200/4000 epoch_step=1/1 micro_steps=3200 elapsed=4.0s lr=2.000000e-03 loss=0.0248 loss_recon=0.0248 loss_meanflow=0.0000 mean_model_t=0.2067 mean_corrupt_t=0.2067 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.9927 corrupt_frac=1.0000 acc_corrupt=0.9927 loss_corrupt=0.0248 wrong_frac=0.7931 init_acc_corrupt=0.2077 acc_corrupt_t_0p0_0p2=0.9870 corrupt_frac_t_0p0_0p2=0.5635 acc_corrupt_t_0p2_0p4=1.0000 corrupt_frac_t_0p2_0p4=0.3538 acc_corrupt_t_0p4_0p6=1.0000 corrupt_frac_t_0p4_0p6=0.0742 acc_corrupt_t_0p6_0p8=0.9999 corrupt_frac_t_0p6_0p8=0.0127 out_w_norm=12.2663 out_g_norm=0.0392 acc_corrupt_t_0p8_1p0=1.0000 corrupt_frac_t_0p8_1p0=0.0078 loss_all=0.0048 init_gold_top10=0.1975 init_gold_top100=0.2742
784
+ step=3300 epoch=3300/4000 epoch_step=1/1 micro_steps=3300 elapsed=4.0s lr=2.000000e-03 loss=0.0220 loss_recon=0.0220 loss_meanflow=0.0000 mean_model_t=0.2083 mean_corrupt_t=0.2083 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.9929 corrupt_frac=1.0000 acc_corrupt=0.9929 loss_corrupt=0.0220 wrong_frac=0.7916 init_acc_corrupt=0.2092 acc_corrupt_t_0p0_0p2=0.9872 corrupt_frac_t_0p0_0p2=0.5532 acc_corrupt_t_0p2_0p4=1.0000 corrupt_frac_t_0p2_0p4=0.3616 acc_corrupt_t_0p4_0p6=1.0000 corrupt_frac_t_0p4_0p6=0.0771 acc_corrupt_t_0p6_0p8=1.0000 corrupt_frac_t_0p6_0p8=0.0120 out_w_norm=12.2826 out_g_norm=0.0395 loss_all=0.0236 init_gold_top10=0.2067 init_gold_top100=0.2811
785
+ step=3400 epoch=3400/4000 epoch_step=1/1 micro_steps=3400 elapsed=3.9s lr=2.000000e-03 loss=0.0221 loss_recon=0.0221 loss_meanflow=0.0000 mean_model_t=0.2074 mean_corrupt_t=0.2074 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.9931 corrupt_frac=1.0000 acc_corrupt=0.9931 loss_corrupt=0.0221 wrong_frac=0.7924 init_acc_corrupt=0.2084 acc_corrupt_t_0p0_0p2=0.9878 corrupt_frac_t_0p0_0p2=0.5611 acc_corrupt_t_0p2_0p4=1.0000 corrupt_frac_t_0p2_0p4=0.3568 acc_corrupt_t_0p4_0p6=1.0000 corrupt_frac_t_0p4_0p6=0.0734 out_w_norm=12.2881 out_g_norm=0.0364 acc_corrupt_t_0p6_0p8=0.9999 corrupt_frac_t_0p6_0p8=0.0122 loss_all=0.0122 init_gold_top10=0.1958 init_gold_top100=0.2700
786
+ step=3500 epoch=3500/4000 epoch_step=1/1 micro_steps=3500 elapsed=3.9s lr=2.000000e-03 loss=0.0195 loss_recon=0.0195 loss_meanflow=0.0000 mean_model_t=0.2096 mean_corrupt_t=0.2096 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.9940 corrupt_frac=1.0000 acc_corrupt=0.9940 loss_corrupt=0.0195 wrong_frac=0.7902 init_acc_corrupt=0.2106 acc_corrupt_t_0p0_0p2=0.9892 corrupt_frac_t_0p0_0p2=0.5567 acc_corrupt_t_0p2_0p4=1.0000 corrupt_frac_t_0p2_0p4=0.3548 acc_corrupt_t_0p4_0p6=1.0000 corrupt_frac_t_0p4_0p6=0.0798 out_w_norm=12.2898 out_g_norm=0.0367 acc_corrupt_t_0p6_0p8=1.0000 corrupt_frac_t_0p6_0p8=0.0123 acc_corrupt_t_0p8_1p0=1.0000 corrupt_frac_t_0p8_1p0=0.0078 loss_all=0.0037 init_gold_top10=0.2355 init_gold_top100=0.3061
787
+ step=3600 epoch=3600/4000 epoch_step=1/1 micro_steps=3600 elapsed=3.9s lr=2.000000e-03 loss=0.0201 loss_recon=0.0201 loss_meanflow=0.0000 mean_model_t=0.2084 mean_corrupt_t=0.2084 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.9938 corrupt_frac=1.0000 acc_corrupt=0.9938 loss_corrupt=0.0201 wrong_frac=0.7920 init_acc_corrupt=0.2088 acc_corrupt_t_0p0_0p2=0.9889 corrupt_frac_t_0p0_0p2=0.5574 acc_corrupt_t_0p2_0p4=1.0000 corrupt_frac_t_0p2_0p4=0.3566 acc_corrupt_t_0p4_0p6=1.0000 corrupt_frac_t_0p4_0p6=0.0777 acc_corrupt_t_0p6_0p8=1.0000 corrupt_frac_t_0p6_0p8=0.0123 out_w_norm=12.2942 out_g_norm=0.0362 acc_corrupt_t_0p8_1p0=1.0000 corrupt_frac_t_0p8_1p0=0.0078 loss_all=0.0312 init_gold_top10=0.2176 init_gold_top100=0.2910
788
+ step=3700 epoch=3700/4000 epoch_step=1/1 micro_steps=3700 elapsed=3.9s lr=2.000000e-03 loss=0.0160 loss_recon=0.0160 loss_meanflow=0.0000 mean_model_t=0.2092 mean_corrupt_t=0.2092 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.9951 corrupt_frac=1.0000 acc_corrupt=0.9951 loss_corrupt=0.0160 wrong_frac=0.7910 init_acc_corrupt=0.2098 acc_corrupt_t_0p0_0p2=0.9912 corrupt_frac_t_0p0_0p2=0.5582 acc_corrupt_t_0p2_0p4=1.0000 corrupt_frac_t_0p2_0p4=0.3555 acc_corrupt_t_0p4_0p6=1.0000 corrupt_frac_t_0p4_0p6=0.0766 acc_corrupt_t_0p6_0p8=1.0000 corrupt_frac_t_0p6_0p8=0.0144 out_w_norm=12.2805 out_g_norm=0.0321 acc_corrupt_t_0p8_1p0=1.0000 corrupt_frac_t_0p8_1p0=0.0078 loss_all=0.0018 init_gold_top10=0.2223 init_gold_top100=0.2966
789
+ step=3800 epoch=3800/4000 epoch_step=1/1 micro_steps=3800 elapsed=3.9s lr=2.000000e-03 loss=0.0190 loss_recon=0.0190 loss_meanflow=0.0000 mean_model_t=0.2108 mean_corrupt_t=0.2108 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.9940 corrupt_frac=1.0000 acc_corrupt=0.9940 loss_corrupt=0.0190 wrong_frac=0.7890 init_acc_corrupt=0.2118 acc_corrupt_t_0p0_0p2=0.9891 corrupt_frac_t_0p0_0p2=0.5512 acc_corrupt_t_0p2_0p4=1.0000 corrupt_frac_t_0p2_0p4=0.3605 acc_corrupt_t_0p4_0p6=1.0000 corrupt_frac_t_0p4_0p6=0.0805 out_w_norm=12.2694 out_g_norm=0.0318 acc_corrupt_t_0p6_0p8=1.0000 corrupt_frac_t_0p6_0p8=0.0126 acc_corrupt_t_0p8_1p0=1.0000 corrupt_frac_t_0p8_1p0=0.0078 loss_all=0.0473 init_gold_top10=0.2302 init_gold_top100=0.3015
790
+ step=3900 epoch=3900/4000 epoch_step=1/1 micro_steps=3900 elapsed=3.9s lr=2.000000e-03 loss=0.0213 loss_recon=0.0213 loss_meanflow=0.0000 mean_model_t=0.2088 mean_corrupt_t=0.2088 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.9935 corrupt_frac=1.0000 acc_corrupt=0.9935 loss_corrupt=0.0213 wrong_frac=0.7914 init_acc_corrupt=0.2094 acc_corrupt_t_0p0_0p2=0.9883 corrupt_frac_t_0p0_0p2=0.5583 acc_corrupt_t_0p2_0p4=1.0000 corrupt_frac_t_0p2_0p4=0.3557 acc_corrupt_t_0p4_0p6=1.0000 corrupt_frac_t_0p4_0p6=0.0769 out_w_norm=12.2660 out_g_norm=0.0325 acc_corrupt_t_0p6_0p8=0.9999 corrupt_frac_t_0p6_0p8=0.0129 acc_corrupt_t_0p8_1p0=1.0000 corrupt_frac_t_0p8_1p0=0.0078 loss_all=0.0110 init_gold_top10=0.1997 init_gold_top100=0.2768
791
+ step=4000 epoch=4000/4000 epoch_step=1/1 micro_steps=4000 elapsed=3.9s lr=2.000000e-03 loss=0.0161 loss_recon=0.0161 loss_meanflow=0.0000 mean_model_t=0.2113 mean_corrupt_t=0.2113 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.9951 corrupt_frac=1.0000 acc_corrupt=0.9951 loss_corrupt=0.0161 wrong_frac=0.7887 init_acc_corrupt=0.2122 acc_corrupt_t_0p0_0p2=0.9910 corrupt_frac_t_0p0_0p2=0.5509 acc_corrupt_t_0p2_0p4=1.0000 corrupt_frac_t_0p2_0p4=0.3594 acc_corrupt_t_0p4_0p6=1.0000 corrupt_frac_t_0p4_0p6=0.0805 out_w_norm=12.2661 out_g_norm=0.0345 acc_corrupt_t_0p6_0p8=0.9999 corrupt_frac_t_0p6_0p8=0.0125 acc_corrupt_t_0p8_1p0=1.0000 corrupt_frac_t_0p8_1p0=0.0078 loss_all=0.0059 init_gold_top10=0.2300 init_gold_top100=0.3030
LTA_openwebtext_dualt/logs/softendpoint_mn_pilot_4gpu/train8_noisegeo_len256_allcorrupt_unigram_shared0p5_20260517_163805.log ADDED
@@ -0,0 +1,634 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ NCCL version 2.25.1+cuda12.8
2
+ {
3
+ "device": "cuda:0",
4
+ "rank": 0,
5
+ "world_size": 4,
6
+ "samples": "owt_cached_chunks:8",
7
+ "vocab_size": 969,
8
+ "tokenizer_vocab_size": 50257,
9
+ "save_dir": "runs/train8_noisegeo_len256_allcorrupt_unigram_shared0p5_20260517_163805",
10
+ "batch_size": 128,
11
+ "grad_accum": 1,
12
+ "effective_batch_size": 512,
13
+ "global_batch_size": 512,
14
+ "lr_schedule": "constant_warmup",
15
+ "optimizer": "muon",
16
+ "epochs": 0.0,
17
+ "steps_per_epoch": 1,
18
+ "total_steps": 1000,
19
+ "warmup_steps": 10,
20
+ "warmup_epochs": -1.0,
21
+ "min_lr": 0.0,
22
+ "weight_decay": 0.1,
23
+ "output_weight_decay": -1.0,
24
+ "adamw_param_groups": "nanogpt",
25
+ "adam_beta1": 0.9,
26
+ "adam_beta2": 0.95,
27
+ "adam_eps": 1e-08,
28
+ "muon_impl": "legacy",
29
+ "muon_momentum": 0.95,
30
+ "muon_ns_steps": 5,
31
+ "muon_update_scale": 1.0,
32
+ "muon_nesterov": false,
33
+ "muon_width_scale": false,
34
+ "muon_grouping": "legacy_dim_ge_2",
35
+ "muon_param_count": 1965440,
36
+ "muon_adam_param_count": 8192,
37
+ "muon_param_names": [
38
+ "vocab_embed.embedding",
39
+ "sigma_map.net.0.weight",
40
+ "sigma_map.net.2.weight",
41
+ "blocks.0.attn_qkv.weight",
42
+ "blocks.0.attn_out.weight",
43
+ "blocks.0.mlp.0.weight",
44
+ "blocks.0.mlp.2.weight",
45
+ "blocks.0.adaLN_modulation.weight",
46
+ "blocks.1.attn_qkv.weight",
47
+ "blocks.1.attn_out.weight",
48
+ "blocks.1.mlp.0.weight",
49
+ "blocks.1.mlp.2.weight",
50
+ "blocks.1.adaLN_modulation.weight",
51
+ "blocks.2.attn_qkv.weight",
52
+ "blocks.2.attn_out.weight",
53
+ "blocks.2.mlp.0.weight",
54
+ "blocks.2.mlp.2.weight",
55
+ "blocks.2.adaLN_modulation.weight",
56
+ "output_layer.linear.weight",
57
+ "output_layer.adaLN_modulation.weight"
58
+ ],
59
+ "muon_adam_param_names": [
60
+ "sigma_map.net.0.bias",
61
+ "sigma_map.net.2.bias",
62
+ "blocks.0.norm1.weight",
63
+ "blocks.0.norm2.weight",
64
+ "blocks.0.mlp.0.bias",
65
+ "blocks.0.mlp.2.bias",
66
+ "blocks.0.adaLN_modulation.bias",
67
+ "blocks.1.norm1.weight",
68
+ "blocks.1.norm2.weight",
69
+ "blocks.1.mlp.0.bias",
70
+ "blocks.1.mlp.2.bias",
71
+ "blocks.1.adaLN_modulation.bias",
72
+ "blocks.2.norm1.weight",
73
+ "blocks.2.norm2.weight",
74
+ "blocks.2.mlp.0.bias",
75
+ "blocks.2.mlp.2.bias",
76
+ "blocks.2.adaLN_modulation.bias",
77
+ "output_layer.norm_final.weight",
78
+ "output_layer.adaLN_modulation.bias"
79
+ ],
80
+ "muon_effective_nesterov": false,
81
+ "muon_effective_width_scale": false,
82
+ "muon_effective_weight_decay": 0.1,
83
+ "muon_adam_fallback_nesterov": false,
84
+ "muon_adam_fallback_weight_decay": 0.1,
85
+ "ema_decay": 0.9999,
86
+ "ema_start_step": 0,
87
+ "model_type": "ddit",
88
+ "ddit_mlp_type": "gelu",
89
+ "elf_num_time_tokens": 4,
90
+ "elf_num_model_mode_tokens": 0,
91
+ "qk_norm": true,
92
+ "output_bias": false,
93
+ "output_init_std": -1.0,
94
+ "norm_type": "rmsnorm",
95
+ "target_loss": "hard_ce",
96
+ "linear_soft_target_power": 1.0,
97
+ "linear_soft_target_min_conf": 0.0,
98
+ "linear_soft_target_max_conf": 1.0,
99
+ "t_sampling_mode": "logit_normal",
100
+ "t_sampling_power": 1.0,
101
+ "t_sampling_eps": 0.0001,
102
+ "t_sampling_logit_mean": -1.5,
103
+ "t_sampling_logit_std": 0.8,
104
+ "dual_t": true,
105
+ "corrupt_t_mode": "same",
106
+ "corrupt_min_t": 0.0,
107
+ "corrupt_max_t": 1.0,
108
+ "prefix_block_prob": 0.0,
109
+ "prefix_block_len": 128,
110
+ "mask_ratio_floor_schedule": "none",
111
+ "dirichlet_endpoint_mode": "categorical_dual_t",
112
+ "dirichlet_semantic_t_mode": "same",
113
+ "dirichlet_semantic_t_value": 0.0,
114
+ "dirichlet_semantic_t_curve": "linear",
115
+ "dirichlet_semantic_t_power": 1.0,
116
+ "endpoint_sequence_random_prob_alpha": 0.0,
117
+ "categorical_wrong_from_full_vocab": true,
118
+ "categorical_wrong_from_batch_valid_tokens": false,
119
+ "categorical_wrong_basin_token_ids": "",
120
+ "categorical_wrong_basin_prob": 0.0,
121
+ "categorical_wrong_unigram_prob": 1.0,
122
+ "categorical_wrong_uniform_prob": 0.0,
123
+ "categorical_wrong_corpus_unigram_path": "",
124
+ "categorical_wrong_corpus_unigram_alpha": 1.0,
125
+ "categorical_wrong_basin_shared_prob": 0.0,
126
+ "categorical_wrong_unigram_shared_prob": 0.5,
127
+ "mask_mixture_original_prob": 0.0,
128
+ "mask_mixture_lowk_prob": 0.0,
129
+ "mask_mixture_lowcorrupt_prob": 0.0,
130
+ "mask_mixture_block_prob": 0.0,
131
+ "mask_mixture_all_prob": 1.0,
132
+ "mask_mixture_lowk_clean_tokens": "0",
133
+ "mask_mixture_lowcorrupt_tokens": "1,2,4,8,16,32,64",
134
+ "mask_mixture_block_tokens": "64,128",
135
+ "simplex_bridge_sampler": "dirichlet",
136
+ "logistic_normal_sigma_min": 0.1,
137
+ "logistic_normal_sigma_max": 1.0,
138
+ "logistic_normal_tau_min": 1.0,
139
+ "logistic_normal_tau_max": 1.0,
140
+ "torch_compile": false,
141
+ "compile_mode": "max-autotune",
142
+ "state_format": "prob",
143
+ "meanflow_weight": 0.0,
144
+ "rollout_train_prob": 0.0,
145
+ "rollout_train_steps": 1,
146
+ "rollout_train_infer_steps": 64,
147
+ "rollout_train_temp": 1.45,
148
+ "rollout_train_max_gamma": 1.0,
149
+ "rollout_train_corrupt_only": true,
150
+ "rollout_train_samplewise": false,
151
+ "rollout_train_compute_always": false,
152
+ "bridge_noise_init": "logistic_normal",
153
+ "noise_sigma": -1.0,
154
+ "allow_tf32": true,
155
+ "activation_checkpointing": false,
156
+ "activation_checkpoint_interval": 1,
157
+ "activation_checkpoint_scope": "block",
158
+ "ddp_static_graph": false,
159
+ "ddp_gradient_as_bucket_view": true,
160
+ "blocking_data_transfer": false,
161
+ "dataloader_prefetch_factor": 4,
162
+ "full_train_stats": false,
163
+ "tokenized_hf": false,
164
+ "tokenized_pad_token": "pad",
165
+ "elf_conditional_hf": false,
166
+ "record_pad_truncate": false,
167
+ "record_add_eos": false,
168
+ "record_add_special_tokens": false,
169
+ "record_pad_token": "pad",
170
+ "record_shuffle_buffer": 10000,
171
+ "wrap": true,
172
+ "wrap_mode": "stream",
173
+ "wrap_record_buffer_size": 200,
174
+ "owt_cached_chunks": true,
175
+ "owt_chunk_cache_dir": "/e2e-data/evad-tech-vla/wanghan58/data/small_benchmarks/langflow_2604_11748/openwebtext_lta_cached_chunks/gpt2_len256_train8_compact_overfit",
176
+ "owt_chunk_cache_rebuild": false,
177
+ "owt_chunk_cache_write_batch": 4096,
178
+ "owt_exact_repeat_per_chunk": 64,
179
+ "online_chunk_shuffle": false,
180
+ "online_chunk_shuffle_buffer": 10000,
181
+ "openwebtext_split": "train_minus_100k",
182
+ "detokenizer": "auto",
183
+ "resolved_detokenizer": null,
184
+ "num_workers": 0,
185
+ "latest_every": 1000,
186
+ "resume_path": ""
187
+ }
188
+ step=100 epoch=100/1000 epoch_step=1/1 micro_steps=100 elapsed=4.5s lr=2.000000e-03 loss=6.7171 loss_recon=6.7171 loss_meanflow=0.0000 mean_model_t=0.2078 mean_corrupt_t=0.2078 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.0942 corrupt_frac=1.0000 acc_corrupt=0.0942 loss_corrupt=6.7171 wrong_frac=0.7922 init_acc_corrupt=0.1148 acc_corrupt_t_0p0_0p2=0.0455 corrupt_frac_t_0p0_0p2=0.5620 acc_corrupt_t_0p2_0p4=0.1254 corrupt_frac_t_0p2_0p4=0.3533 acc_corrupt_t_0p4_0p6=0.2713 corrupt_frac_t_0p4_0p6=0.0755 acc_corrupt_t_0p6_0p8=0.4107 corrupt_frac_t_0p6_0p8=0.0117 out_w_norm=1.0887 out_g_norm=1.0092 acc_corrupt_t_0p8_1p0=0.8945 corrupt_frac_t_0p8_1p0=0.0078 loss_all=6.4630 init_gold_top10=0.2117 init_gold_top100=0.4247
189
+ step=200 epoch=200/1000 epoch_step=1/1 micro_steps=200 elapsed=3.8s lr=2.000000e-03 loss=6.1879 loss_recon=6.1879 loss_meanflow=0.0000 mean_model_t=0.2081 mean_corrupt_t=0.2081 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.1016 corrupt_frac=1.0000 acc_corrupt=0.1016 loss_corrupt=6.1879 wrong_frac=0.7916 init_acc_corrupt=0.1160 acc_corrupt_t_0p0_0p2=0.0495 corrupt_frac_t_0p0_0p2=0.5577 acc_corrupt_t_0p2_0p4=0.1388 corrupt_frac_t_0p2_0p4=0.3599 acc_corrupt_t_0p4_0p6=0.2810 corrupt_frac_t_0p4_0p6=0.0744 acc_corrupt_t_0p6_0p8=0.3941 corrupt_frac_t_0p6_0p8=0.0126 acc_corrupt_t_0p8_1p0=0.4678 corrupt_frac_t_0p8_1p0=0.0078 out_w_norm=3.3136 out_g_norm=1.4051 loss_all=5.9306 init_gold_top10=0.2016 init_gold_top100=0.4192
190
+ step=300 epoch=300/1000 epoch_step=1/1 micro_steps=300 elapsed=3.8s lr=2.000000e-03 loss=5.7129 loss_recon=5.7129 loss_meanflow=0.0000 mean_model_t=0.2105 mean_corrupt_t=0.2105 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.1216 corrupt_frac=1.0000 acc_corrupt=0.1216 loss_corrupt=5.7129 wrong_frac=0.7894 init_acc_corrupt=0.1187 acc_corrupt_t_0p0_0p2=0.0537 corrupt_frac_t_0p0_0p2=0.5542 acc_corrupt_t_0p2_0p4=0.1615 corrupt_frac_t_0p2_0p4=0.3559 acc_corrupt_t_0p4_0p6=0.3620 corrupt_frac_t_0p4_0p6=0.0813 acc_corrupt_t_0p6_0p8=0.5778 corrupt_frac_t_0p6_0p8=0.0125 out_w_norm=5.3362 out_g_norm=0.6855 acc_corrupt_t_0p8_1p0=0.5938 corrupt_frac_t_0p8_1p0=0.0078 loss_all=5.4734 init_gold_top10=0.2133 init_gold_top100=0.4237
191
+ step=400 epoch=400/1000 epoch_step=1/1 micro_steps=400 elapsed=3.8s lr=2.000000e-03 loss=5.3493 loss_recon=5.3493 loss_meanflow=0.0000 mean_model_t=0.2085 mean_corrupt_t=0.2085 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.1453 corrupt_frac=1.0000 acc_corrupt=0.1453 loss_corrupt=5.3493 wrong_frac=0.7915 init_acc_corrupt=0.1161 acc_corrupt_t_0p0_0p2=0.0574 corrupt_frac_t_0p0_0p2=0.5597 acc_corrupt_t_0p2_0p4=0.2021 corrupt_frac_t_0p2_0p4=0.3559 acc_corrupt_t_0p4_0p6=0.4675 corrupt_frac_t_0p4_0p6=0.0752 acc_corrupt_t_0p6_0p8=0.6597 corrupt_frac_t_0p6_0p8=0.0131 out_w_norm=7.0676 out_g_norm=0.3587 acc_corrupt_t_0p8_1p0=0.7930 corrupt_frac_t_0p8_1p0=0.0078 loss_all=5.0547 init_gold_top10=0.2146 init_gold_top100=0.4290
192
+ step=500 epoch=500/1000 epoch_step=1/1 micro_steps=500 elapsed=3.8s lr=2.000000e-03 loss=4.9052 loss_recon=4.9052 loss_meanflow=0.0000 mean_model_t=0.2084 mean_corrupt_t=0.2084 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.1578 corrupt_frac=1.0000 acc_corrupt=0.1578 loss_corrupt=4.9052 wrong_frac=0.7914 init_acc_corrupt=0.1160 acc_corrupt_t_0p0_0p2=0.0608 corrupt_frac_t_0p0_0p2=0.5616 acc_corrupt_t_0p2_0p4=0.2300 corrupt_frac_t_0p2_0p4=0.3550 acc_corrupt_t_0p4_0p6=0.4843 corrupt_frac_t_0p4_0p6=0.0741 acc_corrupt_t_0p6_0p8=0.6602 corrupt_frac_t_0p6_0p8=0.0143 out_w_norm=8.6464 out_g_norm=0.4603 loss_all=4.6661 init_gold_top10=0.1906 init_gold_top100=0.4129
193
+ step=600 epoch=600/1000 epoch_step=1/1 micro_steps=600 elapsed=3.8s lr=2.000000e-03 loss=4.3633 loss_recon=4.3633 loss_meanflow=0.0000 mean_model_t=0.2084 mean_corrupt_t=0.2084 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.1713 corrupt_frac=1.0000 acc_corrupt=0.1713 loss_corrupt=4.3633 wrong_frac=0.7917 init_acc_corrupt=0.1154 acc_corrupt_t_0p0_0p2=0.0670 corrupt_frac_t_0p0_0p2=0.5568 acc_corrupt_t_0p2_0p4=0.2545 corrupt_frac_t_0p2_0p4=0.3606 acc_corrupt_t_0p4_0p6=0.4927 corrupt_frac_t_0p4_0p6=0.0742 acc_corrupt_t_0p6_0p8=0.6751 corrupt_frac_t_0p6_0p8=0.0123 out_w_norm=9.6855 out_g_norm=0.4965 loss_all=4.2113 init_gold_top10=0.1772 init_gold_top100=0.4130
194
+ step=700 epoch=700/1000 epoch_step=1/1 micro_steps=700 elapsed=3.8s lr=2.000000e-03 loss=3.8335 loss_recon=3.8335 loss_meanflow=0.0000 mean_model_t=0.2079 mean_corrupt_t=0.2079 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.1847 corrupt_frac=1.0000 acc_corrupt=0.1847 loss_corrupt=3.8335 wrong_frac=0.7925 init_acc_corrupt=0.1147 acc_corrupt_t_0p0_0p2=0.0766 corrupt_frac_t_0p0_0p2=0.5607 acc_corrupt_t_0p2_0p4=0.2754 corrupt_frac_t_0p2_0p4=0.3568 acc_corrupt_t_0p4_0p6=0.5085 corrupt_frac_t_0p4_0p6=0.0739 acc_corrupt_t_0p6_0p8=0.6886 corrupt_frac_t_0p6_0p8=0.0132 out_w_norm=10.2232 out_g_norm=0.6982 loss_all=3.5264 init_gold_top10=0.2137 init_gold_top100=0.4244
195
+ step=800 epoch=800/1000 epoch_step=1/1 micro_steps=800 elapsed=3.8s lr=2.000000e-03 loss=3.3860 loss_recon=3.3860 loss_meanflow=0.0000 mean_model_t=0.2099 mean_corrupt_t=0.2099 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.2059 corrupt_frac=1.0000 acc_corrupt=0.2059 loss_corrupt=3.3860 wrong_frac=0.7904 init_acc_corrupt=0.1168 acc_corrupt_t_0p0_0p2=0.0875 corrupt_frac_t_0p0_0p2=0.5505 acc_corrupt_t_0p2_0p4=0.3057 corrupt_frac_t_0p2_0p4=0.3649 acc_corrupt_t_0p4_0p6=0.5299 corrupt_frac_t_0p4_0p6=0.0765 out_w_norm=10.5877 out_g_norm=0.9937 acc_corrupt_t_0p6_0p8=0.6989 corrupt_frac_t_0p6_0p8=0.0123 acc_corrupt_t_0p8_1p0=0.8398 corrupt_frac_t_0p8_1p0=0.0078 loss_all=3.1965 init_gold_top10=0.1935 init_gold_top100=0.4118
196
+ step=900 epoch=900/1000 epoch_step=1/1 micro_steps=900 elapsed=3.8s lr=2.000000e-03 loss=3.0038 loss_recon=3.0038 loss_meanflow=0.0000 mean_model_t=0.2102 mean_corrupt_t=0.2102 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.2384 corrupt_frac=1.0000 acc_corrupt=0.2384 loss_corrupt=3.0038 wrong_frac=0.7901 init_acc_corrupt=0.1169 acc_corrupt_t_0p0_0p2=0.1048 corrupt_frac_t_0p0_0p2=0.5513 acc_corrupt_t_0p2_0p4=0.3572 corrupt_frac_t_0p2_0p4=0.3645 acc_corrupt_t_0p4_0p6=0.5844 corrupt_frac_t_0p4_0p6=0.0755 out_w_norm=10.8302 out_g_norm=1.2164 acc_corrupt_t_0p6_0p8=0.7286 corrupt_frac_t_0p6_0p8=0.0127 acc_corrupt_t_0p8_1p0=0.8242 corrupt_frac_t_0p8_1p0=0.0078 loss_all=2.9094 init_gold_top10=0.1910 init_gold_top100=0.4159
197
+ step=1000 epoch=1000/1000 epoch_step=1/1 micro_steps=1000 elapsed=3.8s lr=2.000000e-03 loss=2.6625 loss_recon=2.6625 loss_meanflow=0.0000 mean_model_t=0.2101 mean_corrupt_t=0.2101 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.2880 corrupt_frac=1.0000 acc_corrupt=0.2880 loss_corrupt=2.6625 wrong_frac=0.7897 init_acc_corrupt=0.1182 acc_corrupt_t_0p0_0p2=0.1255 corrupt_frac_t_0p0_0p2=0.5591 acc_corrupt_t_0p2_0p4=0.4443 corrupt_frac_t_0p2_0p4=0.3526 acc_corrupt_t_0p4_0p6=0.6811 corrupt_frac_t_0p4_0p6=0.0791 acc_corrupt_t_0p8_1p0=0.8555 corrupt_frac_t_0p8_1p0=0.0078 out_w_norm=10.9865 out_g_norm=1.5042 acc_corrupt_t_0p6_0p8=0.7904 corrupt_frac_t_0p6_0p8=0.0133 loss_all=2.5068 init_gold_top10=0.1951 init_gold_top100=0.4188
198
+ Traceback (most recent call last):
199
+ File "<frozen runpy>", line 198, in _run_module_as_main
200
+ File "<frozen runpy>", line 88, in _run_code
201
+ File "/usr/local/lib/python3.12/dist-packages/torch/distributed/run.py", line 922, in <module>
202
+ main()
203
+ File "/usr/local/lib/python3.12/dist-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 355, in wrapper
204
+ return f(*args, **kwargs)
205
+ ^^^^^^^^^^^^^^^^^^
206
+ File "/usr/local/lib/python3.12/dist-packages/torch/distributed/run.py", line 918, in main
207
+ run(args)
208
+ File "/usr/local/lib/python3.12/dist-packages/torch/distributed/run.py", line 909, in run
209
+ elastic_launch(
210
+ File "/usr/local/lib/python3.12/dist-packages/torch/distributed/launcher/api.py", line 139, in __call__
211
+ return launch_agent(self._config, self._entrypoint, list(args))
212
+ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
213
+ File "/usr/local/lib/python3.12/dist-packages/torch/distributed/launcher/api.py", line 261, in launch_agent
214
+ result = agent.run()
215
+ ^^^^^^^^^^^
216
+ File "/usr/local/lib/python3.12/dist-packages/torch/distributed/elastic/metrics/api.py", line 137, in wrapper
217
+ result = f(*args, **kwargs)
218
+ ^^^^^^^^^^^^^^^^^^
219
+ File "/usr/local/lib/python3.12/dist-packages/torch/distributed/elastic/agent/server/api.py", line 711, in run
220
+ result = self._invoke_run(role)
221
+ ^^^^^^^^^^^^^^^^^^^^^^
222
+ File "/usr/local/lib/python3.12/dist-packages/torch/distributed/elastic/agent/server/api.py", line 864, in _invoke_run
223
+ self._initialize_workers(self._worker_group)
224
+ File "/usr/local/lib/python3.12/dist-packages/torch/distributed/elastic/metrics/api.py", line 137, in wrapper
225
+ result = f(*args, **kwargs)
226
+ ^^^^^^^^^^^^^^^^^^
227
+ File "/usr/local/lib/python3.12/dist-packages/torch/distributed/elastic/agent/server/api.py", line 683, in _initialize_workers
228
+ self._rendezvous(worker_group)
229
+ File "/usr/local/lib/python3.12/dist-packages/torch/distributed/elastic/metrics/api.py", line 137, in wrapper
230
+ result = f(*args, **kwargs)
231
+ ^^^^^^^^^^^^^^^^^^
232
+ File "/usr/local/lib/python3.12/dist-packages/torch/distributed/elastic/agent/server/api.py", line 500, in _rendezvous
233
+ rdzv_info = spec.rdzv_handler.next_rendezvous()
234
+ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
235
+ File "/usr/local/lib/python3.12/dist-packages/torch/distributed/elastic/rendezvous/static_tcp_rendezvous.py", line 67, in next_rendezvous
236
+ self._store = TCPStore( # type: ignore[call-arg]
237
+ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
238
+ RuntimeError: The server socket has failed to listen on any local network address. port: 32976, useIpv6: 0, code: -98, name: EADDRINUSE, message: address already in use
239
+ NCCL version 2.25.1+cuda12.8
240
+ resumed_from=runs/train8_noisegeo_len256_allcorrupt_unigram_shared0p5_20260517_163805/latest.pt start_step=1001
241
+ {
242
+ "device": "cuda:0",
243
+ "rank": 0,
244
+ "world_size": 4,
245
+ "samples": "owt_cached_chunks:8",
246
+ "vocab_size": 969,
247
+ "tokenizer_vocab_size": 50257,
248
+ "save_dir": "runs/train8_noisegeo_len256_allcorrupt_unigram_shared0p5_20260517_163805",
249
+ "batch_size": 128,
250
+ "grad_accum": 1,
251
+ "effective_batch_size": 512,
252
+ "global_batch_size": 512,
253
+ "lr_schedule": "constant_warmup",
254
+ "optimizer": "muon",
255
+ "epochs": 0.0,
256
+ "steps_per_epoch": 1,
257
+ "total_steps": 2000,
258
+ "warmup_steps": 10,
259
+ "warmup_epochs": -1.0,
260
+ "min_lr": 0.0,
261
+ "weight_decay": 0.1,
262
+ "output_weight_decay": -1.0,
263
+ "adamw_param_groups": "nanogpt",
264
+ "adam_beta1": 0.9,
265
+ "adam_beta2": 0.95,
266
+ "adam_eps": 1e-08,
267
+ "muon_impl": "legacy",
268
+ "muon_momentum": 0.95,
269
+ "muon_ns_steps": 5,
270
+ "muon_update_scale": 1.0,
271
+ "muon_nesterov": false,
272
+ "muon_width_scale": false,
273
+ "muon_grouping": "legacy_dim_ge_2",
274
+ "muon_param_count": 1965440,
275
+ "muon_adam_param_count": 8192,
276
+ "muon_param_names": [
277
+ "vocab_embed.embedding",
278
+ "sigma_map.net.0.weight",
279
+ "sigma_map.net.2.weight",
280
+ "blocks.0.attn_qkv.weight",
281
+ "blocks.0.attn_out.weight",
282
+ "blocks.0.mlp.0.weight",
283
+ "blocks.0.mlp.2.weight",
284
+ "blocks.0.adaLN_modulation.weight",
285
+ "blocks.1.attn_qkv.weight",
286
+ "blocks.1.attn_out.weight",
287
+ "blocks.1.mlp.0.weight",
288
+ "blocks.1.mlp.2.weight",
289
+ "blocks.1.adaLN_modulation.weight",
290
+ "blocks.2.attn_qkv.weight",
291
+ "blocks.2.attn_out.weight",
292
+ "blocks.2.mlp.0.weight",
293
+ "blocks.2.mlp.2.weight",
294
+ "blocks.2.adaLN_modulation.weight",
295
+ "output_layer.linear.weight",
296
+ "output_layer.adaLN_modulation.weight"
297
+ ],
298
+ "muon_adam_param_names": [
299
+ "sigma_map.net.0.bias",
300
+ "sigma_map.net.2.bias",
301
+ "blocks.0.norm1.weight",
302
+ "blocks.0.norm2.weight",
303
+ "blocks.0.mlp.0.bias",
304
+ "blocks.0.mlp.2.bias",
305
+ "blocks.0.adaLN_modulation.bias",
306
+ "blocks.1.norm1.weight",
307
+ "blocks.1.norm2.weight",
308
+ "blocks.1.mlp.0.bias",
309
+ "blocks.1.mlp.2.bias",
310
+ "blocks.1.adaLN_modulation.bias",
311
+ "blocks.2.norm1.weight",
312
+ "blocks.2.norm2.weight",
313
+ "blocks.2.mlp.0.bias",
314
+ "blocks.2.mlp.2.bias",
315
+ "blocks.2.adaLN_modulation.bias",
316
+ "output_layer.norm_final.weight",
317
+ "output_layer.adaLN_modulation.bias"
318
+ ],
319
+ "muon_effective_nesterov": false,
320
+ "muon_effective_width_scale": false,
321
+ "muon_effective_weight_decay": 0.1,
322
+ "muon_adam_fallback_nesterov": false,
323
+ "muon_adam_fallback_weight_decay": 0.1,
324
+ "ema_decay": 0.9999,
325
+ "ema_start_step": 0,
326
+ "model_type": "ddit",
327
+ "ddit_mlp_type": "gelu",
328
+ "elf_num_time_tokens": 4,
329
+ "elf_num_model_mode_tokens": 0,
330
+ "qk_norm": true,
331
+ "output_bias": false,
332
+ "output_init_std": -1.0,
333
+ "norm_type": "rmsnorm",
334
+ "target_loss": "hard_ce",
335
+ "linear_soft_target_power": 1.0,
336
+ "linear_soft_target_min_conf": 0.0,
337
+ "linear_soft_target_max_conf": 1.0,
338
+ "t_sampling_mode": "logit_normal",
339
+ "t_sampling_power": 1.0,
340
+ "t_sampling_eps": 0.0001,
341
+ "t_sampling_logit_mean": -1.5,
342
+ "t_sampling_logit_std": 0.8,
343
+ "dual_t": true,
344
+ "corrupt_t_mode": "same",
345
+ "corrupt_min_t": 0.0,
346
+ "corrupt_max_t": 1.0,
347
+ "prefix_block_prob": 0.0,
348
+ "prefix_block_len": 128,
349
+ "mask_ratio_floor_schedule": "none",
350
+ "dirichlet_endpoint_mode": "categorical_dual_t",
351
+ "dirichlet_semantic_t_mode": "same",
352
+ "dirichlet_semantic_t_value": 0.0,
353
+ "dirichlet_semantic_t_curve": "linear",
354
+ "dirichlet_semantic_t_power": 1.0,
355
+ "endpoint_sequence_random_prob_alpha": 0.0,
356
+ "categorical_wrong_from_full_vocab": true,
357
+ "categorical_wrong_from_batch_valid_tokens": false,
358
+ "categorical_wrong_basin_token_ids": "",
359
+ "categorical_wrong_basin_prob": 0.0,
360
+ "categorical_wrong_unigram_prob": 1.0,
361
+ "categorical_wrong_uniform_prob": 0.0,
362
+ "categorical_wrong_corpus_unigram_path": "",
363
+ "categorical_wrong_corpus_unigram_alpha": 1.0,
364
+ "categorical_wrong_basin_shared_prob": 0.0,
365
+ "categorical_wrong_unigram_shared_prob": 0.5,
366
+ "mask_mixture_original_prob": 0.0,
367
+ "mask_mixture_lowk_prob": 0.0,
368
+ "mask_mixture_lowcorrupt_prob": 0.0,
369
+ "mask_mixture_block_prob": 0.0,
370
+ "mask_mixture_all_prob": 1.0,
371
+ "mask_mixture_lowk_clean_tokens": "0",
372
+ "mask_mixture_lowcorrupt_tokens": "1,2,4,8,16,32,64",
373
+ "mask_mixture_block_tokens": "64,128",
374
+ "simplex_bridge_sampler": "dirichlet",
375
+ "logistic_normal_sigma_min": 0.1,
376
+ "logistic_normal_sigma_max": 1.0,
377
+ "logistic_normal_tau_min": 1.0,
378
+ "logistic_normal_tau_max": 1.0,
379
+ "torch_compile": false,
380
+ "compile_mode": "max-autotune",
381
+ "state_format": "prob",
382
+ "meanflow_weight": 0.0,
383
+ "rollout_train_prob": 0.0,
384
+ "rollout_train_steps": 1,
385
+ "rollout_train_infer_steps": 64,
386
+ "rollout_train_temp": 1.45,
387
+ "rollout_train_max_gamma": 1.0,
388
+ "rollout_train_corrupt_only": true,
389
+ "rollout_train_samplewise": false,
390
+ "rollout_train_compute_always": false,
391
+ "bridge_noise_init": "logistic_normal",
392
+ "noise_sigma": -1.0,
393
+ "allow_tf32": true,
394
+ "activation_checkpointing": false,
395
+ "activation_checkpoint_interval": 1,
396
+ "activation_checkpoint_scope": "block",
397
+ "ddp_static_graph": false,
398
+ "ddp_gradient_as_bucket_view": true,
399
+ "blocking_data_transfer": false,
400
+ "dataloader_prefetch_factor": 4,
401
+ "full_train_stats": false,
402
+ "tokenized_hf": false,
403
+ "tokenized_pad_token": "pad",
404
+ "elf_conditional_hf": false,
405
+ "record_pad_truncate": false,
406
+ "record_add_eos": false,
407
+ "record_add_special_tokens": false,
408
+ "record_pad_token": "pad",
409
+ "record_shuffle_buffer": 10000,
410
+ "wrap": true,
411
+ "wrap_mode": "stream",
412
+ "wrap_record_buffer_size": 200,
413
+ "owt_cached_chunks": true,
414
+ "owt_chunk_cache_dir": "/e2e-data/evad-tech-vla/wanghan58/data/small_benchmarks/langflow_2604_11748/openwebtext_lta_cached_chunks/gpt2_len256_train8_compact_overfit",
415
+ "owt_chunk_cache_rebuild": false,
416
+ "owt_chunk_cache_write_batch": 4096,
417
+ "owt_exact_repeat_per_chunk": 64,
418
+ "online_chunk_shuffle": false,
419
+ "online_chunk_shuffle_buffer": 10000,
420
+ "openwebtext_split": "train_minus_100k",
421
+ "detokenizer": "auto",
422
+ "resolved_detokenizer": null,
423
+ "num_workers": 0,
424
+ "latest_every": 1000,
425
+ "resume_path": "runs/train8_noisegeo_len256_allcorrupt_unigram_shared0p5_20260517_163805/latest.pt"
426
+ }
427
+ step=1100 epoch=1100/2000 epoch_step=1/1 micro_steps=1100 elapsed=4.8s lr=2.000000e-03 loss=2.2796 loss_recon=2.2796 loss_meanflow=0.0000 mean_model_t=0.2078 mean_corrupt_t=0.2078 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.3636 corrupt_frac=1.0000 acc_corrupt=0.3636 loss_corrupt=2.2796 wrong_frac=0.7922 init_acc_corrupt=0.1148 acc_corrupt_t_0p0_0p2=0.1696 corrupt_frac_t_0p0_0p2=0.5620 acc_corrupt_t_0p2_0p4=0.5667 corrupt_frac_t_0p2_0p4=0.3533 acc_corrupt_t_0p4_0p6=0.7971 corrupt_frac_t_0p4_0p6=0.0755 acc_corrupt_t_0p6_0p8=0.8522 corrupt_frac_t_0p6_0p8=0.0117 out_w_norm=11.0831 out_g_norm=1.6545 acc_corrupt_t_0p8_1p0=0.9609 corrupt_frac_t_0p8_1p0=0.0078 loss_all=1.9833 init_gold_top10=0.2117 init_gold_top100=0.4247
428
+ step=1200 epoch=1200/2000 epoch_step=1/1 micro_steps=1200 elapsed=4.1s lr=2.000000e-03 loss=1.8033 loss_recon=1.8033 loss_meanflow=0.0000 mean_model_t=0.2081 mean_corrupt_t=0.2081 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.4715 corrupt_frac=1.0000 acc_corrupt=0.4715 loss_corrupt=1.8033 wrong_frac=0.7916 init_acc_corrupt=0.1160 acc_corrupt_t_0p0_0p2=0.2501 corrupt_frac_t_0p0_0p2=0.5577 acc_corrupt_t_0p2_0p4=0.7186 corrupt_frac_t_0p2_0p4=0.3599 acc_corrupt_t_0p4_0p6=0.8878 corrupt_frac_t_0p4_0p6=0.0744 acc_corrupt_t_0p6_0p8=0.9219 corrupt_frac_t_0p6_0p8=0.0126 acc_corrupt_t_0p8_1p0=0.9346 corrupt_frac_t_0p8_1p0=0.0078 out_w_norm=11.1814 out_g_norm=1.7198 loss_all=1.5160 init_gold_top10=0.2016 init_gold_top100=0.4192
429
+ step=1300 epoch=1300/2000 epoch_step=1/1 micro_steps=1300 elapsed=4.1s lr=2.000000e-03 loss=1.4037 loss_recon=1.4037 loss_meanflow=0.0000 mean_model_t=0.2105 mean_corrupt_t=0.2105 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.5691 corrupt_frac=1.0000 acc_corrupt=0.5691 loss_corrupt=1.4037 wrong_frac=0.7894 init_acc_corrupt=0.1187 acc_corrupt_t_0p0_0p2=0.3382 corrupt_frac_t_0p0_0p2=0.5542 acc_corrupt_t_0p2_0p4=0.8347 corrupt_frac_t_0p2_0p4=0.3559 acc_corrupt_t_0p4_0p6=0.9404 corrupt_frac_t_0p4_0p6=0.0813 acc_corrupt_t_0p6_0p8=0.9533 corrupt_frac_t_0p6_0p8=0.0125 out_w_norm=11.3091 out_g_norm=1.9176 acc_corrupt_t_0p8_1p0=0.9688 corrupt_frac_t_0p8_1p0=0.0078 loss_all=1.0970 init_gold_top10=0.2133 init_gold_top100=0.4237
430
+ step=1400 epoch=1400/2000 epoch_step=1/1 micro_steps=1400 elapsed=4.1s lr=2.000000e-03 loss=1.1179 loss_recon=1.1179 loss_meanflow=0.0000 mean_model_t=0.2085 mean_corrupt_t=0.2085 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.6413 corrupt_frac=1.0000 acc_corrupt=0.6413 loss_corrupt=1.1179 wrong_frac=0.7915 init_acc_corrupt=0.1161 acc_corrupt_t_0p0_0p2=0.4109 corrupt_frac_t_0p0_0p2=0.5597 acc_corrupt_t_0p2_0p4=0.9243 corrupt_frac_t_0p2_0p4=0.3559 acc_corrupt_t_0p4_0p6=0.9755 corrupt_frac_t_0p4_0p6=0.0752 acc_corrupt_t_0p6_0p8=0.9771 corrupt_frac_t_0p6_0p8=0.0131 out_w_norm=11.4008 out_g_norm=1.8839 acc_corrupt_t_0p8_1p0=0.9844 corrupt_frac_t_0p8_1p0=0.0078 loss_all=1.0257 init_gold_top10=0.2146 init_gold_top100=0.4290
431
+ step=1500 epoch=1500/2000 epoch_step=1/1 micro_steps=1500 elapsed=4.1s lr=2.000000e-03 loss=0.9652 loss_recon=0.9652 loss_meanflow=0.0000 mean_model_t=0.2084 mean_corrupt_t=0.2084 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.6788 corrupt_frac=1.0000 acc_corrupt=0.6788 loss_corrupt=0.9652 wrong_frac=0.7914 init_acc_corrupt=0.1160 acc_corrupt_t_0p0_0p2=0.4517 corrupt_frac_t_0p0_0p2=0.5616 acc_corrupt_t_0p2_0p4=0.9651 corrupt_frac_t_0p2_0p4=0.3550 acc_corrupt_t_0p4_0p6=0.9896 corrupt_frac_t_0p4_0p6=0.0741 acc_corrupt_t_0p6_0p8=0.9916 corrupt_frac_t_0p6_0p8=0.0143 out_w_norm=11.4319 out_g_norm=1.6646 loss_all=0.9362 init_gold_top10=0.1906 init_gold_top100=0.4129
432
+ step=1600 epoch=1600/2000 epoch_step=1/1 micro_steps=1600 elapsed=4.1s lr=2.000000e-03 loss=0.8711 loss_recon=0.8711 loss_meanflow=0.0000 mean_model_t=0.2084 mean_corrupt_t=0.2084 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.7041 corrupt_frac=1.0000 acc_corrupt=0.7041 loss_corrupt=0.8711 wrong_frac=0.7917 init_acc_corrupt=0.1154 acc_corrupt_t_0p0_0p2=0.4811 corrupt_frac_t_0p0_0p2=0.5568 acc_corrupt_t_0p2_0p4=0.9818 corrupt_frac_t_0p2_0p4=0.3606 acc_corrupt_t_0p4_0p6=0.9955 corrupt_frac_t_0p4_0p6=0.0742 acc_corrupt_t_0p6_0p8=0.9946 corrupt_frac_t_0p6_0p8=0.0123 out_w_norm=11.4282 out_g_norm=1.3996 loss_all=0.8841 init_gold_top10=0.1772 init_gold_top100=0.4130
433
+ step=1700 epoch=1700/2000 epoch_step=1/1 micro_steps=1700 elapsed=4.1s lr=2.000000e-03 loss=0.8025 loss_recon=0.8025 loss_meanflow=0.0000 mean_model_t=0.2079 mean_corrupt_t=0.2079 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.7223 corrupt_frac=1.0000 acc_corrupt=0.7223 loss_corrupt=0.8025 wrong_frac=0.7925 init_acc_corrupt=0.1147 acc_corrupt_t_0p0_0p2=0.5119 corrupt_frac_t_0p0_0p2=0.5607 acc_corrupt_t_0p2_0p4=0.9893 corrupt_frac_t_0p2_0p4=0.3568 acc_corrupt_t_0p4_0p6=0.9980 corrupt_frac_t_0p4_0p6=0.0739 acc_corrupt_t_0p6_0p8=0.9964 corrupt_frac_t_0p6_0p8=0.0132 out_w_norm=11.4249 out_g_norm=1.2780 loss_all=0.7132 init_gold_top10=0.2137 init_gold_top100=0.4244
434
+ step=1800 epoch=1800/2000 epoch_step=1/1 micro_steps=1800 elapsed=4.1s lr=2.000000e-03 loss=0.7516 loss_recon=0.7516 loss_meanflow=0.0000 mean_model_t=0.2099 mean_corrupt_t=0.2099 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.7385 corrupt_frac=1.0000 acc_corrupt=0.7385 loss_corrupt=0.7516 wrong_frac=0.7904 init_acc_corrupt=0.1168 acc_corrupt_t_0p0_0p2=0.5306 corrupt_frac_t_0p0_0p2=0.5505 acc_corrupt_t_0p2_0p4=0.9921 corrupt_frac_t_0p2_0p4=0.3649 acc_corrupt_t_0p4_0p6=0.9986 corrupt_frac_t_0p4_0p6=0.0765 out_w_norm=11.4288 out_g_norm=1.0979 acc_corrupt_t_0p6_0p8=0.9968 corrupt_frac_t_0p6_0p8=0.0123 acc_corrupt_t_0p8_1p0=0.9922 corrupt_frac_t_0p8_1p0=0.0078 loss_all=0.6790 init_gold_top10=0.1935 init_gold_top100=0.4118
435
+ step=1900 epoch=1900/2000 epoch_step=1/1 micro_steps=1900 elapsed=4.1s lr=2.000000e-03 loss=0.7025 loss_recon=0.7025 loss_meanflow=0.0000 mean_model_t=0.2102 mean_corrupt_t=0.2102 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.7553 corrupt_frac=1.0000 acc_corrupt=0.7553 loss_corrupt=0.7025 wrong_frac=0.7901 init_acc_corrupt=0.1169 acc_corrupt_t_0p0_0p2=0.5593 corrupt_frac_t_0p0_0p2=0.5513 acc_corrupt_t_0p2_0p4=0.9956 corrupt_frac_t_0p2_0p4=0.3645 acc_corrupt_t_0p4_0p6=0.9994 corrupt_frac_t_0p4_0p6=0.0755 out_w_norm=11.4342 out_g_norm=0.9973 acc_corrupt_t_0p6_0p8=0.9980 corrupt_frac_t_0p6_0p8=0.0127 acc_corrupt_t_0p8_1p0=0.9961 corrupt_frac_t_0p8_1p0=0.0078 loss_all=0.6958 init_gold_top10=0.1910 init_gold_top100=0.4159
436
+ step=2000 epoch=2000/2000 epoch_step=1/1 micro_steps=2000 elapsed=4.1s lr=2.000000e-03 loss=0.6663 loss_recon=0.6663 loss_meanflow=0.0000 mean_model_t=0.2101 mean_corrupt_t=0.2101 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.7647 corrupt_frac=1.0000 acc_corrupt=0.7647 loss_corrupt=0.6663 wrong_frac=0.7897 init_acc_corrupt=0.1182 acc_corrupt_t_0p0_0p2=0.5808 corrupt_frac_t_0p0_0p2=0.5591 acc_corrupt_t_0p2_0p4=0.9975 corrupt_frac_t_0p2_0p4=0.3526 acc_corrupt_t_0p4_0p6=0.9996 corrupt_frac_t_0p4_0p6=0.0791 acc_corrupt_t_0p8_1p0=0.9785 corrupt_frac_t_0p8_1p0=0.0078 out_w_norm=11.4442 out_g_norm=0.8627 acc_corrupt_t_0p6_0p8=0.9991 corrupt_frac_t_0p6_0p8=0.0133 loss_all=0.6893 init_gold_top10=0.1951 init_gold_top100=0.4188
437
+ NCCL version 2.25.1+cuda12.8
438
+ resumed_from=runs/train8_noisegeo_len256_allcorrupt_unigram_shared0p5_20260517_163805/latest.pt start_step=2001
439
+ {
440
+ "device": "cuda:0",
441
+ "rank": 0,
442
+ "world_size": 4,
443
+ "samples": "owt_cached_chunks:8",
444
+ "vocab_size": 969,
445
+ "tokenizer_vocab_size": 50257,
446
+ "save_dir": "runs/train8_noisegeo_len256_allcorrupt_unigram_shared0p5_20260517_163805",
447
+ "batch_size": 128,
448
+ "grad_accum": 1,
449
+ "effective_batch_size": 512,
450
+ "global_batch_size": 512,
451
+ "lr_schedule": "constant_warmup",
452
+ "optimizer": "muon",
453
+ "epochs": 0.0,
454
+ "steps_per_epoch": 1,
455
+ "total_steps": 3000,
456
+ "warmup_steps": 10,
457
+ "warmup_epochs": -1.0,
458
+ "min_lr": 0.0,
459
+ "weight_decay": 0.1,
460
+ "output_weight_decay": -1.0,
461
+ "adamw_param_groups": "nanogpt",
462
+ "adam_beta1": 0.9,
463
+ "adam_beta2": 0.95,
464
+ "adam_eps": 1e-08,
465
+ "muon_impl": "legacy",
466
+ "muon_momentum": 0.95,
467
+ "muon_ns_steps": 5,
468
+ "muon_update_scale": 1.0,
469
+ "muon_nesterov": false,
470
+ "muon_width_scale": false,
471
+ "muon_grouping": "legacy_dim_ge_2",
472
+ "muon_param_count": 1965440,
473
+ "muon_adam_param_count": 8192,
474
+ "muon_param_names": [
475
+ "vocab_embed.embedding",
476
+ "sigma_map.net.0.weight",
477
+ "sigma_map.net.2.weight",
478
+ "blocks.0.attn_qkv.weight",
479
+ "blocks.0.attn_out.weight",
480
+ "blocks.0.mlp.0.weight",
481
+ "blocks.0.mlp.2.weight",
482
+ "blocks.0.adaLN_modulation.weight",
483
+ "blocks.1.attn_qkv.weight",
484
+ "blocks.1.attn_out.weight",
485
+ "blocks.1.mlp.0.weight",
486
+ "blocks.1.mlp.2.weight",
487
+ "blocks.1.adaLN_modulation.weight",
488
+ "blocks.2.attn_qkv.weight",
489
+ "blocks.2.attn_out.weight",
490
+ "blocks.2.mlp.0.weight",
491
+ "blocks.2.mlp.2.weight",
492
+ "blocks.2.adaLN_modulation.weight",
493
+ "output_layer.linear.weight",
494
+ "output_layer.adaLN_modulation.weight"
495
+ ],
496
+ "muon_adam_param_names": [
497
+ "sigma_map.net.0.bias",
498
+ "sigma_map.net.2.bias",
499
+ "blocks.0.norm1.weight",
500
+ "blocks.0.norm2.weight",
501
+ "blocks.0.mlp.0.bias",
502
+ "blocks.0.mlp.2.bias",
503
+ "blocks.0.adaLN_modulation.bias",
504
+ "blocks.1.norm1.weight",
505
+ "blocks.1.norm2.weight",
506
+ "blocks.1.mlp.0.bias",
507
+ "blocks.1.mlp.2.bias",
508
+ "blocks.1.adaLN_modulation.bias",
509
+ "blocks.2.norm1.weight",
510
+ "blocks.2.norm2.weight",
511
+ "blocks.2.mlp.0.bias",
512
+ "blocks.2.mlp.2.bias",
513
+ "blocks.2.adaLN_modulation.bias",
514
+ "output_layer.norm_final.weight",
515
+ "output_layer.adaLN_modulation.bias"
516
+ ],
517
+ "muon_effective_nesterov": false,
518
+ "muon_effective_width_scale": false,
519
+ "muon_effective_weight_decay": 0.1,
520
+ "muon_adam_fallback_nesterov": false,
521
+ "muon_adam_fallback_weight_decay": 0.1,
522
+ "ema_decay": 0.9999,
523
+ "ema_start_step": 0,
524
+ "model_type": "ddit",
525
+ "ddit_mlp_type": "gelu",
526
+ "elf_num_time_tokens": 4,
527
+ "elf_num_model_mode_tokens": 0,
528
+ "qk_norm": true,
529
+ "output_bias": false,
530
+ "output_init_std": -1.0,
531
+ "norm_type": "rmsnorm",
532
+ "target_loss": "hard_ce",
533
+ "linear_soft_target_power": 1.0,
534
+ "linear_soft_target_min_conf": 0.0,
535
+ "linear_soft_target_max_conf": 1.0,
536
+ "t_sampling_mode": "logit_normal",
537
+ "t_sampling_power": 1.0,
538
+ "t_sampling_eps": 0.0001,
539
+ "t_sampling_logit_mean": -1.5,
540
+ "t_sampling_logit_std": 0.8,
541
+ "dual_t": true,
542
+ "corrupt_t_mode": "same",
543
+ "corrupt_min_t": 0.0,
544
+ "corrupt_max_t": 1.0,
545
+ "prefix_block_prob": 0.0,
546
+ "prefix_block_len": 128,
547
+ "mask_ratio_floor_schedule": "none",
548
+ "dirichlet_endpoint_mode": "categorical_dual_t",
549
+ "dirichlet_semantic_t_mode": "same",
550
+ "dirichlet_semantic_t_value": 0.0,
551
+ "dirichlet_semantic_t_curve": "linear",
552
+ "dirichlet_semantic_t_power": 1.0,
553
+ "endpoint_sequence_random_prob_alpha": 0.0,
554
+ "categorical_wrong_from_full_vocab": true,
555
+ "categorical_wrong_from_batch_valid_tokens": false,
556
+ "categorical_wrong_basin_token_ids": "",
557
+ "categorical_wrong_basin_prob": 0.0,
558
+ "categorical_wrong_unigram_prob": 1.0,
559
+ "categorical_wrong_uniform_prob": 0.0,
560
+ "categorical_wrong_corpus_unigram_path": "",
561
+ "categorical_wrong_corpus_unigram_alpha": 1.0,
562
+ "categorical_wrong_basin_shared_prob": 0.0,
563
+ "categorical_wrong_unigram_shared_prob": 0.5,
564
+ "mask_mixture_original_prob": 0.0,
565
+ "mask_mixture_lowk_prob": 0.0,
566
+ "mask_mixture_lowcorrupt_prob": 0.0,
567
+ "mask_mixture_block_prob": 0.0,
568
+ "mask_mixture_all_prob": 1.0,
569
+ "mask_mixture_lowk_clean_tokens": "0",
570
+ "mask_mixture_lowcorrupt_tokens": "1,2,4,8,16,32,64",
571
+ "mask_mixture_block_tokens": "64,128",
572
+ "simplex_bridge_sampler": "dirichlet",
573
+ "logistic_normal_sigma_min": 0.1,
574
+ "logistic_normal_sigma_max": 1.0,
575
+ "logistic_normal_tau_min": 1.0,
576
+ "logistic_normal_tau_max": 1.0,
577
+ "torch_compile": false,
578
+ "compile_mode": "max-autotune",
579
+ "state_format": "prob",
580
+ "meanflow_weight": 0.0,
581
+ "rollout_train_prob": 0.0,
582
+ "rollout_train_steps": 1,
583
+ "rollout_train_infer_steps": 64,
584
+ "rollout_train_temp": 1.45,
585
+ "rollout_train_max_gamma": 1.0,
586
+ "rollout_train_corrupt_only": true,
587
+ "rollout_train_samplewise": false,
588
+ "rollout_train_compute_always": false,
589
+ "bridge_noise_init": "logistic_normal",
590
+ "noise_sigma": -1.0,
591
+ "allow_tf32": true,
592
+ "activation_checkpointing": false,
593
+ "activation_checkpoint_interval": 1,
594
+ "activation_checkpoint_scope": "block",
595
+ "ddp_static_graph": false,
596
+ "ddp_gradient_as_bucket_view": true,
597
+ "blocking_data_transfer": false,
598
+ "dataloader_prefetch_factor": 4,
599
+ "full_train_stats": false,
600
+ "tokenized_hf": false,
601
+ "tokenized_pad_token": "pad",
602
+ "elf_conditional_hf": false,
603
+ "record_pad_truncate": false,
604
+ "record_add_eos": false,
605
+ "record_add_special_tokens": false,
606
+ "record_pad_token": "pad",
607
+ "record_shuffle_buffer": 10000,
608
+ "wrap": true,
609
+ "wrap_mode": "stream",
610
+ "wrap_record_buffer_size": 200,
611
+ "owt_cached_chunks": true,
612
+ "owt_chunk_cache_dir": "/e2e-data/evad-tech-vla/wanghan58/data/small_benchmarks/langflow_2604_11748/openwebtext_lta_cached_chunks/gpt2_len256_train8_compact_overfit",
613
+ "owt_chunk_cache_rebuild": false,
614
+ "owt_chunk_cache_write_batch": 4096,
615
+ "owt_exact_repeat_per_chunk": 64,
616
+ "online_chunk_shuffle": false,
617
+ "online_chunk_shuffle_buffer": 10000,
618
+ "openwebtext_split": "train_minus_100k",
619
+ "detokenizer": "auto",
620
+ "resolved_detokenizer": null,
621
+ "num_workers": 0,
622
+ "latest_every": 1000,
623
+ "resume_path": "runs/train8_noisegeo_len256_allcorrupt_unigram_shared0p5_20260517_163805/latest.pt"
624
+ }
625
+ step=2100 epoch=2100/3000 epoch_step=1/1 micro_steps=2100 elapsed=4.5s lr=2.000000e-03 loss=0.6586 loss_recon=0.6586 loss_meanflow=0.0000 mean_model_t=0.2078 mean_corrupt_t=0.2078 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.7668 corrupt_frac=1.0000 acc_corrupt=0.7668 loss_corrupt=0.6586 wrong_frac=0.7922 init_acc_corrupt=0.1148 acc_corrupt_t_0p0_0p2=0.5865 corrupt_frac_t_0p0_0p2=0.5620 acc_corrupt_t_0p2_0p4=0.9977 corrupt_frac_t_0p2_0p4=0.3533 acc_corrupt_t_0p4_0p6=0.9996 corrupt_frac_t_0p4_0p6=0.0755 acc_corrupt_t_0p6_0p8=0.9993 corrupt_frac_t_0p6_0p8=0.0117 out_w_norm=11.4584 out_g_norm=0.8178 acc_corrupt_t_0p8_1p0=0.9922 corrupt_frac_t_0p8_1p0=0.0078 loss_all=0.6532 init_gold_top10=0.2117 init_gold_top100=0.4247
626
+ step=2200 epoch=2200/3000 epoch_step=1/1 micro_steps=2200 elapsed=3.8s lr=2.000000e-03 loss=0.6295 loss_recon=0.6295 loss_meanflow=0.0000 mean_model_t=0.2081 mean_corrupt_t=0.2081 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.7760 corrupt_frac=1.0000 acc_corrupt=0.7760 loss_corrupt=0.6295 wrong_frac=0.7916 init_acc_corrupt=0.1160 acc_corrupt_t_0p0_0p2=0.5995 corrupt_frac_t_0p0_0p2=0.5577 acc_corrupt_t_0p2_0p4=0.9984 corrupt_frac_t_0p2_0p4=0.3599 acc_corrupt_t_0p4_0p6=0.9998 corrupt_frac_t_0p4_0p6=0.0744 acc_corrupt_t_0p6_0p8=0.9990 corrupt_frac_t_0p6_0p8=0.0126 acc_corrupt_t_0p8_1p0=0.9961 corrupt_frac_t_0p8_1p0=0.0078 out_w_norm=11.4663 out_g_norm=0.7199 loss_all=0.6324 init_gold_top10=0.2016 init_gold_top100=0.4192
627
+ step=2300 epoch=2300/3000 epoch_step=1/1 micro_steps=2300 elapsed=3.8s lr=2.000000e-03 loss=0.6050 loss_recon=0.6050 loss_meanflow=0.0000 mean_model_t=0.2105 mean_corrupt_t=0.2105 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.7846 corrupt_frac=1.0000 acc_corrupt=0.7846 loss_corrupt=0.6050 wrong_frac=0.7894 init_acc_corrupt=0.1187 acc_corrupt_t_0p0_0p2=0.6123 corrupt_frac_t_0p0_0p2=0.5542 acc_corrupt_t_0p2_0p4=0.9985 corrupt_frac_t_0p2_0p4=0.3559 acc_corrupt_t_0p4_0p6=0.9998 corrupt_frac_t_0p4_0p6=0.0813 acc_corrupt_t_0p6_0p8=0.9993 corrupt_frac_t_0p6_0p8=0.0125 out_w_norm=11.4696 out_g_norm=0.6746 acc_corrupt_t_0p8_1p0=0.9961 corrupt_frac_t_0p8_1p0=0.0078 loss_all=0.4587 init_gold_top10=0.2133 init_gold_top100=0.4237
628
+ step=2400 epoch=2400/3000 epoch_step=1/1 micro_steps=2400 elapsed=3.8s lr=2.000000e-03 loss=0.5927 loss_recon=0.5927 loss_meanflow=0.0000 mean_model_t=0.2085 mean_corrupt_t=0.2085 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.7908 corrupt_frac=1.0000 acc_corrupt=0.7908 loss_corrupt=0.5927 wrong_frac=0.7915 init_acc_corrupt=0.1161 acc_corrupt_t_0p0_0p2=0.6268 corrupt_frac_t_0p0_0p2=0.5597 acc_corrupt_t_0p2_0p4=0.9991 corrupt_frac_t_0p2_0p4=0.3559 acc_corrupt_t_0p4_0p6=0.9998 corrupt_frac_t_0p4_0p6=0.0752 acc_corrupt_t_0p6_0p8=0.9984 corrupt_frac_t_0p6_0p8=0.0131 out_w_norm=11.4838 out_g_norm=0.6499 acc_corrupt_t_0p8_1p0=0.9980 corrupt_frac_t_0p8_1p0=0.0078 loss_all=0.5744 init_gold_top10=0.2146 init_gold_top100=0.4290
629
+ step=2500 epoch=2500/3000 epoch_step=1/1 micro_steps=2500 elapsed=3.8s lr=2.000000e-03 loss=0.5809 loss_recon=0.5809 loss_meanflow=0.0000 mean_model_t=0.2084 mean_corrupt_t=0.2084 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.7926 corrupt_frac=1.0000 acc_corrupt=0.7926 loss_corrupt=0.5809 wrong_frac=0.7914 init_acc_corrupt=0.1160 acc_corrupt_t_0p0_0p2=0.6313 corrupt_frac_t_0p0_0p2=0.5616 acc_corrupt_t_0p2_0p4=0.9991 corrupt_frac_t_0p2_0p4=0.3550 acc_corrupt_t_0p4_0p6=0.9998 corrupt_frac_t_0p4_0p6=0.0741 acc_corrupt_t_0p6_0p8=0.9993 corrupt_frac_t_0p6_0p8=0.0143 out_w_norm=11.5089 out_g_norm=0.5936 loss_all=0.5771 init_gold_top10=0.1906 init_gold_top100=0.4129
630
+ step=2600 epoch=2600/3000 epoch_step=1/1 micro_steps=2600 elapsed=3.8s lr=2.000000e-03 loss=0.5694 loss_recon=0.5694 loss_meanflow=0.0000 mean_model_t=0.2084 mean_corrupt_t=0.2084 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.7986 corrupt_frac=1.0000 acc_corrupt=0.7986 loss_corrupt=0.5694 wrong_frac=0.7917 init_acc_corrupt=0.1154 acc_corrupt_t_0p0_0p2=0.6388 corrupt_frac_t_0p0_0p2=0.5568 acc_corrupt_t_0p2_0p4=0.9994 corrupt_frac_t_0p2_0p4=0.3606 acc_corrupt_t_0p4_0p6=0.9998 corrupt_frac_t_0p4_0p6=0.0742 acc_corrupt_t_0p6_0p8=0.9993 corrupt_frac_t_0p6_0p8=0.0123 out_w_norm=11.5321 out_g_norm=0.5380 loss_all=0.6365 init_gold_top10=0.1772 init_gold_top100=0.4130
631
+ step=2700 epoch=2700/3000 epoch_step=1/1 micro_steps=2700 elapsed=3.8s lr=2.000000e-03 loss=0.5522 loss_recon=0.5522 loss_meanflow=0.0000 mean_model_t=0.2079 mean_corrupt_t=0.2079 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.8042 corrupt_frac=1.0000 acc_corrupt=0.8042 loss_corrupt=0.5522 wrong_frac=0.7925 init_acc_corrupt=0.1147 acc_corrupt_t_0p0_0p2=0.6515 corrupt_frac_t_0p0_0p2=0.5607 acc_corrupt_t_0p2_0p4=0.9990 corrupt_frac_t_0p2_0p4=0.3568 acc_corrupt_t_0p4_0p6=0.9999 corrupt_frac_t_0p4_0p6=0.0739 acc_corrupt_t_0p6_0p8=0.9993 corrupt_frac_t_0p6_0p8=0.0132 out_w_norm=11.5532 out_g_norm=0.5052 loss_all=0.5069 init_gold_top10=0.2137 init_gold_top100=0.4244
632
+ step=2800 epoch=2800/3000 epoch_step=1/1 micro_steps=2800 elapsed=3.8s lr=2.000000e-03 loss=0.5327 loss_recon=0.5327 loss_meanflow=0.0000 mean_model_t=0.2099 mean_corrupt_t=0.2099 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.8109 corrupt_frac=1.0000 acc_corrupt=0.8109 loss_corrupt=0.5327 wrong_frac=0.7904 init_acc_corrupt=0.1168 acc_corrupt_t_0p0_0p2=0.6569 corrupt_frac_t_0p0_0p2=0.5505 acc_corrupt_t_0p2_0p4=0.9994 corrupt_frac_t_0p2_0p4=0.3649 acc_corrupt_t_0p4_0p6=0.9998 corrupt_frac_t_0p4_0p6=0.0765 out_w_norm=11.5678 out_g_norm=0.4714 acc_corrupt_t_0p6_0p8=0.9993 corrupt_frac_t_0p6_0p8=0.0123 acc_corrupt_t_0p8_1p0=0.9922 corrupt_frac_t_0p8_1p0=0.0078 loss_all=0.4982 init_gold_top10=0.1935 init_gold_top100=0.4118
633
+ step=2900 epoch=2900/3000 epoch_step=1/1 micro_steps=2900 elapsed=3.8s lr=2.000000e-03 loss=0.5118 loss_recon=0.5118 loss_meanflow=0.0000 mean_model_t=0.2102 mean_corrupt_t=0.2102 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.8184 corrupt_frac=1.0000 acc_corrupt=0.8184 loss_corrupt=0.5118 wrong_frac=0.7901 init_acc_corrupt=0.1169 acc_corrupt_t_0p0_0p2=0.6710 corrupt_frac_t_0p0_0p2=0.5513 acc_corrupt_t_0p2_0p4=0.9994 corrupt_frac_t_0p2_0p4=0.3645 acc_corrupt_t_0p4_0p6=0.9999 corrupt_frac_t_0p4_0p6=0.0755 out_w_norm=11.5866 out_g_norm=0.4311 acc_corrupt_t_0p6_0p8=0.9997 corrupt_frac_t_0p6_0p8=0.0127 acc_corrupt_t_0p8_1p0=1.0000 corrupt_frac_t_0p8_1p0=0.0078 loss_all=0.4894 init_gold_top10=0.1910 init_gold_top100=0.4159
634
+ step=3000 epoch=3000/3000 epoch_step=1/1 micro_steps=3000 elapsed=3.8s lr=2.000000e-03 loss=0.5007 loss_recon=0.5007 loss_meanflow=0.0000 mean_model_t=0.2101 mean_corrupt_t=0.2101 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.8232 corrupt_frac=1.0000 acc_corrupt=0.8232 loss_corrupt=0.5007 wrong_frac=0.7897 init_acc_corrupt=0.1182 acc_corrupt_t_0p0_0p2=0.6841 corrupt_frac_t_0p0_0p2=0.5591 acc_corrupt_t_0p2_0p4=0.9997 corrupt_frac_t_0p2_0p4=0.3526 acc_corrupt_t_0p4_0p6=0.9998 corrupt_frac_t_0p4_0p6=0.0791 acc_corrupt_t_0p8_1p0=0.9922 corrupt_frac_t_0p8_1p0=0.0078 out_w_norm=11.6083 out_g_norm=0.3939 acc_corrupt_t_0p6_0p8=0.9997 corrupt_frac_t_0p6_0p8=0.0133 loss_all=0.5446 init_gold_top10=0.1951 init_gold_top100=0.4188
LTA_openwebtext_dualt/logs/softendpoint_mn_pilot_4gpu/train8_rollin_focused_len256_rollin_p100_s4_i32_20260517_1733focused.log ADDED
@@ -0,0 +1,193 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ NCCL version 2.25.1+cuda12.8
2
+ {
3
+ "device": "cuda:0",
4
+ "rank": 0,
5
+ "world_size": 4,
6
+ "samples": "owt_cached_chunks:8",
7
+ "vocab_size": 969,
8
+ "tokenizer_vocab_size": 50257,
9
+ "save_dir": "runs/train8_rollin_focused_len256_rollin_p100_s4_i32_20260517_1733focused",
10
+ "batch_size": 128,
11
+ "grad_accum": 1,
12
+ "effective_batch_size": 512,
13
+ "global_batch_size": 512,
14
+ "lr_schedule": "constant_warmup",
15
+ "optimizer": "muon",
16
+ "epochs": 0.0,
17
+ "steps_per_epoch": 1,
18
+ "total_steps": 500,
19
+ "warmup_steps": 10,
20
+ "warmup_epochs": -1.0,
21
+ "min_lr": 0.0,
22
+ "weight_decay": 0.1,
23
+ "output_weight_decay": -1.0,
24
+ "adamw_param_groups": "nanogpt",
25
+ "adam_beta1": 0.9,
26
+ "adam_beta2": 0.95,
27
+ "adam_eps": 1e-08,
28
+ "muon_impl": "legacy",
29
+ "muon_momentum": 0.95,
30
+ "muon_ns_steps": 5,
31
+ "muon_update_scale": 1.0,
32
+ "muon_nesterov": false,
33
+ "muon_width_scale": false,
34
+ "muon_grouping": "legacy_dim_ge_2",
35
+ "muon_param_count": 1965440,
36
+ "muon_adam_param_count": 8192,
37
+ "muon_param_names": [
38
+ "vocab_embed.embedding",
39
+ "sigma_map.net.0.weight",
40
+ "sigma_map.net.2.weight",
41
+ "blocks.0.attn_qkv.weight",
42
+ "blocks.0.attn_out.weight",
43
+ "blocks.0.mlp.0.weight",
44
+ "blocks.0.mlp.2.weight",
45
+ "blocks.0.adaLN_modulation.weight",
46
+ "blocks.1.attn_qkv.weight",
47
+ "blocks.1.attn_out.weight",
48
+ "blocks.1.mlp.0.weight",
49
+ "blocks.1.mlp.2.weight",
50
+ "blocks.1.adaLN_modulation.weight",
51
+ "blocks.2.attn_qkv.weight",
52
+ "blocks.2.attn_out.weight",
53
+ "blocks.2.mlp.0.weight",
54
+ "blocks.2.mlp.2.weight",
55
+ "blocks.2.adaLN_modulation.weight",
56
+ "output_layer.linear.weight",
57
+ "output_layer.adaLN_modulation.weight"
58
+ ],
59
+ "muon_adam_param_names": [
60
+ "sigma_map.net.0.bias",
61
+ "sigma_map.net.2.bias",
62
+ "blocks.0.norm1.weight",
63
+ "blocks.0.norm2.weight",
64
+ "blocks.0.mlp.0.bias",
65
+ "blocks.0.mlp.2.bias",
66
+ "blocks.0.adaLN_modulation.bias",
67
+ "blocks.1.norm1.weight",
68
+ "blocks.1.norm2.weight",
69
+ "blocks.1.mlp.0.bias",
70
+ "blocks.1.mlp.2.bias",
71
+ "blocks.1.adaLN_modulation.bias",
72
+ "blocks.2.norm1.weight",
73
+ "blocks.2.norm2.weight",
74
+ "blocks.2.mlp.0.bias",
75
+ "blocks.2.mlp.2.bias",
76
+ "blocks.2.adaLN_modulation.bias",
77
+ "output_layer.norm_final.weight",
78
+ "output_layer.adaLN_modulation.bias"
79
+ ],
80
+ "muon_effective_nesterov": false,
81
+ "muon_effective_width_scale": false,
82
+ "muon_effective_weight_decay": 0.1,
83
+ "muon_adam_fallback_nesterov": false,
84
+ "muon_adam_fallback_weight_decay": 0.1,
85
+ "ema_decay": 0.9999,
86
+ "ema_start_step": 0,
87
+ "model_type": "ddit",
88
+ "ddit_mlp_type": "gelu",
89
+ "elf_num_time_tokens": 4,
90
+ "elf_num_model_mode_tokens": 0,
91
+ "qk_norm": true,
92
+ "output_bias": false,
93
+ "output_init_std": -1.0,
94
+ "norm_type": "rmsnorm",
95
+ "target_loss": "hard_ce",
96
+ "linear_soft_target_power": 1.0,
97
+ "linear_soft_target_min_conf": 0.0,
98
+ "linear_soft_target_max_conf": 1.0,
99
+ "t_sampling_mode": "logit_normal",
100
+ "t_sampling_power": 1.0,
101
+ "t_sampling_eps": 0.0001,
102
+ "t_sampling_logit_mean": -1.5,
103
+ "t_sampling_logit_std": 0.8,
104
+ "dual_t": true,
105
+ "corrupt_t_mode": "same",
106
+ "corrupt_min_t": 0.0,
107
+ "corrupt_max_t": 1.0,
108
+ "prefix_block_prob": 0.0,
109
+ "prefix_block_len": 128,
110
+ "mask_ratio_floor_schedule": "none",
111
+ "dirichlet_endpoint_mode": "categorical_dual_t",
112
+ "dirichlet_semantic_t_mode": "same",
113
+ "dirichlet_semantic_t_value": 0.0,
114
+ "dirichlet_semantic_t_curve": "linear",
115
+ "dirichlet_semantic_t_power": 1.0,
116
+ "endpoint_sequence_random_prob_alpha": 0.0,
117
+ "categorical_wrong_from_full_vocab": true,
118
+ "categorical_wrong_from_batch_valid_tokens": false,
119
+ "categorical_wrong_basin_token_ids": "",
120
+ "categorical_wrong_basin_prob": 0.0,
121
+ "categorical_wrong_unigram_prob": 0.0,
122
+ "categorical_wrong_uniform_prob": 0.0,
123
+ "categorical_wrong_prob_floor": 0.0,
124
+ "categorical_wrong_corpus_unigram_path": "",
125
+ "categorical_wrong_corpus_unigram_alpha": 1.0,
126
+ "categorical_wrong_basin_shared_prob": 0.0,
127
+ "categorical_wrong_unigram_shared_prob": 0.0,
128
+ "mask_mixture_original_prob": 0.0,
129
+ "mask_mixture_lowk_prob": 0.0,
130
+ "mask_mixture_lowcorrupt_prob": 0.0,
131
+ "mask_mixture_block_prob": 0.0,
132
+ "mask_mixture_all_prob": 1.0,
133
+ "mask_mixture_lowk_clean_tokens": "0",
134
+ "mask_mixture_lowcorrupt_tokens": "1,2,4,8,16,32,64",
135
+ "mask_mixture_block_tokens": "64,128",
136
+ "simplex_bridge_sampler": "dirichlet",
137
+ "logistic_normal_sigma_min": 0.1,
138
+ "logistic_normal_sigma_max": 1.0,
139
+ "logistic_normal_tau_min": 1.0,
140
+ "logistic_normal_tau_max": 1.0,
141
+ "torch_compile": false,
142
+ "compile_mode": "max-autotune",
143
+ "state_format": "prob",
144
+ "meanflow_weight": 0.0,
145
+ "rollout_train_prob": 1.0,
146
+ "rollout_train_steps": 4,
147
+ "rollout_train_infer_steps": 32,
148
+ "rollout_train_temp": 1.45,
149
+ "rollout_train_max_gamma": 1.0,
150
+ "rollout_train_corrupt_only": true,
151
+ "rollout_train_samplewise": true,
152
+ "rollout_train_compute_always": false,
153
+ "bridge_noise_init": "logistic_normal",
154
+ "noise_sigma": -1.0,
155
+ "allow_tf32": true,
156
+ "activation_checkpointing": false,
157
+ "activation_checkpoint_interval": 1,
158
+ "activation_checkpoint_scope": "block",
159
+ "ddp_static_graph": false,
160
+ "ddp_gradient_as_bucket_view": true,
161
+ "blocking_data_transfer": false,
162
+ "dataloader_prefetch_factor": 4,
163
+ "full_train_stats": false,
164
+ "tokenized_hf": false,
165
+ "tokenized_pad_token": "pad",
166
+ "elf_conditional_hf": false,
167
+ "record_pad_truncate": false,
168
+ "record_add_eos": false,
169
+ "record_add_special_tokens": false,
170
+ "record_pad_token": "pad",
171
+ "record_shuffle_buffer": 10000,
172
+ "wrap": true,
173
+ "wrap_mode": "stream",
174
+ "wrap_record_buffer_size": 200,
175
+ "owt_cached_chunks": true,
176
+ "owt_chunk_cache_dir": "/e2e-data/evad-tech-vla/wanghan58/data/small_benchmarks/langflow_2604_11748/openwebtext_lta_cached_chunks/gpt2_len256_train8_compact_overfit",
177
+ "owt_chunk_cache_rebuild": false,
178
+ "owt_chunk_cache_write_batch": 4096,
179
+ "owt_exact_repeat_per_chunk": 64,
180
+ "online_chunk_shuffle": false,
181
+ "online_chunk_shuffle_buffer": 10000,
182
+ "openwebtext_split": "train_minus_100k",
183
+ "detokenizer": "auto",
184
+ "resolved_detokenizer": null,
185
+ "num_workers": 0,
186
+ "latest_every": 500,
187
+ "resume_path": ""
188
+ }
189
+ step=100 epoch=100/500 epoch_step=1/1 micro_steps=100 elapsed=8.2s lr=2.000000e-03 loss=6.7057 loss_recon=6.7057 loss_meanflow=0.0000 mean_model_t=0.2083 mean_corrupt_t=0.2083 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=1.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.0995 corrupt_frac=1.0000 acc_corrupt=0.0995 loss_corrupt=6.7057 wrong_frac=0.7915 init_acc_corrupt=0.1159 acc_corrupt_t_0p0_0p2=0.0488 corrupt_frac_t_0p0_0p2=0.5559 acc_corrupt_t_0p2_0p4=0.1333 corrupt_frac_t_0p2_0p4=0.3589 acc_corrupt_t_0p4_0p6=0.2777 corrupt_frac_t_0p4_0p6=0.0773 acc_corrupt_t_0p6_0p8=0.3853 corrupt_frac_t_0p6_0p8=0.0121 out_w_norm=1.0988 out_g_norm=1.0104 loss_all=6.4460 init_gold_top10=0.2110 init_gold_top100=0.5461 rollout_applied_pos_frac=1.0000 init_acc_rollout_applied=0.1169 init_acc_rollout_kept=0.0000 logit_acc_rollout_applied=0.1051 logit_acc_rollout_kept=0.0000
190
+ step=200 epoch=200/500 epoch_step=1/1 micro_steps=200 elapsed=7.5s lr=2.000000e-03 loss=6.0920 loss_recon=6.0920 loss_meanflow=0.0000 mean_model_t=0.2108 mean_corrupt_t=0.2108 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=1.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.1116 corrupt_frac=1.0000 acc_corrupt=0.1116 loss_corrupt=6.0920 wrong_frac=0.7892 init_acc_corrupt=0.1190 acc_corrupt_t_0p0_0p2=0.0551 corrupt_frac_t_0p0_0p2=0.5516 acc_corrupt_t_0p2_0p4=0.1512 corrupt_frac_t_0p2_0p4=0.3621 acc_corrupt_t_0p4_0p6=0.2945 corrupt_frac_t_0p4_0p6=0.0781 acc_corrupt_t_0p6_0p8=0.4229 corrupt_frac_t_0p6_0p8=0.0123 out_w_norm=3.3334 out_g_norm=1.4060 acc_corrupt_t_0p8_1p0=0.4766 corrupt_frac_t_0p8_1p0=0.0078 loss_all=5.8240 init_gold_top10=0.2049 init_gold_top100=0.5963 rollout_applied_pos_frac=1.0000 init_acc_rollout_applied=0.1107 init_acc_rollout_kept=0.0000 logit_acc_rollout_applied=0.1096 logit_acc_rollout_kept=0.0000
191
+ step=300 epoch=300/500 epoch_step=1/1 micro_steps=300 elapsed=7.4s lr=2.000000e-03 loss=5.5560 loss_recon=5.5560 loss_meanflow=0.0000 mean_model_t=0.2067 mean_corrupt_t=0.2067 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=1.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.1210 corrupt_frac=1.0000 acc_corrupt=0.1210 loss_corrupt=5.5560 wrong_frac=0.7935 init_acc_corrupt=0.1153 acc_corrupt_t_0p0_0p2=0.0590 corrupt_frac_t_0p0_0p2=0.5641 acc_corrupt_t_0p2_0p4=0.1694 corrupt_frac_t_0p2_0p4=0.3542 acc_corrupt_t_0p4_0p6=0.3234 corrupt_frac_t_0p4_0p6=0.0734 acc_corrupt_t_0p6_0p8=0.4773 corrupt_frac_t_0p6_0p8=0.0121 out_w_norm=5.2201 out_g_norm=0.7125 acc_corrupt_t_0p8_1p0=0.6380 corrupt_frac_t_0p8_1p0=0.0078 loss_all=5.2826 init_gold_top10=0.2209 init_gold_top100=0.6553 rollout_applied_pos_frac=1.0000 init_acc_rollout_applied=0.1227 init_acc_rollout_kept=0.0000 logit_acc_rollout_applied=0.1303 logit_acc_rollout_kept=0.0000
192
+ step=400 epoch=400/500 epoch_step=1/1 micro_steps=400 elapsed=7.3s lr=2.000000e-03 loss=4.9781 loss_recon=4.9781 loss_meanflow=0.0000 mean_model_t=0.2085 mean_corrupt_t=0.2085 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=1.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.1483 corrupt_frac=1.0000 acc_corrupt=0.1483 loss_corrupt=4.9781 wrong_frac=0.7917 init_acc_corrupt=0.1181 acc_corrupt_t_0p0_0p2=0.0642 corrupt_frac_t_0p0_0p2=0.5564 acc_corrupt_t_0p2_0p4=0.2056 corrupt_frac_t_0p2_0p4=0.3620 acc_corrupt_t_0p4_0p6=0.4439 corrupt_frac_t_0p4_0p6=0.0719 out_w_norm=6.9063 out_g_norm=0.4180 acc_corrupt_t_0p6_0p8=0.6502 corrupt_frac_t_0p6_0p8=0.0131 acc_corrupt_t_0p8_1p0=0.7422 corrupt_frac_t_0p8_1p0=0.0078 loss_all=4.7369 init_gold_top10=0.2113 init_gold_top100=0.7779 rollout_applied_pos_frac=1.0000 init_acc_rollout_applied=0.1016 init_acc_rollout_kept=0.0000 logit_acc_rollout_applied=0.1530 logit_acc_rollout_kept=0.0000
193
+ step=500 epoch=500/500 epoch_step=1/1 micro_steps=500 elapsed=7.3s lr=2.000000e-03 loss=4.1805 loss_recon=4.1805 loss_meanflow=0.0000 mean_model_t=0.2071 mean_corrupt_t=0.2071 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=1.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.1890 corrupt_frac=1.0000 acc_corrupt=0.1890 loss_corrupt=4.1805 wrong_frac=0.7928 init_acc_corrupt=0.1178 acc_corrupt_t_0p0_0p2=0.0758 corrupt_frac_t_0p0_0p2=0.5632 acc_corrupt_t_0p2_0p4=0.2839 corrupt_frac_t_0p2_0p4=0.3546 acc_corrupt_t_0p4_0p6=0.5403 corrupt_frac_t_0p4_0p6=0.0745 acc_corrupt_t_0p6_0p8=0.7033 corrupt_frac_t_0p6_0p8=0.0118 acc_corrupt_t_0p8_1p0=0.8555 corrupt_frac_t_0p8_1p0=0.0078 out_w_norm=8.4044 out_g_norm=0.4606 loss_all=3.6550 init_gold_top10=0.2562 init_gold_top100=0.9090 rollout_applied_pos_frac=1.0000 init_acc_rollout_applied=0.1274 init_acc_rollout_kept=0.0000 logit_acc_rollout_applied=0.2250 logit_acc_rollout_kept=0.0000
LTA_openwebtext_dualt/logs/softendpoint_mn_pilot_4gpu/train8_rollin_len1024_rollin_p50_s4_i32_20260517_1840ctx1024.log ADDED
@@ -0,0 +1,397 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ NCCL version 2.25.1+cuda12.8
2
+ {
3
+ "device": "cuda:0",
4
+ "rank": 0,
5
+ "world_size": 4,
6
+ "samples": "owt_cached_chunks:8",
7
+ "vocab_size": 2664,
8
+ "tokenizer_vocab_size": 50257,
9
+ "save_dir": "runs/train8_rollin_len1024_rollin_p50_s4_i32_20260517_1840ctx1024",
10
+ "batch_size": 32,
11
+ "grad_accum": 4,
12
+ "effective_batch_size": 512,
13
+ "global_batch_size": 512,
14
+ "lr_schedule": "constant_warmup",
15
+ "optimizer": "muon",
16
+ "epochs": 0.0,
17
+ "steps_per_epoch": 1,
18
+ "total_steps": 1000,
19
+ "warmup_steps": 10,
20
+ "warmup_epochs": -1.0,
21
+ "min_lr": 0.0,
22
+ "weight_decay": 0.1,
23
+ "output_weight_decay": -1.0,
24
+ "adamw_param_groups": "nanogpt",
25
+ "adam_beta1": 0.9,
26
+ "adam_beta2": 0.95,
27
+ "adam_eps": 1e-08,
28
+ "muon_impl": "legacy",
29
+ "muon_momentum": 0.95,
30
+ "muon_ns_steps": 5,
31
+ "muon_update_scale": 1.0,
32
+ "muon_nesterov": false,
33
+ "muon_width_scale": false,
34
+ "muon_grouping": "legacy_dim_ge_2",
35
+ "muon_param_count": 2616320,
36
+ "muon_adam_param_count": 8192,
37
+ "muon_param_names": [
38
+ "vocab_embed.embedding",
39
+ "sigma_map.net.0.weight",
40
+ "sigma_map.net.2.weight",
41
+ "blocks.0.attn_qkv.weight",
42
+ "blocks.0.attn_out.weight",
43
+ "blocks.0.mlp.0.weight",
44
+ "blocks.0.mlp.2.weight",
45
+ "blocks.0.adaLN_modulation.weight",
46
+ "blocks.1.attn_qkv.weight",
47
+ "blocks.1.attn_out.weight",
48
+ "blocks.1.mlp.0.weight",
49
+ "blocks.1.mlp.2.weight",
50
+ "blocks.1.adaLN_modulation.weight",
51
+ "blocks.2.attn_qkv.weight",
52
+ "blocks.2.attn_out.weight",
53
+ "blocks.2.mlp.0.weight",
54
+ "blocks.2.mlp.2.weight",
55
+ "blocks.2.adaLN_modulation.weight",
56
+ "output_layer.linear.weight",
57
+ "output_layer.adaLN_modulation.weight"
58
+ ],
59
+ "muon_adam_param_names": [
60
+ "sigma_map.net.0.bias",
61
+ "sigma_map.net.2.bias",
62
+ "blocks.0.norm1.weight",
63
+ "blocks.0.norm2.weight",
64
+ "blocks.0.mlp.0.bias",
65
+ "blocks.0.mlp.2.bias",
66
+ "blocks.0.adaLN_modulation.bias",
67
+ "blocks.1.norm1.weight",
68
+ "blocks.1.norm2.weight",
69
+ "blocks.1.mlp.0.bias",
70
+ "blocks.1.mlp.2.bias",
71
+ "blocks.1.adaLN_modulation.bias",
72
+ "blocks.2.norm1.weight",
73
+ "blocks.2.norm2.weight",
74
+ "blocks.2.mlp.0.bias",
75
+ "blocks.2.mlp.2.bias",
76
+ "blocks.2.adaLN_modulation.bias",
77
+ "output_layer.norm_final.weight",
78
+ "output_layer.adaLN_modulation.bias"
79
+ ],
80
+ "muon_effective_nesterov": false,
81
+ "muon_effective_width_scale": false,
82
+ "muon_effective_weight_decay": 0.1,
83
+ "muon_adam_fallback_nesterov": false,
84
+ "muon_adam_fallback_weight_decay": 0.1,
85
+ "ema_decay": 0.9999,
86
+ "ema_start_step": 0,
87
+ "model_type": "ddit",
88
+ "ddit_mlp_type": "gelu",
89
+ "elf_num_time_tokens": 4,
90
+ "elf_num_model_mode_tokens": 0,
91
+ "qk_norm": true,
92
+ "output_bias": false,
93
+ "output_init_std": -1.0,
94
+ "norm_type": "rmsnorm",
95
+ "target_loss": "hard_ce",
96
+ "linear_soft_target_power": 1.0,
97
+ "linear_soft_target_min_conf": 0.0,
98
+ "linear_soft_target_max_conf": 1.0,
99
+ "t_sampling_mode": "logit_normal",
100
+ "t_sampling_power": 1.0,
101
+ "t_sampling_eps": 0.0001,
102
+ "t_sampling_logit_mean": -1.5,
103
+ "t_sampling_logit_std": 0.8,
104
+ "dual_t": true,
105
+ "corrupt_t_mode": "same",
106
+ "corrupt_min_t": 0.0,
107
+ "corrupt_max_t": 1.0,
108
+ "prefix_block_prob": 0.0,
109
+ "prefix_block_len": 128,
110
+ "mask_ratio_floor_schedule": "none",
111
+ "dirichlet_endpoint_mode": "categorical_dual_t",
112
+ "dirichlet_semantic_t_mode": "same",
113
+ "dirichlet_semantic_t_value": 0.0,
114
+ "dirichlet_semantic_t_curve": "linear",
115
+ "dirichlet_semantic_t_power": 1.0,
116
+ "endpoint_sequence_random_prob_alpha": 0.0,
117
+ "categorical_wrong_from_full_vocab": true,
118
+ "categorical_wrong_from_batch_valid_tokens": false,
119
+ "categorical_wrong_basin_token_ids": "",
120
+ "categorical_wrong_basin_prob": 0.0,
121
+ "categorical_wrong_unigram_prob": 0.0,
122
+ "categorical_wrong_uniform_prob": 0.0,
123
+ "categorical_wrong_prob_floor": 0.0,
124
+ "categorical_wrong_corpus_unigram_path": "",
125
+ "categorical_wrong_corpus_unigram_alpha": 1.0,
126
+ "categorical_wrong_basin_shared_prob": 0.0,
127
+ "categorical_wrong_unigram_shared_prob": 0.0,
128
+ "mask_mixture_original_prob": 0.0,
129
+ "mask_mixture_lowk_prob": 0.0,
130
+ "mask_mixture_lowcorrupt_prob": 0.0,
131
+ "mask_mixture_block_prob": 0.0,
132
+ "mask_mixture_all_prob": 1.0,
133
+ "mask_mixture_lowk_clean_tokens": "0",
134
+ "mask_mixture_lowcorrupt_tokens": "1,2,4,8,16,32,64",
135
+ "mask_mixture_block_tokens": "64,128",
136
+ "simplex_bridge_sampler": "dirichlet",
137
+ "logistic_normal_sigma_min": 0.1,
138
+ "logistic_normal_sigma_max": 1.0,
139
+ "logistic_normal_tau_min": 1.0,
140
+ "logistic_normal_tau_max": 1.0,
141
+ "torch_compile": false,
142
+ "compile_mode": "max-autotune",
143
+ "state_format": "prob",
144
+ "meanflow_weight": 0.0,
145
+ "rollout_train_prob": 0.5,
146
+ "rollout_train_steps": 4,
147
+ "rollout_train_infer_steps": 32,
148
+ "rollout_train_temp": 1.45,
149
+ "rollout_train_max_gamma": 1.0,
150
+ "rollout_train_corrupt_only": true,
151
+ "rollout_train_samplewise": true,
152
+ "rollout_train_compute_always": false,
153
+ "rollout_train_sync_t": false,
154
+ "bridge_noise_init": "logistic_normal",
155
+ "noise_sigma": -1.0,
156
+ "allow_tf32": true,
157
+ "activation_checkpointing": false,
158
+ "activation_checkpoint_interval": 1,
159
+ "activation_checkpoint_scope": "block",
160
+ "ddp_static_graph": false,
161
+ "ddp_gradient_as_bucket_view": true,
162
+ "blocking_data_transfer": false,
163
+ "dataloader_prefetch_factor": 4,
164
+ "full_train_stats": false,
165
+ "tokenized_hf": false,
166
+ "tokenized_pad_token": "pad",
167
+ "elf_conditional_hf": false,
168
+ "record_pad_truncate": false,
169
+ "record_add_eos": false,
170
+ "record_add_special_tokens": false,
171
+ "record_pad_token": "pad",
172
+ "record_shuffle_buffer": 10000,
173
+ "wrap": true,
174
+ "wrap_mode": "stream",
175
+ "wrap_record_buffer_size": 200,
176
+ "owt_cached_chunks": true,
177
+ "owt_chunk_cache_dir": "/e2e-data/evad-tech-vla/wanghan58/data/small_benchmarks/langflow_2604_11748/openwebtext_lta_cached_chunks/gpt2_len1024_train8_compact_overfit",
178
+ "owt_chunk_cache_rebuild": false,
179
+ "owt_chunk_cache_write_batch": 4096,
180
+ "owt_exact_repeat_per_chunk": 64,
181
+ "online_chunk_shuffle": false,
182
+ "online_chunk_shuffle_buffer": 10000,
183
+ "openwebtext_split": "train_minus_100k",
184
+ "detokenizer": "auto",
185
+ "resolved_detokenizer": null,
186
+ "num_workers": 0,
187
+ "latest_every": 1000,
188
+ "resume_path": ""
189
+ }
190
+ step=100 epoch=100/1000 epoch_step=1/1 micro_steps=400 elapsed=38.3s lr=2.000000e-03 loss=7.7211 loss_recon=7.7211 loss_meanflow=0.0000 mean_model_t=0.2092 mean_corrupt_t=0.2092 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.5055 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.0955 corrupt_frac=1.0000 acc_corrupt=0.0955 loss_corrupt=7.7211 wrong_frac=0.7908 init_acc_corrupt=0.1161 acc_corrupt_t_0p0_0p2=0.0500 corrupt_frac_t_0p0_0p2=0.5551 acc_corrupt_t_0p2_0p4=0.1263 corrupt_frac_t_0p2_0p4=0.3604 acc_corrupt_t_0p4_0p6=0.2508 corrupt_frac_t_0p4_0p6=0.0822 acc_corrupt_t_0p6_0p8=0.3566 corrupt_frac_t_0p6_0p8=0.0356 out_w_norm=1.0063 out_g_norm=0.6737 loss_all=7.5189 init_gold_top10=0.1519 init_gold_top100=0.4115 rollout_applied_pos_frac=0.5312 init_acc_rollout_applied=0.0816 init_acc_rollout_kept=0.0578 logit_acc_rollout_applied=0.0869 logit_acc_rollout_kept=0.0730
191
+ step=200 epoch=200/1000 epoch_step=1/1 micro_steps=800 elapsed=37.5s lr=2.000000e-03 loss=7.0875 loss_recon=7.0875 loss_meanflow=0.0000 mean_model_t=0.2087 mean_corrupt_t=0.2087 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.5035 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.1032 corrupt_frac=1.0000 acc_corrupt=0.1032 loss_corrupt=7.0875 wrong_frac=0.7913 init_acc_corrupt=0.1163 acc_corrupt_t_0p0_0p2=0.0556 corrupt_frac_t_0p0_0p2=0.5574 acc_corrupt_t_0p2_0p4=0.1398 corrupt_frac_t_0p2_0p4=0.3595 acc_corrupt_t_0p4_0p6=0.2566 corrupt_frac_t_0p4_0p6=0.0807 acc_corrupt_t_0p8_1p0=0.4570 corrupt_frac_t_0p8_1p0=0.0312 out_w_norm=2.8644 out_g_norm=1.0978 acc_corrupt_t_0p6_0p8=0.3368 corrupt_frac_t_0p6_0p8=0.0357 loss_all=6.7515 init_gold_top10=0.1885 init_gold_top100=0.4529 rollout_applied_pos_frac=0.5625 init_acc_rollout_applied=0.1125 init_acc_rollout_kept=0.1085 logit_acc_rollout_applied=0.1083 logit_acc_rollout_kept=0.1037
192
+ step=300 epoch=300/1000 epoch_step=1/1 micro_steps=1200 elapsed=37.5s lr=2.000000e-03 loss=6.4511 loss_recon=6.4511 loss_meanflow=0.0000 mean_model_t=0.2118 mean_corrupt_t=0.2118 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.4957 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.1155 corrupt_frac=1.0000 acc_corrupt=0.1155 loss_corrupt=6.4511 wrong_frac=0.7884 init_acc_corrupt=0.1212 acc_corrupt_t_0p0_0p2=0.0595 corrupt_frac_t_0p0_0p2=0.5476 acc_corrupt_t_0p2_0p4=0.1565 corrupt_frac_t_0p2_0p4=0.3645 acc_corrupt_t_0p4_0p6=0.2815 corrupt_frac_t_0p4_0p6=0.0835 out_w_norm=4.3617 out_g_norm=0.8178 acc_corrupt_t_0p6_0p8=0.3852 corrupt_frac_t_0p6_0p8=0.0366 acc_corrupt_t_0p8_1p0=0.5366 corrupt_frac_t_0p8_1p0=0.0312 loss_all=6.2349 init_gold_top10=0.1833 init_gold_top100=0.4322 rollout_applied_pos_frac=0.4375 init_acc_rollout_applied=0.1633 init_acc_rollout_kept=0.0588 logit_acc_rollout_applied=0.1432 logit_acc_rollout_kept=0.0882
193
+ step=400 epoch=400/1000 epoch_step=1/1 micro_steps=1600 elapsed=37.5s lr=2.000000e-03 loss=5.9906 loss_recon=5.9906 loss_meanflow=0.0000 mean_model_t=0.2091 mean_corrupt_t=0.2091 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.5023 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.1250 corrupt_frac=1.0000 acc_corrupt=0.1250 loss_corrupt=5.9906 wrong_frac=0.7911 init_acc_corrupt=0.1188 acc_corrupt_t_0p0_0p2=0.0629 corrupt_frac_t_0p0_0p2=0.5556 acc_corrupt_t_0p2_0p4=0.1717 corrupt_frac_t_0p2_0p4=0.3572 acc_corrupt_t_0p4_0p6=0.3178 corrupt_frac_t_0p4_0p6=0.0865 out_w_norm=5.4987 out_g_norm=0.3310 acc_corrupt_t_0p6_0p8=0.4333 corrupt_frac_t_0p6_0p8=0.0366 acc_corrupt_t_0p8_1p0=0.5137 corrupt_frac_t_0p8_1p0=0.0312 loss_all=5.7675 init_gold_top10=0.1940 init_gold_top100=0.4395 rollout_applied_pos_frac=0.4062 init_acc_rollout_applied=0.1511 init_acc_rollout_kept=0.0853 logit_acc_rollout_applied=0.1493 logit_acc_rollout_kept=0.1149
194
+ step=500 epoch=500/1000 epoch_step=1/1 micro_steps=2000 elapsed=37.4s lr=2.000000e-03 loss=5.4943 loss_recon=5.4943 loss_meanflow=0.0000 mean_model_t=0.2104 mean_corrupt_t=0.2104 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.5032 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.1383 corrupt_frac=1.0000 acc_corrupt=0.1383 loss_corrupt=5.4943 wrong_frac=0.7895 init_acc_corrupt=0.1195 acc_corrupt_t_0p0_0p2=0.0679 corrupt_frac_t_0p0_0p2=0.5505 acc_corrupt_t_0p2_0p4=0.1911 corrupt_frac_t_0p2_0p4=0.3658 acc_corrupt_t_0p4_0p6=0.3549 corrupt_frac_t_0p4_0p6=0.0809 out_w_norm=6.7494 out_g_norm=0.2532 acc_corrupt_t_0p6_0p8=0.4935 corrupt_frac_t_0p6_0p8=0.0353 acc_corrupt_t_0p8_1p0=0.6230 corrupt_frac_t_0p8_1p0=0.0312 loss_all=5.4116 init_gold_top10=0.1764 init_gold_top100=0.4898 rollout_applied_pos_frac=0.5312 init_acc_rollout_applied=0.0849 init_acc_rollout_kept=0.1106 logit_acc_rollout_applied=0.1191 logit_acc_rollout_kept=0.1371
195
+ step=600 epoch=600/1000 epoch_step=1/1 micro_steps=2400 elapsed=37.4s lr=2.000000e-03 loss=4.8848 loss_recon=4.8848 loss_meanflow=0.0000 mean_model_t=0.2096 mean_corrupt_t=0.2096 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.4970 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.1534 corrupt_frac=1.0000 acc_corrupt=0.1534 loss_corrupt=4.8848 wrong_frac=0.7905 init_acc_corrupt=0.1196 acc_corrupt_t_0p0_0p2=0.0718 corrupt_frac_t_0p0_0p2=0.5511 acc_corrupt_t_0p2_0p4=0.2144 corrupt_frac_t_0p2_0p4=0.3627 out_w_norm=8.0570 out_g_norm=0.2940 acc_corrupt_t_0p4_0p6=0.3997 corrupt_frac_t_0p4_0p6=0.0831 acc_corrupt_t_0p6_0p8=0.5749 corrupt_frac_t_0p6_0p8=0.0372 acc_corrupt_t_0p8_1p0=0.7373 corrupt_frac_t_0p8_1p0=0.0312 loss_all=4.4114 init_gold_top10=0.2211 init_gold_top100=0.6172 rollout_applied_pos_frac=0.5312 init_acc_rollout_applied=0.1576 init_acc_rollout_kept=0.0989 logit_acc_rollout_applied=0.2024 logit_acc_rollout_kept=0.1540
196
+ step=700 epoch=700/1000 epoch_step=1/1 micro_steps=2800 elapsed=37.5s lr=2.000000e-03 loss=4.2577 loss_recon=4.2577 loss_meanflow=0.0000 mean_model_t=0.2102 mean_corrupt_t=0.2102 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.4981 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.1793 corrupt_frac=1.0000 acc_corrupt=0.1793 loss_corrupt=4.2577 wrong_frac=0.7899 init_acc_corrupt=0.1206 acc_corrupt_t_0p0_0p2=0.0785 corrupt_frac_t_0p0_0p2=0.5530 acc_corrupt_t_0p2_0p4=0.2532 corrupt_frac_t_0p2_0p4=0.3584 acc_corrupt_t_0p4_0p6=0.4900 corrupt_frac_t_0p4_0p6=0.0863 acc_corrupt_t_0p6_0p8=0.6753 corrupt_frac_t_0p6_0p8=0.0361 out_w_norm=9.2749 out_g_norm=0.3343 acc_corrupt_t_0p8_1p0=0.8145 corrupt_frac_t_0p8_1p0=0.0312 loss_all=4.1313 init_gold_top10=0.2126 init_gold_top100=0.5775 rollout_applied_pos_frac=0.4688 init_acc_rollout_applied=0.1626 init_acc_rollout_kept=0.1093 logit_acc_rollout_applied=0.2192 logit_acc_rollout_kept=0.1784
197
+ step=800 epoch=800/1000 epoch_step=1/1 micro_steps=3200 elapsed=37.5s lr=2.000000e-03 loss=3.7527 loss_recon=3.7527 loss_meanflow=0.0000 mean_model_t=0.2096 mean_corrupt_t=0.2096 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.5079 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.2030 corrupt_frac=1.0000 acc_corrupt=0.2030 loss_corrupt=3.7527 wrong_frac=0.7903 init_acc_corrupt=0.1207 acc_corrupt_t_0p0_0p2=0.0879 corrupt_frac_t_0p0_0p2=0.5517 acc_corrupt_t_0p2_0p4=0.2965 corrupt_frac_t_0p2_0p4=0.3616 acc_corrupt_t_0p4_0p6=0.5306 corrupt_frac_t_0p4_0p6=0.0850 out_w_norm=10.1655 out_g_norm=0.3946 acc_corrupt_t_0p6_0p8=0.7050 corrupt_frac_t_0p6_0p8=0.0338 acc_corrupt_t_0p8_1p0=0.8413 corrupt_frac_t_0p8_1p0=0.0312 loss_all=3.6608 init_gold_top10=0.2162 init_gold_top100=0.6223 rollout_applied_pos_frac=0.5312 init_acc_rollout_applied=0.1540 init_acc_rollout_kept=0.0570 logit_acc_rollout_applied=0.2421 logit_acc_rollout_kept=0.1555
198
+ step=900 epoch=900/1000 epoch_step=1/1 micro_steps=3600 elapsed=37.4s lr=2.000000e-03 loss=3.3035 loss_recon=3.3035 loss_meanflow=0.0000 mean_model_t=0.2108 mean_corrupt_t=0.2108 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.4956 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.2399 corrupt_frac=1.0000 acc_corrupt=0.2399 loss_corrupt=3.3035 wrong_frac=0.7892 init_acc_corrupt=0.1235 acc_corrupt_t_0p0_0p2=0.1037 corrupt_frac_t_0p0_0p2=0.5547 acc_corrupt_t_0p2_0p4=0.3581 corrupt_frac_t_0p2_0p4=0.3554 acc_corrupt_t_0p4_0p6=0.5989 corrupt_frac_t_0p4_0p6=0.0877 out_w_norm=10.7195 out_g_norm=0.5110 acc_corrupt_t_0p6_0p8=0.7433 corrupt_frac_t_0p6_0p8=0.0358 acc_corrupt_t_0p8_1p0=0.8540 corrupt_frac_t_0p8_1p0=0.0312 loss_all=3.0285 init_gold_top10=0.2676 init_gold_top100=0.6202 rollout_applied_pos_frac=0.4375 init_acc_rollout_applied=0.1009 init_acc_rollout_kept=0.1176 logit_acc_rollout_applied=0.2540 logit_acc_rollout_kept=0.2764
199
+ step=1000 epoch=1000/1000 epoch_step=1/1 micro_steps=4000 elapsed=37.5s lr=2.000000e-03 loss=2.8950 loss_recon=2.8950 loss_meanflow=0.0000 mean_model_t=0.2090 mean_corrupt_t=0.2090 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.4988 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.2938 corrupt_frac=1.0000 acc_corrupt=0.2938 loss_corrupt=2.8950 wrong_frac=0.7911 init_acc_corrupt=0.1224 acc_corrupt_t_0p0_0p2=0.1292 corrupt_frac_t_0p0_0p2=0.5598 acc_corrupt_t_0p2_0p4=0.4540 corrupt_frac_t_0p2_0p4=0.3541 acc_corrupt_t_0p4_0p6=0.6939 corrupt_frac_t_0p4_0p6=0.0843 out_w_norm=11.0862 out_g_norm=0.6754 acc_corrupt_t_0p6_0p8=0.8143 corrupt_frac_t_0p6_0p8=0.0341 acc_corrupt_t_0p8_1p0=0.9004 corrupt_frac_t_0p8_1p0=0.0312 loss_all=3.0296 init_gold_top10=0.2572 init_gold_top100=0.5604 rollout_applied_pos_frac=0.3750 init_acc_rollout_applied=0.0764 init_acc_rollout_kept=0.1180 logit_acc_rollout_applied=0.2254 logit_acc_rollout_kept=0.3271
200
+ NCCL version 2.25.1+cuda12.8
201
+ resumed_from=runs/train8_rollin_len1024_rollin_p50_s4_i32_20260517_1840ctx1024/latest.pt start_step=1001
202
+ {
203
+ "device": "cuda:0",
204
+ "rank": 0,
205
+ "world_size": 4,
206
+ "samples": "owt_cached_chunks:8",
207
+ "vocab_size": 2664,
208
+ "tokenizer_vocab_size": 50257,
209
+ "save_dir": "runs/train8_rollin_len1024_rollin_p50_s4_i32_20260517_1840ctx1024",
210
+ "batch_size": 32,
211
+ "grad_accum": 4,
212
+ "effective_batch_size": 512,
213
+ "global_batch_size": 512,
214
+ "lr_schedule": "constant_warmup",
215
+ "optimizer": "muon",
216
+ "epochs": 0.0,
217
+ "steps_per_epoch": 1,
218
+ "total_steps": 2000,
219
+ "warmup_steps": 10,
220
+ "warmup_epochs": -1.0,
221
+ "min_lr": 0.0,
222
+ "weight_decay": 0.1,
223
+ "output_weight_decay": -1.0,
224
+ "adamw_param_groups": "nanogpt",
225
+ "adam_beta1": 0.9,
226
+ "adam_beta2": 0.95,
227
+ "adam_eps": 1e-08,
228
+ "muon_impl": "legacy",
229
+ "muon_momentum": 0.95,
230
+ "muon_ns_steps": 5,
231
+ "muon_update_scale": 1.0,
232
+ "muon_nesterov": false,
233
+ "muon_width_scale": false,
234
+ "muon_grouping": "legacy_dim_ge_2",
235
+ "muon_param_count": 2616320,
236
+ "muon_adam_param_count": 8192,
237
+ "muon_param_names": [
238
+ "vocab_embed.embedding",
239
+ "sigma_map.net.0.weight",
240
+ "sigma_map.net.2.weight",
241
+ "blocks.0.attn_qkv.weight",
242
+ "blocks.0.attn_out.weight",
243
+ "blocks.0.mlp.0.weight",
244
+ "blocks.0.mlp.2.weight",
245
+ "blocks.0.adaLN_modulation.weight",
246
+ "blocks.1.attn_qkv.weight",
247
+ "blocks.1.attn_out.weight",
248
+ "blocks.1.mlp.0.weight",
249
+ "blocks.1.mlp.2.weight",
250
+ "blocks.1.adaLN_modulation.weight",
251
+ "blocks.2.attn_qkv.weight",
252
+ "blocks.2.attn_out.weight",
253
+ "blocks.2.mlp.0.weight",
254
+ "blocks.2.mlp.2.weight",
255
+ "blocks.2.adaLN_modulation.weight",
256
+ "output_layer.linear.weight",
257
+ "output_layer.adaLN_modulation.weight"
258
+ ],
259
+ "muon_adam_param_names": [
260
+ "sigma_map.net.0.bias",
261
+ "sigma_map.net.2.bias",
262
+ "blocks.0.norm1.weight",
263
+ "blocks.0.norm2.weight",
264
+ "blocks.0.mlp.0.bias",
265
+ "blocks.0.mlp.2.bias",
266
+ "blocks.0.adaLN_modulation.bias",
267
+ "blocks.1.norm1.weight",
268
+ "blocks.1.norm2.weight",
269
+ "blocks.1.mlp.0.bias",
270
+ "blocks.1.mlp.2.bias",
271
+ "blocks.1.adaLN_modulation.bias",
272
+ "blocks.2.norm1.weight",
273
+ "blocks.2.norm2.weight",
274
+ "blocks.2.mlp.0.bias",
275
+ "blocks.2.mlp.2.bias",
276
+ "blocks.2.adaLN_modulation.bias",
277
+ "output_layer.norm_final.weight",
278
+ "output_layer.adaLN_modulation.bias"
279
+ ],
280
+ "muon_effective_nesterov": false,
281
+ "muon_effective_width_scale": false,
282
+ "muon_effective_weight_decay": 0.1,
283
+ "muon_adam_fallback_nesterov": false,
284
+ "muon_adam_fallback_weight_decay": 0.1,
285
+ "ema_decay": 0.9999,
286
+ "ema_start_step": 0,
287
+ "model_type": "ddit",
288
+ "ddit_mlp_type": "gelu",
289
+ "elf_num_time_tokens": 4,
290
+ "elf_num_model_mode_tokens": 0,
291
+ "qk_norm": true,
292
+ "output_bias": false,
293
+ "output_init_std": -1.0,
294
+ "norm_type": "rmsnorm",
295
+ "target_loss": "hard_ce",
296
+ "linear_soft_target_power": 1.0,
297
+ "linear_soft_target_min_conf": 0.0,
298
+ "linear_soft_target_max_conf": 1.0,
299
+ "t_sampling_mode": "logit_normal",
300
+ "t_sampling_power": 1.0,
301
+ "t_sampling_eps": 0.0001,
302
+ "t_sampling_logit_mean": -1.5,
303
+ "t_sampling_logit_std": 0.8,
304
+ "dual_t": true,
305
+ "corrupt_t_mode": "same",
306
+ "corrupt_min_t": 0.0,
307
+ "corrupt_max_t": 1.0,
308
+ "prefix_block_prob": 0.0,
309
+ "prefix_block_len": 128,
310
+ "mask_ratio_floor_schedule": "none",
311
+ "dirichlet_endpoint_mode": "categorical_dual_t",
312
+ "dirichlet_semantic_t_mode": "same",
313
+ "dirichlet_semantic_t_value": 0.0,
314
+ "dirichlet_semantic_t_curve": "linear",
315
+ "dirichlet_semantic_t_power": 1.0,
316
+ "endpoint_sequence_random_prob_alpha": 0.0,
317
+ "categorical_wrong_from_full_vocab": true,
318
+ "categorical_wrong_from_batch_valid_tokens": false,
319
+ "categorical_wrong_basin_token_ids": "",
320
+ "categorical_wrong_basin_prob": 0.0,
321
+ "categorical_wrong_unigram_prob": 0.0,
322
+ "categorical_wrong_uniform_prob": 0.0,
323
+ "categorical_wrong_prob_floor": 0.0,
324
+ "categorical_wrong_corpus_unigram_path": "",
325
+ "categorical_wrong_corpus_unigram_alpha": 1.0,
326
+ "categorical_wrong_basin_shared_prob": 0.0,
327
+ "categorical_wrong_unigram_shared_prob": 0.0,
328
+ "mask_mixture_original_prob": 0.0,
329
+ "mask_mixture_lowk_prob": 0.0,
330
+ "mask_mixture_lowcorrupt_prob": 0.0,
331
+ "mask_mixture_block_prob": 0.0,
332
+ "mask_mixture_all_prob": 1.0,
333
+ "mask_mixture_lowk_clean_tokens": "0",
334
+ "mask_mixture_lowcorrupt_tokens": "1,2,4,8,16,32,64",
335
+ "mask_mixture_block_tokens": "64,128",
336
+ "simplex_bridge_sampler": "dirichlet",
337
+ "logistic_normal_sigma_min": 0.1,
338
+ "logistic_normal_sigma_max": 1.0,
339
+ "logistic_normal_tau_min": 1.0,
340
+ "logistic_normal_tau_max": 1.0,
341
+ "torch_compile": false,
342
+ "compile_mode": "max-autotune",
343
+ "state_format": "prob",
344
+ "meanflow_weight": 0.0,
345
+ "rollout_train_prob": 0.5,
346
+ "rollout_train_steps": 4,
347
+ "rollout_train_infer_steps": 32,
348
+ "rollout_train_temp": 1.45,
349
+ "rollout_train_max_gamma": 1.0,
350
+ "rollout_train_corrupt_only": true,
351
+ "rollout_train_samplewise": true,
352
+ "rollout_train_compute_always": false,
353
+ "rollout_train_sync_t": false,
354
+ "bridge_noise_init": "logistic_normal",
355
+ "noise_sigma": -1.0,
356
+ "allow_tf32": true,
357
+ "activation_checkpointing": false,
358
+ "activation_checkpoint_interval": 1,
359
+ "activation_checkpoint_scope": "block",
360
+ "ddp_static_graph": false,
361
+ "ddp_gradient_as_bucket_view": true,
362
+ "blocking_data_transfer": false,
363
+ "dataloader_prefetch_factor": 4,
364
+ "full_train_stats": false,
365
+ "tokenized_hf": false,
366
+ "tokenized_pad_token": "pad",
367
+ "elf_conditional_hf": false,
368
+ "record_pad_truncate": false,
369
+ "record_add_eos": false,
370
+ "record_add_special_tokens": false,
371
+ "record_pad_token": "pad",
372
+ "record_shuffle_buffer": 10000,
373
+ "wrap": true,
374
+ "wrap_mode": "stream",
375
+ "wrap_record_buffer_size": 200,
376
+ "owt_cached_chunks": true,
377
+ "owt_chunk_cache_dir": "/e2e-data/evad-tech-vla/wanghan58/data/small_benchmarks/langflow_2604_11748/openwebtext_lta_cached_chunks/gpt2_len1024_train8_compact_overfit",
378
+ "owt_chunk_cache_rebuild": false,
379
+ "owt_chunk_cache_write_batch": 4096,
380
+ "owt_exact_repeat_per_chunk": 64,
381
+ "online_chunk_shuffle": false,
382
+ "online_chunk_shuffle_buffer": 10000,
383
+ "openwebtext_split": "train_minus_100k",
384
+ "detokenizer": "auto",
385
+ "resolved_detokenizer": null,
386
+ "num_workers": 0,
387
+ "latest_every": 1000,
388
+ "resume_path": "runs/train8_rollin_len1024_rollin_p50_s4_i32_20260517_1840ctx1024/latest.pt"
389
+ }
390
+ step=1100 epoch=1100/2000 epoch_step=1/1 micro_steps=4400 elapsed=38.3s lr=2.000000e-03 loss=2.4945 loss_recon=2.4945 loss_meanflow=0.0000 mean_model_t=0.2092 mean_corrupt_t=0.2092 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.5055 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.3683 corrupt_frac=1.0000 acc_corrupt=0.3683 loss_corrupt=2.4945 wrong_frac=0.7908 init_acc_corrupt=0.1240 acc_corrupt_t_0p0_0p2=0.1675 corrupt_frac_t_0p0_0p2=0.5551 acc_corrupt_t_0p2_0p4=0.5746 corrupt_frac_t_0p2_0p4=0.3604 acc_corrupt_t_0p4_0p6=0.7983 corrupt_frac_t_0p4_0p6=0.0822 acc_corrupt_t_0p6_0p8=0.8784 corrupt_frac_t_0p6_0p8=0.0356 out_w_norm=11.3758 out_g_norm=0.7552 loss_all=2.5911 init_gold_top10=0.3336 init_gold_top100=0.6645 rollout_applied_pos_frac=0.5312 init_acc_rollout_applied=0.0988 init_acc_rollout_kept=0.0578 logit_acc_rollout_applied=0.3668 logit_acc_rollout_kept=0.3079
391
+ step=1200 epoch=1200/2000 epoch_step=1/1 micro_steps=4800 elapsed=37.6s lr=2.000000e-03 loss=2.1611 loss_recon=2.1611 loss_meanflow=0.0000 mean_model_t=0.2087 mean_corrupt_t=0.2087 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.5035 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.4426 corrupt_frac=1.0000 acc_corrupt=0.4426 loss_corrupt=2.1611 wrong_frac=0.7913 init_acc_corrupt=0.1255 acc_corrupt_t_0p0_0p2=0.2089 corrupt_frac_t_0p0_0p2=0.5574 acc_corrupt_t_0p2_0p4=0.7016 corrupt_frac_t_0p2_0p4=0.3595 acc_corrupt_t_0p4_0p6=0.8862 corrupt_frac_t_0p4_0p6=0.0807 acc_corrupt_t_0p8_1p0=0.9312 corrupt_frac_t_0p8_1p0=0.0312 out_w_norm=11.6450 out_g_norm=0.8699 acc_corrupt_t_0p6_0p8=0.9271 corrupt_frac_t_0p6_0p8=0.0357 loss_all=2.2175 init_gold_top10=0.4112 init_gold_top100=0.6921 rollout_applied_pos_frac=0.5625 init_acc_rollout_applied=0.1286 init_acc_rollout_kept=0.1085 logit_acc_rollout_applied=0.4788 logit_acc_rollout_kept=0.4031
392
+ step=1300 epoch=1300/2000 epoch_step=1/1 micro_steps=5200 elapsed=37.7s lr=2.000000e-03 loss=1.8630 loss_recon=1.8630 loss_meanflow=0.0000 mean_model_t=0.2118 mean_corrupt_t=0.2118 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.4957 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.5171 corrupt_frac=1.0000 acc_corrupt=0.5171 loss_corrupt=1.8630 wrong_frac=0.7884 init_acc_corrupt=0.1321 acc_corrupt_t_0p0_0p2=0.2547 corrupt_frac_t_0p0_0p2=0.5476 acc_corrupt_t_0p2_0p4=0.8082 corrupt_frac_t_0p2_0p4=0.3645 acc_corrupt_t_0p4_0p6=0.9420 corrupt_frac_t_0p4_0p6=0.0835 out_w_norm=11.8612 out_g_norm=0.9278 acc_corrupt_t_0p6_0p8=0.9599 corrupt_frac_t_0p6_0p8=0.0366 acc_corrupt_t_0p8_1p0=0.9707 corrupt_frac_t_0p8_1p0=0.0312 loss_all=1.9265 init_gold_top10=0.4385 init_gold_top100=0.6085 rollout_applied_pos_frac=0.4375 init_acc_rollout_applied=0.1980 init_acc_rollout_kept=0.0588 logit_acc_rollout_applied=0.7255 logit_acc_rollout_kept=0.3106
393
+ step=1400 epoch=1400/2000 epoch_step=1/1 micro_steps=5600 elapsed=37.6s lr=2.000000e-03 loss=1.6395 loss_recon=1.6395 loss_meanflow=0.0000 mean_model_t=0.2091 mean_corrupt_t=0.2091 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.5023 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.5740 corrupt_frac=1.0000 acc_corrupt=0.5740 loss_corrupt=1.6395 wrong_frac=0.7911 init_acc_corrupt=0.1323 acc_corrupt_t_0p0_0p2=0.3145 corrupt_frac_t_0p0_0p2=0.5556 acc_corrupt_t_0p2_0p4=0.8804 corrupt_frac_t_0p2_0p4=0.3572 acc_corrupt_t_0p4_0p6=0.9724 corrupt_frac_t_0p4_0p6=0.0865 out_w_norm=12.0236 out_g_norm=0.9774 acc_corrupt_t_0p6_0p8=0.9795 corrupt_frac_t_0p6_0p8=0.0366 acc_corrupt_t_0p8_1p0=0.9873 corrupt_frac_t_0p8_1p0=0.0312 loss_all=1.7346 init_gold_top10=0.3990 init_gold_top100=0.5863 rollout_applied_pos_frac=0.4062 init_acc_rollout_applied=0.1877 init_acc_rollout_kept=0.0853 logit_acc_rollout_applied=0.6870 logit_acc_rollout_kept=0.4878
394
+ step=1500 epoch=1500/2000 epoch_step=1/1 micro_steps=6000 elapsed=37.6s lr=2.000000e-03 loss=1.3980 loss_recon=1.3980 loss_meanflow=0.0000 mean_model_t=0.2104 mean_corrupt_t=0.2104 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.5032 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.6327 corrupt_frac=1.0000 acc_corrupt=0.6327 loss_corrupt=1.3980 wrong_frac=0.7895 init_acc_corrupt=0.1356 acc_corrupt_t_0p0_0p2=0.3876 corrupt_frac_t_0p0_0p2=0.5505 acc_corrupt_t_0p2_0p4=0.9209 corrupt_frac_t_0p2_0p4=0.3658 acc_corrupt_t_0p4_0p6=0.9857 corrupt_frac_t_0p4_0p6=0.0809 out_w_norm=12.1409 out_g_norm=0.9930 acc_corrupt_t_0p6_0p8=0.9875 corrupt_frac_t_0p6_0p8=0.0353 acc_corrupt_t_0p8_1p0=0.9740 corrupt_frac_t_0p8_1p0=0.0312 loss_all=2.0586 init_gold_top10=0.4348 init_gold_top100=0.6795 rollout_applied_pos_frac=0.5312 init_acc_rollout_applied=0.1116 init_acc_rollout_kept=0.1106 logit_acc_rollout_applied=0.5888 logit_acc_rollout_kept=0.4600
395
+ step=1600 epoch=1600/2000 epoch_step=1/1 micro_steps=6400 elapsed=37.5s lr=2.000000e-03 loss=1.2548 loss_recon=1.2548 loss_meanflow=0.0000 mean_model_t=0.2096 mean_corrupt_t=0.2096 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.4970 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.6669 corrupt_frac=1.0000 acc_corrupt=0.6669 loss_corrupt=1.2548 wrong_frac=0.7905 init_acc_corrupt=0.1381 acc_corrupt_t_0p0_0p2=0.4306 corrupt_frac_t_0p0_0p2=0.5511 acc_corrupt_t_0p2_0p4=0.9487 corrupt_frac_t_0p2_0p4=0.3627 out_w_norm=12.2261 out_g_norm=0.9112 acc_corrupt_t_0p4_0p6=0.9921 corrupt_frac_t_0p4_0p6=0.0831 acc_corrupt_t_0p6_0p8=0.9925 corrupt_frac_t_0p6_0p8=0.0372 acc_corrupt_t_0p8_1p0=0.9829 corrupt_frac_t_0p8_1p0=0.0312 loss_all=1.0855 init_gold_top10=0.5554 init_gold_top100=0.6855 rollout_applied_pos_frac=0.5312 init_acc_rollout_applied=0.2027 init_acc_rollout_kept=0.0989 logit_acc_rollout_applied=0.8286 logit_acc_rollout_kept=0.5480
396
+ step=1700 epoch=1700/2000 epoch_step=1/1 micro_steps=6800 elapsed=37.3s lr=2.000000e-03 loss=1.1174 loss_recon=1.1174 loss_meanflow=0.0000 mean_model_t=0.2102 mean_corrupt_t=0.2102 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.4981 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.6984 corrupt_frac=1.0000 acc_corrupt=0.6984 loss_corrupt=1.1174 wrong_frac=0.7899 init_acc_corrupt=0.1409 acc_corrupt_t_0p0_0p2=0.4781 corrupt_frac_t_0p0_0p2=0.5530 acc_corrupt_t_0p2_0p4=0.9650 corrupt_frac_t_0p2_0p4=0.3584 acc_corrupt_t_0p4_0p6=0.9951 corrupt_frac_t_0p4_0p6=0.0863 acc_corrupt_t_0p6_0p8=0.9940 corrupt_frac_t_0p6_0p8=0.0361 out_w_norm=12.2909 out_g_norm=0.8889 acc_corrupt_t_0p8_1p0=0.9893 corrupt_frac_t_0p8_1p0=0.0312 loss_all=1.3472 init_gold_top10=0.4938 init_gold_top100=0.6382 rollout_applied_pos_frac=0.4688 init_acc_rollout_applied=0.2066 init_acc_rollout_kept=0.1093 logit_acc_rollout_applied=0.7921 logit_acc_rollout_kept=0.5591
397
+ step=1800 epoch=1800/2000 epoch_step=1/1 micro_steps=7200 elapsed=37.6s lr=2.000000e-03 loss=1.0126 loss_recon=1.0126 loss_meanflow=0.0000 mean_model_t=0.2096 mean_corrupt_t=0.2096 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.5079 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.7211 corrupt_frac=1.0000 acc_corrupt=0.7211 loss_corrupt=1.0126 wrong_frac=0.7903 init_acc_corrupt=0.1416 acc_corrupt_t_0p0_0p2=0.5124 corrupt_frac_t_0p0_0p2=0.5517 acc_corrupt_t_0p2_0p4=0.9736 corrupt_frac_t_0p2_0p4=0.3616 acc_corrupt_t_0p4_0p6=0.9970 corrupt_frac_t_0p4_0p6=0.0850 out_w_norm=12.3265 out_g_norm=0.8711 acc_corrupt_t_0p6_0p8=0.9957 corrupt_frac_t_0p6_0p8=0.0338 acc_corrupt_t_0p8_1p0=0.9839 corrupt_frac_t_0p8_1p0=0.0312 loss_all=1.1469 init_gold_top10=0.5062 init_gold_top100=0.6721 rollout_applied_pos_frac=0.5312 init_acc_rollout_applied=0.2005 init_acc_rollout_kept=0.0570 logit_acc_rollout_applied=0.7336 logit_acc_rollout_kept=0.5972
LTA_openwebtext_dualt/logs/softendpoint_mn_pilot_4gpu/train8_rollin_len1024_rollin_p50_s4_i32_20260517_1855ctx1024bs128.log ADDED
The diff for this file is too large to render. See raw diff
 
LTA_openwebtext_dualt/logs/softendpoint_mn_pilot_4gpu/train8_rollin_synct_len256_synct_p50_s8_i64_20260517_1800synct.log ADDED
@@ -0,0 +1,224 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ NCCL version 2.25.1+cuda12.8
2
+ {
3
+ "device": "cuda:0",
4
+ "rank": 0,
5
+ "world_size": 4,
6
+ "samples": "owt_cached_chunks:8",
7
+ "vocab_size": 969,
8
+ "tokenizer_vocab_size": 50257,
9
+ "save_dir": "runs/train8_rollin_synct_len256_synct_p50_s8_i64_20260517_1800synct",
10
+ "batch_size": 128,
11
+ "grad_accum": 1,
12
+ "effective_batch_size": 512,
13
+ "global_batch_size": 512,
14
+ "lr_schedule": "constant_warmup",
15
+ "optimizer": "muon",
16
+ "epochs": 0.0,
17
+ "steps_per_epoch": 1,
18
+ "total_steps": 500,
19
+ "warmup_steps": 10,
20
+ "warmup_epochs": -1.0,
21
+ "min_lr": 0.0,
22
+ "weight_decay": 0.1,
23
+ "output_weight_decay": -1.0,
24
+ "adamw_param_groups": "nanogpt",
25
+ "adam_beta1": 0.9,
26
+ "adam_beta2": 0.95,
27
+ "adam_eps": 1e-08,
28
+ "muon_impl": "legacy",
29
+ "muon_momentum": 0.95,
30
+ "muon_ns_steps": 5,
31
+ "muon_update_scale": 1.0,
32
+ "muon_nesterov": false,
33
+ "muon_width_scale": false,
34
+ "muon_grouping": "legacy_dim_ge_2",
35
+ "muon_param_count": 1965440,
36
+ "muon_adam_param_count": 8192,
37
+ "muon_param_names": [
38
+ "vocab_embed.embedding",
39
+ "sigma_map.net.0.weight",
40
+ "sigma_map.net.2.weight",
41
+ "blocks.0.attn_qkv.weight",
42
+ "blocks.0.attn_out.weight",
43
+ "blocks.0.mlp.0.weight",
44
+ "blocks.0.mlp.2.weight",
45
+ "blocks.0.adaLN_modulation.weight",
46
+ "blocks.1.attn_qkv.weight",
47
+ "blocks.1.attn_out.weight",
48
+ "blocks.1.mlp.0.weight",
49
+ "blocks.1.mlp.2.weight",
50
+ "blocks.1.adaLN_modulation.weight",
51
+ "blocks.2.attn_qkv.weight",
52
+ "blocks.2.attn_out.weight",
53
+ "blocks.2.mlp.0.weight",
54
+ "blocks.2.mlp.2.weight",
55
+ "blocks.2.adaLN_modulation.weight",
56
+ "output_layer.linear.weight",
57
+ "output_layer.adaLN_modulation.weight"
58
+ ],
59
+ "muon_adam_param_names": [
60
+ "sigma_map.net.0.bias",
61
+ "sigma_map.net.2.bias",
62
+ "blocks.0.norm1.weight",
63
+ "blocks.0.norm2.weight",
64
+ "blocks.0.mlp.0.bias",
65
+ "blocks.0.mlp.2.bias",
66
+ "blocks.0.adaLN_modulation.bias",
67
+ "blocks.1.norm1.weight",
68
+ "blocks.1.norm2.weight",
69
+ "blocks.1.mlp.0.bias",
70
+ "blocks.1.mlp.2.bias",
71
+ "blocks.1.adaLN_modulation.bias",
72
+ "blocks.2.norm1.weight",
73
+ "blocks.2.norm2.weight",
74
+ "blocks.2.mlp.0.bias",
75
+ "blocks.2.mlp.2.bias",
76
+ "blocks.2.adaLN_modulation.bias",
77
+ "output_layer.norm_final.weight",
78
+ "output_layer.adaLN_modulation.bias"
79
+ ],
80
+ "muon_effective_nesterov": false,
81
+ "muon_effective_width_scale": false,
82
+ "muon_effective_weight_decay": 0.1,
83
+ "muon_adam_fallback_nesterov": false,
84
+ "muon_adam_fallback_weight_decay": 0.1,
85
+ "ema_decay": 0.9999,
86
+ "ema_start_step": 0,
87
+ "model_type": "ddit",
88
+ "ddit_mlp_type": "gelu",
89
+ "elf_num_time_tokens": 4,
90
+ "elf_num_model_mode_tokens": 0,
91
+ "qk_norm": true,
92
+ "output_bias": false,
93
+ "output_init_std": -1.0,
94
+ "norm_type": "rmsnorm",
95
+ "target_loss": "hard_ce",
96
+ "linear_soft_target_power": 1.0,
97
+ "linear_soft_target_min_conf": 0.0,
98
+ "linear_soft_target_max_conf": 1.0,
99
+ "t_sampling_mode": "logit_normal",
100
+ "t_sampling_power": 1.0,
101
+ "t_sampling_eps": 0.0001,
102
+ "t_sampling_logit_mean": -1.5,
103
+ "t_sampling_logit_std": 0.8,
104
+ "dual_t": true,
105
+ "corrupt_t_mode": "same",
106
+ "corrupt_min_t": 0.0,
107
+ "corrupt_max_t": 1.0,
108
+ "prefix_block_prob": 0.0,
109
+ "prefix_block_len": 128,
110
+ "mask_ratio_floor_schedule": "none",
111
+ "dirichlet_endpoint_mode": "categorical_dual_t",
112
+ "dirichlet_semantic_t_mode": "same",
113
+ "dirichlet_semantic_t_value": 0.0,
114
+ "dirichlet_semantic_t_curve": "linear",
115
+ "dirichlet_semantic_t_power": 1.0,
116
+ "endpoint_sequence_random_prob_alpha": 0.0,
117
+ "categorical_wrong_from_full_vocab": true,
118
+ "categorical_wrong_from_batch_valid_tokens": false,
119
+ "categorical_wrong_basin_token_ids": "",
120
+ "categorical_wrong_basin_prob": 0.0,
121
+ "categorical_wrong_unigram_prob": 0.0,
122
+ "categorical_wrong_uniform_prob": 0.0,
123
+ "categorical_wrong_prob_floor": 0.0,
124
+ "categorical_wrong_corpus_unigram_path": "",
125
+ "categorical_wrong_corpus_unigram_alpha": 1.0,
126
+ "categorical_wrong_basin_shared_prob": 0.0,
127
+ "categorical_wrong_unigram_shared_prob": 0.0,
128
+ "mask_mixture_original_prob": 0.0,
129
+ "mask_mixture_lowk_prob": 0.0,
130
+ "mask_mixture_lowcorrupt_prob": 0.0,
131
+ "mask_mixture_block_prob": 0.0,
132
+ "mask_mixture_all_prob": 1.0,
133
+ "mask_mixture_lowk_clean_tokens": "0",
134
+ "mask_mixture_lowcorrupt_tokens": "1,2,4,8,16,32,64",
135
+ "mask_mixture_block_tokens": "64,128",
136
+ "simplex_bridge_sampler": "dirichlet",
137
+ "logistic_normal_sigma_min": 0.1,
138
+ "logistic_normal_sigma_max": 1.0,
139
+ "logistic_normal_tau_min": 1.0,
140
+ "logistic_normal_tau_max": 1.0,
141
+ "torch_compile": false,
142
+ "compile_mode": "max-autotune",
143
+ "state_format": "prob",
144
+ "meanflow_weight": 0.0,
145
+ "rollout_train_prob": 0.5,
146
+ "rollout_train_steps": 8,
147
+ "rollout_train_infer_steps": 64,
148
+ "rollout_train_temp": 1.45,
149
+ "rollout_train_max_gamma": 1.0,
150
+ "rollout_train_corrupt_only": true,
151
+ "rollout_train_samplewise": true,
152
+ "rollout_train_compute_always": false,
153
+ "rollout_train_sync_t": true,
154
+ "bridge_noise_init": "logistic_normal",
155
+ "noise_sigma": -1.0,
156
+ "allow_tf32": true,
157
+ "activation_checkpointing": false,
158
+ "activation_checkpoint_interval": 1,
159
+ "activation_checkpoint_scope": "block",
160
+ "ddp_static_graph": false,
161
+ "ddp_gradient_as_bucket_view": true,
162
+ "blocking_data_transfer": false,
163
+ "dataloader_prefetch_factor": 4,
164
+ "full_train_stats": false,
165
+ "tokenized_hf": false,
166
+ "tokenized_pad_token": "pad",
167
+ "elf_conditional_hf": false,
168
+ "record_pad_truncate": false,
169
+ "record_add_eos": false,
170
+ "record_add_special_tokens": false,
171
+ "record_pad_token": "pad",
172
+ "record_shuffle_buffer": 10000,
173
+ "wrap": true,
174
+ "wrap_mode": "stream",
175
+ "wrap_record_buffer_size": 200,
176
+ "owt_cached_chunks": true,
177
+ "owt_chunk_cache_dir": "/e2e-data/evad-tech-vla/wanghan58/data/small_benchmarks/langflow_2604_11748/openwebtext_lta_cached_chunks/gpt2_len256_train8_compact_overfit",
178
+ "owt_chunk_cache_rebuild": false,
179
+ "owt_chunk_cache_write_batch": 4096,
180
+ "owt_exact_repeat_per_chunk": 64,
181
+ "online_chunk_shuffle": false,
182
+ "online_chunk_shuffle_buffer": 10000,
183
+ "openwebtext_split": "train_minus_100k",
184
+ "detokenizer": "auto",
185
+ "resolved_detokenizer": null,
186
+ "num_workers": 0,
187
+ "latest_every": 500,
188
+ "resume_path": ""
189
+ }
190
+ step=100 epoch=100/500 epoch_step=1/1 micro_steps=100 elapsed=11.2s lr=2.000000e-03 loss=6.7066 loss_recon=6.7066 loss_meanflow=0.0000 mean_model_t=0.2083 mean_corrupt_t=0.2083 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.5128 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.0995 corrupt_frac=1.0000 acc_corrupt=0.0995 loss_corrupt=6.7066 wrong_frac=0.7915 init_acc_corrupt=0.1159 acc_corrupt_t_0p0_0p2=0.0486 corrupt_frac_t_0p0_0p2=0.5559 acc_corrupt_t_0p2_0p4=0.1326 corrupt_frac_t_0p2_0p4=0.3589 acc_corrupt_t_0p4_0p6=0.2811 corrupt_frac_t_0p4_0p6=0.0773 acc_corrupt_t_0p6_0p8=0.4045 corrupt_frac_t_0p6_0p8=0.0121 out_w_norm=1.0999 out_g_norm=1.0064 loss_all=6.4488 init_gold_top10=0.2091 init_gold_top100=0.4887 rollout_applied_pos_frac=0.4844 init_acc_rollout_applied=0.1132 init_acc_rollout_kept=0.1206 logit_acc_rollout_applied=0.1071 logit_acc_rollout_kept=0.0991
191
+ W0517 17:58:23.621000 251950 torch/distributed/elastic/agent/server/api.py:719] Received 15 death signal, shutting down workers
192
+ W0517 17:58:23.623000 251950 torch/distributed/elastic/multiprocessing/api.py:898] Sending process 251955 closing signal SIGTERM
193
+ W0517 17:58:23.624000 251950 torch/distributed/elastic/multiprocessing/api.py:898] Sending process 251956 closing signal SIGTERM
194
+ W0517 17:58:23.624000 251950 torch/distributed/elastic/multiprocessing/api.py:898] Sending process 251957 closing signal SIGTERM
195
+ W0517 17:58:23.625000 251950 torch/distributed/elastic/multiprocessing/api.py:898] Sending process 251958 closing signal SIGTERM
196
+ Traceback (most recent call last):
197
+ File "<frozen runpy>", line 198, in _run_module_as_main
198
+ File "<frozen runpy>", line 88, in _run_code
199
+ File "/usr/local/lib/python3.12/dist-packages/torch/distributed/run.py", line 922, in <module>
200
+ main()
201
+ File "/usr/local/lib/python3.12/dist-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 355, in wrapper
202
+ return f(*args, **kwargs)
203
+ ^^^^^^^^^^^^^^^^^^
204
+ File "/usr/local/lib/python3.12/dist-packages/torch/distributed/run.py", line 918, in main
205
+ run(args)
206
+ File "/usr/local/lib/python3.12/dist-packages/torch/distributed/run.py", line 909, in run
207
+ elastic_launch(
208
+ File "/usr/local/lib/python3.12/dist-packages/torch/distributed/launcher/api.py", line 139, in __call__
209
+ return launch_agent(self._config, self._entrypoint, list(args))
210
+ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
211
+ File "/usr/local/lib/python3.12/dist-packages/torch/distributed/launcher/api.py", line 261, in launch_agent
212
+ result = agent.run()
213
+ ^^^^^^^^^^^
214
+ File "/usr/local/lib/python3.12/dist-packages/torch/distributed/elastic/metrics/api.py", line 137, in wrapper
215
+ result = f(*args, **kwargs)
216
+ ^^^^^^^^^^^^^^^^^^
217
+ File "/usr/local/lib/python3.12/dist-packages/torch/distributed/elastic/agent/server/api.py", line 711, in run
218
+ result = self._invoke_run(role)
219
+ ^^^^^^^^^^^^^^^^^^^^^^
220
+ File "/usr/local/lib/python3.12/dist-packages/torch/distributed/elastic/agent/server/api.py", line 870, in _invoke_run
221
+ time.sleep(monitor_interval)
222
+ File "/usr/local/lib/python3.12/dist-packages/torch/distributed/elastic/multiprocessing/api.py", line 84, in _terminate_process_handler
223
+ raise SignalException(f"Process {os.getpid()} got signal: {sigval}", sigval=sigval)
224
+ torch.distributed.elastic.multiprocessing.api.SignalException: Process 251950 got signal: 15
LTA_openwebtext_dualt/logs/softendpoint_mn_pilot_4gpu/train8_wrongfloor_len256_wrongfloor0p3_20260517_1815wrongfloor.log ADDED
@@ -0,0 +1,199 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ NCCL version 2.25.1+cuda12.8
2
+ {
3
+ "device": "cuda:0",
4
+ "rank": 0,
5
+ "world_size": 4,
6
+ "samples": "owt_cached_chunks:8",
7
+ "vocab_size": 969,
8
+ "tokenizer_vocab_size": 50257,
9
+ "save_dir": "runs/train8_wrongfloor_len256_wrongfloor0p3_20260517_1815wrongfloor",
10
+ "batch_size": 128,
11
+ "grad_accum": 1,
12
+ "effective_batch_size": 512,
13
+ "global_batch_size": 512,
14
+ "lr_schedule": "constant_warmup",
15
+ "optimizer": "muon",
16
+ "epochs": 0.0,
17
+ "steps_per_epoch": 1,
18
+ "total_steps": 1000,
19
+ "warmup_steps": 10,
20
+ "warmup_epochs": -1.0,
21
+ "min_lr": 0.0,
22
+ "weight_decay": 0.1,
23
+ "output_weight_decay": -1.0,
24
+ "adamw_param_groups": "nanogpt",
25
+ "adam_beta1": 0.9,
26
+ "adam_beta2": 0.95,
27
+ "adam_eps": 1e-08,
28
+ "muon_impl": "legacy",
29
+ "muon_momentum": 0.95,
30
+ "muon_ns_steps": 5,
31
+ "muon_update_scale": 1.0,
32
+ "muon_nesterov": false,
33
+ "muon_width_scale": false,
34
+ "muon_grouping": "legacy_dim_ge_2",
35
+ "muon_param_count": 1965440,
36
+ "muon_adam_param_count": 8192,
37
+ "muon_param_names": [
38
+ "vocab_embed.embedding",
39
+ "sigma_map.net.0.weight",
40
+ "sigma_map.net.2.weight",
41
+ "blocks.0.attn_qkv.weight",
42
+ "blocks.0.attn_out.weight",
43
+ "blocks.0.mlp.0.weight",
44
+ "blocks.0.mlp.2.weight",
45
+ "blocks.0.adaLN_modulation.weight",
46
+ "blocks.1.attn_qkv.weight",
47
+ "blocks.1.attn_out.weight",
48
+ "blocks.1.mlp.0.weight",
49
+ "blocks.1.mlp.2.weight",
50
+ "blocks.1.adaLN_modulation.weight",
51
+ "blocks.2.attn_qkv.weight",
52
+ "blocks.2.attn_out.weight",
53
+ "blocks.2.mlp.0.weight",
54
+ "blocks.2.mlp.2.weight",
55
+ "blocks.2.adaLN_modulation.weight",
56
+ "output_layer.linear.weight",
57
+ "output_layer.adaLN_modulation.weight"
58
+ ],
59
+ "muon_adam_param_names": [
60
+ "sigma_map.net.0.bias",
61
+ "sigma_map.net.2.bias",
62
+ "blocks.0.norm1.weight",
63
+ "blocks.0.norm2.weight",
64
+ "blocks.0.mlp.0.bias",
65
+ "blocks.0.mlp.2.bias",
66
+ "blocks.0.adaLN_modulation.bias",
67
+ "blocks.1.norm1.weight",
68
+ "blocks.1.norm2.weight",
69
+ "blocks.1.mlp.0.bias",
70
+ "blocks.1.mlp.2.bias",
71
+ "blocks.1.adaLN_modulation.bias",
72
+ "blocks.2.norm1.weight",
73
+ "blocks.2.norm2.weight",
74
+ "blocks.2.mlp.0.bias",
75
+ "blocks.2.mlp.2.bias",
76
+ "blocks.2.adaLN_modulation.bias",
77
+ "output_layer.norm_final.weight",
78
+ "output_layer.adaLN_modulation.bias"
79
+ ],
80
+ "muon_effective_nesterov": false,
81
+ "muon_effective_width_scale": false,
82
+ "muon_effective_weight_decay": 0.1,
83
+ "muon_adam_fallback_nesterov": false,
84
+ "muon_adam_fallback_weight_decay": 0.1,
85
+ "ema_decay": 0.9999,
86
+ "ema_start_step": 0,
87
+ "model_type": "ddit",
88
+ "ddit_mlp_type": "gelu",
89
+ "elf_num_time_tokens": 4,
90
+ "elf_num_model_mode_tokens": 0,
91
+ "qk_norm": true,
92
+ "output_bias": false,
93
+ "output_init_std": -1.0,
94
+ "norm_type": "rmsnorm",
95
+ "target_loss": "hard_ce",
96
+ "linear_soft_target_power": 1.0,
97
+ "linear_soft_target_min_conf": 0.0,
98
+ "linear_soft_target_max_conf": 1.0,
99
+ "t_sampling_mode": "logit_normal",
100
+ "t_sampling_power": 1.0,
101
+ "t_sampling_eps": 0.0001,
102
+ "t_sampling_logit_mean": -1.5,
103
+ "t_sampling_logit_std": 0.8,
104
+ "dual_t": true,
105
+ "corrupt_t_mode": "same",
106
+ "corrupt_min_t": 0.0,
107
+ "corrupt_max_t": 1.0,
108
+ "prefix_block_prob": 0.0,
109
+ "prefix_block_len": 128,
110
+ "mask_ratio_floor_schedule": "none",
111
+ "dirichlet_endpoint_mode": "categorical_dual_t",
112
+ "dirichlet_semantic_t_mode": "same",
113
+ "dirichlet_semantic_t_value": 0.0,
114
+ "dirichlet_semantic_t_curve": "linear",
115
+ "dirichlet_semantic_t_power": 1.0,
116
+ "endpoint_sequence_random_prob_alpha": 0.0,
117
+ "categorical_wrong_from_full_vocab": true,
118
+ "categorical_wrong_from_batch_valid_tokens": false,
119
+ "categorical_wrong_basin_token_ids": "",
120
+ "categorical_wrong_basin_prob": 0.0,
121
+ "categorical_wrong_unigram_prob": 0.0,
122
+ "categorical_wrong_uniform_prob": 0.0,
123
+ "categorical_wrong_prob_floor": 0.3,
124
+ "categorical_wrong_corpus_unigram_path": "",
125
+ "categorical_wrong_corpus_unigram_alpha": 1.0,
126
+ "categorical_wrong_basin_shared_prob": 0.0,
127
+ "categorical_wrong_unigram_shared_prob": 0.0,
128
+ "mask_mixture_original_prob": 0.0,
129
+ "mask_mixture_lowk_prob": 0.0,
130
+ "mask_mixture_lowcorrupt_prob": 0.0,
131
+ "mask_mixture_block_prob": 0.0,
132
+ "mask_mixture_all_prob": 1.0,
133
+ "mask_mixture_lowk_clean_tokens": "0",
134
+ "mask_mixture_lowcorrupt_tokens": "1,2,4,8,16,32,64",
135
+ "mask_mixture_block_tokens": "64,128",
136
+ "simplex_bridge_sampler": "dirichlet",
137
+ "logistic_normal_sigma_min": 0.1,
138
+ "logistic_normal_sigma_max": 1.0,
139
+ "logistic_normal_tau_min": 1.0,
140
+ "logistic_normal_tau_max": 1.0,
141
+ "torch_compile": false,
142
+ "compile_mode": "max-autotune",
143
+ "state_format": "prob",
144
+ "meanflow_weight": 0.0,
145
+ "rollout_train_prob": 0.0,
146
+ "rollout_train_steps": 1,
147
+ "rollout_train_infer_steps": 64,
148
+ "rollout_train_temp": 1.45,
149
+ "rollout_train_max_gamma": 1.0,
150
+ "rollout_train_corrupt_only": true,
151
+ "rollout_train_samplewise": false,
152
+ "rollout_train_compute_always": false,
153
+ "rollout_train_sync_t": false,
154
+ "bridge_noise_init": "logistic_normal",
155
+ "noise_sigma": -1.0,
156
+ "allow_tf32": true,
157
+ "activation_checkpointing": false,
158
+ "activation_checkpoint_interval": 1,
159
+ "activation_checkpoint_scope": "block",
160
+ "ddp_static_graph": false,
161
+ "ddp_gradient_as_bucket_view": true,
162
+ "blocking_data_transfer": false,
163
+ "dataloader_prefetch_factor": 4,
164
+ "full_train_stats": false,
165
+ "tokenized_hf": false,
166
+ "tokenized_pad_token": "pad",
167
+ "elf_conditional_hf": false,
168
+ "record_pad_truncate": false,
169
+ "record_add_eos": false,
170
+ "record_add_special_tokens": false,
171
+ "record_pad_token": "pad",
172
+ "record_shuffle_buffer": 10000,
173
+ "wrap": true,
174
+ "wrap_mode": "stream",
175
+ "wrap_record_buffer_size": 200,
176
+ "owt_cached_chunks": true,
177
+ "owt_chunk_cache_dir": "/e2e-data/evad-tech-vla/wanghan58/data/small_benchmarks/langflow_2604_11748/openwebtext_lta_cached_chunks/gpt2_len256_train8_compact_overfit",
178
+ "owt_chunk_cache_rebuild": false,
179
+ "owt_chunk_cache_write_batch": 4096,
180
+ "owt_exact_repeat_per_chunk": 64,
181
+ "online_chunk_shuffle": false,
182
+ "online_chunk_shuffle_buffer": 10000,
183
+ "openwebtext_split": "train_minus_100k",
184
+ "detokenizer": "auto",
185
+ "resolved_detokenizer": null,
186
+ "num_workers": 0,
187
+ "latest_every": 1000,
188
+ "resume_path": ""
189
+ }
190
+ step=100 epoch=100/1000 epoch_step=1/1 micro_steps=100 elapsed=4.4s lr=2.000000e-03 loss=6.7048 loss_recon=6.7048 loss_meanflow=0.0000 mean_model_t=0.2082 mean_corrupt_t=0.2082 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.0998 corrupt_frac=1.0000 acc_corrupt=0.0998 loss_corrupt=6.7048 wrong_frac=0.7918 init_acc_corrupt=0.1152 acc_corrupt_t_0p0_0p2=0.0485 corrupt_frac_t_0p0_0p2=0.5588 acc_corrupt_t_0p2_0p4=0.1343 corrupt_frac_t_0p2_0p4=0.3579 acc_corrupt_t_0p4_0p6=0.2822 corrupt_frac_t_0p4_0p6=0.0752 acc_corrupt_t_0p6_0p8=0.4246 corrupt_frac_t_0p6_0p8=0.0125 out_w_norm=1.0992 out_g_norm=1.0008 acc_corrupt_t_0p8_1p0=0.4414 corrupt_frac_t_0p8_1p0=0.0078 loss_all=6.4736 init_gold_top10=0.1887 init_gold_top100=0.4165
191
+ step=200 epoch=200/1000 epoch_step=1/1 micro_steps=200 elapsed=3.7s lr=2.000000e-03 loss=6.1011 loss_recon=6.1011 loss_meanflow=0.0000 mean_model_t=0.2081 mean_corrupt_t=0.2081 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.1091 corrupt_frac=1.0000 acc_corrupt=0.1091 loss_corrupt=6.1011 wrong_frac=0.7926 init_acc_corrupt=0.1151 acc_corrupt_t_0p0_0p2=0.0546 corrupt_frac_t_0p0_0p2=0.5577 acc_corrupt_t_0p2_0p4=0.1490 corrupt_frac_t_0p2_0p4=0.3608 acc_corrupt_t_0p4_0p6=0.2945 corrupt_frac_t_0p4_0p6=0.0734 acc_corrupt_t_0p6_0p8=0.4062 corrupt_frac_t_0p6_0p8=0.0119 out_w_norm=3.3219 out_g_norm=1.4082 acc_corrupt_t_0p8_1p0=0.4609 corrupt_frac_t_0p8_1p0=0.0078 loss_all=5.8429 init_gold_top10=0.1850 init_gold_top100=0.4143
192
+ step=300 epoch=300/1000 epoch_step=1/1 micro_steps=300 elapsed=3.7s lr=2.000000e-03 loss=5.5386 loss_recon=5.5386 loss_meanflow=0.0000 mean_model_t=0.2117 mean_corrupt_t=0.2117 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.1243 corrupt_frac=1.0000 acc_corrupt=0.1243 loss_corrupt=5.5386 wrong_frac=0.7885 init_acc_corrupt=0.1197 acc_corrupt_t_0p0_0p2=0.0591 corrupt_frac_t_0p0_0p2=0.5484 acc_corrupt_t_0p2_0p4=0.1681 corrupt_frac_t_0p2_0p4=0.3611 acc_corrupt_t_0p4_0p6=0.3285 corrupt_frac_t_0p4_0p6=0.0803 acc_corrupt_t_0p6_0p8=0.4765 corrupt_frac_t_0p6_0p8=0.0139 out_w_norm=5.2102 out_g_norm=0.7079 acc_corrupt_t_0p8_1p0=0.4980 corrupt_frac_t_0p8_1p0=0.0078 loss_all=5.2636 init_gold_top10=0.2059 init_gold_top100=0.4209
193
+ step=400 epoch=400/1000 epoch_step=1/1 micro_steps=400 elapsed=3.7s lr=2.000000e-03 loss=5.0143 loss_recon=5.0143 loss_meanflow=0.0000 mean_model_t=0.2073 mean_corrupt_t=0.2073 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.1462 corrupt_frac=1.0000 acc_corrupt=0.1462 loss_corrupt=5.0143 wrong_frac=0.7928 init_acc_corrupt=0.1147 acc_corrupt_t_0p0_0p2=0.0638 corrupt_frac_t_0p0_0p2=0.5613 acc_corrupt_t_0p2_0p4=0.2018 corrupt_frac_t_0p2_0p4=0.3571 acc_corrupt_t_0p4_0p6=0.4463 corrupt_frac_t_0p4_0p6=0.0727 acc_corrupt_t_0p6_0p8=0.6573 corrupt_frac_t_0p6_0p8=0.0126 out_w_norm=6.8955 out_g_norm=0.4054 loss_all=4.6717 init_gold_top10=0.2018 init_gold_top100=0.4261
194
+ step=500 epoch=500/1000 epoch_step=1/1 micro_steps=500 elapsed=3.7s lr=2.000000e-03 loss=4.2633 loss_recon=4.2633 loss_meanflow=0.0000 mean_model_t=0.2097 mean_corrupt_t=0.2097 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.1848 corrupt_frac=1.0000 acc_corrupt=0.1848 loss_corrupt=4.2633 wrong_frac=0.7905 init_acc_corrupt=0.1175 acc_corrupt_t_0p0_0p2=0.0726 corrupt_frac_t_0p0_0p2=0.5540 acc_corrupt_t_0p2_0p4=0.2715 corrupt_frac_t_0p2_0p4=0.3605 acc_corrupt_t_0p4_0p6=0.5261 corrupt_frac_t_0p4_0p6=0.0745 out_w_norm=8.4270 out_g_norm=0.4608 acc_corrupt_t_0p6_0p8=0.6848 corrupt_frac_t_0p6_0p8=0.0139 acc_corrupt_t_0p8_1p0=0.7448 corrupt_frac_t_0p8_1p0=0.0078 loss_all=3.8613 init_gold_top10=0.2070 init_gold_top100=0.4169
195
+ step=600 epoch=600/1000 epoch_step=1/1 micro_steps=600 elapsed=3.7s lr=2.000000e-03 loss=3.4251 loss_recon=3.4251 loss_meanflow=0.0000 mean_model_t=0.2089 mean_corrupt_t=0.2089 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.2389 corrupt_frac=1.0000 acc_corrupt=0.2389 loss_corrupt=3.4251 wrong_frac=0.7911 init_acc_corrupt=0.1167 acc_corrupt_t_0p0_0p2=0.0945 corrupt_frac_t_0p0_0p2=0.5598 acc_corrupt_t_0p2_0p4=0.3712 corrupt_frac_t_0p2_0p4=0.3568 acc_corrupt_t_0p4_0p6=0.6290 corrupt_frac_t_0p4_0p6=0.0750 acc_corrupt_t_0p6_0p8=0.7620 corrupt_frac_t_0p6_0p8=0.0120 out_w_norm=9.5765 out_g_norm=0.5181 acc_corrupt_t_0p8_1p0=0.7689 corrupt_frac_t_0p8_1p0=0.0094 loss_all=3.1182 init_gold_top10=0.1900 init_gold_top100=0.4196
196
+ step=700 epoch=700/1000 epoch_step=1/1 micro_steps=700 elapsed=3.7s lr=2.000000e-03 loss=2.6250 loss_recon=2.6250 loss_meanflow=0.0000 mean_model_t=0.2095 mean_corrupt_t=0.2095 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.3370 corrupt_frac=1.0000 acc_corrupt=0.3370 loss_corrupt=2.6250 wrong_frac=0.7907 init_acc_corrupt=0.1176 acc_corrupt_t_0p0_0p2=0.1399 corrupt_frac_t_0p0_0p2=0.5552 acc_corrupt_t_0p2_0p4=0.5298 corrupt_frac_t_0p2_0p4=0.3584 acc_corrupt_t_0p4_0p6=0.7932 corrupt_frac_t_0p4_0p6=0.0764 out_w_norm=10.2365 out_g_norm=0.6321 acc_corrupt_t_0p6_0p8=0.8852 corrupt_frac_t_0p6_0p8=0.0132 acc_corrupt_t_0p8_1p0=0.8451 corrupt_frac_t_0p8_1p0=0.0078 loss_all=2.2547 init_gold_top10=0.1937 init_gold_top100=0.4183
197
+ step=800 epoch=800/1000 epoch_step=1/1 micro_steps=800 elapsed=3.7s lr=2.000000e-03 loss=1.8240 loss_recon=1.8240 loss_meanflow=0.0000 mean_model_t=0.2103 mean_corrupt_t=0.2103 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.4941 corrupt_frac=1.0000 acc_corrupt=0.4941 loss_corrupt=1.8240 wrong_frac=0.7898 init_acc_corrupt=0.1186 acc_corrupt_t_0p0_0p2=0.2539 corrupt_frac_t_0p0_0p2=0.5525 acc_corrupt_t_0p2_0p4=0.7542 corrupt_frac_t_0p2_0p4=0.3592 acc_corrupt_t_0p4_0p6=0.9359 corrupt_frac_t_0p4_0p6=0.0800 acc_corrupt_t_0p6_0p8=0.9643 corrupt_frac_t_0p6_0p8=0.0141 out_w_norm=10.6284 out_g_norm=0.8746 acc_corrupt_t_0p8_1p0=0.9531 corrupt_frac_t_0p8_1p0=0.0078 loss_all=1.4551 init_gold_top10=0.2038 init_gold_top100=0.4186
198
+ step=900 epoch=900/1000 epoch_step=1/1 micro_steps=900 elapsed=3.7s lr=2.000000e-03 loss=1.1915 loss_recon=1.1915 loss_meanflow=0.0000 mean_model_t=0.2094 mean_corrupt_t=0.2094 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.6347 corrupt_frac=1.0000 acc_corrupt=0.6347 loss_corrupt=1.1915 wrong_frac=0.7907 init_acc_corrupt=0.1180 acc_corrupt_t_0p0_0p2=0.3964 corrupt_frac_t_0p0_0p2=0.5584 acc_corrupt_t_0p2_0p4=0.9225 corrupt_frac_t_0p2_0p4=0.3530 acc_corrupt_t_0p4_0p6=0.9890 corrupt_frac_t_0p4_0p6=0.0801 acc_corrupt_t_0p6_0p8=0.9912 corrupt_frac_t_0p6_0p8=0.0125 out_w_norm=10.9390 out_g_norm=0.9790 acc_corrupt_t_0p8_1p0=0.9727 corrupt_frac_t_0p8_1p0=0.0078 loss_all=1.0933 init_gold_top10=0.2031 init_gold_top100=0.4228
199
+ step=1000 epoch=1000/1000 epoch_step=1/1 micro_steps=1000 elapsed=3.7s lr=2.000000e-03 loss=0.8912 loss_recon=0.8912 loss_meanflow=0.0000 mean_model_t=0.2094 mean_corrupt_t=0.2094 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.7007 corrupt_frac=1.0000 acc_corrupt=0.7007 loss_corrupt=0.8912 wrong_frac=0.7908 init_acc_corrupt=0.1167 acc_corrupt_t_0p0_0p2=0.4763 corrupt_frac_t_0p0_0p2=0.5561 acc_corrupt_t_0p2_0p4=0.9778 corrupt_frac_t_0p2_0p4=0.3595 acc_corrupt_t_0p4_0p6=0.9982 corrupt_frac_t_0p4_0p6=0.0759 acc_corrupt_t_0p6_0p8=0.9958 corrupt_frac_t_0p6_0p8=0.0129 out_w_norm=11.1022 out_g_norm=0.9477 acc_corrupt_t_0p8_1p0=0.9805 corrupt_frac_t_0p8_1p0=0.0078 loss_all=0.8570 init_gold_top10=0.2021 init_gold_top100=0.4184
LTA_openwebtext_dualt/mini_owt_logdirichlet/.venv_qwen35_uv/lib/python3.12/site-packages/hf_xet-1.5.0.dist-info/RECORD ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ hf_xet-1.5.0.dist-info/INSTALLER,sha256=5hhM4Q4mYTT9z6QB6PGpUAW81PGNFrYrdXMj4oM_6ak,2
2
+ hf_xet-1.5.0.dist-info/METADATA,sha256=kiRjS5pSbyNKVHLD5pYclp87MWjinAwQrmuPjKJX8yA,4882
3
+ hf_xet-1.5.0.dist-info/RECORD,,
4
+ hf_xet-1.5.0.dist-info/REQUESTED,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
5
+ hf_xet-1.5.0.dist-info/WHEEL,sha256=LbLhSGTXlKRmTCKROpRhcqPcmllwkP3mQdJf3GnRbUM,143
6
+ hf_xet-1.5.0.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
7
+ hf_xet-1.5.0.dist-info/sboms/hf_xet.cyclonedx.json,sha256=cqFELPUEOVkJUBakT3vgAWYE1N939XMjcP_vAKt_0xw,305769
8
+ hf_xet/__init__.py,sha256=E8UDdyQ8glZ_nve9hHEf22bPang8-RKx4VuApXYeQUo,107
9
+ hf_xet/hf_xet.abi3.so,sha256=6s8jp7y5mQ7kVbKGwgTcVFDzZ8rBdC63i7CPjbm_wEk,11465992
LTA_openwebtext_dualt/mini_owt_logdirichlet/.venv_qwen35_uv/lib/python3.12/site-packages/hf_xet-1.5.0.dist-info/WHEEL ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ Wheel-Version: 1.0
2
+ Generator: maturin (1.13.1)
3
+ Root-Is-Purelib: false
4
+ Tag: cp37-abi3-manylinux_2_17_x86_64
5
+ Tag: cp37-abi3-manylinux2014_x86_64
LTA_openwebtext_dualt/mini_owt_logdirichlet/.venv_qwen35_uv/lib/python3.12/site-packages/httpcore/_backends/__init__.py ADDED
File without changes
LTA_openwebtext_dualt/mini_owt_logdirichlet/.venv_qwen35_uv/lib/python3.12/site-packages/httpcore/_backends/anyio.py ADDED
@@ -0,0 +1,146 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import ssl
4
+ import typing
5
+
6
+ import anyio
7
+
8
+ from .._exceptions import (
9
+ ConnectError,
10
+ ConnectTimeout,
11
+ ReadError,
12
+ ReadTimeout,
13
+ WriteError,
14
+ WriteTimeout,
15
+ map_exceptions,
16
+ )
17
+ from .._utils import is_socket_readable
18
+ from .base import SOCKET_OPTION, AsyncNetworkBackend, AsyncNetworkStream
19
+
20
+
21
+ class AnyIOStream(AsyncNetworkStream):
22
+ def __init__(self, stream: anyio.abc.ByteStream) -> None:
23
+ self._stream = stream
24
+
25
+ async def read(self, max_bytes: int, timeout: float | None = None) -> bytes:
26
+ exc_map = {
27
+ TimeoutError: ReadTimeout,
28
+ anyio.BrokenResourceError: ReadError,
29
+ anyio.ClosedResourceError: ReadError,
30
+ anyio.EndOfStream: ReadError,
31
+ }
32
+ with map_exceptions(exc_map):
33
+ with anyio.fail_after(timeout):
34
+ try:
35
+ return await self._stream.receive(max_bytes=max_bytes)
36
+ except anyio.EndOfStream: # pragma: nocover
37
+ return b""
38
+
39
+ async def write(self, buffer: bytes, timeout: float | None = None) -> None:
40
+ if not buffer:
41
+ return
42
+
43
+ exc_map = {
44
+ TimeoutError: WriteTimeout,
45
+ anyio.BrokenResourceError: WriteError,
46
+ anyio.ClosedResourceError: WriteError,
47
+ }
48
+ with map_exceptions(exc_map):
49
+ with anyio.fail_after(timeout):
50
+ await self._stream.send(item=buffer)
51
+
52
+ async def aclose(self) -> None:
53
+ await self._stream.aclose()
54
+
55
+ async def start_tls(
56
+ self,
57
+ ssl_context: ssl.SSLContext,
58
+ server_hostname: str | None = None,
59
+ timeout: float | None = None,
60
+ ) -> AsyncNetworkStream:
61
+ exc_map = {
62
+ TimeoutError: ConnectTimeout,
63
+ anyio.BrokenResourceError: ConnectError,
64
+ anyio.EndOfStream: ConnectError,
65
+ ssl.SSLError: ConnectError,
66
+ }
67
+ with map_exceptions(exc_map):
68
+ try:
69
+ with anyio.fail_after(timeout):
70
+ ssl_stream = await anyio.streams.tls.TLSStream.wrap(
71
+ self._stream,
72
+ ssl_context=ssl_context,
73
+ hostname=server_hostname,
74
+ standard_compatible=False,
75
+ server_side=False,
76
+ )
77
+ except Exception as exc: # pragma: nocover
78
+ await self.aclose()
79
+ raise exc
80
+ return AnyIOStream(ssl_stream)
81
+
82
+ def get_extra_info(self, info: str) -> typing.Any:
83
+ if info == "ssl_object":
84
+ return self._stream.extra(anyio.streams.tls.TLSAttribute.ssl_object, None)
85
+ if info == "client_addr":
86
+ return self._stream.extra(anyio.abc.SocketAttribute.local_address, None)
87
+ if info == "server_addr":
88
+ return self._stream.extra(anyio.abc.SocketAttribute.remote_address, None)
89
+ if info == "socket":
90
+ return self._stream.extra(anyio.abc.SocketAttribute.raw_socket, None)
91
+ if info == "is_readable":
92
+ sock = self._stream.extra(anyio.abc.SocketAttribute.raw_socket, None)
93
+ return is_socket_readable(sock)
94
+ return None
95
+
96
+
97
+ class AnyIOBackend(AsyncNetworkBackend):
98
+ async def connect_tcp(
99
+ self,
100
+ host: str,
101
+ port: int,
102
+ timeout: float | None = None,
103
+ local_address: str | None = None,
104
+ socket_options: typing.Iterable[SOCKET_OPTION] | None = None,
105
+ ) -> AsyncNetworkStream: # pragma: nocover
106
+ if socket_options is None:
107
+ socket_options = []
108
+ exc_map = {
109
+ TimeoutError: ConnectTimeout,
110
+ OSError: ConnectError,
111
+ anyio.BrokenResourceError: ConnectError,
112
+ }
113
+ with map_exceptions(exc_map):
114
+ with anyio.fail_after(timeout):
115
+ stream: anyio.abc.ByteStream = await anyio.connect_tcp(
116
+ remote_host=host,
117
+ remote_port=port,
118
+ local_host=local_address,
119
+ )
120
+ # By default TCP sockets opened in `asyncio` include TCP_NODELAY.
121
+ for option in socket_options:
122
+ stream._raw_socket.setsockopt(*option) # type: ignore[attr-defined] # pragma: no cover
123
+ return AnyIOStream(stream)
124
+
125
+ async def connect_unix_socket(
126
+ self,
127
+ path: str,
128
+ timeout: float | None = None,
129
+ socket_options: typing.Iterable[SOCKET_OPTION] | None = None,
130
+ ) -> AsyncNetworkStream: # pragma: nocover
131
+ if socket_options is None:
132
+ socket_options = []
133
+ exc_map = {
134
+ TimeoutError: ConnectTimeout,
135
+ OSError: ConnectError,
136
+ anyio.BrokenResourceError: ConnectError,
137
+ }
138
+ with map_exceptions(exc_map):
139
+ with anyio.fail_after(timeout):
140
+ stream: anyio.abc.ByteStream = await anyio.connect_unix(path)
141
+ for option in socket_options:
142
+ stream._raw_socket.setsockopt(*option) # type: ignore[attr-defined] # pragma: no cover
143
+ return AnyIOStream(stream)
144
+
145
+ async def sleep(self, seconds: float) -> None:
146
+ await anyio.sleep(seconds) # pragma: nocover
LTA_openwebtext_dualt/mini_owt_logdirichlet/.venv_qwen35_uv/lib/python3.12/site-packages/httpcore/_backends/auto.py ADDED
@@ -0,0 +1,52 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import typing
4
+
5
+ from .._synchronization import current_async_library
6
+ from .base import SOCKET_OPTION, AsyncNetworkBackend, AsyncNetworkStream
7
+
8
+
9
+ class AutoBackend(AsyncNetworkBackend):
10
+ async def _init_backend(self) -> None:
11
+ if not (hasattr(self, "_backend")):
12
+ backend = current_async_library()
13
+ if backend == "trio":
14
+ from .trio import TrioBackend
15
+
16
+ self._backend: AsyncNetworkBackend = TrioBackend()
17
+ else:
18
+ from .anyio import AnyIOBackend
19
+
20
+ self._backend = AnyIOBackend()
21
+
22
+ async def connect_tcp(
23
+ self,
24
+ host: str,
25
+ port: int,
26
+ timeout: float | None = None,
27
+ local_address: str | None = None,
28
+ socket_options: typing.Iterable[SOCKET_OPTION] | None = None,
29
+ ) -> AsyncNetworkStream:
30
+ await self._init_backend()
31
+ return await self._backend.connect_tcp(
32
+ host,
33
+ port,
34
+ timeout=timeout,
35
+ local_address=local_address,
36
+ socket_options=socket_options,
37
+ )
38
+
39
+ async def connect_unix_socket(
40
+ self,
41
+ path: str,
42
+ timeout: float | None = None,
43
+ socket_options: typing.Iterable[SOCKET_OPTION] | None = None,
44
+ ) -> AsyncNetworkStream: # pragma: nocover
45
+ await self._init_backend()
46
+ return await self._backend.connect_unix_socket(
47
+ path, timeout=timeout, socket_options=socket_options
48
+ )
49
+
50
+ async def sleep(self, seconds: float) -> None: # pragma: nocover
51
+ await self._init_backend()
52
+ return await self._backend.sleep(seconds)
LTA_openwebtext_dualt/mini_owt_logdirichlet/.venv_qwen35_uv/lib/python3.12/site-packages/httpcore/_backends/base.py ADDED
@@ -0,0 +1,101 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import ssl
4
+ import time
5
+ import typing
6
+
7
# Argument tuples that the backends unpack into ``setsockopt(*option)``.
# The three accepted shapes are:
#   (int, int, int)
#   (int, int, bytes | bytearray)
#   (int, int, None, int)
SOCKET_OPTION = typing.Union[
    typing.Tuple[int, int, int],
    typing.Tuple[int, int, typing.Union[bytes, bytearray]],
    typing.Tuple[int, int, None, int],
]
12
+
13
+
14
class NetworkStream:
    """Interface for a blocking, bidirectional byte stream.

    Every I/O method raises ``NotImplementedError`` here; concrete streams
    override them. ``get_extra_info`` defaults to returning ``None``.
    """

    def read(self, max_bytes: int, timeout: float | None = None) -> bytes:
        """Read up to ``max_bytes`` bytes from the stream."""
        raise NotImplementedError  # pragma: nocover

    def write(self, buffer: bytes, timeout: float | None = None) -> None:
        """Write ``buffer`` to the stream."""
        raise NotImplementedError  # pragma: nocover

    def close(self) -> None:
        """Close the stream."""
        raise NotImplementedError  # pragma: nocover

    def start_tls(
        self,
        ssl_context: ssl.SSLContext,
        server_hostname: str | None = None,
        timeout: float | None = None,
    ) -> NetworkStream:
        """Upgrade the stream to TLS and return the resulting stream."""
        raise NotImplementedError  # pragma: nocover

    def get_extra_info(self, info: str) -> typing.Any:
        """Look up implementation-specific details by name; ``None`` by default."""
        return None  # pragma: nocover
34
+
35
+
36
class NetworkBackend:
    """Interface for a blocking connection factory.

    The connect methods raise ``NotImplementedError`` here; ``sleep`` has a
    working default built on ``time.sleep``.
    """

    def connect_tcp(
        self,
        host: str,
        port: int,
        timeout: float | None = None,
        local_address: str | None = None,
        socket_options: typing.Iterable[SOCKET_OPTION] | None = None,
    ) -> NetworkStream:
        """Open a TCP connection to ``host``:``port``."""
        raise NotImplementedError  # pragma: nocover

    def connect_unix_socket(
        self,
        path: str,
        timeout: float | None = None,
        socket_options: typing.Iterable[SOCKET_OPTION] | None = None,
    ) -> NetworkStream:
        """Open a connection to the UNIX domain socket at ``path``."""
        raise NotImplementedError  # pragma: nocover

    def sleep(self, seconds: float) -> None:
        """Block the calling thread for ``seconds`` seconds."""
        time.sleep(seconds)  # pragma: nocover
57
+
58
+
59
class AsyncNetworkStream:
    """Interface for an asynchronous, bidirectional byte stream.

    Every coroutine raises ``NotImplementedError`` here; concrete streams
    override them. ``get_extra_info`` is synchronous and defaults to ``None``.
    """

    async def read(self, max_bytes: int, timeout: float | None = None) -> bytes:
        """Read up to ``max_bytes`` bytes from the stream."""
        raise NotImplementedError  # pragma: nocover

    async def write(self, buffer: bytes, timeout: float | None = None) -> None:
        """Write ``buffer`` to the stream."""
        raise NotImplementedError  # pragma: nocover

    async def aclose(self) -> None:
        """Close the stream."""
        raise NotImplementedError  # pragma: nocover

    async def start_tls(
        self,
        ssl_context: ssl.SSLContext,
        server_hostname: str | None = None,
        timeout: float | None = None,
    ) -> AsyncNetworkStream:
        """Upgrade the stream to TLS and return the resulting stream."""
        raise NotImplementedError  # pragma: nocover

    def get_extra_info(self, info: str) -> typing.Any:
        """Look up implementation-specific details by name; ``None`` by default."""
        return None  # pragma: nocover
79
+
80
+
81
class AsyncNetworkBackend:
    """Interface for an asynchronous connection factory.

    All coroutines, including ``sleep``, raise ``NotImplementedError`` here.
    """

    async def connect_tcp(
        self,
        host: str,
        port: int,
        timeout: float | None = None,
        local_address: str | None = None,
        socket_options: typing.Iterable[SOCKET_OPTION] | None = None,
    ) -> AsyncNetworkStream:
        """Open a TCP connection to ``host``:``port``."""
        raise NotImplementedError  # pragma: nocover

    async def connect_unix_socket(
        self,
        path: str,
        timeout: float | None = None,
        socket_options: typing.Iterable[SOCKET_OPTION] | None = None,
    ) -> AsyncNetworkStream:
        """Open a connection to the UNIX domain socket at ``path``."""
        raise NotImplementedError  # pragma: nocover

    async def sleep(self, seconds: float) -> None:
        """Suspend the calling task for ``seconds`` seconds."""
        raise NotImplementedError  # pragma: nocover
LTA_openwebtext_dualt/mini_owt_logdirichlet/.venv_qwen35_uv/lib/python3.12/site-packages/httpcore/_backends/mock.py ADDED
@@ -0,0 +1,143 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import ssl
4
+ import typing
5
+
6
+ from .._exceptions import ReadError
7
+ from .base import (
8
+ SOCKET_OPTION,
9
+ AsyncNetworkBackend,
10
+ AsyncNetworkStream,
11
+ NetworkBackend,
12
+ NetworkStream,
13
+ )
14
+
15
+
16
class MockSSLObject:
    """Minimal stand-in for an SSL object that only reports an ALPN protocol."""

    def __init__(self, http2: bool):
        self._http2 = http2

    def selected_alpn_protocol(self) -> str:
        """Return ``"h2"`` when constructed with ``http2=True``, else ``"http/1.1"``."""
        if self._http2:
            return "h2"
        return "http/1.1"
22
+
23
+
24
class MockStream(NetworkStream):
    """In-memory stream that replays a queue of pre-baked byte chunks."""

    def __init__(self, buffer: list[bytes], http2: bool = False) -> None:
        self._closed = False
        self._http2 = http2
        self._buffer = buffer

    def read(self, max_bytes: int, timeout: float | None = None) -> bytes:
        """Pop and return the next queued chunk, or ``b""`` when none remain.

        ``max_bytes`` and ``timeout`` are accepted but not used.
        Raises ``ReadError`` once the stream has been closed.
        """
        if self._closed:
            raise ReadError("Connection closed")
        return self._buffer.pop(0) if self._buffer else b""

    def write(self, buffer: bytes, timeout: float | None = None) -> None:
        """Discard ``buffer``; writes are not recorded."""
        return None

    def close(self) -> None:
        """Mark the stream closed so that later reads fail."""
        self._closed = True

    def start_tls(
        self,
        ssl_context: ssl.SSLContext,
        server_hostname: str | None = None,
        timeout: float | None = None,
    ) -> NetworkStream:
        """Pretend to upgrade to TLS; the same stream object is returned."""
        return self

    def get_extra_info(self, info: str) -> typing.Any:
        """Return a ``MockSSLObject`` for ``"ssl_object"``; ``None`` otherwise."""
        if info == "ssl_object":
            return MockSSLObject(http2=self._http2)
        return None

    def __repr__(self) -> str:
        return "<httpcore.MockStream>"
56
+
57
+
58
class MockBackend(NetworkBackend):
    """Backend whose connections are ``MockStream`` objects fed from a fixed buffer."""

    def __init__(self, buffer: list[bytes], http2: bool = False) -> None:
        self._http2 = http2
        self._buffer = buffer

    def _make_stream(self) -> NetworkStream:
        # Each connection receives its own copy of the chunk list, so reads on
        # one stream never consume the chunks seen by another.
        return MockStream(list(self._buffer), http2=self._http2)

    def connect_tcp(
        self,
        host: str,
        port: int,
        timeout: float | None = None,
        local_address: str | None = None,
        socket_options: typing.Iterable[SOCKET_OPTION] | None = None,
    ) -> NetworkStream:
        """Return a new mock stream; all arguments are ignored."""
        return self._make_stream()

    def connect_unix_socket(
        self,
        path: str,
        timeout: float | None = None,
        socket_options: typing.Iterable[SOCKET_OPTION] | None = None,
    ) -> NetworkStream:
        """Return a new mock stream; all arguments are ignored."""
        return self._make_stream()

    def sleep(self, seconds: float) -> None:
        """Return immediately without sleeping."""
        return None
83
+
84
+
85
class AsyncMockStream(AsyncNetworkStream):
    """Async in-memory stream that replays a queue of pre-baked byte chunks."""

    def __init__(self, buffer: list[bytes], http2: bool = False) -> None:
        self._closed = False
        self._http2 = http2
        self._buffer = buffer

    async def read(self, max_bytes: int, timeout: float | None = None) -> bytes:
        """Pop and return the next queued chunk, or ``b""`` when none remain.

        ``max_bytes`` and ``timeout`` are accepted but not used.
        Raises ``ReadError`` once the stream has been closed.
        """
        if self._closed:
            raise ReadError("Connection closed")
        return self._buffer.pop(0) if self._buffer else b""

    async def write(self, buffer: bytes, timeout: float | None = None) -> None:
        """Discard ``buffer``; writes are not recorded."""
        return None

    async def aclose(self) -> None:
        """Mark the stream closed so that later reads fail."""
        self._closed = True

    async def start_tls(
        self,
        ssl_context: ssl.SSLContext,
        server_hostname: str | None = None,
        timeout: float | None = None,
    ) -> AsyncNetworkStream:
        """Pretend to upgrade to TLS; the same stream object is returned."""
        return self

    def get_extra_info(self, info: str) -> typing.Any:
        """Return a ``MockSSLObject`` for ``"ssl_object"``; ``None`` otherwise."""
        if info == "ssl_object":
            return MockSSLObject(http2=self._http2)
        return None

    def __repr__(self) -> str:
        return "<httpcore.AsyncMockStream>"
117
+
118
+
119
class AsyncMockBackend(AsyncNetworkBackend):
    """Async backend whose connections are ``AsyncMockStream`` objects."""

    def __init__(self, buffer: list[bytes], http2: bool = False) -> None:
        self._http2 = http2
        self._buffer = buffer

    def _make_stream(self) -> AsyncNetworkStream:
        # Each connection receives its own copy of the chunk list, so reads on
        # one stream never consume the chunks seen by another.
        return AsyncMockStream(list(self._buffer), http2=self._http2)

    async def connect_tcp(
        self,
        host: str,
        port: int,
        timeout: float | None = None,
        local_address: str | None = None,
        socket_options: typing.Iterable[SOCKET_OPTION] | None = None,
    ) -> AsyncNetworkStream:
        """Return a new mock stream; all arguments are ignored."""
        return self._make_stream()

    async def connect_unix_socket(
        self,
        path: str,
        timeout: float | None = None,
        socket_options: typing.Iterable[SOCKET_OPTION] | None = None,
    ) -> AsyncNetworkStream:
        """Return a new mock stream; all arguments are ignored."""
        return self._make_stream()

    async def sleep(self, seconds: float) -> None:
        """Return immediately without sleeping."""
        return None
LTA_openwebtext_dualt/mini_owt_logdirichlet/.venv_qwen35_uv/lib/python3.12/site-packages/httpcore/_backends/sync.py ADDED
@@ -0,0 +1,241 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import functools
4
+ import socket
5
+ import ssl
6
+ import sys
7
+ import typing
8
+
9
+ from .._exceptions import (
10
+ ConnectError,
11
+ ConnectTimeout,
12
+ ExceptionMapping,
13
+ ReadError,
14
+ ReadTimeout,
15
+ WriteError,
16
+ WriteTimeout,
17
+ map_exceptions,
18
+ )
19
+ from .._utils import is_socket_readable
20
+ from .base import SOCKET_OPTION, NetworkBackend, NetworkStream
21
+
22
+
23
class TLSinTLSStream(NetworkStream):  # pragma: no cover
    """
    TLS stream layered on top of a socket that is already TLS-wrapped.

    The standard `SSLContext.wrap_socket` method does not work for
    `SSLSocket` objects, so this class drives an `SSLObject` through a
    pair of in-memory BIOs and shuttles the encrypted bytes over the
    underlying socket itself.
    """

    # Defined in RFC 8449
    TLS_RECORD_SIZE = 16384

    def __init__(
        self,
        sock: socket.socket,
        ssl_context: ssl.SSLContext,
        server_hostname: str | None = None,
        timeout: float | None = None,
    ):
        self._sock = sock
        self._incoming = ssl.MemoryBIO()
        self._outgoing = ssl.MemoryBIO()

        self.ssl_obj = ssl_context.wrap_bio(
            incoming=self._incoming,
            outgoing=self._outgoing,
            server_hostname=server_hostname,
        )

        # The handshake is performed eagerly, under the caller's timeout.
        self._sock.settimeout(timeout)
        self._perform_io(self.ssl_obj.do_handshake)

    def _perform_io(
        self,
        func: typing.Callable[..., typing.Any],
    ) -> typing.Any:
        """Call ``func`` repeatedly until it stops asking for more I/O.

        After every attempt the pending outgoing bytes are flushed to the
        socket. If the attempt wanted more input, one record's worth of data
        is received and fed to the incoming BIO (or EOF is signalled).
        """
        result = None

        while True:
            wanted = None
            try:
                result = func()
            except (ssl.SSLWantReadError, ssl.SSLWantWriteError) as exc:
                wanted = exc.errno

            self._sock.sendall(self._outgoing.read())

            if wanted == ssl.SSL_ERROR_WANT_READ:
                received = self._sock.recv(self.TLS_RECORD_SIZE)

                if received:
                    self._incoming.write(received)
                else:
                    self._incoming.write_eof()
            if wanted is None:
                return result

    def read(self, max_bytes: int, timeout: float | None = None) -> bytes:
        """Read up to ``max_bytes`` decrypted bytes."""
        exc_map: ExceptionMapping = {socket.timeout: ReadTimeout, OSError: ReadError}
        with map_exceptions(exc_map):
            self._sock.settimeout(timeout)
            reader = functools.partial(self.ssl_obj.read, max_bytes)
            return typing.cast(bytes, self._perform_io(reader))

    def write(self, buffer: bytes, timeout: float | None = None) -> None:
        """Encrypt and send all of ``buffer``."""
        exc_map: ExceptionMapping = {socket.timeout: WriteTimeout, OSError: WriteError}
        with map_exceptions(exc_map):
            self._sock.settimeout(timeout)
            while buffer:
                writer = functools.partial(self.ssl_obj.write, buffer)
                sent = self._perform_io(writer)
                buffer = buffer[sent:]

    def close(self) -> None:
        """Close the underlying socket."""
        self._sock.close()

    def start_tls(
        self,
        ssl_context: ssl.SSLContext,
        server_hostname: str | None = None,
        timeout: float | None = None,
    ) -> NetworkStream:
        """Not supported on a stream that is already TLS-in-TLS."""
        raise NotImplementedError()

    def get_extra_info(self, info: str) -> typing.Any:
        """Return the detail named by ``info``, or ``None`` if unrecognised."""
        if info == "ssl_object":
            return self.ssl_obj
        if info == "client_addr":
            return self._sock.getsockname()
        if info == "server_addr":
            return self._sock.getpeername()
        if info == "socket":
            return self._sock
        if info == "is_readable":
            return is_socket_readable(self._sock)
        return None
118
+
119
+
120
class SyncStream(NetworkStream):
    """Blocking stream backed directly by a ``socket.socket``."""

    def __init__(self, sock: socket.socket) -> None:
        self._sock = sock

    def read(self, max_bytes: int, timeout: float | None = None) -> bytes:
        """Receive up to ``max_bytes`` bytes, mapping socket errors to read errors."""
        exc_map: ExceptionMapping = {socket.timeout: ReadTimeout, OSError: ReadError}
        with map_exceptions(exc_map):
            self._sock.settimeout(timeout)
            return self._sock.recv(max_bytes)

    def write(self, buffer: bytes, timeout: float | None = None) -> None:
        """Send all of ``buffer``; an empty buffer is a no-op."""
        if not buffer:
            return

        exc_map: ExceptionMapping = {socket.timeout: WriteTimeout, OSError: WriteError}
        with map_exceptions(exc_map):
            remaining = buffer
            while remaining:
                # The timeout is re-applied before every partial send.
                self._sock.settimeout(timeout)
                sent = self._sock.send(remaining)
                remaining = remaining[sent:]

    def close(self) -> None:
        """Close the underlying socket."""
        self._sock.close()

    def start_tls(
        self,
        ssl_context: ssl.SSLContext,
        server_hostname: str | None = None,
        timeout: float | None = None,
    ) -> NetworkStream:
        """Upgrade the connection to TLS and return the new stream.

        On any failure the stream is closed before the error propagates.
        """
        exc_map: ExceptionMapping = {
            socket.timeout: ConnectTimeout,
            OSError: ConnectError,
        }
        with map_exceptions(exc_map):
            try:
                if isinstance(self._sock, ssl.SSLSocket):  # pragma: no cover
                    # The socket is already TLS-wrapped (an SSLSocket), so a
                    # plain wrap_socket() will not do: use the TLS-in-TLS
                    # stream instead.
                    return TLSinTLSStream(
                        self._sock, ssl_context, server_hostname, timeout
                    )

                self._sock.settimeout(timeout)
                wrapped = ssl_context.wrap_socket(
                    self._sock, server_hostname=server_hostname
                )
            except Exception as exc:  # pragma: nocover
                self.close()
                raise exc
        return SyncStream(wrapped)

    def get_extra_info(self, info: str) -> typing.Any:
        """Return the detail named by ``info``, or ``None`` if unavailable."""
        sock = self._sock
        if info == "ssl_object" and isinstance(sock, ssl.SSLSocket):
            return sock._sslobj  # type: ignore
        if info == "client_addr":
            return sock.getsockname()
        if info == "server_addr":
            return sock.getpeername()
        if info == "socket":
            return sock
        if info == "is_readable":
            return is_socket_readable(sock)
        return None
185
+
186
+
187
class SyncBackend(NetworkBackend):
    """Blocking backend built on the standard-library ``socket`` module."""

    def connect_tcp(
        self,
        host: str,
        port: int,
        timeout: float | None = None,
        local_address: str | None = None,
        socket_options: typing.Iterable[SOCKET_OPTION] | None = None,
    ) -> NetworkStream:
        """Open a TCP connection and return it wrapped in a ``SyncStream``.

        ``socket.timeout`` is mapped to ``ConnectTimeout`` and any other
        ``OSError`` to ``ConnectError``. If configuring the freshly opened
        socket fails, the socket is closed before the error propagates.
        """
        # Note that we automatically include `TCP_NODELAY`
        # in addition to any other custom socket options.
        if socket_options is None:
            socket_options = []  # pragma: no cover
        address = (host, port)
        source_address = None if local_address is None else (local_address, 0)
        exc_map: ExceptionMapping = {
            socket.timeout: ConnectTimeout,
            OSError: ConnectError,
        }

        with map_exceptions(exc_map):
            sock = socket.create_connection(
                address,
                timeout,
                source_address=source_address,
            )
            try:
                for option in socket_options:
                    sock.setsockopt(*option)  # pragma: no cover
                sock.setsockopt(socket.IPPROTO_TCP, socket.TCP_NODELAY, 1)
            except BaseException:  # pragma: no cover
                # Do not leak the connected socket when an option is rejected.
                sock.close()
                raise
        return SyncStream(sock)

    def connect_unix_socket(
        self,
        path: str,
        timeout: float | None = None,
        socket_options: typing.Iterable[SOCKET_OPTION] | None = None,
    ) -> NetworkStream:  # pragma: nocover
        """Connect to the UNIX domain socket at ``path``.

        Raises ``RuntimeError`` on Windows. ``socket.timeout`` is mapped to
        ``ConnectTimeout`` and any other ``OSError`` to ``ConnectError``. If
        configuring or connecting the socket fails, it is closed before the
        error propagates.
        """
        if sys.platform == "win32":
            raise RuntimeError(
                "Attempted to connect to a UNIX socket on a Windows system."
            )
        if socket_options is None:
            socket_options = []

        exc_map: ExceptionMapping = {
            socket.timeout: ConnectTimeout,
            OSError: ConnectError,
        }
        with map_exceptions(exc_map):
            sock = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM)
            try:
                for option in socket_options:
                    sock.setsockopt(*option)
                sock.settimeout(timeout)
                sock.connect(path)
            except BaseException:
                # Do not leak the file descriptor when the connect fails.
                sock.close()
                raise
        return SyncStream(sock)
LTA_openwebtext_dualt/mini_owt_logdirichlet/.venv_qwen35_uv/lib/python3.12/site-packages/httpcore/_backends/trio.py ADDED
@@ -0,0 +1,159 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import ssl
4
+ import typing
5
+
6
+ import trio
7
+
8
+ from .._exceptions import (
9
+ ConnectError,
10
+ ConnectTimeout,
11
+ ExceptionMapping,
12
+ ReadError,
13
+ ReadTimeout,
14
+ WriteError,
15
+ WriteTimeout,
16
+ map_exceptions,
17
+ )
18
+ from .base import SOCKET_OPTION, AsyncNetworkBackend, AsyncNetworkStream
19
+
20
+
21
class TrioStream(AsyncNetworkStream):
    """Async stream backed by a ``trio.abc.Stream``."""

    def __init__(self, stream: trio.abc.Stream) -> None:
        self._stream = stream

    async def read(self, max_bytes: int, timeout: float | None = None) -> bytes:
        """Receive up to ``max_bytes`` bytes; ``timeout=None`` waits forever."""
        exc_map: ExceptionMapping = {
            trio.TooSlowError: ReadTimeout,
            trio.BrokenResourceError: ReadError,
            trio.ClosedResourceError: ReadError,
        }
        deadline = float("inf") if timeout is None else timeout
        with map_exceptions(exc_map):
            with trio.fail_after(deadline):
                data: bytes = await self._stream.receive_some(max_bytes=max_bytes)
                return data

    async def write(self, buffer: bytes, timeout: float | None = None) -> None:
        """Send all of ``buffer``; an empty buffer is a no-op."""
        if not buffer:
            return

        exc_map: ExceptionMapping = {
            trio.TooSlowError: WriteTimeout,
            trio.BrokenResourceError: WriteError,
            trio.ClosedResourceError: WriteError,
        }
        deadline = float("inf") if timeout is None else timeout
        with map_exceptions(exc_map):
            with trio.fail_after(deadline):
                await self._stream.send_all(data=buffer)

    async def aclose(self) -> None:
        """Close the underlying trio stream."""
        await self._stream.aclose()

    async def start_tls(
        self,
        ssl_context: ssl.SSLContext,
        server_hostname: str | None = None,
        timeout: float | None = None,
    ) -> AsyncNetworkStream:
        """Wrap the stream in TLS, perform the handshake, and return the new stream.

        On any handshake failure the stream is closed before the error propagates.
        """
        deadline = float("inf") if timeout is None else timeout
        exc_map: ExceptionMapping = {
            trio.TooSlowError: ConnectTimeout,
            trio.BrokenResourceError: ConnectError,
        }
        tls_stream = trio.SSLStream(
            self._stream,
            ssl_context=ssl_context,
            server_hostname=server_hostname,
            https_compatible=True,
            server_side=False,
        )
        with map_exceptions(exc_map):
            try:
                with trio.fail_after(deadline):
                    await tls_stream.do_handshake()
            except Exception as exc:  # pragma: nocover
                await self.aclose()
                raise exc
        return TrioStream(tls_stream)

    def get_extra_info(self, info: str) -> typing.Any:
        """Return the detail named by ``info``, or ``None`` if unavailable."""
        if info == "ssl_object" and isinstance(self._stream, trio.SSLStream):
            # Type checkers cannot see `_ssl_object` attribute because trio._ssl.SSLStream uses __getattr__/__setattr__.
            # Tracked at https://github.com/python-trio/trio/issues/542
            return self._stream._ssl_object  # type: ignore[attr-defined]
        if info == "client_addr":
            return self._get_socket_stream().socket.getsockname()
        if info == "server_addr":
            return self._get_socket_stream().socket.getpeername()
        if info == "socket":
            return self._get_socket_stream().socket
        if info == "is_readable":
            return self.get_extra_info("socket").is_readable()
        return None

    def _get_socket_stream(self) -> trio.SocketStream:
        """Peel off any ``SSLStream`` layers and return the raw socket stream."""
        current = self._stream
        while isinstance(current, trio.SSLStream):
            current = current.transport_stream
        assert isinstance(current, trio.SocketStream)
        return current
107
+
108
+
109
class TrioBackend(AsyncNetworkBackend):
    """Async backend built on trio's stream primitives."""

    async def connect_tcp(
        self,
        host: str,
        port: int,
        timeout: float | None = None,
        local_address: str | None = None,
        socket_options: typing.Iterable[SOCKET_OPTION] | None = None,
    ) -> AsyncNetworkStream:
        """Open a TCP stream and apply ``socket_options``; ``timeout=None`` waits forever."""
        # By default for TCP sockets, trio enables TCP_NODELAY.
        # https://trio.readthedocs.io/en/stable/reference-io.html#trio.SocketStream
        if socket_options is None:
            socket_options = []  # pragma: no cover
        exc_map: ExceptionMapping = {
            trio.TooSlowError: ConnectTimeout,
            trio.BrokenResourceError: ConnectError,
            OSError: ConnectError,
        }
        deadline = float("inf") if timeout is None else timeout
        with map_exceptions(exc_map):
            with trio.fail_after(deadline):
                stream: trio.abc.Stream = await trio.open_tcp_stream(
                    host=host, port=port, local_address=local_address
                )
                for option in socket_options:
                    stream.setsockopt(*option)  # type: ignore[attr-defined] # pragma: no cover
        return TrioStream(stream)

    async def connect_unix_socket(
        self,
        path: str,
        timeout: float | None = None,
        socket_options: typing.Iterable[SOCKET_OPTION] | None = None,
    ) -> AsyncNetworkStream:  # pragma: nocover
        """Open a UNIX domain socket stream and apply ``socket_options``."""
        if socket_options is None:
            socket_options = []
        exc_map: ExceptionMapping = {
            trio.TooSlowError: ConnectTimeout,
            trio.BrokenResourceError: ConnectError,
            OSError: ConnectError,
        }
        deadline = float("inf") if timeout is None else timeout
        with map_exceptions(exc_map):
            with trio.fail_after(deadline):
                stream: trio.abc.Stream = await trio.open_unix_socket(path)
                for option in socket_options:
                    stream.setsockopt(*option)  # type: ignore[attr-defined] # pragma: no cover
        return TrioStream(stream)

    async def sleep(self, seconds: float) -> None:
        """Suspend the calling task for ``seconds`` seconds via ``trio.sleep``."""
        await trio.sleep(seconds)  # pragma: nocover
LTA_openwebtext_dualt/mini_owt_logdirichlet/.venv_qwen35_uv/lib/python3.12/site-packages/mdurl/__init__.py ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ __all__ = (
2
+ "decode",
3
+ "DECODE_DEFAULT_CHARS",
4
+ "DECODE_COMPONENT_CHARS",
5
+ "encode",
6
+ "ENCODE_DEFAULT_CHARS",
7
+ "ENCODE_COMPONENT_CHARS",
8
+ "format",
9
+ "parse",
10
+ "URL",
11
+ )
12
+ __version__ = "0.1.2" # DO NOT EDIT THIS LINE MANUALLY. LET bump2version UTILITY DO IT
13
+
14
+ from mdurl._decode import DECODE_COMPONENT_CHARS, DECODE_DEFAULT_CHARS, decode
15
+ from mdurl._encode import ENCODE_COMPONENT_CHARS, ENCODE_DEFAULT_CHARS, encode
16
+ from mdurl._format import format
17
+ from mdurl._parse import url_parse as parse
18
+ from mdurl._url import URL
LTA_openwebtext_dualt/mini_owt_logdirichlet/.venv_qwen35_uv/lib/python3.12/site-packages/mdurl/_decode.py ADDED
@@ -0,0 +1,104 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ from collections.abc import Sequence
4
+ import functools
5
+ import re
6
+
7
DECODE_DEFAULT_CHARS = ";/?:@&=+$,#"
DECODE_COMPONENT_CHARS = ""

# Memoised lookup tables, keyed by the ``exclude`` string they were built for.
decode_cache: dict[str, list[str]] = {}


def get_decode_cache(exclude: str) -> Sequence[str]:
    """Return the 128-entry ASCII lookup table for ``exclude``.

    Entry ``n`` is ``chr(n)``, except for characters listed in ``exclude``,
    whose entry is their upper-case percent-encoded form (so they stay
    encoded). Non-ASCII characters in ``exclude`` are ignored: the table
    only covers code points below 128.
    """
    if exclude in decode_cache:
        return decode_cache[exclude]

    cache = [chr(code) for code in range(128)]
    for ch in exclude:
        ch_code = ord(ch)
        if ch_code < 128:
            cache[ch_code] = f"%{ch_code:02X}"

    # Publish the table only once it is complete, so a failure while building
    # it can never leave a partial table behind for later calls.
    decode_cache[exclude] = cache
    return cache


# Decode percent-encoded string.
#
def decode(string: str, exclude: str = DECODE_DEFAULT_CHARS) -> str:
    """Percent-decode ``string``, leaving characters in ``exclude`` encoded.

    Byte sequences that are not valid UTF-8 are replaced with U+FFFD.
    """
    cache = get_decode_cache(exclude)
    repl_func = functools.partial(repl_func_with_cache, cache=cache)
    return re.sub(r"(%[a-f0-9]{2})+", repl_func, string, flags=re.IGNORECASE)


def _decode_utf8_or_replace(raw: tuple[int, ...]) -> str:
    """Decode ``raw`` as UTF-8, or return one U+FFFD per byte if it is invalid."""
    try:
        return bytes(raw).decode()
    except UnicodeDecodeError:
        return "\ufffd" * len(raw)


def repl_func_with_cache(match: re.Match, cache: Sequence[str]) -> str:
    """Decode one run of consecutive ``%XX`` escapes matched by ``decode``.

    ``cache`` maps ASCII byte values to their replacement text. Multi-byte
    UTF-8 sequences are decoded as a unit; a lead byte without the expected
    continuation bytes yields a single U+FFFD and scanning resumes at the
    next escape.
    """
    seq = match.group()
    parts: list[str] = []

    i = 0
    end = len(seq)
    while i < end:
        # Each escape occupies three characters: "%" plus two hex digits.
        b1 = int(seq[i + 1 : i + 3], 16)

        if b1 < 0x80:
            parts.append(cache[b1])
            i += 3
            continue

        if (b1 & 0xE0) == 0xC0 and (i + 3 < end):
            # 110xxxxx 10xxxxxx
            b2 = int(seq[i + 4 : i + 6], 16)

            if (b2 & 0xC0) == 0x80:
                parts.append(_decode_utf8_or_replace((b1, b2)))
                i += 6
                continue

        if (b1 & 0xF0) == 0xE0 and (i + 6 < end):
            # 1110xxxx 10xxxxxx 10xxxxxx
            b2 = int(seq[i + 4 : i + 6], 16)
            b3 = int(seq[i + 7 : i + 9], 16)

            if (b2 & 0xC0) == 0x80 and (b3 & 0xC0) == 0x80:
                parts.append(_decode_utf8_or_replace((b1, b2, b3)))
                i += 9
                continue

        if (b1 & 0xF8) == 0xF0 and (i + 9 < end):
            # 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
            b2 = int(seq[i + 4 : i + 6], 16)
            b3 = int(seq[i + 7 : i + 9], 16)
            b4 = int(seq[i + 10 : i + 12], 16)

            if (b2 & 0xC0) == 0x80 and (b3 & 0xC0) == 0x80 and (b4 & 0xC0) == 0x80:
                parts.append(_decode_utf8_or_replace((b1, b2, b3, b4)))
                i += 12
                continue

        # Stray continuation byte, truncated sequence, or invalid lead byte.
        parts.append("\ufffd")
        i += 3

    return "".join(parts)
LTA_openwebtext_dualt/mini_owt_logdirichlet/.venv_qwen35_uv/lib/python3.12/site-packages/mdurl/_encode.py ADDED
@@ -0,0 +1,85 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ from collections.abc import Sequence
4
+ from string import ascii_letters, digits, hexdigits
5
+ from urllib.parse import quote as encode_uri_component
6
+
7
ASCII_LETTERS_AND_DIGITS = ascii_letters + digits

ENCODE_DEFAULT_CHARS = ";/?:@&=+$,-_.!~*'()#"
ENCODE_COMPONENT_CHARS = "-_.!~*'()"

# Memoised lookup tables, keyed by the ``exclude`` string they were built for.
encode_cache: dict[str, list[str]] = {}


# Create a lookup array where anything but characters in `chars` string
# and alphanumeric chars is percent-encoded.
def get_encode_cache(exclude: str) -> Sequence[str]:
    """Return the 128-entry ASCII lookup table for ``exclude``.

    Entry ``n`` is ``chr(n)`` for ASCII letters, digits and characters in
    ``exclude``; every other entry is the upper-case percent-encoded form.
    Non-ASCII characters in ``exclude`` are ignored: the table only covers
    code points below 128.
    """
    if exclude in encode_cache:
        return encode_cache[exclude]

    cache = [
        # always allow unencoded alphanumeric characters
        chr(code) if chr(code) in ASCII_LETTERS_AND_DIGITS else f"%{code:02X}"
        for code in range(128)
    ]
    for ch in exclude:
        if ord(ch) < 128:
            cache[ord(ch)] = ch

    # Publish the table only once it is complete, so a failure while building
    # it can never leave a partial table behind for later calls.
    encode_cache[exclude] = cache
    return cache


# Encode unsafe characters with percent-encoding, skipping already
# encoded sequences.
#
#  - string       - string to encode
#  - exclude      - list of characters to ignore (in addition to a-zA-Z0-9)
#  - keepEscaped  - don't encode '%' in a correct escape sequence (default: true)
def encode(
    string: str, exclude: str = ENCODE_DEFAULT_CHARS, *, keep_escaped: bool = True
) -> str:
    """Percent-encode ``string`` and return the result.

    ASCII letters, digits and characters in ``exclude`` are left untouched.
    With ``keep_escaped`` true, an existing ``%XX`` escape is copied through
    rather than having its ``%`` encoded again. A high/low surrogate pair is
    encoded as the single code point it represents; an unpaired surrogate is
    encoded as U+FFFD (``%EF%BF%BD``).
    """
    parts: list[str] = []

    cache = get_encode_cache(exclude)

    end = len(string)
    i = 0
    while i < end:
        ch = string[i]
        code = ord(ch)

        # %
        if keep_escaped and code == 0x25 and i + 2 < end:
            if all(c in hexdigits for c in string[i + 1 : i + 3]):
                parts.append(string[i : i + 3])
                i += 3
                continue

        if code < 128:
            parts.append(cache[code])
            i += 1
            continue

        if 0xD800 <= code <= 0xDFFF:
            if code <= 0xDBFF and i + 1 < end:
                next_code = ord(string[i + 1])
                if 0xDC00 <= next_code <= 0xDFFF:
                    # quote() cannot UTF-8 encode lone surrogates, so merge
                    # the pair into the code point it stands for first.
                    combined = chr(
                        0x10000 + ((code - 0xD800) << 10) + (next_code - 0xDC00)
                    )
                    parts.append(encode_uri_component(combined))
                    i += 2
                    continue
            parts.append("%EF%BF%BD")
            i += 1
            continue

        parts.append(encode_uri_component(ch))
        i += 1

    return "".join(parts)
LTA_openwebtext_dualt/mini_owt_logdirichlet/.venv_qwen35_uv/lib/python3.12/site-packages/mdurl/_format.py ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ from typing import TYPE_CHECKING
4
+
5
+ if TYPE_CHECKING:
6
+ from mdurl._url import URL
7
+
8
+
9
def format(url: URL) -> str:  # noqa: A001
    """Serialise the components of ``url`` back into a single URL string.

    Missing (falsy) components contribute nothing. A hostname containing
    ``":"`` is treated as an IPv6 address and wrapped in square brackets.
    """
    hostname = url.hostname
    if hostname and ":" in hostname:
        # ipv6 address
        host = "[" + hostname + "]"
    else:
        host = hostname or ""

    pieces = (
        url.protocol or "",
        "//" if url.slashes else "",
        url.auth + "@" if url.auth else "",
        host,
        ":" + url.port if url.port else "",
        url.pathname or "",
        url.search or "",
        url.hash or "",
    )
    return "".join(pieces)
LTA_openwebtext_dualt/mini_owt_logdirichlet/.venv_qwen35_uv/lib/python3.12/site-packages/mdurl/_parse.py ADDED
@@ -0,0 +1,304 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright Joyent, Inc. and other Node contributors.
2
+ #
3
+ # Permission is hereby granted, free of charge, to any person obtaining a
4
+ # copy of this software and associated documentation files (the
5
+ # "Software"), to deal in the Software without restriction, including
6
+ # without limitation the rights to use, copy, modify, merge, publish,
7
+ # distribute, sublicense, and/or sell copies of the Software, and to permit
8
+ # persons to whom the Software is furnished to do so, subject to the
9
+ # following conditions:
10
+ #
11
+ # The above copyright notice and this permission notice shall be included
12
+ # in all copies or substantial portions of the Software.
13
+ #
14
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
15
+ # OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
16
+ # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN
17
+ # NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
18
+ # DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
19
+ # OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
20
+ # USE OR OTHER DEALINGS IN THE SOFTWARE.
21
+
22
+
23
+ # Changes from joyent/node:
24
+ #
25
+ # 1. No leading slash in paths,
26
+ # e.g. in `url.parse('http://foo?bar')` pathname is ``, not `/`
27
+ #
28
+ # 2. Backslashes are not replaced with slashes,
29
+ # so `http:\\example.org\` is treated like a relative path
30
+ #
31
+ # 3. Trailing colon is treated like a part of the path,
32
+ # i.e. in `http://example.org:foo` pathname is `:foo`
33
+ #
34
+ # 4. Nothing is URL-encoded in the resulting object,
35
+ # (in joyent/node some chars in auth and paths are encoded)
36
+ #
37
+ # 5. `url.parse()` does not have `parseQueryString` argument
38
+ #
39
+ # 6. Removed extraneous result properties: `host`, `path`, `query`, etc.,
40
+ # which can be constructed using other parts of the url.
41
+
42
+ from __future__ import annotations
43
+
44
+ from collections import defaultdict
45
+ import re
46
+
47
+ from mdurl._url import URL
48
+
49
+ # Reference: RFC 3986, RFC 1808, RFC 2396
50
+
51
+ # define these here so at least they only have to be
52
+ # compiled once on the first module load.
53
PROTOCOL_PATTERN = re.compile(r"^([a-z0-9.+-]+:)", flags=re.IGNORECASE)
PORT_PATTERN = re.compile(r":[0-9]*$")

# Special case for a simple path URL
SIMPLE_PATH_PATTERN = re.compile(r"^(//?(?!/)[^?\s]*)(\?[^\s]*)?$")

# "//user@host" at the very start: such input is always treated as having a
# host, even without a protocol.
_AUTH_IN_HOST_PATTERN = re.compile(r"^//[^@/]+@[^@/]+")

# RFC 2396: characters reserved for delimiting URLs.
# We actually just auto-escape these.
DELIMS = ("<", ">", '"', "`", " ", "\r", "\n", "\t")

# RFC 2396: characters not allowed for various reasons.
UNWISE = ("{", "}", "|", "\\", "^", "`") + DELIMS

# Allowed by RFCs, but cause of XSS attacks. Always escape these.
AUTO_ESCAPE = ("'",) + UNWISE

# Characters that are never ever allowed in a hostname.
# Note that any invalid chars are also handled, but these
# are the ones that are *expected* to be seen, so we fast-path
# them.
NON_HOST_CHARS = ("%", "/", "?", ";", "#") + AUTO_ESCAPE
HOST_ENDING_CHARS = ("/", "?", "#")
HOSTNAME_MAX_LEN = 255
HOSTNAME_PART_PATTERN = re.compile(r"^[+a-z0-9A-Z_-]{0,63}$")
HOSTNAME_PART_START = re.compile(r"^([+a-z0-9A-Z_-]{0,63})(.*)$")

# protocols that never have a hostname.
HOSTLESS_PROTOCOL = defaultdict(
    bool,
    {
        "javascript": True,
        "javascript:": True,
    },
)
# protocols that always contain a // bit.
SLASHED_PROTOCOL = defaultdict(
    bool,
    {
        "http": True,
        "https": True,
        "ftp": True,
        "gopher": True,
        "file": True,
        "http:": True,
        "https:": True,
        "ftp:": True,
        "gopher:": True,
        "file:": True,
    },
)


def _first_index_of_any(text: str, chars: tuple[str, ...]) -> int:
    """Return the lowest index in *text* of any string in *chars*, or -1."""
    hits = [idx for idx in map(text.find, chars) if idx != -1]
    return min(hits, default=-1)


class MutableURL:
    """Mutable scratch object that splits a URL string into its components.

    All components start out as ``None`` (``slashes`` as ``False``) and are
    filled in by :meth:`parse`. Nothing is URL-encoded or decoded.
    """

    def __init__(self) -> None:
        self.protocol: str | None = None
        self.slashes: bool = False
        self.auth: str | None = None
        self.port: str | None = None
        self.hostname: str | None = None
        self.hash: str | None = None
        self.search: str | None = None
        self.pathname: str | None = None

    def parse(self, url: str, slashes_denote_host: bool) -> "MutableURL":
        """Populate this object from *url* and return ``self``.

        :param url: the URL text; surrounding whitespace is ignored.
        :param slashes_denote_host: when true, a leading ``//`` introduces a
            host even if no protocol is present, and the simple-path fast
            path is skipped.
        """
        lower_proto = ""
        slashes = False

        # Trim first, so that input such as " http://foo.com \n" parses.
        rest = url.strip()

        if not slashes_denote_host and "#" not in url:
            # Fast path: a plain "/path?query" needs no further analysis.
            simple_path = SIMPLE_PATH_PATTERN.match(rest)
            if simple_path:
                self.pathname = simple_path.group(1)
                if simple_path.group(2):
                    self.search = simple_path.group(2)
                return self

        proto = ""
        proto_match = PROTOCOL_PATTERN.match(rest)
        if proto_match:
            proto = proto_match.group()
            lower_proto = proto.lower()
            self.protocol = proto
            rest = rest[len(proto) :]

        # Use .get() rather than indexing: the tables are defaultdicts, and
        # indexing a missing key would insert it, growing the module-global
        # tables by one entry for every distinct protocol ever parsed.
        hostless = HOSTLESS_PROTOCOL.get(proto, False)
        slashed = SLASHED_PROTOCOL.get(proto, False)

        # figure out if it's got a host
        # user@server is *always* interpreted as a hostname, and url
        # resolution will treat //foo/bar as host=foo,path=bar because that's
        # how the browser resolves relative URLs.
        if slashes_denote_host or proto or _AUTH_IN_HOST_PATTERN.search(rest):
            slashes = rest.startswith("//")
            if slashes and not (proto and hostless):
                rest = rest[2:]
                self.slashes = True

        if not hostless and (slashes or (proto and not slashed)):
            # there's a hostname.
            # the first instance of /, ?, ;, or # ends the host.
            #
            # If there is an @ in the hostname, then non-host chars *are*
            # allowed to the left of the last @ sign, unless some host-ending
            # character comes *before* the @-sign.
            #
            # ex:
            # http://a@b@c/ => user:a@b host:c
            # http://a@b?@c => user:a host:c path:/?@c

            host_end = _first_index_of_any(rest, HOST_ENDING_CHARS)

            # Either there is an explicit point which the auth portion cannot
            # go past, or the last @ char is the decider.
            if host_end == -1:
                at_sign = rest.rfind("@")
            else:
                # http://a@b/c@d => host:b auth:a path:/c@d
                at_sign = rest.rfind("@", 0, host_end + 1)

            # Everything left of that @ is definitely the auth; pull it off.
            if at_sign != -1:
                self.auth = rest[:at_sign]
                rest = rest[at_sign + 1 :]

            # the host is what remains to the left of the first non-host char;
            # if there is none, the entire remainder is the host.
            host_end = _first_index_of_any(rest, NON_HOST_CHARS)
            if host_end == -1:
                host_end = len(rest)

            # A colon directly before the end is left out of the host and so
            # becomes the first character of the remaining text.
            if host_end > 0 and rest[host_end - 1] == ":":
                host_end -= 1
            host = rest[:host_end]
            rest = rest[host_end:]

            # pull out port.
            self.parse_host(host)

            # we've indicated that there is a hostname,
            # so even if it's empty, it has to be present.
            self.hostname = self.hostname or ""

            # if hostname begins with [ and ends with ]
            # assume that it's an IPv6 address.
            ipv6_hostname = self.hostname.startswith("[") and self.hostname.endswith(
                "]"
            )

            # validate a little.
            if not ipv6_hostname:
                hostparts = self.hostname.split(".")
                for index, part in enumerate(hostparts):
                    if not part or HOSTNAME_PART_PATTERN.search(part):
                        continue

                    # Swap each non-ASCII char for a placeholder (rather than
                    # dropping it) so the length check in the pattern still
                    # sees the right size, then test again.
                    ascii_part = "".join(
                        "x" if ord(char) > 127 else char for char in part
                    )
                    if HOSTNAME_PART_PATTERN.search(ascii_part):
                        continue

                    # This label is invalid: keep its valid prefix in the
                    # hostname and push everything after it back onto `rest`.
                    valid_parts = hostparts[:index]
                    not_host = hostparts[index + 1 :]
                    bit = HOSTNAME_PART_START.search(part)
                    if bit:
                        valid_parts.append(bit.group(1))
                        not_host.insert(0, bit.group(2))
                    if not_host:
                        rest = ".".join(not_host) + rest
                    self.hostname = ".".join(valid_parts)
                    break

            if len(self.hostname) > HOSTNAME_MAX_LEN:
                self.hostname = ""

            # strip [ and ] from the hostname
            if ipv6_hostname:
                self.hostname = self.hostname[1:-1]

        # chop off from the tail first.
        hash_start = rest.find("#")
        if hash_start != -1:
            # got a fragment string.
            self.hash = rest[hash_start:]
            rest = rest[:hash_start]
        query_start = rest.find("?")
        if query_start != -1:
            self.search = rest[query_start:]
            rest = rest[:query_start]
        if rest:
            self.pathname = rest
        if (
            SLASHED_PROTOCOL.get(lower_proto, False)
            and self.hostname
            and not self.pathname
        ):
            self.pathname = ""

        return self

    def parse_host(self, host: str) -> None:
        """Split a trailing ``:port`` off *host* and store both pieces.

        ``self.port`` is set only when digits follow the colon, and
        ``self.hostname`` only when the remaining host text is non-empty.
        """
        port_match = PORT_PATTERN.search(host)
        if port_match:
            port = port_match.group()
            if port != ":":
                self.port = port[1:]
            host = host[: -len(port)]
        if host:
            self.hostname = host
295
+
296
+
297
def url_parse(url: URL | str, *, slashes_denote_host: bool = False) -> URL:
    """Parse *url* into a ``URL`` tuple.

    A value that is already a ``URL`` is returned unchanged. Otherwise the
    string is parsed with ``MutableURL.parse`` and the resulting components
    are copied into a new ``URL``.

    :param url: a URL string, or an already parsed ``URL``.
    :param slashes_denote_host: forwarded to ``MutableURL.parse``.
    """
    if isinstance(url, URL):
        return url

    parsed = MutableURL().parse(url, slashes_denote_host)
    return URL(
        protocol=parsed.protocol,
        slashes=parsed.slashes,
        auth=parsed.auth,
        port=parsed.port,
        hostname=parsed.hostname,
        hash=parsed.hash,
        search=parsed.search,
        pathname=parsed.pathname,
    )
LTA_openwebtext_dualt/mini_owt_logdirichlet/.venv_qwen35_uv/lib/python3.12/site-packages/mdurl/_url.py ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ from typing import NamedTuple
4
+
5
+
6
class URL(NamedTuple):
    """Immutable record of the components of a URL.

    Every string component is optional and is ``None`` when absent.
    """

    # Scheme part of the URL.
    protocol: str | None
    # Whether the URL had a "//" introducing the host.
    slashes: bool
    # Credentials part (the text before "@").
    auth: str | None
    # Port, kept as a string.
    port: str | None
    # Host name.
    hostname: str | None
    # Fragment part.
    hash: str | None  # noqa: A003
    # Query part.
    search: str | None
    # Path part.
    pathname: str | None
LTA_openwebtext_dualt/mini_owt_logdirichlet/.venv_qwen35_uv/lib/python3.12/site-packages/mdurl/py.typed ADDED
@@ -0,0 +1 @@
 
 
1
+ # Marker file for PEP 561
LTA_openwebtext_dualt/mini_owt_logdirichlet/.venv_qwen35_uv/lib/python3.12/site-packages/numpy-1.26.4.dist-info/INSTALLER ADDED
@@ -0,0 +1 @@
 
 
1
+ uv
LTA_openwebtext_dualt/mini_owt_logdirichlet/.venv_qwen35_uv/lib/python3.12/site-packages/numpy-1.26.4.dist-info/LICENSE.txt ADDED
@@ -0,0 +1,971 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Copyright (c) 2005-2023, NumPy Developers.
2
+ All rights reserved.
3
+
4
+ Redistribution and use in source and binary forms, with or without
5
+ modification, are permitted provided that the following conditions are
6
+ met:
7
+
8
+ * Redistributions of source code must retain the above copyright
9
+ notice, this list of conditions and the following disclaimer.
10
+
11
+ * Redistributions in binary form must reproduce the above
12
+ copyright notice, this list of conditions and the following
13
+ disclaimer in the documentation and/or other materials provided
14
+ with the distribution.
15
+
16
+ * Neither the name of the NumPy Developers nor the names of any
17
+ contributors may be used to endorse or promote products derived
18
+ from this software without specific prior written permission.
19
+
20
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
21
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
22
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
23
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
24
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
25
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
26
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
27
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
28
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
29
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
30
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
31
+
32
+ ----
33
+
34
+ The NumPy repository and source distributions bundle several libraries that are
35
+ compatibly licensed. We list these here.
36
+
37
+ Name: lapack-lite
38
+ Files: numpy/linalg/lapack_lite/*
39
+ License: BSD-3-Clause
40
+ For details, see numpy/linalg/lapack_lite/LICENSE.txt
41
+
42
+ Name: tempita
43
+ Files: tools/npy_tempita/*
44
+ License: MIT
45
+ For details, see tools/npy_tempita/license.txt
46
+
47
+ Name: dragon4
48
+ Files: numpy/core/src/multiarray/dragon4.c
49
+ License: MIT
50
+ For license text, see numpy/core/src/multiarray/dragon4.c
51
+
52
+ Name: libdivide
53
+ Files: numpy/core/include/numpy/libdivide/*
54
+ License: Zlib
55
+ For license text, see numpy/core/include/numpy/libdivide/LICENSE.txt
56
+
57
+
58
+ Note that the following files are vendored in the repository and sdist but not
59
+ installed in built numpy packages:
60
+
61
+ Name: Meson
62
+ Files: vendored-meson/meson/*
63
+ License: Apache 2.0
64
+ For license text, see vendored-meson/meson/COPYING
65
+
66
+ Name: spin
67
+ Files: .spin/cmds.py
68
+ License: BSD-3
69
+ For license text, see .spin/LICENSE
70
+
71
+ ----
72
+
73
+ This binary distribution of NumPy also bundles the following software:
74
+
75
+
76
+ Name: OpenBLAS
77
+ Files: numpy.libs/libopenblas*.so
78
+ Description: bundled as a dynamically linked library
79
+ Availability: https://github.com/OpenMathLib/OpenBLAS/
80
+ License: BSD-3-Clause
81
+ Copyright (c) 2011-2014, The OpenBLAS Project
82
+ All rights reserved.
83
+
84
+ Redistribution and use in source and binary forms, with or without
85
+ modification, are permitted provided that the following conditions are
86
+ met:
87
+
88
+ 1. Redistributions of source code must retain the above copyright
89
+ notice, this list of conditions and the following disclaimer.
90
+
91
+ 2. Redistributions in binary form must reproduce the above copyright
92
+ notice, this list of conditions and the following disclaimer in
93
+ the documentation and/or other materials provided with the
94
+ distribution.
95
+ 3. Neither the name of the OpenBLAS project nor the names of
96
+ its contributors may be used to endorse or promote products
97
+ derived from this software without specific prior written
98
+ permission.
99
+
100
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
101
+ AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
102
+ IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
103
+ ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
104
+ LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
105
+ DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
106
+ SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
107
+ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
108
+ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
109
+ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
110
+
111
+
112
+ Name: LAPACK
113
+ Files: numpy.libs/libopenblas*.so
114
+ Description: bundled in OpenBLAS
115
+ Availability: https://github.com/OpenMathLib/OpenBLAS/
116
+ License: BSD-3-Clause-Attribution
117
+ Copyright (c) 1992-2013 The University of Tennessee and The University
118
+ of Tennessee Research Foundation. All rights
119
+ reserved.
120
+ Copyright (c) 2000-2013 The University of California Berkeley. All
121
+ rights reserved.
122
+ Copyright (c) 2006-2013 The University of Colorado Denver. All rights
123
+ reserved.
124
+
125
+ $COPYRIGHT$
126
+
127
+ Additional copyrights may follow
128
+
129
+ $HEADER$
130
+
131
+ Redistribution and use in source and binary forms, with or without
132
+ modification, are permitted provided that the following conditions are
133
+ met:
134
+
135
+ - Redistributions of source code must retain the above copyright
136
+ notice, this list of conditions and the following disclaimer.
137
+
138
+ - Redistributions in binary form must reproduce the above copyright
139
+ notice, this list of conditions and the following disclaimer listed
140
+ in this license in the documentation and/or other materials
141
+ provided with the distribution.
142
+
143
+ - Neither the name of the copyright holders nor the names of its
144
+ contributors may be used to endorse or promote products derived from
145
+ this software without specific prior written permission.
146
+
147
+ The copyright holders provide no reassurances that the source code
148
+ provided does not infringe any patent, copyright, or any other
149
+ intellectual property rights of third parties. The copyright holders
150
+ disclaim any liability to any recipient for claims brought against
151
+ recipient by any third party for infringement of that parties
152
+ intellectual property rights.
153
+
154
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
155
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
156
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
157
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
158
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
159
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
160
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
161
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
162
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
163
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
164
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
165
+
166
+
167
+ Name: GCC runtime library
168
+ Files: numpy.libs/libgfortran*.so
169
+ Description: dynamically linked to files compiled with gcc
170
+ Availability: https://gcc.gnu.org/git/?p=gcc.git;a=tree;f=libgfortran
171
+ License: GPL-3.0-with-GCC-exception
172
+ Copyright (C) 2002-2017 Free Software Foundation, Inc.
173
+
174
+ Libgfortran is free software; you can redistribute it and/or modify
175
+ it under the terms of the GNU General Public License as published by
176
+ the Free Software Foundation; either version 3, or (at your option)
177
+ any later version.
178
+
179
+ Libgfortran is distributed in the hope that it will be useful,
180
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
181
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
182
+ GNU General Public License for more details.
183
+
184
+ Under Section 7 of GPL version 3, you are granted additional
185
+ permissions described in the GCC Runtime Library Exception, version
186
+ 3.1, as published by the Free Software Foundation.
187
+
188
+ You should have received a copy of the GNU General Public License and
189
+ a copy of the GCC Runtime Library Exception along with this program;
190
+ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
191
+ <http://www.gnu.org/licenses/>.
192
+
193
+ ----
194
+
195
+ Full text of license texts referred to above follows (that they are
196
+ listed below does not necessarily imply the conditions apply to the
197
+ present binary release):
198
+
199
+ ----
200
+
201
+ GCC RUNTIME LIBRARY EXCEPTION
202
+
203
+ Version 3.1, 31 March 2009
204
+
205
+ Copyright (C) 2009 Free Software Foundation, Inc. <http://fsf.org/>
206
+
207
+ Everyone is permitted to copy and distribute verbatim copies of this
208
+ license document, but changing it is not allowed.
209
+
210
+ This GCC Runtime Library Exception ("Exception") is an additional
211
+ permission under section 7 of the GNU General Public License, version
212
+ 3 ("GPLv3"). It applies to a given file (the "Runtime Library") that
213
+ bears a notice placed by the copyright holder of the file stating that
214
+ the file is governed by GPLv3 along with this Exception.
215
+
216
+ When you use GCC to compile a program, GCC may combine portions of
217
+ certain GCC header files and runtime libraries with the compiled
218
+ program. The purpose of this Exception is to allow compilation of
219
+ non-GPL (including proprietary) programs to use, in this way, the
220
+ header files and runtime libraries covered by this Exception.
221
+
222
+ 0. Definitions.
223
+
224
+ A file is an "Independent Module" if it either requires the Runtime
225
+ Library for execution after a Compilation Process, or makes use of an
226
+ interface provided by the Runtime Library, but is not otherwise based
227
+ on the Runtime Library.
228
+
229
+ "GCC" means a version of the GNU Compiler Collection, with or without
230
+ modifications, governed by version 3 (or a specified later version) of
231
+ the GNU General Public License (GPL) with the option of using any
232
+ subsequent versions published by the FSF.
233
+
234
+ "GPL-compatible Software" is software whose conditions of propagation,
235
+ modification and use would permit combination with GCC in accord with
236
+ the license of GCC.
237
+
238
+ "Target Code" refers to output from any compiler for a real or virtual
239
+ target processor architecture, in executable form or suitable for
240
+ input to an assembler, loader, linker and/or execution
241
+ phase. Notwithstanding that, Target Code does not include data in any
242
+ format that is used as a compiler intermediate representation, or used
243
+ for producing a compiler intermediate representation.
244
+
245
+ The "Compilation Process" transforms code entirely represented in
246
+ non-intermediate languages designed for human-written code, and/or in
247
+ Java Virtual Machine byte code, into Target Code. Thus, for example,
248
+ use of source code generators and preprocessors need not be considered
249
+ part of the Compilation Process, since the Compilation Process can be
250
+ understood as starting with the output of the generators or
251
+ preprocessors.
252
+
253
+ A Compilation Process is "Eligible" if it is done using GCC, alone or
254
+ with other GPL-compatible software, or if it is done without using any
255
+ work based on GCC. For example, using non-GPL-compatible Software to
256
+ optimize any GCC intermediate representations would not qualify as an
257
+ Eligible Compilation Process.
258
+
259
+ 1. Grant of Additional Permission.
260
+
261
+ You have permission to propagate a work of Target Code formed by
262
+ combining the Runtime Library with Independent Modules, even if such
263
+ propagation would otherwise violate the terms of GPLv3, provided that
264
+ all Target Code was generated by Eligible Compilation Processes. You
265
+ may then convey such a combination under terms of your choice,
266
+ consistent with the licensing of the Independent Modules.
267
+
268
+ 2. No Weakening of GCC Copyleft.
269
+
270
+ The availability of this Exception does not imply any general
271
+ presumption that third-party software is unaffected by the copyleft
272
+ requirements of the license of GCC.
273
+
274
+ ----
275
+
276
+ GNU GENERAL PUBLIC LICENSE
277
+ Version 3, 29 June 2007
278
+
279
+ Copyright (C) 2007 Free Software Foundation, Inc. <http://fsf.org/>
280
+ Everyone is permitted to copy and distribute verbatim copies
281
+ of this license document, but changing it is not allowed.
282
+
283
+ Preamble
284
+
285
+ The GNU General Public License is a free, copyleft license for
286
+ software and other kinds of works.
287
+
288
+ The licenses for most software and other practical works are designed
289
+ to take away your freedom to share and change the works. By contrast,
290
+ the GNU General Public License is intended to guarantee your freedom to
291
+ share and change all versions of a program--to make sure it remains free
292
+ software for all its users. We, the Free Software Foundation, use the
293
+ GNU General Public License for most of our software; it applies also to
294
+ any other work released this way by its authors. You can apply it to
295
+ your programs, too.
296
+
297
+ When we speak of free software, we are referring to freedom, not
298
+ price. Our General Public Licenses are designed to make sure that you
299
+ have the freedom to distribute copies of free software (and charge for
300
+ them if you wish), that you receive source code or can get it if you
301
+ want it, that you can change the software or use pieces of it in new
302
+ free programs, and that you know you can do these things.
303
+
304
+ To protect your rights, we need to prevent others from denying you
305
+ these rights or asking you to surrender the rights. Therefore, you have
306
+ certain responsibilities if you distribute copies of the software, or if
307
+ you modify it: responsibilities to respect the freedom of others.
308
+
309
+ For example, if you distribute copies of such a program, whether
310
+ gratis or for a fee, you must pass on to the recipients the same
311
+ freedoms that you received. You must make sure that they, too, receive
312
+ or can get the source code. And you must show them these terms so they
313
+ know their rights.
314
+
315
+ Developers that use the GNU GPL protect your rights with two steps:
316
+ (1) assert copyright on the software, and (2) offer you this License
317
+ giving you legal permission to copy, distribute and/or modify it.
318
+
319
+ For the developers' and authors' protection, the GPL clearly explains
320
+ that there is no warranty for this free software. For both users' and
321
+ authors' sake, the GPL requires that modified versions be marked as
322
+ changed, so that their problems will not be attributed erroneously to
323
+ authors of previous versions.
324
+
325
+ Some devices are designed to deny users access to install or run
326
+ modified versions of the software inside them, although the manufacturer
327
+ can do so. This is fundamentally incompatible with the aim of
328
+ protecting users' freedom to change the software. The systematic
329
+ pattern of such abuse occurs in the area of products for individuals to
330
+ use, which is precisely where it is most unacceptable. Therefore, we
331
+ have designed this version of the GPL to prohibit the practice for those
332
+ products. If such problems arise substantially in other domains, we
333
+ stand ready to extend this provision to those domains in future versions
334
+ of the GPL, as needed to protect the freedom of users.
335
+
336
+ Finally, every program is threatened constantly by software patents.
337
+ States should not allow patents to restrict development and use of
338
+ software on general-purpose computers, but in those that do, we wish to
339
+ avoid the special danger that patents applied to a free program could
340
+ make it effectively proprietary. To prevent this, the GPL assures that
341
+ patents cannot be used to render the program non-free.
342
+
343
+ The precise terms and conditions for copying, distribution and
344
+ modification follow.
345
+
346
+ TERMS AND CONDITIONS
347
+
348
+ 0. Definitions.
349
+
350
+ "This License" refers to version 3 of the GNU General Public License.
351
+
352
+ "Copyright" also means copyright-like laws that apply to other kinds of
353
+ works, such as semiconductor masks.
354
+
355
+ "The Program" refers to any copyrightable work licensed under this
356
+ License. Each licensee is addressed as "you". "Licensees" and
357
+ "recipients" may be individuals or organizations.
358
+
359
+ To "modify" a work means to copy from or adapt all or part of the work
360
+ in a fashion requiring copyright permission, other than the making of an
361
+ exact copy. The resulting work is called a "modified version" of the
362
+ earlier work or a work "based on" the earlier work.
363
+
364
+ A "covered work" means either the unmodified Program or a work based
365
+ on the Program.
366
+
367
+ To "propagate" a work means to do anything with it that, without
368
+ permission, would make you directly or secondarily liable for
369
+ infringement under applicable copyright law, except executing it on a
370
+ computer or modifying a private copy. Propagation includes copying,
371
+ distribution (with or without modification), making available to the
372
+ public, and in some countries other activities as well.
373
+
374
+ To "convey" a work means any kind of propagation that enables other
375
+ parties to make or receive copies. Mere interaction with a user through
376
+ a computer network, with no transfer of a copy, is not conveying.
377
+
378
+ An interactive user interface displays "Appropriate Legal Notices"
379
+ to the extent that it includes a convenient and prominently visible
380
+ feature that (1) displays an appropriate copyright notice, and (2)
381
+ tells the user that there is no warranty for the work (except to the
382
+ extent that warranties are provided), that licensees may convey the
383
+ work under this License, and how to view a copy of this License. If
384
+ the interface presents a list of user commands or options, such as a
385
+ menu, a prominent item in the list meets this criterion.
386
+
387
+ 1. Source Code.
388
+
389
+ The "source code" for a work means the preferred form of the work
390
+ for making modifications to it. "Object code" means any non-source
391
+ form of a work.
392
+
393
+ A "Standard Interface" means an interface that either is an official
394
+ standard defined by a recognized standards body, or, in the case of
395
+ interfaces specified for a particular programming language, one that
396
+ is widely used among developers working in that language.
397
+
398
+ The "System Libraries" of an executable work include anything, other
399
+ than the work as a whole, that (a) is included in the normal form of
400
+ packaging a Major Component, but which is not part of that Major
401
+ Component, and (b) serves only to enable use of the work with that
402
+ Major Component, or to implement a Standard Interface for which an
403
+ implementation is available to the public in source code form. A
404
+ "Major Component", in this context, means a major essential component
405
+ (kernel, window system, and so on) of the specific operating system
406
+ (if any) on which the executable work runs, or a compiler used to
407
+ produce the work, or an object code interpreter used to run it.
408
+
409
+ The "Corresponding Source" for a work in object code form means all
410
+ the source code needed to generate, install, and (for an executable
411
+ work) run the object code and to modify the work, including scripts to
412
+ control those activities. However, it does not include the work's
413
+ System Libraries, or general-purpose tools or generally available free
414
+ programs which are used unmodified in performing those activities but
415
+ which are not part of the work. For example, Corresponding Source
416
+ includes interface definition files associated with source files for
417
+ the work, and the source code for shared libraries and dynamically
418
+ linked subprograms that the work is specifically designed to require,
419
+ such as by intimate data communication or control flow between those
420
+ subprograms and other parts of the work.
421
+
422
+ The Corresponding Source need not include anything that users
423
+ can regenerate automatically from other parts of the Corresponding
424
+ Source.
425
+
426
+ The Corresponding Source for a work in source code form is that
427
+ same work.
428
+
429
+ 2. Basic Permissions.
430
+
431
+ All rights granted under this License are granted for the term of
432
+ copyright on the Program, and are irrevocable provided the stated
433
+ conditions are met. This License explicitly affirms your unlimited
434
+ permission to run the unmodified Program. The output from running a
435
+ covered work is covered by this License only if the output, given its
436
+ content, constitutes a covered work. This License acknowledges your
437
+ rights of fair use or other equivalent, as provided by copyright law.
438
+
439
+ You may make, run and propagate covered works that you do not
440
+ convey, without conditions so long as your license otherwise remains
441
+ in force. You may convey covered works to others for the sole purpose
442
+ of having them make modifications exclusively for you, or provide you
443
+ with facilities for running those works, provided that you comply with
444
+ the terms of this License in conveying all material for which you do
445
+ not control copyright. Those thus making or running the covered works
446
+ for you must do so exclusively on your behalf, under your direction
447
+ and control, on terms that prohibit them from making any copies of
448
+ your copyrighted material outside their relationship with you.
449
+
450
+ Conveying under any other circumstances is permitted solely under
451
+ the conditions stated below. Sublicensing is not allowed; section 10
452
+ makes it unnecessary.
453
+
454
+ 3. Protecting Users' Legal Rights From Anti-Circumvention Law.
455
+
456
+ No covered work shall be deemed part of an effective technological
457
+ measure under any applicable law fulfilling obligations under article
458
+ 11 of the WIPO copyright treaty adopted on 20 December 1996, or
459
+ similar laws prohibiting or restricting circumvention of such
460
+ measures.
461
+
462
+ When you convey a covered work, you waive any legal power to forbid
463
+ circumvention of technological measures to the extent such circumvention
464
+ is effected by exercising rights under this License with respect to
465
+ the covered work, and you disclaim any intention to limit operation or
466
+ modification of the work as a means of enforcing, against the work's
467
+ users, your or third parties' legal rights to forbid circumvention of
468
+ technological measures.
469
+
470
+ 4. Conveying Verbatim Copies.
471
+
472
+ You may convey verbatim copies of the Program's source code as you
473
+ receive it, in any medium, provided that you conspicuously and
474
+ appropriately publish on each copy an appropriate copyright notice;
475
+ keep intact all notices stating that this License and any
476
+ non-permissive terms added in accord with section 7 apply to the code;
477
+ keep intact all notices of the absence of any warranty; and give all
478
+ recipients a copy of this License along with the Program.
479
+
480
+ You may charge any price or no price for each copy that you convey,
481
+ and you may offer support or warranty protection for a fee.
482
+
483
+ 5. Conveying Modified Source Versions.
484
+
485
+ You may convey a work based on the Program, or the modifications to
486
+ produce it from the Program, in the form of source code under the
487
+ terms of section 4, provided that you also meet all of these conditions:
488
+
489
+ a) The work must carry prominent notices stating that you modified
490
+ it, and giving a relevant date.
491
+
492
+ b) The work must carry prominent notices stating that it is
493
+ released under this License and any conditions added under section
494
+ 7. This requirement modifies the requirement in section 4 to
495
+ "keep intact all notices".
496
+
497
+ c) You must license the entire work, as a whole, under this
498
+ License to anyone who comes into possession of a copy. This
499
+ License will therefore apply, along with any applicable section 7
500
+ additional terms, to the whole of the work, and all its parts,
501
+ regardless of how they are packaged. This License gives no
502
+ permission to license the work in any other way, but it does not
503
+ invalidate such permission if you have separately received it.
504
+
505
+ d) If the work has interactive user interfaces, each must display
506
+ Appropriate Legal Notices; however, if the Program has interactive
507
+ interfaces that do not display Appropriate Legal Notices, your
508
+ work need not make them do so.
509
+
510
+ A compilation of a covered work with other separate and independent
511
+ works, which are not by their nature extensions of the covered work,
512
+ and which are not combined with it such as to form a larger program,
513
+ in or on a volume of a storage or distribution medium, is called an
514
+ "aggregate" if the compilation and its resulting copyright are not
515
+ used to limit the access or legal rights of the compilation's users
516
+ beyond what the individual works permit. Inclusion of a covered work
517
+ in an aggregate does not cause this License to apply to the other
518
+ parts of the aggregate.
519
+
520
+ 6. Conveying Non-Source Forms.
521
+
522
+ You may convey a covered work in object code form under the terms
523
+ of sections 4 and 5, provided that you also convey the
524
+ machine-readable Corresponding Source under the terms of this License,
525
+ in one of these ways:
526
+
527
+ a) Convey the object code in, or embodied in, a physical product
528
+ (including a physical distribution medium), accompanied by the
529
+ Corresponding Source fixed on a durable physical medium
530
+ customarily used for software interchange.
531
+
532
+ b) Convey the object code in, or embodied in, a physical product
533
+ (including a physical distribution medium), accompanied by a
534
+ written offer, valid for at least three years and valid for as
535
+ long as you offer spare parts or customer support for that product
536
+ model, to give anyone who possesses the object code either (1) a
537
+ copy of the Corresponding Source for all the software in the
538
+ product that is covered by this License, on a durable physical
539
+ medium customarily used for software interchange, for a price no
540
+ more than your reasonable cost of physically performing this
541
+ conveying of source, or (2) access to copy the
542
+ Corresponding Source from a network server at no charge.
543
+
544
+ c) Convey individual copies of the object code with a copy of the
545
+ written offer to provide the Corresponding Source. This
546
+ alternative is allowed only occasionally and noncommercially, and
547
+ only if you received the object code with such an offer, in accord
548
+ with subsection 6b.
549
+
550
+ d) Convey the object code by offering access from a designated
551
+ place (gratis or for a charge), and offer equivalent access to the
552
+ Corresponding Source in the same way through the same place at no
553
+ further charge. You need not require recipients to copy the
554
+ Corresponding Source along with the object code. If the place to
555
+ copy the object code is a network server, the Corresponding Source
556
+ may be on a different server (operated by you or a third party)
557
+ that supports equivalent copying facilities, provided you maintain
558
+ clear directions next to the object code saying where to find the
559
+ Corresponding Source. Regardless of what server hosts the
560
+ Corresponding Source, you remain obligated to ensure that it is
561
+ available for as long as needed to satisfy these requirements.
562
+
563
+ e) Convey the object code using peer-to-peer transmission, provided
564
+ you inform other peers where the object code and Corresponding
565
+ Source of the work are being offered to the general public at no
566
+ charge under subsection 6d.
567
+
568
+ A separable portion of the object code, whose source code is excluded
569
+ from the Corresponding Source as a System Library, need not be
570
+ included in conveying the object code work.
571
+
572
+ A "User Product" is either (1) a "consumer product", which means any
573
+ tangible personal property which is normally used for personal, family,
574
+ or household purposes, or (2) anything designed or sold for incorporation
575
+ into a dwelling. In determining whether a product is a consumer product,
576
+ doubtful cases shall be resolved in favor of coverage. For a particular
577
+ product received by a particular user, "normally used" refers to a
578
+ typical or common use of that class of product, regardless of the status
579
+ of the particular user or of the way in which the particular user
580
+ actually uses, or expects or is expected to use, the product. A product
581
+ is a consumer product regardless of whether the product has substantial
582
+ commercial, industrial or non-consumer uses, unless such uses represent
583
+ the only significant mode of use of the product.
584
+
585
+ "Installation Information" for a User Product means any methods,
586
+ procedures, authorization keys, or other information required to install
587
+ and execute modified versions of a covered work in that User Product from
588
+ a modified version of its Corresponding Source. The information must
589
+ suffice to ensure that the continued functioning of the modified object
590
+ code is in no case prevented or interfered with solely because
591
+ modification has been made.
592
+
593
+ If you convey an object code work under this section in, or with, or
594
+ specifically for use in, a User Product, and the conveying occurs as
595
+ part of a transaction in which the right of possession and use of the
596
+ User Product is transferred to the recipient in perpetuity or for a
597
+ fixed term (regardless of how the transaction is characterized), the
598
+ Corresponding Source conveyed under this section must be accompanied
599
+ by the Installation Information. But this requirement does not apply
600
+ if neither you nor any third party retains the ability to install
601
+ modified object code on the User Product (for example, the work has
602
+ been installed in ROM).
603
+
604
+ The requirement to provide Installation Information does not include a
605
+ requirement to continue to provide support service, warranty, or updates
606
+ for a work that has been modified or installed by the recipient, or for
607
+ the User Product in which it has been modified or installed. Access to a
608
+ network may be denied when the modification itself materially and
609
+ adversely affects the operation of the network or violates the rules and
610
+ protocols for communication across the network.
611
+
612
+ Corresponding Source conveyed, and Installation Information provided,
613
+ in accord with this section must be in a format that is publicly
614
+ documented (and with an implementation available to the public in
615
+ source code form), and must require no special password or key for
616
+ unpacking, reading or copying.
617
+
618
+ 7. Additional Terms.
619
+
620
+ "Additional permissions" are terms that supplement the terms of this
621
+ License by making exceptions from one or more of its conditions.
622
+ Additional permissions that are applicable to the entire Program shall
623
+ be treated as though they were included in this License, to the extent
624
+ that they are valid under applicable law. If additional permissions
625
+ apply only to part of the Program, that part may be used separately
626
+ under those permissions, but the entire Program remains governed by
627
+ this License without regard to the additional permissions.
628
+
629
+ When you convey a copy of a covered work, you may at your option
630
+ remove any additional permissions from that copy, or from any part of
631
+ it. (Additional permissions may be written to require their own
632
+ removal in certain cases when you modify the work.) You may place
633
+ additional permissions on material, added by you to a covered work,
634
+ for which you have or can give appropriate copyright permission.
635
+
636
+ Notwithstanding any other provision of this License, for material you
637
+ add to a covered work, you may (if authorized by the copyright holders of
638
+ that material) supplement the terms of this License with terms:
639
+
640
+ a) Disclaiming warranty or limiting liability differently from the
641
+ terms of sections 15 and 16 of this License; or
642
+
643
+ b) Requiring preservation of specified reasonable legal notices or
644
+ author attributions in that material or in the Appropriate Legal
645
+ Notices displayed by works containing it; or
646
+
647
+ c) Prohibiting misrepresentation of the origin of that material, or
648
+ requiring that modified versions of such material be marked in
649
+ reasonable ways as different from the original version; or
650
+
651
+ d) Limiting the use for publicity purposes of names of licensors or
652
+ authors of the material; or
653
+
654
+ e) Declining to grant rights under trademark law for use of some
655
+ trade names, trademarks, or service marks; or
656
+
657
+ f) Requiring indemnification of licensors and authors of that
658
+ material by anyone who conveys the material (or modified versions of
659
+ it) with contractual assumptions of liability to the recipient, for
660
+ any liability that these contractual assumptions directly impose on
661
+ those licensors and authors.
662
+
663
+ All other non-permissive additional terms are considered "further
664
+ restrictions" within the meaning of section 10. If the Program as you
665
+ received it, or any part of it, contains a notice stating that it is
666
+ governed by this License along with a term that is a further
667
+ restriction, you may remove that term. If a license document contains
668
+ a further restriction but permits relicensing or conveying under this
669
+ License, you may add to a covered work material governed by the terms
670
+ of that license document, provided that the further restriction does
671
+ not survive such relicensing or conveying.
672
+
673
+ If you add terms to a covered work in accord with this section, you
674
+ must place, in the relevant source files, a statement of the
675
+ additional terms that apply to those files, or a notice indicating
676
+ where to find the applicable terms.
677
+
678
+ Additional terms, permissive or non-permissive, may be stated in the
679
+ form of a separately written license, or stated as exceptions;
680
+ the above requirements apply either way.
681
+
682
+ 8. Termination.
683
+
684
+ You may not propagate or modify a covered work except as expressly
685
+ provided under this License. Any attempt otherwise to propagate or
686
+ modify it is void, and will automatically terminate your rights under
687
+ this License (including any patent licenses granted under the third
688
+ paragraph of section 11).
689
+
690
+ However, if you cease all violation of this License, then your
691
+ license from a particular copyright holder is reinstated (a)
692
+ provisionally, unless and until the copyright holder explicitly and
693
+ finally terminates your license, and (b) permanently, if the copyright
694
+ holder fails to notify you of the violation by some reasonable means
695
+ prior to 60 days after the cessation.
696
+
697
+ Moreover, your license from a particular copyright holder is
698
+ reinstated permanently if the copyright holder notifies you of the
699
+ violation by some reasonable means, this is the first time you have
700
+ received notice of violation of this License (for any work) from that
701
+ copyright holder, and you cure the violation prior to 30 days after
702
+ your receipt of the notice.
703
+
704
+ Termination of your rights under this section does not terminate the
705
+ licenses of parties who have received copies or rights from you under
706
+ this License. If your rights have been terminated and not permanently
707
+ reinstated, you do not qualify to receive new licenses for the same
708
+ material under section 10.
709
+
710
+ 9. Acceptance Not Required for Having Copies.
711
+
712
+ You are not required to accept this License in order to receive or
713
+ run a copy of the Program. Ancillary propagation of a covered work
714
+ occurring solely as a consequence of using peer-to-peer transmission
715
+ to receive a copy likewise does not require acceptance. However,
716
+ nothing other than this License grants you permission to propagate or
717
+ modify any covered work. These actions infringe copyright if you do
718
+ not accept this License. Therefore, by modifying or propagating a
719
+ covered work, you indicate your acceptance of this License to do so.
720
+
721
+ 10. Automatic Licensing of Downstream Recipients.
722
+
723
+ Each time you convey a covered work, the recipient automatically
724
+ receives a license from the original licensors, to run, modify and
725
+ propagate that work, subject to this License. You are not responsible
726
+ for enforcing compliance by third parties with this License.
727
+
728
+ An "entity transaction" is a transaction transferring control of an
729
+ organization, or substantially all assets of one, or subdividing an
730
+ organization, or merging organizations. If propagation of a covered
731
+ work results from an entity transaction, each party to that
732
+ transaction who receives a copy of the work also receives whatever
733
+ licenses to the work the party's predecessor in interest had or could
734
+ give under the previous paragraph, plus a right to possession of the
735
+ Corresponding Source of the work from the predecessor in interest, if
736
+ the predecessor has it or can get it with reasonable efforts.
737
+
738
+ You may not impose any further restrictions on the exercise of the
739
+ rights granted or affirmed under this License. For example, you may
740
+ not impose a license fee, royalty, or other charge for exercise of
741
+ rights granted under this License, and you may not initiate litigation
742
+ (including a cross-claim or counterclaim in a lawsuit) alleging that
743
+ any patent claim is infringed by making, using, selling, offering for
744
+ sale, or importing the Program or any portion of it.
745
+
746
+ 11. Patents.
747
+
748
+ A "contributor" is a copyright holder who authorizes use under this
749
+ License of the Program or a work on which the Program is based. The
750
+ work thus licensed is called the contributor's "contributor version".
751
+
752
+ A contributor's "essential patent claims" are all patent claims
753
+ owned or controlled by the contributor, whether already acquired or
754
+ hereafter acquired, that would be infringed by some manner, permitted
755
+ by this License, of making, using, or selling its contributor version,
756
+ but do not include claims that would be infringed only as a
757
+ consequence of further modification of the contributor version. For
758
+ purposes of this definition, "control" includes the right to grant
759
+ patent sublicenses in a manner consistent with the requirements of
760
+ this License.
761
+
762
+ Each contributor grants you a non-exclusive, worldwide, royalty-free
763
+ patent license under the contributor's essential patent claims, to
764
+ make, use, sell, offer for sale, import and otherwise run, modify and
765
+ propagate the contents of its contributor version.
766
+
767
+ In the following three paragraphs, a "patent license" is any express
768
+ agreement or commitment, however denominated, not to enforce a patent
769
+ (such as an express permission to practice a patent or covenant not to
770
+ sue for patent infringement). To "grant" such a patent license to a
771
+ party means to make such an agreement or commitment not to enforce a
772
+ patent against the party.
773
+
774
+ If you convey a covered work, knowingly relying on a patent license,
775
+ and the Corresponding Source of the work is not available for anyone
776
+ to copy, free of charge and under the terms of this License, through a
777
+ publicly available network server or other readily accessible means,
778
+ then you must either (1) cause the Corresponding Source to be so
779
+ available, or (2) arrange to deprive yourself of the benefit of the
780
+ patent license for this particular work, or (3) arrange, in a manner
781
+ consistent with the requirements of this License, to extend the patent
782
+ license to downstream recipients. "Knowingly relying" means you have
783
+ actual knowledge that, but for the patent license, your conveying the
784
+ covered work in a country, or your recipient's use of the covered work
785
+ in a country, would infringe one or more identifiable patents in that
786
+ country that you have reason to believe are valid.
787
+
788
+ If, pursuant to or in connection with a single transaction or
789
+ arrangement, you convey, or propagate by procuring conveyance of, a
790
+ covered work, and grant a patent license to some of the parties
791
+ receiving the covered work authorizing them to use, propagate, modify
792
+ or convey a specific copy of the covered work, then the patent license
793
+ you grant is automatically extended to all recipients of the covered
794
+ work and works based on it.
795
+
796
+ A patent license is "discriminatory" if it does not include within
797
+ the scope of its coverage, prohibits the exercise of, or is
798
+ conditioned on the non-exercise of one or more of the rights that are
799
+ specifically granted under this License. You may not convey a covered
800
+ work if you are a party to an arrangement with a third party that is
801
+ in the business of distributing software, under which you make payment
802
+ to the third party based on the extent of your activity of conveying
803
+ the work, and under which the third party grants, to any of the
804
+ parties who would receive the covered work from you, a discriminatory
805
+ patent license (a) in connection with copies of the covered work
806
+ conveyed by you (or copies made from those copies), or (b) primarily
807
+ for and in connection with specific products or compilations that
808
+ contain the covered work, unless you entered into that arrangement,
809
+ or that patent license was granted, prior to 28 March 2007.
810
+
811
+ Nothing in this License shall be construed as excluding or limiting
812
+ any implied license or other defenses to infringement that may
813
+ otherwise be available to you under applicable patent law.
814
+
815
+ 12. No Surrender of Others' Freedom.
816
+
817
+ If conditions are imposed on you (whether by court order, agreement or
818
+ otherwise) that contradict the conditions of this License, they do not
819
+ excuse you from the conditions of this License. If you cannot convey a
820
+ covered work so as to satisfy simultaneously your obligations under this
821
+ License and any other pertinent obligations, then as a consequence you may
822
+ not convey it at all. For example, if you agree to terms that obligate you
823
+ to collect a royalty for further conveying from those to whom you convey
824
+ the Program, the only way you could satisfy both those terms and this
825
+ License would be to refrain entirely from conveying the Program.
826
+
827
+ 13. Use with the GNU Affero General Public License.
828
+
829
+ Notwithstanding any other provision of this License, you have
830
+ permission to link or combine any covered work with a work licensed
831
+ under version 3 of the GNU Affero General Public License into a single
832
+ combined work, and to convey the resulting work. The terms of this
833
+ License will continue to apply to the part which is the covered work,
834
+ but the special requirements of the GNU Affero General Public License,
835
+ section 13, concerning interaction through a network will apply to the
836
+ combination as such.
837
+
838
+ 14. Revised Versions of this License.
839
+
840
+ The Free Software Foundation may publish revised and/or new versions of
841
+ the GNU General Public License from time to time. Such new versions will
842
+ be similar in spirit to the present version, but may differ in detail to
843
+ address new problems or concerns.
844
+
845
+ Each version is given a distinguishing version number. If the
846
+ Program specifies that a certain numbered version of the GNU General
847
+ Public License "or any later version" applies to it, you have the
848
+ option of following the terms and conditions either of that numbered
849
+ version or of any later version published by the Free Software
850
+ Foundation. If the Program does not specify a version number of the
851
+ GNU General Public License, you may choose any version ever published
852
+ by the Free Software Foundation.
853
+
854
+ If the Program specifies that a proxy can decide which future
855
+ versions of the GNU General Public License can be used, that proxy's
856
+ public statement of acceptance of a version permanently authorizes you
857
+ to choose that version for the Program.
858
+
859
+ Later license versions may give you additional or different
860
+ permissions. However, no additional obligations are imposed on any
861
+ author or copyright holder as a result of your choosing to follow a
862
+ later version.
863
+
864
+ 15. Disclaimer of Warranty.
865
+
866
+ THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY
867
+ APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT
868
+ HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY
869
+ OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO,
870
+ THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
871
+ PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM
872
+ IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF
873
+ ALL NECESSARY SERVICING, REPAIR OR CORRECTION.
874
+
875
+ 16. Limitation of Liability.
876
+
877
+ IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
878
+ WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS
879
+ THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY
880
+ GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE
881
+ USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF
882
+ DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD
883
+ PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS),
884
+ EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF
885
+ SUCH DAMAGES.
886
+
887
+ 17. Interpretation of Sections 15 and 16.
888
+
889
+ If the disclaimer of warranty and limitation of liability provided
890
+ above cannot be given local legal effect according to their terms,
891
+ reviewing courts shall apply local law that most closely approximates
892
+ an absolute waiver of all civil liability in connection with the
893
+ Program, unless a warranty or assumption of liability accompanies a
894
+ copy of the Program in return for a fee.
895
+
896
+ END OF TERMS AND CONDITIONS
897
+
898
+ How to Apply These Terms to Your New Programs
899
+
900
+ If you develop a new program, and you want it to be of the greatest
901
+ possible use to the public, the best way to achieve this is to make it
902
+ free software which everyone can redistribute and change under these terms.
903
+
904
+ To do so, attach the following notices to the program. It is safest
905
+ to attach them to the start of each source file to most effectively
906
+ state the exclusion of warranty; and each file should have at least
907
+ the "copyright" line and a pointer to where the full notice is found.
908
+
909
+ <one line to give the program's name and a brief idea of what it does.>
910
+ Copyright (C) <year> <name of author>
911
+
912
+ This program is free software: you can redistribute it and/or modify
913
+ it under the terms of the GNU General Public License as published by
914
+ the Free Software Foundation, either version 3 of the License, or
915
+ (at your option) any later version.
916
+
917
+ This program is distributed in the hope that it will be useful,
918
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
919
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
920
+ GNU General Public License for more details.
921
+
922
+ You should have received a copy of the GNU General Public License
923
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
924
+
925
+ Also add information on how to contact you by electronic and paper mail.
926
+
927
+ If the program does terminal interaction, make it output a short
928
+ notice like this when it starts in an interactive mode:
929
+
930
+ <program> Copyright (C) <year> <name of author>
931
+ This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
932
+ This is free software, and you are welcome to redistribute it
933
+ under certain conditions; type `show c' for details.
934
+
935
+ The hypothetical commands `show w' and `show c' should show the appropriate
936
+ parts of the General Public License. Of course, your program's commands
937
+ might be different; for a GUI interface, you would use an "about box".
938
+
939
+ You should also get your employer (if you work as a programmer) or school,
940
+ if any, to sign a "copyright disclaimer" for the program, if necessary.
941
+ For more information on this, and how to apply and follow the GNU GPL, see
942
+ <http://www.gnu.org/licenses/>.
943
+
944
+ The GNU General Public License does not permit incorporating your program
945
+ into proprietary programs. If your program is a subroutine library, you
946
+ may consider it more useful to permit linking proprietary applications with
947
+ the library. If this is what you want to do, use the GNU Lesser General
948
+ Public License instead of this License. But first, please read
949
+ <http://www.gnu.org/philosophy/why-not-lgpl.html>.
950
+
951
+ Name: libquadmath
952
+ Files: numpy.libs/libquadmath*.so
953
+ Description: dynamically linked to files compiled with gcc
954
+ Availability: https://gcc.gnu.org/git/?p=gcc.git;a=tree;f=libquadmath
955
+ License: LGPL-2.1-or-later
956
+
957
+ GCC Quad-Precision Math Library
958
+ Copyright (C) 2010-2019 Free Software Foundation, Inc.
959
+ Written by Francois-Xavier Coudert <fxcoudert@gcc.gnu.org>
960
+
961
+ This file is part of the libquadmath library.
962
+ Libquadmath is free software; you can redistribute it and/or
963
+ modify it under the terms of the GNU Library General Public
964
+ License as published by the Free Software Foundation; either
965
+ version 2.1 of the License, or (at your option) any later version.
966
+
967
+ Libquadmath is distributed in the hope that it will be useful,
968
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
969
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
970
+ Lesser General Public License for more details.
971
+ https://www.gnu.org/licenses/old-licenses/lgpl-2.1.html
LTA_openwebtext_dualt/mini_owt_logdirichlet/.venv_qwen35_uv/lib/python3.12/site-packages/numpy-1.26.4.dist-info/METADATA ADDED
@@ -0,0 +1,1092 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Metadata-Version: 2.1
2
+ Name: numpy
3
+ Version: 1.26.4
4
+ Summary: Fundamental package for array computing in Python
5
+ Home-page: https://numpy.org
6
+ Author: Travis E. Oliphant et al.
7
+ Maintainer-Email: NumPy Developers <numpy-discussion@python.org>
8
+ License: Copyright (c) 2005-2023, NumPy Developers.
9
+ All rights reserved.
10
+
11
+ Redistribution and use in source and binary forms, with or without
12
+ modification, are permitted provided that the following conditions are
13
+ met:
14
+
15
+ * Redistributions of source code must retain the above copyright
16
+ notice, this list of conditions and the following disclaimer.
17
+
18
+ * Redistributions in binary form must reproduce the above
19
+ copyright notice, this list of conditions and the following
20
+ disclaimer in the documentation and/or other materials provided
21
+ with the distribution.
22
+
23
+ * Neither the name of the NumPy Developers nor the names of any
24
+ contributors may be used to endorse or promote products derived
25
+ from this software without specific prior written permission.
26
+
27
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
28
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
29
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
30
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
31
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
32
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
33
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
34
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
35
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
36
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
37
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
38
+
39
+ ----
40
+
41
+ The NumPy repository and source distributions bundle several libraries that are
42
+ compatibly licensed. We list these here.
43
+
44
+ Name: lapack-lite
45
+ Files: numpy/linalg/lapack_lite/*
46
+ License: BSD-3-Clause
47
+ For details, see numpy/linalg/lapack_lite/LICENSE.txt
48
+
49
+ Name: tempita
50
+ Files: tools/npy_tempita/*
51
+ License: MIT
52
+ For details, see tools/npy_tempita/license.txt
53
+
54
+ Name: dragon4
55
+ Files: numpy/core/src/multiarray/dragon4.c
56
+ License: MIT
57
+ For license text, see numpy/core/src/multiarray/dragon4.c
58
+
59
+ Name: libdivide
60
+ Files: numpy/core/include/numpy/libdivide/*
61
+ License: Zlib
62
+ For license text, see numpy/core/include/numpy/libdivide/LICENSE.txt
63
+
64
+
65
+ Note that the following files are vendored in the repository and sdist but not
66
+ installed in built numpy packages:
67
+
68
+ Name: Meson
69
+ Files: vendored-meson/meson/*
70
+ License: Apache 2.0
71
+ For license text, see vendored-meson/meson/COPYING
72
+
73
+ Name: spin
74
+ Files: .spin/cmds.py
75
+ License: BSD-3
76
+ For license text, see .spin/LICENSE
77
+
78
+ ----
79
+
80
+ This binary distribution of NumPy also bundles the following software:
81
+
82
+
83
+ Name: OpenBLAS
84
+ Files: numpy.libs/libopenblas*.so
85
+ Description: bundled as a dynamically linked library
86
+ Availability: https://github.com/OpenMathLib/OpenBLAS/
87
+ License: BSD-3-Clause
88
+ Copyright (c) 2011-2014, The OpenBLAS Project
89
+ All rights reserved.
90
+
91
+ Redistribution and use in source and binary forms, with or without
92
+ modification, are permitted provided that the following conditions are
93
+ met:
94
+
95
+ 1. Redistributions of source code must retain the above copyright
96
+ notice, this list of conditions and the following disclaimer.
97
+
98
+ 2. Redistributions in binary form must reproduce the above copyright
99
+ notice, this list of conditions and the following disclaimer in
100
+ the documentation and/or other materials provided with the
101
+ distribution.
102
+ 3. Neither the name of the OpenBLAS project nor the names of
103
+ its contributors may be used to endorse or promote products
104
+ derived from this software without specific prior written
105
+ permission.
106
+
107
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
108
+ AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
109
+ IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
110
+ ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
111
+ LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
112
+ DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
113
+ SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
114
+ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
115
+ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
116
+ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
117
+
118
+
119
+ Name: LAPACK
120
+ Files: numpy.libs/libopenblas*.so
121
+ Description: bundled in OpenBLAS
122
+ Availability: https://github.com/OpenMathLib/OpenBLAS/
123
+ License: BSD-3-Clause-Attribution
124
+ Copyright (c) 1992-2013 The University of Tennessee and The University
125
+ of Tennessee Research Foundation. All rights
126
+ reserved.
127
+ Copyright (c) 2000-2013 The University of California Berkeley. All
128
+ rights reserved.
129
+ Copyright (c) 2006-2013 The University of Colorado Denver. All rights
130
+ reserved.
131
+
132
+ $COPYRIGHT$
133
+
134
+ Additional copyrights may follow
135
+
136
+ $HEADER$
137
+
138
+ Redistribution and use in source and binary forms, with or without
139
+ modification, are permitted provided that the following conditions are
140
+ met:
141
+
142
+ - Redistributions of source code must retain the above copyright
143
+ notice, this list of conditions and the following disclaimer.
144
+
145
+ - Redistributions in binary form must reproduce the above copyright
146
+ notice, this list of conditions and the following disclaimer listed
147
+ in this license in the documentation and/or other materials
148
+ provided with the distribution.
149
+
150
+ - Neither the name of the copyright holders nor the names of its
151
+ contributors may be used to endorse or promote products derived from
152
+ this software without specific prior written permission.
153
+
154
+ The copyright holders provide no reassurances that the source code
155
+ provided does not infringe any patent, copyright, or any other
156
+ intellectual property rights of third parties. The copyright holders
157
+ disclaim any liability to any recipient for claims brought against
158
+ recipient by any third party for infringement of that parties
159
+ intellectual property rights.
160
+
161
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
162
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
163
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
164
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
165
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
166
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
167
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
168
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
169
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
170
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
171
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
172
+
173
+
174
+ Name: GCC runtime library
175
+ Files: numpy.libs/libgfortran*.so
176
+ Description: dynamically linked to files compiled with gcc
177
+ Availability: https://gcc.gnu.org/git/?p=gcc.git;a=tree;f=libgfortran
178
+ License: GPL-3.0-with-GCC-exception
179
+ Copyright (C) 2002-2017 Free Software Foundation, Inc.
180
+
181
+ Libgfortran is free software; you can redistribute it and/or modify
182
+ it under the terms of the GNU General Public License as published by
183
+ the Free Software Foundation; either version 3, or (at your option)
184
+ any later version.
185
+
186
+ Libgfortran is distributed in the hope that it will be useful,
187
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
188
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
189
+ GNU General Public License for more details.
190
+
191
+ Under Section 7 of GPL version 3, you are granted additional
192
+ permissions described in the GCC Runtime Library Exception, version
193
+ 3.1, as published by the Free Software Foundation.
194
+
195
+ You should have received a copy of the GNU General Public License and
196
+ a copy of the GCC Runtime Library Exception along with this program;
197
+ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
198
+ <http://www.gnu.org/licenses/>.
199
+
200
+ ----
201
+
202
+ Full text of license texts referred to above follows (that they are
203
+ listed below does not necessarily imply the conditions apply to the
204
+ present binary release):
205
+
206
+ ----
207
+
208
+ GCC RUNTIME LIBRARY EXCEPTION
209
+
210
+ Version 3.1, 31 March 2009
211
+
212
+ Copyright (C) 2009 Free Software Foundation, Inc. <http://fsf.org/>
213
+
214
+ Everyone is permitted to copy and distribute verbatim copies of this
215
+ license document, but changing it is not allowed.
216
+
217
+ This GCC Runtime Library Exception ("Exception") is an additional
218
+ permission under section 7 of the GNU General Public License, version
219
+ 3 ("GPLv3"). It applies to a given file (the "Runtime Library") that
220
+ bears a notice placed by the copyright holder of the file stating that
221
+ the file is governed by GPLv3 along with this Exception.
222
+
223
+ When you use GCC to compile a program, GCC may combine portions of
224
+ certain GCC header files and runtime libraries with the compiled
225
+ program. The purpose of this Exception is to allow compilation of
226
+ non-GPL (including proprietary) programs to use, in this way, the
227
+ header files and runtime libraries covered by this Exception.
228
+
229
+ 0. Definitions.
230
+
231
+ A file is an "Independent Module" if it either requires the Runtime
232
+ Library for execution after a Compilation Process, or makes use of an
233
+ interface provided by the Runtime Library, but is not otherwise based
234
+ on the Runtime Library.
235
+
236
+ "GCC" means a version of the GNU Compiler Collection, with or without
237
+ modifications, governed by version 3 (or a specified later version) of
238
+ the GNU General Public License (GPL) with the option of using any
239
+ subsequent versions published by the FSF.
240
+
241
+ "GPL-compatible Software" is software whose conditions of propagation,
242
+ modification and use would permit combination with GCC in accord with
243
+ the license of GCC.
244
+
245
+ "Target Code" refers to output from any compiler for a real or virtual
246
+ target processor architecture, in executable form or suitable for
247
+ input to an assembler, loader, linker and/or execution
248
+ phase. Notwithstanding that, Target Code does not include data in any
249
+ format that is used as a compiler intermediate representation, or used
250
+ for producing a compiler intermediate representation.
251
+
252
+ The "Compilation Process" transforms code entirely represented in
253
+ non-intermediate languages designed for human-written code, and/or in
254
+ Java Virtual Machine byte code, into Target Code. Thus, for example,
255
+ use of source code generators and preprocessors need not be considered
256
+ part of the Compilation Process, since the Compilation Process can be
257
+ understood as starting with the output of the generators or
258
+ preprocessors.
259
+
260
+ A Compilation Process is "Eligible" if it is done using GCC, alone or
261
+ with other GPL-compatible software, or if it is done without using any
262
+ work based on GCC. For example, using non-GPL-compatible Software to
263
+ optimize any GCC intermediate representations would not qualify as an
264
+ Eligible Compilation Process.
265
+
266
+ 1. Grant of Additional Permission.
267
+
268
+ You have permission to propagate a work of Target Code formed by
269
+ combining the Runtime Library with Independent Modules, even if such
270
+ propagation would otherwise violate the terms of GPLv3, provided that
271
+ all Target Code was generated by Eligible Compilation Processes. You
272
+ may then convey such a combination under terms of your choice,
273
+ consistent with the licensing of the Independent Modules.
274
+
275
+ 2. No Weakening of GCC Copyleft.
276
+
277
+ The availability of this Exception does not imply any general
278
+ presumption that third-party software is unaffected by the copyleft
279
+ requirements of the license of GCC.
280
+
281
+ ----
282
+
283
+ GNU GENERAL PUBLIC LICENSE
284
+ Version 3, 29 June 2007
285
+
286
+ Copyright (C) 2007 Free Software Foundation, Inc. <http://fsf.org/>
287
+ Everyone is permitted to copy and distribute verbatim copies
288
+ of this license document, but changing it is not allowed.
289
+
290
+ Preamble
291
+
292
+ The GNU General Public License is a free, copyleft license for
293
+ software and other kinds of works.
294
+
295
+ The licenses for most software and other practical works are designed
296
+ to take away your freedom to share and change the works. By contrast,
297
+ the GNU General Public License is intended to guarantee your freedom to
298
+ share and change all versions of a program--to make sure it remains free
299
+ software for all its users. We, the Free Software Foundation, use the
300
+ GNU General Public License for most of our software; it applies also to
301
+ any other work released this way by its authors. You can apply it to
302
+ your programs, too.
303
+
304
+ When we speak of free software, we are referring to freedom, not
305
+ price. Our General Public Licenses are designed to make sure that you
306
+ have the freedom to distribute copies of free software (and charge for
307
+ them if you wish), that you receive source code or can get it if you
308
+ want it, that you can change the software or use pieces of it in new
309
+ free programs, and that you know you can do these things.
310
+
311
+ To protect your rights, we need to prevent others from denying you
312
+ these rights or asking you to surrender the rights. Therefore, you have
313
+ certain responsibilities if you distribute copies of the software, or if
314
+ you modify it: responsibilities to respect the freedom of others.
315
+
316
+ For example, if you distribute copies of such a program, whether
317
+ gratis or for a fee, you must pass on to the recipients the same
318
+ freedoms that you received. You must make sure that they, too, receive
319
+ or can get the source code. And you must show them these terms so they
320
+ know their rights.
321
+
322
+ Developers that use the GNU GPL protect your rights with two steps:
323
+ (1) assert copyright on the software, and (2) offer you this License
324
+ giving you legal permission to copy, distribute and/or modify it.
325
+
326
+ For the developers' and authors' protection, the GPL clearly explains
327
+ that there is no warranty for this free software. For both users' and
328
+ authors' sake, the GPL requires that modified versions be marked as
329
+ changed, so that their problems will not be attributed erroneously to
330
+ authors of previous versions.
331
+
332
+ Some devices are designed to deny users access to install or run
333
+ modified versions of the software inside them, although the manufacturer
334
+ can do so. This is fundamentally incompatible with the aim of
335
+ protecting users' freedom to change the software. The systematic
336
+ pattern of such abuse occurs in the area of products for individuals to
337
+ use, which is precisely where it is most unacceptable. Therefore, we
338
+ have designed this version of the GPL to prohibit the practice for those
339
+ products. If such problems arise substantially in other domains, we
340
+ stand ready to extend this provision to those domains in future versions
341
+ of the GPL, as needed to protect the freedom of users.
342
+
343
+ Finally, every program is threatened constantly by software patents.
344
+ States should not allow patents to restrict development and use of
345
+ software on general-purpose computers, but in those that do, we wish to
346
+ avoid the special danger that patents applied to a free program could
347
+ make it effectively proprietary. To prevent this, the GPL assures that
348
+ patents cannot be used to render the program non-free.
349
+
350
+ The precise terms and conditions for copying, distribution and
351
+ modification follow.
352
+
353
+ TERMS AND CONDITIONS
354
+
355
+ 0. Definitions.
356
+
357
+ "This License" refers to version 3 of the GNU General Public License.
358
+
359
+ "Copyright" also means copyright-like laws that apply to other kinds of
360
+ works, such as semiconductor masks.
361
+
362
+ "The Program" refers to any copyrightable work licensed under this
363
+ License. Each licensee is addressed as "you". "Licensees" and
364
+ "recipients" may be individuals or organizations.
365
+
366
+ To "modify" a work means to copy from or adapt all or part of the work
367
+ in a fashion requiring copyright permission, other than the making of an
368
+ exact copy. The resulting work is called a "modified version" of the
369
+ earlier work or a work "based on" the earlier work.
370
+
371
+ A "covered work" means either the unmodified Program or a work based
372
+ on the Program.
373
+
374
+ To "propagate" a work means to do anything with it that, without
375
+ permission, would make you directly or secondarily liable for
376
+ infringement under applicable copyright law, except executing it on a
377
+ computer or modifying a private copy. Propagation includes copying,
378
+ distribution (with or without modification), making available to the
379
+ public, and in some countries other activities as well.
380
+
381
+ To "convey" a work means any kind of propagation that enables other
382
+ parties to make or receive copies. Mere interaction with a user through
383
+ a computer network, with no transfer of a copy, is not conveying.
384
+
385
+ An interactive user interface displays "Appropriate Legal Notices"
386
+ to the extent that it includes a convenient and prominently visible
387
+ feature that (1) displays an appropriate copyright notice, and (2)
388
+ tells the user that there is no warranty for the work (except to the
389
+ extent that warranties are provided), that licensees may convey the
390
+ work under this License, and how to view a copy of this License. If
391
+ the interface presents a list of user commands or options, such as a
392
+ menu, a prominent item in the list meets this criterion.
393
+
394
+ 1. Source Code.
395
+
396
+ The "source code" for a work means the preferred form of the work
397
+ for making modifications to it. "Object code" means any non-source
398
+ form of a work.
399
+
400
+ A "Standard Interface" means an interface that either is an official
401
+ standard defined by a recognized standards body, or, in the case of
402
+ interfaces specified for a particular programming language, one that
403
+ is widely used among developers working in that language.
404
+
405
+ The "System Libraries" of an executable work include anything, other
406
+ than the work as a whole, that (a) is included in the normal form of
407
+ packaging a Major Component, but which is not part of that Major
408
+ Component, and (b) serves only to enable use of the work with that
409
+ Major Component, or to implement a Standard Interface for which an
410
+ implementation is available to the public in source code form. A
411
+ "Major Component", in this context, means a major essential component
412
+ (kernel, window system, and so on) of the specific operating system
413
+ (if any) on which the executable work runs, or a compiler used to
414
+ produce the work, or an object code interpreter used to run it.
415
+
416
+ The "Corresponding Source" for a work in object code form means all
417
+ the source code needed to generate, install, and (for an executable
418
+ work) run the object code and to modify the work, including scripts to
419
+ control those activities. However, it does not include the work's
420
+ System Libraries, or general-purpose tools or generally available free
421
+ programs which are used unmodified in performing those activities but
422
+ which are not part of the work. For example, Corresponding Source
423
+ includes interface definition files associated with source files for
424
+ the work, and the source code for shared libraries and dynamically
425
+ linked subprograms that the work is specifically designed to require,
426
+ such as by intimate data communication or control flow between those
427
+ subprograms and other parts of the work.
428
+
429
+ The Corresponding Source need not include anything that users
430
+ can regenerate automatically from other parts of the Corresponding
431
+ Source.
432
+
433
+ The Corresponding Source for a work in source code form is that
434
+ same work.
435
+
436
+ 2. Basic Permissions.
437
+
438
+ All rights granted under this License are granted for the term of
439
+ copyright on the Program, and are irrevocable provided the stated
440
+ conditions are met. This License explicitly affirms your unlimited
441
+ permission to run the unmodified Program. The output from running a
442
+ covered work is covered by this License only if the output, given its
443
+ content, constitutes a covered work. This License acknowledges your
444
+ rights of fair use or other equivalent, as provided by copyright law.
445
+
446
+ You may make, run and propagate covered works that you do not
447
+ convey, without conditions so long as your license otherwise remains
448
+ in force. You may convey covered works to others for the sole purpose
449
+ of having them make modifications exclusively for you, or provide you
450
+ with facilities for running those works, provided that you comply with
451
+ the terms of this License in conveying all material for which you do
452
+ not control copyright. Those thus making or running the covered works
453
+ for you must do so exclusively on your behalf, under your direction
454
+ and control, on terms that prohibit them from making any copies of
455
+ your copyrighted material outside their relationship with you.
456
+
457
+ Conveying under any other circumstances is permitted solely under
458
+ the conditions stated below. Sublicensing is not allowed; section 10
459
+ makes it unnecessary.
460
+
461
+ 3. Protecting Users' Legal Rights From Anti-Circumvention Law.
462
+
463
+ No covered work shall be deemed part of an effective technological
464
+ measure under any applicable law fulfilling obligations under article
465
+ 11 of the WIPO copyright treaty adopted on 20 December 1996, or
466
+ similar laws prohibiting or restricting circumvention of such
467
+ measures.
468
+
469
+ When you convey a covered work, you waive any legal power to forbid
470
+ circumvention of technological measures to the extent such circumvention
471
+ is effected by exercising rights under this License with respect to
472
+ the covered work, and you disclaim any intention to limit operation or
473
+ modification of the work as a means of enforcing, against the work's
474
+ users, your or third parties' legal rights to forbid circumvention of
475
+ technological measures.
476
+
477
+ 4. Conveying Verbatim Copies.
478
+
479
+ You may convey verbatim copies of the Program's source code as you
480
+ receive it, in any medium, provided that you conspicuously and
481
+ appropriately publish on each copy an appropriate copyright notice;
482
+ keep intact all notices stating that this License and any
483
+ non-permissive terms added in accord with section 7 apply to the code;
484
+ keep intact all notices of the absence of any warranty; and give all
485
+ recipients a copy of this License along with the Program.
486
+
487
+ You may charge any price or no price for each copy that you convey,
488
+ and you may offer support or warranty protection for a fee.
489
+
490
+ 5. Conveying Modified Source Versions.
491
+
492
+ You may convey a work based on the Program, or the modifications to
493
+ produce it from the Program, in the form of source code under the
494
+ terms of section 4, provided that you also meet all of these conditions:
495
+
496
+ a) The work must carry prominent notices stating that you modified
497
+ it, and giving a relevant date.
498
+
499
+ b) The work must carry prominent notices stating that it is
500
+ released under this License and any conditions added under section
501
+ 7. This requirement modifies the requirement in section 4 to
502
+ "keep intact all notices".
503
+
504
+ c) You must license the entire work, as a whole, under this
505
+ License to anyone who comes into possession of a copy. This
506
+ License will therefore apply, along with any applicable section 7
507
+ additional terms, to the whole of the work, and all its parts,
508
+ regardless of how they are packaged. This License gives no
509
+ permission to license the work in any other way, but it does not
510
+ invalidate such permission if you have separately received it.
511
+
512
+ d) If the work has interactive user interfaces, each must display
513
+ Appropriate Legal Notices; however, if the Program has interactive
514
+ interfaces that do not display Appropriate Legal Notices, your
515
+ work need not make them do so.
516
+
517
+ A compilation of a covered work with other separate and independent
518
+ works, which are not by their nature extensions of the covered work,
519
+ and which are not combined with it such as to form a larger program,
520
+ in or on a volume of a storage or distribution medium, is called an
521
+ "aggregate" if the compilation and its resulting copyright are not
522
+ used to limit the access or legal rights of the compilation's users
523
+ beyond what the individual works permit. Inclusion of a covered work
524
+ in an aggregate does not cause this License to apply to the other
525
+ parts of the aggregate.
526
+
527
+ 6. Conveying Non-Source Forms.
528
+
529
+ You may convey a covered work in object code form under the terms
530
+ of sections 4 and 5, provided that you also convey the
531
+ machine-readable Corresponding Source under the terms of this License,
532
+ in one of these ways:
533
+
534
+ a) Convey the object code in, or embodied in, a physical product
535
+ (including a physical distribution medium), accompanied by the
536
+ Corresponding Source fixed on a durable physical medium
537
+ customarily used for software interchange.
538
+
539
+ b) Convey the object code in, or embodied in, a physical product
540
+ (including a physical distribution medium), accompanied by a
541
+ written offer, valid for at least three years and valid for as
542
+ long as you offer spare parts or customer support for that product
543
+ model, to give anyone who possesses the object code either (1) a
544
+ copy of the Corresponding Source for all the software in the
545
+ product that is covered by this License, on a durable physical
546
+ medium customarily used for software interchange, for a price no
547
+ more than your reasonable cost of physically performing this
548
+ conveying of source, or (2) access to copy the
549
+ Corresponding Source from a network server at no charge.
550
+
551
+ c) Convey individual copies of the object code with a copy of the
552
+ written offer to provide the Corresponding Source. This
553
+ alternative is allowed only occasionally and noncommercially, and
554
+ only if you received the object code with such an offer, in accord
555
+ with subsection 6b.
556
+
557
+ d) Convey the object code by offering access from a designated
558
+ place (gratis or for a charge), and offer equivalent access to the
559
+ Corresponding Source in the same way through the same place at no
560
+ further charge. You need not require recipients to copy the
561
+ Corresponding Source along with the object code. If the place to
562
+ copy the object code is a network server, the Corresponding Source
563
+ may be on a different server (operated by you or a third party)
564
+ that supports equivalent copying facilities, provided you maintain
565
+ clear directions next to the object code saying where to find the
566
+ Corresponding Source. Regardless of what server hosts the
567
+ Corresponding Source, you remain obligated to ensure that it is
568
+ available for as long as needed to satisfy these requirements.
569
+
570
+ e) Convey the object code using peer-to-peer transmission, provided
571
+ you inform other peers where the object code and Corresponding
572
+ Source of the work are being offered to the general public at no
573
+ charge under subsection 6d.
574
+
575
+ A separable portion of the object code, whose source code is excluded
576
+ from the Corresponding Source as a System Library, need not be
577
+ included in conveying the object code work.
578
+
579
+ A "User Product" is either (1) a "consumer product", which means any
580
+ tangible personal property which is normally used for personal, family,
581
+ or household purposes, or (2) anything designed or sold for incorporation
582
+ into a dwelling. In determining whether a product is a consumer product,
583
+ doubtful cases shall be resolved in favor of coverage. For a particular
584
+ product received by a particular user, "normally used" refers to a
585
+ typical or common use of that class of product, regardless of the status
586
+ of the particular user or of the way in which the particular user
587
+ actually uses, or expects or is expected to use, the product. A product
588
+ is a consumer product regardless of whether the product has substantial
589
+ commercial, industrial or non-consumer uses, unless such uses represent
590
+ the only significant mode of use of the product.
591
+
592
+ "Installation Information" for a User Product means any methods,
593
+ procedures, authorization keys, or other information required to install
594
+ and execute modified versions of a covered work in that User Product from
595
+ a modified version of its Corresponding Source. The information must
596
+ suffice to ensure that the continued functioning of the modified object
597
+ code is in no case prevented or interfered with solely because
598
+ modification has been made.
599
+
600
+ If you convey an object code work under this section in, or with, or
601
+ specifically for use in, a User Product, and the conveying occurs as
602
+ part of a transaction in which the right of possession and use of the
603
+ User Product is transferred to the recipient in perpetuity or for a
604
+ fixed term (regardless of how the transaction is characterized), the
605
+ Corresponding Source conveyed under this section must be accompanied
606
+ by the Installation Information. But this requirement does not apply
607
+ if neither you nor any third party retains the ability to install
608
+ modified object code on the User Product (for example, the work has
609
+ been installed in ROM).
610
+
611
+ The requirement to provide Installation Information does not include a
612
+ requirement to continue to provide support service, warranty, or updates
613
+ for a work that has been modified or installed by the recipient, or for
614
+ the User Product in which it has been modified or installed. Access to a
615
+ network may be denied when the modification itself materially and
616
+ adversely affects the operation of the network or violates the rules and
617
+ protocols for communication across the network.
618
+
619
+ Corresponding Source conveyed, and Installation Information provided,
620
+ in accord with this section must be in a format that is publicly
621
+ documented (and with an implementation available to the public in
622
+ source code form), and must require no special password or key for
623
+ unpacking, reading or copying.
624
+
625
+ 7. Additional Terms.
626
+
627
+ "Additional permissions" are terms that supplement the terms of this
628
+ License by making exceptions from one or more of its conditions.
629
+ Additional permissions that are applicable to the entire Program shall
630
+ be treated as though they were included in this License, to the extent
631
+ that they are valid under applicable law. If additional permissions
632
+ apply only to part of the Program, that part may be used separately
633
+ under those permissions, but the entire Program remains governed by
634
+ this License without regard to the additional permissions.
635
+
636
+ When you convey a copy of a covered work, you may at your option
637
+ remove any additional permissions from that copy, or from any part of
638
+ it. (Additional permissions may be written to require their own
639
+ removal in certain cases when you modify the work.) You may place
640
+ additional permissions on material, added by you to a covered work,
641
+ for which you have or can give appropriate copyright permission.
642
+
643
+ Notwithstanding any other provision of this License, for material you
644
+ add to a covered work, you may (if authorized by the copyright holders of
645
+ that material) supplement the terms of this License with terms:
646
+
647
+ a) Disclaiming warranty or limiting liability differently from the
648
+ terms of sections 15 and 16 of this License; or
649
+
650
+ b) Requiring preservation of specified reasonable legal notices or
651
+ author attributions in that material or in the Appropriate Legal
652
+ Notices displayed by works containing it; or
653
+
654
+ c) Prohibiting misrepresentation of the origin of that material, or
655
+ requiring that modified versions of such material be marked in
656
+ reasonable ways as different from the original version; or
657
+
658
+ d) Limiting the use for publicity purposes of names of licensors or
659
+ authors of the material; or
660
+
661
+ e) Declining to grant rights under trademark law for use of some
662
+ trade names, trademarks, or service marks; or
663
+
664
+ f) Requiring indemnification of licensors and authors of that
665
+ material by anyone who conveys the material (or modified versions of
666
+ it) with contractual assumptions of liability to the recipient, for
667
+ any liability that these contractual assumptions directly impose on
668
+ those licensors and authors.
669
+
670
+ All other non-permissive additional terms are considered "further
671
+ restrictions" within the meaning of section 10. If the Program as you
672
+ received it, or any part of it, contains a notice stating that it is
673
+ governed by this License along with a term that is a further
674
+ restriction, you may remove that term. If a license document contains
675
+ a further restriction but permits relicensing or conveying under this
676
+ License, you may add to a covered work material governed by the terms
677
+ of that license document, provided that the further restriction does
678
+ not survive such relicensing or conveying.
679
+
680
+ If you add terms to a covered work in accord with this section, you
681
+ must place, in the relevant source files, a statement of the
682
+ additional terms that apply to those files, or a notice indicating
683
+ where to find the applicable terms.
684
+
685
+ Additional terms, permissive or non-permissive, may be stated in the
686
+ form of a separately written license, or stated as exceptions;
687
+ the above requirements apply either way.
688
+
689
+ 8. Termination.
690
+
691
+ You may not propagate or modify a covered work except as expressly
692
+ provided under this License. Any attempt otherwise to propagate or
693
+ modify it is void, and will automatically terminate your rights under
694
+ this License (including any patent licenses granted under the third
695
+ paragraph of section 11).
696
+
697
+ However, if you cease all violation of this License, then your
698
+ license from a particular copyright holder is reinstated (a)
699
+ provisionally, unless and until the copyright holder explicitly and
700
+ finally terminates your license, and (b) permanently, if the copyright
701
+ holder fails to notify you of the violation by some reasonable means
702
+ prior to 60 days after the cessation.
703
+
704
+ Moreover, your license from a particular copyright holder is
705
+ reinstated permanently if the copyright holder notifies you of the
706
+ violation by some reasonable means, this is the first time you have
707
+ received notice of violation of this License (for any work) from that
708
+ copyright holder, and you cure the violation prior to 30 days after
709
+ your receipt of the notice.
710
+
711
+ Termination of your rights under this section does not terminate the
712
+ licenses of parties who have received copies or rights from you under
713
+ this License. If your rights have been terminated and not permanently
714
+ reinstated, you do not qualify to receive new licenses for the same
715
+ material under section 10.
716
+
717
+ 9. Acceptance Not Required for Having Copies.
718
+
719
+ You are not required to accept this License in order to receive or
720
+ run a copy of the Program. Ancillary propagation of a covered work
721
+ occurring solely as a consequence of using peer-to-peer transmission
722
+ to receive a copy likewise does not require acceptance. However,
723
+ nothing other than this License grants you permission to propagate or
724
+ modify any covered work. These actions infringe copyright if you do
725
+ not accept this License. Therefore, by modifying or propagating a
726
+ covered work, you indicate your acceptance of this License to do so.
727
+
728
+ 10. Automatic Licensing of Downstream Recipients.
729
+
730
+ Each time you convey a covered work, the recipient automatically
731
+ receives a license from the original licensors, to run, modify and
732
+ propagate that work, subject to this License. You are not responsible
733
+ for enforcing compliance by third parties with this License.
734
+
735
+ An "entity transaction" is a transaction transferring control of an
736
+ organization, or substantially all assets of one, or subdividing an
737
+ organization, or merging organizations. If propagation of a covered
738
+ work results from an entity transaction, each party to that
739
+ transaction who receives a copy of the work also receives whatever
740
+ licenses to the work the party's predecessor in interest had or could
741
+ give under the previous paragraph, plus a right to possession of the
742
+ Corresponding Source of the work from the predecessor in interest, if
743
+ the predecessor has it or can get it with reasonable efforts.
744
+
745
+ You may not impose any further restrictions on the exercise of the
746
+ rights granted or affirmed under this License. For example, you may
747
+ not impose a license fee, royalty, or other charge for exercise of
748
+ rights granted under this License, and you may not initiate litigation
749
+ (including a cross-claim or counterclaim in a lawsuit) alleging that
750
+ any patent claim is infringed by making, using, selling, offering for
751
+ sale, or importing the Program or any portion of it.
752
+
753
+ 11. Patents.
754
+
755
+ A "contributor" is a copyright holder who authorizes use under this
756
+ License of the Program or a work on which the Program is based. The
757
+ work thus licensed is called the contributor's "contributor version".
758
+
759
+ A contributor's "essential patent claims" are all patent claims
760
+ owned or controlled by the contributor, whether already acquired or
761
+ hereafter acquired, that would be infringed by some manner, permitted
762
+ by this License, of making, using, or selling its contributor version,
763
+ but do not include claims that would be infringed only as a
764
+ consequence of further modification of the contributor version. For
765
+ purposes of this definition, "control" includes the right to grant
766
+ patent sublicenses in a manner consistent with the requirements of
767
+ this License.
768
+
769
+ Each contributor grants you a non-exclusive, worldwide, royalty-free
770
+ patent license under the contributor's essential patent claims, to
771
+ make, use, sell, offer for sale, import and otherwise run, modify and
772
+ propagate the contents of its contributor version.
773
+
774
+ In the following three paragraphs, a "patent license" is any express
775
+ agreement or commitment, however denominated, not to enforce a patent
776
+ (such as an express permission to practice a patent or covenant not to
777
+ sue for patent infringement). To "grant" such a patent license to a
778
+ party means to make such an agreement or commitment not to enforce a
779
+ patent against the party.
780
+
781
+ If you convey a covered work, knowingly relying on a patent license,
782
+ and the Corresponding Source of the work is not available for anyone
783
+ to copy, free of charge and under the terms of this License, through a
784
+ publicly available network server or other readily accessible means,
785
+ then you must either (1) cause the Corresponding Source to be so
786
+ available, or (2) arrange to deprive yourself of the benefit of the
787
+ patent license for this particular work, or (3) arrange, in a manner
788
+ consistent with the requirements of this License, to extend the patent
789
+ license to downstream recipients. "Knowingly relying" means you have
790
+ actual knowledge that, but for the patent license, your conveying the
791
+ covered work in a country, or your recipient's use of the covered work
792
+ in a country, would infringe one or more identifiable patents in that
793
+ country that you have reason to believe are valid.
794
+
795
+ If, pursuant to or in connection with a single transaction or
796
+ arrangement, you convey, or propagate by procuring conveyance of, a
797
+ covered work, and grant a patent license to some of the parties
798
+ receiving the covered work authorizing them to use, propagate, modify
799
+ or convey a specific copy of the covered work, then the patent license
800
+ you grant is automatically extended to all recipients of the covered
801
+ work and works based on it.
802
+
803
+ A patent license is "discriminatory" if it does not include within
804
+ the scope of its coverage, prohibits the exercise of, or is
805
+ conditioned on the non-exercise of one or more of the rights that are
806
+ specifically granted under this License. You may not convey a covered
807
+ work if you are a party to an arrangement with a third party that is
808
+ in the business of distributing software, under which you make payment
809
+ to the third party based on the extent of your activity of conveying
810
+ the work, and under which the third party grants, to any of the
811
+ parties who would receive the covered work from you, a discriminatory
812
+ patent license (a) in connection with copies of the covered work
813
+ conveyed by you (or copies made from those copies), or (b) primarily
814
+ for and in connection with specific products or compilations that
815
+ contain the covered work, unless you entered into that arrangement,
816
+ or that patent license was granted, prior to 28 March 2007.
817
+
818
+ Nothing in this License shall be construed as excluding or limiting
819
+ any implied license or other defenses to infringement that may
820
+ otherwise be available to you under applicable patent law.
821
+
822
+ 12. No Surrender of Others' Freedom.
823
+
824
+ If conditions are imposed on you (whether by court order, agreement or
825
+ otherwise) that contradict the conditions of this License, they do not
826
+ excuse you from the conditions of this License. If you cannot convey a
827
+ covered work so as to satisfy simultaneously your obligations under this
828
+ License and any other pertinent obligations, then as a consequence you may
829
+ not convey it at all. For example, if you agree to terms that obligate you
830
+ to collect a royalty for further conveying from those to whom you convey
831
+ the Program, the only way you could satisfy both those terms and this
832
+ License would be to refrain entirely from conveying the Program.
833
+
834
+ 13. Use with the GNU Affero General Public License.
835
+
836
+ Notwithstanding any other provision of this License, you have
837
+ permission to link or combine any covered work with a work licensed
838
+ under version 3 of the GNU Affero General Public License into a single
839
+ combined work, and to convey the resulting work. The terms of this
840
+ License will continue to apply to the part which is the covered work,
841
+ but the special requirements of the GNU Affero General Public License,
842
+ section 13, concerning interaction through a network will apply to the
843
+ combination as such.
844
+
845
+ 14. Revised Versions of this License.
846
+
847
+ The Free Software Foundation may publish revised and/or new versions of
848
+ the GNU General Public License from time to time. Such new versions will
849
+ be similar in spirit to the present version, but may differ in detail to
850
+ address new problems or concerns.
851
+
852
+ Each version is given a distinguishing version number. If the
853
+ Program specifies that a certain numbered version of the GNU General
854
+ Public License "or any later version" applies to it, you have the
855
+ option of following the terms and conditions either of that numbered
856
+ version or of any later version published by the Free Software
857
+ Foundation. If the Program does not specify a version number of the
858
+ GNU General Public License, you may choose any version ever published
859
+ by the Free Software Foundation.
860
+
861
+ If the Program specifies that a proxy can decide which future
862
+ versions of the GNU General Public License can be used, that proxy's
863
+ public statement of acceptance of a version permanently authorizes you
864
+ to choose that version for the Program.
865
+
866
+ Later license versions may give you additional or different
867
+ permissions. However, no additional obligations are imposed on any
868
+ author or copyright holder as a result of your choosing to follow a
869
+ later version.
870
+
871
+ 15. Disclaimer of Warranty.
872
+
873
+ THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY
874
+ APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT
875
+ HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY
876
+ OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO,
877
+ THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
878
+ PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM
879
+ IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF
880
+ ALL NECESSARY SERVICING, REPAIR OR CORRECTION.
881
+
882
+ 16. Limitation of Liability.
883
+
884
+ IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
885
+ WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS
886
+ THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY
887
+ GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE
888
+ USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF
889
+ DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD
890
+ PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS),
891
+ EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF
892
+ SUCH DAMAGES.
893
+
894
+ 17. Interpretation of Sections 15 and 16.
895
+
896
+ If the disclaimer of warranty and limitation of liability provided
897
+ above cannot be given local legal effect according to their terms,
898
+ reviewing courts shall apply local law that most closely approximates
899
+ an absolute waiver of all civil liability in connection with the
900
+ Program, unless a warranty or assumption of liability accompanies a
901
+ copy of the Program in return for a fee.
902
+
903
+ END OF TERMS AND CONDITIONS
904
+
905
+ How to Apply These Terms to Your New Programs
906
+
907
+ If you develop a new program, and you want it to be of the greatest
908
+ possible use to the public, the best way to achieve this is to make it
909
+ free software which everyone can redistribute and change under these terms.
910
+
911
+ To do so, attach the following notices to the program. It is safest
912
+ to attach them to the start of each source file to most effectively
913
+ state the exclusion of warranty; and each file should have at least
914
+ the "copyright" line and a pointer to where the full notice is found.
915
+
916
+ <one line to give the program's name and a brief idea of what it does.>
917
+ Copyright (C) <year> <name of author>
918
+
919
+ This program is free software: you can redistribute it and/or modify
920
+ it under the terms of the GNU General Public License as published by
921
+ the Free Software Foundation, either version 3 of the License, or
922
+ (at your option) any later version.
923
+
924
+ This program is distributed in the hope that it will be useful,
925
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
926
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
927
+ GNU General Public License for more details.
928
+
929
+ You should have received a copy of the GNU General Public License
930
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
931
+
932
+ Also add information on how to contact you by electronic and paper mail.
933
+
934
+ If the program does terminal interaction, make it output a short
935
+ notice like this when it starts in an interactive mode:
936
+
937
+ <program> Copyright (C) <year> <name of author>
938
+ This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
939
+ This is free software, and you are welcome to redistribute it
940
+ under certain conditions; type `show c' for details.
941
+
942
+ The hypothetical commands `show w' and `show c' should show the appropriate
943
+ parts of the General Public License. Of course, your program's commands
944
+ might be different; for a GUI interface, you would use an "about box".
945
+
946
+ You should also get your employer (if you work as a programmer) or school,
947
+ if any, to sign a "copyright disclaimer" for the program, if necessary.
948
+ For more information on this, and how to apply and follow the GNU GPL, see
949
+ <http://www.gnu.org/licenses/>.
950
+
951
+ The GNU General Public License does not permit incorporating your program
952
+ into proprietary programs. If your program is a subroutine library, you
953
+ may consider it more useful to permit linking proprietary applications with
954
+ the library. If this is what you want to do, use the GNU Lesser General
955
+ Public License instead of this License. But first, please read
956
+ <http://www.gnu.org/philosophy/why-not-lgpl.html>.
957
+
958
+ Name: libquadmath
959
+ Files: numpy.libs/libquadmath*.so
960
+ Description: dynamically linked to files compiled with gcc
961
+ Availability: https://gcc.gnu.org/git/?p=gcc.git;a=tree;f=libquadmath
962
+ License: LGPL-2.1-or-later
963
+
964
+ GCC Quad-Precision Math Library
965
+ Copyright (C) 2010-2019 Free Software Foundation, Inc.
966
+ Written by Francois-Xavier Coudert <fxcoudert@gcc.gnu.org>
967
+
968
+ This file is part of the libquadmath library.
969
+ Libquadmath is free software; you can redistribute it and/or
970
+ modify it under the terms of the GNU Library General Public
971
+ License as published by the Free Software Foundation; either
972
+ version 2.1 of the License, or (at your option) any later version.
973
+
974
+ Libquadmath is distributed in the hope that it will be useful,
975
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
976
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
977
+ Lesser General Public License for more details.
978
+ https://www.gnu.org/licenses/old-licenses/lgpl-2.1.html
979
+ Classifier: Development Status :: 5 - Production/Stable
980
+ Classifier: Intended Audience :: Science/Research
981
+ Classifier: Intended Audience :: Developers
982
+ Classifier: License :: OSI Approved :: BSD License
983
+ Classifier: Programming Language :: C
984
+ Classifier: Programming Language :: Python
985
+ Classifier: Programming Language :: Python :: 3
986
+ Classifier: Programming Language :: Python :: 3.9
987
+ Classifier: Programming Language :: Python :: 3.10
988
+ Classifier: Programming Language :: Python :: 3.11
989
+ Classifier: Programming Language :: Python :: 3.12
990
+ Classifier: Programming Language :: Python :: 3 :: Only
991
+ Classifier: Programming Language :: Python :: Implementation :: CPython
992
+ Classifier: Topic :: Software Development
993
+ Classifier: Topic :: Scientific/Engineering
994
+ Classifier: Typing :: Typed
995
+ Classifier: Operating System :: Microsoft :: Windows
996
+ Classifier: Operating System :: POSIX
997
+ Classifier: Operating System :: Unix
998
+ Classifier: Operating System :: MacOS
999
+ Project-URL: Homepage, https://numpy.org
1000
+ Project-URL: Documentation, https://numpy.org/doc/
1001
+ Project-URL: Source, https://github.com/numpy/numpy
1002
+ Project-URL: Download, https://pypi.org/project/numpy/#files
1003
+ Project-URL: Tracker, https://github.com/numpy/numpy/issues
1004
+ Project-URL: Release notes, https://numpy.org/doc/stable/release
1005
+ Requires-Python: >=3.9
1006
+ Description-Content-Type: text/markdown
1007
+
1008
+ <h1 align="center">
1009
+ <img src="https://raw.githubusercontent.com/numpy/numpy/main/branding/logo/primary/numpylogo.svg" width="300">
1010
+ </h1><br>
1011
+
1012
+
1013
+ [![Powered by NumFOCUS](https://img.shields.io/badge/powered%20by-NumFOCUS-orange.svg?style=flat&colorA=E1523D&colorB=007D8A)](
1014
+ https://numfocus.org)
1015
+ [![PyPI Downloads](https://img.shields.io/pypi/dm/numpy.svg?label=PyPI%20downloads)](
1016
+ https://pypi.org/project/numpy/)
1017
+ [![Conda Downloads](https://img.shields.io/conda/dn/conda-forge/numpy.svg?label=Conda%20downloads)](
1018
+ https://anaconda.org/conda-forge/numpy)
1019
+ [![Stack Overflow](https://img.shields.io/badge/stackoverflow-Ask%20questions-blue.svg)](
1020
+ https://stackoverflow.com/questions/tagged/numpy)
1021
+ [![Nature Paper](https://img.shields.io/badge/DOI-10.1038%2Fs41586--020--2649--2-blue)](
1022
+ https://doi.org/10.1038/s41586-020-2649-2)
1023
+ [![OpenSSF Scorecard](https://api.securityscorecards.dev/projects/github.com/numpy/numpy/badge)](https://api.securityscorecards.dev/projects/github.com/numpy/numpy)
1024
+
1025
+
1026
+ NumPy is the fundamental package for scientific computing with Python.
1027
+
1028
+ - **Website:** https://www.numpy.org
1029
+ - **Documentation:** https://numpy.org/doc
1030
+ - **Mailing list:** https://mail.python.org/mailman/listinfo/numpy-discussion
1031
+ - **Source code:** https://github.com/numpy/numpy
1032
+ - **Contributing:** https://www.numpy.org/devdocs/dev/index.html
1033
+ - **Bug reports:** https://github.com/numpy/numpy/issues
1034
+ - **Report a security vulnerability:** https://tidelift.com/docs/security
1035
+
1036
+ It provides:
1037
+
1038
+ - a powerful N-dimensional array object
1039
+ - sophisticated (broadcasting) functions
1040
+ - tools for integrating C/C++ and Fortran code
1041
+ - useful linear algebra, Fourier transform, and random number capabilities
1042
+
1043
+ Testing:
1044
+
1045
+ NumPy requires `pytest` and `hypothesis`. Tests can then be run after installation with:
1046
+
1047
+ python -c "import numpy, sys; sys.exit(numpy.test() is False)"
1048
+
1049
+ Code of Conduct
1050
+ ----------------------
1051
+
1052
+ NumPy is a community-driven open source project developed by a diverse group of
1053
+ [contributors](https://numpy.org/teams/). The NumPy leadership has made a strong
1054
+ commitment to creating an open, inclusive, and positive community. Please read the
1055
+ [NumPy Code of Conduct](https://numpy.org/code-of-conduct/) for guidance on how to interact
1056
+ with others in a way that makes our community thrive.
1057
+
1058
+ Call for Contributions
1059
+ ----------------------
1060
+
1061
+ The NumPy project welcomes your expertise and enthusiasm!
1062
+
1063
+ Small improvements or fixes are always appreciated. If you are considering larger contributions
1064
+ to the source code, please contact us through the [mailing
1065
+ list](https://mail.python.org/mailman/listinfo/numpy-discussion) first.
1066
+
1067
+ Writing code isn’t the only way to contribute to NumPy. You can also:
1068
+ - review pull requests
1069
+ - help us stay on top of new and old issues
1070
+ - develop tutorials, presentations, and other educational materials
1071
+ - maintain and improve [our website](https://github.com/numpy/numpy.org)
1072
+ - develop graphic design for our brand assets and promotional materials
1073
+ - translate website content
1074
+ - help with outreach and onboard new contributors
1075
+ - write grant proposals and help with other fundraising efforts
1076
+
1077
+ For more information about the ways you can contribute to NumPy, visit [our website](https://numpy.org/contribute/).
1078
+ If you’re unsure where to start or how your skills fit in, reach out! You can
1079
+ ask on the mailing list or here, on GitHub, by opening a new issue or leaving a
1080
+ comment on a relevant issue that is already open.
1081
+
1082
+ Our preferred channels of communication are all public, but if you’d like to
1083
+ speak to us in private first, contact our community coordinators at
1084
+ numpy-team@googlegroups.com or on Slack (write numpy-team@googlegroups.com for
1085
+ an invitation).
1086
+
1087
+ We also have a biweekly community call, details of which are announced on the
1088
+ mailing list. You are very welcome to join.
1089
+
1090
+ If you are new to contributing to open source, [this
1091
+ guide](https://opensource.guide/how-to-contribute/) helps explain why, what,
1092
+ and how to successfully get involved.
LTA_openwebtext_dualt/mini_owt_logdirichlet/.venv_qwen35_uv/lib/python3.12/site-packages/numpy-1.26.4.dist-info/RECORD ADDED
@@ -0,0 +1,792 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ../../../bin/f2py,sha256=ScMFUWEA5j-JCSbPSeQ-eEGgHxanaIyHWKHzNTwpC6A,398
2
+ numpy-1.26.4.dist-info/INSTALLER,sha256=5hhM4Q4mYTT9z6QB6PGpUAW81PGNFrYrdXMj4oM_6ak,2
3
+ numpy-1.26.4.dist-info/LICENSE.txt,sha256=EQewyDHpGNTx28KKMxkMdyFe8njUpMQAlXIIh3DUM0o,47721
4
+ numpy-1.26.4.dist-info/METADATA,sha256=sJc0p_7UToS0yBYZNM5TLf8ed57Ggi1BVkTRF_Y4EHA,61041
5
+ numpy-1.26.4.dist-info/RECORD,,
6
+ numpy-1.26.4.dist-info/REQUESTED,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
7
+ numpy-1.26.4.dist-info/WHEEL,sha256=3qIDcXCk577AXiK3pDifO-gE9U_MYWYGgtD78gLa2_U,137
8
+ numpy-1.26.4.dist-info/entry_points.txt,sha256=zddyYJuUw9Uud7LeLfynXk62_ry0lGihDwCIgugBdZM,144
9
+ numpy.libs/libgfortran-040039e1.so.5.0.0,sha256=FK-zEpsai1C8QKOwggx_EVLqm8EBIaqxUpQ_cFdHKIY,2686065
10
+ numpy.libs/libopenblas64_p-r0-0cf96a72.3.23.dev.so,sha256=klTQhU3XYV4R3ijXca5AiHjKgSOnrCBPIeTMejdswuU,35123345
11
+ numpy.libs/libquadmath-96973f99.so.0.0.0,sha256=k0wi3tDn0WnE1GeIdslgUa3z2UVF2pYvYLQWWbB12js,247609
12
+ numpy/__config__.py,sha256=z0NFqd9D20ShQlKyPTlbfAPWIJFDEJ7aVp3TQ5_vTxU,4902
13
+ numpy/__init__.cython-30.pxd,sha256=yk2a3etxRNlBgj5uLfIho2RYDYDzhRW8oagAG-wzbPI,36690
14
+ numpy/__init__.pxd,sha256=Pa0VYRSeQRSFepQ6ROgZrNtGY5TzBXIddWsMHtK0OkM,35066
15
+ numpy/__init__.py,sha256=Is0VNfoU10729FfMoUn_3ICHX0YL4xO4-JUnP3i8QC4,17005
16
+ numpy/__init__.pyi,sha256=9kK465XL9oS_X3fJLv0Na29NEYnWvtdMhXPtrnF_cG8,154080
17
+ numpy/_core/__init__.py,sha256=C8_7wbHqUkB35JouY_XKsas1KLpRZ7JHWuZ7VGOPVpU,136
18
+ numpy/_core/__init__.pyi,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
19
+ numpy/_core/_dtype.py,sha256=vE16-yiwUSYsAIbq7FlEY1GbXZAp8wjADDxJg3eBX-U,126
20
+ numpy/_core/_dtype_ctypes.py,sha256=i5EhoWPUhu4kla3Xu4ZvXF1lVLPiI6Zg4h6o8jaiamo,147
21
+ numpy/_core/_internal.py,sha256=g5ugmqDgUhSlie5-onOctcm4p0gcMHSIRLHVYtFTk1M,135
22
+ numpy/_core/_multiarray_umath.py,sha256=VPtoT2uHnyU3rKL0G27CgmNmB1WRHM0mtc7Y9L85C3U,159
23
+ numpy/_core/multiarray.py,sha256=kZxC_7P3Jwz1RApzQU2QGmqSq4MAEvKmaJEYnAsbSOs,138
24
+ numpy/_core/umath.py,sha256=YcV0cdbGcem6D5P3yX7cR9HGYBrT8VMoAgCBzGwPhgg,123
25
+ numpy/_distributor_init.py,sha256=IKy2THwmu5UgBjtVbwbD9H-Ap8uaUJoPJ2btQ4Jatdo,407
26
+ numpy/_globals.py,sha256=neEdcfLZoHLwber_1Xyrn26LcXy0MrSta03Ze7aKa6g,3094
27
+ numpy/_pyinstaller/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
28
+ numpy/_pyinstaller/hook-numpy.py,sha256=PUQ-mNWje6bFALB-mLVFRPkvbM4JpLXunB6sjBbTy5g,1409
29
+ numpy/_pyinstaller/pyinstaller-smoke.py,sha256=6iL-eHMQaG3rxnS5EgcvrCqElm9aKL07Cjr1FZJSXls,1143
30
+ numpy/_pyinstaller/test_pyinstaller.py,sha256=8K-7QxmfoXCG0NwR0bhIgCNrDjGlrTzWnrR1sR8btgU,1135
31
+ numpy/_pytesttester.py,sha256=lQUTvKVz6kT8b4yiMV-uW-vG9KSv9UzqAmxaEMezTd8,6731
32
+ numpy/_pytesttester.pyi,sha256=OtyXSiuSy8o_78w3QNQRjMLpvvNyEdC0aMsx6T-vRxU,489
33
+ numpy/_typing/__init__.py,sha256=6w9E9V9VaT7vTM-veua8XcySv50Je5qSPJzK9HTocIg,7003
34
+ numpy/_typing/_add_docstring.py,sha256=xQhQX372aN_m3XN95CneMxOST2FdPcovR-MXM-9ep58,3922
35
+ numpy/_typing/_array_like.py,sha256=L4gnx2KWG8yYcouz5b9boJIkkFNtOJV6QjcnGCrbnRY,4298
36
+ numpy/_typing/_callable.pyi,sha256=Mf57BwohRn9ye6ixJqjNEnK0gKqnVPE9Gy8vK-6_zxo,11121
37
+ numpy/_typing/_char_codes.py,sha256=LR51O5AUBDbCmJvlMoxyUvsfvb1p7WHrexgtTGtuWTc,5916
38
+ numpy/_typing/_dtype_like.py,sha256=21Uxy0UgIawGM82xjDF_ifMq-nP-Bkhn_LpiK_HvWC4,5661
39
+ numpy/_typing/_extended_precision.py,sha256=dGios-1k-QBGew7YFzONZTzVWxz-aYAaqlccl2_h5Bo,777
40
+ numpy/_typing/_nbit.py,sha256=-EQOShHpB3r30b4RVEcruQRTcTaFAZwtqCJ4BsvpEzA,345
41
+ numpy/_typing/_nested_sequence.py,sha256=5eNaVZAV9tZQLFWHYOuVs336JjoiaWxyZQ7cMKb6m1I,2566
42
+ numpy/_typing/_scalars.py,sha256=eVP8PjlcTIlY7v0fRI3tFXPogWtpLJZ8nFvRRrLjDqs,980
43
+ numpy/_typing/_shape.py,sha256=JPy7jJMkISGFTnkgiEifYM-4xTcjb7JMRkLIIjZLw08,211
44
+ numpy/_typing/_ufunc.pyi,sha256=e74LtOP9e8kkRhvrIJ_RXz9Ua_L43Pd9IixwNwermnM,12638
45
+ numpy/_typing/setup.py,sha256=SE0Q6HPqDjWUfceA4yXgkII8y3z7EiSF0Z-MNwOIyG4,337
46
+ numpy/_utils/__init__.py,sha256=Hhetwsi3eTBe8HdWbG51zXmcrX1DiPLxkYSrslMLYcc,723
47
+ numpy/_utils/_convertions.py,sha256=0xMxdeLOziDmHsRM_8luEh4S-kQdMoMg6GxNDDas69k,329
48
+ numpy/_utils/_inspect.py,sha256=8Ma7QBRwfSWKeK1ShJpFNc7CDhE6fkIE_wr1FxrG1A8,7447
49
+ numpy/_utils/_pep440.py,sha256=Vr7B3QsijR5p6h8YAz2LjNGUyzHUJ5gZ4v26NpZAKDc,14069
50
+ numpy/array_api/__init__.py,sha256=XtttWbDf6Yh0_m4zp-L_us4HKnV3oGwdlB6n-01Q9M8,10375
51
+ numpy/array_api/_array_object.py,sha256=rfCBzE6vUjk4HElQGTVwe6Tw2vxiUx7tmBpQEmm1iBk,43794
52
+ numpy/array_api/_constants.py,sha256=AYayN2jf1Dp5rXZ7WPBdUhtPBo_JMCi-pD9oW5zmFkI,87
53
+ numpy/array_api/_creation_functions.py,sha256=6SqHdzZqHOJFEyWFtqnj6KIKRivrGXxROlgnez_3Mt0,10050
54
+ numpy/array_api/_data_type_functions.py,sha256=P57FOsNdXahNUriVtdldonbvBQrrZkVzxZbcqkR_8AA,6288
55
+ numpy/array_api/_dtypes.py,sha256=kDU1NLvEQN-W2HPmJ2wGPx8jiNkFbrvTCD1T1RT8Pwo,4823
56
+ numpy/array_api/_elementwise_functions.py,sha256=0kGuDX3Ur_Qp6tBMBWTO7LPUxzXNGAlA2SSJhdAp4DU,25992
57
+ numpy/array_api/_indexing_functions.py,sha256=d-gzqzyvR45FQerRYJrbBzCWFnDsZWSI9pggA5QWRO4,715
58
+ numpy/array_api/_manipulation_functions.py,sha256=qCoW5B5FXcFOWKPU9D9MXHdMeXIuzvnHUUvprNlwfjc,3317
59
+ numpy/array_api/_searching_functions.py,sha256=mGZiqheYXGWiDK9rqXFiDKX0_B0mJ1OjdA-9FC2o5lA,1715
60
+ numpy/array_api/_set_functions.py,sha256=ULpfK1zznW9joX1DXSiP0R3ahcDB_po7mZlpsRqi7Fs,2948
61
+ numpy/array_api/_sorting_functions.py,sha256=7pszlxNN7-DNqEZlonGLFQrlXPP7evVA8jN31NShg00,2031
62
+ numpy/array_api/_statistical_functions.py,sha256=HspfYteZWSa3InMs10KZz-sk3ZuW6teX6fNdo829T84,3584
63
+ numpy/array_api/_typing.py,sha256=uKidRp6nYxgHnEPaqXXZsDDZ6tw1LshpbwLvy-09eeM,1347
64
+ numpy/array_api/_utility_functions.py,sha256=HwycylbPAgRVz4nZvjvwqN3mQnJbqKA-NRMaAvIP-CE,824
65
+ numpy/array_api/linalg.py,sha256=QPpG2tG1pZgzjrtTjjOu2GDu3cI6UpSsLrsG_o1jXYk,18411
66
+ numpy/array_api/setup.py,sha256=Wx6qD7GU_APiqKolYPO0OHv4eHGYrjPZmDAgjWhOEhM,341
67
+ numpy/array_api/tests/__init__.py,sha256=t_2GZ3lKcsu4ec4GMKPUDYaeMUJyDquBlQAcPgj7kFE,282
68
+ numpy/array_api/tests/test_array_object.py,sha256=FQoAxP4CLDiv6iih8KKUDSLuYM6dtnDcB1f0pMHw4-M,17035
69
+ numpy/array_api/tests/test_creation_functions.py,sha256=s3A1COWmXIAJdhzd8v7VtL-jbiSspskTqwYy0BTpmpw,5023
70
+ numpy/array_api/tests/test_data_type_functions.py,sha256=qc8ktRlVXWC3PKhxPVWI_UF9f1zZtpmzHjdCtf3e16E,1018
71
+ numpy/array_api/tests/test_elementwise_functions.py,sha256=CTj4LLwtusI51HkpzD0JPohP1ffNxogAVFz8WLuWFzM,3800
72
+ numpy/array_api/tests/test_indexing_functions.py,sha256=AbuBGyEufEAf24b7fy8JQhdJtGPdP9XEIxPTJAfAFFo,627
73
+ numpy/array_api/tests/test_manipulation_functions.py,sha256=wce25dSJjubrGhFxmiatzR_IpmNYp9ICJ9PZBBnZTOQ,1087
74
+ numpy/array_api/tests/test_set_functions.py,sha256=D016G7v3ko49bND5sVERP8IqQXZiwr-2yrKbBPJ-oqg,546
75
+ numpy/array_api/tests/test_sorting_functions.py,sha256=INPiYnuGBcsmWtYqdTTX3ENHmM4iUx4zs9KdwDaSmdA,602
76
+ numpy/array_api/tests/test_validation.py,sha256=QUG9yWC3QhkPxNhbQeakwBbl-0Rr0iTuZ41_0sfVIGU,676
77
+ numpy/compat/__init__.py,sha256=iAHrmsZWzouOMSyD9bdSE0APWMlRpqW92MQgF8y6x3E,448
78
+ numpy/compat/py3k.py,sha256=Je74CVk_7qI_qX7pLbYcuQJsxlMq1poGIfRIrH99kZQ,3833
79
+ numpy/compat/setup.py,sha256=36X1kF0C_NVROXfJ7w3SQeBm5AIDBuJbM5qT7cvSDgU,335
80
+ numpy/compat/tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
81
+ numpy/compat/tests/test_compat.py,sha256=YqV67pSN8nXPbXaEdjhmyaoVetNyFupVv57OMEgCwKA,579
82
+ numpy/conftest.py,sha256=HZyWo_wJyrbgnyXxI8t05WOg_IrzNAMnEV7O8koHous,4623
83
+ numpy/core/__init__.py,sha256=CNsO-Ab4ywM2Wz3AbqWOH3ig1q5Bno9PsUMrCv-HNS4,5780
84
+ numpy/core/__init__.pyi,sha256=xtd9OFYza-ZG3jyEJrlzRPT-SkVoB_qYmVCe6FxRks0,126
85
+ numpy/core/_add_newdocs.py,sha256=39JFaeDPN2OQlSwfpY6_Jq9fO5vML8ZMF8J4ZTx_nrs,208972
86
+ numpy/core/_add_newdocs_scalars.py,sha256=PF9v8POcSNH6ELYltkx9e07DWgMmft6NJy9zER3Jk44,12106
87
+ numpy/core/_asarray.py,sha256=P2ddlZAsg1iGleRRfoQv_aKs2N7AGwpo5K4ZQv4Ujlk,3884
88
+ numpy/core/_asarray.pyi,sha256=gNNxUVhToNU_F1QpgeEvUYddpUFN-AKP0QWa4gqcTGw,1086
89
+ numpy/core/_dtype.py,sha256=SihUz41pHRB3Q2LiYYkug6LgMBKh6VV89MOpLxnXQdo,10606
90
+ numpy/core/_dtype_ctypes.py,sha256=Vug4i7xKhznK2tdIjmn4ebclClpaCJwSZUlvEoYl0Eg,3673
91
+ numpy/core/_exceptions.py,sha256=dZWKqfdLRvJvbAEG_fof_8ikEKxjakADMty1kLC_l_M,5379
92
+ numpy/core/_internal.py,sha256=f9kNDuT-FGxF1EtVOVIxXWnH9gM9n-J5V2zwHMv4HEk,28348
93
+ numpy/core/_internal.pyi,sha256=_mCTOX6Su8D4R9fV4HNeohPJx7515B-WOlv4uq6mry8,1032
94
+ numpy/core/_machar.py,sha256=G3a3TXu8VDW_1EMxKKLnGMbvUShEIUEve3ealBlJJ3E,11565
95
+ numpy/core/_methods.py,sha256=m31p0WjcFUGckbJiHnCpSaIQGqv-Lq5niIYkdd33YMo,8613
96
+ numpy/core/_multiarray_tests.cpython-312-x86_64-linux-gnu.so,sha256=Cyy7dBn_wvcSmHqrr1GKOx2d6EBgk_edyx1xKjSrYFc,175912
97
+ numpy/core/_multiarray_umath.cpython-312-x86_64-linux-gnu.so,sha256=amUIEKhzXL25iPdHKZc3QKM3ZF3RWF_vaW5z4tvGW-s,7463681
98
+ numpy/core/_operand_flag_tests.cpython-312-x86_64-linux-gnu.so,sha256=VPbGfwOkzwWoNNVSh3jahuBTI8LrKbN_dCaMcOtDfQE,16856
99
+ numpy/core/_rational_tests.cpython-312-x86_64-linux-gnu.so,sha256=0JmPpR0Ej5eZ4vrHN_6fvrKVCeUVuQam83AxViSkN2k,59776
100
+ numpy/core/_simd.cpython-312-x86_64-linux-gnu.so,sha256=lAK8a8uKjaYoFqMQZBWnVvjeUm-KDsnZzyH_RThl9do,3535232
101
+ numpy/core/_string_helpers.py,sha256=-fQM8z5s8_yX440PmgNEH3SUjEoXMPpPSysZwWZNbuo,2852
102
+ numpy/core/_struct_ufunc_tests.cpython-312-x86_64-linux-gnu.so,sha256=PB6RqEbim2Ezi96GVTzyqi9IuqNcVGCKcgPxwHBVCAM,16960
103
+ numpy/core/_type_aliases.py,sha256=qV6AZlsUWHMWTydmZya73xuBkKXiUKq_WXLj7q2CbZ0,7534
104
+ numpy/core/_type_aliases.pyi,sha256=lguMSqMwvqAFHuRtm8YZSdKbikVz985BdKo_lo7GQCg,404
105
+ numpy/core/_ufunc_config.py,sha256=-Twpe8dnd45ccXH-w-B9nvU8yCOd1E0e3Wpsts3g_bQ,13944
106
+ numpy/core/_ufunc_config.pyi,sha256=-615enOVQMBhVx7Pln7DY_s4H6JjSgSnBy89YkpvuLg,1066
107
+ numpy/core/_umath_tests.cpython-312-x86_64-linux-gnu.so,sha256=kT7z3gJc2t_GgamgqAf3MNRWeVo8KrSWPZVh3mLs_t8,42272
108
+ numpy/core/arrayprint.py,sha256=ySZj4TZFFVCa5yhMmJKFYQYhuQTabZTRBb1YoiCD-ac,63608
109
+ numpy/core/arrayprint.pyi,sha256=21pOWjTSfJOBaKgOOPzRox1ERb3c9ydufqL0b11_P_Q,4428
110
+ numpy/core/cversions.py,sha256=H_iNIpx9-hY1cQNxqjT2d_5SXZhJbMo_caq4_q6LB7I,347
111
+ numpy/core/defchararray.py,sha256=G1LExk-dMeVTYRhtYgcCZEsHk5tkawk7giXcK4Q5KVM,73617
112
+ numpy/core/defchararray.pyi,sha256=ib3aWFcM7F4KooU57mWUNi4GlosNjdfgrLKBVSIKDvU,9216
113
+ numpy/core/einsumfunc.py,sha256=TrL6t79F0H0AQH0y5Cj7Tq0_pzk4fVFi-4q4jJmujYQ,51868
114
+ numpy/core/einsumfunc.pyi,sha256=IJZNdHHG_soig8XvCbXZl43gMr3MMKl9dckTYWecqLs,4860
115
+ numpy/core/fromnumeric.py,sha256=YMtxOBg51VMem39AHXFs-4_vOb1p48ei7njXdYTRJ_Q,128821
116
+ numpy/core/fromnumeric.pyi,sha256=KATMFeFxUJ8YNRaC-jd_dTOt3opz2ng6lHgke5u5COk,23726
117
+ numpy/core/function_base.py,sha256=tHg1qSHTz1eO_wHXNFRt3Q40uqVtPT2eyQdrWbIi4wQ,19836
118
+ numpy/core/function_base.pyi,sha256=3ZYad3cdaGwNEyP8VwK97IYMqk2PDoVjpjQzhIYHjk0,4725
119
+ numpy/core/getlimits.py,sha256=AopcTZDCUXMPcEKIZE1botc3mEhmLb2p1_ejlq1CLqY,25865
120
+ numpy/core/getlimits.pyi,sha256=qeIXUEtognTHr_T-tv-VcZI7n8Z2VzAyIpIgKXzsLkc,82
121
+ numpy/core/include/numpy/__multiarray_api.c,sha256=nPRzTez_Wy3YXy3zZNJNPMspAzxbLOdohqhXwouwMLM,12116
122
+ numpy/core/include/numpy/__multiarray_api.h,sha256=ZM--FKMhIaSQS39cPW0hj5dx8ngNMmbcy6SbgXZBd8U,61450
123
+ numpy/core/include/numpy/__ufunc_api.c,sha256=670Gcz-vhkF4taBDmktCpFRBrZ9CHJnPRx7ag7Z6HsI,1714
124
+ numpy/core/include/numpy/__ufunc_api.h,sha256=0MBOl7dgO3ldqdDi-SdciEOuqGv1UNsmk7mp7tEy4AY,12456
125
+ numpy/core/include/numpy/_dtype_api.h,sha256=4veCexGvx9KNWMIUuEUAVOfcsei9GqugohDY5ud16pA,16697
126
+ numpy/core/include/numpy/_neighborhood_iterator_imp.h,sha256=s-Hw_l5WRwKtYvsiIghF0bg-mA_CgWnzFFOYVFJ-q4k,1857
127
+ numpy/core/include/numpy/_numpyconfig.h,sha256=o0fV_jb-wgVtRxnVIWvUttiZafyrWYFm2ab9Uixz1Cw,855
128
+ numpy/core/include/numpy/arrayobject.h,sha256=-BlWQ7kfVbzCqzHn0qaeMe0_08AbwliuG98XWG57lT8,282
129
+ numpy/core/include/numpy/arrayscalars.h,sha256=C3vDRndZTZRbppiDyV5jp8sV3dRKsrwBIZcNlh9gSTA,3944
130
+ numpy/core/include/numpy/experimental_dtype_api.h,sha256=tlehD5r_pYhHbGzIrUea6vtOgf6IQ8Txblnhx7455h8,15532
131
+ numpy/core/include/numpy/halffloat.h,sha256=TRZfXgipa-dFppX2uNgkrjrPli-1BfJtadWjAembJ4s,1959
132
+ numpy/core/include/numpy/ndarrayobject.h,sha256=PhY4NjRZDoU5Zbc8MW0swPEm81hwgWZ63gAU93bLVVI,10183
133
+ numpy/core/include/numpy/ndarraytypes.h,sha256=EjWXv-J8C5JET4AlIbJRdctycL7-dyJZcnoWgnlCPc8,68009
134
+ numpy/core/include/numpy/noprefix.h,sha256=d83l1QpCCVqMV2k29NMkL3Ld1qNjiC6hzOPWZAivEjQ,6830
135
+ numpy/core/include/numpy/npy_1_7_deprecated_api.h,sha256=y0MJ8Qw7Bkt4H_4VxIzHzpkw5JqAdj5ECgtn08fZFrI,4327
136
+ numpy/core/include/numpy/npy_3kcompat.h,sha256=SvN9yRA3i02O4JFMXxZz0Uq_vJ5ZpvC-pC2sfF56A5I,15883
137
+ numpy/core/include/numpy/npy_common.h,sha256=apWBsCJeP8P5T0exgzhFcGohbASsUF8vtFdS2jc1VfU,37746
138
+ numpy/core/include/numpy/npy_cpu.h,sha256=pcVRtj-Y6120C5kWB1VAiAjZoxkTPDEg0gGm5IAt3jM,4629
139
+ numpy/core/include/numpy/npy_endian.h,sha256=we7X9fPeWzNpo_YTh09MPGDwdE0Rw_WDM4c9y4nBj5I,2786
140
+ numpy/core/include/numpy/npy_interrupt.h,sha256=DQZIxi6FycLXD8drdHn2SSmLoRhIpo6osvPv13vowUA,1948
141
+ numpy/core/include/numpy/npy_math.h,sha256=SbKRoc7O3gVuDl7HOZjk424O049I0zn-7i9GwBwNmmk,18945
142
+ numpy/core/include/numpy/npy_no_deprecated_api.h,sha256=0yZrJcQEJ6MCHJInQk5TP9_qZ4t7EfBuoLOJ34IlJd4,678
143
+ numpy/core/include/numpy/npy_os.h,sha256=hlQsg_7-RkvS3s8OM8KXy99xxyJbCm-W1AYVcdnO1cw,1256
144
+ numpy/core/include/numpy/numpyconfig.h,sha256=Nr59kE3cXmen6y0UymIBaU7F1BSIuPwgKZ4gdV5Q5JU,5308
145
+ numpy/core/include/numpy/old_defines.h,sha256=xuYQDDlMywu0Zsqm57hkgGwLsOFx6IvxzN2eiNF-gJY,6405
146
+ numpy/core/include/numpy/random/LICENSE.txt,sha256=-8U59H0M-DvGE3gID7hz1cFGMBJsrL_nVANcOSbapew,1018
147
+ numpy/core/include/numpy/random/bitgen.h,sha256=49AwKOR552r-NkhuSOF1usb_URiMSRMvD22JF5pKIng,488
148
+ numpy/core/include/numpy/random/distributions.h,sha256=W5tOyETd0m1W0GdaZ5dJP8fKlBtsTpG23V2Zlmrlqpg,9861
149
+ numpy/core/include/numpy/random/libdivide.h,sha256=ew9MNhPQd1LsCZiWiFmj9IZ7yOnA3HKOXffDeR9X1jw,80138
150
+ numpy/core/include/numpy/ufuncobject.h,sha256=Xmnny_ulZo9VwxkfkXF-1HCTKDavIp9PV_H7XWhi0Z8,12070
151
+ numpy/core/include/numpy/utils.h,sha256=wMNomSH3Dfj0q78PrjLVtFtN-FPo7UJ4o0ifCUO-6Es,1185
152
+ numpy/core/lib/libnpymath.a,sha256=mb8EluEp8SLpEeCTQJ0VshL-CqeZfWxSbS5ItM-9POc,93960
153
+ numpy/core/lib/npy-pkg-config/mlib.ini,sha256=_LsWV1eStNqwhdiYPa2538GL46dnfVwT4MrI1zbsoFw,147
154
+ numpy/core/lib/npy-pkg-config/npymath.ini,sha256=kamUNrYKAmXqQa8BcNv7D5sLqHh6bnChM0_5rZCsTfY,360
155
+ numpy/core/memmap.py,sha256=yWBJLeVClHsD8BYusnf9bdqypOMPrj3_zoO_lQ2zVMc,11771
156
+ numpy/core/memmap.pyi,sha256=sxIQ7T5hPLG-RBNndAc8JPvrsKEX1amBSH2HGg48Obo,55
157
+ numpy/core/multiarray.py,sha256=zXaWf_DSkFEWjUQqVRCGeevwsI6kjQ3x6_MUwA1Y8fk,56097
158
+ numpy/core/multiarray.pyi,sha256=_0X4W90U5ZiKt2n-9OscK-pcQyV6oGK-8jwGy5k1qxA,24768
159
+ numpy/core/numeric.py,sha256=DgajaCDXiiQR-zuW_rrx_QhApSsa5k5FONK3Uk9mfTs,77014
160
+ numpy/core/numeric.pyi,sha256=oVQkI4ABayFl_ZzCiGH4DxfYASL-3aETi-3B93THnEQ,14315
161
+ numpy/core/numerictypes.py,sha256=qIf9v1OpNjjVQzXnKpD-3V01y5Bj9huw5F-U5Wa4glc,18098
162
+ numpy/core/numerictypes.pyi,sha256=dEqtq9MLrGaqqeAF1sdXBgnEwDWOzlK02A6MTg1PS5g,3267
163
+ numpy/core/overrides.py,sha256=YUZFS8RCBvOJ27sH-jDRcyMjOCn9VigMyuQY4J21JBI,7093
164
+ numpy/core/records.py,sha256=4mpIjUp2XtZxY5cD2S8mgfn8GCzQGGrrkqLBqAJwM-Q,37533
165
+ numpy/core/records.pyi,sha256=uYwE6cAoGKgN6U4ryfGZx_3m-3sY006jytjWLrDRRy0,5692
166
+ numpy/core/shape_base.py,sha256=RPMKxA7_FCAgg_CruExl0LehnczSTFaxA6hrcfrUzns,29743
167
+ numpy/core/shape_base.pyi,sha256=Ilb4joJmbjkIZLzKww7NJeaxg2FP3AfFib3HtfOsrC0,2774
168
+ numpy/core/tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
169
+ numpy/core/tests/_locales.py,sha256=S4x5soqF0oxpBYOE8J9Iky72O9J25IiZ8349m93pWC4,2206
170
+ numpy/core/tests/data/astype_copy.pkl,sha256=lWSzCcvzRB_wpuRGj92spGIw-rNPFcd9hwJaRVvfWdk,716
171
+ numpy/core/tests/data/generate_umath_validation_data.cpp,sha256=fyhQPNhIX9hzjeXujn6mhi1MVc133zELSV_hlSQ7BQU,5842
172
+ numpy/core/tests/data/numpy_2_0_array.pkl,sha256=Vh02tdyCypa8Nb4QzdVhnDAiXEO2WQrcwcvOdDDFF5w,718
173
+ numpy/core/tests/data/recarray_from_file.fits,sha256=NA0kliz31FlLnYxv3ppzeruONqNYkuEvts5wzXEeIc4,8640
174
+ numpy/core/tests/data/umath-validation-set-README.txt,sha256=pxWwOaGGahaRd-AlAidDfocLyrAiDp0whf5hC7hYwqM,967
175
+ numpy/core/tests/data/umath-validation-set-arccos.csv,sha256=W_aL99bjzVjlVyd5omfDUORag8jHzx6uctedPVZgOHQ,61365
176
+ numpy/core/tests/data/umath-validation-set-arccosh.csv,sha256=Uko_d0kDXr1YlN-6Ii-fQQxUvbXAhRfC7Un4gJ23GJk,61365
177
+ numpy/core/tests/data/umath-validation-set-arcsin.csv,sha256=15Aenze4WD2a2dF2aOBXpv9B7u3wwAeUVJdEm4TjOkQ,61339
178
+ numpy/core/tests/data/umath-validation-set-arcsinh.csv,sha256=uDwx4PStpfV21IaPF8pmzQpul6i72g7zDwlfcynWaVQ,60289
179
+ numpy/core/tests/data/umath-validation-set-arctan.csv,sha256=mw5tYze_BMs6ugGEZfg5mcXoInGYdn7fvSCYSUi9Bqw,60305
180
+ numpy/core/tests/data/umath-validation-set-arctanh.csv,sha256=95l4Uu5RmZajljabfqlv5U34RVrifCMhhkop6iLeNBo,61339
181
+ numpy/core/tests/data/umath-validation-set-cbrt.csv,sha256=v855MTZih-fZp_GuEDst2qaIsxU4a7vlAbeIJy2xKpc,60846
182
+ numpy/core/tests/data/umath-validation-set-cos.csv,sha256=0PNnDqKkokZ7ERVDgbes8KNZc-ISJrZUlVZc5LkW18E,59122
183
+ numpy/core/tests/data/umath-validation-set-cosh.csv,sha256=FGCNeUSUTAeASsb_j18iRSsCxXLxmzF-_C7tq1elVrQ,60869
184
+ numpy/core/tests/data/umath-validation-set-exp.csv,sha256=BKg1_cyrKD2GXYMX_EB0DnXua8DI2O1KWODXf_BRhrk,17491
185
+ numpy/core/tests/data/umath-validation-set-exp2.csv,sha256=f1b05MRXPOXihC9M-yi52udKBzVXalhbTuIcqoDAk-g,58624
186
+ numpy/core/tests/data/umath-validation-set-expm1.csv,sha256=_ghc1xiUECNsBGrKCFUAy2lvu01_lkpeYJN0zDtCYWk,60299
187
+ numpy/core/tests/data/umath-validation-set-log.csv,sha256=z9ej1ykKUoMRqYMUIJENWXbYi_A_x_RKs7K_GuXZJus,11692
188
+ numpy/core/tests/data/umath-validation-set-log10.csv,sha256=RJgpruL16FVPgUT3-3xW4eppS_tn6o5yEW79KnITn48,68922
189
+ numpy/core/tests/data/umath-validation-set-log1p.csv,sha256=IZZI-hi55HGCOvBat3vSBVha_8Nt-5alf2fqz6QeTG0,60303
190
+ numpy/core/tests/data/umath-validation-set-log2.csv,sha256=HL2rOCsrEi378rNrbsXHPqlWlEGkXQq8R4e63YeTksU,68917
191
+ numpy/core/tests/data/umath-validation-set-sin.csv,sha256=8PUjnQ_YfmxFb42XJrvpvmkeSpEOlEXSmNvIK4VgfAM,58611
192
+ numpy/core/tests/data/umath-validation-set-sinh.csv,sha256=CYiibE8aX7MQnBatl__5k_PWc_9vHUifwS-sFZzzKk0,60293
193
+ numpy/core/tests/data/umath-validation-set-tan.csv,sha256=Oq7gxMvblRVBrQ23kMxc8iT0bHnCWKg9EE4ZqzbJbOA,60299
194
+ numpy/core/tests/data/umath-validation-set-tanh.csv,sha256=iolZF_MOyWRgYSa-SsD4df5mnyFK18zrICI740SWoTc,60299
195
+ numpy/core/tests/examples/cython/checks.pyx,sha256=rKAhPSGHJ9oPK9Q_85YoUQyRTftEP1jcYOR5lSPB6oQ,662
196
+ numpy/core/tests/examples/cython/meson.build,sha256=Qk4Q6OkpZ0xsLUkcGQVVrYkzb0ozoyL6YlSZ8_5tH1I,1088
197
+ numpy/core/tests/examples/cython/setup.py,sha256=aAR-TvQabUabnCzuB6UdWdmRXaaPfIG7MzTIfMF-0tk,496
198
+ numpy/core/tests/examples/limited_api/limited_api.c,sha256=mncE8TjjXmYpkwli433G0jB2zGQO_5NqWmGKdzRJZug,344
199
+ numpy/core/tests/examples/limited_api/setup.py,sha256=p2w7F1ardi_GRXSrnNIR8W1oeH_pgmw_1P2wS0A2I6M,435
200
+ numpy/core/tests/test__exceptions.py,sha256=QqxQSLXboPXEVwHz-TyE2JeIl_TC-rPugzfo25nbcns,2846
201
+ numpy/core/tests/test_abc.py,sha256=FfgYA_HjYAi8XWGK_oOh6Zw86chB_KG_XoW_7ZlFp4c,2220
202
+ numpy/core/tests/test_api.py,sha256=UMc7SvczAQ5ngHxE-NoXVvNpVzYRrn8oMwFNta1yMS0,22995
203
+ numpy/core/tests/test_argparse.py,sha256=C0zBbwQ9xzzymXe_hHpWnnWQPwOi2ZdQB78gBAgJHvU,1969
204
+ numpy/core/tests/test_array_coercion.py,sha256=zY4Pjlt4QZ0w71WxWGLHcrPnnhEF51yXYVLg5HMIy5c,34379
205
+ numpy/core/tests/test_array_interface.py,sha256=8tGgj1Nzi76H_WF5GULkxqWL7Yu_Xf0lvTJZOwOBKsI,7774
206
+ numpy/core/tests/test_arraymethod.py,sha256=VpjDYTmoMDTZcY7CsGzinBh0R_OICuwOykWCbmCRQZU,3244
207
+ numpy/core/tests/test_arrayprint.py,sha256=cKaIoD9ZvsjJH0PHwZyOxmcRcBt1kN1WfFneqVqs0b8,40462
208
+ numpy/core/tests/test_casting_floatingpoint_errors.py,sha256=W3Fgk0oKtXFv684fEZ7POwj6DHTYK0Jj_oGRLZ8UdyA,5063
209
+ numpy/core/tests/test_casting_unittests.py,sha256=9-vkR0oXczQz8ED8DxGVPmalC8IZXe2jKgOCMGr8hIg,34298
210
+ numpy/core/tests/test_conversion_utils.py,sha256=jNhbNNI-T8qtQnsIMEax7KFN30kjh0ICntLMwTyxJ5Q,6559
211
+ numpy/core/tests/test_cpu_dispatcher.py,sha256=v_SlhUpENuoe7QYXizzYITLGXa7WfZ7jqcqmbSBg7JU,1542
212
+ numpy/core/tests/test_cpu_features.py,sha256=mieGx7dxXFiyTYatbcCCjIjR67Un2hVcbJx4GEf2yFo,14892
213
+ numpy/core/tests/test_custom_dtypes.py,sha256=JogRmttDLwfQ3PTbewEnGLKco9zV2Nu3yIfrMeCsx_I,9401
214
+ numpy/core/tests/test_cython.py,sha256=t5-h4XSIFNLyw_9BIAQDYl8_80t_pH0SCfEa1Vf_3aI,3755
215
+ numpy/core/tests/test_datetime.py,sha256=2vAGbrCQmsrWNXCVXOMZqUGZn2c-cQT-eZ1wTprYbcM,116211
216
+ numpy/core/tests/test_defchararray.py,sha256=F88HUkByEP4H6cJ_ITvIe0a_T1BH2JOdRysMCu1XIn0,24997
217
+ numpy/core/tests/test_deprecations.py,sha256=w2lhHb-W8hh7RoE_0Ftg8thpG86jvbFAJgior22DY2Q,31076
218
+ numpy/core/tests/test_dlpack.py,sha256=cDlwFmTombb2rDeB8RHEAJ4eVMUiDbw8Oz5Jo1NQwk0,3522
219
+ numpy/core/tests/test_dtype.py,sha256=J09pJF59v7UO6iNuJFISKP2DLPgdkQ_df5OAMDRLikU,75702
220
+ numpy/core/tests/test_einsum.py,sha256=QzQAPIC-IjTV3Dxz97hBnvLBCmF8kpsBTBckThhgRjQ,53712
221
+ numpy/core/tests/test_errstate.py,sha256=U3GT9I058jkF725mx4GdWUr9RoceCkGDV7Go79VA4wY,2219
222
+ numpy/core/tests/test_extint128.py,sha256=gCZfAwPOb-F1TLsEEeDI0amQYwHk-60-OXi0ccZrrZ8,5643
223
+ numpy/core/tests/test_function_base.py,sha256=Ibs6-WXZE5hsRx4VCnX-cZOWYKU-5PFXjouwAQzgnqQ,15595
224
+ numpy/core/tests/test_getlimits.py,sha256=apdxr0zKkxaVHIUpLrqAvO39q54JKN14sV4xSbK2Ifs,6718
225
+ numpy/core/tests/test_half.py,sha256=VYPyap9GYOWZuphsfFofcIRl-oa5Ufrtv83OTp6azdU,24593
226
+ numpy/core/tests/test_hashtable.py,sha256=ZV8HL8NkDnoQZfnje7BP0fyIp4fSFqjKsQc40PaTggc,1011
227
+ numpy/core/tests/test_indexerrors.py,sha256=kN9xLl6FVTzmI7fumn_cuZ3k0omXnTetgtCnPY44cvw,5130
228
+ numpy/core/tests/test_indexing.py,sha256=x0ojWuhOwWD5MZuiJ9Ncim3CgkwI-GldWxrSCmjmFJM,54314
229
+ numpy/core/tests/test_item_selection.py,sha256=kI30kiX8mIrZYPn0jw3lGGw1ruZF4PpE9zw-aai9EPA,6458
230
+ numpy/core/tests/test_limited_api.py,sha256=5yO0nGmCKZ9b3S66QP7vY-HIgAoyOtHZmp8mvzKuOHI,1172
231
+ numpy/core/tests/test_longdouble.py,sha256=jO8YMm_Hsz-XPKbmv6iMcOdHgTlIFkKTwAtxpy3Q1pE,13905
232
+ numpy/core/tests/test_machar.py,sha256=_5_TDUVtAJvJI5jBfEFKpCZtAfKCsCFt7tXlWSkWzzc,1067
233
+ numpy/core/tests/test_mem_overlap.py,sha256=QJ0unWD_LOoAGAo4ra0IvYenj56IYUtiz1fEJEmTY9Q,29086
234
+ numpy/core/tests/test_mem_policy.py,sha256=CXa10FQw2Qj6MqJuaC8Fm4slsoipKFjCIpYF6c5IIAU,16801
235
+ numpy/core/tests/test_memmap.py,sha256=tZ5lJs_4ZFsJmg392ZQ33fX0m8tdfZ8ZtY9Lq41LNtk,7477
236
+ numpy/core/tests/test_multiarray.py,sha256=GPv4IJR9dijNG-icUsQsX2tBD2RdP3EhUehY4cxvVQU,380106
237
+ numpy/core/tests/test_nditer.py,sha256=nVQ00aNxPHqf4ZcFs3e9AVDK64TCqlO0TzfocTAACZQ,130818
238
+ numpy/core/tests/test_nep50_promotions.py,sha256=2TwtFvj1LBpYTtdR6NFe1RAAGXIJltLqwpA1vhQCVY4,8840
239
+ numpy/core/tests/test_numeric.py,sha256=ZGNW5NKgShEjZC_TcPOtTuRaTM_GbuM21u82D205UPs,137294
240
+ numpy/core/tests/test_numerictypes.py,sha256=f_xMjZJnyDwlc6XCrd71b6x1_6dAWOv-kZ3-NEq37hU,21687
241
+ numpy/core/tests/test_numpy_2_0_compat.py,sha256=kVCTAXska7Xi5w_TYduWhid0nlCqI6Nvmt-gDnYsuKI,1630
242
+ numpy/core/tests/test_overrides.py,sha256=t0gOZOzu7pevE58HA-npFYJqnInHR-LLBklnzKJWHqo,26080
243
+ numpy/core/tests/test_print.py,sha256=ErZAWd88b0ygSEoYpd0BL2tFjkerMtn1vZ7dWvaNqTc,6837
244
+ numpy/core/tests/test_protocols.py,sha256=fEXE9K9s22oiVWkX92BY-g00-uXCK-HxjZhZxxYAKFc,1168
245
+ numpy/core/tests/test_records.py,sha256=pluit5x6jkWoPEIrHXM13L3xZuuSSiaxoXFsOdkakCU,20269
246
+ numpy/core/tests/test_regression.py,sha256=SJo9cPTVr2SNjhgtW7boUMyNQlXxygsZ5g0oyqC8Eks,91595
247
+ numpy/core/tests/test_scalar_ctors.py,sha256=qDIZV-tBukwAxNDhUmGtH3CemDXlS3xd_q3L52touuA,6115
248
+ numpy/core/tests/test_scalar_methods.py,sha256=Uj-zU0zzzKAjMBdpkzsWZ3nSFj5gJkUlqi_euhOYdnU,7541
249
+ numpy/core/tests/test_scalarbuffer.py,sha256=FSL94hriWX1_uV6Z33wB3ZXUrpmmX2-x87kNjIxUeBk,5580
250
+ numpy/core/tests/test_scalarinherit.py,sha256=fMInDGKsiH3IS_2ejZtIcmJZ0Ry8c7kVsHx7wp5XDoM,2368
251
+ numpy/core/tests/test_scalarmath.py,sha256=XZj_m2I2TLktJdFD1SWj2XtV8hT26VIxasDz3cAFvgA,43247
252
+ numpy/core/tests/test_scalarprint.py,sha256=1599W5X0tjGhBnSQjalXkg6AY8eHXnr6PMqs4vYZQqs,18771
253
+ numpy/core/tests/test_shape_base.py,sha256=D9haeuUVx3x3pOLmFQ9vUz7iU4T2bFTsPoI8HgSncFU,29723
254
+ numpy/core/tests/test_simd.py,sha256=-L1UhIn9Eu_euLwaSU7bPRfYpWWOTb43qovoJS7Ws7w,48696
255
+ numpy/core/tests/test_simd_module.py,sha256=OSpYhH_3QDxItyQcaW6SjXW57k2m-weRwpYOnJjCqN0,3902
256
+ numpy/core/tests/test_strings.py,sha256=A9t1B65lFrYRLXgDJSg3mMDAe_hypIPcTMVOdAYIbU0,3835
257
+ numpy/core/tests/test_ufunc.py,sha256=5pS2x3LACHn8GogYYad8LRAjByK7Gg9xTD9ik3d0Fm0,124907
258
+ numpy/core/tests/test_umath.py,sha256=huHpclJqkO32k7BTflRHj8nImzg3p6yyryeS9LyHKWU,186482
259
+ numpy/core/tests/test_umath_accuracy.py,sha256=mFcVdzXhhD9mqhzLDJVZsWfCHbjbFQ6XeEl5G8l-PTc,3897
260
+ numpy/core/tests/test_umath_complex.py,sha256=WvZZZWeijo52RiOfx-G83bxzQOp_IJ3i9fEnUDVukLQ,23247
261
+ numpy/core/tests/test_unicode.py,sha256=hUXIwMmoq89y_KXWzuXVyQaXvRwGjfY4TvKJsCbygEI,12775
262
+ numpy/core/umath.py,sha256=JbT_SxnZ_3MEmjOI9UtX3CcAzX5Q-4RDlnnhDAEJ5Vo,2040
263
+ numpy/core/umath_tests.py,sha256=TIzaDfrEHHgSc2J5kxFEibq8MOPhwSuyOZOUBsZNVSM,389
264
+ numpy/ctypeslib.py,sha256=Po4XCWfxhwFQ1Q8x8DeayGiMCJLxREaCDkVyeladxBU,17247
265
+ numpy/ctypeslib.pyi,sha256=A9te473aRO920iDVuyKypeVIQp-ueZK6EiI-qLSwJNg,7972
266
+ numpy/doc/__init__.py,sha256=OYmE-F6x0CD05PCDY2MiW1HLlwB6i9vhDpk-a3r4lHY,508
267
+ numpy/doc/constants.py,sha256=PlXoj7b4A8Aa9nADbg83uzTBRJaX8dvJmEdbn4FDPPo,9155
268
+ numpy/doc/ufuncs.py,sha256=i1alLg19mNyCFZ2LYSOZGm--RsRN1x63U_UYU-N3x60,5357
269
+ numpy/dtypes.py,sha256=BuBztrPQRasUmVZhXr2_NgJujdUTNhNwd59pZZHk3lA,2229
270
+ numpy/dtypes.pyi,sha256=tIHniAYP7ALg2iT7NgSXO67jvE-zRlDod3MazEmD4M8,1315
271
+ numpy/exceptions.py,sha256=7j7tv8cwXGZYgldyMisGmnAxAl2s4YU0vexME81yYlA,7339
272
+ numpy/exceptions.pyi,sha256=KsZqWNvyPUEXUGR9EhZCUQF2f9EVSpBRlJUlGqRT02k,600
273
+ numpy/f2py/__init__.py,sha256=m-ty_WiJZ4GVfV5--kJ3MFJaLXestz5Eo-4H0FPscK4,5565
274
+ numpy/f2py/__init__.pyi,sha256=eA7uYXZr0p0aaz5rBW-EypLx9RchrvqDYtSnkEJQsYw,1087
275
+ numpy/f2py/__main__.py,sha256=6i2jVH2fPriV1aocTY_dUFvWK18qa-zjpnISA-OpF3w,130
276
+ numpy/f2py/__version__.py,sha256=7HHdjR82FCBmftwMRyrlhcEj-8mGQb6oCH-wlUPH4Nw,34
277
+ numpy/f2py/_backends/__init__.py,sha256=7_bA7c_xDpLc4_8vPfH32-Lxn9fcUTgjQ25srdvwvAM,299
278
+ numpy/f2py/_backends/_backend.py,sha256=GKb9-UaFszT045vUgVukPs1n97iyyjqahrWKxLOKNYo,1187
279
+ numpy/f2py/_backends/_distutils.py,sha256=pxh2YURFYYSykIOvBFwVvhoNX1oSk-c30IPPhzlko-0,2383
280
+ numpy/f2py/_backends/_meson.py,sha256=gi-nbnPFDC38sumfAjg-Q5FPu6nNkyQXTjEuVf9W9Cc,6916
281
+ numpy/f2py/_backends/meson.build.template,sha256=oTPNMAQzS4CJ_lfEzYv-oBeJTtQuThUYVN5R6ROWpNU,1579
282
+ numpy/f2py/_isocbind.py,sha256=zaBgpfPNRmxVG3doUIlbZIiyB990MsXiwDabrSj9HnQ,2360
283
+ numpy/f2py/_src_pyf.py,sha256=4t6TN4ZKWciC4f1z6fwaGrpIGhHKRiwHfcrNj4FIzCg,7654
284
+ numpy/f2py/auxfuncs.py,sha256=dNs4b2KDIcG4M1hPBvD09-Vh7CDzlPIrFscOdvL3p1o,26539
285
+ numpy/f2py/capi_maps.py,sha256=ENjYyeZ3CCJcLwJJgmKOSYrD1KPuhpwauXqeizdV55o,30563
286
+ numpy/f2py/cb_rules.py,sha256=5TuHbJWGjsF6yVNzKuV2tAnwdLyhcWlmdsjYlDOZOv4,24992
287
+ numpy/f2py/cfuncs.py,sha256=KJyW7mdjmFSmxssfeegGJs5NZyF3mZMgNvOxN9-vYHQ,51913
288
+ numpy/f2py/common_rules.py,sha256=gHB76WypbkVmhaD_RWhy8Od4zDTgj8cbDOdUdIp6PIQ,5131
289
+ numpy/f2py/crackfortran.py,sha256=ErLdkWP8MxeyW5vVPGXwyvrxZAwymlvIBC0th2rvK74,148553
290
+ numpy/f2py/diagnose.py,sha256=0SRXBE2hJgKJN_Rf4Zn00oKXC_Tka3efPWM47zg6BoY,5197
291
+ numpy/f2py/f2py2e.py,sha256=5t093ZQ4xs0_0UbyaYVd2yA2EVOaOAcuU29JI-IU2Ag,27717
292
+ numpy/f2py/f90mod_rules.py,sha256=otm3_dmVIna0eBVHLu_693s3a_82lU3pqeqDacWI37s,9594
293
+ numpy/f2py/func2subr.py,sha256=6d2R5awuHRT4xzgfUfwS7JHTqhhAieSXcENlssD_2c4,10298
294
+ numpy/f2py/rules.py,sha256=B4FxSYEfZ_1j_z9GulQNZ1BNrPrUvlU3ybxwTkrIxjI,62727
295
+ numpy/f2py/setup.cfg,sha256=Fpn4sjqTl5OT5sp8haqKIRnUcTPZNM6MIvUJBU7BIhg,48
296
+ numpy/f2py/setup.py,sha256=MmAVspT8DDTqDuL8ZJhxK62g0lcso4vqI6QNQ9CsfoQ,2422
297
+ numpy/f2py/src/fortranobject.c,sha256=g4BKDO1_9pCu6hithKXD2oH_Mt-HH1NTnP6leCqJrzc,46017
298
+ numpy/f2py/src/fortranobject.h,sha256=neMKotYWbHvrhW9KXz4QzQ8fzPkiQXLHHjy82vLSeog,5835
299
+ numpy/f2py/symbolic.py,sha256=jWBoAwECCxRdWczR9r7O6UERcYmH_GbdcAReNp7cmJY,53270
300
+ numpy/f2py/tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
301
+ numpy/f2py/tests/src/abstract_interface/foo.f90,sha256=JFU2w98cB_XNwfrqNtI0yDTmpEdxYO_UEl2pgI_rnt8,658
302
+ numpy/f2py/tests/src/abstract_interface/gh18403_mod.f90,sha256=gvQJIzNtvacWE0dhysxn30-iUeI65Hpq7DiE9oRauz8,105
303
+ numpy/f2py/tests/src/array_from_pyobj/wrapmodule.c,sha256=Ff5wHYV9-OJnZuelfFWcjAibRvDkEIlbTVczTyv6TG8,7299
304
+ numpy/f2py/tests/src/assumed_shape/.f2py_f2cmap,sha256=But9r9m4iL7EGq_haMW8IiQ4VivH0TgUozxX4pPvdpE,29
305
+ numpy/f2py/tests/src/assumed_shape/foo_free.f90,sha256=oBwbGSlbr9MkFyhVO2aldjc01dr9GHrMrSiRQek8U64,460
306
+ numpy/f2py/tests/src/assumed_shape/foo_mod.f90,sha256=rfzw3QdI-eaDSl-hslCgGpd5tHftJOVhXvb21Y9Gf6M,499
307
+ numpy/f2py/tests/src/assumed_shape/foo_use.f90,sha256=rmT9k4jP9Ru1PLcGqepw9Jc6P9XNXM0axY7o4hi9lUw,269
308
+ numpy/f2py/tests/src/assumed_shape/precision.f90,sha256=r08JeTVmTTExA-hYZ6HzaxVwBn1GMbPAuuwBhBDtJUk,130
309
+ numpy/f2py/tests/src/block_docstring/foo.f,sha256=y7lPCPu7_Fhs_Tf2hfdpDQo1bhtvNSKRaZAOpM_l3dg,97
310
+ numpy/f2py/tests/src/callback/foo.f,sha256=C1hjfpRCQWiOVVzIHqnsYcnLrqQcixrnHCn8hd9GhVk,1254
311
+ numpy/f2py/tests/src/callback/gh17797.f90,sha256=_Nrl0a2HgUbtymGU0twaJ--7rMa1Uco2A3swbWvHoMo,148
312
+ numpy/f2py/tests/src/callback/gh18335.f90,sha256=NraOyKIXyvv_Y-3xGnmTjtNjW2Znsnlk8AViI8zfovc,506
313
+ numpy/f2py/tests/src/callback/gh25211.f,sha256=a2sxlQhtDVbYn8KOKHUYqwc-aCFt7sDPSnJsXFG35uI,179
314
+ numpy/f2py/tests/src/callback/gh25211.pyf,sha256=FWxo0JWQlw519BpZV8PoYeI_FZ_K6C-3Wk6gLrfBPlw,447
315
+ numpy/f2py/tests/src/cli/gh_22819.pyf,sha256=5rvOfCv-wSosB354LC9pExJmMoSHnbGZGl_rtA2fogA,142
316
+ numpy/f2py/tests/src/cli/hi77.f,sha256=ttyI6vAP3qLnDqy82V04XmoqrXNM6uhMvvLri2p0dq0,71
317
+ numpy/f2py/tests/src/cli/hiworld.f90,sha256=QWOLPrTxYQu1yrEtyQMbM0fE9M2RmXe7c185KnD5x3o,51
318
+ numpy/f2py/tests/src/common/block.f,sha256=GQ0Pd-VMX3H3a-__f2SuosSdwNXHpBqoGnQDjf8aG9g,224
319
+ numpy/f2py/tests/src/common/gh19161.f90,sha256=BUejyhqpNVfHZHQ-QC7o7ZSo7lQ6YHyX08lSmQqs6YM,193
320
+ numpy/f2py/tests/src/crackfortran/accesstype.f90,sha256=-5Din7YlY1TU7tUHD2p-_DSTxGBpDsWYNeT9WOwGhno,208
321
+ numpy/f2py/tests/src/crackfortran/data_common.f,sha256=ZSUAh3uhn9CCF-cYqK5TNmosBGPfsuHBIEfudgysun4,193
322
+ numpy/f2py/tests/src/crackfortran/data_multiplier.f,sha256=jYrJKZWF_59JF9EMOSALUjn0UupWvp1teuGpcL5s1Sc,197
323
+ numpy/f2py/tests/src/crackfortran/data_stmts.f90,sha256=19YO7OGj0IksyBlmMLZGRBQLjoE3erfkR4tFvhznvvE,693
324
+ numpy/f2py/tests/src/crackfortran/data_with_comments.f,sha256=hoyXw330VHh8duMVmAQZjr1lgLVF4zFCIuEaUIrupv0,175
325
+ numpy/f2py/tests/src/crackfortran/foo_deps.f90,sha256=CaH7mnWTG7FcnJe2vXN_0zDbMadw6NCqK-JJ2HmDjK8,128
326
+ numpy/f2py/tests/src/crackfortran/gh15035.f,sha256=jJly1AzF5L9VxbVQ0vr-sf4LaUo4eQzJguhuemFxnvg,375
327
+ numpy/f2py/tests/src/crackfortran/gh17859.f,sha256=7K5dtOXGuBDAENPNCt-tAGJqTfNKz5OsqVSk16_e7Es,340
328
+ numpy/f2py/tests/src/crackfortran/gh22648.pyf,sha256=qZHPRNQljIeYNwbqPLxREnOrSdVV14f3fnaHqB1M7c0,241
329
+ numpy/f2py/tests/src/crackfortran/gh23533.f,sha256=w3tr_KcY3s7oSWGDmjfMHv5h0RYVGUpyXquNdNFOJQg,126
330
+ numpy/f2py/tests/src/crackfortran/gh23598.f90,sha256=41W6Ire-5wjJTTg6oAo7O1WZfd1Ug9vvNtNgHS5MhEU,101
331
+ numpy/f2py/tests/src/crackfortran/gh23598Warn.f90,sha256=1v-hMCT_K7prhhamoM20nMU9zILam84Hr-imck_dYYk,205
332
+ numpy/f2py/tests/src/crackfortran/gh23879.f90,sha256=LWDJTYR3t9h1IsrKC8dVXZlBfWX7clLeU006X6Ow8oI,332
333
+ numpy/f2py/tests/src/crackfortran/gh2848.f90,sha256=gPNasx98SIf7Z9ibk_DHiGKCvl7ERtsfoGXiFDT7FbM,282
334
+ numpy/f2py/tests/src/crackfortran/operators.f90,sha256=-Fc-qjW1wBr3Dkvdd5dMTrt0hnjnV-1AYo-NFWcwFSo,1184
335
+ numpy/f2py/tests/src/crackfortran/privatemod.f90,sha256=7bubZGMIn7iD31wDkjF1TlXCUM7naCIK69M9d0e3y-U,174
336
+ numpy/f2py/tests/src/crackfortran/publicmod.f90,sha256=Pnwyf56Qd6W3FUH-ZMgnXEYkb7gn18ptNTdwmGan0Jo,167
337
+ numpy/f2py/tests/src/crackfortran/pubprivmod.f90,sha256=eYpJwBYLKGOxVbKgEqfny1znib-b7uYhxcRXIf7uwXg,165
338
+ numpy/f2py/tests/src/crackfortran/unicode_comment.f90,sha256=aINLh6GlfTwFewxvDoqnMqwuCNb4XAqi5Nj5vXguXYs,98
339
+ numpy/f2py/tests/src/f2cmap/.f2py_f2cmap,sha256=iUOtfHd3OuT1Rz2-yiSgt4uPKGvCt5AzQ1iygJt_yjg,82
340
+ numpy/f2py/tests/src/f2cmap/isoFortranEnvMap.f90,sha256=iJCD8a8MUTmuPuedbcmxW54Nr4alYuLhksBe1sHS4K0,298
341
+ numpy/f2py/tests/src/isocintrin/isoCtests.f90,sha256=jcw-fzrFh0w5U66uJYfeUW4gv94L5MnWQ_NpsV9y0oI,998
342
+ numpy/f2py/tests/src/kind/foo.f90,sha256=zIHpw1KdkWbTzbXb73hPbCg4N2Htj3XL8DIwM7seXpo,347
343
+ numpy/f2py/tests/src/mixed/foo.f,sha256=90zmbSHloY1XQYcPb8B5d9bv9mCZx8Z8AMTtgDwJDz8,85
344
+ numpy/f2py/tests/src/mixed/foo_fixed.f90,sha256=pxKuPzxF3Kn5khyFq9ayCsQiolxB3SaNtcWaK5j6Rv4,179
345
+ numpy/f2py/tests/src/mixed/foo_free.f90,sha256=fIQ71wrBc00JUAVUj_r3QF9SdeNniBiMw6Ly7CGgPWU,139
346
+ numpy/f2py/tests/src/module_data/mod.mod,sha256=EkjrU7NTZrOH68yKrz6C_eyJMSFSxGgC2yMQT9Zscek,412
347
+ numpy/f2py/tests/src/module_data/module_data_docstring.f90,sha256=tDZ3fUlazLL8ThJm3VwNGJ75QIlLcW70NnMFv-JA4W0,224
348
+ numpy/f2py/tests/src/negative_bounds/issue_20853.f90,sha256=fdOPhRi7ipygwYCXcda7p_dlrws5Hd2GlpF9EZ-qnck,157
349
+ numpy/f2py/tests/src/parameter/constant_both.f90,sha256=-bBf2eqHb-uFxgo6Q7iAtVUUQzrGFqzhHDNaxwSICfQ,1939
350
+ numpy/f2py/tests/src/parameter/constant_compound.f90,sha256=re7pfzcuaquiOia53UT7qNNrTYu2euGKOF4IhoLmT6g,469
351
+ numpy/f2py/tests/src/parameter/constant_integer.f90,sha256=nEmMLitKoSAG7gBBEQLWumogN-KS3DBZOAZJWcSDnFw,612
352
+ numpy/f2py/tests/src/parameter/constant_non_compound.f90,sha256=IcxESVLKJUZ1k9uYKoSb8Hfm9-O_4rVnlkiUU2diy8Q,609
353
+ numpy/f2py/tests/src/parameter/constant_real.f90,sha256=quNbDsM1Ts2rN4WtPO67S9Xi_8l2cXabWRO00CPQSSQ,610
354
+ numpy/f2py/tests/src/quoted_character/foo.f,sha256=WjC9D9171fe2f7rkUAZUvik9bkIf9adByfRGzh6V0cM,482
355
+ numpy/f2py/tests/src/regression/gh25337/data.f90,sha256=9Uz8CHB9i3_mjC3cTOmkTgPAF5tWSwYacG3MUrU-SY0,180
356
+ numpy/f2py/tests/src/regression/gh25337/use_data.f90,sha256=WATiDGAoCKnGgMzm_iMgmfVU0UKOQlk5Fm0iXCmPAkE,179
357
+ numpy/f2py/tests/src/regression/inout.f90,sha256=CpHpgMrf0bqA1W3Ozo3vInDz0RP904S7LkpdAH6ODck,277
358
+ numpy/f2py/tests/src/return_character/foo77.f,sha256=WzDNF3d_hUDSSZjtxd3DtE-bSx1ilOMEviGyYHbcFgM,980
359
+ numpy/f2py/tests/src/return_character/foo90.f90,sha256=ULcETDEt7gXHRzmsMhPsGG4o3lGrcx-FEFaJsPGFKyA,1248
360
+ numpy/f2py/tests/src/return_complex/foo77.f,sha256=8ECRJkfX82oFvGWKbIrCvKjf5QQQClx4sSEvsbkB6A8,973
361
+ numpy/f2py/tests/src/return_complex/foo90.f90,sha256=c1BnrtWwL2dkrTr7wvlEqNDg59SeNMo3gyJuGdRwcDw,1238
362
+ numpy/f2py/tests/src/return_integer/foo77.f,sha256=_8k1evlzBwvgZ047ofpdcbwKdF8Bm3eQ7VYl2Y8b5kA,1178
363
+ numpy/f2py/tests/src/return_integer/foo90.f90,sha256=bzxbYtofivGRYH35Ang9ScnbNsVERN8-6ub5-eI-LGQ,1531
364
+ numpy/f2py/tests/src/return_logical/foo77.f,sha256=FxiF_X0HkyXHzJM2rLyTubZJu4JB-ObLnVqfZwAQFl8,1188
365
+ numpy/f2py/tests/src/return_logical/foo90.f90,sha256=9KmCe7yJYpi4ftkKOM3BCDnPOdBPTbUNrKxY3p37O14,1531
366
+ numpy/f2py/tests/src/return_real/foo77.f,sha256=ZTrzb6oDrIDPlrVWP3Bmtkbz3ffHaaSQoXkfTGtCuFE,933
367
+ numpy/f2py/tests/src/return_real/foo90.f90,sha256=gZuH5lj2lG6gqHlH766KQ3J4-Ero-G4WpOOo2MG3ohU,1194
368
+ numpy/f2py/tests/src/size/foo.f90,sha256=IlFAQazwBRr3zyT7v36-tV0-fXtB1d7WFp6S1JVMstg,815
369
+ numpy/f2py/tests/src/string/char.f90,sha256=ihr_BH9lY7eXcQpHHDQhFoKcbu7VMOX5QP2Tlr7xlaM,618
370
+ numpy/f2py/tests/src/string/fixed_string.f90,sha256=5n6IkuASFKgYICXY9foCVoqndfAY0AQZFEK8L8ARBGM,695
371
+ numpy/f2py/tests/src/string/gh24008.f,sha256=UA8Pr-_yplfOFmc6m4v9ryFQ8W9OulaglulefkFWD68,217
372
+ numpy/f2py/tests/src/string/gh24662.f90,sha256=-Tp9Kd1avvM7AIr8ZukFA9RVr-wusziAnE8AvG9QQI4,197
373
+ numpy/f2py/tests/src/string/gh25286.f90,sha256=2EpxvC-0_dA58MBfGQcLyHzpZgKcMf_W9c73C_Mqnok,304
374
+ numpy/f2py/tests/src/string/gh25286.pyf,sha256=GjgWKh1fHNdPGRiX5ek60i1XSeZsfFalydWqjISPVV8,381
375
+ numpy/f2py/tests/src/string/gh25286_bc.pyf,sha256=6Y9zU66NfcGhTXlFOdFjCSMSwKXpq5ZfAe3FwpkAsm4,384
376
+ numpy/f2py/tests/src/string/scalar_string.f90,sha256=ACxV2i6iPDk-a6L_Bs4jryVKYJMEGUTitEIYTjbJes4,176
377
+ numpy/f2py/tests/src/string/string.f,sha256=shr3fLVZaa6SyUJFYIF1OZuhff8v5lCwsVNBU2B-3pk,248
378
+ numpy/f2py/tests/src/value_attrspec/gh21665.f90,sha256=JC0FfVXsnB2lZHb-nGbySnxv_9VHAyD0mKaLDowczFU,190
379
+ numpy/f2py/tests/test_abstract_interface.py,sha256=C8-ly0_TqkmpQNZmwPHwo2IV2MBH0jQEjAhpqHrg8Y4,832
380
+ numpy/f2py/tests/test_array_from_pyobj.py,sha256=Txff89VUeEhWqUCRVybIqsqH4YQvpk4Uyjmh_XjyMi0,24049
381
+ numpy/f2py/tests/test_assumed_shape.py,sha256=FeaqtrWyBf5uyArcmI0D2e_f763aSMpgU3QmdDXe-tA,1466
382
+ numpy/f2py/tests/test_block_docstring.py,sha256=SEpuq73T9oVtHhRVilFf1xF7nb683d4-Kv7V0kfL4AA,564
383
+ numpy/f2py/tests/test_callback.py,sha256=cReSlVjgnoT74wmtNn-oEIZiJUTfRX7ljjlqJi716IQ,6494
384
+ numpy/f2py/tests/test_character.py,sha256=3ugjM1liymMRbY8wub1eiap-jdyNYVHxlNZBqNoRLe4,21868
385
+ numpy/f2py/tests/test_common.py,sha256=m7TTSJt5zUZKJF-MQUeTtCyxW7YwRBSETINXGPFu8S4,896
386
+ numpy/f2py/tests/test_compile_function.py,sha256=9d_FZ8P2wbIlQ2qPDRrsFqPb4nMH8tiWqYZN-P_shCs,4186
387
+ numpy/f2py/tests/test_crackfortran.py,sha256=y1x3U-jlQWD5rmTXz1I2RlTz7LEfbI6qxCDkR5fzPwY,13441
388
+ numpy/f2py/tests/test_data.py,sha256=HFcmPYbiveKa-swJ8x8XlRR9sM0ESB9FEN-txZnHTok,2876
389
+ numpy/f2py/tests/test_docs.py,sha256=jqtuHE5ZjxP4D8Of3Fkzz36F8_0qKbeS040_m0ac4v4,1662
390
+ numpy/f2py/tests/test_f2cmap.py,sha256=p-Sylbr3ctdKT3UQV9FzpCuYPH5U7Vyn8weXFAjiI9o,391
391
+ numpy/f2py/tests/test_f2py2e.py,sha256=eoswH-daMEBlueoVpxXrDloahCpr0RLzHbr3zBHOsjk,25423
392
+ numpy/f2py/tests/test_isoc.py,sha256=_nPTPxNEEagiKriZBeFNesOattIlHDzaNKmj35xxDBY,1406
393
+ numpy/f2py/tests/test_kind.py,sha256=aOMQSBoD_dw49acKN25_abEvQBLI27DsnWIb9CNpSAE,1671
394
+ numpy/f2py/tests/test_mixed.py,sha256=Ctuw-H7DxhPjSt7wZdJ2xffawIoEBCPWc5F7PSkY4HY,848
395
+ numpy/f2py/tests/test_module_doc.py,sha256=sjCXWIKrqMD1NQ1DUAzgQqkjS5w9h9gvM_Lj29Rdcrg,863
396
+ numpy/f2py/tests/test_parameter.py,sha256=ADI7EV_CM4ztICpqHqeq8LI-WdB6cX0ttatdRdjbsUA,3941
397
+ numpy/f2py/tests/test_pyf_src.py,sha256=eD0bZu_GWfoCq--wWqEKRf-F2h5AwoTyO6GMA9wJPr4,1135
398
+ numpy/f2py/tests/test_quoted_character.py,sha256=cpjMdrHwimnkoJkXd_W_FSlh43oWytY5VHySW9oskO4,454
399
+ numpy/f2py/tests/test_regression.py,sha256=v_6RDQr6IcMmbCMElfzRSLPgZhHnH5l99uztrbJAzqE,2532
400
+ numpy/f2py/tests/test_return_character.py,sha256=18HJtiRwQ7a_2mdPUonD5forKWZJEapD-Vi1DsbTjVs,1493
401
+ numpy/f2py/tests/test_return_complex.py,sha256=BZIIqQ1abdiPLgVmu03_q37yCtND0ijxGSMhGz2Wf-o,2397
402
+ numpy/f2py/tests/test_return_integer.py,sha256=t--9UsdLF9flLTQv7a0KTSVoBuoDtTnmOG2QIFPINVc,1758
403
+ numpy/f2py/tests/test_return_logical.py,sha256=XCmp8E8I6BOeNYF59HjSFAdv1hM9WaDvl8UDS10_05o,2017
404
+ numpy/f2py/tests/test_return_real.py,sha256=ATek5AM7dCCPeIvoMOQIt5yFNFzKrFb1Kno8B4M0rn4,3235
405
+ numpy/f2py/tests/test_semicolon_split.py,sha256=_Mdsi84lES18pPjl9J-QsbGttV4tPFFjZvJvejNcqPc,1635
406
+ numpy/f2py/tests/test_size.py,sha256=q6YqQvcyqdXJeWbGijTiCbxyEG3EkPcvT8AlAW6RCMo,1164
407
+ numpy/f2py/tests/test_string.py,sha256=5xZOfdReoHnId0950XfmtfduPPfBbtMkzBoXMtygvMk,2962
408
+ numpy/f2py/tests/test_symbolic.py,sha256=28quk2kTKfWhKe56n4vINJ8G9weKBfc7HysMlE9J3_g,18341
409
+ numpy/f2py/tests/test_value_attrspec.py,sha256=rWwJBfE2qGzqilZZurJ-7ucNoJDICye6lLetQSLFees,323
410
+ numpy/f2py/tests/util.py,sha256=bEhG699c4bLVPR2WR8fV67avgX6kH5I74SicGb7Z7T4,11167
411
+ numpy/f2py/use_rules.py,sha256=3pTDOPur6gbPHPtwuMJPQvpnUMw39Law1KFSH0coB_0,3527
412
+ numpy/fft/__init__.py,sha256=HqjmF6s_dh0Ri4UZzUDtOKbNUyfAfJAWew3e3EL_KUk,8175
413
+ numpy/fft/__init__.pyi,sha256=vD9Xzz5r13caF4AVL87Y4U9KOj9ic25Vci_wb3dmgpk,550
414
+ numpy/fft/_pocketfft.py,sha256=Xkm8wcP4JyBNMbp0ZoHIWhNDlgliX24RzrDuo29uRks,52897
415
+ numpy/fft/_pocketfft.pyi,sha256=S6-ylUuHbgm8vNbh7tLru6K2R5SJzE81BC_Sllm6QrQ,2371
416
+ numpy/fft/_pocketfft_internal.cpython-312-x86_64-linux-gnu.so,sha256=ONIiSfNRsdUOkmnFloif_GOGOevBUMHnX8n1Wg8zGrU,97008
417
+ numpy/fft/helper.py,sha256=aNj1AcLvtfoX26RiLOwcR-k2QSMuBZkGj2Fu0CeFPJs,6154
418
+ numpy/fft/helper.pyi,sha256=NLTEjy2Gz1aAMDZwCgssIyUne0ubjJqukfYkpsL3gXM,1176
419
+ numpy/fft/tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
420
+ numpy/fft/tests/test_helper.py,sha256=whgeaQ8PzFf3B1wkbXobGZ5sF4WxPp4gf1UPUVZest8,6148
421
+ numpy/fft/tests/test_pocketfft.py,sha256=RdeCCvUQmJYVvccOJwToobTKDg9yzUL06o9MkPmRfmI,12895
422
+ numpy/lib/__init__.py,sha256=XMPNJkG_mQ__xuvbf0OcpotgMbA9owt10ZHYVnYHq8E,2713
423
+ numpy/lib/__init__.pyi,sha256=y5ANokFm7EkrlNoHdeQm1FsUhLFxkYtLuanCbsWrGio,5596
424
+ numpy/lib/_datasource.py,sha256=CDF3im6IxdY3Mu6fwRQmkSEBmXS3kQVInQ4plXsoX9c,22631
425
+ numpy/lib/_iotools.py,sha256=Yg9HCfPg4tbhbdgLPcxSMiZXq1xDprvJKLebLwhDszY,30868
426
+ numpy/lib/_version.py,sha256=6vK7czNSB_KrWx2rZJzJ1pyOc73Q07hAgfLB5ItUCnU,4855
427
+ numpy/lib/_version.pyi,sha256=B572hyWrUWG-TAAAXrNNAT4AgyUAmJ4lvgpwMkDzunk,633
428
+ numpy/lib/arraypad.py,sha256=bKP7ZS9NYFYzqSk8OnpFLFrMsua4m_hcqFsi7cGkrJE,31803
429
+ numpy/lib/arraypad.pyi,sha256=ADXphtAORYl3EqvE5qs_u32B_TALKSOtF43jOLmoxRw,1728
430
+ numpy/lib/arraysetops.py,sha256=GJ2RhkzIJmIbwyG6h3LOFTPXg62kM9tcV1a-7tdbVuU,33655
431
+ numpy/lib/arraysetops.pyi,sha256=6X-5l5Yss_9y10LYyIsDLbGX77vt7PtVLDqxOlSRPfY,8372
432
+ numpy/lib/arrayterator.py,sha256=BQ97S00zvfURUZfes0GZo-5hydYNRuvwX1I1bLzeRik,7063
433
+ numpy/lib/arrayterator.pyi,sha256=f7Pwp83_6DiMYmJGUsffncM-FRAynB1iYGvhmHM_SZE,1537
434
+ numpy/lib/format.py,sha256=T8qJMyG2DDVjjYNNpUvBgfA9tCo23IS0w9byRB6twwQ,34769
435
+ numpy/lib/format.pyi,sha256=YWBxC3GdsZ7SKBN8I7nMwWeVuFD1aT9d-VJ8zE4-P-o,748
436
+ numpy/lib/function_base.py,sha256=IhhgfSmYJE-dHoUOMXHPiGYXso-NdXPpLXF9y0gEA6I,189172
437
+ numpy/lib/function_base.pyi,sha256=KWaC5UOBANU4hiIoN2eptE4HYsm4vgp_8BMFV1Y3JX4,16585
438
+ numpy/lib/histograms.py,sha256=xsj_qpaZoI2Bv1FBpY8mIMPJrYRiuIBszn_6kO7YFRA,37778
439
+ numpy/lib/histograms.pyi,sha256=hNwR2xYWkgJCP-nfRGxc-EgHLTD3qm4zmWXthZLt08M,995
440
+ numpy/lib/index_tricks.py,sha256=4PEvXk6VFTkttMViYBVC4yDhyOiKIon6JpIm0d_CmNg,31346
441
+ numpy/lib/index_tricks.pyi,sha256=D2nkNXOB9Vea1PfMaTn94OGBGayjTaQ-bKMsjDmYpak,4251
442
+ numpy/lib/mixins.py,sha256=y6_MzQuiNjv-1EFVROqv2y2cAJi5X4rQYzbZCyUyXgw,7071
443
+ numpy/lib/mixins.pyi,sha256=h9N1kbZsUntF0zjOxPYeD_rCB2dMiG35TYYPl9ymkI4,3117
444
+ numpy/lib/nanfunctions.py,sha256=6EjzydZlugIzfiENKtC4ycZ2Nckt8ZQg5v6D6tX1SiU,65775
445
+ numpy/lib/nanfunctions.pyi,sha256=oPqAfCinmBL85Ji7ko4QlzAzLAK9nZL0t2_CllEbCEU,606
446
+ numpy/lib/npyio.py,sha256=NUjtFvAmPdTjwJQ-ia-xbCr849M_M6NilP5IHfkKaRg,97316
447
+ numpy/lib/npyio.pyi,sha256=SUFWJh90vWZCdd6GCSGbfYeXKlWut0XY_SHvZJc8yqY,9728
448
+ numpy/lib/polynomial.py,sha256=6Aw3_2vdbh4urERQ6NaPhf9a_T1o1o6cjm3fb5Z3_YE,44133
449
+ numpy/lib/polynomial.pyi,sha256=GerIpQnf5LdtFMOy9AxhOTqUyfn57k4MxqEYrfdckWE,6958
450
+ numpy/lib/recfunctions.py,sha256=-90AbWWvVFOqVUPLh9K9NYdKUHYIgSEyg2Y35MnOVUA,59423
451
+ numpy/lib/scimath.py,sha256=T4ITysZgqhY1J8IxyXCtioHjMTg2ci-4i3mr9TBF2UA,15037
452
+ numpy/lib/scimath.pyi,sha256=E2roKJzMFwWSyhLu8UPUr54WOpxF8jp_pyXYBgsUSQ8,2883
453
+ numpy/lib/setup.py,sha256=0K5NJKuvKvNEWp-EX7j0ODi3ZQQgIMHobzSFJq3G7yM,405
454
+ numpy/lib/shape_base.py,sha256=AhCO9DEyysE-P-QJF9ryUtJ1ghU4_0mORhAJ59poObU,38947
455
+ numpy/lib/shape_base.pyi,sha256=bGJhLA_RvUpVTiDFgCV-1rUjV8e1qCh0gK_3PLgXA_U,5341
456
+ numpy/lib/stride_tricks.py,sha256=brY5b-0YQJuIH2CavfpIinMolyTUv5k9DUvLoZ-imis,17911
457
+ numpy/lib/stride_tricks.pyi,sha256=0pQ4DP9l6g21q2Ajv6dJFRWMr9auPGTNV9BmZUbogPY,1747
458
+ numpy/lib/tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
459
+ numpy/lib/tests/data/py2-objarr.npy,sha256=F4cyUC-_TB9QSFLAo2c7c44rC6NUYIgrfGx9PqWPSKk,258
460
+ numpy/lib/tests/data/py2-objarr.npz,sha256=xo13HBT0FbFZ2qvZz0LWGDb3SuQASSaXh7rKfVcJjx4,366
461
+ numpy/lib/tests/data/py3-objarr.npy,sha256=pTTVh8ezp-lwAK3fkgvdKU8Arp5NMKznVD-M6Ex_uA0,341
462
+ numpy/lib/tests/data/py3-objarr.npz,sha256=qQR0gS57e9ta16d_vCQjaaKM74gPdlwCPkp55P-qrdw,449
463
+ numpy/lib/tests/data/python3.npy,sha256=X0ad3hAaLGXig9LtSHAo-BgOvLlFfPYMnZuVIxRmj-0,96
464
+ numpy/lib/tests/data/win64python2.npy,sha256=agOcgHVYFJrV-nrRJDbGnUnF4ZTPYXuSeF-Mtg7GMpc,96
465
+ numpy/lib/tests/test__datasource.py,sha256=65KXfUUvp8wXSqgQisuYlkhg-qHjBV5FXYetL8Ba-rc,10571
466
+ numpy/lib/tests/test__iotools.py,sha256=HerCqvDE07JxjFQlWEfpZO7lC9z0Sbr3z20GSutoCPs,13743
467
+ numpy/lib/tests/test__version.py,sha256=aO3YgkAohLsLzCNQ7vjIwdpFUMz0cPLbcuuxIkjuN74,1999
468
+ numpy/lib/tests/test_arraypad.py,sha256=obohHbyM0gPYPUkd7iJSOSiDqyqtJsjDNtQX68NC4lM,54830
469
+ numpy/lib/tests/test_arraysetops.py,sha256=5-T1MVhfIMivat8Z47GZw0ZaR811W_FskM1bAXnFyLU,35912
470
+ numpy/lib/tests/test_arrayterator.py,sha256=AYs2SwV5ankgwnvKI9RSO1jZck118nu3SyZ4ngzZNso,1291
471
+ numpy/lib/tests/test_financial_expired.py,sha256=yq5mqGMvqpkiiw9CuZhJgrYa7Squj1mXr_G-IvAFgwI,247
472
+ numpy/lib/tests/test_format.py,sha256=xV0oi1eoRnVwAAhSOcPFQHQWF7TfsROtDYShQLPtdaA,41028
473
+ numpy/lib/tests/test_function_base.py,sha256=DBKugIUEFTMP7g6iL1bk986E6ldCrcNdBCWOJbQla_Y,157830
474
+ numpy/lib/tests/test_histograms.py,sha256=16_XJp-eFgsuM8B4mDQpQ4w_Ib29Hg0EPO-WFsdaFWA,32815
475
+ numpy/lib/tests/test_index_tricks.py,sha256=Vjz25Y6H_ih0iEE2AG0kaxO9U8PwcXSrofzqnN4XBwI,20256
476
+ numpy/lib/tests/test_io.py,sha256=3Tow1pucrQ7z7osNN4a2grBYUoBGNkQEhjmCjXT6Vag,107891
477
+ numpy/lib/tests/test_loadtxt.py,sha256=gwcDJDJmLJRMLpg322yjQ1IzI505w9EqJoq4DmDPCdI,38560
478
+ numpy/lib/tests/test_mixins.py,sha256=Wivwz3XBWsEozGzrzsyyvL3qAuE14t1BHk2LPm9Z9Zc,7030
479
+ numpy/lib/tests/test_nanfunctions.py,sha256=01r_mmTCvKVdZuOGTEHNDZXrMS724us_jwZANzCd74A,47609
480
+ numpy/lib/tests/test_packbits.py,sha256=OWGAd5g5GG0gl7WHqNfwkZ7G-2rrtLt2sI854PG4nnw,17546
481
+ numpy/lib/tests/test_polynomial.py,sha256=URouxJpr8FQ5hiKybqhtOcLA7e-3hj4kWzjLBROByyA,11395
482
+ numpy/lib/tests/test_recfunctions.py,sha256=6jzouPEQ7Uhtj8_-W5yTI6ymNp2nLgmdHzxdd74jVuM,44001
483
+ numpy/lib/tests/test_regression.py,sha256=KzGFkhTcvEG97mymoOQ2hP2CEr2nPZou0Ztf4-WaXCs,8257
484
+ numpy/lib/tests/test_shape_base.py,sha256=2iQCEFR6evVpF8woaenxUOzooHkfuMYkBaUj8ecyJ-E,26817
485
+ numpy/lib/tests/test_stride_tricks.py,sha256=wprpWWH5eq07DY7rzG0WDv5fMtLxzRQz6fm6TZWlScQ,22849
486
+ numpy/lib/tests/test_twodim_base.py,sha256=ll-72RhqCItIPB97nOWhH7H292h4nVIX_w1toKTPMUg,18841
487
+ numpy/lib/tests/test_type_check.py,sha256=lxCH5aApWVYhhSoDQSLDTCHLVHuK2c-jBbnfnZUrOaA,15114
488
+ numpy/lib/tests/test_ufunclike.py,sha256=4hSnXGlSC8HE-_pRRMzD8-HI4hGHqsAWu1pD0o2kPI0,2982
489
+ numpy/lib/tests/test_utils.py,sha256=RVAxrzSFu6N3C4_jIgAlTDOWF_B7wr2v1Y20dX5upYM,6218
490
+ numpy/lib/twodim_base.py,sha256=Mvzn_PyShIb9m7nJjJ4IetdxwmLYEsCPHvJoK7n2viU,32947
491
+ numpy/lib/twodim_base.pyi,sha256=xFRcEVJdDj4mrXW_6iVP1lTMoJx4QJjYRD3o2_9f2eY,5370
492
+ numpy/lib/type_check.py,sha256=_EOtB296nFYlNT7ztBYoC_yK9aycIb0KTmRjvzVdZNg,19954
493
+ numpy/lib/type_check.pyi,sha256=LPvAvIxU-p5i_Qe-ic7hEvo4OTfSrNpplxMG7OAZe8Q,5571
494
+ numpy/lib/ufunclike.py,sha256=_ceBGbGCMOd3u_h2UVzyaRK6ZY7ryoJ0GJB7zqcJG3w,6325
495
+ numpy/lib/ufunclike.pyi,sha256=hLxcYfQprh1tTY_UO2QscA3Hd9Zd7cVGXIINZLhMFqY,1293
496
+ numpy/lib/user_array.py,sha256=LE958--CMkBI2r3l1SQxmCHdCSw6HY6-RhWCnduzGA4,7721
497
+ numpy/lib/utils.py,sha256=6NdleaELZiqARdj-ECZjxtwLf1bqklOcK43m9yoZefs,37804
498
+ numpy/lib/utils.pyi,sha256=mVHVzWuc2-M3Oz60lFsbok0v8LH_HRHMjZpXwrtzF_c,2360
499
+ numpy/linalg/__init__.py,sha256=mpdlEXWtTvpF7In776ONLwp6RIyo4U_GLPT1L1eIJnw,1813
500
+ numpy/linalg/__init__.pyi,sha256=XBy4ocuypsRVflw_mbSTUhR4N5Roemu6w5SfeVwbkAc,620
501
+ numpy/linalg/_umath_linalg.cpython-312-x86_64-linux-gnu.so,sha256=iCLnctdD1AWYPxucazS3BN0pd4CJDcJFRU8Qga31Ckw,216793
502
+ numpy/linalg/lapack_lite.cpython-312-x86_64-linux-gnu.so,sha256=UAZPuN2wY1u7YCi4990o-QwErZqxw_rd0RF8K7fcj_0,29849
503
+ numpy/linalg/linalg.py,sha256=kDVK1GBxbUjlRgxXCoEfkRJm8yrNr1Iu7hMn2rKK8RE,90923
504
+ numpy/linalg/linalg.pyi,sha256=zD9U5BUCB1uQggSxfZaTGX_uB2Hkp75sttGmZbCGgBI,7505
505
+ numpy/linalg/tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
506
+ numpy/linalg/tests/test_deprecations.py,sha256=9p_SRmtxj2zc1doY9Ie3dyy5JzWy-tCQWFoajcAJUmM,640
507
+ numpy/linalg/tests/test_linalg.py,sha256=rgvmK6Or70u8mN04puetL3FgSxZ8fJrOlI5ptTgCU5k,78085
508
+ numpy/linalg/tests/test_regression.py,sha256=qbugUmrENybkEaM1GhfA01RXQUy8AkzalbrfzSIgUmM,5434
509
+ numpy/ma/API_CHANGES.txt,sha256=F_4jW8X5cYBbzpcwteymkonTmvzgKKY2kGrHF1AtnrI,3405
510
+ numpy/ma/LICENSE,sha256=BfO4g1GYjs-tEKvpLAxQ5YdcZFLVAJoAhMwpFVH_zKY,1593
511
+ numpy/ma/README.rst,sha256=q-gCsZ4Cw_gUGGvEjog556sJUHIm8WTAwkFK5Qnz9XA,9872
512
+ numpy/ma/__init__.py,sha256=dgP0WdnOpph28Fd6UiqoyDKhfrct0H6QWqbCcETsk6M,1404
513
+ numpy/ma/__init__.pyi,sha256=ppCg_TS0POutNB3moJE4kBabWURnc0WGXyYPquXZxS4,6063
514
+ numpy/ma/core.py,sha256=4MglVRJtmQ9_iIVaQ2b-_Vmw1TjAhEsMJdtKOhyBFXQ,278213
515
+ numpy/ma/core.pyi,sha256=YfgyuBuKxZ5v4I2JxZDvCLhnztOCRgzTeDg-JGTon_M,14305
516
+ numpy/ma/extras.py,sha256=MC7QPS34PC4wxNbOp7pTy57dqF9B-L6L1KMI6rrfe2w,64383
517
+ numpy/ma/extras.pyi,sha256=BBsiCZbaPpGCY506fkmqZdBkJNCXcglc3wcSBuAACNk,2646
518
+ numpy/ma/mrecords.py,sha256=degd6dLaDEvEWNHmvSnUZXos1csIzaqjR_jAutm8JfI,27232
519
+ numpy/ma/mrecords.pyi,sha256=r1a2I662ywnhGS6zvfcyK-9RHVvb4sHxiCx9Dhf5AE4,1934
520
+ numpy/ma/setup.py,sha256=MqmMicr_xHkAGoG-T7NJ4YdUZIJLO4ZFp6AmEJDlyhw,418
521
+ numpy/ma/tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
522
+ numpy/ma/tests/test_core.py,sha256=xd5S3oa0jObo8jnsJk0-o46d-KNC3RtgNRKinJeY_kE,215100
523
+ numpy/ma/tests/test_deprecations.py,sha256=nq_wFVt2EBHcT3AHxattfKXx2JDf1K5D-QBzUU0_15A,2566
524
+ numpy/ma/tests/test_extras.py,sha256=lX4cbdGDEXaBHzA3q8hJxve4635XCJw4AP7FO7zhOfk,74858
525
+ numpy/ma/tests/test_mrecords.py,sha256=PsJhUlABgdpSsPUeijonfyFNqz5AfNSGQTtJUte7yts,19890
526
+ numpy/ma/tests/test_old_ma.py,sha256=h4BncexBcBigqvZMA6RjDjpHPurWtt99A7KTag2rmOs,32690
527
+ numpy/ma/tests/test_regression.py,sha256=foMpI0luAvwkkRpAfPDV_810h1URISXDZhmaNhxb50k,3287
528
+ numpy/ma/tests/test_subclassing.py,sha256=HeTIE_n1I8atwzF8tpvNtGHp-0dmM8PT8AS4IDWbcso,16967
529
+ numpy/ma/testutils.py,sha256=RQw0RyS7hOSVTk4KrCGleq0VHlnDqzwwaLtuZbRE4_I,10235
530
+ numpy/ma/timer_comparison.py,sha256=pIGSZG-qYYYlRWSTgzPlyCAINbGKhXrZrDZBBjiM080,15658
531
+ numpy/matlib.py,sha256=-54vTuGIgeTMg9ZUmElRPZ4Hr-XZ-om9xLzAsSoTvnc,10465
532
+ numpy/matrixlib/__init__.py,sha256=BHBpQKoQv4EjT0UpWBA-Ck4L5OsMqTI2IuY24p-ucXk,242
533
+ numpy/matrixlib/__init__.pyi,sha256=-t3ZuvbzRuRwWfZOeN4xlNWdm7gQEprhUsWzu8MRvUE,252
534
+ numpy/matrixlib/defmatrix.py,sha256=JXdJGm1LayOOXfKpp7OVZfb0pzzP4Lwh45sTJrleALc,30656
535
+ numpy/matrixlib/defmatrix.pyi,sha256=lmBMRahKcMOl2PHDo79J67VRAZOkI54BzfDaTLpE0LI,451
536
+ numpy/matrixlib/setup.py,sha256=1r7JRkSM4HyVorgtjoKJGWLcOcPO3wmvivpeEsVtAEg,426
537
+ numpy/matrixlib/tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
538
+ numpy/matrixlib/tests/test_defmatrix.py,sha256=8E_-y7VD2vsq1y8CcI8km37pp5qcAtkciO16xqf2UIs,14982
539
+ numpy/matrixlib/tests/test_interaction.py,sha256=PpjmgjEKighDXvt38labKE6L7f2jP74UEmp3JRb_iOY,11875
540
+ numpy/matrixlib/tests/test_masked_matrix.py,sha256=7YO_LCO8DOhW3CuXJuxH93rnmttfvHnU7El-MBzxzFw,8932
541
+ numpy/matrixlib/tests/test_matrix_linalg.py,sha256=ObbSUXU4R2pWajH__xAdizADrU2kBKDDCxkDV-oVBXc,2059
542
+ numpy/matrixlib/tests/test_multiarray.py,sha256=jB3XCBmAtcqf-Wb9PwBW6uIykPpMPthuXLJ0giTKzZE,554
543
+ numpy/matrixlib/tests/test_numeric.py,sha256=MP70qUwgshTtThKZaZDp7_6U-Z66NIV1geVhasGXejQ,441
544
+ numpy/matrixlib/tests/test_regression.py,sha256=8sHDtO8Zi8p3a1eQKEWxtCmKrXmHoD3qxlIokg2AIAU,927
545
+ numpy/polynomial/__init__.py,sha256=braLh6zP2QwuNKRKAaZGdC_qKWZ-tJlc3BN83LeuE_0,6781
546
+ numpy/polynomial/__init__.pyi,sha256=W8szYtVUy0RUi83jmFLK58BN8CKVSoHA2CW7IcdUl1c,701
547
+ numpy/polynomial/_polybase.py,sha256=YEnnQwlTgbn3dyD89ueraUx5nxx3x_pH6K6mmyEmhi8,39271
548
+ numpy/polynomial/_polybase.pyi,sha256=J7yU9PPZW4W8mkqAltDfnL4ZNwljuM-bDEj4DPTJZpY,2321
549
+ numpy/polynomial/chebyshev.py,sha256=NZCKjIblcX99foqZyp51i0_r8p0r1VKVGZFmQ1__kEk,62796
550
+ numpy/polynomial/chebyshev.pyi,sha256=035CNdOas4dnb6lFLzRiBrYT_VnWh2T1-A3ibm_HYkI,1387
551
+ numpy/polynomial/hermite.py,sha256=t5CFM-qE4tszYJiQZ301VcMn7IM67y2rUZPFPtnVRAc,52514
552
+ numpy/polynomial/hermite.pyi,sha256=hdsvTULow8bIjnATudf0i6brpLHV7vbOoHzaMvbjMy0,1217
553
+ numpy/polynomial/hermite_e.py,sha256=jRR3f8Oth8poV2Ix8c0eLEQR3UZary-2RupOrEAEUMY,52642
554
+ numpy/polynomial/hermite_e.pyi,sha256=zV7msb9v9rV0iv_rnD3SjP-TGyc6pd3maCqiPCj3PbA,1238
555
+ numpy/polynomial/laguerre.py,sha256=mcVw0ckWVX-kzJ1QIhdcuuxzPjuFmA3plQLkloQMOYM,50858
556
+ numpy/polynomial/laguerre.pyi,sha256=Gxc9SLISNKMWrKdsVJ9fKFFFwfxxZzfF-Yc-2r__z5M,1178
557
+ numpy/polynomial/legendre.py,sha256=wjtgFajmKEbYkSUk3vWSCveMHDP6UymK28bNUk4Ov0s,51550
558
+ numpy/polynomial/legendre.pyi,sha256=9dmANwkxf7EbOHV3XQBPoaDtc56cCkf75Wo7FG9Zfj4,1178
559
+ numpy/polynomial/polynomial.py,sha256=XsaZPHmLGJFqpJs7rPvO5E0loWQ1L3YHLIUybVu4dU8,49112
560
+ numpy/polynomial/polynomial.pyi,sha256=bOPRnub4xXxsUwNGeiQLTT4PCfN1ysSrf6LBZIcAN2Y,1132
561
+ numpy/polynomial/polyutils.py,sha256=Xy5qjdrjnRaqSlClG1ROmwWccLkAPC7IcHaNJLvhCf4,23237
562
+ numpy/polynomial/polyutils.pyi,sha256=cFAyZ9Xzuw8Huhn9FEz4bhyD00m2Dp-2DiUSyogJwSo,264
563
+ numpy/polynomial/setup.py,sha256=dXQfzVUMP9OcB6iKv5yo1GLEwFB3gJ48phIgo4N-eM0,373
564
+ numpy/polynomial/tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
565
+ numpy/polynomial/tests/test_chebyshev.py,sha256=6tMsFP1h7K8Zf72mNOta6Tv52_fVTlXknseuffj080c,20522
566
+ numpy/polynomial/tests/test_classes.py,sha256=DFyY2IQBj3r2GZkvbRIeZO2EEY466xbuwc4PShAl4Sw,18331
567
+ numpy/polynomial/tests/test_hermite.py,sha256=N9b2dx2UWPyja5v02dSoWYPnKvb6H-Ozgtrx-xjWz2k,18577
568
+ numpy/polynomial/tests/test_hermite_e.py,sha256=_A3ohAWS4HXrQG06S8L47dImdZGTwYosCXnoyw7L45o,18911
569
+ numpy/polynomial/tests/test_laguerre.py,sha256=BZOgs49VBXOFBepHopxuEDkIROHEvFBfWe4X73UZhn8,17511
570
+ numpy/polynomial/tests/test_legendre.py,sha256=b_bblHs0F_BWw9ESuSq52ZsLKcQKFR5eqPf_SppWFqo,18673
571
+ numpy/polynomial/tests/test_polynomial.py,sha256=4cuO8-5wdIxcz5CrucB5Ix7ySuMROokUF12F7ogQ_hc,20529
572
+ numpy/polynomial/tests/test_polyutils.py,sha256=IxkbVfpcBqe5lOZluHFUPbLATLu1rwVg7ghLASpfYrY,3579
573
+ numpy/polynomial/tests/test_printing.py,sha256=rfP4MaQbjGcO52faHmYrgsaarkm3Ndi3onwr6DDuapE,20525
574
+ numpy/polynomial/tests/test_symbol.py,sha256=msTPv7B1niaKujU33kuZmdxJvLYvOjfl1oykmlL0dXo,5371
575
+ numpy/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
576
+ numpy/random/LICENSE.md,sha256=EDFmtiuARDr7nrNIjgUuoGvgz_VmuQjxmeVh_eSa8Z8,3511
577
+ numpy/random/__init__.pxd,sha256=9JbnX540aJNSothGs-7e23ozhilG6U8tINOUEp08M_k,431
578
+ numpy/random/__init__.py,sha256=81Thnexg5umN5WZwD5TRyzNc2Yp-d14B6UC7NBgVKh8,7506
579
+ numpy/random/__init__.pyi,sha256=RfW8mco48UaWDL1UC5ROv9vXiFZ9EGho62avhgEAHPc,2143
580
+ numpy/random/_bounded_integers.cpython-312-x86_64-linux-gnu.so,sha256=s59-K0zP1pBK5g_hUX9r2ovng1tb9p1U3sDWk8Xot5M,348704
581
+ numpy/random/_bounded_integers.pxd,sha256=hcoucPH5hkFEM2nm12zYO-5O_Rt8RujEXT5YWuAzl1Q,1669
582
+ numpy/random/_common.cpython-312-x86_64-linux-gnu.so,sha256=q9iMqPRH8ixPUfImc000cylmuuYe3SqiX3S_7JVL7ig,258888
583
+ numpy/random/_common.pxd,sha256=s2_IdIQ0MhNbogamulvXe-b93wbx882onmYkxqswwpo,4939
584
+ numpy/random/_examples/cffi/extending.py,sha256=xSla3zWqxi6Hj48EvnYfD3WHfE189VvC4XsKu4_T_Iw,880
585
+ numpy/random/_examples/cffi/parse.py,sha256=Bnb7t_6S_c5-3dZrQ-XX9EazOKhftUfcCejXXWyd1EU,1771
586
+ numpy/random/_examples/cython/extending.pyx,sha256=4IE692pq1V53UhPZqQiQGcIHXDoNyqTx62x5a36puVg,2290
587
+ numpy/random/_examples/cython/extending_distributions.pyx,sha256=oazFVWeemfE0eDzax7r7MMHNL1_Yofws2m-c_KT2Hbo,3870
588
+ numpy/random/_examples/cython/meson.build,sha256=rXtugURMEo-ef4bPE1QIv4mzvWbeGjmcTdKCBvjxjtw,1443
589
+ numpy/random/_examples/numba/extending.py,sha256=Ipyzel_h5iU_DMJ_vnXUgQC38uMDMn7adUpWSeEQLFE,1957
590
+ numpy/random/_examples/numba/extending_distributions.py,sha256=Jnr9aWkHyIWygNbdae32GVURK-5T9BTGhuExRpvve98,2034
591
+ numpy/random/_generator.cpython-312-x86_64-linux-gnu.so,sha256=Wz7yrIt4qoO8hptw4w4qcPvTqzc8UlPtbrqZgqVf1-I,946872
592
+ numpy/random/_generator.pyi,sha256=zRvo_y6g0pWkE4fO1M9jLYUkxDfGdA6Enreb3U2AADM,22442
593
+ numpy/random/_mt19937.cpython-312-x86_64-linux-gnu.so,sha256=Nhn3-Rue5xl8KQLA4Zfmmy5d1F-xHNIuVy6bC4hlFKk,119488
594
+ numpy/random/_mt19937.pyi,sha256=_iZKaAmuKBQ4itSggfQvYYj_KjktcN4rt-YpE6bqFAM,724
595
+ numpy/random/_pcg64.cpython-312-x86_64-linux-gnu.so,sha256=V3wUaPT7QLsjGEND4sG2RaF9HUk2QeqSwFLyhtxutVY,125040
596
+ numpy/random/_pcg64.pyi,sha256=uxr5CbEJetN6lv9vBG21jlRhuzOK8SQnXrwqAQBxj_c,1091
597
+ numpy/random/_philox.cpython-312-x86_64-linux-gnu.so,sha256=LJsf5T7xGePtKstzyALPKZZQKw_VHUkm1AR1ds6ldRQ,106712
598
+ numpy/random/_philox.pyi,sha256=OKlaiIU-hj72Bp04zjNifwusOD_3-mYxIfvyuys8c_o,978
599
+ numpy/random/_pickle.py,sha256=4NhdT-yk7C0m3tyZWmouYAs3ZGNPdPVNGfUIyuh8HDY,2318
600
+ numpy/random/_sfc64.cpython-312-x86_64-linux-gnu.so,sha256=WIMwLOM6_VTbZjGtv14AApe460LA7IlLvsMYteaxQmg,76224
601
+ numpy/random/_sfc64.pyi,sha256=09afHTedVW-519493ZXtGcl-H-_zluj-B_yfEJG8MMs,709
602
+ numpy/random/bit_generator.cpython-312-x86_64-linux-gnu.so,sha256=h8XHMIh5Q8YDsWxTzyEzmDNu5BDvRllVGAk5d6_VsMs,234016
603
+ numpy/random/bit_generator.pxd,sha256=lArpIXSgTwVnJMYc4XX0NGxegXq3h_QsUDK6qeZKbNc,1007
604
+ numpy/random/bit_generator.pyi,sha256=aXv7a_hwa0nkjY8P2YENslwWp89UcFRn09woXh7Uoc0,3510
605
+ numpy/random/c_distributions.pxd,sha256=7DE-mV3H_Dihk4OK4gMHHkyD4tPX1cAi4570zi5CI30,6344
606
+ numpy/random/lib/libnpyrandom.a,sha256=xUcvOvieju5PThPQ8q0-uGJ5fjsCd5umnjIerIc85Sg,71926
607
+ numpy/random/mtrand.cpython-312-x86_64-linux-gnu.so,sha256=jYZrS2EHQBq5VGBEVkII4KWJnDkD2gknfRxIddmLzw8,749040
608
+ numpy/random/mtrand.pyi,sha256=3vAGOXsvyFFv0yZl34pVVPP7Dgt22COyfn4tUoi_hEQ,19753
609
+ numpy/random/tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
610
+ numpy/random/tests/data/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
611
+ numpy/random/tests/data/mt19937-testset-1.csv,sha256=Xkef402AVB-eZgYQkVtoxERHkxffCA9Jyt_oMbtJGwY,15844
612
+ numpy/random/tests/data/mt19937-testset-2.csv,sha256=nsBEQNnff-aFjHYK4thjvUK4xSXDSfv5aTbcE59pOkE,15825
613
+ numpy/random/tests/data/pcg64-testset-1.csv,sha256=xB00DpknGUTTCxDr9L6aNo9Hs-sfzEMbUSS4t11TTfE,23839
614
+ numpy/random/tests/data/pcg64-testset-2.csv,sha256=NTdzTKvG2U7_WyU_IoQUtMzU3kEvDH39CgnR6VzhTkw,23845
615
+ numpy/random/tests/data/pcg64dxsm-testset-1.csv,sha256=vNSUT-gXS_oEw_awR3O30ziVO4seNPUv1UIZ01SfVnI,23833
616
+ numpy/random/tests/data/pcg64dxsm-testset-2.csv,sha256=uylS8PU2AIKZ185OC04RBr_OePweGRtvn-dE4YN0yYA,23839
617
+ numpy/random/tests/data/philox-testset-1.csv,sha256=SedRaIy5zFadmk71nKrGxCFZ6BwKz8g1A9-OZp3IkkY,23852
618
+ numpy/random/tests/data/philox-testset-2.csv,sha256=dWECt-sbfvaSiK8-Ygp5AqyjoN5i26VEOrXqg01rk3g,23838
619
+ numpy/random/tests/data/sfc64-testset-1.csv,sha256=iHs6iX6KR8bxGwKk-3tedAdMPz6ZW8slDSUECkAqC8Q,23840
620
+ numpy/random/tests/data/sfc64-testset-2.csv,sha256=FIDIDFCaPZfWUSxsJMAe58hPNmMrU27kCd9FhCEYt_k,23833
621
+ numpy/random/tests/test_direct.py,sha256=6vLpCyeKnAWFEZei7l2YihVLQ0rSewO1hJBWt7A5fyQ,17779
622
+ numpy/random/tests/test_extending.py,sha256=S3Wrzu3di4uBhr-Pxnx5dOPvlBY0FRdZqVX6CC1IN6s,4038
623
+ numpy/random/tests/test_generator_mt19937.py,sha256=35LBwV6TtWPnxhefutxTQmhLzAQ5Ee4YiY8ziDXM-eQ,115477
624
+ numpy/random/tests/test_generator_mt19937_regressions.py,sha256=xGkdz76BMX1EK0QPfabVxpNx9qQ9OC-1ZStWOs6N_M8,6387
625
+ numpy/random/tests/test_random.py,sha256=kEkQs3i7zcpm9MozIRIz1FIx5B6fmXk0QqX0l6l-u_Y,70087
626
+ numpy/random/tests/test_randomstate.py,sha256=DxF7rMUSxaAlL4h1qC3onHcHR7T_6rKWPbr0nJH84nE,85031
627
+ numpy/random/tests/test_randomstate_regression.py,sha256=VucYWIjA7sAquWsalvZMnfkmYLM1O6ysyWnLl931-lA,7917
628
+ numpy/random/tests/test_regression.py,sha256=trntK51UvajOVELiluEO85l64CKSw5nvBSc5SqYyr9w,5439
629
+ numpy/random/tests/test_seed_sequence.py,sha256=GNRJ4jyzrtfolOND3gUWamnbvK6-b_p1bBK_RIG0sfU,3311
630
+ numpy/random/tests/test_smoke.py,sha256=jjNz0aEGD1_oQl9a9UWt6Mz_298alG7KryLT1pgHljw,28183
631
+ numpy/testing/__init__.py,sha256=InpVKoDAzMKO_l_HNcatziW_u1k9_JZze__t2nybrL0,595
632
+ numpy/testing/__init__.pyi,sha256=AhK5NuOpdD-JjIzXOlssE8_iSLyFAAHzyGV_w1BT7vA,1674
633
+ numpy/testing/_private/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
634
+ numpy/testing/_private/extbuild.py,sha256=nG2dwP4nUmQS3e5eIRinxt0s_f4sxxA1YfohCg-navo,8017
635
+ numpy/testing/_private/utils.py,sha256=3FrSTMi0OdpDODBDoncgiDQzdo5NKA6YVfQ3uKRSQnc,85242
636
+ numpy/testing/_private/utils.pyi,sha256=MMNrvwEeSTYzZFWawSSzHnTFYG-cSAIiID-1FuJ1f8U,10123
637
+ numpy/testing/overrides.py,sha256=u6fcKSBC8HIzMPWKAbdyowU71h2Fx2ekDQxpG5NhIr8,2123
638
+ numpy/testing/print_coercion_tables.py,sha256=ndxOsS4XfrZ4UY_9nqRTCnxhkzgdqcuUHL8nezd7Op4,6180
639
+ numpy/testing/setup.py,sha256=GPKAtTTBRsNW4kmR7NjP6mmBR_GTdpaTvkTm10_VcLg,709
640
+ numpy/testing/tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
641
+ numpy/testing/tests/test_utils.py,sha256=IDOr-GXuNGlrsb-XzGSYUHXEqcGYJ78p60jOpBqyPM4,55740
642
+ numpy/tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
643
+ numpy/tests/test__all__.py,sha256=L3mCnYPTpzAgNfedVuq9g7xPWbc0c1Pot94k9jZ9NpI,221
644
+ numpy/tests/test_ctypeslib.py,sha256=B06QKeFRgDIEbkEPBy_zYA1H5E2exuhTi7IDkzV8gfo,12257
645
+ numpy/tests/test_lazyloading.py,sha256=YETrYiDLAqLX04K_u5_3NVxAfxDoeguxwkIRfz6qKcY,1162
646
+ numpy/tests/test_matlib.py,sha256=gwhIXrJJo9DiecaGLCHLJBjhx2nVGl6yHq80AOUQSRM,1852
647
+ numpy/tests/test_numpy_config.py,sha256=qHvepgi9oyAbQuZD06k7hpcCC2MYhdzcY6D1iQDPNMI,1241
648
+ numpy/tests/test_numpy_version.py,sha256=A8cXFzp4k-p6J5zkOxlDfDvkoFMxDW2hpTFVXcaQRVo,1479
649
+ numpy/tests/test_public_api.py,sha256=DTq7SO84uBjC2tKPoqX17xazc-SLkTAbQ2fLZwGM2jc,18170
650
+ numpy/tests/test_reloading.py,sha256=QuVaPQulcNLg4Fl31Lw-O89L42KclYCK68n5GVy0PNQ,2354
651
+ numpy/tests/test_scripts.py,sha256=jluCLfG94VM1cuX-5RcLFBli_yaJZpIvmVuMxRKRJrc,1645
652
+ numpy/tests/test_warnings.py,sha256=ZEtXqHI1iyeVeLfVxDcMfN5qw67Ti2u54709hvBG4eY,2284
653
+ numpy/typing/__init__.py,sha256=VoTILNDrUWvZx0LK9_97lBLQFKtSGmDt4QLOH8zYvlo,5234
654
+ numpy/typing/mypy_plugin.py,sha256=24zVk4Ei3qH4Hc3SSz3v0XtIsycTo8HKoY6ilhB_7AQ,6376
655
+ numpy/typing/setup.py,sha256=Cnz9q53w-vJNyE6vYxqYvQXx0pJbrG9quHyz9sqxfek,374
656
+ numpy/typing/tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
657
+ numpy/typing/tests/data/fail/arithmetic.pyi,sha256=4rY_ASCERAl8WCus1RakOe0Aw-8vvjilL29mgdD4lv0,3850
658
+ numpy/typing/tests/data/fail/array_constructors.pyi,sha256=X9y_jUYS17WfYmXW5NwkVudyiR6ouUaAwEh0JRte42o,1089
659
+ numpy/typing/tests/data/fail/array_like.pyi,sha256=OVAlEJZ5k8ZRKt0aGpZQwIjlUGpy0PzOOYqfI-IMqBQ,455
660
+ numpy/typing/tests/data/fail/array_pad.pyi,sha256=57oK0Yp53rtKjjIrRFYLcxa-IfIGhtI-bEem7ggJKwI,132
661
+ numpy/typing/tests/data/fail/arrayprint.pyi,sha256=-Fs9VnQfxyfak008Hq8kJWfB0snA6jGDXZz8ljQnwGE,549
662
+ numpy/typing/tests/data/fail/arrayterator.pyi,sha256=FoU4ahHkJZ67dwWXer5FXLjjjesKKg-w2Jq1X1bHymA,480
663
+ numpy/typing/tests/data/fail/bitwise_ops.pyi,sha256=GN9dVqk4_HFXn7zbRrHzJq_UGRFBccoYVUG1UuE7bXs,515
664
+ numpy/typing/tests/data/fail/char.pyi,sha256=-vgN6EmfQ8VaA4SOZ5Ol9u4-Z7Q5I7G78LmaxZOuZ90,2615
665
+ numpy/typing/tests/data/fail/chararray.pyi,sha256=jrNryZFpr8nxG2IHb9e0x3ranpvJpBy_RDex-WpT5rU,2296
666
+ numpy/typing/tests/data/fail/comparisons.pyi,sha256=U4neWzwwtxG6QXsKlNGJuKXHBtwzYBQOa47_7SKF5Wg,888
667
+ numpy/typing/tests/data/fail/constants.pyi,sha256=YSqNbXdhbdMmYbs7ntH0FCKbnm8IFeqsDlZBqcU43iw,286
668
+ numpy/typing/tests/data/fail/datasource.pyi,sha256=PRT2hixR-mVxr2UILvHa99Dr54EF2h3snJXE-v3rWcc,395
669
+ numpy/typing/tests/data/fail/dtype.pyi,sha256=OAGABqdXNB8gClJFEGMckoycuZcIasMaAlS2RkiKROI,334
670
+ numpy/typing/tests/data/fail/einsumfunc.pyi,sha256=RS7GZqUCT_vEFJoyUx4gZlPO8GNFFNFWidxl-wLyRv0,539
671
+ numpy/typing/tests/data/fail/false_positives.pyi,sha256=Q61qMsSsNCtmO0EMRxHj5Z7RYTyrELVpkzfJY5eK8Z0,366
672
+ numpy/typing/tests/data/fail/flatiter.pyi,sha256=qLM4qm7gvJtEZ0rTHcyasUzoP5JbX4FREtqV3g1w6Lo,843
673
+ numpy/typing/tests/data/fail/fromnumeric.pyi,sha256=FH2mjkgtCbA9soqlJRhYN7IIfRRrUL1i9mwqcbYKZSc,5591
674
+ numpy/typing/tests/data/fail/histograms.pyi,sha256=yAPVt0rYTwtxnigoGT-u7hhKCE9iYxsXc24x2HGBrmA,367
675
+ numpy/typing/tests/data/fail/index_tricks.pyi,sha256=moINir9iQoi6Q1ZuVg5BuSB9hSBtbg_uzv-Qm_lLYZk,509
676
+ numpy/typing/tests/data/fail/lib_function_base.pyi,sha256=6y9T773CBLX-jUry1sCQGVuKVKM2wMuQ56Ni5V5j4Dw,2081
677
+ numpy/typing/tests/data/fail/lib_polynomial.pyi,sha256=Ur7Y4iZX6WmoH5SDm0ePi8C8LPsuPs2Yr7g7P5O613g,899
678
+ numpy/typing/tests/data/fail/lib_utils.pyi,sha256=VFpE6_DisvlDByyp1PiNPJEe5IcZp8cH0FlAJyoZipo,276
679
+ numpy/typing/tests/data/fail/lib_version.pyi,sha256=7-ZJDZwDcB-wzpMN8TeYtZAgaqc7xnQ8Dnx2ISiX2Ts,158
680
+ numpy/typing/tests/data/fail/linalg.pyi,sha256=yDd05aK1dI37RPt3pD2eJYo4dZFaT2yB1PEu3K0y9Tg,1322
681
+ numpy/typing/tests/data/fail/memmap.pyi,sha256=HSTCQYNuW1Y6X1Woj361pN4rusSPs4oDCXywqk20yUo,159
682
+ numpy/typing/tests/data/fail/modules.pyi,sha256=_ek4zKcdP-sIh_f-IDY0tP-RbLORKCSWelM9AOYxsyA,670
683
+ numpy/typing/tests/data/fail/multiarray.pyi,sha256=XCdBxufNhR8ZtG8UMzk8nt9_NC5gJTKP9-xTqKO_K9I,1693
684
+ numpy/typing/tests/data/fail/ndarray.pyi,sha256=YnjXy16RHs_esKelMjB07865CQ7gLyQnXhnitq5Kv5c,405
685
+ numpy/typing/tests/data/fail/ndarray_misc.pyi,sha256=w-10xTDDWoff9Lq0dBO-jBeiBR-XjCz2qmes0dLx238,1372
686
+ numpy/typing/tests/data/fail/nditer.pyi,sha256=w7emjnOxnf3NcvLktNLlke6Cuivn2gU3sVmGCfbG6rw,325
687
+ numpy/typing/tests/data/fail/nested_sequence.pyi,sha256=em4GZwLDFE0QSxxg081wVwhh-Dmtkn8f7wThI0DiXVs,427
688
+ numpy/typing/tests/data/fail/npyio.pyi,sha256=56QuHo9SvVR3Uhzl6gQZncCpX575Gy5wugjMICh20m0,620
689
+ numpy/typing/tests/data/fail/numerictypes.pyi,sha256=fevH9x80CafYkiyBJ7LMLVl6GyTvQrZ34trBu6O8TtM,276
690
+ numpy/typing/tests/data/fail/random.pyi,sha256=p5WsUGyOL-MGIeALh9Y0dVhYSRQLaUwMdjXc3G6C_7Q,2830
691
+ numpy/typing/tests/data/fail/rec.pyi,sha256=Ws3TyesnoQjt7Q0wwtpShRDJmZCs2jjP17buFMomVGA,704
692
+ numpy/typing/tests/data/fail/scalars.pyi,sha256=o91BwSfzPTczYVtbXsirqQUoUoYP1C_msGjc2GYsV04,2952
693
+ numpy/typing/tests/data/fail/shape_base.pyi,sha256=Y_f4buHtX2Q2ZA4kaDTyR8LErlPXTzCB_-jBoScGh_Q,152
694
+ numpy/typing/tests/data/fail/stride_tricks.pyi,sha256=IjA0Xrnx0lG3m07d1Hjbhtyo1Te5cXgjgr5fLUo4LYQ,315
695
+ numpy/typing/tests/data/fail/testing.pyi,sha256=e7b5GKTWCtKGoB8z2a8edsW0Xjl1rMheALsvzEJjlCw,1370
696
+ numpy/typing/tests/data/fail/twodim_base.pyi,sha256=ZqbRJfy5S_pW3fFLuomy4L5SBNqj6Nklexg9KDTo65c,899
697
+ numpy/typing/tests/data/fail/type_check.pyi,sha256=CIyI0j0Buxv0QgCvNG2urjaKpoIZ-ZNawC2m6NzGlbo,379
698
+ numpy/typing/tests/data/fail/ufunc_config.pyi,sha256=ukA0xwfJHLoGfoOIpWIN-91wj-DG8oaIjYbO72ymjg4,733
699
+ numpy/typing/tests/data/fail/ufunclike.pyi,sha256=lbxjJyfARmt_QK1HxhxFxvwQTqCEZwJ9I53Wp8X3KIY,679
700
+ numpy/typing/tests/data/fail/ufuncs.pyi,sha256=YaDTL7QLmGSUxE6JVMzpOlZTjHWrgbOo0UIlkX-6ZQk,1347
701
+ numpy/typing/tests/data/fail/warnings_and_errors.pyi,sha256=PrbYDFI7IGN3Gf0OPBkVfefzQs4AXHwDQ495pvrX3RY,174
702
+ numpy/typing/tests/data/misc/extended_precision.pyi,sha256=bS8bBeCFqjgtOiy-8_y39wfa7rwhdjLz2Vmo-RXAYD4,884
703
+ numpy/typing/tests/data/mypy.ini,sha256=Ynv1VSx_kXTD2mFC3ZpgEFuCOg1F2VJXxPk0dxUnF2M,108
704
+ numpy/typing/tests/data/pass/arithmetic.py,sha256=2z3dmuysQQmiPz8x0bg8SOOKW62mVJn97uMa9T0L7Vk,7455
705
+ numpy/typing/tests/data/pass/array_constructors.py,sha256=3GrhfBcmWX53pJHD0NvhXjwr2-uNKREbR1I9WCcZ7rI,2419
706
+ numpy/typing/tests/data/pass/array_like.py,sha256=ce_IVubBd7J6FkSpJmD7qMlRLuwmiidhOqhYfZb16Wo,916
707
+ numpy/typing/tests/data/pass/arrayprint.py,sha256=y_KkuLz1uM7pv53qfq7GQOuud4LoXE3apK1wtARdVyM,766
708
+ numpy/typing/tests/data/pass/arrayterator.py,sha256=FqcpKdUQBQ0FazHFxr9MsLEZG-jnJVGKWZX2owRr4DQ,393
709
+ numpy/typing/tests/data/pass/bitwise_ops.py,sha256=UnmxVr9HwI8ifdrutGm_u3EZU4iOOPQhrOku7hTaH0c,970
710
+ numpy/typing/tests/data/pass/comparisons.py,sha256=nTE-fvraLK6xTZcP4uPV02wOShzYKWDaoapx35AeDOY,2992
711
+ numpy/typing/tests/data/pass/dtype.py,sha256=MqDKC6Ywv6jNkWsR8rdLuabzHUco5w1OylDHEdxve_I,1069
712
+ numpy/typing/tests/data/pass/einsumfunc.py,sha256=eXj5L5MWPtQHgrHPsJ36qqrmBHqct9UoujjJCvHnF1k,1370
713
+ numpy/typing/tests/data/pass/flatiter.py,sha256=0BnbuLMBC7MQlprNZ0QhNSscfYwPhEhXOhWoyiRACWU,174
714
+ numpy/typing/tests/data/pass/fromnumeric.py,sha256=Xd_nJVVDoONdztUX8ddgo7EXJ2FD8AX51MO_Yujnmog,3742
715
+ numpy/typing/tests/data/pass/index_tricks.py,sha256=oaFD9vY01_RI5OkrXt-xTk1n_dd-SpuPp-eZ58XR3c8,1492
716
+ numpy/typing/tests/data/pass/lib_utils.py,sha256=sDQCjHVGUwct0RQqAtH5_16y241siSY4bXKZRsuJ8xA,434
717
+ numpy/typing/tests/data/pass/lib_version.py,sha256=HnuGOx7tQA_bcxFIJ3dRoMAR0fockxg4lGqQ4g7LGIw,299
718
+ numpy/typing/tests/data/pass/literal.py,sha256=DLzdWHD6ttW4S0NEvGQbsH_UEJjhZyhvO4OXJjoyvZQ,1331
719
+ numpy/typing/tests/data/pass/mod.py,sha256=HB9aK4_wGJbc44tomaoroNy0foIL5cI9KIjknvMTbkk,1578
720
+ numpy/typing/tests/data/pass/modules.py,sha256=t0KJxYWbrWd7HbbgIDFb3LAhJBiNNb6QPjjFDAgC2mU,576
721
+ numpy/typing/tests/data/pass/multiarray.py,sha256=MxHax6l94yqlTVZleAqG77ILEbW6wU5osPcHzxJ85ns,1331
722
+ numpy/typing/tests/data/pass/ndarray_conversion.py,sha256=yPgzXG6paY1uF_z-QyHYrcmrZvhX7qtvTUh7ANLseCA,1626
723
+ numpy/typing/tests/data/pass/ndarray_misc.py,sha256=z3mucbn9fLM1gxmbUhWlp2lcrOv4zFjqZFze0caE2EA,2715
724
+ numpy/typing/tests/data/pass/ndarray_shape_manipulation.py,sha256=37eYwMNqMLwanIW9-63hrokacnSz2K_qtPUlkdpsTjo,640
725
+ numpy/typing/tests/data/pass/numeric.py,sha256=SdnsD5zv0wm8T2hnIylyS14ig2McSz6rG9YslckbNQ4,1490
726
+ numpy/typing/tests/data/pass/numerictypes.py,sha256=r0_s-a0-H2MdWIn4U4P6W9RQO0V1xrDusgodHNZeIYM,750
727
+ numpy/typing/tests/data/pass/random.py,sha256=uJCnzlsOn9hr_G1TpHLdsweJI4EdhUSEQ4dxROPjqAs,61881
728
+ numpy/typing/tests/data/pass/scalars.py,sha256=En0adCZAwEigZrzdQ0JQwDEmrS0b-DMd1vvjkFcvwo8,3479
729
+ numpy/typing/tests/data/pass/simple.py,sha256=HmAfCOdZBWQF211YaZFrIGisMgu5FzTELApKny08n3Y,2676
730
+ numpy/typing/tests/data/pass/simple_py3.py,sha256=HuLrc5aphThQkLjU2_19KgGFaXwKOfSzXe0p2xMm8ZI,96
731
+ numpy/typing/tests/data/pass/ufunc_config.py,sha256=_M8v-QWAeT1-2MkfSeAbNl_ZwyPvYfPTsLl6c1X8d_w,1204
732
+ numpy/typing/tests/data/pass/ufunclike.py,sha256=Gve6cJ2AT3TAwOjUOQQDIUnqsRCGYq70_tv_sgODiiA,1039
733
+ numpy/typing/tests/data/pass/ufuncs.py,sha256=xGuKuqPetUTS4io5YDHaki5nbYRu-wC29SGU32tzVIg,462
734
+ numpy/typing/tests/data/pass/warnings_and_errors.py,sha256=Pcg-QWfY4PAhTKyehae8q6LhtbUABxa2Ye63-3h1f4w,150
735
+ numpy/typing/tests/data/reveal/arithmetic.pyi,sha256=Ndmi_IFAl8z28RHsYTbOouf-B5FH91x_9ky-JwsdXVg,19765
736
+ numpy/typing/tests/data/reveal/array_constructors.pyi,sha256=DcT8Z2rEpqYfjXySBejk8cGOUidUmizZGE5ZEy7r14E,10600
737
+ numpy/typing/tests/data/reveal/arraypad.pyi,sha256=Q1pcU4B3eRsw5jsv-S0MsEfNUbp_4aMdO_o3n0rtA2A,776
738
+ numpy/typing/tests/data/reveal/arrayprint.pyi,sha256=YyzzkL-wj4Rs-fdo3brpoaWtb5g3yk4Vn2HKu5KRo4w,876
739
+ numpy/typing/tests/data/reveal/arraysetops.pyi,sha256=ApCFQcZzQ08zV32SJ86Xyv_7jazl3XKMmJmULtNquJ8,4155
740
+ numpy/typing/tests/data/reveal/arrayterator.pyi,sha256=TF_1eneHoT0v9HqS9dKc5Xiv3iY3E330GR1RNcJ7s2Q,1111
741
+ numpy/typing/tests/data/reveal/bitwise_ops.pyi,sha256=nRkyUGrBB_Es7TKyDxS_s3u2dFgBfzjocInI9Ea-J10,3919
742
+ numpy/typing/tests/data/reveal/char.pyi,sha256=M_iTa9Pn8F7jQ1k6RN9KvbhEn00g7UYJZ5PV57ikcZM,7289
743
+ numpy/typing/tests/data/reveal/chararray.pyi,sha256=O0EfwnKc3W1Fnx1c7Yotb1O84kVMuqJLlMBXd2duvjI,6093
744
+ numpy/typing/tests/data/reveal/comparisons.pyi,sha256=huaf-seaF5ndTqfoaBfPtMMkOYovq7ibJl5-CRoQW7s,7468
745
+ numpy/typing/tests/data/reveal/constants.pyi,sha256=P9vFEMkPpJ5KeUnzqPOuyHlh3zAFl9lzB4WxyB2od7A,1949
746
+ numpy/typing/tests/data/reveal/ctypeslib.pyi,sha256=-Pk2rLEGCzz3B_y8Mu10JSVA8gPFztl5fV1dspPzqig,4727
747
+ numpy/typing/tests/data/reveal/datasource.pyi,sha256=e8wjn60tO5EdnkBF34JrZT5XvdyW7kRWD2abtgr6qUg,671
748
+ numpy/typing/tests/data/reveal/dtype.pyi,sha256=TKrYyxMu5IGobs0SDTIRcPuWsZ5X7zMYB4pmUlTTJxA,2872
749
+ numpy/typing/tests/data/reveal/einsumfunc.pyi,sha256=pbtSfzIWUJRkDpe2riHBlvFlNSC3CqVM-SbYtBgX9H0,2044
750
+ numpy/typing/tests/data/reveal/emath.pyi,sha256=-muNpWOv_niIn-zS3gUnFO4qBZAouNlVGue2x1L5Ris,2423
751
+ numpy/typing/tests/data/reveal/false_positives.pyi,sha256=AplTmZV7TS7nivU8vegbstMN5MdMv4U0JJdZ4IeeA5M,482
752
+ numpy/typing/tests/data/reveal/fft.pyi,sha256=ReQ9qn5frvJEy-g0RWpUGlPBntUS1cFSIu6WfPotHzE,1749
753
+ numpy/typing/tests/data/reveal/flatiter.pyi,sha256=e1OQsVxQpgyfqMNw2puUTATl-w3swvdknlctAiWxf_E,882
754
+ numpy/typing/tests/data/reveal/fromnumeric.pyi,sha256=PNtGQR1VmGk_xNbd0eP7k7B2oNCMBz2XOJ17-_SdE5M,12101
755
+ numpy/typing/tests/data/reveal/getlimits.pyi,sha256=nUGOMFpWj3pMgqLy6ZbR7A4G2q7iLIl5zEFBGf-Qcfw,1592
756
+ numpy/typing/tests/data/reveal/histograms.pyi,sha256=MxKWoa7UoJRRLim53H6OoyYfz87P3_9YUXGYPTknGVQ,1303
757
+ numpy/typing/tests/data/reveal/index_tricks.pyi,sha256=HpD7lU7hcyDoLdZbeqskPXnX7KYwPtll7uJKYUzrlE8,3177
758
+ numpy/typing/tests/data/reveal/lib_function_base.pyi,sha256=eSiSZUlmPXqVPKknM7GcEv76BDgj0IJRu3FXcZXpmqc,8318
759
+ numpy/typing/tests/data/reveal/lib_polynomial.pyi,sha256=TOzOdMPDqveDv3vDKSjtq6RRvN-j_s2J7aud2ySDAB0,5986
760
+ numpy/typing/tests/data/reveal/lib_utils.pyi,sha256=_zj7WGYGYMFXAHLK-F11aeFfDvjRvFARUjoXhbXn8V0,1049
761
+ numpy/typing/tests/data/reveal/lib_version.pyi,sha256=UCioUeykot8-nWL6goKxZnKZxtgB4lFEi9wdN_xyF1U,672
762
+ numpy/typing/tests/data/reveal/linalg.pyi,sha256=LPaY-RyYL7Xt3djCgNaWEgI8beI9Eo_XnvOwi6Y7-eo,4877
763
+ numpy/typing/tests/data/reveal/matrix.pyi,sha256=ciJXsn5v2O1IZ3VEn5Ilp8-40NTQokfrOOgVXMFsvLo,2922
764
+ numpy/typing/tests/data/reveal/memmap.pyi,sha256=A5PovMzjRp2zslF1vw3TdTQjj4Y0dIEJ__HDBV_svGM,842
765
+ numpy/typing/tests/data/reveal/mod.pyi,sha256=-CNWft2jQGSdrO8dYRgwbl7OhL3a78Zo60JVmiY-gQI,5666
766
+ numpy/typing/tests/data/reveal/modules.pyi,sha256=0WPq7A-aqWkJsV-IA1_7dFNCcxBacj1AWExaXbXErG4,1958
767
+ numpy/typing/tests/data/reveal/multiarray.pyi,sha256=6MvfNKihK-oN6QwG9HFNelgheo4lnL0FCrmIF_qxdoA,5326
768
+ numpy/typing/tests/data/reveal/nbit_base_example.pyi,sha256=DRUMGatQvQXTuovKEMF4dzazIU6it6FU53LkOEo2vNo,657
769
+ numpy/typing/tests/data/reveal/ndarray_conversion.pyi,sha256=BfjQD8U756l4gOfY0LD47HhDRxbq0yCFfEFKvbXs7Rs,1791
770
+ numpy/typing/tests/data/reveal/ndarray_misc.pyi,sha256=0EN-a47Msn4pZgKVdD-GrXCCmt-oxjlov5rszchBmOI,7126
771
+ numpy/typing/tests/data/reveal/ndarray_shape_manipulation.pyi,sha256=QDQ9g6l-e73pTJp-Dosiynb-okbqi91D4KirjhIjcv4,1233
772
+ numpy/typing/tests/data/reveal/nditer.pyi,sha256=VFXnT75BgWSUpb-dD-q5cZkfeOqsk-x9cH626g9FWT4,2021
773
+ numpy/typing/tests/data/reveal/nested_sequence.pyi,sha256=IQyRlXduk-ZEakOtoliMLCqNgGbeg0mzZf-a-a3Gq_0,734
774
+ numpy/typing/tests/data/reveal/npyio.pyi,sha256=YXagt2J-1suu5WXZ_si5NuJf7sHj_7NlaSLqQkam1Po,4209
775
+ numpy/typing/tests/data/reveal/numeric.pyi,sha256=aJKnav-X45tjSFfgGD4iCetwEFcJXdNgU7valktjiCg,6160
776
+ numpy/typing/tests/data/reveal/numerictypes.pyi,sha256=-YQRhwjBjsFJHjpGCRqzafNnKDdsmbBHbmPwccP0pLI,2487
777
+ numpy/typing/tests/data/reveal/random.pyi,sha256=s6T074ZIpGAUqHnA-yAlozTLvt7PNBjCBqd-nGMqWGg,104091
778
+ numpy/typing/tests/data/reveal/rec.pyi,sha256=DbRVk6lc7-3qPe-7Q26tUWpdaH9B4UVoQSYrRGJUo1Q,3858
779
+ numpy/typing/tests/data/reveal/scalars.pyi,sha256=Qn3B3rsqSN397Jh25xs4odt2pfCQtWkoJe-e0-oX8d4,4790
780
+ numpy/typing/tests/data/reveal/shape_base.pyi,sha256=YjiVukrK6OOydvopOaOmeAIIa0YQ2hn9_I_-FyYkHVU,2427
781
+ numpy/typing/tests/data/reveal/stride_tricks.pyi,sha256=EBZR8gSP385nhotwJ3GH9DOUD2q5nUEYbXfhLo5xrPo,1542
782
+ numpy/typing/tests/data/reveal/testing.pyi,sha256=_WOAj_t5SWYiqN0KG26Mza8RvaD3WAa7rFUlgksjLms,8611
783
+ numpy/typing/tests/data/reveal/twodim_base.pyi,sha256=ZdNVo2HIJcx8iF9PA-z5W3Bs0hWM2nlVdbhLuAQlljM,3132
784
+ numpy/typing/tests/data/reveal/type_check.pyi,sha256=yZSp50TtvPqv_PN7zmVcNOVUTUXMNYFGcguMNj25E9Y,3044
785
+ numpy/typing/tests/data/reveal/ufunc_config.pyi,sha256=buwSvat3SVFAFl5k8TL6Mgpi32o6hHZYZ2Lpn6AHdEU,1327
786
+ numpy/typing/tests/data/reveal/ufunclike.pyi,sha256=V_gLcZVrTXJ21VkUMwA0HyxUgA1r6OzjsdJegaKL2GE,1329
787
+ numpy/typing/tests/data/reveal/ufuncs.pyi,sha256=VnwYr5KT_FLKfc0wV7dtNz7bNtaC9VIQt-oz56Hb5EE,2798
788
+ numpy/typing/tests/data/reveal/warnings_and_errors.pyi,sha256=ImMlPt2PQBtX8Qf1EZFmLjNWm8fPE6IWQ_deaq_-85s,538
789
+ numpy/typing/tests/test_isfile.py,sha256=BhKZs4-LrhFUfKjcG0yelySjE6ZITMxGIBYWGDHMRb8,864
790
+ numpy/typing/tests/test_runtime.py,sha256=2qu8JEliITnZCBJ_QJpohacj_OQ08o73ixS2w2ooNXI,3275
791
+ numpy/typing/tests/test_typing.py,sha256=Da1ZOFjtPh_Mvb5whpI-okBJdgLOAfJtJNyG6leGFoQ,8743
792
+ numpy/version.py,sha256=OTLnSh0NGfWyL8VrnIj0Ndt_KZOTl1Z-kD9Cf-jRMmY,216
LTA_openwebtext_dualt/mini_owt_logdirichlet/.venv_qwen35_uv/lib/python3.12/site-packages/numpy-1.26.4.dist-info/REQUESTED ADDED
File without changes