JinghuiLuAstronaut commited on
Commit
bd3a803
·
verified ·
1 Parent(s): 9566c86

Add files using upload-large-folder tool

Browse files
Files changed (20) hide show
  1. LTA_openwebtext_dualt/logs/ar_lm1b_flmpack_bert_small_len128_gbs512_4gpu_1m_rowshard_b64_resume4000_20260504_203021.nohup.out +0 -0
  2. LTA_openwebtext_dualt/logs/lta_owt_c1024_gpt2_cached_chunks_len1024_fast10k_4gpu_b16_100step.log +149 -0
  3. LTA_openwebtext_dualt/logs/scalinglaw_4gpu_20260513/scalinglaw_ctx1024_exact10_vocab50257_small384x6_4gpu_2000step.log +212 -0
  4. LTA_openwebtext_dualt/logs/scalinglaw_4gpu_20260513/scalinglaw_samples2_192x3_c512_vocab50257_4gpu_3000step.log +66 -0
  5. LTA_openwebtext_dualt/logs/scalinglaw_4gpu_20260513/trace_ctx256_small384x6_step500.log +1 -0
  6. LTA_openwebtext_dualt/logs/scalinglaw_4gpu_20260513/trace_ctx512_small384x6_step500.log +3 -0
  7. LTA_openwebtext_dualt/logs/scalinglaw_4gpu_20260513/trace_params512x8_c512_vocab50257_step750.log +3 -0
  8. LTA_openwebtext_dualt/mini_owt_logdirichlet/.venv_qwen35_uv/lib/python3.12/site-packages/numpy/core/include/numpy/__multiarray_api.c +314 -0
  9. LTA_openwebtext_dualt/mini_owt_logdirichlet/.venv_qwen35_uv/lib/python3.12/site-packages/numpy/core/include/numpy/experimental_dtype_api.h +365 -0
  10. LTA_openwebtext_dualt/mini_owt_logdirichlet/.venv_qwen35_uv/lib/python3.12/site-packages/numpy/core/include/numpy/halffloat.h +70 -0
  11. LTA_openwebtext_dualt/mini_owt_logdirichlet/.venv_qwen35_uv/lib/python3.12/site-packages/numpy/core/include/numpy/npy_cpu.h +129 -0
  12. LTA_openwebtext_dualt/mini_owt_logdirichlet/.venv_qwen35_uv/lib/python3.12/site-packages/numpy/core/include/numpy/npy_interrupt.h +56 -0
  13. LTA_openwebtext_dualt/mini_owt_logdirichlet/.venv_qwen35_uv/lib/python3.12/site-packages/numpy/core/include/numpy/npy_no_deprecated_api.h +20 -0
  14. LTA_openwebtext_dualt/mini_owt_logdirichlet/.venv_qwen35_uv/lib/python3.12/site-packages/numpy/core/include/numpy/utils.h +37 -0
  15. LTA_openwebtext_dualt/scripts/flowtext_score_decode_lab.py +129 -0
  16. LTA_openwebtext_dualt/scripts/launch_lta_owt_from_lm1b_c1024_4gpu.sh +85 -0
  17. LTA_openwebtext_dualt/scripts/launch_lta_wmt14_deen_fullycoupled_4gpu_smoke.sh +113 -0
  18. LTA_openwebtext_dualt/scripts/run_lta_owt_bert_absrope_time4_dirichlet_len1024_C1_to_1024_8gpu_1m_mask1_sameT_save10k.sh +77 -0
  19. LTA_openwebtext_dualt/scripts/tmp_run_three_quick_infer_20260525.sh +130 -0
  20. LTA_openwebtext_dualt/scripts/trace_lta_decode_steps.py +129 -0
LTA_openwebtext_dualt/logs/ar_lm1b_flmpack_bert_small_len128_gbs512_4gpu_1m_rowshard_b64_resume4000_20260504_203021.nohup.out ADDED
The diff for this file is too large to render. See raw diff
 
LTA_openwebtext_dualt/logs/lta_owt_c1024_gpt2_cached_chunks_len1024_fast10k_4gpu_b16_100step.log ADDED
@@ -0,0 +1,149 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ *****************************************
3
+ Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed.
4
+ *****************************************
5
+ [rank0]:[W512 16:41:13.244392390 ProcessGroupNCCL.cpp:4571] [PG ID 0 PG GUID 0 Rank 0] using GPU 0 to perform barrier as devices used by this process are currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. Specify device_ids in barrier() to force use of a particular device, or call init_process_group() with a device_id.
6
+ NCCL version 2.25.1+cuda12.8
7
+ [rank1]:[W512 16:41:14.936907560 ProcessGroupNCCL.cpp:4571] [PG ID 0 PG GUID 0 Rank 1] using GPU 1 to perform barrier as devices used by this process are currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. Specify device_ids in barrier() to force use of a particular device, or call init_process_group() with a device_id.
8
+ [rank3]:[W512 16:41:14.612056126 ProcessGroupNCCL.cpp:4571] [PG ID 0 PG GUID 0 Rank 3] using GPU 3 to perform barrier as devices used by this process are currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. Specify device_ids in barrier() to force use of a particular device, or call init_process_group() with a device_id.
9
+ [rank2]:[W512 16:41:15.111442637 ProcessGroupNCCL.cpp:4571] [PG ID 0 PG GUID 0 Rank 2] using GPU 2 to perform barrier as devices used by this process are currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. Specify device_ids in barrier() to force use of a particular device, or call init_process_group() with a device_id.
10
+ {
11
+ "device": "cuda:0",
12
+ "rank": 0,
13
+ "world_size": 4,
14
+ "samples": "owt_cached_chunks:10904",
15
+ "vocab_size": 50257,
16
+ "save_dir": "runs/lta_owt_c1024_gpt2_cached_chunks_len1024_fast10k_4gpu_b16_100step",
17
+ "batch_size": 16,
18
+ "grad_accum": 8,
19
+ "effective_batch_size": 512,
20
+ "global_batch_size": 512,
21
+ "lr_schedule": "constant_warmup",
22
+ "warmup_steps": 20,
23
+ "adam_beta1": 0.9,
24
+ "adam_beta2": 0.999,
25
+ "adam_eps": 1e-08,
26
+ "model_type": "ddit",
27
+ "dual_t": true,
28
+ "corrupt_t_mode": "same",
29
+ "corrupt_min_t": 0.0,
30
+ "corrupt_max_t": 1.0,
31
+ "dirichlet_endpoint_mode": "categorical_dual_t",
32
+ "dirichlet_semantic_t_mode": "same",
33
+ "dirichlet_semantic_t_value": 0.0,
34
+ "categorical_wrong_from_full_vocab": true,
35
+ "simplex_bridge_sampler": "dirichlet",
36
+ "logistic_normal_sigma_min": 0.18,
37
+ "logistic_normal_sigma_max": 2.2,
38
+ "logistic_normal_tau_min": 0.65,
39
+ "logistic_normal_tau_max": 1.15,
40
+ "torch_compile": false,
41
+ "compile_mode": "max-autotune",
42
+ "state_format": "prob",
43
+ "target_loss": "hard_ce",
44
+ "meanflow_weight": 0.0,
45
+ "bridge_noise_init": "logistic_normal",
46
+ "noise_sigma": -1.0,
47
+ "wrap": true,
48
+ "wrap_mode": "stream",
49
+ "wrap_record_buffer_size": 200,
50
+ "owt_cached_chunks": true,
51
+ "owt_chunk_cache_dir": "/e2e-data/evad-tech-vla/wanghan58/data/small_benchmarks/langflow_2604_11748/openwebtext_lta_cached_chunks/gpt2_len1024_train_minus_100k_fast10k",
52
+ "owt_chunk_cache_rebuild": false,
53
+ "owt_chunk_cache_write_batch": 4096,
54
+ "online_chunk_shuffle": false,
55
+ "online_chunk_shuffle_buffer": 10000,
56
+ "openwebtext_split": "train_minus_100k",
57
+ "detokenizer": "auto",
58
+ "resolved_detokenizer": null,
59
+ "num_workers": 0,
60
+ "latest_every": 25,
61
+ "resume_path": ""
62
+ }
63
+ step=5 micro_steps=40 elapsed=50.1s lr=9.000000e-05 loss_all=10.7950 acc_all=0.5424 loss_corrupt=10.8013 acc_corrupt=0.3710 corrupt_frac=0.5505 loss=10.8013 loss_recon=10.8013 loss_meanflow=0.0000 mean_model_t=0.5037 mean_corrupt_t=0.5037 mean_loss_t_weight=1.0000 prior_center_loss_beta=0.0000 wrong_frac=0.4924 init_acc_corrupt=0.4756 init_gold_top10=0.5024 init_gold_top100=0.5300
64
+ step=10 micro_steps=80 elapsed=48.8s lr=1.650000e-04 loss_all=10.5876 acc_all=0.5773 loss_corrupt=10.6249 acc_corrupt=0.3641 corrupt_frac=0.5710 loss=10.6249 loss_recon=10.6249 loss_meanflow=0.0000 mean_model_t=0.4988 mean_corrupt_t=0.4988 mean_loss_t_weight=1.0000 prior_center_loss_beta=0.0000 wrong_frac=0.5066 init_acc_corrupt=0.4603 init_gold_top10=0.4881 init_gold_top100=0.5178
65
+ step=15 micro_steps=120 elapsed=48.5s lr=2.400000e-04 loss_all=10.0529 acc_all=0.1321 loss_corrupt=10.0839 acc_corrupt=0.0846 corrupt_frac=0.5482 loss=10.0839 loss_recon=10.0839 loss_meanflow=0.0000 mean_model_t=0.4826 mean_corrupt_t=0.4826 mean_loss_t_weight=1.0000 prior_center_loss_beta=0.0000 wrong_frac=0.5143 init_acc_corrupt=0.4517 init_gold_top10=0.4797 init_gold_top100=0.5107
66
+ step=20 micro_steps=160 elapsed=46.3s lr=3.000000e-04 loss_all=9.1611 acc_all=0.2368 loss_corrupt=9.2121 acc_corrupt=0.1448 corrupt_frac=0.5527 loss=9.2121 loss_recon=9.2121 loss_meanflow=0.0000 mean_model_t=0.4898 mean_corrupt_t=0.4898 mean_loss_t_weight=1.0000 prior_center_loss_beta=0.0000 wrong_frac=0.5102 init_acc_corrupt=0.4514 init_gold_top10=0.4836 init_gold_top100=0.5166
67
+ step=25 micro_steps=200 elapsed=43.5s lr=3.000000e-04 loss_all=8.2892 acc_all=0.1852 loss_corrupt=8.3501 acc_corrupt=0.1275 corrupt_frac=0.5519 loss=8.3501 loss_recon=8.3501 loss_meanflow=0.0000 mean_model_t=0.5035 mean_corrupt_t=0.5035 mean_loss_t_weight=1.0000 prior_center_loss_beta=0.0000 wrong_frac=0.5017 init_acc_corrupt=0.4630 init_gold_top10=0.4920 init_gold_top100=0.5225
68
+ step=30 micro_steps=240 elapsed=58.5s lr=3.000000e-04 loss_all=7.5538 acc_all=0.2846 loss_corrupt=7.7125 acc_corrupt=0.1894 corrupt_frac=0.5483 loss=7.7125 loss_recon=7.7125 loss_meanflow=0.0000 mean_model_t=0.4809 mean_corrupt_t=0.4809 mean_loss_t_weight=1.0000 prior_center_loss_beta=0.0000 wrong_frac=0.5216 init_acc_corrupt=0.4416 init_gold_top10=0.4725 init_gold_top100=0.5025
69
+ step=35 micro_steps=280 elapsed=44.9s lr=3.000000e-04 loss_all=6.9044 acc_all=0.3164 loss_corrupt=7.2563 acc_corrupt=0.2266 corrupt_frac=0.5421 loss=7.2563 loss_recon=7.2563 loss_meanflow=0.0000 mean_model_t=0.5272 mean_corrupt_t=0.5272 mean_loss_t_weight=1.0000 prior_center_loss_beta=0.0000 wrong_frac=0.4720 init_acc_corrupt=0.4966 init_gold_top10=0.5229 init_gold_top100=0.5512
70
+ step=40 micro_steps=320 elapsed=44.0s lr=3.000000e-04 loss_all=6.4465 acc_all=0.2756 loss_corrupt=6.9584 acc_corrupt=0.1858 corrupt_frac=0.5546 loss=6.9584 loss_recon=6.9584 loss_meanflow=0.0000 mean_model_t=0.4862 mean_corrupt_t=0.4862 mean_loss_t_weight=1.0000 prior_center_loss_beta=0.0000 wrong_frac=0.5271 init_acc_corrupt=0.4348 init_gold_top10=0.4664 init_gold_top100=0.5004
71
+ step=45 micro_steps=360 elapsed=42.7s lr=3.000000e-04 loss_all=5.9251 acc_all=0.2724 loss_corrupt=6.5543 acc_corrupt=0.1895 corrupt_frac=0.5743 loss=6.5543 loss_recon=6.5543 loss_meanflow=0.0000 mean_model_t=0.4939 mean_corrupt_t=0.4939 mean_loss_t_weight=1.0000 prior_center_loss_beta=0.0000 wrong_frac=0.5111 init_acc_corrupt=0.4545 init_gold_top10=0.4826 init_gold_top100=0.5159
72
+
73
+ *****************************************
74
+ Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed.
75
+ *****************************************
76
+ [rank0]:[W512 16:53:29.911882582 ProcessGroupNCCL.cpp:4571] [PG ID 0 PG GUID 0 Rank 0] using GPU 0 to perform barrier as devices used by this process are currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. Specify device_ids in barrier() to force use of a particular device, or call init_process_group() with a device_id.
77
+ NCCL version 2.25.1+cuda12.8
78
+ [rank3]:[W512 16:53:30.911647198 ProcessGroupNCCL.cpp:4571] [PG ID 0 PG GUID 0 Rank 3] using GPU 3 to perform barrier as devices used by this process are currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. Specify device_ids in barrier() to force use of a particular device, or call init_process_group() with a device_id.
79
+ [rank1]:[W512 16:53:30.554297191 ProcessGroupNCCL.cpp:4571] [PG ID 0 PG GUID 0 Rank 1] using GPU 1 to perform barrier as devices used by this process are currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. Specify device_ids in barrier() to force use of a particular device, or call init_process_group() with a device_id.
80
+ [rank2]:[W512 16:53:30.591215668 ProcessGroupNCCL.cpp:4571] [PG ID 0 PG GUID 0 Rank 2] using GPU 2 to perform barrier as devices used by this process are currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. Specify device_ids in barrier() to force use of a particular device, or call init_process_group() with a device_id.
81
+ resumed_from=runs/lta_owt_c1024_gpt2_cached_chunks_len1024_fast10k_4gpu_b16_100step/latest.pt start_step=26
82
+ {
83
+ "device": "cuda:0",
84
+ "rank": 0,
85
+ "world_size": 4,
86
+ "samples": "owt_cached_chunks:10904",
87
+ "vocab_size": 50257,
88
+ "save_dir": "runs/lta_owt_c1024_gpt2_cached_chunks_len1024_fast10k_4gpu_b16_100step",
89
+ "batch_size": 16,
90
+ "grad_accum": 8,
91
+ "effective_batch_size": 512,
92
+ "global_batch_size": 512,
93
+ "lr_schedule": "constant_warmup",
94
+ "warmup_steps": 20,
95
+ "adam_beta1": 0.9,
96
+ "adam_beta2": 0.999,
97
+ "adam_eps": 1e-08,
98
+ "model_type": "ddit",
99
+ "dual_t": true,
100
+ "corrupt_t_mode": "same",
101
+ "corrupt_min_t": 0.0,
102
+ "corrupt_max_t": 1.0,
103
+ "dirichlet_endpoint_mode": "categorical_dual_t",
104
+ "dirichlet_semantic_t_mode": "same",
105
+ "dirichlet_semantic_t_value": 0.0,
106
+ "categorical_wrong_from_full_vocab": true,
107
+ "simplex_bridge_sampler": "dirichlet",
108
+ "logistic_normal_sigma_min": 0.18,
109
+ "logistic_normal_sigma_max": 2.2,
110
+ "logistic_normal_tau_min": 0.65,
111
+ "logistic_normal_tau_max": 1.15,
112
+ "torch_compile": false,
113
+ "compile_mode": "max-autotune",
114
+ "state_format": "prob",
115
+ "target_loss": "hard_ce",
116
+ "meanflow_weight": 0.0,
117
+ "bridge_noise_init": "logistic_normal",
118
+ "noise_sigma": -1.0,
119
+ "wrap": true,
120
+ "wrap_mode": "stream",
121
+ "wrap_record_buffer_size": 200,
122
+ "owt_cached_chunks": true,
123
+ "owt_chunk_cache_dir": "/e2e-data/evad-tech-vla/wanghan58/data/small_benchmarks/langflow_2604_11748/openwebtext_lta_cached_chunks/gpt2_len1024_train_minus_100k_fast10k",
124
+ "owt_chunk_cache_rebuild": false,
125
+ "owt_chunk_cache_write_batch": 4096,
126
+ "online_chunk_shuffle": false,
127
+ "online_chunk_shuffle_buffer": 10000,
128
+ "openwebtext_split": "train_minus_100k",
129
+ "detokenizer": "auto",
130
+ "resolved_detokenizer": null,
131
+ "num_workers": 0,
132
+ "latest_every": 25,
133
+ "resume_path": "runs/lta_owt_c1024_gpt2_cached_chunks_len1024_fast10k_4gpu_b16_100step/latest.pt"
134
+ }
135
+ step=30 micro_steps=240 elapsed=49.2s lr=3.000000e-04 loss_all=7.5368 acc_all=0.2895 loss_corrupt=7.6733 acc_corrupt=0.2021 corrupt_frac=0.5505 loss=7.6733 loss_recon=7.6733 loss_meanflow=0.0000 mean_model_t=0.5037 mean_corrupt_t=0.5037 mean_loss_t_weight=1.0000 prior_center_loss_beta=0.0000 wrong_frac=0.4924 init_acc_corrupt=0.4756 init_gold_top10=0.5024 init_gold_top100=0.5300
136
+ step=35 micro_steps=280 elapsed=48.2s lr=3.000000e-04 loss_all=7.0131 acc_all=0.2880 loss_corrupt=7.3238 acc_corrupt=0.1995 corrupt_frac=0.5710 loss=7.3238 loss_recon=7.3238 loss_meanflow=0.0000 mean_model_t=0.4988 mean_corrupt_t=0.4988 mean_loss_t_weight=1.0000 prior_center_loss_beta=0.0000 wrong_frac=0.5066 init_acc_corrupt=0.4603 init_gold_top10=0.4881 init_gold_top100=0.5178
137
+ step=40 micro_steps=320 elapsed=43.2s lr=3.000000e-04 loss_all=6.4932 acc_all=0.2757 loss_corrupt=6.9707 acc_corrupt=0.1886 corrupt_frac=0.5482 loss=6.9707 loss_recon=6.9707 loss_meanflow=0.0000 mean_model_t=0.4826 mean_corrupt_t=0.4826 mean_loss_t_weight=1.0000 prior_center_loss_beta=0.0000 wrong_frac=0.5143 init_acc_corrupt=0.4517 init_gold_top10=0.4797 init_gold_top100=0.5107
138
+ step=45 micro_steps=360 elapsed=42.4s lr=3.000000e-04 loss_all=6.0021 acc_all=0.2643 loss_corrupt=6.6188 acc_corrupt=0.1833 corrupt_frac=0.5527 loss=6.6188 loss_recon=6.6188 loss_meanflow=0.0000 mean_model_t=0.4898 mean_corrupt_t=0.4898 mean_loss_t_weight=1.0000 prior_center_loss_beta=0.0000 wrong_frac=0.5102 init_acc_corrupt=0.4514 init_gold_top10=0.4836 init_gold_top100=0.5166
139
+ step=50 micro_steps=400 elapsed=43.3s lr=3.000000e-04 loss_all=5.3270 acc_all=0.3213 loss_corrupt=6.1520 acc_corrupt=0.2234 corrupt_frac=0.5519 loss=6.1520 loss_recon=6.1520 loss_meanflow=0.0000 mean_model_t=0.5035 mean_corrupt_t=0.5035 mean_loss_t_weight=1.0000 prior_center_loss_beta=0.0000 wrong_frac=0.5017 init_acc_corrupt=0.4630 init_gold_top10=0.4920 init_gold_top100=0.5225
140
+ step=55 micro_steps=440 elapsed=73.6s lr=3.000000e-04 loss_all=4.5452 acc_all=0.4781 loss_corrupt=5.7044 acc_corrupt=0.3135 corrupt_frac=0.5483 loss=5.7044 loss_recon=5.7044 loss_meanflow=0.0000 mean_model_t=0.4809 mean_corrupt_t=0.4809 mean_loss_t_weight=1.0000 prior_center_loss_beta=0.0000 wrong_frac=0.5216 init_acc_corrupt=0.4416 init_gold_top10=0.4725 init_gold_top100=0.5025
141
+ step=60 micro_steps=480 elapsed=48.1s lr=3.000000e-04 loss_all=3.5387 acc_all=0.6034 loss_corrupt=4.8525 acc_corrupt=0.4246 corrupt_frac=0.5421 loss=4.8525 loss_recon=4.8525 loss_meanflow=0.0000 mean_model_t=0.5272 mean_corrupt_t=0.5272 mean_loss_t_weight=1.0000 prior_center_loss_beta=0.0000 wrong_frac=0.4720 init_acc_corrupt=0.4966 init_gold_top10=0.5229 init_gold_top100=0.5512
142
+ step=65 micro_steps=520 elapsed=45.9s lr=3.000000e-04 loss_all=3.3057 acc_all=0.6112 loss_corrupt=4.9579 acc_corrupt=0.4032 corrupt_frac=0.5546 loss=4.9579 loss_recon=4.9579 loss_meanflow=0.0000 mean_model_t=0.4862 mean_corrupt_t=0.4862 mean_loss_t_weight=1.0000 prior_center_loss_beta=0.0000 wrong_frac=0.5271 init_acc_corrupt=0.4348 init_gold_top10=0.4664 init_gold_top100=0.5004
143
+ step=70 micro_steps=560 elapsed=42.2s lr=3.000000e-04 loss_all=3.1512 acc_all=0.6235 loss_corrupt=4.7593 acc_corrupt=0.4248 corrupt_frac=0.5743 loss=4.7593 loss_recon=4.7593 loss_meanflow=0.0000 mean_model_t=0.4939 mean_corrupt_t=0.4939 mean_loss_t_weight=1.0000 prior_center_loss_beta=0.0000 wrong_frac=0.5111 init_acc_corrupt=0.4545 init_gold_top10=0.4826 init_gold_top100=0.5159
144
+ step=75 micro_steps=600 elapsed=42.2s lr=3.000000e-04 loss_all=2.9748 acc_all=0.6482 loss_corrupt=4.6855 acc_corrupt=0.4381 corrupt_frac=0.5472 loss=4.6855 loss_recon=4.6855 loss_meanflow=0.0000 mean_model_t=0.4975 mean_corrupt_t=0.4975 mean_loss_t_weight=1.0000 prior_center_loss_beta=0.0000 wrong_frac=0.4982 init_acc_corrupt=0.4679 init_gold_top10=0.4956 init_gold_top100=0.5266
145
+ step=80 micro_steps=640 elapsed=68.9s lr=3.000000e-04 loss_all=2.8523 acc_all=0.6580 loss_corrupt=4.6641 acc_corrupt=0.4379 corrupt_frac=0.5418 loss=4.6641 loss_recon=4.6641 loss_meanflow=0.0000 mean_model_t=0.4888 mean_corrupt_t=0.4888 mean_loss_t_weight=1.0000 prior_center_loss_beta=0.0000 wrong_frac=0.5130 init_acc_corrupt=0.4518 init_gold_top10=0.4813 init_gold_top100=0.5123
146
+ step=85 micro_steps=680 elapsed=43.6s lr=3.000000e-04 loss_all=2.8406 acc_all=0.6570 loss_corrupt=4.4957 acc_corrupt=0.4543 corrupt_frac=0.5720 loss=4.4957 loss_recon=4.4957 loss_meanflow=0.0000 mean_model_t=0.4943 mean_corrupt_t=0.4943 mean_loss_t_weight=1.0000 prior_center_loss_beta=0.0000 wrong_frac=0.4969 init_acc_corrupt=0.4674 init_gold_top10=0.4971 init_gold_top100=0.5276
147
+ step=90 micro_steps=720 elapsed=41.1s lr=3.000000e-04 loss_all=2.7984 acc_all=0.6605 loss_corrupt=4.5917 acc_corrupt=0.4384 corrupt_frac=0.5533 loss=4.5917 loss_recon=4.5917 loss_meanflow=0.0000 mean_model_t=0.4842 mean_corrupt_t=0.4842 mean_loss_t_weight=1.0000 prior_center_loss_beta=0.0000 wrong_frac=0.5198 init_acc_corrupt=0.4432 init_gold_top10=0.4741 init_gold_top100=0.5064
148
+ step=95 micro_steps=760 elapsed=45.6s lr=3.000000e-04 loss_all=2.5898 acc_all=0.6871 loss_corrupt=4.3891 acc_corrupt=0.4635 corrupt_frac=0.5377 loss=4.3891 loss_recon=4.3891 loss_meanflow=0.0000 mean_model_t=0.4992 mean_corrupt_t=0.4992 mean_loss_t_weight=1.0000 prior_center_loss_beta=0.0000 wrong_frac=0.4987 init_acc_corrupt=0.4669 init_gold_top10=0.4958 init_gold_top100=0.5251
149
+ step=100 micro_steps=800 elapsed=46.1s lr=3.000000e-04 loss_all=2.5553 acc_all=0.6918 loss_corrupt=4.3386 acc_corrupt=0.4699 corrupt_frac=0.5449 loss=4.3386 loss_recon=4.3386 loss_meanflow=0.0000 mean_model_t=0.5043 mean_corrupt_t=0.5043 mean_loss_t_weight=1.0000 prior_center_loss_beta=0.0000 wrong_frac=0.4947 init_acc_corrupt=0.4710 init_gold_top10=0.5004 init_gold_top100=0.5283
LTA_openwebtext_dualt/logs/scalinglaw_4gpu_20260513/scalinglaw_ctx1024_exact10_vocab50257_small384x6_4gpu_2000step.log ADDED
@@ -0,0 +1,212 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ *****************************************
3
+ Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed.
4
+ *****************************************
5
+ [rank0]: Traceback (most recent call last):
6
+ [rank0]: File "/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/train.py", line 1000, in <module>
7
+ [rank0]: main()
8
+ [rank0]: File "/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/train.py", line 615, in main
9
+ [rank0]: raise ValueError("--owt_cached_chunks requires --wrap --wrap_mode stream")
10
+ [rank0]: ValueError: --owt_cached_chunks requires --wrap --wrap_mode stream
11
+ [rank0]:[W513 01:44:17.923645362 ProcessGroupNCCL.cpp:1487] Warning: WARNING: destroy_process_group() was not called before program exit, which can leak resources. For more info, please see https://pytorch.org/docs/stable/distributed.html#shutdown (function operator())
12
+ [rank2]: Traceback (most recent call last):
13
+ [rank2]: File "/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/train.py", line 1000, in <module>
14
+ [rank2]: main()
15
+ [rank2]: File "/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/train.py", line 615, in main
16
+ [rank2]: raise ValueError("--owt_cached_chunks requires --wrap --wrap_mode stream")
17
+ [rank2]: ValueError: --owt_cached_chunks requires --wrap --wrap_mode stream
18
+ [rank1]: Traceback (most recent call last):
19
+ [rank1]: File "/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/train.py", line 1000, in <module>
20
+ [rank1]: main()
21
+ [rank1]: File "/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/train.py", line 615, in main
22
+ [rank1]: raise ValueError("--owt_cached_chunks requires --wrap --wrap_mode stream")
23
+ [rank1]: ValueError: --owt_cached_chunks requires --wrap --wrap_mode stream
24
+ [rank3]: Traceback (most recent call last):
25
+ [rank3]: File "/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/train.py", line 1000, in <module>
26
+ [rank3]: main()
27
+ [rank3]: File "/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/train.py", line 615, in main
28
+ [rank3]: raise ValueError("--owt_cached_chunks requires --wrap --wrap_mode stream")
29
+ [rank3]: ValueError: --owt_cached_chunks requires --wrap --wrap_mode stream
30
+ W0513 01:44:17.315000 312465 torch/distributed/elastic/multiprocessing/api.py:898] Sending process 312533 closing signal SIGTERM
31
+ W0513 01:44:17.316000 312465 torch/distributed/elastic/multiprocessing/api.py:898] Sending process 312534 closing signal SIGTERM
32
+ W0513 01:44:17.317000 312465 torch/distributed/elastic/multiprocessing/api.py:898] Sending process 312535 closing signal SIGTERM
33
+ E0513 01:44:17.445000 312465 torch/distributed/elastic/multiprocessing/api.py:870] failed (exitcode: 1) local_rank: 0 (pid: 312532) of binary: /usr/bin/python
34
+ Traceback (most recent call last):
35
+ File "<frozen runpy>", line 198, in _run_module_as_main
36
+ File "<frozen runpy>", line 88, in _run_code
37
+ File "/usr/local/lib/python3.12/dist-packages/torch/distributed/run.py", line 922, in <module>
38
+ main()
39
+ File "/usr/local/lib/python3.12/dist-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 355, in wrapper
40
+ return f(*args, **kwargs)
41
+ ^^^^^^^^^^^^^^^^^^
42
+ File "/usr/local/lib/python3.12/dist-packages/torch/distributed/run.py", line 918, in main
43
+ run(args)
44
+ File "/usr/local/lib/python3.12/dist-packages/torch/distributed/run.py", line 909, in run
45
+ elastic_launch(
46
+ File "/usr/local/lib/python3.12/dist-packages/torch/distributed/launcher/api.py", line 139, in __call__
47
+ return launch_agent(self._config, self._entrypoint, list(args))
48
+ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
49
+ File "/usr/local/lib/python3.12/dist-packages/torch/distributed/launcher/api.py", line 270, in launch_agent
50
+ raise ChildFailedError(
51
+ torch.distributed.elastic.multiprocessing.errors.ChildFailedError:
52
+ ============================================================
53
+ train.py FAILED
54
+ ------------------------------------------------------------
55
+ Failures:
56
+ <NO_OTHER_FAILURES>
57
+ ------------------------------------------------------------
58
+ Root Cause (first observed failure):
59
+ [0]:
60
+ time : 2026-05-13_01:44:17
61
+ host : localhost
62
+ rank : 0 (local_rank: 0)
63
+ exitcode : 1 (pid: 312532)
64
+ error_file: <N/A>
65
+ traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
66
+ ============================================================
67
+
68
+ *****************************************
69
+ Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed.
70
+ *****************************************
71
+ [rank0]:[W513 01:46:18.106846526 ProcessGroupNCCL.cpp:4571] [PG ID 0 PG GUID 0 Rank 0] using GPU 0 to perform barrier as devices used by this process are currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. Specify device_ids in barrier() to force use of a particular device, or call init_process_group() with a device_id.
72
+ NCCL version 2.25.1+cuda12.8
73
+ [rank1]:[W513 01:46:18.152602439 ProcessGroupNCCL.cpp:4571] [PG ID 0 PG GUID 0 Rank 1] using GPU 1 to perform barrier as devices used by this process are currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. Specify device_ids in barrier() to force use of a particular device, or call init_process_group() with a device_id.
74
+ [rank3]:[W513 01:46:18.156248186 ProcessGroupNCCL.cpp:4571] [PG ID 0 PG GUID 0 Rank 3] using GPU 3 to perform barrier as devices used by this process are currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. Specify device_ids in barrier() to force use of a particular device, or call init_process_group() with a device_id.
75
+ [rank2]:[W513 01:46:18.173783313 ProcessGroupNCCL.cpp:4571] [PG ID 0 PG GUID 0 Rank 2] using GPU 2 to perform barrier as devices used by this process are currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. Specify device_ids in barrier() to force use of a particular device, or call init_process_group() with a device_id.
76
+ /usr/local/lib/python3.12/dist-packages/torch/nn/modules/transformer.py:375: UserWarning: enable_nested_tensor is True, but self.use_nested_tensor is False because encoder_layer.norm_first was True
77
+ warnings.warn(
78
+ /usr/local/lib/python3.12/dist-packages/torch/nn/modules/transformer.py:375: UserWarning: enable_nested_tensor is True, but self.use_nested_tensor is False because encoder_layer.norm_first was True
79
+ warnings.warn(
80
+ /usr/local/lib/python3.12/dist-packages/torch/nn/modules/transformer.py:375: UserWarning: enable_nested_tensor is True, but self.use_nested_tensor is False because encoder_layer.norm_first was True
81
+ warnings.warn(
82
+ /usr/local/lib/python3.12/dist-packages/torch/nn/modules/transformer.py:375: UserWarning: enable_nested_tensor is True, but self.use_nested_tensor is False because encoder_layer.norm_first was True
83
+ warnings.warn(
84
+ {
85
+ "device": "cuda:0",
86
+ "rank": 0,
87
+ "world_size": 4,
88
+ "samples": "owt_cached_chunks:10",
89
+ "vocab_size": 50257,
90
+ "tokenizer_vocab_size": 50257,
91
+ "save_dir": "runs/scalinglaw_ctx1024_exact10_vocab50257_small384x6_4gpu_2000step",
92
+ "batch_size": 16,
93
+ "grad_accum": 8,
94
+ "effective_batch_size": 512,
95
+ "global_batch_size": 512,
96
+ "lr_schedule": "constant_warmup",
97
+ "warmup_steps": 20,
98
+ "min_lr": 0.0,
99
+ "adamw_param_groups": "all_decay",
100
+ "adam_beta1": 0.9,
101
+ "adam_beta2": 0.999,
102
+ "adam_eps": 1e-08,
103
+ "model_type": "transformer",
104
+ "dual_t": true,
105
+ "corrupt_t_mode": "independent",
106
+ "corrupt_min_t": null,
107
+ "corrupt_max_t": null,
108
+ "prefix_block_prob": 0.0,
109
+ "prefix_block_len": 128,
110
+ "dirichlet_endpoint_mode": "categorical_dual_t",
111
+ "dirichlet_semantic_t_mode": "same",
112
+ "dirichlet_semantic_t_value": 0.0,
113
+ "categorical_wrong_from_full_vocab": true,
114
+ "categorical_wrong_from_batch_valid_tokens": false,
115
+ "mask_mixture_original_prob": 0.0,
116
+ "mask_mixture_lowk_prob": 0.0,
117
+ "mask_mixture_lowcorrupt_prob": 0.0,
118
+ "mask_mixture_block_prob": 0.0,
119
+ "mask_mixture_all_prob": 0.0,
120
+ "mask_mixture_lowk_clean_tokens": "1,2,4,8,16,32,64",
121
+ "mask_mixture_lowcorrupt_tokens": "1,2,4,8,16,32,64",
122
+ "mask_mixture_block_tokens": "64,128",
123
+ "simplex_bridge_sampler": "dirichlet",
124
+ "logistic_normal_sigma_min": 0.18,
125
+ "logistic_normal_sigma_max": 2.2,
126
+ "logistic_normal_tau_min": 0.65,
127
+ "logistic_normal_tau_max": 1.15,
128
+ "torch_compile": false,
129
+ "compile_mode": "max-autotune",
130
+ "state_format": "prob",
131
+ "target_loss": "hard_ce",
132
+ "meanflow_weight": 0.0,
133
+ "bridge_noise_init": "logistic_normal",
134
+ "noise_sigma": -1.0,
135
+ "wrap": true,
136
+ "wrap_mode": "stream",
137
+ "wrap_record_buffer_size": 200,
138
+ "owt_cached_chunks": true,
139
+ "owt_chunk_cache_dir": "/e2e-data/evad-tech-vla/wanghan58/data/small_benchmarks/langflow_2604_11748/openwebtext_lta_cached_chunks/gpt2_len1024_train_minus_100k_exact10_minvocab",
140
+ "owt_chunk_cache_rebuild": false,
141
+ "owt_chunk_cache_write_batch": 4096,
142
+ "owt_exact_repeat_per_chunk": 10000,
143
+ "online_chunk_shuffle": false,
144
+ "online_chunk_shuffle_buffer": 10000,
145
+ "openwebtext_split": "all",
146
+ "detokenizer": "auto",
147
+ "resolved_detokenizer": null,
148
+ "num_workers": 0,
149
+ "latest_every": 500,
150
+ "resume_path": ""
151
+ }
152
+ step=25 micro_steps=200 elapsed=51.0s lr=3.000000e-04 loss_all=9.5698 acc_all=0.0351 loss_corrupt=9.5720 acc_corrupt=0.0348 corrupt_frac=0.5538 loss=9.5720 loss_recon=9.5720 loss_meanflow=0.0000 mean_model_t=0.5004 mean_corrupt_t=0.5067 mean_loss_t_weight=1.0000 prior_center_loss_beta=0.0000 wrong_frac=0.4889 init_acc_corrupt=0.4773 init_gold_top10=0.5053 init_gold_top100=0.5565
153
+ step=50 micro_steps=400 elapsed=59.4s lr=3.000000e-04 loss_all=6.9808 acc_all=0.0435 loss_corrupt=6.9836 acc_corrupt=0.0426 corrupt_frac=0.5502 loss=6.9836 loss_recon=6.9836 loss_meanflow=0.0000 mean_model_t=0.4927 mean_corrupt_t=0.4946 mean_loss_t_weight=1.0000 prior_center_loss_beta=0.0000 wrong_frac=0.5015 init_acc_corrupt=0.4633 init_gold_top10=0.4927 init_gold_top100=0.5466
154
+ step=75 micro_steps=600 elapsed=62.6s lr=3.000000e-04 loss_all=6.5358 acc_all=0.0417 loss_corrupt=6.5414 acc_corrupt=0.0417 corrupt_frac=0.5479 loss=6.5414 loss_recon=6.5414 loss_meanflow=0.0000 mean_model_t=0.4907 mean_corrupt_t=0.5026 mean_loss_t_weight=1.0000 prior_center_loss_beta=0.0000 wrong_frac=0.4971 init_acc_corrupt=0.4696 init_gold_top10=0.4972 init_gold_top100=0.5490
155
+ step=100 micro_steps=800 elapsed=63.8s lr=3.000000e-04 loss_all=6.4942 acc_all=0.0471 loss_corrupt=6.5060 acc_corrupt=0.0454 corrupt_frac=0.5491 loss=6.5060 loss_recon=6.5060 loss_meanflow=0.0000 mean_model_t=0.4977 mean_corrupt_t=0.5094 mean_loss_t_weight=1.0000 prior_center_loss_beta=0.0000 wrong_frac=0.4881 init_acc_corrupt=0.4785 init_gold_top10=0.5065 init_gold_top100=0.5552
156
+ step=125 micro_steps=1000 elapsed=64.4s lr=3.000000e-04 loss_all=6.2915 acc_all=0.0970 loss_corrupt=6.3833 acc_corrupt=0.0779 corrupt_frac=0.5601 loss=6.3833 loss_recon=6.3833 loss_meanflow=0.0000 mean_model_t=0.5049 mean_corrupt_t=0.4978 mean_loss_t_weight=1.0000 prior_center_loss_beta=0.0000 wrong_frac=0.5000 init_acc_corrupt=0.4654 init_gold_top10=0.4943 init_gold_top100=0.5459
157
+ step=150 micro_steps=1200 elapsed=64.7s lr=3.000000e-04 loss_all=5.5438 acc_all=0.1897 loss_corrupt=5.8703 acc_corrupt=0.1546 corrupt_frac=0.5550 loss=5.8703 loss_recon=5.8703 loss_meanflow=0.0000 mean_model_t=0.5132 mean_corrupt_t=0.4969 mean_loss_t_weight=1.0000 prior_center_loss_beta=0.0000 wrong_frac=0.5024 init_acc_corrupt=0.4618 init_gold_top10=0.4917 init_gold_top100=0.5428
158
+ step=175 micro_steps=1400 elapsed=64.9s lr=3.000000e-04 loss_all=4.6997 acc_all=0.2680 loss_corrupt=5.2135 acc_corrupt=0.2113 corrupt_frac=0.5524 loss=5.2135 loss_recon=5.2135 loss_meanflow=0.0000 mean_model_t=0.5089 mean_corrupt_t=0.5104 mean_loss_t_weight=1.0000 prior_center_loss_beta=0.0000 wrong_frac=0.4923 init_acc_corrupt=0.4735 init_gold_top10=0.5019 init_gold_top100=0.5528
159
+ step=200 micro_steps=1600 elapsed=65.0s lr=3.000000e-04 loss_all=3.9019 acc_all=0.3769 loss_corrupt=4.5546 acc_corrupt=0.2747 corrupt_frac=0.5513 loss=4.5546 loss_recon=4.5546 loss_meanflow=0.0000 mean_model_t=0.5030 mean_corrupt_t=0.5062 mean_loss_t_weight=1.0000 prior_center_loss_beta=0.0000 wrong_frac=0.4907 init_acc_corrupt=0.4750 init_gold_top10=0.5040 init_gold_top100=0.5515
160
+ step=225 micro_steps=1800 elapsed=65.2s lr=3.000000e-04 loss_all=3.3872 acc_all=0.5167 loss_corrupt=4.1294 acc_corrupt=0.3459 corrupt_frac=0.5499 loss=4.1294 loss_recon=4.1294 loss_meanflow=0.0000 mean_model_t=0.4978 mean_corrupt_t=0.5020 mean_loss_t_weight=1.0000 prior_center_loss_beta=0.0000 wrong_frac=0.5021 init_acc_corrupt=0.4618 init_gold_top10=0.4920 init_gold_top100=0.5455
161
+ step=250 micro_steps=2000 elapsed=65.2s lr=3.000000e-04 loss_all=3.0234 acc_all=0.6385 loss_corrupt=3.8056 acc_corrupt=0.4251 corrupt_frac=0.5563 loss=3.8056 loss_recon=3.8056 loss_meanflow=0.0000 mean_model_t=0.4949 mean_corrupt_t=0.5037 mean_loss_t_weight=1.0000 prior_center_loss_beta=0.0000 wrong_frac=0.4973 init_acc_corrupt=0.4692 init_gold_top10=0.4971 init_gold_top100=0.5482
162
+ step=275 micro_steps=2200 elapsed=65.1s lr=3.000000e-04 loss_all=2.6022 acc_all=0.7098 loss_corrupt=3.4180 acc_corrupt=0.4904 corrupt_frac=0.5559 loss=3.4180 loss_recon=3.4180 loss_meanflow=0.0000 mean_model_t=0.4927 mean_corrupt_t=0.5018 mean_loss_t_weight=1.0000 prior_center_loss_beta=0.0000 wrong_frac=0.4963 init_acc_corrupt=0.4691 init_gold_top10=0.4982 init_gold_top100=0.5482
163
+ step=300 micro_steps=2400 elapsed=64.9s lr=3.000000e-04 loss_all=2.2382 acc_all=0.7375 loss_corrupt=3.0869 acc_corrupt=0.5279 corrupt_frac=0.5560 loss=3.0869 loss_recon=3.0869 loss_meanflow=0.0000 mean_model_t=0.5101 mean_corrupt_t=0.5021 mean_loss_t_weight=1.0000 prior_center_loss_beta=0.0000 wrong_frac=0.4983 init_acc_corrupt=0.4672 init_gold_top10=0.4959 init_gold_top100=0.5493
164
+ step=325 micro_steps=2600 elapsed=64.5s lr=3.000000e-04 loss_all=1.9767 acc_all=0.7410 loss_corrupt=2.8649 acc_corrupt=0.5331 corrupt_frac=0.5541 loss=2.8649 loss_recon=2.8649 loss_meanflow=0.0000 mean_model_t=0.5050 mean_corrupt_t=0.4960 mean_loss_t_weight=1.0000 prior_center_loss_beta=0.0000 wrong_frac=0.5083 init_acc_corrupt=0.4563 init_gold_top10=0.4857 init_gold_top100=0.5395
165
+ step=350 micro_steps=2800 elapsed=65.1s lr=3.000000e-04 loss_all=1.6568 acc_all=0.7574 loss_corrupt=2.5097 acc_corrupt=0.5616 corrupt_frac=0.5528 loss=2.5097 loss_recon=2.5097 loss_meanflow=0.0000 mean_model_t=0.4963 mean_corrupt_t=0.5107 mean_loss_t_weight=1.0000 prior_center_loss_beta=0.0000 wrong_frac=0.4878 init_acc_corrupt=0.4793 init_gold_top10=0.5066 init_gold_top100=0.5567
166
+ step=375 micro_steps=3000 elapsed=64.8s lr=3.000000e-04 loss_all=1.4678 acc_all=0.7555 loss_corrupt=2.3501 acc_corrupt=0.5523 corrupt_frac=0.5468 loss=2.3501 loss_recon=2.3501 loss_meanflow=0.0000 mean_model_t=0.5051 mean_corrupt_t=0.4982 mean_loss_t_weight=1.0000 prior_center_loss_beta=0.0000 wrong_frac=0.5020 init_acc_corrupt=0.4619 init_gold_top10=0.4920 init_gold_top100=0.5448
167
+ step=400 micro_steps=3200 elapsed=64.6s lr=3.000000e-04 loss_all=1.2952 acc_all=0.7603 loss_corrupt=2.1428 acc_corrupt=0.5624 corrupt_frac=0.5475 loss=2.1428 loss_recon=2.1428 loss_meanflow=0.0000 mean_model_t=0.4886 mean_corrupt_t=0.5051 mean_loss_t_weight=1.0000 prior_center_loss_beta=0.0000 wrong_frac=0.4978 init_acc_corrupt=0.4664 init_gold_top10=0.4966 init_gold_top100=0.5475
168
+ step=425 micro_steps=3400 elapsed=64.8s lr=3.000000e-04 loss_all=1.1578 acc_all=0.7646 loss_corrupt=1.9592 acc_corrupt=0.5729 corrupt_frac=0.5509 loss=1.9592 loss_recon=1.9592 loss_meanflow=0.0000 mean_model_t=0.5035 mean_corrupt_t=0.4999 mean_loss_t_weight=1.0000 prior_center_loss_beta=0.0000 wrong_frac=0.4939 init_acc_corrupt=0.4715 init_gold_top10=0.5003 init_gold_top100=0.5506
169
+ step=450 micro_steps=3600 elapsed=64.7s lr=3.000000e-04 loss_all=1.0533 acc_all=0.7693 loss_corrupt=1.8220 acc_corrupt=0.5805 corrupt_frac=0.5503 loss=1.8220 loss_recon=1.8220 loss_meanflow=0.0000 mean_model_t=0.4990 mean_corrupt_t=0.5016 mean_loss_t_weight=1.0000 prior_center_loss_beta=0.0000 wrong_frac=0.4946 init_acc_corrupt=0.4706 init_gold_top10=0.4998 init_gold_top100=0.5502
170
+ step=475 micro_steps=3800 elapsed=64.7s lr=3.000000e-04 loss_all=0.9739 acc_all=0.7743 loss_corrupt=1.6798 acc_corrupt=0.5961 corrupt_frac=0.5586 loss=1.6798 loss_recon=1.6798 loss_meanflow=0.0000 mean_model_t=0.5064 mean_corrupt_t=0.5119 mean_loss_t_weight=1.0000 prior_center_loss_beta=0.0000 wrong_frac=0.4869 init_acc_corrupt=0.4807 init_gold_top10=0.5078 init_gold_top100=0.5566
171
+ step=500 micro_steps=4000 elapsed=64.8s lr=3.000000e-04 loss_all=0.8900 acc_all=0.7838 loss_corrupt=1.5706 acc_corrupt=0.6084 corrupt_frac=0.5516 loss=1.5706 loss_recon=1.5706 loss_meanflow=0.0000 mean_model_t=0.4952 mean_corrupt_t=0.5048 mean_loss_t_weight=1.0000 prior_center_loss_beta=0.0000 wrong_frac=0.4970 init_acc_corrupt=0.4698 init_gold_top10=0.4974 init_gold_top100=0.5487
172
+ step=525 micro_steps=4200 elapsed=65.9s lr=3.000000e-04 loss_all=0.7953 acc_all=0.8066 loss_corrupt=1.4415 acc_corrupt=0.6420 corrupt_frac=0.5395 loss=1.4415 loss_recon=1.4415 loss_meanflow=0.0000 mean_model_t=0.5024 mean_corrupt_t=0.5028 mean_loss_t_weight=1.0000 prior_center_loss_beta=0.0000 wrong_frac=0.4925 init_acc_corrupt=0.4743 init_gold_top10=0.5020 init_gold_top100=0.5513
173
+ step=550 micro_steps=4400 elapsed=64.7s lr=3.000000e-04 loss_all=0.7557 acc_all=0.8208 loss_corrupt=1.3456 acc_corrupt=0.6753 corrupt_frac=0.5522 loss=1.3456 loss_recon=1.3456 loss_meanflow=0.0000 mean_model_t=0.4996 mean_corrupt_t=0.4982 mean_loss_t_weight=1.0000 prior_center_loss_beta=0.0000 wrong_frac=0.5028 init_acc_corrupt=0.4624 init_gold_top10=0.4915 init_gold_top100=0.5432
174
+ step=575 micro_steps=4600 elapsed=64.5s lr=3.000000e-04 loss_all=0.6402 acc_all=0.8644 loss_corrupt=1.1534 acc_corrupt=0.7515 corrupt_frac=0.5452 loss=1.1534 loss_recon=1.1534 loss_meanflow=0.0000 mean_model_t=0.5019 mean_corrupt_t=0.4997 mean_loss_t_weight=1.0000 prior_center_loss_beta=0.0000 wrong_frac=0.5020 init_acc_corrupt=0.4630 init_gold_top10=0.4920 init_gold_top100=0.5448
175
+ step=600 micro_steps=4800 elapsed=64.6s lr=3.000000e-04 loss_all=0.5154 acc_all=0.9112 loss_corrupt=0.9217 acc_corrupt=0.8384 corrupt_frac=0.5482 loss=0.9217 loss_recon=0.9217 loss_meanflow=0.0000 mean_model_t=0.4926 mean_corrupt_t=0.4977 mean_loss_t_weight=1.0000 prior_center_loss_beta=0.0000 wrong_frac=0.5034 init_acc_corrupt=0.4628 init_gold_top10=0.4908 init_gold_top100=0.5449
176
+ step=625 micro_steps=5000 elapsed=64.8s lr=3.000000e-04 loss_all=0.3651 acc_all=0.9599 loss_corrupt=0.6465 acc_corrupt=0.9279 corrupt_frac=0.5518 loss=0.6465 loss_recon=0.6465 loss_meanflow=0.0000 mean_model_t=0.5008 mean_corrupt_t=0.5011 mean_loss_t_weight=1.0000 prior_center_loss_beta=0.0000 wrong_frac=0.5007 init_acc_corrupt=0.4636 init_gold_top10=0.4933 init_gold_top100=0.5457
177
+ step=650 micro_steps=5200 elapsed=64.4s lr=3.000000e-04 loss_all=0.2195 acc_all=0.9897 loss_corrupt=0.3918 acc_corrupt=0.9812 corrupt_frac=0.5405 loss=0.3918 loss_recon=0.3918 loss_meanflow=0.0000 mean_model_t=0.4929 mean_corrupt_t=0.4927 mean_loss_t_weight=1.0000 prior_center_loss_beta=0.0000 wrong_frac=0.5096 init_acc_corrupt=0.4549 init_gold_top10=0.4843 init_gold_top100=0.5381
178
+ step=675 micro_steps=5400 elapsed=64.7s lr=3.000000e-04 loss_all=0.1207 acc_all=0.9953 loss_corrupt=0.2134 acc_corrupt=0.9913 corrupt_frac=0.5407 loss=0.2134 loss_recon=0.2134 loss_meanflow=0.0000 mean_model_t=0.5040 mean_corrupt_t=0.4962 mean_loss_t_weight=1.0000 prior_center_loss_beta=0.0000 wrong_frac=0.5034 init_acc_corrupt=0.4609 init_gold_top10=0.4908 init_gold_top100=0.5421
179
+ W0513 02:15:32.291000 316519 torch/distributed/elastic/agent/server/api.py:719] Received 15 death signal, shutting down workers
180
+ W0513 02:15:32.293000 316519 torch/distributed/elastic/multiprocessing/api.py:898] Sending process 316613 closing signal SIGTERM
181
+ W0513 02:15:32.294000 316519 torch/distributed/elastic/multiprocessing/api.py:898] Sending process 316614 closing signal SIGTERM
182
+ W0513 02:15:32.294000 316519 torch/distributed/elastic/multiprocessing/api.py:898] Sending process 316615 closing signal SIGTERM
183
+ W0513 02:15:32.295000 316519 torch/distributed/elastic/multiprocessing/api.py:898] Sending process 316616 closing signal SIGTERM
184
+ Traceback (most recent call last):
185
+ File "<frozen runpy>", line 198, in _run_module_as_main
186
+ File "<frozen runpy>", line 88, in _run_code
187
+ File "/usr/local/lib/python3.12/dist-packages/torch/distributed/run.py", line 922, in <module>
188
+ main()
189
+ File "/usr/local/lib/python3.12/dist-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 355, in wrapper
190
+ return f(*args, **kwargs)
191
+ ^^^^^^^^^^^^^^^^^^
192
+ File "/usr/local/lib/python3.12/dist-packages/torch/distributed/run.py", line 918, in main
193
+ run(args)
194
+ File "/usr/local/lib/python3.12/dist-packages/torch/distributed/run.py", line 909, in run
195
+ elastic_launch(
196
+ File "/usr/local/lib/python3.12/dist-packages/torch/distributed/launcher/api.py", line 139, in __call__
197
+ return launch_agent(self._config, self._entrypoint, list(args))
198
+ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
199
+ File "/usr/local/lib/python3.12/dist-packages/torch/distributed/launcher/api.py", line 261, in launch_agent
200
+ result = agent.run()
201
+ ^^^^^^^^^^^
202
+ File "/usr/local/lib/python3.12/dist-packages/torch/distributed/elastic/metrics/api.py", line 137, in wrapper
203
+ result = f(*args, **kwargs)
204
+ ^^^^^^^^^^^^^^^^^^
205
+ File "/usr/local/lib/python3.12/dist-packages/torch/distributed/elastic/agent/server/api.py", line 711, in run
206
+ result = self._invoke_run(role)
207
+ ^^^^^^^^^^^^^^^^^^^^^^
208
+ File "/usr/local/lib/python3.12/dist-packages/torch/distributed/elastic/agent/server/api.py", line 870, in _invoke_run
209
+ time.sleep(monitor_interval)
210
+ File "/usr/local/lib/python3.12/dist-packages/torch/distributed/elastic/multiprocessing/api.py", line 84, in _terminate_process_handler
211
+ raise SignalException(f"Process {os.getpid()} got signal: {sigval}", sigval=sigval)
212
+ torch.distributed.elastic.multiprocessing.api.SignalException: Process 316519 got signal: 15
LTA_openwebtext_dualt/logs/scalinglaw_4gpu_20260513/scalinglaw_samples2_192x3_c512_vocab50257_4gpu_3000step.log ADDED
@@ -0,0 +1,66 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ *****************************************
3
+ Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed.
4
+ *****************************************
5
+ [rank0]: Traceback (most recent call last):
6
+ [rank0]: File "/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/train.py", line 1000, in <module>
7
+ [rank0]: main()
8
+ [rank0]: File "/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/train.py", line 615, in main
9
+ [rank0]: raise ValueError("--owt_cached_chunks requires --wrap --wrap_mode stream")
10
+ [rank0]: ValueError: --owt_cached_chunks requires --wrap --wrap_mode stream
11
+ [rank0]:[W513 01:44:40.999052026 ProcessGroupNCCL.cpp:1487] Warning: WARNING: destroy_process_group() was not called before program exit, which can leak resources. For more info, please see https://pytorch.org/docs/stable/distributed.html#shutdown (function operator())
12
+ [rank3]: Traceback (most recent call last):
13
+ [rank3]: File "/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/train.py", line 1000, in <module>
14
+ [rank3]: main()
15
+ [rank3]: File "/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/train.py", line 615, in main
16
+ [rank3]: raise ValueError("--owt_cached_chunks requires --wrap --wrap_mode stream")
17
+ [rank3]: ValueError: --owt_cached_chunks requires --wrap --wrap_mode stream
18
+ [rank2]: Traceback (most recent call last):
19
+ [rank2]: File "/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/train.py", line 1000, in <module>
20
+ [rank2]: main()
21
+ [rank2]: File "/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/train.py", line 615, in main
22
+ [rank2]: raise ValueError("--owt_cached_chunks requires --wrap --wrap_mode stream")
23
+ [rank2]: ValueError: --owt_cached_chunks requires --wrap --wrap_mode stream
24
+ [rank1]: Traceback (most recent call last):
25
+ [rank1]: File "/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/train.py", line 1000, in <module>
26
+ [rank1]: main()
27
+ [rank1]: File "/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/train.py", line 615, in main
28
+ [rank1]: raise ValueError("--owt_cached_chunks requires --wrap --wrap_mode stream")
29
+ [rank1]: ValueError: --owt_cached_chunks requires --wrap --wrap_mode stream
30
+ W0513 01:44:40.334000 313116 torch/distributed/elastic/multiprocessing/api.py:898] Sending process 313184 closing signal SIGTERM
31
+ W0513 01:44:40.335000 313116 torch/distributed/elastic/multiprocessing/api.py:898] Sending process 313185 closing signal SIGTERM
32
+ W0513 01:44:40.336000 313116 torch/distributed/elastic/multiprocessing/api.py:898] Sending process 313186 closing signal SIGTERM
33
+ E0513 01:44:40.463000 313116 torch/distributed/elastic/multiprocessing/api.py:870] failed (exitcode: 1) local_rank: 0 (pid: 313183) of binary: /usr/bin/python
34
+ Traceback (most recent call last):
35
+ File "<frozen runpy>", line 198, in _run_module_as_main
36
+ File "<frozen runpy>", line 88, in _run_code
37
+ File "/usr/local/lib/python3.12/dist-packages/torch/distributed/run.py", line 922, in <module>
38
+ main()
39
+ File "/usr/local/lib/python3.12/dist-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 355, in wrapper
40
+ return f(*args, **kwargs)
41
+ ^^^^^^^^^^^^^^^^^^
42
+ File "/usr/local/lib/python3.12/dist-packages/torch/distributed/run.py", line 918, in main
43
+ run(args)
44
+ File "/usr/local/lib/python3.12/dist-packages/torch/distributed/run.py", line 909, in run
45
+ elastic_launch(
46
+ File "/usr/local/lib/python3.12/dist-packages/torch/distributed/launcher/api.py", line 139, in __call__
47
+ return launch_agent(self._config, self._entrypoint, list(args))
48
+ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
49
+ File "/usr/local/lib/python3.12/dist-packages/torch/distributed/launcher/api.py", line 270, in launch_agent
50
+ raise ChildFailedError(
51
+ torch.distributed.elastic.multiprocessing.errors.ChildFailedError:
52
+ ============================================================
53
+ train.py FAILED
54
+ ------------------------------------------------------------
55
+ Failures:
56
+ <NO_OTHER_FAILURES>
57
+ ------------------------------------------------------------
58
+ Root Cause (first observed failure):
59
+ [0]:
60
+ time : 2026-05-13_01:44:40
61
+ host : localhost
62
+ rank : 0 (local_rank: 0)
63
+ exitcode : 1 (pid: 313183)
64
+ error_file: <N/A>
65
+ traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
66
+ ============================================================
LTA_openwebtext_dualt/logs/scalinglaw_4gpu_20260513/trace_ctx256_small384x6_step500.log ADDED
@@ -0,0 +1 @@
 
 
1
+ {"out_json": "docs/lta_samples/metrics_20260513/scalinglaw_4gpu_20260513/ctx256_small384x6_step500/trace_steps64_c48_t1p45.json", "records": 10, "step": 500}
LTA_openwebtext_dualt/logs/scalinglaw_4gpu_20260513/trace_ctx512_small384x6_step500.log ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ /usr/local/lib/python3.12/dist-packages/torch/nn/modules/transformer.py:375: UserWarning: enable_nested_tensor is True, but self.use_nested_tensor is False because encoder_layer.norm_first was True
2
+ warnings.warn(
3
+ {"out_json": "docs/lta_samples/metrics_20260513/scalinglaw_4gpu_20260513/ctx512_small384x6_step500/trace_steps64_c48_t1p45.json", "records": 10, "step": 500}
LTA_openwebtext_dualt/logs/scalinglaw_4gpu_20260513/trace_params512x8_c512_vocab50257_step750.log ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ /usr/local/lib/python3.12/dist-packages/torch/nn/modules/transformer.py:375: UserWarning: enable_nested_tensor is True, but self.use_nested_tensor is False because encoder_layer.norm_first was True
2
+ warnings.warn(
3
+ {"out_json": "docs/lta_samples/metrics_20260513/scalinglaw_4gpu_20260513/params512x8_c512_vocab50257_step750/trace_steps64_c48_t1p45.json", "records": 10, "step": 750}
LTA_openwebtext_dualt/mini_owt_logdirichlet/.venv_qwen35_uv/lib/python3.12/site-packages/numpy/core/include/numpy/__multiarray_api.c ADDED
@@ -0,0 +1,314 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ /* These pointers will be stored in the C-object for use in other
3
+ extension modules
4
+ */
5
+
6
+ void *PyArray_API[] = {
7
+ (void *) PyArray_GetNDArrayCVersion,
8
+ (void *) &PyBigArray_Type,
9
+ (void *) &PyArray_Type,
10
+ (void *) &PyArrayDescr_Type,
11
+ (void *) &PyArrayFlags_Type,
12
+ (void *) &PyArrayIter_Type,
13
+ (void *) &PyArrayMultiIter_Type,
14
+ (int *) &NPY_NUMUSERTYPES,
15
+ (void *) &PyBoolArrType_Type,
16
+ (void *) &_PyArrayScalar_BoolValues,
17
+ (void *) &PyGenericArrType_Type,
18
+ (void *) &PyNumberArrType_Type,
19
+ (void *) &PyIntegerArrType_Type,
20
+ (void *) &PySignedIntegerArrType_Type,
21
+ (void *) &PyUnsignedIntegerArrType_Type,
22
+ (void *) &PyInexactArrType_Type,
23
+ (void *) &PyFloatingArrType_Type,
24
+ (void *) &PyComplexFloatingArrType_Type,
25
+ (void *) &PyFlexibleArrType_Type,
26
+ (void *) &PyCharacterArrType_Type,
27
+ (void *) &PyByteArrType_Type,
28
+ (void *) &PyShortArrType_Type,
29
+ (void *) &PyIntArrType_Type,
30
+ (void *) &PyLongArrType_Type,
31
+ (void *) &PyLongLongArrType_Type,
32
+ (void *) &PyUByteArrType_Type,
33
+ (void *) &PyUShortArrType_Type,
34
+ (void *) &PyUIntArrType_Type,
35
+ (void *) &PyULongArrType_Type,
36
+ (void *) &PyULongLongArrType_Type,
37
+ (void *) &PyFloatArrType_Type,
38
+ (void *) &PyDoubleArrType_Type,
39
+ (void *) &PyLongDoubleArrType_Type,
40
+ (void *) &PyCFloatArrType_Type,
41
+ (void *) &PyCDoubleArrType_Type,
42
+ (void *) &PyCLongDoubleArrType_Type,
43
+ (void *) &PyObjectArrType_Type,
44
+ (void *) &PyStringArrType_Type,
45
+ (void *) &PyUnicodeArrType_Type,
46
+ (void *) &PyVoidArrType_Type,
47
+ (void *) PyArray_SetNumericOps,
48
+ (void *) PyArray_GetNumericOps,
49
+ (void *) PyArray_INCREF,
50
+ (void *) PyArray_XDECREF,
51
+ (void *) PyArray_SetStringFunction,
52
+ (void *) PyArray_DescrFromType,
53
+ (void *) PyArray_TypeObjectFromType,
54
+ (void *) PyArray_Zero,
55
+ (void *) PyArray_One,
56
+ (void *) PyArray_CastToType,
57
+ (void *) PyArray_CastTo,
58
+ (void *) PyArray_CastAnyTo,
59
+ (void *) PyArray_CanCastSafely,
60
+ (void *) PyArray_CanCastTo,
61
+ (void *) PyArray_ObjectType,
62
+ (void *) PyArray_DescrFromObject,
63
+ (void *) PyArray_ConvertToCommonType,
64
+ (void *) PyArray_DescrFromScalar,
65
+ (void *) PyArray_DescrFromTypeObject,
66
+ (void *) PyArray_Size,
67
+ (void *) PyArray_Scalar,
68
+ (void *) PyArray_FromScalar,
69
+ (void *) PyArray_ScalarAsCtype,
70
+ (void *) PyArray_CastScalarToCtype,
71
+ (void *) PyArray_CastScalarDirect,
72
+ (void *) PyArray_ScalarFromObject,
73
+ (void *) PyArray_GetCastFunc,
74
+ (void *) PyArray_FromDims,
75
+ (void *) PyArray_FromDimsAndDataAndDescr,
76
+ (void *) PyArray_FromAny,
77
+ (void *) PyArray_EnsureArray,
78
+ (void *) PyArray_EnsureAnyArray,
79
+ (void *) PyArray_FromFile,
80
+ (void *) PyArray_FromString,
81
+ (void *) PyArray_FromBuffer,
82
+ (void *) PyArray_FromIter,
83
+ (void *) PyArray_Return,
84
+ (void *) PyArray_GetField,
85
+ (void *) PyArray_SetField,
86
+ (void *) PyArray_Byteswap,
87
+ (void *) PyArray_Resize,
88
+ (void *) PyArray_MoveInto,
89
+ (void *) PyArray_CopyInto,
90
+ (void *) PyArray_CopyAnyInto,
91
+ (void *) PyArray_CopyObject,
92
+ (void *) PyArray_NewCopy,
93
+ (void *) PyArray_ToList,
94
+ (void *) PyArray_ToString,
95
+ (void *) PyArray_ToFile,
96
+ (void *) PyArray_Dump,
97
+ (void *) PyArray_Dumps,
98
+ (void *) PyArray_ValidType,
99
+ (void *) PyArray_UpdateFlags,
100
+ (void *) PyArray_New,
101
+ (void *) PyArray_NewFromDescr,
102
+ (void *) PyArray_DescrNew,
103
+ (void *) PyArray_DescrNewFromType,
104
+ (void *) PyArray_GetPriority,
105
+ (void *) PyArray_IterNew,
106
+ (void *) PyArray_MultiIterNew,
107
+ (void *) PyArray_PyIntAsInt,
108
+ (void *) PyArray_PyIntAsIntp,
109
+ (void *) PyArray_Broadcast,
110
+ (void *) PyArray_FillObjectArray,
111
+ (void *) PyArray_FillWithScalar,
112
+ (void *) PyArray_CheckStrides,
113
+ (void *) PyArray_DescrNewByteorder,
114
+ (void *) PyArray_IterAllButAxis,
115
+ (void *) PyArray_CheckFromAny,
116
+ (void *) PyArray_FromArray,
117
+ (void *) PyArray_FromInterface,
118
+ (void *) PyArray_FromStructInterface,
119
+ (void *) PyArray_FromArrayAttr,
120
+ (void *) PyArray_ScalarKind,
121
+ (void *) PyArray_CanCoerceScalar,
122
+ (void *) PyArray_NewFlagsObject,
123
+ (void *) PyArray_CanCastScalar,
124
+ (void *) PyArray_CompareUCS4,
125
+ (void *) PyArray_RemoveSmallest,
126
+ (void *) PyArray_ElementStrides,
127
+ (void *) PyArray_Item_INCREF,
128
+ (void *) PyArray_Item_XDECREF,
129
+ (void *) PyArray_FieldNames,
130
+ (void *) PyArray_Transpose,
131
+ (void *) PyArray_TakeFrom,
132
+ (void *) PyArray_PutTo,
133
+ (void *) PyArray_PutMask,
134
+ (void *) PyArray_Repeat,
135
+ (void *) PyArray_Choose,
136
+ (void *) PyArray_Sort,
137
+ (void *) PyArray_ArgSort,
138
+ (void *) PyArray_SearchSorted,
139
+ (void *) PyArray_ArgMax,
140
+ (void *) PyArray_ArgMin,
141
+ (void *) PyArray_Reshape,
142
+ (void *) PyArray_Newshape,
143
+ (void *) PyArray_Squeeze,
144
+ (void *) PyArray_View,
145
+ (void *) PyArray_SwapAxes,
146
+ (void *) PyArray_Max,
147
+ (void *) PyArray_Min,
148
+ (void *) PyArray_Ptp,
149
+ (void *) PyArray_Mean,
150
+ (void *) PyArray_Trace,
151
+ (void *) PyArray_Diagonal,
152
+ (void *) PyArray_Clip,
153
+ (void *) PyArray_Conjugate,
154
+ (void *) PyArray_Nonzero,
155
+ (void *) PyArray_Std,
156
+ (void *) PyArray_Sum,
157
+ (void *) PyArray_CumSum,
158
+ (void *) PyArray_Prod,
159
+ (void *) PyArray_CumProd,
160
+ (void *) PyArray_All,
161
+ (void *) PyArray_Any,
162
+ (void *) PyArray_Compress,
163
+ (void *) PyArray_Flatten,
164
+ (void *) PyArray_Ravel,
165
+ (void *) PyArray_MultiplyList,
166
+ (void *) PyArray_MultiplyIntList,
167
+ (void *) PyArray_GetPtr,
168
+ (void *) PyArray_CompareLists,
169
+ (void *) PyArray_AsCArray,
170
+ (void *) PyArray_As1D,
171
+ (void *) PyArray_As2D,
172
+ (void *) PyArray_Free,
173
+ (void *) PyArray_Converter,
174
+ (void *) PyArray_IntpFromSequence,
175
+ (void *) PyArray_Concatenate,
176
+ (void *) PyArray_InnerProduct,
177
+ (void *) PyArray_MatrixProduct,
178
+ (void *) PyArray_CopyAndTranspose,
179
+ (void *) PyArray_Correlate,
180
+ (void *) PyArray_TypestrConvert,
181
+ (void *) PyArray_DescrConverter,
182
+ (void *) PyArray_DescrConverter2,
183
+ (void *) PyArray_IntpConverter,
184
+ (void *) PyArray_BufferConverter,
185
+ (void *) PyArray_AxisConverter,
186
+ (void *) PyArray_BoolConverter,
187
+ (void *) PyArray_ByteorderConverter,
188
+ (void *) PyArray_OrderConverter,
189
+ (void *) PyArray_EquivTypes,
190
+ (void *) PyArray_Zeros,
191
+ (void *) PyArray_Empty,
192
+ (void *) PyArray_Where,
193
+ (void *) PyArray_Arange,
194
+ (void *) PyArray_ArangeObj,
195
+ (void *) PyArray_SortkindConverter,
196
+ (void *) PyArray_LexSort,
197
+ (void *) PyArray_Round,
198
+ (void *) PyArray_EquivTypenums,
199
+ (void *) PyArray_RegisterDataType,
200
+ (void *) PyArray_RegisterCastFunc,
201
+ (void *) PyArray_RegisterCanCast,
202
+ (void *) PyArray_InitArrFuncs,
203
+ (void *) PyArray_IntTupleFromIntp,
204
+ (void *) PyArray_TypeNumFromName,
205
+ (void *) PyArray_ClipmodeConverter,
206
+ (void *) PyArray_OutputConverter,
207
+ (void *) PyArray_BroadcastToShape,
208
+ (void *) _PyArray_SigintHandler,
209
+ (void *) _PyArray_GetSigintBuf,
210
+ (void *) PyArray_DescrAlignConverter,
211
+ (void *) PyArray_DescrAlignConverter2,
212
+ (void *) PyArray_SearchsideConverter,
213
+ (void *) PyArray_CheckAxis,
214
+ (void *) PyArray_OverflowMultiplyList,
215
+ (void *) PyArray_CompareString,
216
+ (void *) PyArray_MultiIterFromObjects,
217
+ (void *) PyArray_GetEndianness,
218
+ (void *) PyArray_GetNDArrayCFeatureVersion,
219
+ (void *) PyArray_Correlate2,
220
+ (void *) PyArray_NeighborhoodIterNew,
221
+ (void *) &PyTimeIntegerArrType_Type,
222
+ (void *) &PyDatetimeArrType_Type,
223
+ (void *) &PyTimedeltaArrType_Type,
224
+ (void *) &PyHalfArrType_Type,
225
+ (void *) &NpyIter_Type,
226
+ (void *) PyArray_SetDatetimeParseFunction,
227
+ (void *) PyArray_DatetimeToDatetimeStruct,
228
+ (void *) PyArray_TimedeltaToTimedeltaStruct,
229
+ (void *) PyArray_DatetimeStructToDatetime,
230
+ (void *) PyArray_TimedeltaStructToTimedelta,
231
+ (void *) NpyIter_New,
232
+ (void *) NpyIter_MultiNew,
233
+ (void *) NpyIter_AdvancedNew,
234
+ (void *) NpyIter_Copy,
235
+ (void *) NpyIter_Deallocate,
236
+ (void *) NpyIter_HasDelayedBufAlloc,
237
+ (void *) NpyIter_HasExternalLoop,
238
+ (void *) NpyIter_EnableExternalLoop,
239
+ (void *) NpyIter_GetInnerStrideArray,
240
+ (void *) NpyIter_GetInnerLoopSizePtr,
241
+ (void *) NpyIter_Reset,
242
+ (void *) NpyIter_ResetBasePointers,
243
+ (void *) NpyIter_ResetToIterIndexRange,
244
+ (void *) NpyIter_GetNDim,
245
+ (void *) NpyIter_GetNOp,
246
+ (void *) NpyIter_GetIterNext,
247
+ (void *) NpyIter_GetIterSize,
248
+ (void *) NpyIter_GetIterIndexRange,
249
+ (void *) NpyIter_GetIterIndex,
250
+ (void *) NpyIter_GotoIterIndex,
251
+ (void *) NpyIter_HasMultiIndex,
252
+ (void *) NpyIter_GetShape,
253
+ (void *) NpyIter_GetGetMultiIndex,
254
+ (void *) NpyIter_GotoMultiIndex,
255
+ (void *) NpyIter_RemoveMultiIndex,
256
+ (void *) NpyIter_HasIndex,
257
+ (void *) NpyIter_IsBuffered,
258
+ (void *) NpyIter_IsGrowInner,
259
+ (void *) NpyIter_GetBufferSize,
260
+ (void *) NpyIter_GetIndexPtr,
261
+ (void *) NpyIter_GotoIndex,
262
+ (void *) NpyIter_GetDataPtrArray,
263
+ (void *) NpyIter_GetDescrArray,
264
+ (void *) NpyIter_GetOperandArray,
265
+ (void *) NpyIter_GetIterView,
266
+ (void *) NpyIter_GetReadFlags,
267
+ (void *) NpyIter_GetWriteFlags,
268
+ (void *) NpyIter_DebugPrint,
269
+ (void *) NpyIter_IterationNeedsAPI,
270
+ (void *) NpyIter_GetInnerFixedStrideArray,
271
+ (void *) NpyIter_RemoveAxis,
272
+ (void *) NpyIter_GetAxisStrideArray,
273
+ (void *) NpyIter_RequiresBuffering,
274
+ (void *) NpyIter_GetInitialDataPtrArray,
275
+ (void *) NpyIter_CreateCompatibleStrides,
276
+ (void *) PyArray_CastingConverter,
277
+ (void *) PyArray_CountNonzero,
278
+ (void *) PyArray_PromoteTypes,
279
+ (void *) PyArray_MinScalarType,
280
+ (void *) PyArray_ResultType,
281
+ (void *) PyArray_CanCastArrayTo,
282
+ (void *) PyArray_CanCastTypeTo,
283
+ (void *) PyArray_EinsteinSum,
284
+ (void *) PyArray_NewLikeArray,
285
+ (void *) PyArray_GetArrayParamsFromObject,
286
+ (void *) PyArray_ConvertClipmodeSequence,
287
+ (void *) PyArray_MatrixProduct2,
288
+ (void *) NpyIter_IsFirstVisit,
289
+ (void *) PyArray_SetBaseObject,
290
+ (void *) PyArray_CreateSortedStridePerm,
291
+ (void *) PyArray_RemoveAxesInPlace,
292
+ (void *) PyArray_DebugPrint,
293
+ (void *) PyArray_FailUnlessWriteable,
294
+ (void *) PyArray_SetUpdateIfCopyBase,
295
+ (void *) PyDataMem_NEW,
296
+ (void *) PyDataMem_FREE,
297
+ (void *) PyDataMem_RENEW,
298
+ (void *) PyDataMem_SetEventHook,
299
+ (NPY_CASTING *) &NPY_DEFAULT_ASSIGN_CASTING,
300
+ (void *) PyArray_MapIterSwapAxes,
301
+ (void *) PyArray_MapIterArray,
302
+ (void *) PyArray_MapIterNext,
303
+ (void *) PyArray_Partition,
304
+ (void *) PyArray_ArgPartition,
305
+ (void *) PyArray_SelectkindConverter,
306
+ (void *) PyDataMem_NEW_ZEROED,
307
+ (void *) PyArray_CheckAnyScalarExact,
308
+ (void *) PyArray_MapIterArrayCopyIfOverlap,
309
+ (void *) PyArray_ResolveWritebackIfCopy,
310
+ (void *) PyArray_SetWritebackIfCopyBase,
311
+ (void *) PyDataMem_SetHandler,
312
+ (void *) PyDataMem_GetHandler,
313
+ (PyObject* *) &PyDataMem_DefaultHandler
314
+ };
LTA_openwebtext_dualt/mini_owt_logdirichlet/.venv_qwen35_uv/lib/python3.12/site-packages/numpy/core/include/numpy/experimental_dtype_api.h ADDED
@@ -0,0 +1,365 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ * This header exports the new experimental DType API as proposed in
3
+ * NEPs 41 to 43. For background, please check these NEPs. Otherwise,
4
+ * this header also serves as documentation for the time being.
5
+ *
6
+ * The header includes `_dtype_api.h` which holds most definition while this
7
+ * header mainly wraps functions for public consumption.
8
+ *
9
+ * Please do not hesitate to contact @seberg with questions. This is
10
+ * developed together with https://github.com/seberg/experimental_user_dtypes
11
+ * and those interested in experimenting are encouraged to contribute there.
12
+ *
13
+ * To use the functions defined in the header, call::
14
+ *
15
+ * if (import_experimental_dtype_api(version) < 0) {
16
+ * return NULL;
17
+ * }
18
+ *
19
+ * in your module init. (A version mismatch will be reported, just update
20
+ * to the correct one, this will alert you of possible changes.)
21
+ *
22
+ * The following lists the main symbols currently exported. Please do not
23
+ * hesitate to ask for help or clarification:
24
+ *
25
+ * - PyUFunc_AddLoopFromSpec:
26
+ *
27
+ * Register a new loop for a ufunc. This uses the `PyArrayMethod_Spec`
28
+ * which must be filled in (see in-line comments).
29
+ *
30
+ * - PyUFunc_AddWrappingLoop:
31
+ *
32
+ * Register a new loop which reuses an existing one, but modifies the
33
+ * result dtypes. Please search the internal NumPy docs for more info
34
+ * at this point. (Used for physical units dtype.)
35
+ *
36
+ * - PyUFunc_AddPromoter:
37
+ *
38
+ * Register a new promoter for a ufunc. A promoter is a function stored
39
+ * in a PyCapsule (see in-line comments). It is passed the operation and
40
+ * requested DType signatures and can mutate it to attempt a new search
41
+ * for a matching loop/promoter.
42
+ * I.e. for Numba a promoter could even add the desired loop.
43
+ *
44
+ * - PyArrayInitDTypeMeta_FromSpec:
45
+ *
46
+ * Initialize a new DType. It must currently be a static Python C type
47
+ * that is declared as `PyArray_DTypeMeta` and not `PyTypeObject`.
48
+ * Further, it must subclass `np.dtype` and set its type to
49
+ * `PyArrayDTypeMeta_Type` (before calling `PyType_Ready()`).
50
+ *
51
+ * - PyArray_CommonDType:
52
+ *
53
+ * Find the common-dtype ("promotion") for two DType classes. Similar
54
+ * to `np.result_type`, but works on the classes and not instances.
55
+ *
56
+ * - PyArray_PromoteDTypeSequence:
57
+ *
58
+ * Same as CommonDType, but works with an arbitrary number of DTypes.
59
+ * This function is smarter and can often return successful and unambiguous
60
+ * results when `common_dtype(common_dtype(dt1, dt2), dt3)` would
61
+ * depend on the operation order or fail. Nevertheless, DTypes should
62
+ * aim to ensure that their common-dtype implementation is associative
63
+ * and commutative! (Mainly, unsigned and signed integers are not.)
64
+ *
65
+ * For guaranteed consistent results DTypes must implement common-Dtype
66
+ * "transitively". If A promotes B and B promotes C, then A must generally
67
+ * also promote C; where "promotes" means implements the promotion.
68
+ * (There are some exceptions for abstract DTypes)
69
+ *
70
+ * - PyArray_GetDefaultDescr:
71
+ *
72
+ * Given a DType class, returns the default instance (descriptor).
73
+ * This is an inline function checking for `singleton` first and only
74
+ * calls the `default_descr` function if necessary.
75
+ *
76
+ * - PyArray_DoubleDType, etc.:
77
+ *
78
+ * Aliases to the DType classes for the builtin NumPy DTypes.
79
+ *
80
+ * WARNING
81
+ * =======
82
+ *
83
+ * By using this header, you understand that this is a fully experimental
84
+ * exposure. Details are expected to change, and some options may have no
85
+ * effect. (Please contact @seberg if you have questions!)
86
+ * If the exposure stops working, please file a bug report with NumPy.
87
+ * Further, a DType created using this API/header should still be expected
88
+ * to be incompatible with some functionality inside and outside of NumPy.
89
+ * In this case crashes must be expected. Please report any such problems
90
+ * so that they can be fixed before final exposure.
91
+ * Furthermore, expect missing checks for programming errors which the final
92
+ * API is expected to have.
93
+ *
94
+ * Symbols with a leading underscore are likely to not be included in the
95
+ * first public version, if these are central to your use-case, please let
96
+ * us know, so that we can reconsider.
97
+ *
98
+ * "Array-like" consumer API not yet under considerations
99
+ * ======================================================
100
+ *
101
+ * The new DType API is designed in a way to make it potentially useful for
102
+ * alternative "array-like" implementations. This will require careful
103
+ * exposure of details and functions and is not part of this experimental API.
104
+ *
105
+ * Brief (incompatibility) changelog
106
+ * =================================
107
+ *
108
+ * 2. None (only additions).
109
+ * 3. New `npy_intp *view_offset` argument for `resolve_descriptors`.
110
+ * This replaces the `NPY_CAST_IS_VIEW` flag. It can be set to 0 if the
111
+ * operation is a view, and is pre-initialized to `NPY_MIN_INTP` indicating
112
+ * that the operation is not a view.
113
+ */
114
+
115
+ #ifndef NUMPY_CORE_INCLUDE_NUMPY_EXPERIMENTAL_DTYPE_API_H_
116
+ #define NUMPY_CORE_INCLUDE_NUMPY_EXPERIMENTAL_DTYPE_API_H_
117
+
118
+ #include <Python.h>
119
+ #include "ndarraytypes.h"
120
+ #include "_dtype_api.h"
121
+
122
+ /*
123
+ * The contents of PyArrayMethodObject are currently opaque (is there a
124
+ * good way to make them be `PyObject *`?)
125
+ */
126
+ typedef struct PyArrayMethodObject_tag PyArrayMethodObject;
127
+
128
+ /*
129
+ * There must be a better way?! -- Oh well, this is experimental
130
+ * (my issue with it, is that I cannot undef those helpers).
131
+ */
132
+ #if defined(PY_ARRAY_UNIQUE_SYMBOL)
133
+ #define NPY_EXP_DTYPE_API_CONCAT_HELPER2(x, y) x ## y
134
+ #define NPY_EXP_DTYPE_API_CONCAT_HELPER(arg) NPY_EXP_DTYPE_API_CONCAT_HELPER2(arg, __experimental_dtype_api_table)
135
+ #define __experimental_dtype_api_table NPY_EXP_DTYPE_API_CONCAT_HELPER(PY_ARRAY_UNIQUE_SYMBOL)
136
+ #else
137
+ #define __experimental_dtype_api_table __experimental_dtype_api_table
138
+ #endif
139
+
140
+ /* Support for correct multi-file projects: */
141
+ #if defined(NO_IMPORT) || defined(NO_IMPORT_ARRAY)
142
+ extern void **__experimental_dtype_api_table;
143
+ #else
144
+ /*
145
+ * Just a hack so I don't forget importing as much myself, I spent way too
146
+ * much time noticing it the first time around :).
147
+ */
148
/*
 * Placeholder stored in every slot of the uninitialized API table: if the
 * header is used before `import_experimental_dtype_api()` succeeded, calling
 * through the table lands here and prints a loud banner to stdout.
 */
static void
__not_imported(void)
{
    fputs("*****\nCritical error, dtype API not imported\n*****\n", stdout);
}
153
+
154
+ static void *__uninitialized_table[] = {
155
+ &__not_imported, &__not_imported, &__not_imported, &__not_imported,
156
+ &__not_imported, &__not_imported, &__not_imported, &__not_imported};
157
+
158
+ #if defined(PY_ARRAY_UNIQUE_SYMBOL)
159
+ void **__experimental_dtype_api_table = __uninitialized_table;
160
+ #else
161
+ static void **__experimental_dtype_api_table = __uninitialized_table;
162
+ #endif
163
+ #endif
164
+
165
+
166
+ typedef int _ufunc_addloop_fromspec_func(
167
+ PyObject *ufunc, PyArrayMethod_Spec *spec);
168
+ /*
169
+ * The main ufunc registration function. This adds a new implementation/loop
170
+ * to a ufunc. It replaces `PyUFunc_RegisterLoopForType`.
171
+ */
172
+ #define PyUFunc_AddLoopFromSpec \
173
+ (*(_ufunc_addloop_fromspec_func *)(__experimental_dtype_api_table[0]))
174
+
175
+
176
+ /* Please see the NumPy definitions in `array_method.h` for details on these */
177
+ typedef int translate_given_descrs_func(int nin, int nout,
178
+ PyArray_DTypeMeta *wrapped_dtypes[],
179
+ PyArray_Descr *given_descrs[], PyArray_Descr *new_descrs[]);
180
+ typedef int translate_loop_descrs_func(int nin, int nout,
181
+ PyArray_DTypeMeta *new_dtypes[], PyArray_Descr *given_descrs[],
182
+ PyArray_Descr *original_descrs[], PyArray_Descr *loop_descrs[]);
183
+
184
+ typedef int _ufunc_wrapping_loop_func(PyObject *ufunc_obj,
185
+ PyArray_DTypeMeta *new_dtypes[], PyArray_DTypeMeta *wrapped_dtypes[],
186
+ translate_given_descrs_func *translate_given_descrs,
187
+ translate_loop_descrs_func *translate_loop_descrs);
188
+ #define PyUFunc_AddWrappingLoop \
189
+ (*(_ufunc_wrapping_loop_func *)(__experimental_dtype_api_table[7]))
190
+
191
+ /*
192
+ * Type of the C promoter function, which must be wrapped into a
193
+ * PyCapsule with name "numpy._ufunc_promoter".
194
+ *
195
+ * Note that currently the output dtypes are always NULL unless they are
196
+ * also part of the signature. This is an implementation detail and could
197
+ * change in the future. However, in general promoters should not have a
198
+ * need for output dtypes.
199
+ * (There are potential use-cases, these are currently unsupported.)
200
+ */
201
+ typedef int promoter_function(PyObject *ufunc,
202
+ PyArray_DTypeMeta *op_dtypes[], PyArray_DTypeMeta *signature[],
203
+ PyArray_DTypeMeta *new_op_dtypes[]);
204
+
205
+ /*
206
+ * Function to register a promoter.
207
+ *
208
+ * @param ufunc The ufunc object to register the promoter with.
209
+ * @param DType_tuple A Python tuple containing DTypes or None matching the
210
+ * number of inputs and outputs of the ufunc.
211
+ * @param promoter A PyCapsule with name "numpy._ufunc_promoter" containing
212
+ * a pointer to a `promoter_function`.
213
+ */
214
+ typedef int _ufunc_addpromoter_func(
215
+ PyObject *ufunc, PyObject *DType_tuple, PyObject *promoter);
216
+ #define PyUFunc_AddPromoter \
217
+ (*(_ufunc_addpromoter_func *)(__experimental_dtype_api_table[1]))
218
+
219
+ #define PyArrayDTypeMeta_Type \
220
+ (*(PyTypeObject *)__experimental_dtype_api_table[2])
221
+ typedef int __dtypemeta_fromspec(
222
+ PyArray_DTypeMeta *DType, PyArrayDTypeMeta_Spec *dtype_spec);
223
+ /*
224
+ * Finalize creation of a DTypeMeta. You must ensure that the DTypeMeta is
225
+ * a proper subclass. The DTypeMeta object has additional fields compared to
226
+ * a normal PyTypeObject!
227
+ * The only (easy) creation of a new DType is to create a static Type which
228
+ * inherits `PyArray_DescrType`, sets its type to `PyArrayDTypeMeta_Type` and
229
+ * uses `PyArray_DTypeMeta` defined above as the C-structure.
230
+ */
231
+ #define PyArrayInitDTypeMeta_FromSpec \
232
+ ((__dtypemeta_fromspec *)(__experimental_dtype_api_table[3]))
233
+
234
+
235
+ /*
236
+ * *************************************
237
+ * WORKING WITH DTYPES
238
+ * *************************************
239
+ */
240
+
241
+ typedef PyArray_DTypeMeta *__common_dtype(
242
+ PyArray_DTypeMeta *DType1, PyArray_DTypeMeta *DType2);
243
+ #define PyArray_CommonDType \
244
+ ((__common_dtype *)(__experimental_dtype_api_table[4]))
245
+
246
+
247
+ typedef PyArray_DTypeMeta *__promote_dtype_sequence(
248
+ npy_intp num, PyArray_DTypeMeta *DTypes[]);
249
+ #define PyArray_PromoteDTypeSequence \
250
+ ((__promote_dtype_sequence *)(__experimental_dtype_api_table[5]))
251
+
252
+
253
+ typedef PyArray_Descr *__get_default_descr(
254
+ PyArray_DTypeMeta *DType);
255
+ #define _PyArray_GetDefaultDescr \
256
+ ((__get_default_descr *)(__experimental_dtype_api_table[6]))
257
+
258
+ static inline PyArray_Descr *
259
+ PyArray_GetDefaultDescr(PyArray_DTypeMeta *DType)
260
+ {
261
+ if (DType->singleton != NULL) {
262
+ Py_INCREF(DType->singleton);
263
+ return DType->singleton;
264
+ }
265
+ return _PyArray_GetDefaultDescr(DType);
266
+ }
267
+
268
+
269
+ /*
270
+ * NumPy's builtin DTypes:
271
+ */
272
+ #define PyArray_BoolDType (*(PyArray_DTypeMeta *)__experimental_dtype_api_table[10])
273
+ /* Integers */
274
+ #define PyArray_ByteDType (*(PyArray_DTypeMeta *)__experimental_dtype_api_table[11])
275
+ #define PyArray_UByteDType (*(PyArray_DTypeMeta *)__experimental_dtype_api_table[12])
276
+ #define PyArray_ShortDType (*(PyArray_DTypeMeta *)__experimental_dtype_api_table[13])
277
+ #define PyArray_UShortDType (*(PyArray_DTypeMeta *)__experimental_dtype_api_table[14])
278
+ #define PyArray_IntDType (*(PyArray_DTypeMeta *)__experimental_dtype_api_table[15])
279
+ #define PyArray_UIntDType (*(PyArray_DTypeMeta *)__experimental_dtype_api_table[16])
280
+ #define PyArray_LongDType (*(PyArray_DTypeMeta *)__experimental_dtype_api_table[17])
281
+ #define PyArray_ULongDType (*(PyArray_DTypeMeta *)__experimental_dtype_api_table[18])
282
+ #define PyArray_LongLongDType (*(PyArray_DTypeMeta *)__experimental_dtype_api_table[19])
283
+ #define PyArray_ULongLongDType (*(PyArray_DTypeMeta *)__experimental_dtype_api_table[20])
284
+ /* Integer aliases */
285
+ #define PyArray_Int8Type (*(PyArray_DTypeMeta *)__experimental_dtype_api_table[21])
286
+ #define PyArray_UInt8DType (*(PyArray_DTypeMeta *)__experimental_dtype_api_table[22])
287
+ #define PyArray_Int16DType (*(PyArray_DTypeMeta *)__experimental_dtype_api_table[23])
288
+ #define PyArray_UInt16DType (*(PyArray_DTypeMeta *)__experimental_dtype_api_table[24])
289
+ #define PyArray_Int32DType (*(PyArray_DTypeMeta *)__experimental_dtype_api_table[25])
290
+ #define PyArray_UInt32DType (*(PyArray_DTypeMeta *)__experimental_dtype_api_table[26])
291
+ #define PyArray_Int64DType (*(PyArray_DTypeMeta *)__experimental_dtype_api_table[27])
292
+ #define PyArray_UInt64DType (*(PyArray_DTypeMeta *)__experimental_dtype_api_table[28])
293
+ #define PyArray_IntpDType (*(PyArray_DTypeMeta *)__experimental_dtype_api_table[29])
294
+ #define PyArray_UIntpDType (*(PyArray_DTypeMeta *)__experimental_dtype_api_table[30])
295
+ /* Floats */
296
+ #define PyArray_HalfType (*(PyArray_DTypeMeta *)__experimental_dtype_api_table[31])
297
+ #define PyArray_FloatDType (*(PyArray_DTypeMeta *)__experimental_dtype_api_table[32])
298
+ #define PyArray_DoubleDType (*(PyArray_DTypeMeta *)__experimental_dtype_api_table[33])
299
+ #define PyArray_LongDoubleDType (*(PyArray_DTypeMeta *)__experimental_dtype_api_table[34])
300
+ /* Complex */
301
+ #define PyArray_CFloatDType (*(PyArray_DTypeMeta *)__experimental_dtype_api_table[35])
302
+ #define PyArray_CDoubleDType (*(PyArray_DTypeMeta *)__experimental_dtype_api_table[36])
303
+ #define PyArray_CLongDoubleDType (*(PyArray_DTypeMeta *)__experimental_dtype_api_table[37])
304
+ /* String/Bytes */
305
+ #define PyArray_StringDType (*(PyArray_DTypeMeta *)__experimental_dtype_api_table[38])
306
+ #define PyArray_UnicodeDType (*(PyArray_DTypeMeta *)__experimental_dtype_api_table[39])
307
+ /* Datetime/Timedelta */
308
+ #define PyArray_DatetimeDType (*(PyArray_DTypeMeta *)__experimental_dtype_api_table[40])
309
+ #define PyArray_TimedeltaDType (*(PyArray_DTypeMeta *)__experimental_dtype_api_table[41])
310
+ /* Object/Void */
311
+ #define PyArray_ObjectDType (*(PyArray_DTypeMeta *)__experimental_dtype_api_table[42])
312
+ #define PyArray_VoidDType (*(PyArray_DTypeMeta *)__experimental_dtype_api_table[43])
313
+
314
+ /*
315
+ * ********************************
316
+ * Initialization
317
+ * ********************************
318
+ *
319
+ * Import the experimental API, the version must match the one defined in
320
+ * the header to ensure changes are taken into account. NumPy will further
321
+ * runtime-check this.
322
+ * You must call this function to use the symbols defined in this file.
323
+ */
324
+ #if !defined(NO_IMPORT) && !defined(NO_IMPORT_ARRAY)
325
+
326
+ static int
327
+ import_experimental_dtype_api(int version)
328
+ {
329
+ if (version != __EXPERIMENTAL_DTYPE_API_VERSION) {
330
+ PyErr_Format(PyExc_RuntimeError,
331
+ "DType API version %d did not match header version %d. Please "
332
+ "update the import statement and check for API changes.",
333
+ version, __EXPERIMENTAL_DTYPE_API_VERSION);
334
+ return -1;
335
+ }
336
+ if (__experimental_dtype_api_table != __uninitialized_table) {
337
+ /* already imported. */
338
+ return 0;
339
+ }
340
+
341
+ PyObject *multiarray = PyImport_ImportModule("numpy.core._multiarray_umath");
342
+ if (multiarray == NULL) {
343
+ return -1;
344
+ }
345
+
346
+ PyObject *api = PyObject_CallMethod(multiarray,
347
+ "_get_experimental_dtype_api", "i", version);
348
+ Py_DECREF(multiarray);
349
+ if (api == NULL) {
350
+ return -1;
351
+ }
352
+ __experimental_dtype_api_table = (void **)PyCapsule_GetPointer(api,
353
+ "experimental_dtype_api_table");
354
+ Py_DECREF(api);
355
+
356
+ if (__experimental_dtype_api_table == NULL) {
357
+ __experimental_dtype_api_table = __uninitialized_table;
358
+ return -1;
359
+ }
360
+ return 0;
361
+ }
362
+
363
+ #endif /* !defined(NO_IMPORT) && !defined(NO_IMPORT_ARRAY) */
364
+
365
+ #endif /* NUMPY_CORE_INCLUDE_NUMPY_EXPERIMENTAL_DTYPE_API_H_ */
LTA_openwebtext_dualt/mini_owt_logdirichlet/.venv_qwen35_uv/lib/python3.12/site-packages/numpy/core/include/numpy/halffloat.h ADDED
@@ -0,0 +1,70 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #ifndef NUMPY_CORE_INCLUDE_NUMPY_HALFFLOAT_H_
2
+ #define NUMPY_CORE_INCLUDE_NUMPY_HALFFLOAT_H_
3
+
4
+ #include <Python.h>
5
+ #include <numpy/npy_math.h>
6
+
7
+ #ifdef __cplusplus
8
+ extern "C" {
9
+ #endif
10
+
11
+ /*
12
+ * Half-precision routines
13
+ */
14
+
15
+ /* Conversions */
16
+ float npy_half_to_float(npy_half h);
17
+ double npy_half_to_double(npy_half h);
18
+ npy_half npy_float_to_half(float f);
19
+ npy_half npy_double_to_half(double d);
20
+ /* Comparisons */
21
+ int npy_half_eq(npy_half h1, npy_half h2);
22
+ int npy_half_ne(npy_half h1, npy_half h2);
23
+ int npy_half_le(npy_half h1, npy_half h2);
24
+ int npy_half_lt(npy_half h1, npy_half h2);
25
+ int npy_half_ge(npy_half h1, npy_half h2);
26
+ int npy_half_gt(npy_half h1, npy_half h2);
27
+ /* faster *_nonan variants for when you know h1 and h2 are not NaN */
28
+ int npy_half_eq_nonan(npy_half h1, npy_half h2);
29
+ int npy_half_lt_nonan(npy_half h1, npy_half h2);
30
+ int npy_half_le_nonan(npy_half h1, npy_half h2);
31
+ /* Miscellaneous functions */
32
+ int npy_half_iszero(npy_half h);
33
+ int npy_half_isnan(npy_half h);
34
+ int npy_half_isinf(npy_half h);
35
+ int npy_half_isfinite(npy_half h);
36
+ int npy_half_signbit(npy_half h);
37
+ npy_half npy_half_copysign(npy_half x, npy_half y);
38
+ npy_half npy_half_spacing(npy_half h);
39
+ npy_half npy_half_nextafter(npy_half x, npy_half y);
40
+ npy_half npy_half_divmod(npy_half x, npy_half y, npy_half *modulus);
41
+
42
+ /*
43
+ * Half-precision constants
44
+ */
45
+
46
+ #define NPY_HALF_ZERO (0x0000u)
47
+ #define NPY_HALF_PZERO (0x0000u)
48
+ #define NPY_HALF_NZERO (0x8000u)
49
+ #define NPY_HALF_ONE (0x3c00u)
50
+ #define NPY_HALF_NEGONE (0xbc00u)
51
+ #define NPY_HALF_PINF (0x7c00u)
52
+ #define NPY_HALF_NINF (0xfc00u)
53
+ #define NPY_HALF_NAN (0x7e00u)
54
+
55
+ #define NPY_MAX_HALF (0x7bffu)
56
+
57
+ /*
58
+ * Bit-level conversions
59
+ */
60
+
61
+ npy_uint16 npy_floatbits_to_halfbits(npy_uint32 f);
62
+ npy_uint16 npy_doublebits_to_halfbits(npy_uint64 d);
63
+ npy_uint32 npy_halfbits_to_floatbits(npy_uint16 h);
64
+ npy_uint64 npy_halfbits_to_doublebits(npy_uint16 h);
65
+
66
+ #ifdef __cplusplus
67
+ }
68
+ #endif
69
+
70
+ #endif /* NUMPY_CORE_INCLUDE_NUMPY_HALFFLOAT_H_ */
LTA_openwebtext_dualt/mini_owt_logdirichlet/.venv_qwen35_uv/lib/python3.12/site-packages/numpy/core/include/numpy/npy_cpu.h ADDED
@@ -0,0 +1,129 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ * This sets (target) CPU-specific macros:
3
+ * - Possible values:
4
+ * NPY_CPU_X86
5
+ * NPY_CPU_AMD64
6
+ * NPY_CPU_PPC
7
+ * NPY_CPU_PPC64
8
+ * NPY_CPU_PPC64LE
9
+ * NPY_CPU_SPARC
10
+ * NPY_CPU_S390
11
+ * NPY_CPU_IA64
12
+ * NPY_CPU_HPPA
13
+ * NPY_CPU_ALPHA
14
+ * NPY_CPU_ARMEL
15
+ * NPY_CPU_ARMEB
16
+ * NPY_CPU_SH_LE
17
+ * NPY_CPU_SH_BE
18
+ * NPY_CPU_ARCEL
19
+ * NPY_CPU_ARCEB
20
+ * NPY_CPU_RISCV64
21
+ * NPY_CPU_LOONGARCH
22
+ * NPY_CPU_WASM
23
+ */
24
+ #ifndef NUMPY_CORE_INCLUDE_NUMPY_NPY_CPU_H_
25
+ #define NUMPY_CORE_INCLUDE_NUMPY_NPY_CPU_H_
26
+
27
+ #include "numpyconfig.h"
28
+
29
+ #if defined( __i386__ ) || defined(i386) || defined(_M_IX86)
30
+ /*
31
+ * __i386__ is defined by gcc and Intel compiler on Linux,
32
+ * _M_IX86 by VS compiler,
33
+ * i386 by Sun compilers on opensolaris at least
34
+ */
35
+ #define NPY_CPU_X86
36
+ #elif defined(__x86_64__) || defined(__amd64__) || defined(__x86_64) || defined(_M_AMD64)
37
+ /*
38
+ * both __x86_64__ and __amd64__ are defined by gcc
39
+ * __x86_64 defined by sun compiler on opensolaris at least
40
+ * _M_AMD64 defined by MS compiler
41
+ */
42
+ #define NPY_CPU_AMD64
43
+ #elif defined(__powerpc64__) && defined(__LITTLE_ENDIAN__)
44
+ #define NPY_CPU_PPC64LE
45
+ #elif defined(__powerpc64__) && defined(__BIG_ENDIAN__)
46
+ #define NPY_CPU_PPC64
47
+ #elif defined(__ppc__) || defined(__powerpc__) || defined(_ARCH_PPC)
48
+ /*
49
+ * __ppc__ is defined by gcc, I remember having seen __powerpc__ once,
50
+ * but can't find it ATM
51
+ * _ARCH_PPC is used by at least gcc on AIX
52
+ * As __powerpc__ and _ARCH_PPC are also defined by PPC64 check
53
+ * for those specifically first before defaulting to ppc
54
+ */
55
+ #define NPY_CPU_PPC
56
+ #elif defined(__sparc__) || defined(__sparc)
57
+ /* __sparc__ is defined by gcc and Forte (e.g. Sun) compilers */
58
+ #define NPY_CPU_SPARC
59
+ #elif defined(__s390__)
60
+ #define NPY_CPU_S390
61
+ #elif defined(__ia64)
62
+ #define NPY_CPU_IA64
63
+ #elif defined(__hppa)
64
+ #define NPY_CPU_HPPA
65
+ #elif defined(__alpha__)
66
+ #define NPY_CPU_ALPHA
67
+ #elif defined(__arm__) || defined(__aarch64__) || defined(_M_ARM64)
68
+ /* _M_ARM64 is defined in MSVC for ARM64 compilation on Windows */
69
+ #if defined(__ARMEB__) || defined(__AARCH64EB__)
70
+ #if defined(__ARM_32BIT_STATE)
71
+ #define NPY_CPU_ARMEB_AARCH32
72
+ #elif defined(__ARM_64BIT_STATE)
73
+ #define NPY_CPU_ARMEB_AARCH64
74
+ #else
75
+ #define NPY_CPU_ARMEB
76
+ #endif
77
+ #elif defined(__ARMEL__) || defined(__AARCH64EL__) || defined(_M_ARM64)
78
+ #if defined(__ARM_32BIT_STATE)
79
+ #define NPY_CPU_ARMEL_AARCH32
80
+ #elif defined(__ARM_64BIT_STATE) || defined(_M_ARM64) || defined(__AARCH64EL__)
81
+ #define NPY_CPU_ARMEL_AARCH64
82
+ #else
83
+ #define NPY_CPU_ARMEL
84
+ #endif
85
+ #else
86
+ # error Unknown ARM CPU, please report this to numpy maintainers with \
87
+ information about your platform (OS, CPU and compiler)
88
+ #endif
89
+ #elif defined(__sh__) && defined(__LITTLE_ENDIAN__)
90
+ #define NPY_CPU_SH_LE
91
+ #elif defined(__sh__) && defined(__BIG_ENDIAN__)
92
+ #define NPY_CPU_SH_BE
93
+ #elif defined(__MIPSEL__)
94
+ #define NPY_CPU_MIPSEL
95
+ #elif defined(__MIPSEB__)
96
+ #define NPY_CPU_MIPSEB
97
+ #elif defined(__or1k__)
98
+ #define NPY_CPU_OR1K
99
+ #elif defined(__mc68000__)
100
+ #define NPY_CPU_M68K
101
+ #elif defined(__arc__) && defined(__LITTLE_ENDIAN__)
102
+ #define NPY_CPU_ARCEL
103
+ #elif defined(__arc__) && defined(__BIG_ENDIAN__)
104
+ #define NPY_CPU_ARCEB
105
+ #elif defined(__riscv) && defined(__riscv_xlen) && __riscv_xlen == 64
106
+ #define NPY_CPU_RISCV64
107
+ #elif defined(__loongarch__)
108
+ #define NPY_CPU_LOONGARCH
109
+ #elif defined(__EMSCRIPTEN__)
110
+ /* __EMSCRIPTEN__ is defined by emscripten: an LLVM-to-Web compiler */
111
+ #define NPY_CPU_WASM
112
+ #else
113
+ #error Unknown CPU, please report this to numpy maintainers with \
114
+ information about your platform (OS, CPU and compiler)
115
+ #endif
116
+
117
+ /*
118
+ * Except for the following architectures, memory access is limited to the natural
119
+ * alignment of data types otherwise it may lead to bus error or performance regression.
120
+ * For more details about unaligned access, see https://www.kernel.org/doc/Documentation/unaligned-memory-access.txt.
121
+ */
122
+ #if defined(NPY_CPU_X86) || defined(NPY_CPU_AMD64) || defined(__aarch64__) || defined(__powerpc64__)
123
+ #define NPY_ALIGNMENT_REQUIRED 0
124
+ #endif
125
+ #ifndef NPY_ALIGNMENT_REQUIRED
126
+ #define NPY_ALIGNMENT_REQUIRED 1
127
+ #endif
128
+
129
+ #endif /* NUMPY_CORE_INCLUDE_NUMPY_NPY_CPU_H_ */
LTA_openwebtext_dualt/mini_owt_logdirichlet/.venv_qwen35_uv/lib/python3.12/site-packages/numpy/core/include/numpy/npy_interrupt.h ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ * This API is only provided because it is part of publicly exported
3
+ * headers. Its use is considered DEPRECATED, and it will be removed
4
+ * eventually.
5
+ * (This includes the _PyArray_SigintHandler and _PyArray_GetSigintBuf
6
+ * functions which are however, public API, and not headers.)
7
+ *
8
+ * Instead of using these non-threadsafe macros consider periodically
9
+ * querying `PyErr_CheckSignals()` or `PyOS_InterruptOccurred()`.
10
+ * Both of these require holding the GIL, although cpython could add a
11
+ * version of `PyOS_InterruptOccurred()` which does not. Such a version
12
+ * actually exists as private API in Python 3.10, and backported to 3.9 and 3.8,
13
+ * see also https://bugs.python.org/issue41037 and
14
+ * https://github.com/python/cpython/pull/20599).
15
+ */
16
+
17
+ #ifndef NUMPY_CORE_INCLUDE_NUMPY_NPY_INTERRUPT_H_
18
+ #define NUMPY_CORE_INCLUDE_NUMPY_NPY_INTERRUPT_H_
19
+
20
+ #ifndef NPY_NO_SIGNAL
21
+
22
+ #include <setjmp.h>
23
+ #include <signal.h>
24
+
25
+ #ifndef sigsetjmp
26
+
27
+ #define NPY_SIGSETJMP(arg1, arg2) setjmp(arg1)
28
+ #define NPY_SIGLONGJMP(arg1, arg2) longjmp(arg1, arg2)
29
+ #define NPY_SIGJMP_BUF jmp_buf
30
+
31
+ #else
32
+
33
+ #define NPY_SIGSETJMP(arg1, arg2) sigsetjmp(arg1, arg2)
34
+ #define NPY_SIGLONGJMP(arg1, arg2) siglongjmp(arg1, arg2)
35
+ #define NPY_SIGJMP_BUF sigjmp_buf
36
+
37
+ #endif
38
+
39
+ # define NPY_SIGINT_ON { \
40
+ PyOS_sighandler_t _npy_sig_save; \
41
+ _npy_sig_save = PyOS_setsig(SIGINT, _PyArray_SigintHandler); \
42
+ if (NPY_SIGSETJMP(*((NPY_SIGJMP_BUF *)_PyArray_GetSigintBuf()), \
43
+ 1) == 0) { \
44
+
45
+ # define NPY_SIGINT_OFF } \
46
+ PyOS_setsig(SIGINT, _npy_sig_save); \
47
+ }
48
+
49
+ #else /* NPY_NO_SIGNAL */
50
+
51
+ #define NPY_SIGINT_ON
52
+ #define NPY_SIGINT_OFF
53
+
54
+ #endif /* NPY_NO_SIGNAL */
55
+
56
+ #endif /* NUMPY_CORE_INCLUDE_NUMPY_NPY_INTERRUPT_H_ */
LTA_openwebtext_dualt/mini_owt_logdirichlet/.venv_qwen35_uv/lib/python3.12/site-packages/numpy/core/include/numpy/npy_no_deprecated_api.h ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ * This include file is provided for inclusion in Cython *.pxd files where
3
+ * one would like to define the NPY_NO_DEPRECATED_API macro. It can be
4
+ * included by
5
+ *
6
+ * cdef extern from "npy_no_deprecated_api.h": pass
7
+ *
8
+ */
9
+ #ifndef NPY_NO_DEPRECATED_API
10
+
11
+ /* put this check here since there may be multiple includes in C extensions. */
12
+ #if defined(NUMPY_CORE_INCLUDE_NUMPY_NDARRAYTYPES_H_) || \
13
+ defined(NUMPY_CORE_INCLUDE_NUMPY_NPY_DEPRECATED_API_H) || \
14
+ defined(NUMPY_CORE_INCLUDE_NUMPY_OLD_DEFINES_H_)
15
+ #error "npy_no_deprecated_api.h" must be first among numpy includes.
16
+ #else
17
+ #define NPY_NO_DEPRECATED_API NPY_API_VERSION
18
+ #endif
19
+
20
+ #endif /* NPY_NO_DEPRECATED_API */
LTA_openwebtext_dualt/mini_owt_logdirichlet/.venv_qwen35_uv/lib/python3.12/site-packages/numpy/core/include/numpy/utils.h ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #ifndef NUMPY_CORE_INCLUDE_NUMPY_UTILS_H_
2
+ #define NUMPY_CORE_INCLUDE_NUMPY_UTILS_H_
3
+
4
+ #ifndef __COMP_NPY_UNUSED
5
+ #if defined(__GNUC__)
6
+ #define __COMP_NPY_UNUSED __attribute__ ((__unused__))
7
+ #elif defined(__ICC)
8
+ #define __COMP_NPY_UNUSED __attribute__ ((__unused__))
9
+ #elif defined(__clang__)
10
+ #define __COMP_NPY_UNUSED __attribute__ ((unused))
11
+ #else
12
+ #define __COMP_NPY_UNUSED
13
+ #endif
14
+ #endif
15
+
16
+ #if defined(__GNUC__) || defined(__ICC) || defined(__clang__)
17
+ #define NPY_DECL_ALIGNED(x) __attribute__ ((aligned (x)))
18
+ #elif defined(_MSC_VER)
19
+ #define NPY_DECL_ALIGNED(x) __declspec(align(x))
20
+ #else
21
+ #define NPY_DECL_ALIGNED(x)
22
+ #endif
23
+
24
+ /* Use this to tag a variable as not used. It will remove unused variable
25
+ * warning on supported platforms (see __COMP_NPY_UNUSED) and mangle the variable
26
+ * to avoid accidental use */
27
+ #define NPY_UNUSED(x) __NPY_UNUSED_TAGGED ## x __COMP_NPY_UNUSED
28
+ #define NPY_EXPAND(x) x
29
+
30
+ #define NPY_STRINGIFY(x) #x
31
+ #define NPY_TOSTRING(x) NPY_STRINGIFY(x)
32
+
33
+ #define NPY_CAT__(a, b) a ## b
34
+ #define NPY_CAT_(a, b) NPY_CAT__(a, b)
35
+ #define NPY_CAT(a, b) NPY_CAT_(a, b)
36
+
37
+ #endif /* NUMPY_CORE_INCLUDE_NUMPY_UTILS_H_ */
LTA_openwebtext_dualt/scripts/flowtext_score_decode_lab.py ADDED
@@ -0,0 +1,129 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """Score decode-lab samples with an external causal LM in one model load."""
3
+
4
+ from __future__ import annotations
5
+
6
+ import argparse
7
+ import json
8
+ import math
9
+ from collections import defaultdict
10
+ from pathlib import Path
11
+
12
+ import torch
13
+ import torch.nn.functional as F
14
+
15
+
16
@torch.no_grad()
def score_texts(texts, model_name_or_path: str, batch_size: int, max_length: int, device: torch.device):
    """Score each text with a causal LM loaded once from *model_name_or_path*.

    Texts are tokenized in batches of ``batch_size`` (padded, truncated to
    ``max_length``) and the per-token negative log-likelihood of every
    next-token prediction is computed. For each text the NLL is averaged over
    the attended positions (excluding position 0) that are either non-EOS
    tokens or the first EOS token of the sequence.

    Returns a list with one ``(ppl, mean_nll, token_count)`` tuple per input
    text, where ``ppl = exp(min(mean_nll, 50.0))``. Texts with no scoreable
    token yield ``(None, None, 0)``.
    """
    from transformers import AutoModelForCausalLM, AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
    if tokenizer.pad_token_id is None:
        # Reuse EOS as padding so batches can be padded at all.
        tokenizer.pad_token = tokenizer.eos_token
        tokenizer.pad_token_id = tokenizer.eos_token_id
    lm = AutoModelForCausalLM.from_pretrained(model_name_or_path).to(device)
    if getattr(lm.config, "pad_token_id", None) is None and tokenizer.pad_token_id is not None:
        lm.config.pad_token_id = tokenizer.pad_token_id
    lm.eval()

    eos_id = tokenizer.eos_token_id
    results = []
    for offset in range(0, len(texts), batch_size):
        chunk = texts[offset : offset + batch_size]
        encoded = tokenizer(
            chunk,
            return_tensors="pt",
            return_attention_mask=True,
            return_token_type_ids=False,
            padding=True,
            truncation=True,
            max_length=max_length,
        ).to(device)
        ids = encoded["input_ids"]
        attended = encoded["attention_mask"].bool()
        if ids.size(1) < 2:
            # A single position leaves nothing to predict.
            results.extend((None, None, 0) for _ in chunk)
            continue

        # cross_entropy expects the class dimension second: (batch, vocab, seq).
        logits = lm(input_ids=ids, attention_mask=attended).logits.transpose(-1, -2)
        per_token_nll = F.cross_entropy(logits[..., :-1].float(), ids[..., 1:], reduction="none")

        keep = attended[..., 1:]
        if eos_id is not None:
            is_eos = ids == eos_id
            # True from the first EOS until just before the second one.
            within_first_eos = is_eos.cumsum(-1) == 1
            keep = (within_first_eos[..., 1:] | ~is_eos[..., 1:]) & keep

        for nll_row, keep_row in zip(per_token_nll, keep):
            n_tokens = int(keep_row.sum().detach().cpu())
            if n_tokens <= 0:
                results.append((None, None, 0))
                continue
            mean_nll = float(nll_row[keep_row].sum().detach().cpu()) / n_tokens
            # Clamp before exponentiating so the perplexity stays finite.
            results.append((float(math.exp(min(mean_nll, 50.0))), mean_nll, n_tokens))
    return results
63
+
64
+
65
def main():
    """Score decode-lab samples from a JSONL file and report the best ones.

    Reads ``--input`` (one JSON object per line), keeps the records whose
    ``type`` is ``"sample"``, scores their ``text`` with the ``--scorer``
    causal LM via ``score_texts`` and attaches ``external_ppl``,
    ``external_nll``, ``external_tokens`` and ``combined_score`` to each
    record. Prints a per-config summary and the ``--topk`` best samples, and
    optionally writes the enriched records as JSONL to ``--output``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--input", required=True)
    parser.add_argument("--scorer", required=True)
    parser.add_argument("--batch_size", type=int, default=4)
    parser.add_argument("--max_length", type=int, default=512)
    parser.add_argument("--topk", type=int, default=12)
    parser.add_argument("--output", default="")
    args = parser.parse_args()

    # Read as UTF-8 and close the handle deterministically; the output below
    # is written with ensure_ascii=False, so both sides must agree on UTF-8
    # instead of depending on the locale's default encoding.
    with Path(args.input).open(encoding="utf-8") as f:
        rows = [json.loads(line) for line in f if line.strip()]
    samples = [r for r in rows if r.get("type") == "sample"]
    texts = [r["text"] for r in samples]
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"[info] scoring {len(texts)} samples on {device} with {args.scorer}", flush=True)
    scored = score_texts(texts, args.scorer, args.batch_size, args.max_length, device)

    enriched = []
    for row, (ppl, nll, count) in zip(samples, scored):
        rec = dict(row)
        rec["external_ppl"] = ppl
        rec["external_nll"] = nll
        rec["external_tokens"] = count
        # Conservative selection: prioritize samples that are both non-collapsed
        # and plausible under an external LM. Unscoreable samples (nll is None)
        # take a flat penalty of 10.0.
        rec["combined_score"] = float(rec["quality"]) - (0.08 * float(nll) if nll is not None else 10.0)
        enriched.append(rec)

    by_label = defaultdict(list)
    for rec in enriched:
        by_label[rec["config"]["label"]].append(rec)
    print("\n== Config Summary ==")
    for label, group in sorted(by_label.items()):
        valid = [g for g in group if g["external_ppl"] is not None]
        if valid:
            mean_ppl = sum(g["external_ppl"] for g in valid) / len(valid)
            mean_nll = sum(g["external_nll"] for g in valid) / len(valid)
        else:
            # No scoreable sample: report NaN rather than an impossible 0.
            mean_ppl = mean_nll = float("nan")
        mean_quality = sum(float(g["quality"]) for g in group) / max(len(group), 1)
        best = max(group, key=lambda g: g["combined_score"])
        print(
            f"{label:24s} mean_quality={mean_quality:7.4f} "
            f"mean_ppl={mean_ppl:8.2f} mean_nll={mean_nll:6.3f} "
            f"best_prompt={best['prompt']!r} best_combined={best['combined_score']:.4f}"
        )

    print("\n== Top Samples ==")
    enriched.sort(key=lambda r: r["combined_score"], reverse=True)
    for rec in enriched[: args.topk]:
        ppl = rec["external_ppl"]
        # Unscoreable samples carry None; formatting None with ':.2f' would
        # raise TypeError when such a sample reaches the top-k listing.
        ppl_text = f"{ppl:.2f}" if ppl is not None else "n/a"
        print(
            "\n"
            + "=" * 96
            + f"\nlabel={rec['config']['label']} prompt={rec['prompt']!r} "
            + f"quality={float(rec['quality']):.4f} ppl={ppl_text} "
            + f"rep3={rec['rep3']:.3f} d2={rec['distinct2']:.3f} combined={rec['combined_score']:.4f}\n"
            + rec["text"]
        )

    if args.output:
        with Path(args.output).open("w", encoding="utf-8") as f:
            for rec in enriched:
                f.write(json.dumps(rec, ensure_ascii=False) + "\n")
        print(f"\n[done] wrote {args.output}")
126
+
127
+
128
+ if __name__ == "__main__":
129
+ main()
LTA_openwebtext_dualt/scripts/launch_lta_owt_from_lm1b_c1024_4gpu.sh ADDED
@@ -0,0 +1,85 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/usr/bin/env bash
# Launch train.py on OpenWebText (max_len 1024) through torch.distributed.run,
# passing an LM1B checkpoint as --init_model_path and teeing all trainer output
# into "${LOG_DIR}/${RUN_NAME}.log".
#
# Environment overrides (defaults in parentheses):
#   CUDA_VISIBLE_DEVICES (0,1,2,3)   NPROC_PER_NODE (4)      OMP_NUM_THREADS (1)
#   RUN_NAME (timestamped)           SAVE_DIR                LOG_DIR
#   LM1B_CKPT                        TOTAL_STEPS (20000)     MASTER_PORT (32043)
#   PER_GPU_BATCH_SIZE (16)          GLOBAL_BATCH_SIZE (512)
set -euo pipefail

cd /e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt

export CUDA_VISIBLE_DEVICES="${CUDA_VISIBLE_DEVICES:-0,1,2,3}"
export OMP_NUM_THREADS="${OMP_NUM_THREADS:-1}"

# Worker count used to be hard-coded to 4 on the launcher command line; it is
# now overridable so it can be kept in step with a custom CUDA_VISIBLE_DEVICES.
NPROC_PER_NODE="${NPROC_PER_NODE:-4}"

RUN_NAME="${RUN_NAME:-lta_owt_distilbert_len1024_init_lm1b1m_posemb_repeat_fully_c1024_adamw_gbs512_4gpu_20k_$(date +%Y%m%d_%H%M%S)}"
SAVE_DIR="${SAVE_DIR:-runs_transfer/${RUN_NAME}}"
LOG_DIR="${LOG_DIR:-logs/owt_from_lm1b_c1024_4gpu}"
mkdir -p "${LOG_DIR}"

LM1B_CKPT="${LM1B_CKPT:-runs/lta_lm1b_dirichlet_categorical_fullvocab_c1024_fullycoupled_flmpack_onehot_hardce_ddit_small_len128_gbs512_8gpu_1m_nw0/step_1000000.pt}"
TOTAL_STEPS="${TOTAL_STEPS:-20000}"
PER_GPU_BATCH_SIZE="${PER_GPU_BATCH_SIZE:-16}"
GLOBAL_BATCH_SIZE="${GLOBAL_BATCH_SIZE:-512}"
MASTER_PORT="${MASTER_PORT:-32043}"

# pipefail (set above) makes the script fail when the trainer fails, even
# though tee is the last command of the pipeline.
python -m torch.distributed.run --nproc_per_node="${NPROC_PER_NODE}" --master_port="${MASTER_PORT}" train.py \
  --data_path /e2e-data/evad-tech-vla/wanghan58/data/small_benchmarks/langflow_2604_11748/openwebtext \
  --openwebtext_split train_minus_100k \
  --text_column text \
  --detokenizer auto \
  --tokenizer_path /e2e-data/evad-tech-vla/wanghan58/workspace/imagenet_handoff_20260327/nlp_dts_light/assets/distilbert-base-uncased/tokenizer.json \
  --save_dir "${SAVE_DIR}" \
  --wrap \
  --wrap_mode stream \
  --max_len 1024 \
  --batch_size "${PER_GPU_BATCH_SIZE}" \
  --global_batch_size "${GLOBAL_BATCH_SIZE}" \
  --num_workers 4 \
  --dataloader_prefetch_factor 4 \
  --total_steps "${TOTAL_STEPS}" \
  --warmup_steps 2500 \
  --log_every 50 \
  --eval_every 0 \
  --save_every 1000 \
  --latest_every 500 \
  --init_model_path "${LM1B_CKPT}" \
  --init_pos_embed_mode repeat \
  --lr 0.0003 \
  --lr_schedule constant_warmup \
  --weight_decay 0.0 \
  --adam_beta1 0.9 \
  --adam_beta2 0.999 \
  --adam_eps 1e-8 \
  --grad_clip 1.0 \
  --d_model 768 \
  --cond_dim 128 \
  --n_layers 12 \
  --n_heads 12 \
  --dim_ff 3072 \
  --dropout 0.1 \
  --model_type ddit \
  --state_format prob \
  --bridge dirichlet \
  --target_loss hard_ce \
  --target_prob 1.0 \
  --min_t 0.0 \
  --max_t 1.0 \
  --dual_t \
  --corrupt_t_mode same \
  --min_mask_ratio 0.1 \
  --max_mask_ratio 1.0 \
  --wrong_token_replace_prob 1.0 \
  --wrong_token_schedule linear_t \
  --wrong_token_exp_k 1.0 \
  --dirichlet_concentration_min 1.0 \
  --dirichlet_concentration_max 1024 \
  --dirichlet_endpoint_mode categorical_dual_t \
  --dirichlet_semantic_t_mode same \
  --categorical_wrong_from_full_vocab \
  --simplex_bridge_sampler dirichlet \
  --infer_steps 128 \
  --decode_damping 1.0 \
  --max_gamma 1.0 \
  --decode_solver flowmap \
  --noise_init logistic_normal \
  --bridge_noise_init logistic_normal \
  --noise_sigma -1 \
  --allow_tf32 \
  --ddp_gradient_as_bucket_view \
  --bf16 \
  2>&1 | tee "${LOG_DIR}/${RUN_NAME}.log"
LTA_openwebtext_dualt/scripts/launch_lta_wmt14_deen_fullycoupled_4gpu_smoke.sh ADDED
@@ -0,0 +1,113 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/usr/bin/env bash
# Smoke-test launcher: runs train.py for a short number of steps on the
# WMT14 De-En (T5-tokenised) dataset via torch.distributed.run, appending all
# output to ${LOG_FILE}. All quoted settings below accept environment overrides.
set -euo pipefail

cd /e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt
export PYTHONPATH="$(pwd)${PYTHONPATH:+:$PYTHONPATH}"
export TOKENIZERS_PARALLELISM=false
export PYTHONUNBUFFERED=1
export OMP_NUM_THREADS="${OMP_NUM_THREADS:-1}"
export CUDA_VISIBLE_DEVICES="${CUDA_VISIBLE_DEVICES:-0,1,2,3}"

# --- run identity and output locations -------------------------------------
RUN_NAME="${RUN_NAME:-smoke_lta_wmt14_deen_t5_len128_fullycoupled_elfparams_4gpu_$(date +%Y%m%d_%H%M%S)}"
SAVE_DIR="${SAVE_DIR:-runs/${RUN_NAME}}"
LOG_DIR="${LOG_DIR:-logs/wmt14_deen_fullycoupled_smoke}"
LOG_FILE="${LOG_FILE:-${LOG_DIR}/${RUN_NAME}.log}"
mkdir -p "${LOG_DIR}" "${SAVE_DIR}"

# --- data and tokenizer locations -------------------------------------------
DATA_ROOT="${DATA_ROOT:-/e2e-data/evad-tech-vla/wanghan58/data/small_benchmarks/elf}"
DATA_PATH="${DATA_PATH:-${DATA_ROOT}/wmt14_de-en_train_t5}"
EVAL_DATA_PATH="${EVAL_DATA_PATH:-${DATA_ROOT}/wmt14_de-en_validation_t5}"
DATASET_CACHE_DIR="${DATASET_CACHE_DIR:-/e2e-data/evad-tech-vla/wanghan58/data/hf_cache}"
TOKENIZER_PATH="${TOKENIZER_PATH:-/e2e-data/evad-tech-vla/wanghan58/models/hf/t5-small/tokenizer.json}"

# --- launcher and schedule knobs --------------------------------------------
NPROC_PER_NODE="${NPROC_PER_NODE:-4}"
MASTER_PORT="${MASTER_PORT:-32072}"
GLOBAL_BATCH_SIZE="${GLOBAL_BATCH_SIZE:-512}"
PER_GPU_BATCH_SIZE="${PER_GPU_BATCH_SIZE:-32}"
TOTAL_STEPS="${TOTAL_STEPS:-50}"
WARMUP_STEPS="${WARMUP_STEPS:-20}"
MAX_RECORDS="${MAX_RECORDS:-4096}"

# Guard clause: bail out with status 2 when the training dataset is not staged.
[[ -e "${DATA_PATH}" ]] || {
  echo "Missing WMT14 De-En train dataset at ${DATA_PATH}" >&2
  echo "Stage embedded-language-flows/wmt14_de-en_train_t5 locally, then rerun." >&2
  exit 2
}

launcher=(
  python -m torch.distributed.run
  --nproc_per_node="${NPROC_PER_NODE}"
  --master_port="${MASTER_PORT}"
)

train_args=(
  # data / io
  --elf_conditional_hf
  --data_path "${DATA_PATH}"
  --eval_data_path "${EVAL_DATA_PATH}"
  --dataset_cache_dir "${DATASET_CACHE_DIR}"
  --tokenizer_path "${TOKENIZER_PATH}"
  --save_dir "${SAVE_DIR}"
  --max_len 128
  --max_input_len 64
  --conditional_pad_token eos
  --label_drop_prob 0.1
  --max_records "${MAX_RECORDS}"
  --batch_size "${PER_GPU_BATCH_SIZE}"
  --global_batch_size "${GLOBAL_BATCH_SIZE}"
  --num_workers 4
  --dataloader_prefetch_factor 4
  # schedule / checkpointing
  --total_steps "${TOTAL_STEPS}"
  --warmup_steps "${WARMUP_STEPS}"
  --log_every 10
  --eval_every 0
  --save_every 0
  --latest_every 25
  # optimizer
  --optimizer muon
  --lr 0.001
  --lr_schedule constant_warmup
  --min_lr 0
  --weight_decay 0.0
  --adam_beta1 0.9
  --adam_beta2 0.95
  --adam_eps 1e-8
  --muon_momentum 0.95
  --muon_ns_steps 5
  --muon_update_scale 1.0
  --ema_decay 0.9999
  --ema_start_step 0
  --grad_clip 1.0
  --seed 42
  # model
  --d_model 768
  --cond_dim 128
  --n_layers 12
  --n_heads 12
  --dim_ff 3072
  --dropout 0.1
  --model_type ddit
  # bridge / corruption
  --state_format prob
  --bridge dirichlet
  --target_loss hard_ce
  --target_prob 1.0
  --min_t 0.05
  --max_t 1.0
  --dual_t
  --corrupt_t_mode same
  --corrupt_min_t 0.05
  --corrupt_max_t 1.0
  --min_mask_ratio 0.1
  --max_mask_ratio 1.0
  --wrong_token_replace_prob 1.0
  --wrong_token_schedule linear_t
  --wrong_token_exp_k 1.0
  --dirichlet_concentration_min 1.0
  --dirichlet_concentration_max 1024.0
  --dirichlet_endpoint_mode categorical_dual_t
  --dirichlet_semantic_t_mode same
  --dirichlet_semantic_t_value 0.0
  --categorical_wrong_from_full_vocab
  --simplex_bridge_sampler dirichlet
  # decoding
  --infer_steps 128
  --decode_damping 1.0
  --max_gamma 1.0
  --decode_solver flowmap
  --noise_init logistic_normal
  --bridge_noise_init logistic_normal
  --noise_sigma -1
  # runtime
  --allow_tf32
  --ddp_gradient_as_bucket_view
  --bf16
)

"${launcher[@]}" train.py "${train_args[@]}" 2>&1 | tee -a "${LOG_FILE}"
LTA_openwebtext_dualt/scripts/run_lta_owt_bert_absrope_time4_dirichlet_len1024_C1_to_1024_8gpu_1m_mask1_sameT_save10k.sh ADDED
@@ -0,0 +1,77 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/usr/bin/env bash
# Thin configuration wrapper: exports the settings for this experiment and then
# delegates to scripts/run_lta_owt_dirichlet_len1024_Cv_to_2v_8gpu_save1k_with_gumbel_watch.sh.
# Settings written as "${VAR:-default}" can be overridden from the environment;
# the ones assigned unconditionally are fixed for this experiment.
set -euo pipefail

cd /e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt

# OWT raw text + BERT tokenizer, FLM wrapped stream:
#   [CLS] + 1022 payload tokens + [SEP]
#
# Backbone:
#   ddit_elf = RMSNorm/SwiGLU/QK-norm + RoPE + 4 prefix time tokens.
#   We also add learned absolute position embeddings before RoPE.
#
# Bridge:
#   Dirichlet C=1->1024, mask_ratio=1.0, model t and corruption t are shared.

export DATA_PATH="${DATA_PATH:-/e2e-data/evad-tech-vla/wanghan58/data/small_benchmarks/langflow_2604_11748/openwebtext}"
export TEXT_COLUMN="${TEXT_COLUMN:-text}"
export OPENWEBTEXT_SPLIT="${OPENWEBTEXT_SPLIT:-train_minus_100k}"
export TOKENIZER_PATH="${TOKENIZER_PATH:-/e2e-data/evad-tech-vla/wanghan58/workspace/imagenet_handoff_20260327/nlp_dts_light/assets/distilbert-base-uncased/tokenizer.json}"
export TOKENIZED_HF=0
export WRAP_MODE="${WRAP_MODE:-stream}"

export VOCAB_SIZE="${VOCAB_SIZE:-30522}"
export CMIN="${CMIN:-1}"
export CMAX="${CMAX:-1024}"

export MODEL_TYPE=ddit_elf
export ELF_NUM_TIME_TOKENS="${ELF_NUM_TIME_TOKENS:-4}"
export ELF_NUM_MODEL_MODE_TOKENS="${ELF_NUM_MODEL_MODE_TOKENS:-0}"
export QK_NORM="${QK_NORM:-1}"
export ABS_POS_EMBED=1
export CORRUPT_T_MODE=same
export MIN_MASK_RATIO=1.0
export MAX_MASK_RATIO=1.0
export CATEGORICAL_WRONG_PROB_FLOOR="${CATEGORICAL_WRONG_PROB_FLOOR:-0.0}"

# Count GPUs only from a *successful* nvidia-smi query. The previous form,
#   $(nvidia-smi ... | wc -l || echo 1)
# misbehaves under `set -o pipefail`: when nvidia-smi fails, wc still prints
# "0", the pipeline reports failure, and `echo 1` appends a second line, so the
# variable became "0<newline>1" and the arithmetic below raised a syntax error.
if _gpu_list=$(nvidia-smi --query-gpu=index --format=csv,noheader 2>/dev/null) \
    && [[ -n "${_gpu_list}" ]]; then
  # The here-string adds the trailing newline, so wc counts one line per GPU.
  _ngpus_avail=$(wc -l <<< "${_gpu_list}")
else
  _ngpus_avail=1
fi
if [[ "${_ngpus_avail}" -le 0 ]]; then _ngpus_avail=1; fi
_default_cvd=$(seq -s, 0 $((_ngpus_avail - 1)))
export CUDA_VISIBLE_DEVICES="${CUDA_VISIBLE_DEVICES:-${_default_cvd}}"
# One worker per entry of CUDA_VISIBLE_DEVICES unless NPROC_PER_NODE is given.
IFS=',' read -ra _cvd_arr <<< "${CUDA_VISIBLE_DEVICES}"
export NPROC_PER_NODE="${NPROC_PER_NODE:-${#_cvd_arr[@]}}"
unset _gpu_list _ngpus_avail _default_cvd _cvd_arr
# Multi-node settings fall back to the MLP_* variables when those are present.
export NNODES="${NNODES:-${MLP_WORKER_NUM:-1}}"
export NODE_RANK="${NODE_RANK:-${MLP_ROLE_INDEX:-0}}"
export MASTER_ADDR="${MASTER_ADDR:-${MLP_WORKER_0_HOST:-127.0.0.1}}"
export MASTER_PORT="${MASTER_PORT:-${MLP_WORKER_0_PORT:-29500}}"
export GLOBAL_BATCH_SIZE="${GLOBAL_BATCH_SIZE:-512}"
export PER_GPU_BATCH_SIZE="${PER_GPU_BATCH_SIZE:-32}"
export TOTAL_STEPS="${TOTAL_STEPS:-1000000}"
export WARMUP_STEPS="${WARMUP_STEPS:-2500}"
export SAVE_EVERY="${SAVE_EVERY:-10000}"
export LATEST_EVERY="${LATEST_EVERY:-1000}"
export LOG_EVERY="${LOG_EVERY:-100}"

export DATE_TAG="${DATE_TAG:-$(date +%Y%m%d)}"
export RUN_NAME="${RUN_NAME:-lta_owt_bert_absrope_time4_dirichlet_len1024_C1_to_1024_mask1_sameT_gbs512_b32_8gpu_1m_save10k_${DATE_TAG}}"

# Settings for the sample "watch" job; consumed by the delegated script.
export WATCH_ENABLED="${WATCH_ENABLED:-1}"
export WATCH_STEP_INTERVAL="${WATCH_STEP_INTERVAL:-10000}"
export WATCH_N_SAMPLES="${WATCH_N_SAMPLES:-128}"
export WATCH_CUDA_VISIBLE_DEVICES="${WATCH_CUDA_VISIBLE_DEVICES:-7}"
export WATCH_DECODE_MODE="${WATCH_DECODE_MODE:-dual_line_probe}"
export WATCH_DECODE_BATCH="${WATCH_DECODE_BATCH:-1}"
export WATCH_DUAL_SEMANTIC_POWER="${WATCH_DUAL_SEMANTIC_POWER:-1.5}"
export WATCH_DUAL_EARLY_TEMP="${WATCH_DUAL_EARLY_TEMP:-2.8}"
export WATCH_DUAL_LATE_TEMP="${WATCH_DUAL_LATE_TEMP:-1.45}"
export WATCH_DUAL_TEMP_END="${WATCH_DUAL_TEMP_END:-0.55}"
export WATCH_DUAL_TEMP_POWER="${WATCH_DUAL_TEMP_POWER:-1.5}"
export WATCH_ENDPOINT_TEMP="${WATCH_ENDPOINT_TEMP:-1.45}"
export WATCH_ENDPOINT_TOP_P="${WATCH_ENDPOINT_TOP_P:-0.95}"
export WATCH_GUMBEL_TAU_START="${WATCH_GUMBEL_TAU_START:-1.0}"
export WATCH_GUMBEL_TAU_END="${WATCH_GUMBEL_TAU_END:-0.2}"
export WATCH_OUT_BASE="${WATCH_OUT_BASE:-docs/lta_samples/metrics_${DATE_TAG}/owt_bert_absrope_time4_C1_to_1024_mask1_sameT_dualline_dirres_c${CMIN}_${CMAX}_n${WATCH_N_SAMPLES}/${RUN_NAME}}"
export WATCH_LOG_DIR="${WATCH_LOG_DIR:-logs/owt_bert_absrope_time4_C1_to_1024_mask1_sameT_dualline_watch}"

bash scripts/run_lta_owt_dirichlet_len1024_Cv_to_2v_8gpu_save1k_with_gumbel_watch.sh
LTA_openwebtext_dualt/scripts/tmp_run_three_quick_infer_20260525.sh ADDED
@@ -0,0 +1,130 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/usr/bin/env bash
# Quick-inference driver: shell options and Python environment setup.
set -euo pipefail

cd /e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt

# Put the repository root (the directory just entered) first on PYTHONPATH,
# keeping any pre-existing entries after it.
export PYTHONPATH="$(pwd)${PYTHONPATH:+:${PYTHONPATH}}"
export PYTHONUNBUFFERED=1
export TOKENIZERS_PARALLELISM=false
8
+
9
# run_sde_quick CKPT TOKENIZER OUT_DIR CMIN CMAX NAME
#
# Runs an inline Python program on GPU 0 that loads CKPT, decodes 8 samples with
# decode_sde (128 steps, max_len 1024) using CMIN/CMAX as the concentration
# bounds, and writes a JSONL summary plus raw/stripped text dumps into OUT_DIR.
# NAME only tags the progress lines printed to stdout.
run_sde_quick() {
  local ckpt="$1" tok="$2" out_dir="$3" cmin="$4" cmax="$5" name="$6"
  CUDA_VISIBLE_DEVICES=0 python - "$ckpt" "$tok" "$out_dir" "$cmin" "$cmax" "$name" <<'PY'
import json
import re
import sys
from pathlib import Path

import torch

from flowtext_lab.genppl import summarize_token_diversity
from flowtext_lab.tokenization import BpeTextTokenizer
from scripts.eval_lm1b_c1024_fullycoupled_sde_genppl import build_model, decode_sde

# Markers removed (in this order) to produce the "stripped" text variant.
SPECIAL_MARKERS = ("[CLS]", "[SEP]", "[PAD]", "<pad>", "</s>", "<s>", "<unk>", "<|endoftext|>")
SAMPLE_SEPARATOR = "\n\n--- SAMPLE ---\n\n"


def strip_special(text: str) -> str:
    """Blank out every special marker, then collapse whitespace runs."""
    cleaned = text
    for marker in SPECIAL_MARKERS:
        cleaned = cleaned.replace(marker, " ")
    cleaned = re.sub(r"\s+", " ", cleaned)
    return cleaned.strip()


ckpt_path, tok_path, out_dir_s, cmin_s, cmax_s, name = sys.argv[1:7]
conc_min = float(cmin_s)
conc_max = float(cmax_s)

out_dir = Path(out_dir_s)
out_dir.mkdir(parents=True, exist_ok=True)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

print(f"[{name}] load {ckpt_path}", flush=True)
ckpt = torch.load(ckpt_path, map_location="cpu", weights_only=False)
print(f"[{name}] step={ckpt.get('step')}", flush=True)
tok = BpeTextTokenizer.from_file(tok_path)
model = build_model(ckpt, tok, device)

decode_kwargs = dict(
    n_samples=8,
    batch_size=1,
    max_len=1024,
    steps=128,
    seed=20260525,
    device=device,
    concentration_min=conc_min,
    concentration_max=conc_max,
    endpoint_temp=1.45,
    endpoint_projection="gumbel_softmax",
    endpoint_top_p=0.95,
    gumbel_tau_start=1.0,
    gumbel_tau_end=0.2,
    model_t_mode="support_t",
    mean_mode="endpoint_only",
    semantic_power=1.0,
    noise_init="dirichlet",
    # The initial noise uses the same concentration as the lower bound.
    noise_dirichlet_concentration=conc_min,
    sde_resample="dirichlet",
    final_from="blend_0.5",
)
ids, texts, cfg = decode_sde(model, tok, **decode_kwargs)

stripped = list(map(strip_special, texts))
summary = {
    "type": "summary",
    "checkpoint": ckpt_path,
    "step": int(ckpt.get("step", -1)),
    "decode": cfg,
    "diversity": summarize_token_diversity(ids).__dict__,
}

# One JSON object per line: the summary first, then one entry per sample.
jsonl_rows = [summary]
for index, (raw_text, clean_text) in enumerate(zip(texts, stripped)):
    jsonl_rows.append({"type": "sample", "index": index, "raw_text": raw_text, "stripped_text": clean_text})

out_jsonl = out_dir / "sde_steps128_samples8_unscored.jsonl"
with out_jsonl.open("w", encoding="utf-8") as f:
    for row in jsonl_rows:
        f.write(json.dumps(row, ensure_ascii=False) + "\n")

(out_dir / "first8.txt").write_text(SAMPLE_SEPARATOR.join(texts), encoding="utf-8")
(out_dir / "first8_stripped.txt").write_text(SAMPLE_SEPARATOR.join(stripped), encoding="utf-8")
print(f"[{name}] done {out_jsonl}", flush=True)
print(json.dumps(summary, ensure_ascii=False, indent=2), flush=True)
PY
}
88
+
89
# Tokenizer files used by the three quick-inference runs below.
T5_TOK=/e2e-data/evad-tech-vla/wanghan58/models/hf/t5-small/tokenizer.json
BERT_TOK=/e2e-data/evad-tech-vla/wanghan58/workspace/imagenet_handoff_20260327/nlp_dts_light/assets/distilbert-base-uncased/tokenizer.json

# Common parent of every output directory written by this script.
metrics_root=docs/lta_samples/metrics_20260525

# Run 1: T5-tokenizer checkpoint, SDE quick decode with concentration 32100 -> 64200.
run_sde_quick \
  runs/lta_owt_t5_absrope_adaln_dirichlet_len1024_Cv_to_2v_mask0p1_1p0_sameT_gbs512_b32_8gpu_1m_save10k_20260525/latest.pt \
  "${T5_TOK}" \
  "${metrics_root}/lta_owt_t5_absrope_adaln_Cv_to_2v_step26000_quick_n8" \
  32100 64200 t5_Cv2V

# Run 2: T5-tokenizer C1->1024 checkpoint through the dual-line decode probe.
dualline_out="${metrics_root}/lta_owt_t5_absrope_adaln_C1_to_1024_step26000_dualline_quick_n8"
probe_args=(
  --checkpoint runs/lta_owt_t5_absrope_adaln_dirichlet_len1024_C1_to_1024_mask0p1_1p0_sameT_gbs512_b32_8gpu_1m_save10k_20260525/latest.pt
  --tokenizer_path "${T5_TOK}"
  --out_dir "${dualline_out}"
  --max_lens 1024
  --n_samples 8
  --batch_size 1
  --steps 128
  --decode_rule dual_line_resample
  --c_min 1
  --c_max 1024
  --input_noise_dirichlet_concentration 1
  --anchor_mode state
  --model_t_mode flow
  --time_schedule uniform
  --support_power 1.0
  --semantic_power 1.5
  --early_temp 2.8
  --late_temp 1.45
  --temp_end 0.55
  --temp_power 1.5
  --final_from blend
  --final_decode argmax
  --seed 20260525
)
CUDA_VISIBLE_DEVICES=0 python scripts/infer_softkl_decode_probe.py "${probe_args[@]}"
# Mirror the probe's sample dump under the file name the SDE runs use.
cp "${dualline_out}/context1024_samples.txt" "${dualline_out}/first8.txt"

# Run 3: BERT-tokenizer checkpoint, SDE quick decode with concentration 30522 -> 61044.
run_sde_quick \
  runs/lta_owt_bert_absrope_adaln_dirichlet_len1024_Cv_to_2v_mask1_sameT_gbs512_b4x4_1m_save1k_watch_20260525/latest.pt \
  "${BERT_TOK}" \
  "${metrics_root}/lta_owt_bert_absrope_adaln_Cv_to_2v_mask1_sameT_step4000_quick_n8" \
  30522 61044 bert_Cv2V_step4000
LTA_openwebtext_dualt/scripts/trace_lta_decode_steps.py ADDED
@@ -0,0 +1,129 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ from __future__ import annotations
3
+
4
+ import argparse
5
+ import json
6
+ import sys
7
+ from pathlib import Path
8
+
9
+ import torch
10
+ import torch.nn.functional as F
11
+
12
+ REPO_ROOT = Path(__file__).resolve().parents[1]
13
+ if str(REPO_ROOT) not in sys.path:
14
+ sys.path.insert(0, str(REPO_ROOT))
15
+
16
+ from eval import build_model_from_ckpt
17
+ from flowtext_lab.decode import model_time_for_step, sample_noise_simplex, state_for_model
18
+ from flowtext_lab.tokenization import BpeTextTokenizer
19
+ from scripts.flowtext_decode_lab import decode_text, flowmap_gamma
20
+
21
+
22
def parse_args() -> argparse.Namespace:
    """Build the command-line parser for the decode tracer and parse ``sys.argv``.

    ``--checkpoint``, ``--tokenizer_path`` and ``--output`` are required; all
    other options carry defaults. ``--trace_steps`` and ``--token_positions``
    are kept as comma-separated strings.
    """
    parser = argparse.ArgumentParser()

    for flag in ("--checkpoint", "--tokenizer_path", "--output"):
        parser.add_argument(flag, required=True)

    typed_defaults = (
        ("--max_len", int, 128),
        ("--steps", int, 128),
        ("--seed", int, 20260502),
        ("--sample_index", int, 13),
        ("--endpoint_temp", float, 1.8),
        ("--damping", float, 1.0),
        ("--max_gamma", float, 1.0),
        ("--eps", float, 1e-8),
    )
    for flag, caster, default in typed_defaults:
        parser.add_argument(flag, type=caster, default=default)

    parser.add_argument("--trace_steps", default="0,1,2,4,8,16,32,64,96,127")
    parser.add_argument(
        "--token_positions",
        default="0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31",
    )
    return parser.parse_args()
38
+
39
+
40
def parse_ints(text: str) -> list[int]:
    """Convert a comma-separated string into a list of ints.

    Fields that are empty or whitespace-only are ignored; any other field is
    passed to ``int`` unchanged, so a non-numeric field raises ``ValueError``.
    """
    values = []
    for field in text.split(","):
        if not field.strip():
            continue
        values.append(int(field))
    return values
42
+
43
+
44
@torch.no_grad()
def main() -> None:
    """Trace the decode trajectory of one sample and dump it as JSON.

    Loads the checkpoint and tokenizer named on the command line, draws
    ``sample_index + 1`` initial noise rows and keeps the last one, then runs
    ``steps`` update iterations that move the state towards the model's
    temperature-scaled softmax output. For every step listed in
    ``--trace_steps`` it records the argmax token, its probability and the
    entropy of the state at each requested position, together with the argmax
    of the model's endpoint distribution.

    The payload is written to ``--output`` (a relative path is placed in the
    checkpoint's directory) and also printed to stdout.
    """
    args = parse_args()
    torch.manual_seed(args.seed)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    tokenizer = BpeTextTokenizer.from_file(args.tokenizer_path)
    # NOTE(review): torch.load unpickles the file, so only trusted checkpoints
    # should be passed here. Depending on the installed PyTorch version the
    # default ``weights_only`` setting may reject checkpoints that store
    # non-tensor objects -- confirm against the checkpoints in use.
    ckpt = torch.load(args.checkpoint, map_location="cpu")
    model = build_model_from_ckpt(ckpt, tokenizer.vocab_size, args.max_len, device)
    model.eval()

    # Reproduce sample_index by drawing that many + 1 initial noise samples with the same seed.
    init = sample_noise_simplex(
        (args.sample_index + 1, args.max_len),
        tokenizer.vocab_size,
        device,
        args.eps,
        noise_mode="dirichlet",
        target_prob=1.0,
        noise_sigma=-1.0,
        dirichlet_concentration=1.0,
    )[-1:].float()
    attn = torch.ones((1, args.max_len), dtype=torch.bool, device=device)
    trace_steps = set(parse_ints(args.trace_steps))
    # The range filter does not depend on the step, so apply it once up front.
    positions = [pos for pos in parse_ints(args.token_positions) if 0 <= pos < args.max_len]
    records = []

    probs = init.clone()
    for step in range(args.steps):
        t = model_time_for_step("flow", step, args.steps, 1, device, dtype=torch.float32)
        logits = model(state_for_model(model, probs, args.eps), t, attn).float()
        logits = logits / args.endpoint_temp
        endpoint = F.softmax(logits, dim=-1)
        gamma = flowmap_gamma(step, args.steps, args.damping, args.max_gamma, args.eps)
        # Step towards the endpoint, then clamp and renormalise the state.
        new_probs = probs + gamma * (endpoint - probs)
        new_probs = new_probs.clamp_min(args.eps)
        new_probs = new_probs / new_probs.sum(dim=-1, keepdim=True).clamp_min(args.eps)
        probs = new_probs

        if step not in trace_steps:
            continue

        top_prob, ids = probs[0].max(dim=-1)
        # Clamp once before taking the log so the entropy stays finite.
        clamped = probs[0].clamp_min(args.eps)
        ent = -(clamped * clamped.log()).sum(dim=-1)
        endpoint_top_prob, endpoint_ids = endpoint[0].max(dim=-1)
        records.append({
            "step": step,
            "gamma": gamma,
            "model_t": float(t.item()),
            "text_prefix": decode_text(tokenizer, ids[:64].detach().cpu().tolist()),
            "positions": [
                {
                    "pos": pos,
                    "state_token": tokenizer.decode([int(ids[pos].item())], stop_at_eos=False, skip_special_tokens=False),
                    "state_id": int(ids[pos].item()),
                    "state_top_p": float(top_prob[pos].item()),
                    "state_entropy": float(ent[pos].item()),
                    "endpoint_token": tokenizer.decode([int(endpoint_ids[pos].item())], stop_at_eos=False, skip_special_tokens=False),
                    "endpoint_id": int(endpoint_ids[pos].item()),
                    "endpoint_top_p": float(endpoint_top_prob[pos].item()),
                }
                for pos in positions
            ],
        })

    final_ids = probs[0].argmax(dim=-1).detach().cpu().tolist()
    final_text = decode_text(tokenizer, final_ids)
    output = Path(args.output)
    if not output.is_absolute():
        # Relative outputs are written into the checkpoint's directory.
        output = Path(args.checkpoint).resolve().parent / output
    output.parent.mkdir(parents=True, exist_ok=True)
    payload = {
        "checkpoint": args.checkpoint,
        "seed": args.seed,
        "sample_index": args.sample_index,
        "steps": args.steps,
        "endpoint_temp": args.endpoint_temp,
        "final_text": final_text,
        "records": records,
    }
    output.write_text(json.dumps(payload, ensure_ascii=False, indent=2), encoding="utf-8")
    print(json.dumps(payload, ensure_ascii=False, indent=2))
126
+
127
+
128
+ if __name__ == "__main__":
129
+ main()