JinghuiLuAstronaut commited on 6 days ago

Commit

bd3a803

verified ·

1 Parent(s): 9566c86

Add files using upload-large-folder tool

Browse files

Files changed (20) hide show

LTA_openwebtext_dualt/logs/ar_lm1b_flmpack_bert_small_len128_gbs512_4gpu_1m_rowshard_b64_resume4000_20260504_203021.nohup.out +0 -0
LTA_openwebtext_dualt/logs/lta_owt_c1024_gpt2_cached_chunks_len1024_fast10k_4gpu_b16_100step.log +149 -0
LTA_openwebtext_dualt/logs/scalinglaw_4gpu_20260513/scalinglaw_ctx1024_exact10_vocab50257_small384x6_4gpu_2000step.log +212 -0
LTA_openwebtext_dualt/logs/scalinglaw_4gpu_20260513/scalinglaw_samples2_192x3_c512_vocab50257_4gpu_3000step.log +66 -0
LTA_openwebtext_dualt/logs/scalinglaw_4gpu_20260513/trace_ctx256_small384x6_step500.log +1 -0
LTA_openwebtext_dualt/logs/scalinglaw_4gpu_20260513/trace_ctx512_small384x6_step500.log +3 -0
LTA_openwebtext_dualt/logs/scalinglaw_4gpu_20260513/trace_params512x8_c512_vocab50257_step750.log +3 -0
LTA_openwebtext_dualt/mini_owt_logdirichlet/.venv_qwen35_uv/lib/python3.12/site-packages/numpy/core/include/numpy/__multiarray_api.c +314 -0
LTA_openwebtext_dualt/mini_owt_logdirichlet/.venv_qwen35_uv/lib/python3.12/site-packages/numpy/core/include/numpy/experimental_dtype_api.h +365 -0
LTA_openwebtext_dualt/mini_owt_logdirichlet/.venv_qwen35_uv/lib/python3.12/site-packages/numpy/core/include/numpy/halffloat.h +70 -0
LTA_openwebtext_dualt/mini_owt_logdirichlet/.venv_qwen35_uv/lib/python3.12/site-packages/numpy/core/include/numpy/npy_cpu.h +129 -0
LTA_openwebtext_dualt/mini_owt_logdirichlet/.venv_qwen35_uv/lib/python3.12/site-packages/numpy/core/include/numpy/npy_interrupt.h +56 -0
LTA_openwebtext_dualt/mini_owt_logdirichlet/.venv_qwen35_uv/lib/python3.12/site-packages/numpy/core/include/numpy/npy_no_deprecated_api.h +20 -0
LTA_openwebtext_dualt/mini_owt_logdirichlet/.venv_qwen35_uv/lib/python3.12/site-packages/numpy/core/include/numpy/utils.h +37 -0
LTA_openwebtext_dualt/scripts/flowtext_score_decode_lab.py +129 -0
LTA_openwebtext_dualt/scripts/launch_lta_owt_from_lm1b_c1024_4gpu.sh +85 -0
LTA_openwebtext_dualt/scripts/launch_lta_wmt14_deen_fullycoupled_4gpu_smoke.sh +113 -0
LTA_openwebtext_dualt/scripts/run_lta_owt_bert_absrope_time4_dirichlet_len1024_C1_to_1024_8gpu_1m_mask1_sameT_save10k.sh +77 -0
LTA_openwebtext_dualt/scripts/tmp_run_three_quick_infer_20260525.sh +130 -0
LTA_openwebtext_dualt/scripts/trace_lta_decode_steps.py +129 -0

LTA_openwebtext_dualt/logs/ar_lm1b_flmpack_bert_small_len128_gbs512_4gpu_1m_rowshard_b64_resume4000_20260504_203021.nohup.out ADDED Viewed

The diff for this file is too large to render. See raw diff

LTA_openwebtext_dualt/logs/lta_owt_c1024_gpt2_cached_chunks_len1024_fast10k_4gpu_b16_100step.log ADDED Viewed

	@@ -0,0 +1,149 @@

+*****************************************
+Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed.
+*****************************************
+[rank0]:[W512 16:41:13.244392390 ProcessGroupNCCL.cpp:4571] [PG ID 0 PG GUID 0 Rank 0]  using GPU 0 to perform barrier as devices used by this process are currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. Specify device_ids in barrier() to force use of a particular device, or call init_process_group() with a device_id.
+NCCL version 2.25.1+cuda12.8
+[rank1]:[W512 16:41:14.936907560 ProcessGroupNCCL.cpp:4571] [PG ID 0 PG GUID 0 Rank 1]  using GPU 1 to perform barrier as devices used by this process are currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. Specify device_ids in barrier() to force use of a particular device, or call init_process_group() with a device_id.
+[rank3]:[W512 16:41:14.612056126 ProcessGroupNCCL.cpp:4571] [PG ID 0 PG GUID 0 Rank 3]  using GPU 3 to perform barrier as devices used by this process are currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. Specify device_ids in barrier() to force use of a particular device, or call init_process_group() with a device_id.
+[rank2]:[W512 16:41:15.111442637 ProcessGroupNCCL.cpp:4571] [PG ID 0 PG GUID 0 Rank 2]  using GPU 2 to perform barrier as devices used by this process are currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. Specify device_ids in barrier() to force use of a particular device, or call init_process_group() with a device_id.
+{
+  "device": "cuda:0",
+  "rank": 0,
+  "world_size": 4,
+  "samples": "owt_cached_chunks:10904",
+  "vocab_size": 50257,
+  "save_dir": "runs/lta_owt_c1024_gpt2_cached_chunks_len1024_fast10k_4gpu_b16_100step",
+  "batch_size": 16,
+  "grad_accum": 8,
+  "effective_batch_size": 512,
+  "global_batch_size": 512,
+  "lr_schedule": "constant_warmup",
+  "warmup_steps": 20,
+  "adam_beta1": 0.9,
+  "adam_beta2": 0.999,
+  "adam_eps": 1e-08,
+  "model_type": "ddit",
+  "dual_t": true,
+  "corrupt_t_mode": "same",
+  "corrupt_min_t": 0.0,
+  "corrupt_max_t": 1.0,
+  "dirichlet_endpoint_mode": "categorical_dual_t",
+  "dirichlet_semantic_t_mode": "same",
+  "dirichlet_semantic_t_value": 0.0,
+  "categorical_wrong_from_full_vocab": true,
+  "simplex_bridge_sampler": "dirichlet",
+  "logistic_normal_sigma_min": 0.18,
+  "logistic_normal_sigma_max": 2.2,
+  "logistic_normal_tau_min": 0.65,
+  "logistic_normal_tau_max": 1.15,
+  "torch_compile": false,
+  "compile_mode": "max-autotune",
+  "state_format": "prob",
+  "target_loss": "hard_ce",
+  "meanflow_weight": 0.0,
+  "bridge_noise_init": "logistic_normal",
+  "noise_sigma": -1.0,
+  "wrap": true,
+  "wrap_mode": "stream",
+  "wrap_record_buffer_size": 200,
+  "owt_cached_chunks": true,
+  "owt_chunk_cache_dir": "/e2e-data/evad-tech-vla/wanghan58/data/small_benchmarks/langflow_2604_11748/openwebtext_lta_cached_chunks/gpt2_len1024_train_minus_100k_fast10k",
+  "owt_chunk_cache_rebuild": false,
+  "owt_chunk_cache_write_batch": 4096,
+  "online_chunk_shuffle": false,
+  "online_chunk_shuffle_buffer": 10000,
+  "openwebtext_split": "train_minus_100k",
+  "detokenizer": "auto",
+  "resolved_detokenizer": null,
+  "num_workers": 0,
+  "latest_every": 25,
+  "resume_path": ""
+}
+step=5 micro_steps=40 elapsed=50.1s lr=9.000000e-05 loss_all=10.7950 acc_all=0.5424 loss_corrupt=10.8013 acc_corrupt=0.3710 corrupt_frac=0.5505 loss=10.8013 loss_recon=10.8013 loss_meanflow=0.0000 mean_model_t=0.5037 mean_corrupt_t=0.5037 mean_loss_t_weight=1.0000 prior_center_loss_beta=0.0000 wrong_frac=0.4924 init_acc_corrupt=0.4756 init_gold_top10=0.5024 init_gold_top100=0.5300
+step=10 micro_steps=80 elapsed=48.8s lr=1.650000e-04 loss_all=10.5876 acc_all=0.5773 loss_corrupt=10.6249 acc_corrupt=0.3641 corrupt_frac=0.5710 loss=10.6249 loss_recon=10.6249 loss_meanflow=0.0000 mean_model_t=0.4988 mean_corrupt_t=0.4988 mean_loss_t_weight=1.0000 prior_center_loss_beta=0.0000 wrong_frac=0.5066 init_acc_corrupt=0.4603 init_gold_top10=0.4881 init_gold_top100=0.5178
+step=15 micro_steps=120 elapsed=48.5s lr=2.400000e-04 loss_all=10.0529 acc_all=0.1321 loss_corrupt=10.0839 acc_corrupt=0.0846 corrupt_frac=0.5482 loss=10.0839 loss_recon=10.0839 loss_meanflow=0.0000 mean_model_t=0.4826 mean_corrupt_t=0.4826 mean_loss_t_weight=1.0000 prior_center_loss_beta=0.0000 wrong_frac=0.5143 init_acc_corrupt=0.4517 init_gold_top10=0.4797 init_gold_top100=0.5107
+step=20 micro_steps=160 elapsed=46.3s lr=3.000000e-04 loss_all=9.1611 acc_all=0.2368 loss_corrupt=9.2121 acc_corrupt=0.1448 corrupt_frac=0.5527 loss=9.2121 loss_recon=9.2121 loss_meanflow=0.0000 mean_model_t=0.4898 mean_corrupt_t=0.4898 mean_loss_t_weight=1.0000 prior_center_loss_beta=0.0000 wrong_frac=0.5102 init_acc_corrupt=0.4514 init_gold_top10=0.4836 init_gold_top100=0.5166
+step=25 micro_steps=200 elapsed=43.5s lr=3.000000e-04 loss_all=8.2892 acc_all=0.1852 loss_corrupt=8.3501 acc_corrupt=0.1275 corrupt_frac=0.5519 loss=8.3501 loss_recon=8.3501 loss_meanflow=0.0000 mean_model_t=0.5035 mean_corrupt_t=0.5035 mean_loss_t_weight=1.0000 prior_center_loss_beta=0.0000 wrong_frac=0.5017 init_acc_corrupt=0.4630 init_gold_top10=0.4920 init_gold_top100=0.5225
+step=30 micro_steps=240 elapsed=58.5s lr=3.000000e-04 loss_all=7.5538 acc_all=0.2846 loss_corrupt=7.7125 acc_corrupt=0.1894 corrupt_frac=0.5483 loss=7.7125 loss_recon=7.7125 loss_meanflow=0.0000 mean_model_t=0.4809 mean_corrupt_t=0.4809 mean_loss_t_weight=1.0000 prior_center_loss_beta=0.0000 wrong_frac=0.5216 init_acc_corrupt=0.4416 init_gold_top10=0.4725 init_gold_top100=0.5025
+step=35 micro_steps=280 elapsed=44.9s lr=3.000000e-04 loss_all=6.9044 acc_all=0.3164 loss_corrupt=7.2563 acc_corrupt=0.2266 corrupt_frac=0.5421 loss=7.2563 loss_recon=7.2563 loss_meanflow=0.0000 mean_model_t=0.5272 mean_corrupt_t=0.5272 mean_loss_t_weight=1.0000 prior_center_loss_beta=0.0000 wrong_frac=0.4720 init_acc_corrupt=0.4966 init_gold_top10=0.5229 init_gold_top100=0.5512
+step=40 micro_steps=320 elapsed=44.0s lr=3.000000e-04 loss_all=6.4465 acc_all=0.2756 loss_corrupt=6.9584 acc_corrupt=0.1858 corrupt_frac=0.5546 loss=6.9584 loss_recon=6.9584 loss_meanflow=0.0000 mean_model_t=0.4862 mean_corrupt_t=0.4862 mean_loss_t_weight=1.0000 prior_center_loss_beta=0.0000 wrong_frac=0.5271 init_acc_corrupt=0.4348 init_gold_top10=0.4664 init_gold_top100=0.5004
+step=45 micro_steps=360 elapsed=42.7s lr=3.000000e-04 loss_all=5.9251 acc_all=0.2724 loss_corrupt=6.5543 acc_corrupt=0.1895 corrupt_frac=0.5743 loss=6.5543 loss_recon=6.5543 loss_meanflow=0.0000 mean_model_t=0.4939 mean_corrupt_t=0.4939 mean_loss_t_weight=1.0000 prior_center_loss_beta=0.0000 wrong_frac=0.5111 init_acc_corrupt=0.4545 init_gold_top10=0.4826 init_gold_top100=0.5159
+*****************************************
+Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed.
+*****************************************
+[rank0]:[W512 16:53:29.911882582 ProcessGroupNCCL.cpp:4571] [PG ID 0 PG GUID 0 Rank 0]  using GPU 0 to perform barrier as devices used by this process are currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. Specify device_ids in barrier() to force use of a particular device, or call init_process_group() with a device_id.
+NCCL version 2.25.1+cuda12.8
+[rank3]:[W512 16:53:30.911647198 ProcessGroupNCCL.cpp:4571] [PG ID 0 PG GUID 0 Rank 3]  using GPU 3 to perform barrier as devices used by this process are currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. Specify device_ids in barrier() to force use of a particular device, or call init_process_group() with a device_id.
+[rank1]:[W512 16:53:30.554297191 ProcessGroupNCCL.cpp:4571] [PG ID 0 PG GUID 0 Rank 1]  using GPU 1 to perform barrier as devices used by this process are currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. Specify device_ids in barrier() to force use of a particular device, or call init_process_group() with a device_id.
+[rank2]:[W512 16:53:30.591215668 ProcessGroupNCCL.cpp:4571] [PG ID 0 PG GUID 0 Rank 2]  using GPU 2 to perform barrier as devices used by this process are currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. Specify device_ids in barrier() to force use of a particular device, or call init_process_group() with a device_id.
+resumed_from=runs/lta_owt_c1024_gpt2_cached_chunks_len1024_fast10k_4gpu_b16_100step/latest.pt start_step=26
+{
+  "device": "cuda:0",
+  "rank": 0,
+  "world_size": 4,
+  "samples": "owt_cached_chunks:10904",
+  "vocab_size": 50257,
+  "save_dir": "runs/lta_owt_c1024_gpt2_cached_chunks_len1024_fast10k_4gpu_b16_100step",
+  "batch_size": 16,
+  "grad_accum": 8,
+  "effective_batch_size": 512,
+  "global_batch_size": 512,
+  "lr_schedule": "constant_warmup",
+  "warmup_steps": 20,
+  "adam_beta1": 0.9,
+  "adam_beta2": 0.999,
+  "adam_eps": 1e-08,
+  "model_type": "ddit",
+  "dual_t": true,
+  "corrupt_t_mode": "same",
+  "corrupt_min_t": 0.0,
+  "corrupt_max_t": 1.0,
+  "dirichlet_endpoint_mode": "categorical_dual_t",
+  "dirichlet_semantic_t_mode": "same",
+  "dirichlet_semantic_t_value": 0.0,
+  "categorical_wrong_from_full_vocab": true,
+  "simplex_bridge_sampler": "dirichlet",
+  "logistic_normal_sigma_min": 0.18,
+  "logistic_normal_sigma_max": 2.2,
+  "logistic_normal_tau_min": 0.65,
+  "logistic_normal_tau_max": 1.15,
+  "torch_compile": false,
+  "compile_mode": "max-autotune",
+  "state_format": "prob",
+  "target_loss": "hard_ce",
+  "meanflow_weight": 0.0,
+  "bridge_noise_init": "logistic_normal",
+  "noise_sigma": -1.0,
+  "wrap": true,
+  "wrap_mode": "stream",
+  "wrap_record_buffer_size": 200,
+  "owt_cached_chunks": true,
+  "owt_chunk_cache_dir": "/e2e-data/evad-tech-vla/wanghan58/data/small_benchmarks/langflow_2604_11748/openwebtext_lta_cached_chunks/gpt2_len1024_train_minus_100k_fast10k",
+  "owt_chunk_cache_rebuild": false,
+  "owt_chunk_cache_write_batch": 4096,
+  "online_chunk_shuffle": false,
+  "online_chunk_shuffle_buffer": 10000,
+  "openwebtext_split": "train_minus_100k",
+  "detokenizer": "auto",
+  "resolved_detokenizer": null,
+  "num_workers": 0,
+  "latest_every": 25,
+  "resume_path": "runs/lta_owt_c1024_gpt2_cached_chunks_len1024_fast10k_4gpu_b16_100step/latest.pt"
+}
+step=30 micro_steps=240 elapsed=49.2s lr=3.000000e-04 loss_all=7.5368 acc_all=0.2895 loss_corrupt=7.6733 acc_corrupt=0.2021 corrupt_frac=0.5505 loss=7.6733 loss_recon=7.6733 loss_meanflow=0.0000 mean_model_t=0.5037 mean_corrupt_t=0.5037 mean_loss_t_weight=1.0000 prior_center_loss_beta=0.0000 wrong_frac=0.4924 init_acc_corrupt=0.4756 init_gold_top10=0.5024 init_gold_top100=0.5300
+step=35 micro_steps=280 elapsed=48.2s lr=3.000000e-04 loss_all=7.0131 acc_all=0.2880 loss_corrupt=7.3238 acc_corrupt=0.1995 corrupt_frac=0.5710 loss=7.3238 loss_recon=7.3238 loss_meanflow=0.0000 mean_model_t=0.4988 mean_corrupt_t=0.4988 mean_loss_t_weight=1.0000 prior_center_loss_beta=0.0000 wrong_frac=0.5066 init_acc_corrupt=0.4603 init_gold_top10=0.4881 init_gold_top100=0.5178
+step=40 micro_steps=320 elapsed=43.2s lr=3.000000e-04 loss_all=6.4932 acc_all=0.2757 loss_corrupt=6.9707 acc_corrupt=0.1886 corrupt_frac=0.5482 loss=6.9707 loss_recon=6.9707 loss_meanflow=0.0000 mean_model_t=0.4826 mean_corrupt_t=0.4826 mean_loss_t_weight=1.0000 prior_center_loss_beta=0.0000 wrong_frac=0.5143 init_acc_corrupt=0.4517 init_gold_top10=0.4797 init_gold_top100=0.5107
+step=45 micro_steps=360 elapsed=42.4s lr=3.000000e-04 loss_all=6.0021 acc_all=0.2643 loss_corrupt=6.6188 acc_corrupt=0.1833 corrupt_frac=0.5527 loss=6.6188 loss_recon=6.6188 loss_meanflow=0.0000 mean_model_t=0.4898 mean_corrupt_t=0.4898 mean_loss_t_weight=1.0000 prior_center_loss_beta=0.0000 wrong_frac=0.5102 init_acc_corrupt=0.4514 init_gold_top10=0.4836 init_gold_top100=0.5166
+step=50 micro_steps=400 elapsed=43.3s lr=3.000000e-04 loss_all=5.3270 acc_all=0.3213 loss_corrupt=6.1520 acc_corrupt=0.2234 corrupt_frac=0.5519 loss=6.1520 loss_recon=6.1520 loss_meanflow=0.0000 mean_model_t=0.5035 mean_corrupt_t=0.5035 mean_loss_t_weight=1.0000 prior_center_loss_beta=0.0000 wrong_frac=0.5017 init_acc_corrupt=0.4630 init_gold_top10=0.4920 init_gold_top100=0.5225
+step=55 micro_steps=440 elapsed=73.6s lr=3.000000e-04 loss_all=4.5452 acc_all=0.4781 loss_corrupt=5.7044 acc_corrupt=0.3135 corrupt_frac=0.5483 loss=5.7044 loss_recon=5.7044 loss_meanflow=0.0000 mean_model_t=0.4809 mean_corrupt_t=0.4809 mean_loss_t_weight=1.0000 prior_center_loss_beta=0.0000 wrong_frac=0.5216 init_acc_corrupt=0.4416 init_gold_top10=0.4725 init_gold_top100=0.5025
+step=60 micro_steps=480 elapsed=48.1s lr=3.000000e-04 loss_all=3.5387 acc_all=0.6034 loss_corrupt=4.8525 acc_corrupt=0.4246 corrupt_frac=0.5421 loss=4.8525 loss_recon=4.8525 loss_meanflow=0.0000 mean_model_t=0.5272 mean_corrupt_t=0.5272 mean_loss_t_weight=1.0000 prior_center_loss_beta=0.0000 wrong_frac=0.4720 init_acc_corrupt=0.4966 init_gold_top10=0.5229 init_gold_top100=0.5512
+step=65 micro_steps=520 elapsed=45.9s lr=3.000000e-04 loss_all=3.3057 acc_all=0.6112 loss_corrupt=4.9579 acc_corrupt=0.4032 corrupt_frac=0.5546 loss=4.9579 loss_recon=4.9579 loss_meanflow=0.0000 mean_model_t=0.4862 mean_corrupt_t=0.4862 mean_loss_t_weight=1.0000 prior_center_loss_beta=0.0000 wrong_frac=0.5271 init_acc_corrupt=0.4348 init_gold_top10=0.4664 init_gold_top100=0.5004
+step=70 micro_steps=560 elapsed=42.2s lr=3.000000e-04 loss_all=3.1512 acc_all=0.6235 loss_corrupt=4.7593 acc_corrupt=0.4248 corrupt_frac=0.5743 loss=4.7593 loss_recon=4.7593 loss_meanflow=0.0000 mean_model_t=0.4939 mean_corrupt_t=0.4939 mean_loss_t_weight=1.0000 prior_center_loss_beta=0.0000 wrong_frac=0.5111 init_acc_corrupt=0.4545 init_gold_top10=0.4826 init_gold_top100=0.5159
+step=75 micro_steps=600 elapsed=42.2s lr=3.000000e-04 loss_all=2.9748 acc_all=0.6482 loss_corrupt=4.6855 acc_corrupt=0.4381 corrupt_frac=0.5472 loss=4.6855 loss_recon=4.6855 loss_meanflow=0.0000 mean_model_t=0.4975 mean_corrupt_t=0.4975 mean_loss_t_weight=1.0000 prior_center_loss_beta=0.0000 wrong_frac=0.4982 init_acc_corrupt=0.4679 init_gold_top10=0.4956 init_gold_top100=0.5266
+step=80 micro_steps=640 elapsed=68.9s lr=3.000000e-04 loss_all=2.8523 acc_all=0.6580 loss_corrupt=4.6641 acc_corrupt=0.4379 corrupt_frac=0.5418 loss=4.6641 loss_recon=4.6641 loss_meanflow=0.0000 mean_model_t=0.4888 mean_corrupt_t=0.4888 mean_loss_t_weight=1.0000 prior_center_loss_beta=0.0000 wrong_frac=0.5130 init_acc_corrupt=0.4518 init_gold_top10=0.4813 init_gold_top100=0.5123
+step=85 micro_steps=680 elapsed=43.6s lr=3.000000e-04 loss_all=2.8406 acc_all=0.6570 loss_corrupt=4.4957 acc_corrupt=0.4543 corrupt_frac=0.5720 loss=4.4957 loss_recon=4.4957 loss_meanflow=0.0000 mean_model_t=0.4943 mean_corrupt_t=0.4943 mean_loss_t_weight=1.0000 prior_center_loss_beta=0.0000 wrong_frac=0.4969 init_acc_corrupt=0.4674 init_gold_top10=0.4971 init_gold_top100=0.5276
+step=90 micro_steps=720 elapsed=41.1s lr=3.000000e-04 loss_all=2.7984 acc_all=0.6605 loss_corrupt=4.5917 acc_corrupt=0.4384 corrupt_frac=0.5533 loss=4.5917 loss_recon=4.5917 loss_meanflow=0.0000 mean_model_t=0.4842 mean_corrupt_t=0.4842 mean_loss_t_weight=1.0000 prior_center_loss_beta=0.0000 wrong_frac=0.5198 init_acc_corrupt=0.4432 init_gold_top10=0.4741 init_gold_top100=0.5064
+step=95 micro_steps=760 elapsed=45.6s lr=3.000000e-04 loss_all=2.5898 acc_all=0.6871 loss_corrupt=4.3891 acc_corrupt=0.4635 corrupt_frac=0.5377 loss=4.3891 loss_recon=4.3891 loss_meanflow=0.0000 mean_model_t=0.4992 mean_corrupt_t=0.4992 mean_loss_t_weight=1.0000 prior_center_loss_beta=0.0000 wrong_frac=0.4987 init_acc_corrupt=0.4669 init_gold_top10=0.4958 init_gold_top100=0.5251
+step=100 micro_steps=800 elapsed=46.1s lr=3.000000e-04 loss_all=2.5553 acc_all=0.6918 loss_corrupt=4.3386 acc_corrupt=0.4699 corrupt_frac=0.5449 loss=4.3386 loss_recon=4.3386 loss_meanflow=0.0000 mean_model_t=0.5043 mean_corrupt_t=0.5043 mean_loss_t_weight=1.0000 prior_center_loss_beta=0.0000 wrong_frac=0.4947 init_acc_corrupt=0.4710 init_gold_top10=0.5004 init_gold_top100=0.5283

LTA_openwebtext_dualt/logs/scalinglaw_4gpu_20260513/scalinglaw_ctx1024_exact10_vocab50257_small384x6_4gpu_2000step.log ADDED Viewed

	@@ -0,0 +1,212 @@

+*****************************************
+Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed.
+*****************************************
+[rank0]: Traceback (most recent call last):
+[rank0]:   File "/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/train.py", line 1000, in <module>
+[rank0]:     main()
+[rank0]:   File "/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/train.py", line 615, in main
+[rank0]:     raise ValueError("--owt_cached_chunks requires --wrap --wrap_mode stream")
+[rank0]: ValueError: --owt_cached_chunks requires --wrap --wrap_mode stream
+[rank0]:[W513 01:44:17.923645362 ProcessGroupNCCL.cpp:1487] Warning: WARNING: destroy_process_group() was not called before program exit, which can leak resources. For more info, please see https://pytorch.org/docs/stable/distributed.html#shutdown (function operator())
+[rank2]: Traceback (most recent call last):
+[rank2]:   File "/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/train.py", line 1000, in <module>
+[rank2]:     main()
+[rank2]:   File "/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/train.py", line 615, in main
+[rank2]:     raise ValueError("--owt_cached_chunks requires --wrap --wrap_mode stream")
+[rank2]: ValueError: --owt_cached_chunks requires --wrap --wrap_mode stream
+[rank1]: Traceback (most recent call last):
+[rank1]:   File "/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/train.py", line 1000, in <module>
+[rank1]:     main()
+[rank1]:   File "/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/train.py", line 615, in main
+[rank1]:     raise ValueError("--owt_cached_chunks requires --wrap --wrap_mode stream")
+[rank1]: ValueError: --owt_cached_chunks requires --wrap --wrap_mode stream
+[rank3]: Traceback (most recent call last):
+[rank3]:   File "/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/train.py", line 1000, in <module>
+[rank3]:     main()
+[rank3]:   File "/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/train.py", line 615, in main
+[rank3]:     raise ValueError("--owt_cached_chunks requires --wrap --wrap_mode stream")
+[rank3]: ValueError: --owt_cached_chunks requires --wrap --wrap_mode stream
+W0513 01:44:17.315000 312465 torch/distributed/elastic/multiprocessing/api.py:898] Sending process 312533 closing signal SIGTERM
+W0513 01:44:17.316000 312465 torch/distributed/elastic/multiprocessing/api.py:898] Sending process 312534 closing signal SIGTERM
+W0513 01:44:17.317000 312465 torch/distributed/elastic/multiprocessing/api.py:898] Sending process 312535 closing signal SIGTERM
+E0513 01:44:17.445000 312465 torch/distributed/elastic/multiprocessing/api.py:870] failed (exitcode: 1) local_rank: 0 (pid: 312532) of binary: /usr/bin/python
+Traceback (most recent call last):
+  File "<frozen runpy>", line 198, in _run_module_as_main
+  File "<frozen runpy>", line 88, in _run_code
+  File "/usr/local/lib/python3.12/dist-packages/torch/distributed/run.py", line 922, in <module>
+    main()
+  File "/usr/local/lib/python3.12/dist-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 355, in wrapper
+    return f(*args, **kwargs)
+           ^^^^^^^^^^^^^^^^^^
+  File "/usr/local/lib/python3.12/dist-packages/torch/distributed/run.py", line 918, in main
+    run(args)
+  File "/usr/local/lib/python3.12/dist-packages/torch/distributed/run.py", line 909, in run
+    elastic_launch(
+  File "/usr/local/lib/python3.12/dist-packages/torch/distributed/launcher/api.py", line 139, in __call__
+    return launch_agent(self._config, self._entrypoint, list(args))
+           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+  File "/usr/local/lib/python3.12/dist-packages/torch/distributed/launcher/api.py", line 270, in launch_agent
+    raise ChildFailedError(
+torch.distributed.elastic.multiprocessing.errors.ChildFailedError:
+============================================================
+train.py FAILED
+------------------------------------------------------------
+Failures:
+  <NO_OTHER_FAILURES>
+------------------------------------------------------------
+Root Cause (first observed failure):
+[0]:
+  time      : 2026-05-13_01:44:17
+  host      : localhost
+  rank      : 0 (local_rank: 0)
+  exitcode  : 1 (pid: 312532)
+  error_file: <N/A>
+  traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
+============================================================
+*****************************************
+Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed.
+*****************************************
+[rank0]:[W513 01:46:18.106846526 ProcessGroupNCCL.cpp:4571] [PG ID 0 PG GUID 0 Rank 0]  using GPU 0 to perform barrier as devices used by this process are currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. Specify device_ids in barrier() to force use of a particular device, or call init_process_group() with a device_id.
+NCCL version 2.25.1+cuda12.8
+[rank1]:[W513 01:46:18.152602439 ProcessGroupNCCL.cpp:4571] [PG ID 0 PG GUID 0 Rank 1]  using GPU 1 to perform barrier as devices used by this process are currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. Specify device_ids in barrier() to force use of a particular device, or call init_process_group() with a device_id.
+[rank3]:[W513 01:46:18.156248186 ProcessGroupNCCL.cpp:4571] [PG ID 0 PG GUID 0 Rank 3]  using GPU 3 to perform barrier as devices used by this process are currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. Specify device_ids in barrier() to force use of a particular device, or call init_process_group() with a device_id.
+[rank2]:[W513 01:46:18.173783313 ProcessGroupNCCL.cpp:4571] [PG ID 0 PG GUID 0 Rank 2]  using GPU 2 to perform barrier as devices used by this process are currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. Specify device_ids in barrier() to force use of a particular device, or call init_process_group() with a device_id.
+/usr/local/lib/python3.12/dist-packages/torch/nn/modules/transformer.py:375: UserWarning: enable_nested_tensor is True, but self.use_nested_tensor is False because encoder_layer.norm_first was True
+  warnings.warn(
+/usr/local/lib/python3.12/dist-packages/torch/nn/modules/transformer.py:375: UserWarning: enable_nested_tensor is True, but self.use_nested_tensor is False because encoder_layer.norm_first was True
+  warnings.warn(
+/usr/local/lib/python3.12/dist-packages/torch/nn/modules/transformer.py:375: UserWarning: enable_nested_tensor is True, but self.use_nested_tensor is False because encoder_layer.norm_first was True
+  warnings.warn(
+/usr/local/lib/python3.12/dist-packages/torch/nn/modules/transformer.py:375: UserWarning: enable_nested_tensor is True, but self.use_nested_tensor is False because encoder_layer.norm_first was True
+  warnings.warn(
+{
+  "device": "cuda:0",
+  "rank": 0,
+  "world_size": 4,
+  "samples": "owt_cached_chunks:10",
+  "vocab_size": 50257,
+  "tokenizer_vocab_size": 50257,
+  "save_dir": "runs/scalinglaw_ctx1024_exact10_vocab50257_small384x6_4gpu_2000step",
+  "batch_size": 16,
+  "grad_accum": 8,
+  "effective_batch_size": 512,
+  "global_batch_size": 512,
+  "lr_schedule": "constant_warmup",
+  "warmup_steps": 20,
+  "min_lr": 0.0,
+  "adamw_param_groups": "all_decay",
+  "adam_beta1": 0.9,
+  "adam_beta2": 0.999,
+  "adam_eps": 1e-08,
+  "model_type": "transformer",
+  "dual_t": true,
+  "corrupt_t_mode": "independent",
+  "corrupt_min_t": null,
+  "corrupt_max_t": null,
+  "prefix_block_prob": 0.0,
+  "prefix_block_len": 128,
+  "dirichlet_endpoint_mode": "categorical_dual_t",
+  "dirichlet_semantic_t_mode": "same",
+  "dirichlet_semantic_t_value": 0.0,
+  "categorical_wrong_from_full_vocab": true,
+  "categorical_wrong_from_batch_valid_tokens": false,
+  "mask_mixture_original_prob": 0.0,
+  "mask_mixture_lowk_prob": 0.0,
+  "mask_mixture_lowcorrupt_prob": 0.0,
+  "mask_mixture_block_prob": 0.0,
+  "mask_mixture_all_prob": 0.0,
+  "mask_mixture_lowk_clean_tokens": "1,2,4,8,16,32,64",
+  "mask_mixture_lowcorrupt_tokens": "1,2,4,8,16,32,64",
+  "mask_mixture_block_tokens": "64,128",
+  "simplex_bridge_sampler": "dirichlet",
+  "logistic_normal_sigma_min": 0.18,
+  "logistic_normal_sigma_max": 2.2,
+  "logistic_normal_tau_min": 0.65,
+  "logistic_normal_tau_max": 1.15,
+  "torch_compile": false,
+  "compile_mode": "max-autotune",
+  "state_format": "prob",
+  "target_loss": "hard_ce",
+  "meanflow_weight": 0.0,
+  "bridge_noise_init": "logistic_normal",
+  "noise_sigma": -1.0,
+  "wrap": true,
+  "wrap_mode": "stream",
+  "wrap_record_buffer_size": 200,
+  "owt_cached_chunks": true,
+  "owt_chunk_cache_dir": "/e2e-data/evad-tech-vla/wanghan58/data/small_benchmarks/langflow_2604_11748/openwebtext_lta_cached_chunks/gpt2_len1024_train_minus_100k_exact10_minvocab",
+  "owt_chunk_cache_rebuild": false,
+  "owt_chunk_cache_write_batch": 4096,
+  "owt_exact_repeat_per_chunk": 10000,
+  "online_chunk_shuffle": false,
+  "online_chunk_shuffle_buffer": 10000,
+  "openwebtext_split": "all",
+  "detokenizer": "auto",
+  "resolved_detokenizer": null,
+  "num_workers": 0,
+  "latest_every": 500,
+  "resume_path": ""
+}
+step=25 micro_steps=200 elapsed=51.0s lr=3.000000e-04 loss_all=9.5698 acc_all=0.0351 loss_corrupt=9.5720 acc_corrupt=0.0348 corrupt_frac=0.5538 loss=9.5720 loss_recon=9.5720 loss_meanflow=0.0000 mean_model_t=0.5004 mean_corrupt_t=0.5067 mean_loss_t_weight=1.0000 prior_center_loss_beta=0.0000 wrong_frac=0.4889 init_acc_corrupt=0.4773 init_gold_top10=0.5053 init_gold_top100=0.5565
+step=50 micro_steps=400 elapsed=59.4s lr=3.000000e-04 loss_all=6.9808 acc_all=0.0435 loss_corrupt=6.9836 acc_corrupt=0.0426 corrupt_frac=0.5502 loss=6.9836 loss_recon=6.9836 loss_meanflow=0.0000 mean_model_t=0.4927 mean_corrupt_t=0.4946 mean_loss_t_weight=1.0000 prior_center_loss_beta=0.0000 wrong_frac=0.5015 init_acc_corrupt=0.4633 init_gold_top10=0.4927 init_gold_top100=0.5466
+step=75 micro_steps=600 elapsed=62.6s lr=3.000000e-04 loss_all=6.5358 acc_all=0.0417 loss_corrupt=6.5414 acc_corrupt=0.0417 corrupt_frac=0.5479 loss=6.5414 loss_recon=6.5414 loss_meanflow=0.0000 mean_model_t=0.4907 mean_corrupt_t=0.5026 mean_loss_t_weight=1.0000 prior_center_loss_beta=0.0000 wrong_frac=0.4971 init_acc_corrupt=0.4696 init_gold_top10=0.4972 init_gold_top100=0.5490
+step=100 micro_steps=800 elapsed=63.8s lr=3.000000e-04 loss_all=6.4942 acc_all=0.0471 loss_corrupt=6.5060 acc_corrupt=0.0454 corrupt_frac=0.5491 loss=6.5060 loss_recon=6.5060 loss_meanflow=0.0000 mean_model_t=0.4977 mean_corrupt_t=0.5094 mean_loss_t_weight=1.0000 prior_center_loss_beta=0.0000 wrong_frac=0.4881 init_acc_corrupt=0.4785 init_gold_top10=0.5065 init_gold_top100=0.5552
+step=125 micro_steps=1000 elapsed=64.4s lr=3.000000e-04 loss_all=6.2915 acc_all=0.0970 loss_corrupt=6.3833 acc_corrupt=0.0779 corrupt_frac=0.5601 loss=6.3833 loss_recon=6.3833 loss_meanflow=0.0000 mean_model_t=0.5049 mean_corrupt_t=0.4978 mean_loss_t_weight=1.0000 prior_center_loss_beta=0.0000 wrong_frac=0.5000 init_acc_corrupt=0.4654 init_gold_top10=0.4943 init_gold_top100=0.5459
+step=150 micro_steps=1200 elapsed=64.7s lr=3.000000e-04 loss_all=5.5438 acc_all=0.1897 loss_corrupt=5.8703 acc_corrupt=0.1546 corrupt_frac=0.5550 loss=5.8703 loss_recon=5.8703 loss_meanflow=0.0000 mean_model_t=0.5132 mean_corrupt_t=0.4969 mean_loss_t_weight=1.0000 prior_center_loss_beta=0.0000 wrong_frac=0.5024 init_acc_corrupt=0.4618 init_gold_top10=0.4917 init_gold_top100=0.5428
+step=175 micro_steps=1400 elapsed=64.9s lr=3.000000e-04 loss_all=4.6997 acc_all=0.2680 loss_corrupt=5.2135 acc_corrupt=0.2113 corrupt_frac=0.5524 loss=5.2135 loss_recon=5.2135 loss_meanflow=0.0000 mean_model_t=0.5089 mean_corrupt_t=0.5104 mean_loss_t_weight=1.0000 prior_center_loss_beta=0.0000 wrong_frac=0.4923 init_acc_corrupt=0.4735 init_gold_top10=0.5019 init_gold_top100=0.5528
+step=200 micro_steps=1600 elapsed=65.0s lr=3.000000e-04 loss_all=3.9019 acc_all=0.3769 loss_corrupt=4.5546 acc_corrupt=0.2747 corrupt_frac=0.5513 loss=4.5546 loss_recon=4.5546 loss_meanflow=0.0000 mean_model_t=0.5030 mean_corrupt_t=0.5062 mean_loss_t_weight=1.0000 prior_center_loss_beta=0.0000 wrong_frac=0.4907 init_acc_corrupt=0.4750 init_gold_top10=0.5040 init_gold_top100=0.5515
+step=225 micro_steps=1800 elapsed=65.2s lr=3.000000e-04 loss_all=3.3872 acc_all=0.5167 loss_corrupt=4.1294 acc_corrupt=0.3459 corrupt_frac=0.5499 loss=4.1294 loss_recon=4.1294 loss_meanflow=0.0000 mean_model_t=0.4978 mean_corrupt_t=0.5020 mean_loss_t_weight=1.0000 prior_center_loss_beta=0.0000 wrong_frac=0.5021 init_acc_corrupt=0.4618 init_gold_top10=0.4920 init_gold_top100=0.5455
+step=250 micro_steps=2000 elapsed=65.2s lr=3.000000e-04 loss_all=3.0234 acc_all=0.6385 loss_corrupt=3.8056 acc_corrupt=0.4251 corrupt_frac=0.5563 loss=3.8056 loss_recon=3.8056 loss_meanflow=0.0000 mean_model_t=0.4949 mean_corrupt_t=0.5037 mean_loss_t_weight=1.0000 prior_center_loss_beta=0.0000 wrong_frac=0.4973 init_acc_corrupt=0.4692 init_gold_top10=0.4971 init_gold_top100=0.5482
+step=275 micro_steps=2200 elapsed=65.1s lr=3.000000e-04 loss_all=2.6022 acc_all=0.7098 loss_corrupt=3.4180 acc_corrupt=0.4904 corrupt_frac=0.5559 loss=3.4180 loss_recon=3.4180 loss_meanflow=0.0000 mean_model_t=0.4927 mean_corrupt_t=0.5018 mean_loss_t_weight=1.0000 prior_center_loss_beta=0.0000 wrong_frac=0.4963 init_acc_corrupt=0.4691 init_gold_top10=0.4982 init_gold_top100=0.5482
+step=300 micro_steps=2400 elapsed=64.9s lr=3.000000e-04 loss_all=2.2382 acc_all=0.7375 loss_corrupt=3.0869 acc_corrupt=0.5279 corrupt_frac=0.5560 loss=3.0869 loss_recon=3.0869 loss_meanflow=0.0000 mean_model_t=0.5101 mean_corrupt_t=0.5021 mean_loss_t_weight=1.0000 prior_center_loss_beta=0.0000 wrong_frac=0.4983 init_acc_corrupt=0.4672 init_gold_top10=0.4959 init_gold_top100=0.5493
+step=325 micro_steps=2600 elapsed=64.5s lr=3.000000e-04 loss_all=1.9767 acc_all=0.7410 loss_corrupt=2.8649 acc_corrupt=0.5331 corrupt_frac=0.5541 loss=2.8649 loss_recon=2.8649 loss_meanflow=0.0000 mean_model_t=0.5050 mean_corrupt_t=0.4960 mean_loss_t_weight=1.0000 prior_center_loss_beta=0.0000 wrong_frac=0.5083 init_acc_corrupt=0.4563 init_gold_top10=0.4857 init_gold_top100=0.5395
+step=350 micro_steps=2800 elapsed=65.1s lr=3.000000e-04 loss_all=1.6568 acc_all=0.7574 loss_corrupt=2.5097 acc_corrupt=0.5616 corrupt_frac=0.5528 loss=2.5097 loss_recon=2.5097 loss_meanflow=0.0000 mean_model_t=0.4963 mean_corrupt_t=0.5107 mean_loss_t_weight=1.0000 prior_center_loss_beta=0.0000 wrong_frac=0.4878 init_acc_corrupt=0.4793 init_gold_top10=0.5066 init_gold_top100=0.5567
+step=375 micro_steps=3000 elapsed=64.8s lr=3.000000e-04 loss_all=1.4678 acc_all=0.7555 loss_corrupt=2.3501 acc_corrupt=0.5523 corrupt_frac=0.5468 loss=2.3501 loss_recon=2.3501 loss_meanflow=0.0000 mean_model_t=0.5051 mean_corrupt_t=0.4982 mean_loss_t_weight=1.0000 prior_center_loss_beta=0.0000 wrong_frac=0.5020 init_acc_corrupt=0.4619 init_gold_top10=0.4920 init_gold_top100=0.5448
+step=400 micro_steps=3200 elapsed=64.6s lr=3.000000e-04 loss_all=1.2952 acc_all=0.7603 loss_corrupt=2.1428 acc_corrupt=0.5624 corrupt_frac=0.5475 loss=2.1428 loss_recon=2.1428 loss_meanflow=0.0000 mean_model_t=0.4886 mean_corrupt_t=0.5051 mean_loss_t_weight=1.0000 prior_center_loss_beta=0.0000 wrong_frac=0.4978 init_acc_corrupt=0.4664 init_gold_top10=0.4966 init_gold_top100=0.5475
+step=425 micro_steps=3400 elapsed=64.8s lr=3.000000e-04 loss_all=1.1578 acc_all=0.7646 loss_corrupt=1.9592 acc_corrupt=0.5729 corrupt_frac=0.5509 loss=1.9592 loss_recon=1.9592 loss_meanflow=0.0000 mean_model_t=0.5035 mean_corrupt_t=0.4999 mean_loss_t_weight=1.0000 prior_center_loss_beta=0.0000 wrong_frac=0.4939 init_acc_corrupt=0.4715 init_gold_top10=0.5003 init_gold_top100=0.5506
+step=450 micro_steps=3600 elapsed=64.7s lr=3.000000e-04 loss_all=1.0533 acc_all=0.7693 loss_corrupt=1.8220 acc_corrupt=0.5805 corrupt_frac=0.5503 loss=1.8220 loss_recon=1.8220 loss_meanflow=0.0000 mean_model_t=0.4990 mean_corrupt_t=0.5016 mean_loss_t_weight=1.0000 prior_center_loss_beta=0.0000 wrong_frac=0.4946 init_acc_corrupt=0.4706 init_gold_top10=0.4998 init_gold_top100=0.5502
+step=475 micro_steps=3800 elapsed=64.7s lr=3.000000e-04 loss_all=0.9739 acc_all=0.7743 loss_corrupt=1.6798 acc_corrupt=0.5961 corrupt_frac=0.5586 loss=1.6798 loss_recon=1.6798 loss_meanflow=0.0000 mean_model_t=0.5064 mean_corrupt_t=0.5119 mean_loss_t_weight=1.0000 prior_center_loss_beta=0.0000 wrong_frac=0.4869 init_acc_corrupt=0.4807 init_gold_top10=0.5078 init_gold_top100=0.5566
+step=500 micro_steps=4000 elapsed=64.8s lr=3.000000e-04 loss_all=0.8900 acc_all=0.7838 loss_corrupt=1.5706 acc_corrupt=0.6084 corrupt_frac=0.5516 loss=1.5706 loss_recon=1.5706 loss_meanflow=0.0000 mean_model_t=0.4952 mean_corrupt_t=0.5048 mean_loss_t_weight=1.0000 prior_center_loss_beta=0.0000 wrong_frac=0.4970 init_acc_corrupt=0.4698 init_gold_top10=0.4974 init_gold_top100=0.5487
+step=525 micro_steps=4200 elapsed=65.9s lr=3.000000e-04 loss_all=0.7953 acc_all=0.8066 loss_corrupt=1.4415 acc_corrupt=0.6420 corrupt_frac=0.5395 loss=1.4415 loss_recon=1.4415 loss_meanflow=0.0000 mean_model_t=0.5024 mean_corrupt_t=0.5028 mean_loss_t_weight=1.0000 prior_center_loss_beta=0.0000 wrong_frac=0.4925 init_acc_corrupt=0.4743 init_gold_top10=0.5020 init_gold_top100=0.5513
+step=550 micro_steps=4400 elapsed=64.7s lr=3.000000e-04 loss_all=0.7557 acc_all=0.8208 loss_corrupt=1.3456 acc_corrupt=0.6753 corrupt_frac=0.5522 loss=1.3456 loss_recon=1.3456 loss_meanflow=0.0000 mean_model_t=0.4996 mean_corrupt_t=0.4982 mean_loss_t_weight=1.0000 prior_center_loss_beta=0.0000 wrong_frac=0.5028 init_acc_corrupt=0.4624 init_gold_top10=0.4915 init_gold_top100=0.5432
+step=575 micro_steps=4600 elapsed=64.5s lr=3.000000e-04 loss_all=0.6402 acc_all=0.8644 loss_corrupt=1.1534 acc_corrupt=0.7515 corrupt_frac=0.5452 loss=1.1534 loss_recon=1.1534 loss_meanflow=0.0000 mean_model_t=0.5019 mean_corrupt_t=0.4997 mean_loss_t_weight=1.0000 prior_center_loss_beta=0.0000 wrong_frac=0.5020 init_acc_corrupt=0.4630 init_gold_top10=0.4920 init_gold_top100=0.5448
+step=600 micro_steps=4800 elapsed=64.6s lr=3.000000e-04 loss_all=0.5154 acc_all=0.9112 loss_corrupt=0.9217 acc_corrupt=0.8384 corrupt_frac=0.5482 loss=0.9217 loss_recon=0.9217 loss_meanflow=0.0000 mean_model_t=0.4926 mean_corrupt_t=0.4977 mean_loss_t_weight=1.0000 prior_center_loss_beta=0.0000 wrong_frac=0.5034 init_acc_corrupt=0.4628 init_gold_top10=0.4908 init_gold_top100=0.5449
+step=625 micro_steps=5000 elapsed=64.8s lr=3.000000e-04 loss_all=0.3651 acc_all=0.9599 loss_corrupt=0.6465 acc_corrupt=0.9279 corrupt_frac=0.5518 loss=0.6465 loss_recon=0.6465 loss_meanflow=0.0000 mean_model_t=0.5008 mean_corrupt_t=0.5011 mean_loss_t_weight=1.0000 prior_center_loss_beta=0.0000 wrong_frac=0.5007 init_acc_corrupt=0.4636 init_gold_top10=0.4933 init_gold_top100=0.5457
+step=650 micro_steps=5200 elapsed=64.4s lr=3.000000e-04 loss_all=0.2195 acc_all=0.9897 loss_corrupt=0.3918 acc_corrupt=0.9812 corrupt_frac=0.5405 loss=0.3918 loss_recon=0.3918 loss_meanflow=0.0000 mean_model_t=0.4929 mean_corrupt_t=0.4927 mean_loss_t_weight=1.0000 prior_center_loss_beta=0.0000 wrong_frac=0.5096 init_acc_corrupt=0.4549 init_gold_top10=0.4843 init_gold_top100=0.5381
+step=675 micro_steps=5400 elapsed=64.7s lr=3.000000e-04 loss_all=0.1207 acc_all=0.9953 loss_corrupt=0.2134 acc_corrupt=0.9913 corrupt_frac=0.5407 loss=0.2134 loss_recon=0.2134 loss_meanflow=0.0000 mean_model_t=0.5040 mean_corrupt_t=0.4962 mean_loss_t_weight=1.0000 prior_center_loss_beta=0.0000 wrong_frac=0.5034 init_acc_corrupt=0.4609 init_gold_top10=0.4908 init_gold_top100=0.5421
+W0513 02:15:32.291000 316519 torch/distributed/elastic/agent/server/api.py:719] Received 15 death signal, shutting down workers
+W0513 02:15:32.293000 316519 torch/distributed/elastic/multiprocessing/api.py:898] Sending process 316613 closing signal SIGTERM
+W0513 02:15:32.294000 316519 torch/distributed/elastic/multiprocessing/api.py:898] Sending process 316614 closing signal SIGTERM
+W0513 02:15:32.294000 316519 torch/distributed/elastic/multiprocessing/api.py:898] Sending process 316615 closing signal SIGTERM
+W0513 02:15:32.295000 316519 torch/distributed/elastic/multiprocessing/api.py:898] Sending process 316616 closing signal SIGTERM
+Traceback (most recent call last):
+  File "<frozen runpy>", line 198, in _run_module_as_main
+  File "<frozen runpy>", line 88, in _run_code
+  File "/usr/local/lib/python3.12/dist-packages/torch/distributed/run.py", line 922, in <module>
+    main()
+  File "/usr/local/lib/python3.12/dist-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 355, in wrapper
+    return f(*args, **kwargs)
+           ^^^^^^^^^^^^^^^^^^
+  File "/usr/local/lib/python3.12/dist-packages/torch/distributed/run.py", line 918, in main
+    run(args)
+  File "/usr/local/lib/python3.12/dist-packages/torch/distributed/run.py", line 909, in run
+    elastic_launch(
+  File "/usr/local/lib/python3.12/dist-packages/torch/distributed/launcher/api.py", line 139, in __call__
+    return launch_agent(self._config, self._entrypoint, list(args))
+           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+  File "/usr/local/lib/python3.12/dist-packages/torch/distributed/launcher/api.py", line 261, in launch_agent
+    result = agent.run()
+             ^^^^^^^^^^^
+  File "/usr/local/lib/python3.12/dist-packages/torch/distributed/elastic/metrics/api.py", line 137, in wrapper
+    result = f(*args, **kwargs)
+             ^^^^^^^^^^^^^^^^^^
+  File "/usr/local/lib/python3.12/dist-packages/torch/distributed/elastic/agent/server/api.py", line 711, in run
+    result = self._invoke_run(role)
+             ^^^^^^^^^^^^^^^^^^^^^^
+  File "/usr/local/lib/python3.12/dist-packages/torch/distributed/elastic/agent/server/api.py", line 870, in _invoke_run
+    time.sleep(monitor_interval)
+  File "/usr/local/lib/python3.12/dist-packages/torch/distributed/elastic/multiprocessing/api.py", line 84, in _terminate_process_handler
+    raise SignalException(f"Process {os.getpid()} got signal: {sigval}", sigval=sigval)
+torch.distributed.elastic.multiprocessing.api.SignalException: Process 316519 got signal: 15

LTA_openwebtext_dualt/logs/scalinglaw_4gpu_20260513/scalinglaw_samples2_192x3_c512_vocab50257_4gpu_3000step.log ADDED Viewed

	@@ -0,0 +1,66 @@

+*****************************************
+Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed.
+*****************************************
+[rank0]: Traceback (most recent call last):
+[rank0]:   File "/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/train.py", line 1000, in <module>
+[rank0]:     main()
+[rank0]:   File "/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/train.py", line 615, in main
+[rank0]:     raise ValueError("--owt_cached_chunks requires --wrap --wrap_mode stream")
+[rank0]: ValueError: --owt_cached_chunks requires --wrap --wrap_mode stream
+[rank0]:[W513 01:44:40.999052026 ProcessGroupNCCL.cpp:1487] Warning: WARNING: destroy_process_group() was not called before program exit, which can leak resources. For more info, please see https://pytorch.org/docs/stable/distributed.html#shutdown (function operator())
+[rank3]: Traceback (most recent call last):
+[rank3]:   File "/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/train.py", line 1000, in <module>
+[rank3]:     main()
+[rank3]:   File "/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/train.py", line 615, in main
+[rank3]:     raise ValueError("--owt_cached_chunks requires --wrap --wrap_mode stream")
+[rank3]: ValueError: --owt_cached_chunks requires --wrap --wrap_mode stream
+[rank2]: Traceback (most recent call last):
+[rank2]:   File "/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/train.py", line 1000, in <module>
+[rank2]:     main()
+[rank2]:   File "/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/train.py", line 615, in main
+[rank2]:     raise ValueError("--owt_cached_chunks requires --wrap --wrap_mode stream")
+[rank2]: ValueError: --owt_cached_chunks requires --wrap --wrap_mode stream
+[rank1]: Traceback (most recent call last):
+[rank1]:   File "/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/train.py", line 1000, in <module>
+[rank1]:     main()
+[rank1]:   File "/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/train.py", line 615, in main
+[rank1]:     raise ValueError("--owt_cached_chunks requires --wrap --wrap_mode stream")
+[rank1]: ValueError: --owt_cached_chunks requires --wrap --wrap_mode stream
+W0513 01:44:40.334000 313116 torch/distributed/elastic/multiprocessing/api.py:898] Sending process 313184 closing signal SIGTERM
+W0513 01:44:40.335000 313116 torch/distributed/elastic/multiprocessing/api.py:898] Sending process 313185 closing signal SIGTERM
+W0513 01:44:40.336000 313116 torch/distributed/elastic/multiprocessing/api.py:898] Sending process 313186 closing signal SIGTERM
+E0513 01:44:40.463000 313116 torch/distributed/elastic/multiprocessing/api.py:870] failed (exitcode: 1) local_rank: 0 (pid: 313183) of binary: /usr/bin/python
+Traceback (most recent call last):
+  File "<frozen runpy>", line 198, in _run_module_as_main
+  File "<frozen runpy>", line 88, in _run_code
+  File "/usr/local/lib/python3.12/dist-packages/torch/distributed/run.py", line 922, in <module>
+    main()
+  File "/usr/local/lib/python3.12/dist-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 355, in wrapper
+    return f(*args, **kwargs)
+           ^^^^^^^^^^^^^^^^^^
+  File "/usr/local/lib/python3.12/dist-packages/torch/distributed/run.py", line 918, in main
+    run(args)
+  File "/usr/local/lib/python3.12/dist-packages/torch/distributed/run.py", line 909, in run
+    elastic_launch(
+  File "/usr/local/lib/python3.12/dist-packages/torch/distributed/launcher/api.py", line 139, in __call__
+    return launch_agent(self._config, self._entrypoint, list(args))
+           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+  File "/usr/local/lib/python3.12/dist-packages/torch/distributed/launcher/api.py", line 270, in launch_agent
+    raise ChildFailedError(
+torch.distributed.elastic.multiprocessing.errors.ChildFailedError:
+============================================================
+train.py FAILED
+------------------------------------------------------------
+Failures:
+  <NO_OTHER_FAILURES>
+------------------------------------------------------------
+Root Cause (first observed failure):
+[0]:
+  time      : 2026-05-13_01:44:40
+  host      : localhost
+  rank      : 0 (local_rank: 0)
+  exitcode  : 1 (pid: 313183)
+  error_file: <N/A>
+  traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
+============================================================

LTA_openwebtext_dualt/logs/scalinglaw_4gpu_20260513/trace_ctx256_small384x6_step500.log ADDED Viewed

	@@ -0,0 +1 @@


1	+ {"out_json": "docs/lta_samples/metrics_20260513/scalinglaw_4gpu_20260513/ctx256_small384x6_step500/trace_steps64_c48_t1p45.json", "records": 10, "step": 500}

LTA_openwebtext_dualt/logs/scalinglaw_4gpu_20260513/trace_ctx512_small384x6_step500.log ADDED Viewed

	@@ -0,0 +1,3 @@

+/usr/local/lib/python3.12/dist-packages/torch/nn/modules/transformer.py:375: UserWarning: enable_nested_tensor is True, but self.use_nested_tensor is False because encoder_layer.norm_first was True
+  warnings.warn(
+{"out_json": "docs/lta_samples/metrics_20260513/scalinglaw_4gpu_20260513/ctx512_small384x6_step500/trace_steps64_c48_t1p45.json", "records": 10, "step": 500}

LTA_openwebtext_dualt/logs/scalinglaw_4gpu_20260513/trace_params512x8_c512_vocab50257_step750.log ADDED Viewed

	@@ -0,0 +1,3 @@

+/usr/local/lib/python3.12/dist-packages/torch/nn/modules/transformer.py:375: UserWarning: enable_nested_tensor is True, but self.use_nested_tensor is False because encoder_layer.norm_first was True
+  warnings.warn(
+{"out_json": "docs/lta_samples/metrics_20260513/scalinglaw_4gpu_20260513/params512x8_c512_vocab50257_step750/trace_steps64_c48_t1p45.json", "records": 10, "step": 750}

LTA_openwebtext_dualt/mini_owt_logdirichlet/.venv_qwen35_uv/lib/python3.12/site-packages/numpy/core/include/numpy/__multiarray_api.c ADDED Viewed

	@@ -0,0 +1,314 @@

+/* Table of API pointers (`PyArray_API`).
+ *
+ * These pointers will be stored in the C-object for use in other
+ * extension modules.
+ *
+ * Each entry is either a function name or the address of a global object
+ * (taken with `&`), cast to a pointer type so that it can be stored in this
+ * `void *` array.
+ *
+ * NOTE(review): consumers presumably look entries up by position, so the
+ * order of the entries likely must not change -- confirm before editing.
+ */
+void *PyArray_API[] = {
+        (void *) PyArray_GetNDArrayCVersion,
+        (void *) &PyBigArray_Type,
+        (void *) &PyArray_Type,
+        (void *) &PyArrayDescr_Type,
+        (void *) &PyArrayFlags_Type,
+        (void *) &PyArrayIter_Type,
+        (void *) &PyArrayMultiIter_Type,
+        (int *) &NPY_NUMUSERTYPES,
+        (void *) &PyBoolArrType_Type,
+        (void *) &_PyArrayScalar_BoolValues,
+        (void *) &PyGenericArrType_Type,
+        (void *) &PyNumberArrType_Type,
+        (void *) &PyIntegerArrType_Type,
+        (void *) &PySignedIntegerArrType_Type,
+        (void *) &PyUnsignedIntegerArrType_Type,
+        (void *) &PyInexactArrType_Type,
+        (void *) &PyFloatingArrType_Type,
+        (void *) &PyComplexFloatingArrType_Type,
+        (void *) &PyFlexibleArrType_Type,
+        (void *) &PyCharacterArrType_Type,
+        (void *) &PyByteArrType_Type,
+        (void *) &PyShortArrType_Type,
+        (void *) &PyIntArrType_Type,
+        (void *) &PyLongArrType_Type,
+        (void *) &PyLongLongArrType_Type,
+        (void *) &PyUByteArrType_Type,
+        (void *) &PyUShortArrType_Type,
+        (void *) &PyUIntArrType_Type,
+        (void *) &PyULongArrType_Type,
+        (void *) &PyULongLongArrType_Type,
+        (void *) &PyFloatArrType_Type,
+        (void *) &PyDoubleArrType_Type,
+        (void *) &PyLongDoubleArrType_Type,
+        (void *) &PyCFloatArrType_Type,
+        (void *) &PyCDoubleArrType_Type,
+        (void *) &PyCLongDoubleArrType_Type,
+        (void *) &PyObjectArrType_Type,
+        (void *) &PyStringArrType_Type,
+        (void *) &PyUnicodeArrType_Type,
+        (void *) &PyVoidArrType_Type,
+        (void *) PyArray_SetNumericOps,
+        (void *) PyArray_GetNumericOps,
+        (void *) PyArray_INCREF,
+        (void *) PyArray_XDECREF,
+        (void *) PyArray_SetStringFunction,
+        (void *) PyArray_DescrFromType,
+        (void *) PyArray_TypeObjectFromType,
+        (void *) PyArray_Zero,
+        (void *) PyArray_One,
+        (void *) PyArray_CastToType,
+        (void *) PyArray_CastTo,
+        (void *) PyArray_CastAnyTo,
+        (void *) PyArray_CanCastSafely,
+        (void *) PyArray_CanCastTo,
+        (void *) PyArray_ObjectType,
+        (void *) PyArray_DescrFromObject,
+        (void *) PyArray_ConvertToCommonType,
+        (void *) PyArray_DescrFromScalar,
+        (void *) PyArray_DescrFromTypeObject,
+        (void *) PyArray_Size,
+        (void *) PyArray_Scalar,
+        (void *) PyArray_FromScalar,
+        (void *) PyArray_ScalarAsCtype,
+        (void *) PyArray_CastScalarToCtype,
+        (void *) PyArray_CastScalarDirect,
+        (void *) PyArray_ScalarFromObject,
+        (void *) PyArray_GetCastFunc,
+        (void *) PyArray_FromDims,
+        (void *) PyArray_FromDimsAndDataAndDescr,
+        (void *) PyArray_FromAny,
+        (void *) PyArray_EnsureArray,
+        (void *) PyArray_EnsureAnyArray,
+        (void *) PyArray_FromFile,
+        (void *) PyArray_FromString,
+        (void *) PyArray_FromBuffer,
+        (void *) PyArray_FromIter,
+        (void *) PyArray_Return,
+        (void *) PyArray_GetField,
+        (void *) PyArray_SetField,
+        (void *) PyArray_Byteswap,
+        (void *) PyArray_Resize,
+        (void *) PyArray_MoveInto,
+        (void *) PyArray_CopyInto,
+        (void *) PyArray_CopyAnyInto,
+        (void *) PyArray_CopyObject,
+        (void *) PyArray_NewCopy,
+        (void *) PyArray_ToList,
+        (void *) PyArray_ToString,
+        (void *) PyArray_ToFile,
+        (void *) PyArray_Dump,
+        (void *) PyArray_Dumps,
+        (void *) PyArray_ValidType,
+        (void *) PyArray_UpdateFlags,
+        (void *) PyArray_New,
+        (void *) PyArray_NewFromDescr,
+        (void *) PyArray_DescrNew,
+        (void *) PyArray_DescrNewFromType,
+        (void *) PyArray_GetPriority,
+        (void *) PyArray_IterNew,
+        (void *) PyArray_MultiIterNew,
+        (void *) PyArray_PyIntAsInt,
+        (void *) PyArray_PyIntAsIntp,
+        (void *) PyArray_Broadcast,
+        (void *) PyArray_FillObjectArray,
+        (void *) PyArray_FillWithScalar,
+        (void *) PyArray_CheckStrides,
+        (void *) PyArray_DescrNewByteorder,
+        (void *) PyArray_IterAllButAxis,
+        (void *) PyArray_CheckFromAny,
+        (void *) PyArray_FromArray,
+        (void *) PyArray_FromInterface,
+        (void *) PyArray_FromStructInterface,
+        (void *) PyArray_FromArrayAttr,
+        (void *) PyArray_ScalarKind,
+        (void *) PyArray_CanCoerceScalar,
+        (void *) PyArray_NewFlagsObject,
+        (void *) PyArray_CanCastScalar,
+        (void *) PyArray_CompareUCS4,
+        (void *) PyArray_RemoveSmallest,
+        (void *) PyArray_ElementStrides,
+        (void *) PyArray_Item_INCREF,
+        (void *) PyArray_Item_XDECREF,
+        (void *) PyArray_FieldNames,
+        (void *) PyArray_Transpose,
+        (void *) PyArray_TakeFrom,
+        (void *) PyArray_PutTo,
+        (void *) PyArray_PutMask,
+        (void *) PyArray_Repeat,
+        (void *) PyArray_Choose,
+        (void *) PyArray_Sort,
+        (void *) PyArray_ArgSort,
+        (void *) PyArray_SearchSorted,
+        (void *) PyArray_ArgMax,
+        (void *) PyArray_ArgMin,
+        (void *) PyArray_Reshape,
+        (void *) PyArray_Newshape,
+        (void *) PyArray_Squeeze,
+        (void *) PyArray_View,
+        (void *) PyArray_SwapAxes,
+        (void *) PyArray_Max,
+        (void *) PyArray_Min,
+        (void *) PyArray_Ptp,
+        (void *) PyArray_Mean,
+        (void *) PyArray_Trace,
+        (void *) PyArray_Diagonal,
+        (void *) PyArray_Clip,
+        (void *) PyArray_Conjugate,
+        (void *) PyArray_Nonzero,
+        (void *) PyArray_Std,
+        (void *) PyArray_Sum,
+        (void *) PyArray_CumSum,
+        (void *) PyArray_Prod,
+        (void *) PyArray_CumProd,
+        (void *) PyArray_All,
+        (void *) PyArray_Any,
+        (void *) PyArray_Compress,
+        (void *) PyArray_Flatten,
+        (void *) PyArray_Ravel,
+        (void *) PyArray_MultiplyList,
+        (void *) PyArray_MultiplyIntList,
+        (void *) PyArray_GetPtr,
+        (void *) PyArray_CompareLists,
+        (void *) PyArray_AsCArray,
+        (void *) PyArray_As1D,
+        (void *) PyArray_As2D,
+        (void *) PyArray_Free,
+        (void *) PyArray_Converter,
+        (void *) PyArray_IntpFromSequence,
+        (void *) PyArray_Concatenate,
+        (void *) PyArray_InnerProduct,
+        (void *) PyArray_MatrixProduct,
+        (void *) PyArray_CopyAndTranspose,
+        (void *) PyArray_Correlate,
+        (void *) PyArray_TypestrConvert,
+        (void *) PyArray_DescrConverter,
+        (void *) PyArray_DescrConverter2,
+        (void *) PyArray_IntpConverter,
+        (void *) PyArray_BufferConverter,
+        (void *) PyArray_AxisConverter,
+        (void *) PyArray_BoolConverter,
+        (void *) PyArray_ByteorderConverter,
+        (void *) PyArray_OrderConverter,
+        (void *) PyArray_EquivTypes,
+        (void *) PyArray_Zeros,
+        (void *) PyArray_Empty,
+        (void *) PyArray_Where,
+        (void *) PyArray_Arange,
+        (void *) PyArray_ArangeObj,
+        (void *) PyArray_SortkindConverter,
+        (void *) PyArray_LexSort,
+        (void *) PyArray_Round,
+        (void *) PyArray_EquivTypenums,
+        (void *) PyArray_RegisterDataType,
+        (void *) PyArray_RegisterCastFunc,
+        (void *) PyArray_RegisterCanCast,
+        (void *) PyArray_InitArrFuncs,
+        (void *) PyArray_IntTupleFromIntp,
+        (void *) PyArray_TypeNumFromName,
+        (void *) PyArray_ClipmodeConverter,
+        (void *) PyArray_OutputConverter,
+        (void *) PyArray_BroadcastToShape,
+        (void *) _PyArray_SigintHandler,
+        (void *) _PyArray_GetSigintBuf,
+        (void *) PyArray_DescrAlignConverter,
+        (void *) PyArray_DescrAlignConverter2,
+        (void *) PyArray_SearchsideConverter,
+        (void *) PyArray_CheckAxis,
+        (void *) PyArray_OverflowMultiplyList,
+        (void *) PyArray_CompareString,
+        (void *) PyArray_MultiIterFromObjects,
+        (void *) PyArray_GetEndianness,
+        (void *) PyArray_GetNDArrayCFeatureVersion,
+        (void *) PyArray_Correlate2,
+        (void *) PyArray_NeighborhoodIterNew,
+        (void *) &PyTimeIntegerArrType_Type,
+        (void *) &PyDatetimeArrType_Type,
+        (void *) &PyTimedeltaArrType_Type,
+        (void *) &PyHalfArrType_Type,
+        (void *) &NpyIter_Type,
+        (void *) PyArray_SetDatetimeParseFunction,
+        (void *) PyArray_DatetimeToDatetimeStruct,
+        (void *) PyArray_TimedeltaToTimedeltaStruct,
+        (void *) PyArray_DatetimeStructToDatetime,
+        (void *) PyArray_TimedeltaStructToTimedelta,
+        (void *) NpyIter_New,
+        (void *) NpyIter_MultiNew,
+        (void *) NpyIter_AdvancedNew,
+        (void *) NpyIter_Copy,
+        (void *) NpyIter_Deallocate,
+        (void *) NpyIter_HasDelayedBufAlloc,
+        (void *) NpyIter_HasExternalLoop,
+        (void *) NpyIter_EnableExternalLoop,
+        (void *) NpyIter_GetInnerStrideArray,
+        (void *) NpyIter_GetInnerLoopSizePtr,
+        (void *) NpyIter_Reset,
+        (void *) NpyIter_ResetBasePointers,
+        (void *) NpyIter_ResetToIterIndexRange,
+        (void *) NpyIter_GetNDim,
+        (void *) NpyIter_GetNOp,
+        (void *) NpyIter_GetIterNext,
+        (void *) NpyIter_GetIterSize,
+        (void *) NpyIter_GetIterIndexRange,
+        (void *) NpyIter_GetIterIndex,
+        (void *) NpyIter_GotoIterIndex,
+        (void *) NpyIter_HasMultiIndex,
+        (void *) NpyIter_GetShape,
+        (void *) NpyIter_GetGetMultiIndex,
+        (void *) NpyIter_GotoMultiIndex,
+        (void *) NpyIter_RemoveMultiIndex,
+        (void *) NpyIter_HasIndex,
+        (void *) NpyIter_IsBuffered,
+        (void *) NpyIter_IsGrowInner,
+        (void *) NpyIter_GetBufferSize,
+        (void *) NpyIter_GetIndexPtr,
+        (void *) NpyIter_GotoIndex,
+        (void *) NpyIter_GetDataPtrArray,
+        (void *) NpyIter_GetDescrArray,
+        (void *) NpyIter_GetOperandArray,
+        (void *) NpyIter_GetIterView,
+        (void *) NpyIter_GetReadFlags,
+        (void *) NpyIter_GetWriteFlags,
+        (void *) NpyIter_DebugPrint,
+        (void *) NpyIter_IterationNeedsAPI,
+        (void *) NpyIter_GetInnerFixedStrideArray,
+        (void *) NpyIter_RemoveAxis,
+        (void *) NpyIter_GetAxisStrideArray,
+        (void *) NpyIter_RequiresBuffering,
+        (void *) NpyIter_GetInitialDataPtrArray,
+        (void *) NpyIter_CreateCompatibleStrides,
+        (void *) PyArray_CastingConverter,
+        (void *) PyArray_CountNonzero,
+        (void *) PyArray_PromoteTypes,
+        (void *) PyArray_MinScalarType,
+        (void *) PyArray_ResultType,
+        (void *) PyArray_CanCastArrayTo,
+        (void *) PyArray_CanCastTypeTo,
+        (void *) PyArray_EinsteinSum,
+        (void *) PyArray_NewLikeArray,
+        (void *) PyArray_GetArrayParamsFromObject,
+        (void *) PyArray_ConvertClipmodeSequence,
+        (void *) PyArray_MatrixProduct2,
+        (void *) NpyIter_IsFirstVisit,
+        (void *) PyArray_SetBaseObject,
+        (void *) PyArray_CreateSortedStridePerm,
+        (void *) PyArray_RemoveAxesInPlace,
+        (void *) PyArray_DebugPrint,
+        (void *) PyArray_FailUnlessWriteable,
+        (void *) PyArray_SetUpdateIfCopyBase,
+        (void *) PyDataMem_NEW,
+        (void *) PyDataMem_FREE,
+        (void *) PyDataMem_RENEW,
+        (void *) PyDataMem_SetEventHook,
+        (NPY_CASTING *) &NPY_DEFAULT_ASSIGN_CASTING,
+        (void *) PyArray_MapIterSwapAxes,
+        (void *) PyArray_MapIterArray,
+        (void *) PyArray_MapIterNext,
+        (void *) PyArray_Partition,
+        (void *) PyArray_ArgPartition,
+        (void *) PyArray_SelectkindConverter,
+        (void *) PyDataMem_NEW_ZEROED,
+        (void *) PyArray_CheckAnyScalarExact,
+        (void *) PyArray_MapIterArrayCopyIfOverlap,
+        (void *) PyArray_ResolveWritebackIfCopy,
+        (void *) PyArray_SetWritebackIfCopyBase,
+        (void *) PyDataMem_SetHandler,
+        (void *) PyDataMem_GetHandler,
+        (PyObject* *) &PyDataMem_DefaultHandler
+};

LTA_openwebtext_dualt/mini_owt_logdirichlet/.venv_qwen35_uv/lib/python3.12/site-packages/numpy/core/include/numpy/experimental_dtype_api.h ADDED Viewed

	@@ -0,0 +1,365 @@

+/*
+ * This header exports the new experimental DType API as proposed in
+ * NEPs 41 to 43.  For background, please check these NEPs.  Otherwise,
+ * this header also serves as documentation for the time being.
+ *
+ * The header includes `_dtype_api.h`, which holds most definitions, while
+ * this header mainly wraps functions for public consumption.
+ *
+ * Please do not hesitate to contact @seberg with questions.  This is
+ * developed together with https://github.com/seberg/experimental_user_dtypes
+ * and those interested in experimenting are encouraged to contribute there.
+ *
+ * To use the functions defined in the header, call::
+ *
+ *     if (import_experimental_dtype_api(version) < 0) {
+ *         return NULL;
+ *     }
+ *
+ * in your module init.  (A version mismatch will be reported; just update
+ * to the correct one.  The mismatch alerts you to possible changes.)
+ *
+ * The following lists the main symbols currently exported.  Please do not
+ * hesitate to ask for help or clarification:
+ *
+ * - PyUFunc_AddLoopFromSpec:
+ *
+ *     Register a new loop for a ufunc.  This uses the `PyArrayMethod_Spec`
+ *     which must be filled in (see in-line comments).
+ *
+ * - PyUFunc_AddWrappingLoop:
+ *
+ *     Register a new loop which reuses an existing one, but modifies the
+ *     result dtypes.  Please search the internal NumPy docs for more info
+ *     at this point.  (Used for physical units dtype.)
+ *
+ * - PyUFunc_AddPromoter:
+ *
+ *     Register a new promoter for a ufunc.  A promoter is a function stored
+ *     in a PyCapsule (see in-line comments).  It is passed the operation and
+ *     requested DType signatures and can mutate it to attempt a new search
+ *     for a matching loop/promoter.
+ *     I.e. for Numba a promoter could even add the desired loop.
+ *
+ * - PyArrayInitDTypeMeta_FromSpec:
+ *
+ *     Initialize a new DType.  It must currently be a static Python C type
+ *     that is declared as `PyArray_DTypeMeta` and not `PyTypeObject`.
+ *     Further, it must subclass `np.dtype` and set its type to
+ *     `PyArrayDTypeMeta_Type` (before calling `PyType_Ready()`).
+ *
+ * - PyArray_CommonDType:
+ *
+ *     Find the common-dtype ("promotion") for two DType classes.  Similar
+ *     to `np.result_type`, but works on the classes and not instances.
+ *
+ * - PyArray_PromoteDTypeSequence:
+ *
+ *     Same as CommonDType, but works with an arbitrary number of DTypes.
+ *     This function is smarter and can often return successful and unambiguous
+ *     results when `common_dtype(common_dtype(dt1, dt2), dt3)` would
+ *     depend on the operation order or fail.  Nevertheless, DTypes should
+ *     aim to ensure that their common-dtype implementation is associative
+ *     and commutative!  (Mainly, unsigned and signed integers are not.)
+ *
+ *     For guaranteed consistent results DTypes must implement common-DType
+ *     "transitively".  If A promotes B and B promotes C, then A must generally
+ *     also promote C; where "promotes" means implements the promotion.
+ *     (There are some exceptions for abstract DTypes)
+ *
+ * - PyArray_GetDefaultDescr:
+ *
+ *     Given a DType class, returns the default instance (descriptor).
+ *     This is an inline function checking for `singleton` first and only
+ *     calls the `default_descr` function if necessary.
+ *
+ * - PyArray_DoubleDType, etc.:
+ *
+ *     Aliases to the DType classes for the builtin NumPy DTypes.
+ *
+ * WARNING
+ * =======
+ *
+ * By using this header, you understand that this is a fully experimental
+ * exposure.  Details are expected to change, and some options may have no
+ * effect.  (Please contact @seberg if you have questions!)
+ * If the exposure stops working, please file a bug report with NumPy.
+ * Further, a DType created using this API/header should still be expected
+ * to be incompatible with some functionality inside and outside of NumPy.
+ * In this case crashes must be expected.  Please report any such problems
+ * so that they can be fixed before final exposure.
+ * Furthermore, expect missing checks for programming errors which the final
+ * API is expected to have.
+ *
+ * Symbols with a leading underscore are likely to not be included in the
+ * first public version, if these are central to your use-case, please let
+ * us know, so that we can reconsider.
+ *
+ * "Array-like" consumer API not yet under considerations
+ * ======================================================
+ *
+ * The new DType API is designed in a way to make it potentially useful for
+ * alternative "array-like" implementations.  This will require careful
+ * exposure of details and functions and is not part of this experimental API.
+ *
+ * Brief (incompatibility) changelog
+ * =================================
+ *
+ * 2. None (only additions).
+ * 3. New `npy_intp *view_offset` argument for `resolve_descriptors`.
+ *    This replaces the `NPY_CAST_IS_VIEW` flag.  It can be set to 0 if the
+ *    operation is a view, and is pre-initialized to `NPY_MIN_INTP` indicating
+ *    that the operation is not a view.
+ */
+#ifndef NUMPY_CORE_INCLUDE_NUMPY_EXPERIMENTAL_DTYPE_API_H_
+#define NUMPY_CORE_INCLUDE_NUMPY_EXPERIMENTAL_DTYPE_API_H_
+#include <Python.h>
+#include "ndarraytypes.h"
+#include "_dtype_api.h"
+/*
+ * The contents of PyArrayMethodObject are currently opaque (is there a
+ * good way to make them be `PyObject *`?)
+ */
+typedef struct PyArrayMethodObject_tag PyArrayMethodObject;
+/*
+ * There must be a better way?! -- Oh well, this is experimental
+ * (my issue with it, is that I cannot undef those helpers).
+ */
+#if defined(PY_ARRAY_UNIQUE_SYMBOL)
+    #define NPY_EXP_DTYPE_API_CONCAT_HELPER2(x, y) x ## y
+    #define NPY_EXP_DTYPE_API_CONCAT_HELPER(arg) NPY_EXP_DTYPE_API_CONCAT_HELPER2(arg, __experimental_dtype_api_table)
+    #define __experimental_dtype_api_table NPY_EXP_DTYPE_API_CONCAT_HELPER(PY_ARRAY_UNIQUE_SYMBOL)
+#else
+    #define __experimental_dtype_api_table __experimental_dtype_api_table
+#endif
+/* Support for correct multi-file projects: */
+#if defined(NO_IMPORT) || defined(NO_IMPORT_ARRAY)
+    extern void **__experimental_dtype_api_table;
+#else
+    /*
+     * Just a hack so I don't forget importing as much myself, I spend way too
+     * much time noticing it the first time around :).
+     */
+    /*
+     * Placeholder callback: prints a loud diagnostic stating that the dtype
+     * API was used without having been imported.  Takes and returns nothing.
+     */
+    static void
+    __not_imported(void)
+    {
+        fputs("*****\nCritical error, dtype API not imported\n*****\n", stdout);
+    }
+    static void *__uninitialized_table[] = {
+            &__not_imported, &__not_imported, &__not_imported, &__not_imported,
+            &__not_imported, &__not_imported, &__not_imported, &__not_imported};
+    #if defined(PY_ARRAY_UNIQUE_SYMBOL)
+        void **__experimental_dtype_api_table = __uninitialized_table;
+    #else
+        static void **__experimental_dtype_api_table = __uninitialized_table;
+    #endif
+#endif
+typedef int _ufunc_addloop_fromspec_func(
+        PyObject *ufunc, PyArrayMethod_Spec *spec);
+/*
+ * The main ufunc registration function.  This adds a new implementation/loop
+ * to a ufunc.  It replaces `PyUFunc_RegisterLoopForType`.
+ */
+#define PyUFunc_AddLoopFromSpec \
+    (*(_ufunc_addloop_fromspec_func *)(__experimental_dtype_api_table[0]))
+/* Please see the NumPy definitions in `array_method.h` for details on these */
+typedef int translate_given_descrs_func(int nin, int nout,
+        PyArray_DTypeMeta *wrapped_dtypes[],
+        PyArray_Descr *given_descrs[], PyArray_Descr *new_descrs[]);
+typedef int translate_loop_descrs_func(int nin, int nout,
+        PyArray_DTypeMeta *new_dtypes[], PyArray_Descr *given_descrs[],
+        PyArray_Descr *original_descrs[], PyArray_Descr *loop_descrs[]);
+typedef int _ufunc_wrapping_loop_func(PyObject *ufunc_obj,
+        PyArray_DTypeMeta *new_dtypes[], PyArray_DTypeMeta *wrapped_dtypes[],
+        translate_given_descrs_func *translate_given_descrs,
+        translate_loop_descrs_func *translate_loop_descrs);
+#define PyUFunc_AddWrappingLoop \
+    (*(_ufunc_wrapping_loop_func *)(__experimental_dtype_api_table[7]))
+/*
+ * Type of the C promoter function, which must be wrapped into a
+ * PyCapsule with name "numpy._ufunc_promoter".
+ *
+ * Note that currently the output dtypes are always NULL unless they are
+ * also part of the signature.  This is an implementation detail and could
+ * change in the future.  However, in general promoters should not have a
+ * need for output dtypes.
+ * (There are potential use-cases, these are currently unsupported.)
+ */
+typedef int promoter_function(PyObject *ufunc,
+        PyArray_DTypeMeta *op_dtypes[], PyArray_DTypeMeta *signature[],
+        PyArray_DTypeMeta *new_op_dtypes[]);
+/*
+ * Function to register a promoter.
+ *
+ * @param ufunc The ufunc object to register the promoter with.
+ * @param DType_tuple A Python tuple containing DTypes or None matching the
+ *        number of inputs and outputs of the ufunc.
+ * @param promoter A PyCapsule with name "numpy._ufunc_promoter" containing
+ *        a pointer to a `promoter_function`.
+ */
+typedef int _ufunc_addpromoter_func(
+        PyObject *ufunc, PyObject *DType_tuple, PyObject *promoter);
+#define PyUFunc_AddPromoter \
+    (*(_ufunc_addpromoter_func *)(__experimental_dtype_api_table[1]))
+#define PyArrayDTypeMeta_Type \
+    (*(PyTypeObject *)__experimental_dtype_api_table[2])
+typedef int __dtypemeta_fromspec(
+        PyArray_DTypeMeta *DType, PyArrayDTypeMeta_Spec *dtype_spec);
+/*
+ * Finalize creation of a DTypeMeta.  You must ensure that the DTypeMeta is
+ * a proper subclass.  The DTypeMeta object has additional fields compared to
+ * a normal PyTypeObject!
+ * The only (easy) creation of a new DType is to create a static Type which
+ * inherits `PyArray_DescrType`, sets its type to `PyArrayDTypeMeta_Type` and
+ * uses `PyArray_DTypeMeta` defined above as the C-structure.
+ */
+#define PyArrayInitDTypeMeta_FromSpec \
+    ((__dtypemeta_fromspec *)(__experimental_dtype_api_table[3]))
+/*
+ * *************************************
+ *          WORKING WITH DTYPES
+ * *************************************
+ */
+typedef PyArray_DTypeMeta *__common_dtype(
+        PyArray_DTypeMeta *DType1, PyArray_DTypeMeta *DType2);
+#define PyArray_CommonDType \
+    ((__common_dtype *)(__experimental_dtype_api_table[4]))
+typedef PyArray_DTypeMeta *__promote_dtype_sequence(
+        npy_intp num, PyArray_DTypeMeta *DTypes[]);
+#define PyArray_PromoteDTypeSequence \
+    ((__promote_dtype_sequence *)(__experimental_dtype_api_table[5]))
+typedef PyArray_Descr *__get_default_descr(
+        PyArray_DTypeMeta *DType);
+#define _PyArray_GetDefaultDescr \
+    ((__get_default_descr *)(__experimental_dtype_api_table[6]))
+/*
+ * Return the default descriptor instance for `DType`.
+ *
+ * When the class caches a `singleton`, its reference count is incremented and
+ * that object is returned directly; only otherwise is the lookup delegated to
+ * `_PyArray_GetDefaultDescr` (slot 6 of the experimental API table).
+ */
+static inline PyArray_Descr *
+PyArray_GetDefaultDescr(PyArray_DTypeMeta *DType)
+{
+    PyArray_Descr *cached = DType->singleton;
+    if (cached == NULL) {
+        /* No cached instance: ask NumPy to produce the default one. */
+        return _PyArray_GetDefaultDescr(DType);
+    }
+    Py_INCREF(cached);
+    return cached;
+}
+/*
+ * NumPy's builtin DTypes:
+ */
+#define PyArray_BoolDType (*(PyArray_DTypeMeta *)__experimental_dtype_api_table[10])
+/* Integers */
+#define PyArray_ByteDType (*(PyArray_DTypeMeta *)__experimental_dtype_api_table[11])
+#define PyArray_UByteDType (*(PyArray_DTypeMeta *)__experimental_dtype_api_table[12])
+#define PyArray_ShortDType (*(PyArray_DTypeMeta *)__experimental_dtype_api_table[13])
+#define PyArray_UShortDType (*(PyArray_DTypeMeta *)__experimental_dtype_api_table[14])
+#define PyArray_IntDType (*(PyArray_DTypeMeta *)__experimental_dtype_api_table[15])
+#define PyArray_UIntDType (*(PyArray_DTypeMeta *)__experimental_dtype_api_table[16])
+#define PyArray_LongDType (*(PyArray_DTypeMeta *)__experimental_dtype_api_table[17])
+#define PyArray_ULongDType (*(PyArray_DTypeMeta *)__experimental_dtype_api_table[18])
+#define PyArray_LongLongDType (*(PyArray_DTypeMeta *)__experimental_dtype_api_table[19])
+#define PyArray_ULongLongDType (*(PyArray_DTypeMeta *)__experimental_dtype_api_table[20])
+/* Integer aliases */
+#define PyArray_Int8Type (*(PyArray_DTypeMeta *)__experimental_dtype_api_table[21])
+#define PyArray_UInt8DType (*(PyArray_DTypeMeta *)__experimental_dtype_api_table[22])
+#define PyArray_Int16DType (*(PyArray_DTypeMeta *)__experimental_dtype_api_table[23])
+#define PyArray_UInt16DType (*(PyArray_DTypeMeta *)__experimental_dtype_api_table[24])
+#define PyArray_Int32DType (*(PyArray_DTypeMeta *)__experimental_dtype_api_table[25])
+#define PyArray_UInt32DType (*(PyArray_DTypeMeta *)__experimental_dtype_api_table[26])
+#define PyArray_Int64DType (*(PyArray_DTypeMeta *)__experimental_dtype_api_table[27])
+#define PyArray_UInt64DType (*(PyArray_DTypeMeta *)__experimental_dtype_api_table[28])
+#define PyArray_IntpDType (*(PyArray_DTypeMeta *)__experimental_dtype_api_table[29])
+#define PyArray_UIntpDType (*(PyArray_DTypeMeta *)__experimental_dtype_api_table[30])
+/* Floats */
+#define PyArray_HalfType (*(PyArray_DTypeMeta *)__experimental_dtype_api_table[31])
+#define PyArray_FloatDType (*(PyArray_DTypeMeta *)__experimental_dtype_api_table[32])
+#define PyArray_DoubleDType (*(PyArray_DTypeMeta *)__experimental_dtype_api_table[33])
+#define PyArray_LongDoubleDType (*(PyArray_DTypeMeta *)__experimental_dtype_api_table[34])
+/* Complex */
+#define PyArray_CFloatDType (*(PyArray_DTypeMeta *)__experimental_dtype_api_table[35])
+#define PyArray_CDoubleDType (*(PyArray_DTypeMeta *)__experimental_dtype_api_table[36])
+#define PyArray_CLongDoubleDType (*(PyArray_DTypeMeta *)__experimental_dtype_api_table[37])
+/* String/Bytes */
+#define PyArray_StringDType (*(PyArray_DTypeMeta *)__experimental_dtype_api_table[38])
+#define PyArray_UnicodeDType (*(PyArray_DTypeMeta *)__experimental_dtype_api_table[39])
+/* Datetime/Timedelta */
+#define PyArray_DatetimeDType (*(PyArray_DTypeMeta *)__experimental_dtype_api_table[40])
+#define PyArray_TimedeltaDType (*(PyArray_DTypeMeta *)__experimental_dtype_api_table[41])
+/* Object/Void */
+#define PyArray_ObjectDType (*(PyArray_DTypeMeta *)__experimental_dtype_api_table[42])
+#define PyArray_VoidDType (*(PyArray_DTypeMeta *)__experimental_dtype_api_table[43])
+/*
+ * ********************************
+ *         Initialization
+ * ********************************
+ *
+ * Import the experimental API, the version must match the one defined in
+ * the header to ensure changes are taken into account. NumPy will further
+ * runtime-check this.
+ * You must call this function to use the symbols defined in this file.
+ */
+#if !defined(NO_IMPORT) && !defined(NO_IMPORT_ARRAY)
+/*
+ * Load the experimental dtype API table from NumPy.
+ *
+ * `version` must equal `__EXPERIMENTAL_DTYPE_API_VERSION`; a mismatch sets a
+ * RuntimeError.  Returns 0 on success (including when the table was already
+ * imported) and -1 with a Python exception set on failure, in which case the
+ * table still points at the uninitialized placeholder.
+ */
+static int
+import_experimental_dtype_api(int version)
+{
+    PyObject *umath_module;
+    PyObject *capsule;
+    void **table;
+    if (version != __EXPERIMENTAL_DTYPE_API_VERSION) {
+        PyErr_Format(PyExc_RuntimeError,
+                "DType API version %d did not match header version %d. Please "
+                "update the import statement and check for API changes.",
+                version, __EXPERIMENTAL_DTYPE_API_VERSION);
+        return -1;
+    }
+    /* A table other than the placeholder means a previous import succeeded. */
+    if (__experimental_dtype_api_table != __uninitialized_table) {
+        return 0;
+    }
+    umath_module = PyImport_ImportModule("numpy.core._multiarray_umath");
+    if (umath_module == NULL) {
+        return -1;
+    }
+    capsule = PyObject_CallMethod(umath_module,
+        "_get_experimental_dtype_api", "i", version);
+    /* The module is only needed for the call above. */
+    Py_DECREF(umath_module);
+    if (capsule == NULL) {
+        return -1;
+    }
+    table = (void **)PyCapsule_GetPointer(capsule,
+            "experimental_dtype_api_table");
+    Py_DECREF(capsule);
+    if (table == NULL) {
+        /* Leave the placeholder table in place so a later import can retry. */
+        return -1;
+    }
+    __experimental_dtype_api_table = table;
+    return 0;
+}
+#endif  /* !defined(NO_IMPORT) && !defined(NO_IMPORT_ARRAY) */
+#endif  /* NUMPY_CORE_INCLUDE_NUMPY_EXPERIMENTAL_DTYPE_API_H_ */

LTA_openwebtext_dualt/mini_owt_logdirichlet/.venv_qwen35_uv/lib/python3.12/site-packages/numpy/core/include/numpy/halffloat.h ADDED Viewed

	@@ -0,0 +1,70 @@

+#ifndef NUMPY_CORE_INCLUDE_NUMPY_HALFFLOAT_H_
+#define NUMPY_CORE_INCLUDE_NUMPY_HALFFLOAT_H_
+#include <Python.h>
+#include <numpy/npy_math.h>
+#ifdef __cplusplus
+extern "C" {
+#endif
+/*
+ * Half-precision routines
+ */
+/* Conversions */
+float npy_half_to_float(npy_half h);
+double npy_half_to_double(npy_half h);
+npy_half npy_float_to_half(float f);
+npy_half npy_double_to_half(double d);
+/* Comparisons */
+int npy_half_eq(npy_half h1, npy_half h2);
+int npy_half_ne(npy_half h1, npy_half h2);
+int npy_half_le(npy_half h1, npy_half h2);
+int npy_half_lt(npy_half h1, npy_half h2);
+int npy_half_ge(npy_half h1, npy_half h2);
+int npy_half_gt(npy_half h1, npy_half h2);
+/* faster *_nonan variants for when you know h1 and h2 are not NaN */
+int npy_half_eq_nonan(npy_half h1, npy_half h2);
+int npy_half_lt_nonan(npy_half h1, npy_half h2);
+int npy_half_le_nonan(npy_half h1, npy_half h2);
+/* Miscellaneous functions */
+int npy_half_iszero(npy_half h);
+int npy_half_isnan(npy_half h);
+int npy_half_isinf(npy_half h);
+int npy_half_isfinite(npy_half h);
+int npy_half_signbit(npy_half h);
+npy_half npy_half_copysign(npy_half x, npy_half y);
+npy_half npy_half_spacing(npy_half h);
+npy_half npy_half_nextafter(npy_half x, npy_half y);
+npy_half npy_half_divmod(npy_half x, npy_half y, npy_half *modulus);
+/*
+ * Half-precision constants
+ */
+#define NPY_HALF_ZERO   (0x0000u)
+#define NPY_HALF_PZERO  (0x0000u)
+#define NPY_HALF_NZERO  (0x8000u)
+#define NPY_HALF_ONE    (0x3c00u)
+#define NPY_HALF_NEGONE (0xbc00u)
+#define NPY_HALF_PINF   (0x7c00u)
+#define NPY_HALF_NINF   (0xfc00u)
+#define NPY_HALF_NAN    (0x7e00u)
+#define NPY_MAX_HALF    (0x7bffu)
+/*
+ * Bit-level conversions
+ */
+npy_uint16 npy_floatbits_to_halfbits(npy_uint32 f);
+npy_uint16 npy_doublebits_to_halfbits(npy_uint64 d);
+npy_uint32 npy_halfbits_to_floatbits(npy_uint16 h);
+npy_uint64 npy_halfbits_to_doublebits(npy_uint16 h);
+#ifdef __cplusplus
+}
+#endif
+#endif  /* NUMPY_CORE_INCLUDE_NUMPY_HALFFLOAT_H_ */

LTA_openwebtext_dualt/mini_owt_logdirichlet/.venv_qwen35_uv/lib/python3.12/site-packages/numpy/core/include/numpy/npy_cpu.h ADDED Viewed

	@@ -0,0 +1,129 @@

+/*
+ * This sets (target) CPU-specific macros:
+ *      - Possible values:
+ *              NPY_CPU_X86
+ *              NPY_CPU_AMD64
+ *              NPY_CPU_PPC
+ *              NPY_CPU_PPC64
+ *              NPY_CPU_PPC64LE
+ *              NPY_CPU_SPARC
+ *              NPY_CPU_S390
+ *              NPY_CPU_IA64
+ *              NPY_CPU_HPPA
+ *              NPY_CPU_ALPHA
+ *              NPY_CPU_ARMEL
+ *              NPY_CPU_ARMEB
+ *              NPY_CPU_SH_LE
+ *              NPY_CPU_SH_BE
+ *              NPY_CPU_ARCEL
+ *              NPY_CPU_ARCEB
+ *              NPY_CPU_RISCV64
+ *              NPY_CPU_LOONGARCH
+ *              NPY_CPU_WASM
+ */
+#ifndef NUMPY_CORE_INCLUDE_NUMPY_NPY_CPU_H_
+#define NUMPY_CORE_INCLUDE_NUMPY_NPY_CPU_H_
+#include "numpyconfig.h"
+#if defined( __i386__ ) || defined(i386) || defined(_M_IX86)
+    /*
+     * __i386__ is defined by gcc and Intel compiler on Linux,
+     * _M_IX86 by VS compiler,
+     * i386 by Sun compilers on opensolaris at least
+     */
+    #define NPY_CPU_X86
+#elif defined(__x86_64__) || defined(__amd64__) || defined(__x86_64) || defined(_M_AMD64)
+    /*
+     * both __x86_64__ and __amd64__ are defined by gcc
+     * __x86_64 defined by sun compiler on opensolaris at least
+     * _M_AMD64 defined by MS compiler
+     */
+    #define NPY_CPU_AMD64
+#elif defined(__powerpc64__) && defined(__LITTLE_ENDIAN__)
+    #define NPY_CPU_PPC64LE
+#elif defined(__powerpc64__) && defined(__BIG_ENDIAN__)
+    #define NPY_CPU_PPC64
+#elif defined(__ppc__) || defined(__powerpc__) || defined(_ARCH_PPC)
+    /*
+     * __ppc__ is defined by gcc, I remember having seen __powerpc__ once,
+     * but can't find it ATM
+     * _ARCH_PPC is used by at least gcc on AIX
+     * As __powerpc__ and _ARCH_PPC are also defined by PPC64 check
+     * for those specifically first before defaulting to ppc
+     */
+    #define NPY_CPU_PPC
+#elif defined(__sparc__) || defined(__sparc)
+    /* __sparc__ is defined by gcc and Forte (e.g. Sun) compilers */
+    #define NPY_CPU_SPARC
+#elif defined(__s390__)
+    #define NPY_CPU_S390
+#elif defined(__ia64)
+    #define NPY_CPU_IA64
+#elif defined(__hppa)
+    #define NPY_CPU_HPPA
+#elif defined(__alpha__)
+    #define NPY_CPU_ALPHA
+#elif defined(__arm__) || defined(__aarch64__) || defined(_M_ARM64)
+    /* _M_ARM64 is defined in MSVC for ARM64 compilation on Windows */
+    #if defined(__ARMEB__) || defined(__AARCH64EB__)
+        #if defined(__ARM_32BIT_STATE)
+            #define NPY_CPU_ARMEB_AARCH32
+        #elif defined(__ARM_64BIT_STATE)
+            #define NPY_CPU_ARMEB_AARCH64
+        #else
+            #define NPY_CPU_ARMEB
+        #endif
+    #elif defined(__ARMEL__) || defined(__AARCH64EL__) || defined(_M_ARM64)
+        #if defined(__ARM_32BIT_STATE)
+            #define NPY_CPU_ARMEL_AARCH32
+        #elif defined(__ARM_64BIT_STATE) || defined(_M_ARM64) || defined(__AARCH64EL__)
+            #define NPY_CPU_ARMEL_AARCH64
+        #else
+            #define NPY_CPU_ARMEL
+        #endif
+    #else
+        # error Unknown ARM CPU, please report this to numpy maintainers with \
+	information about your platform (OS, CPU and compiler)
+    #endif
+#elif defined(__sh__) && defined(__LITTLE_ENDIAN__)
+    #define NPY_CPU_SH_LE
+#elif defined(__sh__) && defined(__BIG_ENDIAN__)
+    #define NPY_CPU_SH_BE
+#elif defined(__MIPSEL__)
+    #define NPY_CPU_MIPSEL
+#elif defined(__MIPSEB__)
+    #define NPY_CPU_MIPSEB
+#elif defined(__or1k__)
+    #define NPY_CPU_OR1K
+#elif defined(__mc68000__)
+    #define NPY_CPU_M68K
+#elif defined(__arc__) && defined(__LITTLE_ENDIAN__)
+    #define NPY_CPU_ARCEL
+#elif defined(__arc__) && defined(__BIG_ENDIAN__)
+    #define NPY_CPU_ARCEB
+#elif defined(__riscv) && defined(__riscv_xlen) && __riscv_xlen == 64
+    #define NPY_CPU_RISCV64
+#elif defined(__loongarch__)
+    #define NPY_CPU_LOONGARCH
+#elif defined(__EMSCRIPTEN__)
+    /* __EMSCRIPTEN__ is defined by emscripten: an LLVM-to-Web compiler */
+    #define NPY_CPU_WASM
+#else
+    #error Unknown CPU, please report this to numpy maintainers with \
+    information about your platform (OS, CPU and compiler)
+#endif
+/*
+ * Except for the following architectures, memory access is limited to the natural
+ * alignment of data types otherwise it may lead to bus error or performance regression.
+ * For more details about unaligned access, see https://www.kernel.org/doc/Documentation/unaligned-memory-access.txt.
+*/
+#if defined(NPY_CPU_X86) || defined(NPY_CPU_AMD64) || defined(__aarch64__) || defined(__powerpc64__)
+    #define NPY_ALIGNMENT_REQUIRED 0
+#endif
+#ifndef NPY_ALIGNMENT_REQUIRED
+    #define NPY_ALIGNMENT_REQUIRED 1
+#endif
+#endif  /* NUMPY_CORE_INCLUDE_NUMPY_NPY_CPU_H_ */

LTA_openwebtext_dualt/mini_owt_logdirichlet/.venv_qwen35_uv/lib/python3.12/site-packages/numpy/core/include/numpy/npy_interrupt.h ADDED Viewed

	@@ -0,0 +1,56 @@

+/*
+ * This API is only provided because it is part of publicly exported
+ * headers. Its use is considered DEPRECATED, and it will be removed
+ * eventually.
+ * (This includes the _PyArray_SigintHandler and _PyArray_GetSigintBuf
+ * functions which are however, public API, and not headers.)
+ *
+ * Instead of using these non-threadsafe macros, consider periodically
+ * querying `PyErr_CheckSignals()` or `PyOS_InterruptOccurred()`.
+ * Both of these require holding the GIL, although cpython could add a
+ * version of `PyOS_InterruptOccurred()` which does not. Such a version
+ * actually exists as private API in Python 3.10, and backported to 3.9 and 3.8,
+ * see also https://bugs.python.org/issue41037 and
+ * https://github.com/python/cpython/pull/20599.
+ */
+#ifndef NUMPY_CORE_INCLUDE_NUMPY_NPY_INTERRUPT_H_
+#define NUMPY_CORE_INCLUDE_NUMPY_NPY_INTERRUPT_H_
+#ifndef NPY_NO_SIGNAL
+#include <setjmp.h>
+#include <signal.h>
+#ifndef sigsetjmp
+#define NPY_SIGSETJMP(arg1, arg2) setjmp(arg1)
+#define NPY_SIGLONGJMP(arg1, arg2) longjmp(arg1, arg2)
+#define NPY_SIGJMP_BUF jmp_buf
+#else
+#define NPY_SIGSETJMP(arg1, arg2) sigsetjmp(arg1, arg2)
+#define NPY_SIGLONGJMP(arg1, arg2) siglongjmp(arg1, arg2)
+#define NPY_SIGJMP_BUF sigjmp_buf
+#endif
+#    define NPY_SIGINT_ON {                                             \
+                   PyOS_sighandler_t _npy_sig_save;                     \
+                   _npy_sig_save = PyOS_setsig(SIGINT, _PyArray_SigintHandler); \
+                   if (NPY_SIGSETJMP(*((NPY_SIGJMP_BUF *)_PyArray_GetSigintBuf()), \
+                                 1) == 0) {                             \
+#    define NPY_SIGINT_OFF }                                      \
+        PyOS_setsig(SIGINT, _npy_sig_save);                       \
+        }
+#else  /* NPY_NO_SIGNAL  */
+#define NPY_SIGINT_ON
+#define NPY_SIGINT_OFF
+#endif  /* NPY_NO_SIGNAL */
+#endif  /* NUMPY_CORE_INCLUDE_NUMPY_NPY_INTERRUPT_H_ */

LTA_openwebtext_dualt/mini_owt_logdirichlet/.venv_qwen35_uv/lib/python3.12/site-packages/numpy/core/include/numpy/npy_no_deprecated_api.h ADDED Viewed

	@@ -0,0 +1,20 @@

+/*
+ * This include file is provided for inclusion in Cython *.pyx/*.pxd files where
+ * one would like to define the NPY_NO_DEPRECATED_API macro. It can be
+ * included by
+ *
+ * cdef extern from "npy_no_deprecated_api.h": pass
+ *
+ */
+#ifndef NPY_NO_DEPRECATED_API
+/* put this check here since there may be multiple includes in C extensions. */
+#if defined(NUMPY_CORE_INCLUDE_NUMPY_NDARRAYTYPES_H_) || \
+    defined(NUMPY_CORE_INCLUDE_NUMPY_NPY_DEPRECATED_API_H) || \
+    defined(NUMPY_CORE_INCLUDE_NUMPY_OLD_DEFINES_H_)
+#error "npy_no_deprecated_api.h" must be first among numpy includes.
+#else
+#define NPY_NO_DEPRECATED_API NPY_API_VERSION
+#endif
+#endif  /* NPY_NO_DEPRECATED_API */

LTA_openwebtext_dualt/mini_owt_logdirichlet/.venv_qwen35_uv/lib/python3.12/site-packages/numpy/core/include/numpy/utils.h ADDED Viewed

	@@ -0,0 +1,37 @@

+#ifndef NUMPY_CORE_INCLUDE_NUMPY_UTILS_H_
+#define NUMPY_CORE_INCLUDE_NUMPY_UTILS_H_
+#ifndef __COMP_NPY_UNUSED
+    #if defined(__GNUC__)
+        #define __COMP_NPY_UNUSED __attribute__ ((__unused__))
+    #elif defined(__ICC)
+        #define __COMP_NPY_UNUSED __attribute__ ((__unused__))
+    #elif defined(__clang__)
+        #define __COMP_NPY_UNUSED __attribute__ ((unused))
+    #else
+        #define __COMP_NPY_UNUSED
+    #endif
+#endif
+#if defined(__GNUC__) || defined(__ICC) || defined(__clang__)
+    #define NPY_DECL_ALIGNED(x) __attribute__ ((aligned (x)))
+#elif defined(_MSC_VER)
+    #define NPY_DECL_ALIGNED(x) __declspec(align(x))
+#else
+    #define NPY_DECL_ALIGNED(x)
+#endif
+/* Use this to tag a variable as not used. It will remove the unused variable
+ * warning on supported platforms (see __COMP_NPY_UNUSED) and mangle the
+ * variable name to avoid accidental use */
+#define NPY_UNUSED(x) __NPY_UNUSED_TAGGED ## x __COMP_NPY_UNUSED
+#define NPY_EXPAND(x) x
+#define NPY_STRINGIFY(x) #x
+#define NPY_TOSTRING(x) NPY_STRINGIFY(x)
+#define NPY_CAT__(a, b) a ## b
+#define NPY_CAT_(a, b) NPY_CAT__(a, b)
+#define NPY_CAT(a, b) NPY_CAT_(a, b)
+#endif  /* NUMPY_CORE_INCLUDE_NUMPY_UTILS_H_ */

LTA_openwebtext_dualt/scripts/flowtext_score_decode_lab.py ADDED Viewed

	@@ -0,0 +1,129 @@

+#!/usr/bin/env python3
+"""Score decode-lab samples with an external causal LM in one model load."""
+from __future__ import annotations
+import argparse
+import json
+import math
+from collections import defaultdict
+from pathlib import Path
+import torch
+import torch.nn.functional as F
+@torch.no_grad()
+def score_texts(texts, model_name_or_path: str, batch_size: int, max_length: int, device: torch.device):
+    from transformers import AutoModelForCausalLM, AutoTokenizer
+    tok = AutoTokenizer.from_pretrained(model_name_or_path)
+    if tok.pad_token_id is None:
+        tok.pad_token = tok.eos_token
+        tok.pad_token_id = tok.eos_token_id
+    model = AutoModelForCausalLM.from_pretrained(model_name_or_path).to(device)
+    if getattr(model.config, "pad_token_id", None) is None and tok.pad_token_id is not None:
+        model.config.pad_token_id = tok.pad_token_id
+    model.eval()
+    out = []
+    for start in range(0, len(texts), batch_size):
+        batch = texts[start : start + batch_size]
+        enc = tok(
+            batch,
+            return_tensors="pt",
+            return_attention_mask=True,
+            return_token_type_ids=False,
+            padding=True,
+            truncation=True,
+            max_length=max_length,
+        ).to(device)
+        input_ids = enc["input_ids"]
+        attention_mask = enc["attention_mask"].bool()
+        if input_ids.size(1) < 2:
+            out.extend([(None, None, 0) for _ in batch])
+            continue
+        logits = model(input_ids=input_ids, attention_mask=attention_mask).logits.transpose(-1, -2)
+        token_nll = F.cross_entropy(logits[..., :-1].float(), input_ids[..., 1:], reduction="none")
+        if tok.eos_token_id is not None:
+            first_eos = (input_ids == tok.eos_token_id).cumsum(-1) == 1
+            token_mask = input_ids != tok.eos_token_id
+            shift_mask = (first_eos[..., 1:] | token_mask[..., 1:]) & attention_mask[..., 1:]
+        else:
+            shift_mask = attention_mask[..., 1:]
+        for row in range(input_ids.size(0)):
+            mask = shift_mask[row]
+            count = int(mask.sum().detach().cpu())
+            if count <= 0:
+                out.append((None, None, 0))
+                continue
+            nll = float(token_nll[row][mask].sum().detach().cpu()) / count
+            out.append((float(math.exp(min(nll, 50.0))), nll, count))
+    return out
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--input", required=True)
+    parser.add_argument("--scorer", required=True)
+    parser.add_argument("--batch_size", type=int, default=4)
+    parser.add_argument("--max_length", type=int, default=512)
+    parser.add_argument("--topk", type=int, default=12)
+    parser.add_argument("--output", default="")
+    args = parser.parse_args()
+    rows = [json.loads(x) for x in Path(args.input).open() if x.strip()]
+    samples = [r for r in rows if r.get("type") == "sample"]
+    texts = [r["text"] for r in samples]
+    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+    print(f"[info] scoring {len(texts)} samples on {device} with {args.scorer}", flush=True)
+    scored = score_texts(texts, args.scorer, args.batch_size, args.max_length, device)
+    enriched = []
+    for row, (ppl, nll, count) in zip(samples, scored):
+        rec = dict(row)
+        rec["external_ppl"] = ppl
+        rec["external_nll"] = nll
+        rec["external_tokens"] = count
+        # Conservative selection: prioritize samples that are both non-collapsed
+        # and plausible under an external LM.
+        rec["combined_score"] = float(rec["quality"]) - (0.08 * float(nll) if nll is not None else 10.0)
+        enriched.append(rec)
+    by_label = defaultdict(list)
+    for rec in enriched:
+        by_label[rec["config"]["label"]].append(rec)
+    print("\n== Config Summary ==")
+    for label, group in sorted(by_label.items()):
+        valid = [g for g in group if g["external_ppl"] is not None]
+        mean_ppl = sum(g["external_ppl"] for g in valid) / max(len(valid), 1)
+        mean_nll = sum(g["external_nll"] for g in valid) / max(len(valid), 1)
+        mean_quality = sum(float(g["quality"]) for g in group) / max(len(group), 1)
+        best = max(group, key=lambda g: g["combined_score"])
+        print(
+            f"{label:24s} mean_quality={mean_quality:7.4f} "
+            f"mean_ppl={mean_ppl:8.2f} mean_nll={mean_nll:6.3f} "
+            f"best_prompt={best['prompt']!r} best_combined={best['combined_score']:.4f}"
+        )
+    print("\n== Top Samples ==")
+    enriched.sort(key=lambda r: r["combined_score"], reverse=True)
+    for rec in enriched[: args.topk]:
+        print(
+            "\n"
+            + "=" * 96
+            + f"\nlabel={rec['config']['label']} prompt={rec['prompt']!r} "
+            + f"quality={rec['quality']:.4f} ppl={rec['external_ppl']:.2f} "
+            + f"rep3={rec['rep3']:.3f} d2={rec['distinct2']:.3f} combined={rec['combined_score']:.4f}\n"
+            + rec["text"]
+        )
+    if args.output:
+        with Path(args.output).open("w") as f:
+            for rec in enriched:
+                f.write(json.dumps(rec, ensure_ascii=False) + "\n")
+        print(f"\n[done] wrote {args.output}")
+if __name__ == "__main__":
+    main()

LTA_openwebtext_dualt/scripts/launch_lta_owt_from_lm1b_c1024_4gpu.sh ADDED Viewed

	@@ -0,0 +1,85 @@

+#!/usr/bin/env bash
+set -euo pipefail
+cd /e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt
+export CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES:-0,1,2,3}
+export OMP_NUM_THREADS=${OMP_NUM_THREADS:-1}
+RUN_NAME=${RUN_NAME:-lta_owt_distilbert_len1024_init_lm1b1m_posemb_repeat_fully_c1024_adamw_gbs512_4gpu_20k_$(date +%Y%m%d_%H%M%S)}
+SAVE_DIR=${SAVE_DIR:-runs_transfer/${RUN_NAME}}
+LOG_DIR=${LOG_DIR:-logs/owt_from_lm1b_c1024_4gpu}
+mkdir -p "${LOG_DIR}"
+LM1B_CKPT=${LM1B_CKPT:-runs/lta_lm1b_dirichlet_categorical_fullvocab_c1024_fullycoupled_flmpack_onehot_hardce_ddit_small_len128_gbs512_8gpu_1m_nw0/step_1000000.pt}
+TOTAL_STEPS=${TOTAL_STEPS:-20000}
+PER_GPU_BATCH_SIZE=${PER_GPU_BATCH_SIZE:-16}
+GLOBAL_BATCH_SIZE=${GLOBAL_BATCH_SIZE:-512}
+MASTER_PORT=${MASTER_PORT:-32043}
+python -m torch.distributed.run --nproc_per_node=4 --master_port="${MASTER_PORT}" train.py \
+  --data_path /e2e-data/evad-tech-vla/wanghan58/data/small_benchmarks/langflow_2604_11748/openwebtext \
+  --openwebtext_split train_minus_100k \
+  --text_column text \
+  --detokenizer auto \
+  --tokenizer_path /e2e-data/evad-tech-vla/wanghan58/workspace/imagenet_handoff_20260327/nlp_dts_light/assets/distilbert-base-uncased/tokenizer.json \
+  --save_dir "${SAVE_DIR}" \
+  --wrap \
+  --wrap_mode stream \
+  --max_len 1024 \
+  --batch_size "${PER_GPU_BATCH_SIZE}" \
+  --global_batch_size "${GLOBAL_BATCH_SIZE}" \
+  --num_workers 4 \
+  --dataloader_prefetch_factor 4 \
+  --total_steps "${TOTAL_STEPS}" \
+  --warmup_steps 2500 \
+  --log_every 50 \
+  --eval_every 0 \
+  --save_every 1000 \
+  --latest_every 500 \
+  --init_model_path "${LM1B_CKPT}" \
+  --init_pos_embed_mode repeat \
+  --lr 0.0003 \
+  --lr_schedule constant_warmup \
+  --weight_decay 0.0 \
+  --adam_beta1 0.9 \
+  --adam_beta2 0.999 \
+  --adam_eps 1e-8 \
+  --grad_clip 1.0 \
+  --d_model 768 \
+  --cond_dim 128 \
+  --n_layers 12 \
+  --n_heads 12 \
+  --dim_ff 3072 \
+  --dropout 0.1 \
+  --model_type ddit \
+  --state_format prob \
+  --bridge dirichlet \
+  --target_loss hard_ce \
+  --target_prob 1.0 \
+  --min_t 0.0 \
+  --max_t 1.0 \
+  --dual_t \
+  --corrupt_t_mode same \
+  --min_mask_ratio 0.1 \
+  --max_mask_ratio 1.0 \
+  --wrong_token_replace_prob 1.0 \
+  --wrong_token_schedule linear_t \
+  --wrong_token_exp_k 1.0 \
+  --dirichlet_concentration_min 1.0 \
+  --dirichlet_concentration_max 1024 \
+  --dirichlet_endpoint_mode categorical_dual_t \
+  --dirichlet_semantic_t_mode same \
+  --categorical_wrong_from_full_vocab \
+  --simplex_bridge_sampler dirichlet \
+  --infer_steps 128 \
+  --decode_damping 1.0 \
+  --max_gamma 1.0 \
+  --decode_solver flowmap \
+  --noise_init logistic_normal \
+  --bridge_noise_init logistic_normal \
+  --noise_sigma -1 \
+  --allow_tf32 \
+  --ddp_gradient_as_bucket_view \
+  --bf16 \
+  2>&1 | tee "${LOG_DIR}/${RUN_NAME}.log"

LTA_openwebtext_dualt/scripts/launch_lta_wmt14_deen_fullycoupled_4gpu_smoke.sh ADDED Viewed

	@@ -0,0 +1,113 @@

+#!/usr/bin/env bash
+set -euo pipefail
+cd /e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt
+export PYTHONPATH="$(pwd)${PYTHONPATH:+:$PYTHONPATH}"
+export TOKENIZERS_PARALLELISM=false
+export PYTHONUNBUFFERED=1
+export OMP_NUM_THREADS="${OMP_NUM_THREADS:-1}"
+export CUDA_VISIBLE_DEVICES="${CUDA_VISIBLE_DEVICES:-0,1,2,3}"
+RUN_NAME="${RUN_NAME:-smoke_lta_wmt14_deen_t5_len128_fullycoupled_elfparams_4gpu_$(date +%Y%m%d_%H%M%S)}"
+SAVE_DIR="${SAVE_DIR:-runs/${RUN_NAME}}"
+LOG_DIR="${LOG_DIR:-logs/wmt14_deen_fullycoupled_smoke}"
+LOG_FILE="${LOG_FILE:-${LOG_DIR}/${RUN_NAME}.log}"
+mkdir -p "${LOG_DIR}" "${SAVE_DIR}"
+DATA_ROOT="${DATA_ROOT:-/e2e-data/evad-tech-vla/wanghan58/data/small_benchmarks/elf}"
+DATA_PATH="${DATA_PATH:-${DATA_ROOT}/wmt14_de-en_train_t5}"
+EVAL_DATA_PATH="${EVAL_DATA_PATH:-${DATA_ROOT}/wmt14_de-en_validation_t5}"
+DATASET_CACHE_DIR="${DATASET_CACHE_DIR:-/e2e-data/evad-tech-vla/wanghan58/data/hf_cache}"
+TOKENIZER_PATH="${TOKENIZER_PATH:-/e2e-data/evad-tech-vla/wanghan58/models/hf/t5-small/tokenizer.json}"
+NPROC_PER_NODE="${NPROC_PER_NODE:-4}"
+MASTER_PORT="${MASTER_PORT:-32072}"
+GLOBAL_BATCH_SIZE="${GLOBAL_BATCH_SIZE:-512}"
+PER_GPU_BATCH_SIZE="${PER_GPU_BATCH_SIZE:-32}"
+TOTAL_STEPS="${TOTAL_STEPS:-50}"
+WARMUP_STEPS="${WARMUP_STEPS:-20}"
+MAX_RECORDS="${MAX_RECORDS:-4096}"
+if [[ ! -e "${DATA_PATH}" ]]; then
+  echo "Missing WMT14 De-En train dataset at ${DATA_PATH}" >&2
+  echo "Stage embedded-language-flows/wmt14_de-en_train_t5 locally, then rerun." >&2
+  exit 2
+fi
+python -m torch.distributed.run --nproc_per_node="${NPROC_PER_NODE}" --master_port="${MASTER_PORT}" train.py \
+  --elf_conditional_hf \
+  --data_path "${DATA_PATH}" \
+  --eval_data_path "${EVAL_DATA_PATH}" \
+  --dataset_cache_dir "${DATASET_CACHE_DIR}" \
+  --tokenizer_path "${TOKENIZER_PATH}" \
+  --save_dir "${SAVE_DIR}" \
+  --max_len 128 \
+  --max_input_len 64 \
+  --conditional_pad_token eos \
+  --label_drop_prob 0.1 \
+  --max_records "${MAX_RECORDS}" \
+  --batch_size "${PER_GPU_BATCH_SIZE}" \
+  --global_batch_size "${GLOBAL_BATCH_SIZE}" \
+  --num_workers 4 \
+  --dataloader_prefetch_factor 4 \
+  --total_steps "${TOTAL_STEPS}" \
+  --warmup_steps "${WARMUP_STEPS}" \
+  --log_every 10 \
+  --eval_every 0 \
+  --save_every 0 \
+  --latest_every 25 \
+  --optimizer muon \
+  --lr 0.001 \
+  --lr_schedule constant_warmup \
+  --min_lr 0 \
+  --weight_decay 0.0 \
+  --adam_beta1 0.9 \
+  --adam_beta2 0.95 \
+  --adam_eps 1e-8 \
+  --muon_momentum 0.95 \
+  --muon_ns_steps 5 \
+  --muon_update_scale 1.0 \
+  --ema_decay 0.9999 \
+  --ema_start_step 0 \
+  --grad_clip 1.0 \
+  --seed 42 \
+  --d_model 768 \
+  --cond_dim 128 \
+  --n_layers 12 \
+  --n_heads 12 \
+  --dim_ff 3072 \
+  --dropout 0.1 \
+  --model_type ddit \
+  --state_format prob \
+  --bridge dirichlet \
+  --target_loss hard_ce \
+  --target_prob 1.0 \
+  --min_t 0.05 \
+  --max_t 1.0 \
+  --dual_t \
+  --corrupt_t_mode same \
+  --corrupt_min_t 0.05 \
+  --corrupt_max_t 1.0 \
+  --min_mask_ratio 0.1 \
+  --max_mask_ratio 1.0 \
+  --wrong_token_replace_prob 1.0 \
+  --wrong_token_schedule linear_t \
+  --wrong_token_exp_k 1.0 \
+  --dirichlet_concentration_min 1.0 \
+  --dirichlet_concentration_max 1024.0 \
+  --dirichlet_endpoint_mode categorical_dual_t \
+  --dirichlet_semantic_t_mode same \
+  --dirichlet_semantic_t_value 0.0 \
+  --categorical_wrong_from_full_vocab \
+  --simplex_bridge_sampler dirichlet \
+  --infer_steps 128 \
+  --decode_damping 1.0 \
+  --max_gamma 1.0 \
+  --decode_solver flowmap \
+  --noise_init logistic_normal \
+  --bridge_noise_init logistic_normal \
+  --noise_sigma -1 \
+  --allow_tf32 \
+  --ddp_gradient_as_bucket_view \
+  --bf16 \
+  2>&1 | tee -a "${LOG_FILE}"

LTA_openwebtext_dualt/scripts/run_lta_owt_bert_absrope_time4_dirichlet_len1024_C1_to_1024_8gpu_1m_mask1_sameT_save10k.sh ADDED Viewed

	@@ -0,0 +1,77 @@

+#!/usr/bin/env bash
+set -euo pipefail
+cd /e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt
+# OWT raw text + BERT tokenizer, FLM wrapped stream:
+#   [CLS] + 1022 payload tokens + [SEP]
+#
+# Backbone:
+#   ddit_elf = RMSNorm/SwiGLU/QK-norm + RoPE + 4 prefix time tokens.
+#   We also add learned absolute position embeddings before RoPE.
+#
+# Bridge:
+#   Dirichlet C=1->1024, mask_ratio=1.0, model t and corruption t are shared.
+export DATA_PATH="${DATA_PATH:-/e2e-data/evad-tech-vla/wanghan58/data/small_benchmarks/langflow_2604_11748/openwebtext}"
+export TEXT_COLUMN="${TEXT_COLUMN:-text}"
+export OPENWEBTEXT_SPLIT="${OPENWEBTEXT_SPLIT:-train_minus_100k}"
+export TOKENIZER_PATH="${TOKENIZER_PATH:-/e2e-data/evad-tech-vla/wanghan58/workspace/imagenet_handoff_20260327/nlp_dts_light/assets/distilbert-base-uncased/tokenizer.json}"
+export TOKENIZED_HF=0
+export WRAP_MODE="${WRAP_MODE:-stream}"
+export VOCAB_SIZE="${VOCAB_SIZE:-30522}"
+export CMIN="${CMIN:-1}"
+export CMAX="${CMAX:-1024}"
+export MODEL_TYPE=ddit_elf
+export ELF_NUM_TIME_TOKENS="${ELF_NUM_TIME_TOKENS:-4}"
+export ELF_NUM_MODEL_MODE_TOKENS="${ELF_NUM_MODEL_MODE_TOKENS:-0}"
+export QK_NORM="${QK_NORM:-1}"
+export ABS_POS_EMBED=1
+export CORRUPT_T_MODE=same
+export MIN_MASK_RATIO=1.0
+export MAX_MASK_RATIO=1.0
+export CATEGORICAL_WRONG_PROB_FLOOR="${CATEGORICAL_WRONG_PROB_FLOOR:-0.0}"
+_ngpus_avail=$(nvidia-smi --query-gpu=index --format=csv,noheader 2>/dev/null | wc -l || echo 1)
+if [[ "${_ngpus_avail}" -le 0 ]]; then _ngpus_avail=1; fi
+_default_cvd=$(seq -s, 0 $((_ngpus_avail - 1)))
+export CUDA_VISIBLE_DEVICES="${CUDA_VISIBLE_DEVICES:-${_default_cvd}}"
+IFS=',' read -ra _cvd_arr <<< "${CUDA_VISIBLE_DEVICES}"
+export NPROC_PER_NODE="${NPROC_PER_NODE:-${#_cvd_arr[@]}}"
+unset _ngpus_avail _default_cvd _cvd_arr
+export NNODES="${NNODES:-${MLP_WORKER_NUM:-1}}"
+export NODE_RANK="${NODE_RANK:-${MLP_ROLE_INDEX:-0}}"
+export MASTER_ADDR="${MASTER_ADDR:-${MLP_WORKER_0_HOST:-127.0.0.1}}"
+export MASTER_PORT="${MASTER_PORT:-${MLP_WORKER_0_PORT:-29500}}"
+export GLOBAL_BATCH_SIZE="${GLOBAL_BATCH_SIZE:-512}"
+export PER_GPU_BATCH_SIZE="${PER_GPU_BATCH_SIZE:-32}"
+export TOTAL_STEPS="${TOTAL_STEPS:-1000000}"
+export WARMUP_STEPS="${WARMUP_STEPS:-2500}"
+export SAVE_EVERY="${SAVE_EVERY:-10000}"
+export LATEST_EVERY="${LATEST_EVERY:-1000}"
+export LOG_EVERY="${LOG_EVERY:-100}"
+export DATE_TAG="${DATE_TAG:-$(date +%Y%m%d)}"
+export RUN_NAME="${RUN_NAME:-lta_owt_bert_absrope_time4_dirichlet_len1024_C1_to_1024_mask1_sameT_gbs512_b32_8gpu_1m_save10k_${DATE_TAG}}"
+export WATCH_ENABLED="${WATCH_ENABLED:-1}"
+export WATCH_STEP_INTERVAL="${WATCH_STEP_INTERVAL:-10000}"
+export WATCH_N_SAMPLES="${WATCH_N_SAMPLES:-128}"
+export WATCH_CUDA_VISIBLE_DEVICES="${WATCH_CUDA_VISIBLE_DEVICES:-7}"
+export WATCH_DECODE_MODE="${WATCH_DECODE_MODE:-dual_line_probe}"
+export WATCH_DECODE_BATCH="${WATCH_DECODE_BATCH:-1}"
+export WATCH_DUAL_SEMANTIC_POWER="${WATCH_DUAL_SEMANTIC_POWER:-1.5}"
+export WATCH_DUAL_EARLY_TEMP="${WATCH_DUAL_EARLY_TEMP:-2.8}"
+export WATCH_DUAL_LATE_TEMP="${WATCH_DUAL_LATE_TEMP:-1.45}"
+export WATCH_DUAL_TEMP_END="${WATCH_DUAL_TEMP_END:-0.55}"
+export WATCH_DUAL_TEMP_POWER="${WATCH_DUAL_TEMP_POWER:-1.5}"
+export WATCH_ENDPOINT_TEMP="${WATCH_ENDPOINT_TEMP:-1.45}"
+export WATCH_ENDPOINT_TOP_P="${WATCH_ENDPOINT_TOP_P:-0.95}"
+export WATCH_GUMBEL_TAU_START="${WATCH_GUMBEL_TAU_START:-1.0}"
+export WATCH_GUMBEL_TAU_END="${WATCH_GUMBEL_TAU_END:-0.2}"
+export WATCH_OUT_BASE="${WATCH_OUT_BASE:-docs/lta_samples/metrics_${DATE_TAG}/owt_bert_absrope_time4_C1_to_1024_mask1_sameT_dualline_dirres_c${CMIN}_${CMAX}_n${WATCH_N_SAMPLES}/${RUN_NAME}}"
+export WATCH_LOG_DIR="${WATCH_LOG_DIR:-logs/owt_bert_absrope_time4_C1_to_1024_mask1_sameT_dualline_watch}"
+bash scripts/run_lta_owt_dirichlet_len1024_Cv_to_2v_8gpu_save1k_with_gumbel_watch.sh

LTA_openwebtext_dualt/scripts/tmp_run_three_quick_infer_20260525.sh ADDED Viewed

	@@ -0,0 +1,130 @@

+#!/usr/bin/env bash
+# Shell setup for the quick inference jobs in this script: strict mode, the
+# repository root as working directory, and the Python environment variables.
+set -e -u -o pipefail
+cd /e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt
+# Put the repository root first on PYTHONPATH, keeping any existing entries.
+export PYTHONPATH="${PWD}${PYTHONPATH:+:$PYTHONPATH}"
+export TOKENIZERS_PARALLELISM=false PYTHONUNBUFFERED=1
+# run_sde_quick CKPT TOKENIZER OUT_DIR CMIN CMAX NAME
+#
+# Runs an inline Python program on GPU 0 (CUDA_VISIBLE_DEVICES=0) that loads
+# checkpoint CKPT, builds the model with the tokenizer file TOKENIZER, and
+# calls decode_sde for 8 samples of length 1024 over 128 steps, using CMIN and
+# CMAX as the concentration range (CMIN is also the initial-noise Dirichlet
+# concentration). NAME is only a label used in the progress messages.
+#
+# Files written into OUT_DIR (created if missing):
+#   sde_steps128_samples8_unscored.jsonl  one summary record, then one record per sample
+#   first8.txt                            raw decoded texts
+#   first8_stripped.txt                   texts with special tokens removed and whitespace collapsed
+#
+# The heredoc delimiter is quoted ('PY'), so the shell performs no expansion
+# inside the Python source; all six values reach Python through sys.argv.
+run_sde_quick() {
+  local ckpt="$1"
+  local tok="$2"
+  local out_dir="$3"
+  local cmin="$4"
+  local cmax="$5"
+  local name="$6"
+  CUDA_VISIBLE_DEVICES=0 python - "$ckpt" "$tok" "$out_dir" "$cmin" "$cmax" "$name" <<'PY'
+import json
+import re
+import sys
+from pathlib import Path
+import torch
+from flowtext_lab.genppl import summarize_token_diversity
+from flowtext_lab.tokenization import BpeTextTokenizer
+from scripts.eval_lm1b_c1024_fullycoupled_sde_genppl import build_model, decode_sde
+ckpt_path, tok_path, out_dir_s, cmin_s, cmax_s, name = sys.argv[1:7]
+out_dir = Path(out_dir_s)
+out_dir.mkdir(parents=True, exist_ok=True)
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+print(f"[{name}] load {ckpt_path}", flush=True)
+ckpt = torch.load(ckpt_path, map_location="cpu", weights_only=False)
+print(f"[{name}] step={ckpt.get('step')}", flush=True)
+tok = BpeTextTokenizer.from_file(tok_path)
+model = build_model(ckpt, tok, device)
+ids, texts, cfg = decode_sde(
+    model,
+    tok,
+    n_samples=8,
+    batch_size=1,
+    max_len=1024,
+    steps=128,
+    seed=20260525,
+    device=device,
+    concentration_min=float(cmin_s),
+    concentration_max=float(cmax_s),
+    endpoint_temp=1.45,
+    endpoint_projection="gumbel_softmax",
+    endpoint_top_p=0.95,
+    gumbel_tau_start=1.0,
+    gumbel_tau_end=0.2,
+    model_t_mode="support_t",
+    mean_mode="endpoint_only",
+    semantic_power=1.0,
+    noise_init="dirichlet",
+    noise_dirichlet_concentration=float(cmin_s),
+    sde_resample="dirichlet",
+    final_from="blend_0.5",
+)
+def strip_special(text: str) -> str:
+    for special in ("[CLS]", "[SEP]", "[PAD]", "<pad>", "</s>", "<s>", "<unk>", "<|endoftext|>"):
+        text = text.replace(special, " ")
+    return re.sub(r"\s+", " ", text).strip()
+stripped = [strip_special(text) for text in texts]
+summary = {
+    "type": "summary",
+    "checkpoint": ckpt_path,
+    "step": int(ckpt.get("step", -1)),
+    "decode": cfg,
+    "diversity": summarize_token_diversity(ids).__dict__,
+}
+out_jsonl = out_dir / "sde_steps128_samples8_unscored.jsonl"
+with out_jsonl.open("w", encoding="utf-8") as f:
+    f.write(json.dumps(summary, ensure_ascii=False) + "\n")
+    for i, (raw, clean) in enumerate(zip(texts, stripped)):
+        f.write(json.dumps({"type": "sample", "index": i, "raw_text": raw, "stripped_text": clean}, ensure_ascii=False) + "\n")
+(out_dir / "first8.txt").write_text("\n\n--- SAMPLE ---\n\n".join(texts), encoding="utf-8")
+(out_dir / "first8_stripped.txt").write_text("\n\n--- SAMPLE ---\n\n".join(stripped), encoding="utf-8")
+print(f"[{name}] done {out_jsonl}", flush=True)
+print(json.dumps(summary, ensure_ascii=False, indent=2), flush=True)
+PY
+}
+T5_TOK=/e2e-data/evad-tech-vla/wanghan58/models/hf/t5-small/tokenizer.json
+BERT_TOK=/e2e-data/evad-tech-vla/wanghan58/workspace/imagenet_handoff_20260327/nlp_dts_light/assets/distilbert-base-uncased/tokenizer.json
+run_sde_quick \
+  runs/lta_owt_t5_absrope_adaln_dirichlet_len1024_Cv_to_2v_mask0p1_1p0_sameT_gbs512_b32_8gpu_1m_save10k_20260525/latest.pt \
+  "${T5_TOK}" \
+  docs/lta_samples/metrics_20260525/lta_owt_t5_absrope_adaln_Cv_to_2v_step26000_quick_n8 \
+  32100 64200 t5_Cv2V
+CUDA_VISIBLE_DEVICES=0 python scripts/infer_softkl_decode_probe.py \
+  --checkpoint runs/lta_owt_t5_absrope_adaln_dirichlet_len1024_C1_to_1024_mask0p1_1p0_sameT_gbs512_b32_8gpu_1m_save10k_20260525/latest.pt \
+  --tokenizer_path "${T5_TOK}" \
+  --out_dir docs/lta_samples/metrics_20260525/lta_owt_t5_absrope_adaln_C1_to_1024_step26000_dualline_quick_n8 \
+  --max_lens 1024 \
+  --n_samples 8 \
+  --batch_size 1 \
+  --steps 128 \
+  --decode_rule dual_line_resample \
+  --c_min 1 \
+  --c_max 1024 \
+  --input_noise_dirichlet_concentration 1 \
+  --anchor_mode state \
+  --model_t_mode flow \
+  --time_schedule uniform \
+  --support_power 1.0 \
+  --semantic_power 1.5 \
+  --early_temp 2.8 \
+  --late_temp 1.45 \
+  --temp_end 0.55 \
+  --temp_power 1.5 \
+  --final_from blend \
+  --final_decode argmax \
+  --seed 20260525
+cp \
+  docs/lta_samples/metrics_20260525/lta_owt_t5_absrope_adaln_C1_to_1024_step26000_dualline_quick_n8/context1024_samples.txt \
+  docs/lta_samples/metrics_20260525/lta_owt_t5_absrope_adaln_C1_to_1024_step26000_dualline_quick_n8/first8.txt
+run_sde_quick \
+  runs/lta_owt_bert_absrope_adaln_dirichlet_len1024_Cv_to_2v_mask1_sameT_gbs512_b4x4_1m_save1k_watch_20260525/latest.pt \
+  "${BERT_TOK}" \
+  docs/lta_samples/metrics_20260525/lta_owt_bert_absrope_adaln_Cv_to_2v_mask1_sameT_step4000_quick_n8 \
+  30522 61044 bert_Cv2V_step4000

LTA_openwebtext_dualt/scripts/trace_lta_decode_steps.py ADDED Viewed

	@@ -0,0 +1,129 @@

+#!/usr/bin/env python3
+from __future__ import annotations
+import argparse
+import json
+import sys
+from pathlib import Path
+import torch
+import torch.nn.functional as F
+REPO_ROOT = Path(__file__).resolve().parents[1]
+if str(REPO_ROOT) not in sys.path:
+    sys.path.insert(0, str(REPO_ROOT))
+from eval import build_model_from_ckpt
+from flowtext_lab.decode import model_time_for_step, sample_noise_simplex, state_for_model
+from flowtext_lab.tokenization import BpeTextTokenizer
+from scripts.flowtext_decode_lab import decode_text, flowmap_gamma
+def parse_args() -> argparse.Namespace:
+    p = argparse.ArgumentParser()
+    p.add_argument("--checkpoint", required=True)
+    p.add_argument("--tokenizer_path", required=True)
+    p.add_argument("--output", required=True)
+    p.add_argument("--max_len", type=int, default=128)
+    p.add_argument("--steps", type=int, default=128)
+    p.add_argument("--seed", type=int, default=20260502)
+    p.add_argument("--sample_index", type=int, default=13)
+    p.add_argument("--endpoint_temp", type=float, default=1.8)
+    p.add_argument("--damping", type=float, default=1.0)
+    p.add_argument("--max_gamma", type=float, default=1.0)
+    p.add_argument("--eps", type=float, default=1e-8)
+    p.add_argument("--trace_steps", default="0,1,2,4,8,16,32,64,96,127")
+    p.add_argument("--token_positions", default="0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31")
+    return p.parse_args()
+def parse_ints(text: str) -> list[int]:
+    return [int(x) for x in text.split(",") if x.strip()]
+@torch.no_grad()
+def main() -> None:
+    args = parse_args()
+    torch.manual_seed(args.seed)
+    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+    tokenizer = BpeTextTokenizer.from_file(args.tokenizer_path)
+    ckpt = torch.load(args.checkpoint, map_location="cpu")
+    model = build_model_from_ckpt(ckpt, tokenizer.vocab_size, args.max_len, device)
+    model.eval()
+    # Reproduce sample_index by drawing that many + 1 initial noise samples with the same seed.
+    init = sample_noise_simplex(
+        (args.sample_index + 1, args.max_len),
+        tokenizer.vocab_size,
+        device,
+        args.eps,
+        noise_mode="dirichlet",
+        target_prob=1.0,
+        noise_sigma=-1.0,
+        dirichlet_concentration=1.0,
+    )[-1:].float()
+    attn = torch.ones((1, args.max_len), dtype=torch.bool, device=device)
+    trace_steps = set(parse_ints(args.trace_steps))
+    positions = parse_ints(args.token_positions)
+    records = []
+    probs = init.clone()
+    last_endpoint = probs
+    for step in range(args.steps):
+        t = model_time_for_step("flow", step, args.steps, 1, device, dtype=torch.float32)
+        logits = model(state_for_model(model, probs, args.eps), t, attn).float()
+        logits = logits / args.endpoint_temp
+        endpoint = F.softmax(logits, dim=-1)
+        last_endpoint = endpoint
+        gamma = flowmap_gamma(step, args.steps, args.damping, args.max_gamma, args.eps)
+        new_probs = probs + gamma * (endpoint - probs)
+        new_probs = new_probs.clamp_min(args.eps)
+        new_probs = new_probs / new_probs.sum(dim=-1, keepdim=True).clamp_min(args.eps)
+        probs = new_probs
+        if step in trace_steps:
+            top_prob, ids = probs[0].max(dim=-1)
+            ent = -(probs[0].clamp_min(args.eps) * probs[0].clamp_min(args.eps).log()).sum(dim=-1)
+            endpoint_top_prob, endpoint_ids = endpoint[0].max(dim=-1)
+            records.append({
+                "step": step,
+                "gamma": gamma,
+                "model_t": float(t.item()),
+                "text_prefix": decode_text(tokenizer, ids[:64].detach().cpu().tolist()),
+                "positions": [
+                    {
+                        "pos": pos,
+                        "state_token": tokenizer.decode([int(ids[pos].item())], stop_at_eos=False, skip_special_tokens=False),
+                        "state_id": int(ids[pos].item()),
+                        "state_top_p": float(top_prob[pos].item()),
+                        "state_entropy": float(ent[pos].item()),
+                        "endpoint_token": tokenizer.decode([int(endpoint_ids[pos].item())], stop_at_eos=False, skip_special_tokens=False),
+                        "endpoint_id": int(endpoint_ids[pos].item()),
+                        "endpoint_top_p": float(endpoint_top_prob[pos].item()),
+                    }
+                    for pos in positions
+                    if 0 <= pos < args.max_len
+                ],
+            })
+    final_ids = probs[0].argmax(dim=-1).detach().cpu().tolist()
+    final_text = decode_text(tokenizer, final_ids)
+    output = Path(args.output)
+    if not output.is_absolute():
+        output = Path(args.checkpoint).resolve().parent / output
+    output.parent.mkdir(parents=True, exist_ok=True)
+    payload = {
+        "checkpoint": args.checkpoint,
+        "seed": args.seed,
+        "sample_index": args.sample_index,
+        "steps": args.steps,
+        "endpoint_temp": args.endpoint_temp,
+        "final_text": final_text,
+        "records": records,
+    }
+    output.write_text(json.dumps(payload, ensure_ascii=False, indent=2), encoding="utf-8")
+    print(json.dumps(payload, ensure_ascii=False, indent=2))
+if __name__ == "__main__":
+    main()