JinghuiLuAstronaut committed 9 days ago

Commit

8ed0c93

verified ·

1 Parent(s): 9805aea

Add files using upload-large-folder tool

Browse files

Files changed (20) hide show

LTA_openwebtext_dualt/logs/lta_lm1b_classic_dirichlet_len512_gbs512_4gpu_10k_save1k_20260523.train.pid +1 -0
LTA_openwebtext_dualt/logs/noise_geometry_combo_4gpu/20260517_170456.log +994 -0
LTA_openwebtext_dualt/logs/train8_len_sweep_compact_bs512_until_exact_4gpu/driver.log +0 -0
LTA_openwebtext_dualt/scripts/apple_to_apple_lta_checks.py +631 -0
LTA_openwebtext_dualt/scripts/build_lta_owt_compact_gpt2bpe_stream1024_train_minus_100k_np8.sh +13 -0
LTA_openwebtext_dualt/scripts/build_owt_t5_elf_dataset.py +587 -0
LTA_openwebtext_dualt/scripts/eval_dirichlet_latest_key3_state_20260508.py +51 -0
LTA_openwebtext_dualt/scripts/infer_lta_owt_t5_len128_uniform10k_then_lognsr_latest.sh +113 -0
LTA_openwebtext_dualt/scripts/launch_lta_lm1b_categorical_fullvocab_c1024_fullycoupled_8gpu_small_1m.sh +150 -0
LTA_openwebtext_dualt/scripts/launch_lta_lm1b_categorical_fullvocab_c16_dualt_4gpu_small_1m.sh +155 -0
LTA_openwebtext_dualt/scripts/launch_lta_owt_c1024_fullycoupled_8gpu_len1024_gpt2_cached_chunks_1m.sh +60 -0
LTA_openwebtext_dualt/scripts/launch_lta_owt_compact_gpt2bpe_v8192_stream1024_fullycoupled_mask1_wd0p1_fp32_8gpu.sh +39 -0
LTA_openwebtext_dualt/scripts/launch_lta_owt_elfaligned_t5_logitnormal_8gpu.sh +209 -0
LTA_openwebtext_dualt/scripts/launch_lta_owt_fullycoupled_outwd0p5_8gpu.sh +11 -0
LTA_openwebtext_dualt/scripts/launch_lta_owt_t5_rollin_grad_k1_rho025_subset10k_4gpu_100k.sh +148 -0
LTA_openwebtext_dualt/scripts/run_lta_lm1b_dirichlet_len1024_Cv_to_2v_8gpu_1m_save10k.sh +34 -0
LTA_openwebtext_dualt/scripts/run_lta_owt_dirichlet_len1024_Cv_to_2v_8gpu_1m_save10k.sh +34 -0
LTA_openwebtext_dualt/scripts/run_lta_owt_t5_absrope_adaln_dirichlet_len1024_Cv_to_2v_8gpu_mask0p1_1p0_sameT_1m_save10k.sh +36 -0
LTA_openwebtext_dualt/scripts/run_train8_wrong_floor_pilots_4gpu.sh +194 -0
LTA_openwebtext_dualt/scripts/watch_infer_owt_classic_fullvocab_len1024_lr2e4_gbs2048_latest_every1k_t1p45.sh +158 -0

LTA_openwebtext_dualt/logs/lta_lm1b_classic_dirichlet_len512_gbs512_4gpu_10k_save1k_20260523.train.pid ADDED Viewed

	@@ -0,0 +1 @@


1	+ 993819

LTA_openwebtext_dualt/logs/noise_geometry_combo_4gpu/20260517_170456.log ADDED Viewed

	@@ -0,0 +1,994 @@

+[combo-pilot] start stamp=20260517_170456 len=256 vocab=969 out=docs/lta_samples/metrics_20260517/noise_geometry_combo_len256_bs512_ode128_20260517_170456
+[combo-pilot] round=1 Sun May 17 17:04:56 UTC 2026
+[combo-pilot] train config=logistic_unigram_shared_highC from=0 to=1000 sampler=logistic_normal_linear_mean C=64->4096 unigram_shared=0.5 seq=0.0
+[combo-pilot] eval config=logistic_unigram_shared_highC step=1000
+[eval-decode-acc] train8_combo_len256_logistic_unigram_shared_highC_20260517_170456 step=1000 soft=none
+[decode] max_len=256 generated=64/64
+{
+  "num_rows": 1,
+  "best_by_run": {
+    "train8_combo_len256_logistic_unigram_shared_highC_20260517_170456::none": {
+      "run": "train8_combo_len256_logistic_unigram_shared_highC_20260517_170456",
+      "checkpoint": "runs/train8_combo_len256_logistic_unigram_shared_highC_20260517_170456/step_0001000.pt",
+      "ckpt_step": 1000,
+      "endpoint_softening": "none",
+      "decode_rule": "flowmap",
+      "steps": 128,
+      "time_schedule": "logit_normal",
+      "model_t_mode": "post",
+      "final_from": "state",
+      "n_gen": 64,
+      "n_refs": 8,
+      "token_acc_mean": 0.0487060546875,
+      "token_acc_min": 0.03515625,
+      "token_acc_max": 0.07421875,
+      "exact_acc": 0.0,
+      "exact_count": 0,
+      "exact_ref_coverage": 0.0,
+      "exact_ref_count": 0,
+      "exact_ref_hits": [],
+      "best_ref_idx": [
+        5,
+        0,
+        0,
+        0,
+        5,
+        5,
+        5,
+        0,
+        5,
+        2,
+        1,
+        0,
+        7,
+        2,
+        7,
+        0,
+        3,
+        3,
+        2,
+        0,
+        2,
+        2,
+        5,
+        7,
+        5,
+        7,
+        7,
+        2,
+        5,
+        7,
+        5,
+        2,
+        1,
+        5,
+        0,
+        0,
+        5,
+        2,
+        0,
+        0,
+        2,
+        0,
+        0,
+        5,
+        5,
+        3,
+        5,
+        5,
+        5,
+        3,
+        3,
+        0,
+        3,
+        2,
+        5,
+        0,
+        7,
+        0,
+        1,
+        5,
+        2,
+        7,
+        3,
+        2
+      ],
+      "best_token_acc": [
+        0.04296875,
+        0.04296875,
+        0.04296875,
+        0.046875,
+        0.05859375,
+        0.04296875,
+        0.04296875,
+        0.05859375,
+        0.046875,
+        0.05859375,
+        0.04296875,
+        0.05859375,
+        0.0390625,
+        0.046875,
+        0.0625,
+        0.0390625,
+        0.04296875,
+        0.046875,
+        0.046875,
+        0.046875,
+        0.05078125,
+        0.05078125,
+        0.04296875,
+        0.0546875,
+        0.046875,
+        0.046875,
+        0.046875,
+        0.046875,
+        0.0625,
+        0.0625,
+        0.05078125,
+        0.0390625,
+        0.0546875,
+        0.046875,
+        0.04296875,
+        0.0390625,
+        0.05078125,
+        0.0390625,
+        0.046875,
+        0.04296875,
+        0.03515625,
+        0.046875,
+        0.046875,
+        0.0546875,
+        0.0546875,
+        0.04296875,
+        0.04296875,
+        0.0546875,
+        0.04296875,
+        0.046875,
+        0.05078125,
+        0.07421875,
+        0.04296875,
+        0.05078125,
+        0.046875,
+        0.0546875,
+        0.0546875,
+        0.04296875,
+        0.0546875,
+        0.0546875,
+        0.0546875,
+        0.05078125,
+        0.04296875,
+        0.05078125
+      ]
+    }
+  },
+  "first_exact_by_run": {}
+}
+RESULT config=logistic_unigram_shared_highC ckpt_step=1000 views=512000 token_acc=0.0487 exact=0/64 exact_refs=0 hits=[]
+[combo-pilot] continue config=logistic_unigram_shared_highC step=1000
+[combo-pilot] train config=logistic_unigram_shared_highC_seqrand from=0 to=1000 sampler=logistic_normal_linear_mean C=64->4096 unigram_shared=0.5 seq=0.5
+[combo-pilot] eval config=logistic_unigram_shared_highC_seqrand step=1000
+[eval-decode-acc] train8_combo_len256_logistic_unigram_shared_highC_seqrand_20260517_170456 step=1000 soft=none
+[decode] max_len=256 generated=64/64
+{
+  "num_rows": 1,
+  "best_by_run": {
+    "train8_combo_len256_logistic_unigram_shared_highC_seqrand_20260517_170456::none": {
+      "run": "train8_combo_len256_logistic_unigram_shared_highC_seqrand_20260517_170456",
+      "checkpoint": "runs/train8_combo_len256_logistic_unigram_shared_highC_seqrand_20260517_170456/step_0001000.pt",
+      "ckpt_step": 1000,
+      "endpoint_softening": "none",
+      "decode_rule": "flowmap",
+      "steps": 128,
+      "time_schedule": "logit_normal",
+      "model_t_mode": "post",
+      "final_from": "state",
+      "n_gen": 64,
+      "n_refs": 8,
+      "token_acc_mean": 0.04034423828125,
+      "token_acc_min": 0.0234375,
+      "token_acc_max": 0.0625,
+      "exact_acc": 0.0,
+      "exact_count": 0,
+      "exact_ref_coverage": 0.0,
+      "exact_ref_count": 0,
+      "exact_ref_hits": [],
+      "best_ref_idx": [
+        0,
+        0,
+        0,
+        0,
+        0,
+        3,
+        0,
+        7,
+        0,
+        4,
+        0,
+        5,
+        4,
+        0,
+        0,
+        0,
+        3,
+        0,
+        0,
+        0,
+        3,
+        0,
+        0,
+        0,
+        4,
+        0,
+        0,
+        5,
+        0,
+        4,
+        0,
+        0,
+        0,
+        0,
+        5,
+        0,
+        0,
+        0,
+        0,
+        4,
+        0,
+        0,
+        0,
+        5,
+        3,
+        0,
+        0,
+        0,
+        0,
+        4,
+        0,
+        4,
+        0,
+        0,
+        0,
+        0,
+        5,
+        0,
+        0,
+        0,
+        4,
+        0,
+        3,
+        0
+      ],
+      "best_token_acc": [
+        0.03515625,
+        0.03515625,
+        0.03125,
+        0.05859375,
+        0.03515625,
+        0.0234375,
+        0.03515625,
+        0.02734375,
+        0.0625,
+        0.03515625,
+        0.02734375,
+        0.03125,
+        0.0234375,
+        0.03515625,
+        0.046875,
+        0.04296875,
+        0.05078125,
+        0.03125,
+        0.03515625,
+        0.0625,
+        0.03125,
+        0.04296875,
+        0.02734375,
+        0.04296875,
+        0.03125,
+        0.0390625,
+        0.05078125,
+        0.0390625,
+        0.02734375,
+        0.03125,
+        0.03125,
+        0.0234375,
+        0.046875,
+        0.05078125,
+        0.04296875,
+        0.03515625,
+        0.05078125,
+        0.04296875,
+        0.0390625,
+        0.05078125,
+        0.0390625,
+        0.046875,
+        0.0390625,
+        0.0390625,
+        0.02734375,
+        0.05078125,
+        0.05078125,
+        0.046875,
+        0.04296875,
+        0.046875,
+        0.05859375,
+        0.05859375,
+        0.04296875,
+        0.05078125,
+        0.05078125,
+        0.046875,
+        0.03125,
+        0.04296875,
+        0.0390625,
+        0.05078125,
+        0.03125,
+        0.03125,
+        0.03515625,
+        0.0390625
+      ]
+    }
+  },
+  "first_exact_by_run": {}
+}
+RESULT config=logistic_unigram_shared_highC_seqrand ckpt_step=1000 views=512000 token_acc=0.0403 exact=0/64 exact_refs=0 hits=[]
+[combo-pilot] continue config=logistic_unigram_shared_highC_seqrand step=1000
+[combo-pilot] train config=logistic_unigram_shared_C1024 from=0 to=1000 sampler=logistic_normal_linear_mean C=1.0->1024 unigram_shared=0.5 seq=0.0
+[combo-pilot] eval config=logistic_unigram_shared_C1024 step=1000
+[eval-decode-acc] train8_combo_len256_logistic_unigram_shared_C1024_20260517_170456 step=1000 soft=none
+[decode] max_len=256 generated=64/64
+{
+  "num_rows": 1,
+  "best_by_run": {
+    "train8_combo_len256_logistic_unigram_shared_C1024_20260517_170456::none": {
+      "run": "train8_combo_len256_logistic_unigram_shared_C1024_20260517_170456",
+      "checkpoint": "runs/train8_combo_len256_logistic_unigram_shared_C1024_20260517_170456/step_0001000.pt",
+      "ckpt_step": 1000,
+      "endpoint_softening": "none",
+      "decode_rule": "flowmap",
+      "steps": 128,
+      "time_schedule": "logit_normal",
+      "model_t_mode": "post",
+      "final_from": "state",
+      "n_gen": 64,
+      "n_refs": 8,
+      "token_acc_mean": 0.0487060546875,
+      "token_acc_min": 0.03515625,
+      "token_acc_max": 0.07421875,
+      "exact_acc": 0.0,
+      "exact_count": 0,
+      "exact_ref_coverage": 0.0,
+      "exact_ref_count": 0,
+      "exact_ref_hits": [],
+      "best_ref_idx": [
+        5,
+        0,
+        0,
+        0,
+        5,
+        5,
+        5,
+        0,
+        5,
+        2,
+        1,
+        0,
+        7,
+        2,
+        7,
+        0,
+        3,
+        3,
+        2,
+        0,
+        2,
+        2,
+        5,
+        7,
+        5,
+        7,
+        7,
+        2,
+        5,
+        7,
+        5,
+        2,
+        1,
+        5,
+        0,
+        0,
+        5,
+        2,
+        0,
+        0,
+        2,
+        0,
+        0,
+        5,
+        5,
+        3,
+        5,
+        5,
+        5,
+        3,
+        3,
+        0,
+        3,
+        2,
+        5,
+        0,
+        7,
+        0,
+        1,
+        5,
+        2,
+        7,
+        3,
+        2
+      ],
+      "best_token_acc": [
+        0.04296875,
+        0.04296875,
+        0.04296875,
+        0.046875,
+        0.05859375,
+        0.04296875,
+        0.04296875,
+        0.05859375,
+        0.046875,
+        0.05859375,
+        0.04296875,
+        0.05859375,
+        0.0390625,
+        0.046875,
+        0.0625,
+        0.0390625,
+        0.04296875,
+        0.046875,
+        0.046875,
+        0.046875,
+        0.05078125,
+        0.05078125,
+        0.04296875,
+        0.0546875,
+        0.046875,
+        0.046875,
+        0.046875,
+        0.046875,
+        0.0625,
+        0.0625,
+        0.05078125,
+        0.0390625,
+        0.0546875,
+        0.046875,
+        0.04296875,
+        0.0390625,
+        0.05078125,
+        0.0390625,
+        0.046875,
+        0.04296875,
+        0.03515625,
+        0.046875,
+        0.046875,
+        0.0546875,
+        0.0546875,
+        0.04296875,
+        0.04296875,
+        0.0546875,
+        0.04296875,
+        0.046875,
+        0.05078125,
+        0.07421875,
+        0.04296875,
+        0.05078125,
+        0.046875,
+        0.0546875,
+        0.0546875,
+        0.04296875,
+        0.0546875,
+        0.0546875,
+        0.0546875,
+        0.05078125,
+        0.04296875,
+        0.05078125
+      ]
+    }
+  },
+  "first_exact_by_run": {}
+}
+RESULT config=logistic_unigram_shared_C1024 ckpt_step=1000 views=512000 token_acc=0.0487 exact=0/64 exact_refs=0 hits=[]
+[combo-pilot] continue config=logistic_unigram_shared_C1024 step=1000
+[combo-pilot] train config=dirichlet_unigram_shared_highC from=0 to=1000 sampler=dirichlet C=64->4096 unigram_shared=0.5 seq=0.0
+[combo-pilot] eval config=dirichlet_unigram_shared_highC step=1000
+[eval-decode-acc] train8_combo_len256_dirichlet_unigram_shared_highC_20260517_170456 step=1000 soft=none
+[decode] max_len=256 generated=64/64
+{
+  "num_rows": 1,
+  "best_by_run": {
+    "train8_combo_len256_dirichlet_unigram_shared_highC_20260517_170456::none": {
+      "run": "train8_combo_len256_dirichlet_unigram_shared_highC_20260517_170456",
+      "checkpoint": "runs/train8_combo_len256_dirichlet_unigram_shared_highC_20260517_170456/step_0001000.pt",
+      "ckpt_step": 1000,
+      "endpoint_softening": "none",
+      "decode_rule": "flowmap",
+      "steps": 128,
+      "time_schedule": "logit_normal",
+      "model_t_mode": "post",
+      "final_from": "state",
+      "n_gen": 64,
+      "n_refs": 8,
+      "token_acc_mean": 0.03857421875,
+      "token_acc_min": 0.02734375,
+      "token_acc_max": 0.05078125,
+      "exact_acc": 0.0,
+      "exact_count": 0,
+      "exact_ref_coverage": 0.0,
+      "exact_ref_count": 0,
+      "exact_ref_hits": [],
+      "best_ref_idx": [
+        1,
+        1,
+        1,
+        2,
+        1,
+        1,
+        0,
+        1,
+        0,
+        1,
+        0,
+        0,
+        1,
+        1,
+        1,
+        1,
+        1,
+        1,
+        1,
+        2,
+        0,
+        1,
+        1,
+        2,
+        1,
+        1,
+        0,
+        0,
+        1,
+        0,
+        2,
+        1,
+        1,
+        0,
+        0,
+        1,
+        0,
+        2,
+        0,
+        1,
+        1,
+        1,
+        1,
+        1,
+        0,
+        5,
+        2,
+        1,
+        0,
+        2,
+        1,
+        1,
+        1,
+        2,
+        1,
+        0,
+        1,
+        1,
+        1,
+        1,
+        1,
+        1,
+        0,
+        1
+      ],
+      "best_token_acc": [
+        0.03125,
+        0.04296875,
+        0.046875,
+        0.0390625,
+        0.0390625,
+        0.0390625,
+        0.04296875,
+        0.03515625,
+        0.0390625,
+        0.03515625,
+        0.03125,
+        0.02734375,
+        0.03515625,
+        0.03125,
+        0.03515625,
+        0.03515625,
+        0.03515625,
+        0.04296875,
+        0.04296875,
+        0.03125,
+        0.02734375,
+        0.03125,
+        0.04296875,
+        0.0390625,
+        0.0390625,
+        0.03515625,
+        0.03515625,
+        0.0390625,
+        0.046875,
+        0.03515625,
+        0.05078125,
+        0.0390625,
+        0.046875,
+        0.04296875,
+        0.0390625,
+        0.0390625,
+        0.0390625,
+        0.04296875,
+        0.03125,
+        0.046875,
+        0.03515625,
+        0.046875,
+        0.046875,
+        0.04296875,
+        0.03125,
+        0.03515625,
+        0.03515625,
+        0.0390625,
+        0.03125,
+        0.046875,
+        0.0390625,
+        0.05078125,
+        0.0390625,
+        0.02734375,
+        0.02734375,
+        0.0390625,
+        0.05078125,
+        0.03125,
+        0.03515625,
+        0.04296875,
+        0.0390625,
+        0.04296875,
+        0.0390625,
+        0.046875
+      ]
+    }
+  },
+  "first_exact_by_run": {}
+}
+RESULT config=dirichlet_unigram_shared_highC ckpt_step=1000 views=512000 token_acc=0.0386 exact=0/64 exact_refs=0 hits=[]
+[combo-pilot] continue config=dirichlet_unigram_shared_highC step=1000
+[combo-pilot] round=2 Sun May 17 17:08:26 UTC 2026
+[combo-pilot] train config=logistic_unigram_shared_highC from=1000 to=2000 sampler=logistic_normal_linear_mean C=64->4096 unigram_shared=0.5 seq=0.0
+[combo-pilot] eval config=logistic_unigram_shared_highC step=2000
+[eval-decode-acc] train8_combo_len256_logistic_unigram_shared_highC_20260517_170456 step=2000 soft=none
+[decode] max_len=256 generated=64/64
+{
+  "num_rows": 1,
+  "best_by_run": {
+    "train8_combo_len256_logistic_unigram_shared_highC_20260517_170456::none": {
+      "run": "train8_combo_len256_logistic_unigram_shared_highC_20260517_170456",
+      "checkpoint": "runs/train8_combo_len256_logistic_unigram_shared_highC_20260517_170456/step_0002000.pt",
+      "ckpt_step": 2000,
+      "endpoint_softening": "none",
+      "decode_rule": "flowmap",
+      "steps": 128,
+      "time_schedule": "logit_normal",
+      "model_t_mode": "post",
+      "final_from": "state",
+      "n_gen": 64,
+      "n_refs": 8,
+      "token_acc_mean": 0.03033447265625,
+      "token_acc_min": 0.015625,
+      "token_acc_max": 0.046875,
+      "exact_acc": 0.0,
+      "exact_count": 0,
+      "exact_ref_coverage": 0.0,
+      "exact_ref_count": 0,
+      "exact_ref_hits": [],
+      "best_ref_idx": [
+        1,
+        1,
+        1,
+        1,
+        7,
+        3,
+        1,
+        7,
+        0,
+        0,
+        0,
+        3,
+        1,
+        0,
+        1,
+        5,
+        0,
+        0,
+        3,
+        0,
+        0,
+        1,
+        0,
+        7,
+        7,
+        1,
+        7,
+        0,
+        1,
+        0,
+        7,
+        1,
+        0,
+        0,
+        0,
+        0,
+        0,
+        3,
+        1,
+        0,
+        0,
+        1,
+        7,
+        5,
+        1,
+        0,
+        1,
+        1,
+        1,
+        0,
+        0,
+        0,
+        0,
+        0,
+        3,
+        0,
+        1,
+        7,
+        7,
+        0,
+        7,
+        0,
+        7,
+        5
+      ],
+      "best_token_acc": [
+        0.03125,
+        0.02734375,
+        0.02734375,
+        0.03515625,
+        0.046875,
+        0.02734375,
+        0.03125,
+        0.04296875,
+        0.04296875,
+        0.02734375,
+        0.046875,
+        0.03515625,
+        0.02734375,
+        0.0234375,
+        0.01953125,
+        0.02734375,
+        0.02734375,
+        0.0390625,
+        0.02734375,
+        0.01953125,
+        0.03125,
+        0.03125,
+        0.01953125,
+        0.0390625,
+        0.0234375,
+        0.03125,
+        0.02734375,
+        0.02734375,
+        0.03125,
+        0.03125,
+        0.03125,
+        0.02734375,
+        0.03125,
+        0.03515625,
+        0.03125,
+        0.02734375,
+        0.03515625,
+        0.02734375,
+        0.0234375,
+        0.02734375,
+        0.03125,
+        0.03125,
+        0.03515625,
+        0.03515625,
+        0.02734375,
+        0.01953125,
+        0.0234375,
+        0.0234375,
+        0.015625,
+        0.046875,
+        0.03125,
+        0.02734375,
+        0.03515625,
+        0.0234375,
+        0.03125,
+        0.02734375,
+        0.0234375,
+        0.02734375,
+        0.03125,
+        0.03515625,
+        0.03515625,
+        0.03125,
+        0.03125,
+        0.0390625
+      ]
+    }
+  },
+  "first_exact_by_run": {}
+}
+RESULT config=logistic_unigram_shared_highC ckpt_step=2000 views=1024000 token_acc=0.0303 exact=0/64 exact_refs=0 hits=[]
+[combo-pilot] continue config=logistic_unigram_shared_highC step=2000
+[combo-pilot] train config=logistic_unigram_shared_highC_seqrand from=1000 to=2000 sampler=logistic_normal_linear_mean C=64->4096 unigram_shared=0.5 seq=0.5
+[combo-pilot] eval config=logistic_unigram_shared_highC_seqrand step=2000
+[eval-decode-acc] train8_combo_len256_logistic_unigram_shared_highC_seqrand_20260517_170456 step=2000 soft=none
+[decode] max_len=256 generated=64/64
+{
+  "num_rows": 1,
+  "best_by_run": {
+    "train8_combo_len256_logistic_unigram_shared_highC_seqrand_20260517_170456::none": {
+      "run": "train8_combo_len256_logistic_unigram_shared_highC_seqrand_20260517_170456",
+      "checkpoint": "runs/train8_combo_len256_logistic_unigram_shared_highC_seqrand_20260517_170456/step_0002000.pt",
+      "ckpt_step": 2000,
+      "endpoint_softening": "none",
+      "decode_rule": "flowmap",
+      "steps": 128,
+      "time_schedule": "logit_normal",
+      "model_t_mode": "post",
+      "final_from": "state",
+      "n_gen": 64,
+      "n_refs": 8,
+      "token_acc_mean": 0.04046630859375,
+      "token_acc_min": 0.01953125,
+      "token_acc_max": 0.06640625,
+      "exact_acc": 0.0,
+      "exact_count": 0,
+      "exact_ref_coverage": 0.0,
+      "exact_ref_count": 0,
+      "exact_ref_hits": [],
+      "best_ref_idx": [
+        0,
+        7,
+        0,
+        0,
+        0,
+        0,
+        7,
+        0,
+        7,
+        7,
+        7,
+        0,
+        7,
+        7,
+        7,
+        7,
+        7,
+        7,
+        7,
+        0,
+        0,
+        7,
+        0,
+        0,
+        0,
+        7,
+        0,
+        7,
+        0,
+        0,
+        0,
+        0,
+        7,
+        1,
+        0,
+        7,
+        0,
+        0,
+        5,
+        0,
+        0,
+        7,
+        0,
+        0,
+        0,
+        7,
+        5,
+        0,
+        5,
+        2,
+        0,
+        0,
+        0,
+        7,
+        0,
+        7,
+        1,
+        0,
+        0,
+        0,
+        7,
+        2,
+        0,
+        0
+      ],
+      "best_token_acc": [
+        0.01953125,
+        0.03125,
+        0.03515625,
+        0.0546875,
+        0.0390625,
+        0.0546875,
+        0.0234375,
+        0.03125,
+        0.046875,
+        0.05078125,
+        0.0390625,
+        0.0234375,
+        0.0390625,
+        0.05859375,
+        0.02734375,
+        0.02734375,
+        0.0546875,
+        0.05078125,
+        0.03515625,
+        0.046875,
+        0.05859375,
+        0.02734375,
+        0.046875,
+        0.04296875,
+        0.0546875,
+        0.01953125,
+        0.046875,
+        0.03125,
+        0.05078125,
+        0.05859375,
+        0.04296875,
+        0.01953125,
+        0.05078125,
+        0.02734375,
+        0.046875,
+        0.03515625,
+        0.03515625,
+        0.05859375,
+        0.03125,
+        0.04296875,
+        0.046875,
+        0.05078125,
+        0.04296875,
+        0.0546875,
+        0.02734375,
+        0.02734375,
+        0.02734375,
+        0.046875,
+        0.01953125,
+        0.03515625,
+        0.06640625,
+        0.03515625,
+        0.046875,
+        0.046875,
+        0.05078125,
+        0.03125,
+        0.03125,
+        0.03125,
+        0.03125,
+        0.0546875,
+        0.0546875,
+        0.02734375,
+        0.0546875,
+        0.03125
+      ]
+    }
+  },
+  "first_exact_by_run": {}
+}
+RESULT config=logistic_unigram_shared_highC_seqrand ckpt_step=2000 views=1024000 token_acc=0.0405 exact=0/64 exact_refs=0 hits=[]
+[combo-pilot] continue config=logistic_unigram_shared_highC_seqrand step=2000
+[combo-pilot] train config=logistic_unigram_shared_C1024 from=1000 to=2000 sampler=logistic_normal_linear_mean C=1.0->1024 unigram_shared=0.5 seq=0.0

LTA_openwebtext_dualt/logs/train8_len_sweep_compact_bs512_until_exact_4gpu/driver.log ADDED Viewed

The diff for this file is too large to render. See raw diff

LTA_openwebtext_dualt/scripts/apple_to_apple_lta_checks.py ADDED Viewed

	@@ -0,0 +1,631 @@

+#!/usr/bin/env python3
+from __future__ import annotations
+import argparse
+import csv
+import json
+import math
+import sys
+from collections import Counter
+from pathlib import Path
+from typing import Any, Iterable
+import torch
+import torch.nn.functional as F
+from torch.utils.data import DataLoader
+REPO_ROOT = Path(__file__).resolve().parents[1]
+if str(REPO_ROOT) not in sys.path:
+    sys.path.insert(0, str(REPO_ROOT))
+from eval import build_model_from_ckpt
+from flowtext_lab.bridges import make_dirichlet_bridge_batch
+from flowtext_lab.data import EosPadCollator, WrappedStreamingTextSequenceDataset, iter_text_records
+from flowtext_lab.decode import sample_noise_simplex, state_for_model
+from flowtext_lab.tokenization import BpeTextTokenizer
+from train import TokenizedTextCollator, load_tokenized_hf_dataset
+def token_piece(tok: BpeTextTokenizer, idx: int) -> str:
+    raw = getattr(tok, "tokenizer", None)
+    id_to_token = getattr(raw, "id_to_token", None)
+    if callable(id_to_token):
+        piece = id_to_token(int(idx))
+        if piece is not None:
+            return str(piece)
+    return tok.decode([int(idx)], stop_at_eos=False, skip_special_tokens=False)
+def token_text(tok: BpeTextTokenizer, idx: int) -> str:
+    return tok.decode([int(idx)], stop_at_eos=False, skip_special_tokens=False)
+def compact_piece(s: str) -> str:
+    return s.replace("\n", "\\n").replace("\t", "\\t")
+def load_batch(
+    *,
+    data_path: str,
+    tokenizer: BpeTextTokenizer,
+    max_len: int,
+    batch_size: int,
+    mode: str,
+    text_column: str | None,
+    openwebtext_split: str,
+    wrap_mode: str,
+    max_records: int,
+    tokenized_pad_token: str,
+) -> dict[str, torch.Tensor]:
+    if mode == "tokenized_hf":
+        ds = load_tokenized_hf_dataset(data_path, max_records=max_records)
+        pad_id = tokenizer.pad_id if tokenized_pad_token == "pad" and tokenizer.pad_id is not None else tokenizer.eos_id
+        collate = TokenizedTextCollator(pad_id, max_len=max_len)
+        examples = [ds[i] for i in range(min(batch_size, len(ds)))]
+        return collate(examples)
+    if mode != "wrap":
+        raise ValueError(f"unknown data mode: {mode}")
+    ds = WrappedStreamingTextSequenceDataset(
+        data_path,
+        tokenizer,
+        max_len=max_len,
+        text_column=text_column,
+        openwebtext_split=openwebtext_split,
+        max_records_per_epoch=max_records,
+        wrap_mode=wrap_mode,
+    )
+    loader = DataLoader(ds, batch_size=batch_size, collate_fn=EosPadCollator(tokenizer.eos_id, max_len=max_len))
+    return next(iter(loader))
+def iter_record_lengths(
+    *,
+    data_path: str,
+    tokenizer: BpeTextTokenizer,
+    mode: str,
+    text_column: str | None,
+    openwebtext_split: str,
+    max_records: int,
+) -> Iterable[int]:
+    if mode == "tokenized_hf":
+        ds = load_tokenized_hf_dataset(data_path, max_records=max_records)
+        for ex in ds:
+            raw = ex["input_ids"]
+            if hasattr(raw, "tolist"):
+                raw = raw.tolist()
+            yield len(raw)
+        return
+    for i, text in enumerate(
+        iter_text_records(
+            data_path,
+            text_column=text_column,
+            openwebtext_split=openwebtext_split,
+            detokenizer="auto",
+        )
+    ):
+        if i >= max_records:
+            break
+        ids = tokenizer.encode(text, add_eos=False, add_special_tokens=False)
+        yield len(ids)
+def rate_summary(values: list[float]) -> dict[str, float]:
+    if not values:
+        return {"mean": 0.0, "min": 0.0, "p50": 0.0, "p90": 0.0, "p99": 0.0, "max": 0.0}
+    vals = sorted(float(x) for x in values)
+    n = len(vals)
+    def q(p: float) -> float:
+        return vals[min(n - 1, max(0, int(round((n - 1) * p))))]
+    return {
+        "mean": float(sum(vals) / n),
+        "min": float(vals[0]),
+        "p50": float(q(0.5)),
+        "p90": float(q(0.9)),
+        "p99": float(q(0.99)),
+        "max": float(vals[-1]),
+    }
+def distribution_entropy_from_counts(counts: Counter[int]) -> float:
+    total = sum(counts.values())
+    if total <= 0:
+        return 0.0
+    out = 0.0
+    for c in counts.values():
+        p = c / total
+        out -= p * math.log(max(p, 1e-12))
+    return float(out)
+def token_feature_rates(ids: torch.Tensor, tok: BpeTextTokenizer) -> dict[str, float]:
+    flat = [int(x) for x in ids.reshape(-1).tolist()]
+    if not flat:
+        return {}
+    pieces = [token_piece(tok, x) for x in flat]
+    texts = [token_text(tok, x) for x in flat]
+    specials = {tok.eos_id, tok.bos_id, tok.unk_id}
+    if tok.pad_id is not None:
+        specials.add(tok.pad_id)
+    denom = len(flat)
+    normal = [i for i, x in enumerate(flat) if x not in specials]
+    normal_denom = max(len(normal), 1)
+    return {
+        "bert_hash_rate": sum(pieces[i].startswith("##") for i in normal) / normal_denom,
+        "spm_cont_rate": sum((not pieces[i].startswith("▁")) and (not pieces[i].startswith("<")) for i in normal) / normal_denom,
+        "single_char_rate": sum(len(texts[i].strip()) == 1 for i in normal) / normal_denom,
+        "digit_piece_rate": sum(any(ch.isdigit() for ch in pieces[i]) for i in normal) / normal_denom,
+        "url_piece_rate": sum(("http" in pieces[i].lower() or "www" in pieces[i].lower() or ".com" in pieces[i].lower()) for i in normal) / normal_denom,
+        "special_rate": sum(x in specials for x in flat) / denom,
+    }
+def command_data(args: argparse.Namespace) -> None:
+    """Compute summary statistics for one batch of data and emit them as JSON.
+
+    Loads the tokenizer from ``args.tokenizer_path``, builds a single batch via
+    ``load_batch``, derives unigram counts, sequence-length and record-length
+    summaries, boundary-token statistics and token feature rates, then writes
+    the result to ``args.out_json`` (creating parent directories) and prints
+    the same JSON to stdout.
+    """
+    tok = BpeTextTokenizer.from_file(args.tokenizer_path)
+    batch = load_batch(
+        data_path=args.data_path,
+        tokenizer=tok,
+        max_len=args.max_len,
+        batch_size=args.n_sequences,
+        mode=args.data_mode,
+        text_column=args.text_column,
+        openwebtext_split=args.openwebtext_split,
+        wrap_mode=args.wrap_mode,
+        max_records=args.max_records,
+        tokenized_pad_token=args.tokenized_pad_token,
+    )
+    ids = batch["ids"]
+    # Without an "attn_mask" entry, every position is treated as valid.
+    attn = batch.get("attn_mask", torch.ones_like(ids, dtype=torch.bool))
+    # NOTE(review): indexing with attn assumes it is a boolean mask — confirm the collators emit bool.
+    valid_ids = ids[attn]
+    counts = Counter(int(x) for x in valid_ids.tolist())
+    # Most frequent token ids among valid positions, with printable piece/text forms.
+    top = [
+        {
+            "id": int(i),
+            "piece": compact_piece(token_piece(tok, int(i))),
+            "text": compact_piece(token_text(tok, int(i))),
+            "count": int(c),
+            "rate": float(c / max(valid_ids.numel(), 1)),
+        }
+        for i, c in counts.most_common(args.top_k)
+    ]
+    seq_lens = attn.long().sum(dim=1).tolist()
+    # "Internal" positions exclude the first and last column of the batch.
+    internal = ids[:, 1:-1] if ids.size(1) > 2 else ids[:, :0]
+    internal_attn = attn[:, 1:-1] if attn.size(1) > 2 else attn[:, :0]
+    eos_internal = ((internal == tok.eos_id) & internal_attn).long().sum(dim=1).tolist()
+    pad_internal = []
+    if tok.pad_id is not None:
+        pad_internal = ((internal == tok.pad_id) & internal_attn).long().sum(dim=1).tolist()
+    pos0 = Counter(int(x) for x in ids[:, 0].tolist())
+    last_valid = []
+    for row, mask in zip(ids, attn):
+        # Index of the last valid token, taken as (number of valid positions - 1).
+        # NOTE(review): this assumes valid positions form a contiguous prefix — confirm.
+        idx = int(mask.long().sum().item()) - 1
+        if idx >= 0:
+            last_valid.append(int(row[idx].item()))
+    last_counts = Counter(last_valid)
+    # Per-record token lengths come from a separate pass over the data source.
+    record_lengths = list(
+        iter_record_lengths(
+            data_path=args.data_path,
+            tokenizer=tok,
+            mode=args.data_mode,
+            text_column=args.text_column,
+            openwebtext_split=args.openwebtext_split,
+            max_records=args.max_records,
+        )
+    )
+    out = {
+        "name": args.name,
+        "data_path": args.data_path,
+        "data_mode": args.data_mode,
+        "tokenizer_path": args.tokenizer_path,
+        "vocab_size": tok.vocab_size,
+        "bos_id": tok.bos_id,
+        "bos_piece": token_piece(tok, tok.bos_id),
+        "eos_id": tok.eos_id,
+        "eos_piece": token_piece(tok, tok.eos_id),
+        "pad_id": tok.pad_id,
+        "n_sequences": int(ids.size(0)),
+        "max_len": args.max_len,
+        "sequence_len": rate_summary([float(x) for x in seq_lens]),
+        "record_token_len_no_special_no_eos": rate_summary([float(x) for x in record_lengths]),
+        "internal_eos_per_seq": rate_summary([float(x) for x in eos_internal]),
+        # None when the tokenizer has no pad id (pad_internal stays empty).
+        "internal_pad_per_seq": rate_summary([float(x) for x in pad_internal]) if pad_internal else None,
+        "pos0_top": [
+            {"id": i, "piece": compact_piece(token_piece(tok, i)), "count": c, "rate": c / max(ids.size(0), 1)}
+            for i, c in pos0.most_common(args.top_k)
+        ],
+        "last_valid_top": [
+            {"id": i, "piece": compact_piece(token_piece(tok, i)), "count": c, "rate": c / max(len(last_valid), 1)}
+            for i, c in last_counts.most_common(args.top_k)
+        ],
+        "unigram_entropy": distribution_entropy_from_counts(counts),
+        "token_feature_rates": token_feature_rates(valid_ids, tok),
+        "top_unigram": top,
+    }
+    Path(args.out_json).parent.mkdir(parents=True, exist_ok=True)
+    Path(args.out_json).write_text(json.dumps(out, indent=2, ensure_ascii=False), encoding="utf-8")
+    print(json.dumps(out, indent=2, ensure_ascii=False), flush=True)
+def ckpt_arg(ckpt_args: dict[str, Any], key: str, default: Any) -> Any:
+    return ckpt_args.get(key, default)
+def make_bridge_for_eval(
+    *,
+    ids: torch.Tensor,
+    attn: torch.Tensor,
+    ckpt_args: dict[str, Any],
+    vocab_size: int,
+    t_value: float,
+    force_mask_ratio: float | None,
+    eps: float,
+) -> Any:
+    return make_dirichlet_bridge_batch(
+        ids=ids,
+        attn_mask=attn,
+        vocab_size=vocab_size,
+        target_prob=float(ckpt_arg(ckpt_args, "target_prob", 1.0)),
+        min_t=float(ckpt_arg(ckpt_args, "min_t", 0.0)),
+        max_t=float(ckpt_arg(ckpt_args, "max_t", 1.0)),
+        min_mask_ratio=float(ckpt_arg(ckpt_args, "min_mask_ratio", 0.1)),
+        max_mask_ratio=float(ckpt_arg(ckpt_args, "max_mask_ratio", 1.0)),
+        wrong_token_replace_prob=ckpt_arg(ckpt_args, "wrong_token_replace_prob", "0.0"),
+        wrong_token_schedule=str(ckpt_arg(ckpt_args, "wrong_token_schedule", "constant")),
+        wrong_token_exp_k=float(ckpt_arg(ckpt_args, "wrong_token_exp_k", 1.0)),
+        dirichlet_concentration_min=float(ckpt_arg(ckpt_args, "dirichlet_concentration_min", 1.0)),
+        dirichlet_concentration_max=float(ckpt_arg(ckpt_args, "dirichlet_concentration_max", 1024.0)),
+        eps=eps,
+        state_format=str(ckpt_arg(ckpt_args, "state_format", ckpt_arg(ckpt_args, "input_format", "prob"))),
+        dirichlet_endpoint_mode=str(ckpt_arg(ckpt_args, "dirichlet_endpoint_mode", "bernoulli_wrong")),
+        dirichlet_semantic_t_mode=str(ckpt_arg(ckpt_args, "dirichlet_semantic_t_mode", "same")),
+        dirichlet_semantic_t_value=float(ckpt_arg(ckpt_args, "dirichlet_semantic_t_value", 0.0)),
+        dirichlet_semantic_t_curve=str(ckpt_arg(ckpt_args, "dirichlet_semantic_t_curve", "linear")),
+        dirichlet_semantic_t_power=float(ckpt_arg(ckpt_args, "dirichlet_semantic_t_power", 1.0)),
+        dirichlet_support_t_curve=str(ckpt_arg(ckpt_args, "dirichlet_support_t_curve", "linear")),
+        dirichlet_support_t_power=float(ckpt_arg(ckpt_args, "dirichlet_support_t_power", 1.0)),
+        endpoint_sequence_random_prob_alpha=float(ckpt_arg(ckpt_args, "endpoint_sequence_random_prob_alpha", 0.0)),
+        categorical_wrong_from_full_vocab=bool(ckpt_arg(ckpt_args, "categorical_wrong_from_full_vocab", False)),
+        categorical_wrong_from_batch_valid_tokens=bool(ckpt_arg(ckpt_args, "categorical_wrong_from_batch_valid_tokens", False)),
+        categorical_wrong_basin_token_ids=ckpt_arg(ckpt_args, "categorical_wrong_basin_token_ids", ""),
+        categorical_wrong_basin_prob=float(ckpt_arg(ckpt_args, "categorical_wrong_basin_prob", 0.0)),
+        categorical_wrong_unigram_prob=float(ckpt_arg(ckpt_args, "categorical_wrong_unigram_prob", 0.0)),
+        categorical_wrong_uniform_prob=float(ckpt_arg(ckpt_args, "categorical_wrong_uniform_prob", 0.0)),
+        categorical_wrong_prob_floor=float(ckpt_arg(ckpt_args, "categorical_wrong_prob_floor", 0.0)),
+        categorical_gold_prob_floor=float(ckpt_arg(ckpt_args, "categorical_gold_prob_floor", 0.0)),
+        categorical_gold_prob_ceil=float(ckpt_arg(ckpt_args, "categorical_gold_prob_ceil", 1.0)),
+        simplex_bridge_sampler=str(ckpt_arg(ckpt_args, "simplex_bridge_sampler", "dirichlet")),
+        logistic_normal_sigma_min=float(ckpt_arg(ckpt_args, "logistic_normal_sigma_min", 0.18)),
+        logistic_normal_sigma_max=float(ckpt_arg(ckpt_args, "logistic_normal_sigma_max", 2.2)),
+        logistic_normal_tau_min=float(ckpt_arg(ckpt_args, "logistic_normal_tau_min", 0.65)),
+        logistic_normal_tau_max=float(ckpt_arg(ckpt_args, "logistic_normal_tau_max", 1.15)),
+        force_t=t_value,
+        force_mask_ratio=force_mask_ratio,
+        mask_ratio_floor_schedule=str(ckpt_arg(ckpt_args, "mask_ratio_floor_schedule", "none")),
+        mask_mixture_original_prob=float(ckpt_arg(ckpt_args, "mask_mixture_original_prob", 0.0)),
+        mask_mixture_lowk_prob=float(ckpt_arg(ckpt_args, "mask_mixture_lowk_prob", 0.0)),
+        mask_mixture_lowcorrupt_prob=float(ckpt_arg(ckpt_args, "mask_mixture_lowcorrupt_prob", 0.0)),
+        mask_mixture_block_prob=float(ckpt_arg(ckpt_args, "mask_mixture_block_prob", 0.0)),
+        mask_mixture_all_prob=float(ckpt_arg(ckpt_args, "mask_mixture_all_prob", 0.0)),
+        mask_mixture_lowk_clean_tokens=ckpt_arg(ckpt_args, "mask_mixture_lowk_clean_tokens", "1,2,4,8,16,32,64"),
+        mask_mixture_lowcorrupt_tokens=ckpt_arg(ckpt_args, "mask_mixture_lowcorrupt_tokens", "1,2,4,8,16,32,64"),
+        mask_mixture_block_tokens=ckpt_arg(ckpt_args, "mask_mixture_block_tokens", "64,128"),
+        clean_state_mode=str(ckpt_arg(ckpt_args, "clean_state_mode", "onehot")),
+        return_dense_targets=False,
+    )
+def masked_loss_acc(logits: torch.Tensor, target: torch.Tensor, mask: torch.Tensor) -> dict[str, float]:
+    flat_mask = mask.reshape(-1)
+    if not bool(flat_mask.any().item()):
+        return {"nll": 0.0, "ppl": 1.0, "acc": 0.0, "tokens": 0}
+    flat_logits = logits.reshape(-1, logits.size(-1))[flat_mask]
+    flat_target = target.reshape(-1)[flat_mask]
+    loss = F.cross_entropy(flat_logits, flat_target, reduction="mean")
+    pred = flat_logits.argmax(dim=-1)
+    acc = (pred == flat_target).float().mean()
+    return {
+        "nll": float(loss.detach().cpu()),
+        "ppl": float(torch.exp(loss.clamp(max=50)).detach().cpu()),
+        "acc": float(acc.detach().cpu()),
+        "tokens": int(flat_mask.sum().detach().cpu()),
+    }
+@torch.inference_mode()
+def command_teacher(args: argparse.Namespace) -> None:
+    # `teacher` sub-command: for every t in --t_values, corrupt one fixed data batch with
+    # make_bridge_for_eval, run the checkpointed model once on it, and record loss / accuracy /
+    # entropy statistics. Results are written to args.out_json, to a flattened .tsv next to it,
+    # and summarised on stdout (one line per t).
+    tok = BpeTextTokenizer.from_file(args.tokenizer_path)
+    device = torch.device("cuda" if torch.cuda.is_available() and not args.cpu else "cpu")
+    # NOTE(review): weights_only=False lets torch.load unpickle arbitrary objects; only load trusted checkpoints.
+    ckpt = torch.load(args.checkpoint, map_location="cpu", weights_only=False)
+    # Settings saved with the checkpoint; a missing "args" entry yields an empty dict, so
+    # make_bridge_for_eval falls back to its own defaults.
+    ckpt_args = dict(ckpt.get("args", {}))
+    model = build_model_from_ckpt(ckpt, tok.vocab_size, args.max_len, device).eval()
+    batch = load_batch(
+        data_path=args.data_path,
+        tokenizer=tok,
+        max_len=args.max_len,
+        batch_size=args.batch_size,
+        mode=args.data_mode,
+        text_column=args.text_column,
+        openwebtext_split=args.openwebtext_split,
+        wrap_mode=args.wrap_mode,
+        max_records=args.max_records,
+        tokenized_pad_token=args.tokenized_pad_token,
+    )
+    ids = batch["ids"].to(device)
+    # Without an explicit mask every position counts as valid.
+    attn = batch.get("attn_mask", torch.ones_like(ids, dtype=torch.bool)).to(device)
+    rows = []
+    # --t_values is a comma-separated list; blank entries are skipped.
+    for t_value in [float(x) for x in args.t_values.split(",") if x.strip()]:
+        # The seed is derived from (seed, t), so each t gets its own reproducible random stream.
+        torch.manual_seed(args.seed + int(round(t_value * 1000000)))
+        bridge = make_bridge_for_eval(
+            ids=ids,
+            attn=attn,
+            ckpt_args=ckpt_args,
+            vocab_size=tok.vocab_size,
+            t_value=t_value,
+            force_mask_ratio=args.force_mask_ratio,
+            eps=args.eps,
+        )
+        model_t = bridge.t
+        # The model receives the bridge state (via state_for_model), the bridge's t and the mask.
+        logits = model(state_for_model(model, bridge.state, args.eps), model_t, attn).float()
+        valid = attn
+        # Corrupted positions, restricted to valid ones.
+        corrupt = bridge.corrupt_mask & attn
+        pos0_pred = logits[:, 0].argmax(dim=-1)
+        # Argmax prediction at each row's last valid position; -1 marks a row with no valid position.
+        # NOTE(review): a -1 entry would later be handed to token_piece via last_counts — confirm that is safe.
+        last_pred = []
+        for b in range(ids.size(0)):
+            last = int(attn[b].long().sum().item()) - 1
+            last_pred.append(int(logits[b, last].argmax().detach().cpu()) if last >= 0 else -1)
+        pos0_counts = Counter(int(x) for x in pos0_pred.detach().cpu().tolist())
+        last_counts = Counter(last_pred)
+        probs = F.softmax(logits, dim=-1)
+        rows.append(
+            {
+                "name": args.name,
+                "checkpoint": args.checkpoint,
+                "ckpt_step": int(ckpt.get("step", -1)),
+                "t": t_value,
+                "force_mask_ratio": args.force_mask_ratio,
+                # NOTE(review): averaged over every position (padding included), whereas wrong_frac
+                # below is divided by the number of valid positions — confirm the asymmetry is intended.
+                "corrupt_frac": float(corrupt.float().mean().detach().cpu()),
+                "wrong_frac": float((bridge.wrong_mask & attn).float().sum().detach().cpu() / attn.float().sum().clamp_min(1).detach().cpu()),
+                "valid": masked_loss_acc(logits, ids, valid),
+                "corrupt": masked_loss_acc(logits, ids, corrupt),
+                # Entropy of the predicted distribution (probabilities clamped at eps), averaged over valid positions.
+                "dist_entropy": float((-(probs.clamp_min(args.eps) * probs.clamp_min(args.eps).log()).sum(dim=-1)[valid]).mean().detach().cpu()),
+                "mean_maxp": float(probs.max(dim=-1).values[valid].mean().detach().cpu()),
+                # Gold token at position 0 of the first sequence only.
+                "pos0_gold_id": int(ids[0, 0].detach().cpu()),
+                "pos0_gold_piece": token_piece(tok, int(ids[0, 0].detach().cpu())),
+                "pos0_top": [
+                    {"id": i, "piece": compact_piece(token_piece(tok, i)), "count": c, "rate": c / max(ids.size(0), 1)}
+                    for i, c in pos0_counts.most_common(5)
+                ],
+                "last_top": [
+                    {"id": i, "piece": compact_piece(token_piece(tok, i)), "count": c, "rate": c / max(ids.size(0), 1)}
+                    for i, c in last_counts.most_common(5)
+                ],
+            }
+        )
+    out = Path(args.out_json)
+    out.parent.mkdir(parents=True, exist_ok=True)
+    out.write_text(json.dumps(rows, indent=2, ensure_ascii=False), encoding="utf-8")
+    # Flattened TSV next to the JSON: nested metrics become valid_*/corrupt_* columns and the
+    # top-token lists are rendered as "piece:rate" strings. The "checkpoint" field is not included.
+    with out.with_suffix(".tsv").open("w", newline="", encoding="utf-8") as f:
+        fields = [
+            "name",
+            "ckpt_step",
+            "t",
+            "force_mask_ratio",
+            "corrupt_frac",
+            "wrong_frac",
+            "valid_nll",
+            "valid_acc",
+            "corrupt_nll",
+            "corrupt_acc",
+            "dist_entropy",
+            "mean_maxp",
+            "pos0_gold_piece",
+            "pos0_top",
+            "last_top",
+        ]
+        writer = csv.DictWriter(f, fieldnames=fields, delimiter="\t")
+        writer.writeheader()
+        for row in rows:
+            writer.writerow(
+                {
+                    "name": row["name"],
+                    "ckpt_step": row["ckpt_step"],
+                    "t": row["t"],
+                    "force_mask_ratio": row["force_mask_ratio"],
+                    "corrupt_frac": row["corrupt_frac"],
+                    "wrong_frac": row["wrong_frac"],
+                    "valid_nll": row["valid"]["nll"],
+                    "valid_acc": row["valid"]["acc"],
+                    "corrupt_nll": row["corrupt"]["nll"],
+                    "corrupt_acc": row["corrupt"]["acc"],
+                    "dist_entropy": row["dist_entropy"],
+                    "mean_maxp": row["mean_maxp"],
+                    "pos0_gold_piece": row["pos0_gold_piece"],
+                    "pos0_top": " | ".join(f"{x['piece']}:{x['rate']:.2f}" for x in row["pos0_top"]),
+                    "last_top": " | ".join(f"{x['piece']}:{x['rate']:.2f}" for x in row["last_top"]),
+                }
+            )
+    # Console summary; indexes pos0_top[0], so it relies on the batch holding at least one sequence.
+    for row in rows:
+        print(
+            f"{row['name']} step={row['ckpt_step']} t={row['t']:.4f} "
+            f"valid_nll={row['valid']['nll']:.3f} valid_acc={row['valid']['acc']:.3f} "
+            f"corrupt_nll={row['corrupt']['nll']:.3f} corrupt_acc={row['corrupt']['acc']:.3f} "
+            f"pos0={row['pos0_top'][0]['piece']}:{row['pos0_top'][0]['rate']:.2f}",
+            flush=True,
+        )
+def filter_top_p(probs: torch.Tensor, top_p: float, eps: float) -> torch.Tensor:
+    if top_p >= 1.0:
+        return probs
+    sorted_vals, sorted_idx = torch.sort(probs, dim=-1, descending=True)
+    total = sorted_vals.sum(dim=-1, keepdim=True).clamp_min(eps)
+    remove = sorted_vals.cumsum(dim=-1) > top_p * total
+    remove[..., 0] = False
+    sorted_vals = sorted_vals.masked_fill(remove, 0.0)
+    out = torch.zeros_like(probs).scatter(-1, sorted_idx, sorted_vals)
+    return out / out.sum(dim=-1, keepdim=True).clamp_min(eps)
+def distribution_metrics(probs: torch.Tensor, ids: torch.Tensor, tok: BpeTextTokenizer, prefix: str) -> dict[str, Any]:
+    p = probs.clamp_min(1e-12)
+    ent = float((-(p * p.log()).sum(dim=-1)).mean().detach().cpu())
+    maxp, arg = probs.max(dim=-1)
+    counts = Counter(int(x) for x in arg.reshape(-1).detach().cpu().tolist())
+    return {
+        f"{prefix}_entropy": ent,
+        f"{prefix}_mean_top_mass": float(maxp.mean().detach().cpu()),
+        f"{prefix}_argmax_token_entropy": distribution_entropy_from_counts(counts),
+        f"{prefix}_argmax_top": [
+            {"id": i, "piece": compact_piece(token_piece(tok, i)), "count": c, "rate": c / max(arg.numel(), 1)}
+            for i, c in counts.most_common(8)
+        ],
+        **{f"{prefix}_{k}": v for k, v in token_feature_rates(arg.detach().cpu(), tok).items()},
+    }
+@torch.inference_mode()
+def command_trace(args: argparse.Namespace) -> None:
+    # `trace` sub-command: run an iterative sampling loop from a noise state for args.steps steps
+    # and log distribution statistics as JSON lines at the steps listed in --trace_steps (plus the
+    # final step). With --final_out the final state is also decoded to text.
+    # Random draws happen in a fixed order after torch.manual_seed, so reordering the sampling
+    # statements would change the trace obtained for a given seed.
+    tok = BpeTextTokenizer.from_file(args.tokenizer_path)
+    device = torch.device("cuda" if torch.cuda.is_available() and not args.cpu else "cpu")
+    torch.manual_seed(args.seed)
+    # NOTE(review): weights_only=False lets torch.load unpickle arbitrary objects; only load trusted checkpoints.
+    ckpt = torch.load(args.checkpoint, map_location="cpu", weights_only=False)
+    model = build_model_from_ckpt(ckpt, tok.vocab_size, args.max_len, device).eval()
+    eps = args.eps
+    bs = args.batch_size
+    # Initial state: drawn by sample_noise_simplex in "dirichlet" mode at the minimum concentration.
+    probs = sample_noise_simplex(
+        (bs, args.max_len),
+        tok.vocab_size,
+        device,
+        eps,
+        noise_mode="dirichlet",
+        target_prob=1.0,
+        noise_sigma=-1.0,
+        dirichlet_concentration=args.concentration_min,
+    )
+    # No padding: every position is marked valid.
+    attn = torch.ones((bs, args.max_len), dtype=torch.bool, device=device)
+    # The concentration is interpolated in log space, so both bounds must be > 0.
+    log_cmin = math.log(args.concentration_min)
+    log_cmax = math.log(args.concentration_max)
+    out = Path(args.out_jsonl)
+    out.parent.mkdir(parents=True, exist_ok=True)
+    snapshot = set(int(x) for x in args.trace_steps.split(",") if x.strip())
+    # Used by the final blend if the loop body never runs (steps == 0).
+    last_endpoint = probs
+    with out.open("w", encoding="utf-8") as f:
+        for step in range(args.steps):
+            # support_t rises linearly from 1/steps to 1.
+            support_t = (step + 1) / max(args.steps, 1)
+            t = torch.full((bs,), support_t, dtype=torch.float32, device=device)
+            logits = model(state_for_model(model, probs, eps), t, attn).float()
+            # Model prediction: temperature-scaled softmax followed by top-p truncation.
+            endpoint = F.softmax(logits / args.endpoint_temp, dim=-1)
+            endpoint = filter_top_p(endpoint, args.endpoint_top_p, eps)
+            # tau moves linearly from gumbel_tau_start to gumbel_tau_end as support_t goes to 1.
+            tau = args.gumbel_tau_start + support_t * (args.gumbel_tau_end - args.gumbel_tau_start)
+            # Gumbel noise is added to the log-probabilities, then a softmax with temperature tau is taken.
+            uniform = torch.rand_like(endpoint).clamp_(eps, 1.0 - eps)
+            gumbel = -torch.log(-torch.log(uniform))
+            projected = F.softmax((endpoint.clamp_min(eps).log() + gumbel) / max(tau, eps), dim=-1)
+            last_endpoint = projected
+            # Mix the uniform distribution (weight 1 - support_t) with the perturbed endpoint (weight support_t).
+            mean = (1.0 - support_t) / tok.vocab_size + support_t * projected
+            mean = mean / mean.sum(dim=-1, keepdim=True).clamp_min(eps)
+            conc = math.exp(log_cmin + support_t * (log_cmax - log_cmin))
+            # Next state: Gamma draws with shape alpha, normalised to sum to 1 (a Dirichlet(alpha) sample).
+            # torch._standard_gamma is a private torch function.
+            alpha = (mean * conc).clamp_min(eps)
+            probs = torch._standard_gamma(alpha).clamp_min(eps)
+            probs = probs / probs.sum(dim=-1, keepdim=True).clamp_min(eps)
+            step_num = step + 1
+            if step_num in snapshot or step_num == args.steps:
+                row = {
+                    "name": args.name,
+                    "ckpt_step": int(ckpt.get("step", -1)),
+                    "step": step_num,
+                    "support_t": support_t,
+                    "tau": tau,
+                    "concentration": conc,
+                }
+                # Metric prefixes: a = truncated endpoint, e = Gumbel-perturbed endpoint, p = resampled state.
+                row.update(distribution_metrics(endpoint, endpoint.argmax(dim=-1), tok, "a"))
+                row.update(distribution_metrics(projected, projected.argmax(dim=-1), tok, "e"))
+                row.update(distribution_metrics(probs, probs.argmax(dim=-1), tok, "p"))
+                # Per-position snapshots of the first sequence at its first two and last two positions.
+                # NOTE(review): assumes max_len >= 2; smaller values would index out of range — confirm.
+                for pos in [0, 1, args.max_len - 2, args.max_len - 1]:
+                    a_id = int(endpoint[0, pos].argmax().detach().cpu())
+                    e_id = int(projected[0, pos].argmax().detach().cpu())
+                    p_id = int(probs[0, pos].argmax().detach().cpu())
+                    row[f"pos{pos}_a"] = {"id": a_id, "piece": compact_piece(token_piece(tok, a_id)), "prob": float(endpoint[0, pos, a_id].detach().cpu())}
+                    row[f"pos{pos}_e"] = {"id": e_id, "piece": compact_piece(token_piece(tok, e_id)), "prob": float(projected[0, pos, e_id].detach().cpu())}
+                    row[f"pos{pos}_p"] = {"id": p_id, "piece": compact_piece(token_piece(tok, p_id)), "prob": float(probs[0, pos, p_id].detach().cpu())}
+                f.write(json.dumps(row, ensure_ascii=False) + "\n")
+                print(
+                    f"{args.name} step={step_num} aH={row['a_entropy']:.2f} eH={row['e_entropy']:.2f} pH={row['p_entropy']:.2f} "
+                    f"a_top={row['a_argmax_top'][0]['piece']}:{row['a_argmax_top'][0]['rate']:.2f} "
+                    f"p_top={row['p_argmax_top'][0]['piece']}:{row['p_argmax_top'][0]['rate']:.2f}",
+                    flush=True,
+                )
+    if args.final_out:
+        # Final text: argmax of the equal-weight average of the last state and the last perturbed
+        # endpoint, decoded without stopping at EOS and with special tokens kept.
+        final_probs = 0.5 * probs + 0.5 * last_endpoint
+        ids = final_probs.argmax(dim=-1).detach().cpu().tolist()
+        Path(args.final_out).write_text("\n\n".join(tok.decode(row, stop_at_eos=False, skip_special_tokens=False) for row in ids), encoding="utf-8")
+def main() -> None:
+    ap = argparse.ArgumentParser()
+    sub = ap.add_subparsers(dest="cmd", required=True)
+    data = sub.add_parser("data")
+    data.add_argument("--name", required=True)
+    data.add_argument("--data_path", required=True)
+    data.add_argument("--tokenizer_path", required=True)
+    data.add_argument("--out_json", required=True)
+    data.add_argument("--data_mode", choices=["wrap", "tokenized_hf"], default="wrap")
+    data.add_argument("--text_column", default=None)
+    data.add_argument("--openwebtext_split", default="all")
+    data.add_argument("--wrap_mode", default="stream")
+    data.add_argument("--tokenized_pad_token", default="pad")
+    data.add_argument("--max_len", type=int, default=1024)
+    data.add_argument("--n_sequences", type=int, default=2048)
+    data.add_argument("--max_records", type=int, default=20000)
+    data.add_argument("--top_k", type=int, default=24)
+    data.set_defaults(func=command_data)
+    teacher = sub.add_parser("teacher")
+    teacher.add_argument("--name", required=True)
+    teacher.add_argument("--checkpoint", required=True)
+    teacher.add_argument("--data_path", required=True)
+    teacher.add_argument("--tokenizer_path", required=True)
+    teacher.add_argument("--out_json", required=True)
+    teacher.add_argument("--data_mode", choices=["wrap", "tokenized_hf"], default="wrap")
+    teacher.add_argument("--text_column", default=None)
+    teacher.add_argument("--openwebtext_split", default="all")
+    teacher.add_argument("--wrap_mode", default="stream")
+    teacher.add_argument("--tokenized_pad_token", default="pad")
+    teacher.add_argument("--max_len", type=int, default=1024)
+    teacher.add_argument("--batch_size", type=int, default=8)
+    teacher.add_argument("--max_records", type=int, default=20000)
+    teacher.add_argument("--t_values", default="0.0,0.0078125,0.03125,0.125,0.5,1.0")
+    teacher.add_argument("--force_mask_ratio", type=float, default=None)
+    teacher.add_argument("--seed", type=int, default=20260525)
+    teacher.add_argument("--eps", type=float, default=1e-8)
+    teacher.add_argument("--cpu", action="store_true")
+    teacher.set_defaults(func=command_teacher)
+    trace = sub.add_parser("trace")
+    trace.add_argument("--name", required=True)
+    trace.add_argument("--checkpoint", required=True)
+    trace.add_argument("--tokenizer_path", required=True)
+    trace.add_argument("--out_jsonl", required=True)
+    trace.add_argument("--final_out", default="")
+    trace.add_argument("--max_len", type=int, default=1024)
+    trace.add_argument("--batch_size", type=int, default=2)
+    trace.add_argument("--steps", type=int, default=128)
+    trace.add_argument("--trace_steps", default="1,2,4,8,16,32,64,96,128")
+    trace.add_argument("--concentration_min", type=float, default=30522)
+    trace.add_argument("--concentration_max", type=float, default=61044)
+    trace.add_argument("--endpoint_temp", type=float, default=1.45)
+    trace.add_argument("--endpoint_top_p", type=float, default=0.95)
+    trace.add_argument("--gumbel_tau_start", type=float, default=1.0)
+    trace.add_argument("--gumbel_tau_end", type=float, default=0.2)
+    trace.add_argument("--seed", type=int, default=20260525)
+    trace.add_argument("--eps", type=float, default=1e-8)
+    trace.add_argument("--cpu", action="store_true")
+    trace.set_defaults(func=command_trace)
+    args = ap.parse_args()
+    args.func(args)
+if __name__ == "__main__":
+    main()

LTA_openwebtext_dualt/scripts/build_lta_owt_compact_gpt2bpe_stream1024_train_minus_100k_np8.sh ADDED Viewed

	@@ -0,0 +1,13 @@

+#!/usr/bin/env bash
+set -euo pipefail
+cd /e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt
+export PACKING_MODE="${PACKING_MODE:-stream_chunks}"
+export OUTPUT_SUFFIX="${OUTPUT_SUFFIX:-stream1024}"
+export CACHE_SUFFIX="${CACHE_SUFFIX:-_stream1024}"
+export LOG_DIR="${LOG_DIR:-logs/data_build_compact_gpt2bpe_stream1024}"
+export VOCAB_SIZES="${VOCAB_SIZES:-2048,4096,8192}"
+export NUM_PROC="${NUM_PROC:-8}"
+exec bash scripts/build_lta_owt_compact_gpt2bpe_packed_train_minus_100k_np8.sh "$@"

LTA_openwebtext_dualt/scripts/build_owt_t5_elf_dataset.py ADDED Viewed

	@@ -0,0 +1,587 @@

+#!/usr/bin/env python3
+from __future__ import annotations
+import argparse
+import json
+import os
+import shutil
+from pathlib import Path
+from typing import Iterator
+def parse_args() -> argparse.Namespace:
+    p = argparse.ArgumentParser(
+        description=(
+            "Build an ELF-style OpenWebText T5 token dataset. By default each raw "
+            "record is tokenized with add_special_tokens=False, overlength records "
+            "are split into max_len chunks, and short records stay short. The "
+            "packed_records mode instead concatenates EOS-terminated records up to "
+            "max_len while preserving record boundaries. stream_chunks concatenates "
+            "the token stream and slices exact max_len chunks, so chunk boundaries "
+            "are defined by the selected tokenizer."
+        )
+    )
+    p.add_argument("--data_path", required=True)
+    p.add_argument("--output_dir", required=True)
+    p.add_argument("--tokenizer_path", required=True)
+    p.add_argument("--text_column", default="text")
+    p.add_argument("--txt_record_mode", choices=["auto", "line", "eot"], default="auto")
+    p.add_argument("--openwebtext_split", choices=["all", "train_minus_100k", "valid_last_100k"], default="all")
+    p.add_argument("--openwebtext_valid_records", type=int, default=100_000)
+    p.add_argument("--detokenizer", default="auto")
+    p.add_argument("--max_len", type=int, default=1024)
+    p.add_argument(
+        "--packing_mode",
+        choices=["record_chunks", "packed_records", "stream_chunks"],
+        default="record_chunks",
+        help=(
+            "record_chunks preserves the old behavior. packed_records appends EOS "
+            "per record and packs multiple records into near-max_len examples. "
+            "stream_chunks appends EOS per record, concatenates records, and emits "
+            "exact max_len chunks across record boundaries."
+        ),
+    )
+    p.add_argument("--max_records", type=int, default=0)
+    p.add_argument("--min_len", type=int, default=1)
+    p.add_argument("--add_eos", action="store_true", help="Append tokenizer EOS to each raw record before chunking.")
+    p.add_argument("--add_special_tokens", action="store_true", help="Let the tokenizer add model special tokens.")
+    p.add_argument("--cache_dir", default="")
+    p.add_argument("--max_shard_size", default="500MB")
+    p.add_argument("--num_proc", type=int, default=max(1, min(32, (os.cpu_count() or 8) // 2)))
+    p.add_argument("--tokenize_batch_size", type=int, default=1024)
+    p.add_argument(
+        "--merge_parts",
+        action="store_true",
+        help="After parallel part build, merge into one save_to_disk dataset. Slower but portable.",
+    )
+    p.add_argument("--keep_parts", action="store_true")
+    p.add_argument("--resume_parts", action="store_true", help="Keep completed part-* directories and build only missing parts.")
+    p.add_argument("--stats_only", action="store_true")
+    p.add_argument("--overwrite", action="store_true")
+    return p.parse_args()
+def _iter_examples(
+    *,
+    data_path: str,
+    tokenizer_path: str,
+    text_column: str | None,
+    txt_record_mode: str,
+    openwebtext_split: str,
+    openwebtext_valid_records: int,
+    detokenizer: str | None,
+    max_len: int,
+    packing_mode: str,
+    max_records: int,
+    min_len: int,
+    add_eos: bool,
+    add_special_tokens: bool,
+) -> Iterator[dict]:
+    from flowtext_lab.data import iter_text_records
+    from flowtext_lab.tokenization import BpeTextTokenizer
+    tokenizer = BpeTextTokenizer.from_file(tokenizer_path)
+    seen_records = 0
+    pack: list[int] = []
+    def emit_ids(ids: list[int]) -> dict:
+        return {
+            "input_ids": [int(x) for x in ids],
+            "sequence_length": int(len(ids)),
+        }
+    def iter_record_chunks(ids: list[int]) -> Iterator[dict]:
+        for start in range(0, len(ids), max_len):
+            chunk = ids[start : start + max_len]
+            if len(chunk) >= min_len:
+                yield emit_ids(chunk)
+            if start + max_len >= len(ids):
+                break
+    def flush_pack() -> Iterator[dict]:
+        nonlocal pack
+        if len(pack) >= min_len:
+            yield emit_ids(pack)
+        pack = []
+    def append_stream(ids: list[int]) -> Iterator[dict]:
+        nonlocal pack
+        pack.extend(int(x) for x in ids)
+        while len(pack) >= max_len:
+            yield emit_ids(pack[:max_len])
+            pack = pack[max_len:]
+    for text in iter_text_records(
+        data_path,
+        text_column=text_column,
+        txt_record_mode=txt_record_mode,
+        openwebtext_split=openwebtext_split,
+        openwebtext_valid_records=openwebtext_valid_records,
+        detokenizer=detokenizer,
+    ):
+        if not text:
+            continue
+        ids = tokenizer.encode(text, add_eos=add_eos, add_special_tokens=add_special_tokens)
+        if not ids:
+            continue
+        if packing_mode == "record_chunks":
+            yield from iter_record_chunks(ids)
+        elif packing_mode == "packed_records":
+            if len(ids) > max_len:
+                yield from flush_pack()
+                yield from iter_record_chunks(ids)
+            else:
+                if pack and len(pack) + len(ids) > max_len:
+                    yield from flush_pack()
+                pack.extend(int(x) for x in ids)
+                if len(pack) >= max_len:
+                    yield from flush_pack()
+        else:
+            yield from append_stream(ids)
+        seen_records += 1
+        if max_records > 0 and seen_records >= max_records:
+            break
+    if packing_mode in ("packed_records", "stream_chunks"):
+        yield from flush_pack()
+def _stats(args: argparse.Namespace) -> dict:
+    num_examples = 0
+    total_tokens = 0
+    min_len = None
+    max_len = 0
+    hist = {"lt128": 0, "128_255": 0, "256_511": 0, "512_1023": 0, "eq1024": 0}
+    for ex in _iter_examples(**_gen_kwargs(args)):
+        length = int(ex["sequence_length"])
+        num_examples += 1
+        total_tokens += length
+        min_len = length if min_len is None else min(min_len, length)
+        max_len = max(max_len, length)
+        if length < 128:
+            hist["lt128"] += 1
+        elif length < 256:
+            hist["128_255"] += 1
+        elif length < 512:
+            hist["256_511"] += 1
+        elif length < args.max_len:
+            hist["512_1023"] += 1
+        else:
+            hist["eq1024"] += 1
+    return {
+        "num_examples": int(num_examples),
+        "total_tokens": int(total_tokens),
+        "mean_length": float(total_tokens / num_examples) if num_examples else 0.0,
+        "min_length": int(min_len or 0),
+        "max_length": int(max_len),
+        "length_hist": hist,
+    }
+def _gen_kwargs(args: argparse.Namespace) -> dict:
+    return {
+        "data_path": args.data_path,
+        "tokenizer_path": args.tokenizer_path,
+        "text_column": args.text_column,
+        "txt_record_mode": args.txt_record_mode,
+        "openwebtext_split": args.openwebtext_split,
+        "openwebtext_valid_records": args.openwebtext_valid_records,
+        "detokenizer": args.detokenizer,
+        "max_len": int(args.max_len),
+        "packing_mode": args.packing_mode,
+        "max_records": int(args.max_records),
+        "min_len": int(args.min_len),
+        "add_eos": bool(args.add_eos),
+        "add_special_tokens": bool(args.add_special_tokens),
+    }
+def _make_limited_specs(args: argparse.Namespace) -> list[tuple[str, int, int | None]]:
+    from flowtext_lab.data import _make_file_specs
+    root = Path(args.data_path)
+    if root.is_dir():
+        files = sorted(
+            p for p in root.rglob("*")
+            if p.suffix.lower() in {".txt", ".jsonl", ".json", ".parquet"}
+        )
+    else:
+        files = [root]
+    specs = _make_file_specs(files, args.openwebtext_split, int(args.openwebtext_valid_records))
+    if args.max_records <= 0:
+        return [(str(p), int(a), None if b is None else int(b)) for p, a, b in specs]
+    limited = []
+    remaining = int(args.max_records)
+    for path, start, stop in specs:
+        if remaining <= 0:
+            break
+        if stop is None:
+            limited.append((str(path), int(start), None))
+            break
+        count = max(0, int(stop) - int(start))
+        take = min(count, remaining)
+        if take > 0:
+            limited.append((str(path), int(start), int(start) + take))
+            remaining -= take
+    return limited
+def _iter_parquet_text_batches(
+    path: Path,
+    *,
+    text_column: str | None,
+    row_start: int,
+    row_stop: int | None,
+    batch_size: int,
+) -> Iterator[list[str]]:
+    import pyarrow.parquet as pq
+    pf = pq.ParquetFile(path)
+    col = text_column
+    if col is None:
+        names = set(pf.schema_arrow.names)
+        col = next((c for c in ("text", "content", "document", "article", "sentence") if c in names), None)
+    if col is None:
+        raise ValueError(f"Could not infer text column for {path}")
+    offset = 0
+    stop = pf.metadata.num_rows if row_stop is None else min(row_stop, pf.metadata.num_rows)
+    for batch in pf.iter_batches(columns=[col], batch_size=batch_size):
+        batch_start = offset
+        batch_stop = offset + batch.num_rows
+        offset = batch_stop
+        if batch_stop <= row_start:
+            continue
+        if batch_start >= stop:
+            break
+        local_start = max(0, row_start - batch_start)
+        local_stop = min(batch.num_rows, stop - batch_start)
+        values = batch.column(0).slice(local_start, local_stop - local_start).to_pylist()
+        texts = [str(value) for value in values if value is not None and str(value)]
+        if texts:
+            yield texts
+def _iter_part_examples(
+    *,
+    spec: tuple[str, int, int | None],
+    tokenizer_path: str,
+    text_column: str | None,
+    detokenizer: str | None,
+    max_len: int,
+    packing_mode: str,
+    min_len: int,
+    add_eos: bool,
+    add_special_tokens: bool,
+    tokenize_batch_size: int,
+) -> Iterator[dict]:
+    """Tokenize one parquet row range and yield ``{"input_ids", "sequence_length"}`` examples.
+    ``spec`` is ``(path, row_start, row_stop)``. Texts are read in batches of
+    ``tokenize_batch_size``, optionally detokenized, encoded with the BPE
+    tokenizer, and then cut according to ``packing_mode``:
+    * ``"record_chunks"``: each record is split into ``max_len`` chunks on its own.
+    * ``"packed_records"``: whole records are packed together up to ``max_len``;
+      records longer than ``max_len`` are chunked on their own.
+    * any other value: all records are concatenated into one stream that is cut
+      into ``max_len`` slices.
+    Chunks and packs shorter than ``min_len`` are dropped, except for the full
+    ``max_len`` slices of the stream mode, which are always emitted.
+    """
+    from flowtext_lab.text_detokenization import detokenize_text, infer_detokenizer_name
+    from flowtext_lab.tokenization import BpeTextTokenizer
+    path = Path(spec[0])
+    row_start = int(spec[1])
+    row_stop = None if spec[2] is None else int(spec[2])
+    tokenizer = BpeTextTokenizer.from_file(tokenizer_path)
+    resolved_detok = infer_detokenizer_name(raw_path=str(path), explicit=detokenizer)
+    # Token buffer shared by the packing helpers below (rebound through ``nonlocal``).
+    pack: list[int] = []
+    def emit_ids(ids: list[int]) -> dict:
+        # Build one output row; ids are coerced to plain Python ints.
+        return {
+            "input_ids": [int(x) for x in ids],
+            "sequence_length": int(len(ids)),
+        }
+    def iter_record_chunks(ids: list[int]) -> Iterator[dict]:
+        # Split a single record into consecutive max_len chunks, dropping any
+        # chunk shorter than min_len.
+        for start in range(0, len(ids), max_len):
+            chunk = ids[start : start + max_len]
+            if len(chunk) >= min_len:
+                yield emit_ids(chunk)
+            # The range() above would stop here anyway; this break is redundant.
+            if start + max_len >= len(ids):
+                break
+    def flush_pack() -> Iterator[dict]:
+        # Emit the buffered tokens if they reach min_len, then reset the buffer
+        # (tokens below min_len are discarded).
+        nonlocal pack
+        if len(pack) >= min_len:
+            yield emit_ids(pack)
+        pack = []
+    def append_stream(ids: list[int]) -> Iterator[dict]:
+        # Stream mode: append to the buffer and emit every full max_len slice,
+        # keeping the remainder buffered for the next record.
+        nonlocal pack
+        pack.extend(int(x) for x in ids)
+        while len(pack) >= max_len:
+            yield emit_ids(pack[:max_len])
+            pack = pack[max_len:]
+    for texts in _iter_parquet_text_batches(
+        path,
+        text_column=text_column,
+        row_start=row_start,
+        row_stop=row_stop,
+        batch_size=max(1, int(tokenize_batch_size)),
+    ):
+        if resolved_detok:
+            texts = [detokenize_text(text, resolved_detok) for text in texts]
+        encoded = tokenizer.tokenizer.encode_batch(texts, add_special_tokens=add_special_tokens)
+        for enc in encoded:
+            ids = list(enc.ids)
+            if add_eos:
+                ids.append(tokenizer.eos_id)
+            if not ids:
+                continue
+            if packing_mode == "record_chunks":
+                yield from iter_record_chunks(ids)
+            elif packing_mode == "packed_records":
+                if len(ids) > max_len:
+                    # Oversized record: emit what is buffered, then chunk the
+                    # record by itself so it never shares a pack.
+                    yield from flush_pack()
+                    yield from iter_record_chunks(ids)
+                else:
+                    # Flush first if adding this record would overflow the pack.
+                    if pack and len(pack) + len(ids) > max_len:
+                        yield from flush_pack()
+                    pack.extend(int(x) for x in ids)
+                    if len(pack) >= max_len:
+                        yield from flush_pack()
+            else:
+                yield from append_stream(ids)
+    # Emit whatever is still buffered once the input is exhausted.
+    # NOTE(review): the stream branch above runs for every mode other than the
+    # two named ones, but its leftover is only flushed when the mode is exactly
+    # "stream_chunks" -- confirm that is the only other accepted value.
+    if packing_mode in ("packed_records", "stream_chunks"):
+        yield from flush_pack()
+def _build_part(task: dict) -> dict:
+    from datasets import Dataset, Features, Sequence, Value, disable_progress_bars
+    disable_progress_bars()
+    part_dir = Path(task["part_dir"])
+    if part_dir.exists():
+        shutil.rmtree(part_dir)
+    features = Features(
+        {
+            "input_ids": Sequence(Value("int32")),
+            "sequence_length": Value("int64"),
+        }
+    )
+    ds = Dataset.from_generator(
+        _iter_part_examples,
+        gen_kwargs={
+            "spec": task["spec"],
+            "tokenizer_path": task["tokenizer_path"],
+            "text_column": task["text_column"],
+            "detokenizer": task["detokenizer"],
+            "max_len": task["max_len"],
+            "packing_mode": task["packing_mode"],
+            "min_len": task["min_len"],
+            "add_eos": task["add_eos"],
+            "add_special_tokens": task["add_special_tokens"],
+            "tokenize_batch_size": task["tokenize_batch_size"],
+        },
+        features=features,
+        cache_dir=task["cache_dir"] or None,
+    )
+    ds.save_to_disk(str(part_dir), max_shard_size=task["max_shard_size"])
+    lengths = ds["sequence_length"] if len(ds) else []
+    total_tokens = int(sum(int(x) for x in lengths))
+    if task["cache_dir"]:
+        shutil.rmtree(task["cache_dir"], ignore_errors=True)
+    return {
+        "part_dir": str(part_dir),
+        "num_examples": int(len(ds)),
+        "total_tokens": total_tokens,
+        "spec": task["spec"],
+    }
+def _part_is_complete(part_dir: Path) -> bool:
+    return (part_dir / "state.json").exists() and any(part_dir.glob("data-*.arrow"))
+def _summarize_part(part_dir: Path, spec: tuple[str, int, int | None]) -> dict:
+    from datasets import load_from_disk
+    ds = load_from_disk(str(part_dir))
+    lengths = ds["sequence_length"] if len(ds) else []
+    total_tokens = int(sum(int(x) for x in lengths))
+    return {
+        "part_dir": str(part_dir),
+        "num_examples": int(len(ds)),
+        "total_tokens": total_tokens,
+        "spec": spec,
+    }
+def _preload_datasets_for_fork() -> None:
+    """Import ``datasets`` in the calling process and disable its progress bars.
+    ``_parallel_build`` calls this right before it creates the worker pool.
+    """
+    # Importing datasets pulls in fsspec, which scans Python entry points.
+    # On this machine that scan can intermittently hit a corrupt/fragile zipped
+    # egg when many workers import at once. Preloading in the parent lets forked
+    # workers reuse sys.modules instead of racing through the entry point scan.
+    from datasets import Dataset, Features, Sequence, Value, disable_progress_bars, load_from_disk  # noqa: F401
+    disable_progress_bars()
+def _parallel_build(args: argparse.Namespace) -> dict:
+    """Build the dataset as one part per input spec using a process pool.
+    Parts are written to ``<output_dir>/parts/part-NNNNN``. With
+    ``--resume_parts``, parts that already look complete are summarized from
+    disk instead of being rebuilt. A summary is written to
+    ``elf_multi_part_meta.json``. With ``--merge_parts`` the non-empty parts
+    are then concatenated into a single dataset stored directly in
+    ``output_dir`` and ``elf_build_meta.json`` is written.
+    Returns the metadata dict (including the per-part summaries under ``"parts"``).
+    Raises ``RuntimeError`` when no input specs are found.
+    """
+    from concurrent.futures import ProcessPoolExecutor, as_completed
+    specs = _make_limited_specs(args)
+    if not specs:
+        raise RuntimeError("No input file specs found")
+    output_dir = Path(args.output_dir)
+    parts_root = output_dir / "parts"
+    parts_root.mkdir(parents=True, exist_ok=True)
+    tasks = []
+    part_results = []
+    for idx, spec in enumerate(specs):
+        part_dir = parts_root / f"part-{idx:05d}"
+        # Reuse a finished part instead of rebuilding it when resuming.
+        if args.resume_parts and _part_is_complete(part_dir):
+            part_results.append(_summarize_part(part_dir, spec))
+            continue
+        # Everything a worker needs travels in this plain dict.
+        tasks.append(
+            {
+                "part_dir": str(part_dir),
+                "spec": spec,
+                "tokenizer_path": args.tokenizer_path,
+                "text_column": args.text_column,
+                "detokenizer": args.detokenizer,
+                "max_len": int(args.max_len),
+                "packing_mode": args.packing_mode,
+                "min_len": int(args.min_len),
+                "add_eos": bool(args.add_eos),
+                "add_special_tokens": bool(args.add_special_tokens),
+                "tokenize_batch_size": int(args.tokenize_batch_size),
+                # Each part gets its own cache sub-directory; "" means no explicit cache.
+                "cache_dir": str(Path(args.cache_dir) / f"part-{idx:05d}") if args.cache_dir else "",
+                "max_shard_size": args.max_shard_size,
+            }
+        )
+    print(
+        f"[build] specs={len(specs)} existing={len(part_results)} "
+        f"todo={len(tasks)} num_proc={args.num_proc} output={output_dir}",
+        flush=True,
+    )
+    if tasks:
+        _preload_datasets_for_fork()
+        with ProcessPoolExecutor(max_workers=max(1, int(args.num_proc))) as pool:
+            futures = [pool.submit(_build_part, task) for task in tasks]
+            # Results are reported in completion order; fut.result() re-raises
+            # any exception raised inside a worker.
+            for done, fut in enumerate(as_completed(futures), start=1):
+                result = fut.result()
+                part_results.append(result)
+                print(
+                    "[build] "
+                    f"{done}/{len(futures)} {Path(result['part_dir']).name} "
+                    f"examples={result['num_examples']} tokens={result['total_tokens']}",
+                    flush=True,
+                )
+    # Restore part order (part-00000, part-00001, ...) after out-of-order completion.
+    part_results.sort(key=lambda x: x["part_dir"])
+    total_examples = sum(int(x["num_examples"]) for x in part_results)
+    total_tokens = sum(int(x["total_tokens"]) for x in part_results)
+    meta = {
+        "builder": "build_owt_t5_elf_dataset.py",
+        "format": f"elf_unconditional_tokenized_{args.packing_mode}_multipart",
+        "data_path": args.data_path,
+        "tokenizer_path": args.tokenizer_path,
+        "text_column": args.text_column,
+        "openwebtext_split": args.openwebtext_split,
+        "openwebtext_valid_records": args.openwebtext_valid_records,
+        "max_len": args.max_len,
+        "packing_mode": args.packing_mode,
+        "max_records": args.max_records,
+        "min_len": args.min_len,
+        "add_eos": args.add_eos,
+        "add_special_tokens": args.add_special_tokens,
+        "num_parts": len(part_results),
+        "num_examples": int(total_examples),
+        "total_tokens": int(total_tokens),
+        "mean_length": float(total_tokens / total_examples) if total_examples else 0.0,
+        "parts": part_results,
+    }
+    (output_dir / "elf_multi_part_meta.json").write_text(json.dumps(meta, indent=2, sort_keys=True), encoding="utf-8")
+    if args.merge_parts:
+        from datasets import concatenate_datasets, load_from_disk
+        # Write the merged dataset to a scratch directory first so output_dir is
+        # only rearranged after the save has succeeded.
+        merged_tmp = output_dir / "_merged_tmp"
+        if merged_tmp.exists():
+            shutil.rmtree(merged_tmp)
+        # NOTE(review): if every part is empty this list is empty and
+        # concatenate_datasets() is called with no datasets -- presumably an
+        # error; confirm whether that case needs explicit handling.
+        datasets = [load_from_disk(result["part_dir"]) for result in part_results if result["num_examples"] > 0]
+        merged = datasets[0] if len(datasets) == 1 else concatenate_datasets(datasets)
+        merged.save_to_disk(str(merged_tmp), max_shard_size=args.max_shard_size)
+        # Clear everything in output_dir except the scratch dir and the parts.
+        # This also removes the elf_multi_part_meta.json written just above.
+        for child in list(output_dir.iterdir()):
+            if child.name in {"_merged_tmp", "parts"}:
+                continue
+            if child.is_dir():
+                shutil.rmtree(child)
+            else:
+                child.unlink()
+        # Promote the merged dataset files into output_dir itself.
+        for child in list(merged_tmp.iterdir()):
+            child.rename(output_dir / child.name)
+        merged_tmp.rmdir()
+        if not args.keep_parts:
+            shutil.rmtree(parts_root)
+        meta["format"] = f"elf_unconditional_tokenized_{args.packing_mode}"
+        (output_dir / "elf_build_meta.json").write_text(json.dumps(meta, indent=2, sort_keys=True), encoding="utf-8")
+    return meta
+def main() -> None:
+    args = parse_args()
+    output_dir = Path(args.output_dir)
+    if args.stats_only:
+        print(json.dumps(_stats(args), indent=2, sort_keys=True))
+        return
+    if output_dir.exists():
+        if not args.overwrite:
+            if not args.resume_parts:
+                raise SystemExit(f"output_dir exists: {output_dir}; pass --overwrite to replace it")
+        elif not args.resume_parts:
+            shutil.rmtree(output_dir)
+    output_dir.mkdir(parents=True, exist_ok=True)
+    if args.num_proc > 1:
+        meta = _parallel_build(args)
+        print(json.dumps({k: v for k, v in meta.items() if k != "parts"}, indent=2, sort_keys=True))
+        return
+    from datasets import Dataset, Features, Sequence, Value
+    features = Features(
+        {
+            "input_ids": Sequence(Value("int32")),
+            "sequence_length": Value("int64"),
+        }
+    )
+    ds = Dataset.from_generator(
+        _iter_examples,
+        gen_kwargs=_gen_kwargs(args),
+        features=features,
+        cache_dir=args.cache_dir or None,
+    )
+    ds.save_to_disk(str(output_dir), max_shard_size=args.max_shard_size)
+    meta = {
+        "builder": "build_owt_t5_elf_dataset.py",
+        "format": f"elf_unconditional_tokenized_{args.packing_mode}",
+        "data_path": args.data_path,
+        "tokenizer_path": args.tokenizer_path,
+        "text_column": args.text_column,
+        "openwebtext_split": args.openwebtext_split,
+        "openwebtext_valid_records": args.openwebtext_valid_records,
+        "max_len": args.max_len,
+        "packing_mode": args.packing_mode,
+        "max_records": args.max_records,
+        "min_len": args.min_len,
+        "add_eos": args.add_eos,
+        "add_special_tokens": args.add_special_tokens,
+        "num_examples": int(len(ds)),
+        "columns": list(ds.column_names),
+    }
+    (output_dir / "elf_build_meta.json").write_text(json.dumps(meta, indent=2, sort_keys=True), encoding="utf-8")
+    print(json.dumps(meta, indent=2, sort_keys=True))
+if __name__ == "__main__":
+    main()

LTA_openwebtext_dualt/scripts/eval_dirichlet_latest_key3_state_20260508.py ADDED Viewed

	@@ -0,0 +1,51 @@

+#!/usr/bin/env python3
+from __future__ import annotations
+import importlib.util
+import sys
+from pathlib import Path
+BASE = Path(__file__).with_name("eval_c1024_decode_sweep_20260507.py")
+spec = importlib.util.spec_from_file_location("eval_c1024_decode_sweep_20260507", BASE)
+if spec is None or spec.loader is None:
+    raise RuntimeError(f"cannot import {BASE}")
+base = importlib.util.module_from_spec(spec)
+sys.modules[spec.name] = base
+spec.loader.exec_module(base)
+def key_configs() -> list[base.DecodeConfig]:
+    return [
+        base.DecodeConfig(
+            "match_post_sem1_state_c16_t1p3",
+            "post",
+            1.0,
+            1.0,
+            "state",
+            endpoint_temp=1.3,
+            concentration_max=16.0,
+        ),
+        base.DecodeConfig(
+            "match_post_sem1_state_c64_t1p3",
+            "post",
+            1.0,
+            1.0,
+            "state",
+            endpoint_temp=1.3,
+            concentration_max=64.0,
+        ),
+        base.DecodeConfig(
+            "match_post_sem1_state_c1024_t1p3",
+            "post",
+            1.0,
+            1.0,
+            "state",
+            endpoint_temp=1.3,
+            concentration_max=1024.0,
+        ),
+    ]
+base.default_configs = key_configs
+base.main()

LTA_openwebtext_dualt/scripts/infer_lta_owt_t5_len128_uniform10k_then_lognsr_latest.sh ADDED Viewed

	@@ -0,0 +1,113 @@

+#!/usr/bin/env bash
+# Runs scripts/standard_genppl_entropy_latest_decode.py on the newest checkpoint
+# of a matching run and prints the per-temperature summaries.
+# Every setting below keeps a non-empty value found in the environment and
+# otherwise takes the default shown.
+set -euo pipefail
+cd /e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt
+: "${CUDA_VISIBLE_DEVICES:=0}"
+export CUDA_VISIBLE_DEVICES
+export PYTHONUNBUFFERED=1
+export TOKENIZERS_PARALLELISM=false
+# Run lookup and external assets.
+: "${RUN_PREFIX:=lta_owt_t5_len128_uniform10k_then_lognsr}"
+: "${TOKENIZER_PATH:=/e2e-data/evad-tech-vla/wanghan58/models/hf/t5-small/tokenizer.json}"
+: "${SCORER:=/e2e-data/evad-tech-vla/wanghan58/models/flowtext_scorers/gpt2-large-standard}"
+# Sample counts and batch sizes.
+: "${N_SAMPLES:=8}"
+: "${DECODE_BATCH:=4}"
+: "${SCORE_BATCH:=4}"
+: "${MAX_LEN:=128}"
+# Decoding schedule.
+: "${STEPS:=1024}"
+: "${ENDPOINT_TEMPS:=1.0,1.15,1.30,1.45}"
+: "${DECODE_RULE:=dirichlet_resample}"
+: "${MODEL_T_MODE:=post}"
+: "${TIME_SCHEDULE:=lognsr_gumbel}"
+: "${TIME_GUMBEL_LOC:=2.2}"
+: "${TIME_GUMBEL_SCALE:=0.8}"
+: "${CONCENTRATION_MIN:=1}"
+: "${CONCENTRATION_MAX:=64}"
+: "${NOISE_INIT:=dirichlet}"
+: "${FINAL_FROM:=state}"
+: "${FINAL_SAMPLE_MODE:=argmax}"
+pick_run() {
+  local suffix="$1"
+  find runs -maxdepth 1 -type d -name "${RUN_PREFIX}*${suffix}" -printf "%T@ %p\n" 2>/dev/null \
+    | sort -nr \
+    | head -n 1 \
+    | cut -d' ' -f2-
+}
+# Resolve the run directory: an explicit RUN_DIR wins, then the resumed stage,
+# then the warmup stage. `|| true` keeps a failed lookup from aborting the
+# script under `set -e`, so the error message below can still be printed.
+RUN_DIR="${RUN_DIR:-}"
+if [[ -z "${RUN_DIR}" ]]; then
+  RUN_DIR="$(pick_run "_resume_lognsr_sde_rollin")" || true
+fi
+if [[ -z "${RUN_DIR}" ]]; then
+  RUN_DIR="$(pick_run "_warmup_uniform_norollin")" || true
+fi
+if [[ -z "${RUN_DIR}" || ! -d "${RUN_DIR}" ]]; then
+  echo "[infer] could not find run dir for prefix=${RUN_PREFIX}" >&2
+  exit 1
+fi
+# Resolve the checkpoint: an explicit CKPT wins, otherwise the highest step.
+# Version sort (-V) compares the step numbers numerically, so step_10000.pt
+# ranks after step_9000.pt even when the numbers are not zero-padded.
+CKPT="${CKPT:-}"
+if [[ -z "${CKPT}" ]]; then
+  CKPT="$(ls -1 "${RUN_DIR}"/step_*.pt 2>/dev/null | sort -V | tail -n 1 || true)"
+fi
+if [[ -z "${CKPT}" || ! -f "${CKPT}" ]]; then
+  echo "[infer] could not find checkpoint under ${RUN_DIR}" >&2
+  exit 1
+fi
+# The output location is derived from the run and checkpoint names.
+RUN_BASENAME="$(basename "${RUN_DIR}")"
+CKPT_BASENAME="$(basename "${CKPT}" .pt)"
+OUT_DIR="${OUT_DIR:-docs/lta_samples/metrics_20260519/${RUN_BASENAME}_${CKPT_BASENAME}_len128_lm1bgood_sdeish_n${N_SAMPLES}}"
+OUT_JSONL="${OUT_DIR}/summary.jsonl"
+mkdir -p "${OUT_DIR}"
+echo "[infer] run=${RUN_DIR}"
+echo "[infer] ckpt=${CKPT}"
+echo "[infer] out=${OUT_JSONL}"
+echo "[infer] decode_rule=${DECODE_RULE} steps=${STEPS} cmax=${CONCENTRATION_MAX} model_t=${MODEL_T_MODE} temps=${ENDPOINT_TEMPS}"
+python scripts/standard_genppl_entropy_latest_decode.py \
+  --checkpoint "${CKPT}" \
+  --tokenizer_path "${TOKENIZER_PATH}" \
+  --scorer "${SCORER}" \
+  --output "${OUT_JSONL}" \
+  --max_len "${MAX_LEN}" \
+  --n_samples "${N_SAMPLES}" \
+  --decode_batch "${DECODE_BATCH}" \
+  --score_batch "${SCORE_BATCH}" \
+  --score_max_length "${MAX_LEN}" \
+  --steps "${STEPS}" \
+  --model_t_mode "${MODEL_T_MODE}" \
+  --decode_time_schedule "${TIME_SCHEDULE}" \
+  --decode_time_gumbel_loc "${TIME_GUMBEL_LOC}" \
+  --decode_time_gumbel_scale "${TIME_GUMBEL_SCALE}" \
+  --decode_rule "${DECODE_RULE}" \
+  --concentration_min "${CONCENTRATION_MIN}" \
+  --concentration_max "${CONCENTRATION_MAX}" \
+  --noise_init "${NOISE_INIT}" \
+  --endpoint_temps "${ENDPOINT_TEMPS}" \
+  --final_from "${FINAL_FROM}" \
+  --final_sample_mode "${FINAL_SAMPLE_MODE}" \
+  --save_samples "${N_SAMPLES}"
+echo "[infer] summaries:"
+python - "${OUT_JSONL}" <<'PY'
+import json, sys
+path = sys.argv[1]
+with open(path, encoding="utf-8") as f:
+    for line in f:
+        row = json.loads(line)
+        if row.get("type") != "summary":
+            continue
+        d = row["decode"]
+        stripped = row.get("stripped_genppl", {})
+        div = row.get("diversity", {})
+        print(
+            f"temp={d['endpoint_temp']:.2f} final={d['final_from']} "
+            f"ppl={stripped.get('ppl')} entropy={div.get('sample_entropy')} "
+            f"top_mass={div.get('top_token_mass')}"
+        )
+PY

LTA_openwebtext_dualt/scripts/launch_lta_lm1b_categorical_fullvocab_c1024_fullycoupled_8gpu_small_1m.sh ADDED Viewed

	@@ -0,0 +1,150 @@

+#!/usr/bin/env bash
+set -euo pipefail
+cd /e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt
+export PYTHONPATH="$(pwd)${PYTHONPATH:+:$PYTHONPATH}"
+export TOKENIZERS_PARALLELISM=false
+export PYTHONUNBUFFERED=1
+export NCCL_DEBUG="${NCCL_DEBUG:-WARN}"
+export TORCH_DISTRIBUTED_TIMEOUT="${TORCH_DISTRIBUTED_TIMEOUT:-3600}"
+# Fully-coupled t ablation:
+#   model_t == support/Dirichlet t == semantic endpoint t
+RUN_NAME="${RUN_NAME:-lta_lm1b_dirichlet_categorical_fullvocab_c1024_fullycoupled_flmpack_onehot_hardce_ddit_small_len128_gbs512_8gpu_1m_nw0}"
+DATA_PATH="${DATA_PATH:-data/lm1b_train_parquet}"
+TOKENIZER_PATH="${TOKENIZER_PATH:-/e2e-data/evad-tech-vla/wanghan58/workspace/imagenet_handoff_20260327/nlp_dts_light/assets/distilbert-base-uncased/tokenizer.json}"
+TEXT_COLUMN="${TEXT_COLUMN:-}"
+OPENWEBTEXT_SPLIT="${OPENWEBTEXT_SPLIT:-all}"
+SAVE_DIR="${SAVE_DIR:-runs/${RUN_NAME}}"
+LOG_FILE="${LOG_FILE:-logs/${RUN_NAME}.log}"
+NNODES="${NNODES:-1}"
+NPROC_PER_NODE="${NPROC_PER_NODE:-8}"
+NODE_RANK="${NODE_RANK:-0}"
+MASTER_ADDR="${MASTER_ADDR:-127.0.0.1}"
+MASTER_PORT="${MASTER_PORT:-29631}"
+GLOBAL_BATCH_SIZE="${GLOBAL_BATCH_SIZE:-512}"
+PER_GPU_BATCH_SIZE="${PER_GPU_BATCH_SIZE:-64}"
+TOTAL_STEPS="${TOTAL_STEPS:-1000000}"
+WARMUP_STEPS="${WARMUP_STEPS:-2500}"
+MAX_LEN="${MAX_LEN:-128}"
+WRAP_MODE="${WRAP_MODE:-stream}"
+WRAP_RECORD_BUFFER_SIZE="${WRAP_RECORD_BUFFER_SIZE:-200}"
+NUM_WORKERS="${NUM_WORKERS:-0}"
+LOG_EVERY="${LOG_EVERY:-100}"
+SAVE_EVERY="${SAVE_EVERY:-20000}"
+LATEST_EVERY="${LATEST_EVERY:-1000}"
+EVAL_EVERY="${EVAL_EVERY:-0}"
+RESUME_PATH="${RESUME_PATH:-}"
+ALLOW_EXISTING_SAVE_DIR="${ALLOW_EXISTING_SAVE_DIR:-0}"
+ENABLE_TORCH_COMPILE="${ENABLE_TORCH_COMPILE:-0}"
+FORCE_DISABLE_TORCH_COMPILE="${FORCE_DISABLE_TORCH_COMPILE:-1}"
+if [[ "${FORCE_DISABLE_TORCH_COMPILE}" == "1" ]]; then
+  ENABLE_TORCH_COMPILE=0
+fi
+if [[ "${DATA_PATH}" == *"lm1b_train_parquet"* && "${NUM_WORKERS}" != "0" ]]; then
+  echo "LM1B has only 9 parquet shards; forcing NUM_WORKERS=0 to avoid empty DDP dataloader shards." >&2
+  NUM_WORKERS=0
+fi
+COMPILE_ARGS=()
+if [[ "${ENABLE_TORCH_COMPILE}" == "1" ]]; then
+  COMPILE_ARGS+=(--torch_compile --compile_mode reduce-overhead)
+fi
+RESUME_ARGS=()
+if [[ -n "${RESUME_PATH}" ]]; then
+  RESUME_ARGS+=(--resume_path "${RESUME_PATH}")
+fi
+TEXT_COLUMN_ARGS=()
+if [[ -n "${TEXT_COLUMN}" ]]; then
+  TEXT_COLUMN_ARGS+=(--text_column "${TEXT_COLUMN}")
+fi
+if [[ -f "${SAVE_DIR}/args.json" && -z "${RESUME_PATH}" && "${ALLOW_EXISTING_SAVE_DIR}" != "1" ]]; then
+  echo "Refusing to start because SAVE_DIR already contains args.json: ${SAVE_DIR}" >&2
+  echo "Use a new RUN_NAME/SAVE_DIR, set RESUME_PATH to resume, or set ALLOW_EXISTING_SAVE_DIR=1 intentionally." >&2
+  exit 2
+fi
+mkdir -p logs runs "${SAVE_DIR}"
+echo "[launch] method=categorical_fullvocab_c1024_fullycoupled host=$(hostname) time=$(date -Iseconds)"
+echo "[launch] cwd=$(pwd)"
+echo "[launch] run_name=${RUN_NAME}"
+echo "[launch] save_dir=${SAVE_DIR}"
+echo "[launch] log_file=${LOG_FILE}"
+# Make sure the log directory exists even when LOG_FILE is overridden to a
+# location outside logs/; otherwise `tee -a` fails and the log is lost.
+mkdir -p "$(dirname -- "${LOG_FILE}")"
+# Launch distributed training; stdout and stderr are shown and appended to
+# LOG_FILE. The optional flag arrays are expanded as ${arr[@]+"${arr[@]}"}
+# because a plain "${arr[@]}" on an empty array is an "unbound variable" error
+# under `set -u` on Bash older than 4.4, and all three are empty by default.
+python -m torch.distributed.run \
+  --nnodes="${NNODES}" \
+  --nproc_per_node="${NPROC_PER_NODE}" \
+  --node_rank="${NODE_RANK}" \
+  --master_addr="${MASTER_ADDR}" \
+  --master_port="${MASTER_PORT}" \
+  train.py \
+  --data_path "${DATA_PATH}" \
+  ${TEXT_COLUMN_ARGS[@]+"${TEXT_COLUMN_ARGS[@]}"} \
+  --openwebtext_split "${OPENWEBTEXT_SPLIT}" \
+  --tokenizer_path "${TOKENIZER_PATH}" \
+  --save_dir "${SAVE_DIR}" \
+  --wrap \
+  --wrap_mode "${WRAP_MODE}" \
+  --wrap_record_buffer_size "${WRAP_RECORD_BUFFER_SIZE}" \
+  --max_len "${MAX_LEN}" \
+  --batch_size "${PER_GPU_BATCH_SIZE}" \
+  --num_workers "${NUM_WORKERS}" \
+  --global_batch_size "${GLOBAL_BATCH_SIZE}" \
+  --total_steps "${TOTAL_STEPS}" \
+  --log_every "${LOG_EVERY}" \
+  --eval_every "${EVAL_EVERY}" \
+  --save_every "${SAVE_EVERY}" \
+  --latest_every "${LATEST_EVERY}" \
+  --lr 3e-4 \
+  --weight_decay 0 \
+  --adam_beta1 0.9 \
+  --adam_beta2 0.999 \
+  --adam_eps 1e-8 \
+  --warmup_steps "${WARMUP_STEPS}" \
+  --lr_schedule constant_warmup \
+  --grad_clip 1.0 \
+  --seed 123 \
+  --d_model 768 \
+  --cond_dim 128 \
+  --n_layers 12 \
+  --n_heads 12 \
+  --dim_ff 3072 \
+  --dropout 0.1 \
+  --model_type ddit \
+  --state_format prob \
+  --bridge dirichlet \
+  --target_loss hard_ce \
+  --target_prob 1.0 \
+  --min_t 0.0 \
+  --max_t 1.0 \
+  --dual_t \
+  --corrupt_t_mode same \
+  --corrupt_min_t 0.0 \
+  --corrupt_max_t 1.0 \
+  --min_mask_ratio 0.1 \
+  --max_mask_ratio 1.0 \
+  --wrong_token_replace_prob 1.0 \
+  --wrong_token_schedule linear_t \
+  --wrong_token_exp_k 1.0 \
+  --dirichlet_concentration_min 1.0 \
+  --dirichlet_concentration_max 1024.0 \
+  --dirichlet_endpoint_mode categorical_dual_t \
+  --dirichlet_semantic_t_mode same \
+  --dirichlet_semantic_t_value 0.0 \
+  --categorical_wrong_from_full_vocab \
+  --simplex_bridge_sampler dirichlet \
+  --eps 1e-8 \
+  --infer_steps 128 \
+  --decode_damping 1.0 \
+  --max_gamma 1.0 \
+  --decode_solver flowmap \
+  --noise_init logistic_normal \
+  --bridge_noise_init logistic_normal \
+  --noise_sigma -1 \
+  ${RESUME_ARGS[@]+"${RESUME_ARGS[@]}"} \
+  ${COMPILE_ARGS[@]+"${COMPILE_ARGS[@]}"} \
+  --bf16 2>&1 | tee -a "${LOG_FILE}"

LTA_openwebtext_dualt/scripts/launch_lta_lm1b_categorical_fullvocab_c16_dualt_4gpu_small_1m.sh ADDED Viewed

	@@ -0,0 +1,155 @@

+#!/usr/bin/env bash
+set -euo pipefail
+cd /e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt
+export PYTHONPATH="$(pwd)${PYTHONPATH:+:$PYTHONPATH}"
+export TOKENIZERS_PARALLELISM=false
+export PYTHONUNBUFFERED=1
+export NCCL_DEBUG="${NCCL_DEBUG:-WARN}"
+export TORCH_DISTRIBUTED_TIMEOUT="${TORCH_DISTRIBUTED_TIMEOUT:-3600}"
+# C=16 categorical dual-t LM1B, full-vocab wrong-token endpoint.
+# This is the 4-GPU counterpart of the 8-GPU full-vocab run; global batch stays 512.
+C_MAX="${C_MAX:-16.0}"
+C_TAG="${C_TAG:-c${C_MAX//./p}}"
+RUN_NAME="${RUN_NAME:-lta_lm1b_dirichlet_categorical_fullvocab_${C_TAG}_dualt_flmpack_onehot_hardce_ddit_small_len128_gbs512_4gpu_1m_nw0}"
+DATA_PATH="${DATA_PATH:-data/lm1b_train_parquet}"
+TOKENIZER_PATH="${TOKENIZER_PATH:-/e2e-data/evad-tech-vla/wanghan58/workspace/imagenet_handoff_20260327/nlp_dts_light/assets/distilbert-base-uncased/tokenizer.json}"
+DETOKENIZER="${DETOKENIZER:-auto}"
+TEXT_COLUMN="${TEXT_COLUMN:-}"
+OPENWEBTEXT_SPLIT="${OPENWEBTEXT_SPLIT:-all}"
+SAVE_DIR="${SAVE_DIR:-runs/${RUN_NAME}}"
+LOG_FILE="${LOG_FILE:-logs/${RUN_NAME}.log}"
+NNODES="${NNODES:-1}"
+NPROC_PER_NODE="${NPROC_PER_NODE:-4}"
+NODE_RANK="${NODE_RANK:-0}"
+MASTER_ADDR="${MASTER_ADDR:-127.0.0.1}"
+MASTER_PORT="${MASTER_PORT:-29641}"
+GLOBAL_BATCH_SIZE="${GLOBAL_BATCH_SIZE:-512}"
+PER_GPU_BATCH_SIZE="${PER_GPU_BATCH_SIZE:-64}"
+TOTAL_STEPS="${TOTAL_STEPS:-1000000}"
+WARMUP_STEPS="${WARMUP_STEPS:-2500}"
+MAX_LEN="${MAX_LEN:-128}"
+WRAP_MODE="${WRAP_MODE:-stream}"
+WRAP_RECORD_BUFFER_SIZE="${WRAP_RECORD_BUFFER_SIZE:-200}"
+NUM_WORKERS="${NUM_WORKERS:-0}"
+LOG_EVERY="${LOG_EVERY:-100}"
+SAVE_EVERY="${SAVE_EVERY:-20000}"
+LATEST_EVERY="${LATEST_EVERY:-1000}"
+EVAL_EVERY="${EVAL_EVERY:-0}"
+RESUME_PATH="${RESUME_PATH:-}"
+ALLOW_EXISTING_SAVE_DIR="${ALLOW_EXISTING_SAVE_DIR:-0}"
+ENABLE_TORCH_COMPILE="${ENABLE_TORCH_COMPILE:-0}"
+FORCE_DISABLE_TORCH_COMPILE="${FORCE_DISABLE_TORCH_COMPILE:-1}"
+if [[ "${FORCE_DISABLE_TORCH_COMPILE}" == "1" ]]; then
+  ENABLE_TORCH_COMPILE=0
+fi
+if [[ "${DATA_PATH}" == *"lm1b_train_parquet"* && "${NUM_WORKERS}" != "0" ]]; then
+  echo "LM1B has only 9 parquet shards; forcing NUM_WORKERS=0 to avoid empty DDP dataloader shards." >&2
+  NUM_WORKERS=0
+fi
+COMPILE_ARGS=()
+if [[ "${ENABLE_TORCH_COMPILE}" == "1" ]]; then
+  COMPILE_ARGS+=(--torch_compile --compile_mode reduce-overhead)
+fi
+RESUME_ARGS=()
+if [[ -n "${RESUME_PATH}" ]]; then
+  RESUME_ARGS+=(--resume_path "${RESUME_PATH}")
+fi
+TEXT_COLUMN_ARGS=()
+if [[ -n "${TEXT_COLUMN}" ]]; then
+  TEXT_COLUMN_ARGS+=(--text_column "${TEXT_COLUMN}")
+fi
+if [[ -f "${SAVE_DIR}/args.json" && -z "${RESUME_PATH}" && "${ALLOW_EXISTING_SAVE_DIR}" != "1" ]]; then
+  echo "Refusing to start because SAVE_DIR already contains args.json: ${SAVE_DIR}" >&2
+  echo "Use a new RUN_NAME/SAVE_DIR, set RESUME_PATH to resume, or set ALLOW_EXISTING_SAVE_DIR=1 intentionally." >&2
+  exit 2
+fi
+mkdir -p logs runs "${SAVE_DIR}"
+echo "[launch] method=categorical_fullvocab C_MAX=${C_MAX} host=$(hostname) time=$(date -Iseconds)"
+echo "[launch] cwd=$(pwd)"
+echo "[launch] run_name=${RUN_NAME}"
+echo "[launch] save_dir=${SAVE_DIR}"
+echo "[launch] log_file=${LOG_FILE}"
+echo "[launch] nproc_per_node=${NPROC_PER_NODE} global_batch_size=${GLOBAL_BATCH_SIZE} per_gpu_batch_size=${PER_GPU_BATCH_SIZE}"
+# Make sure the log directory exists even when LOG_FILE is overridden to a
+# location outside logs/; otherwise `tee -a` fails and the log is lost.
+mkdir -p "$(dirname -- "${LOG_FILE}")"
+# Launch distributed training; stdout and stderr are shown and appended to
+# LOG_FILE. The optional flag arrays are expanded as ${arr[@]+"${arr[@]}"}
+# because a plain "${arr[@]}" on an empty array is an "unbound variable" error
+# under `set -u` on Bash older than 4.4, and all three are empty by default.
+python -m torch.distributed.run \
+  --nnodes="${NNODES}" \
+  --nproc_per_node="${NPROC_PER_NODE}" \
+  --node_rank="${NODE_RANK}" \
+  --master_addr="${MASTER_ADDR}" \
+  --master_port="${MASTER_PORT}" \
+  train.py \
+  --data_path "${DATA_PATH}" \
+  ${TEXT_COLUMN_ARGS[@]+"${TEXT_COLUMN_ARGS[@]}"} \
+  --openwebtext_split "${OPENWEBTEXT_SPLIT}" \
+  --detokenizer "${DETOKENIZER}" \
+  --tokenizer_path "${TOKENIZER_PATH}" \
+  --save_dir "${SAVE_DIR}" \
+  --wrap \
+  --wrap_mode "${WRAP_MODE}" \
+  --wrap_record_buffer_size "${WRAP_RECORD_BUFFER_SIZE}" \
+  --max_len "${MAX_LEN}" \
+  --batch_size "${PER_GPU_BATCH_SIZE}" \
+  --num_workers "${NUM_WORKERS}" \
+  --global_batch_size "${GLOBAL_BATCH_SIZE}" \
+  --total_steps "${TOTAL_STEPS}" \
+  --log_every "${LOG_EVERY}" \
+  --eval_every "${EVAL_EVERY}" \
+  --save_every "${SAVE_EVERY}" \
+  --latest_every "${LATEST_EVERY}" \
+  --lr 3e-4 \
+  --weight_decay 0 \
+  --adam_beta1 0.9 \
+  --adam_beta2 0.999 \
+  --adam_eps 1e-8 \
+  --warmup_steps "${WARMUP_STEPS}" \
+  --lr_schedule constant_warmup \
+  --grad_clip 1.0 \
+  --seed 123 \
+  --d_model 768 \
+  --cond_dim 128 \
+  --n_layers 12 \
+  --n_heads 12 \
+  --dim_ff 3072 \
+  --dropout 0.1 \
+  --model_type ddit \
+  --state_format prob \
+  --bridge dirichlet \
+  --target_loss hard_ce \
+  --target_prob 1.0 \
+  --min_t 0.0 \
+  --max_t 1.0 \
+  --dual_t \
+  --corrupt_t_mode independent \
+  --corrupt_min_t 0.0 \
+  --corrupt_max_t 1.0 \
+  --min_mask_ratio 0.1 \
+  --max_mask_ratio 1.0 \
+  --wrong_token_replace_prob 1.0 \
+  --wrong_token_schedule linear_t \
+  --wrong_token_exp_k 1.0 \
+  --dirichlet_concentration_min 1.0 \
+  --dirichlet_concentration_max "${C_MAX}" \
+  --dirichlet_endpoint_mode categorical_dual_t \
+  --dirichlet_semantic_t_mode independent \
+  --dirichlet_semantic_t_value 0.0 \
+  --categorical_wrong_from_full_vocab \
+  --eps 1e-8 \
+  --infer_steps 128 \
+  --decode_damping 1.0 \
+  --max_gamma 1.0 \
+  --decode_solver flowmap \
+  --noise_init logistic_normal \
+  --bridge_noise_init logistic_normal \
+  --noise_sigma -1 \
+  ${RESUME_ARGS[@]+"${RESUME_ARGS[@]}"} \
+  ${COMPILE_ARGS[@]+"${COMPILE_ARGS[@]}"} \
+  --bf16 2>&1 | tee -a "${LOG_FILE}"

LTA_openwebtext_dualt/scripts/launch_lta_owt_c1024_fullycoupled_8gpu_len1024_gpt2_cached_chunks_1m.sh ADDED Viewed

	@@ -0,0 +1,60 @@

+#!/usr/bin/env bash
+set -euo pipefail
+cd /e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt
+# Explicit cached-chunk OWT/GPT-2 run.
+# Uses the already-built cache:
+#   openwebtext_lta_cached_chunks/gpt2_len1024_train_minus_100k
+#
+# Data processing:
+#   tokenize records with GPT-2 tokenizer
+#   append GPT-2 EOT after each record
+#   concatenate stream
+#   split into payload_len=1022
+#   wrap as [EOT] + payload + [EOT]
+#   train from fixed memmap chunks with DistributedSampler shuffle
+#
+# Each setting below keeps a non-empty value found in the environment and
+# otherwise takes the default shown; all of them are exported further down for
+# the launcher invoked at the end.
+export OWT_CACHED_CHUNKS=1
+: "${OWT_CHUNK_CACHE_DIR:=/e2e-data/evad-tech-vla/wanghan58/data/small_benchmarks/langflow_2604_11748/openwebtext_lta_cached_chunks/gpt2_len1024_train_minus_100k}"
+# Default to reusing the prebuilt cache; set OWT_CHUNK_CACHE_REBUILD=1 only when
+# intentionally refreshing or repairing the cached chunk pool.
+: "${OWT_CHUNK_CACHE_REBUILD:=0}"
+: "${NPROC_PER_NODE:=8}"
+: "${PER_GPU_BATCH_SIZE:=32}"
+: "${GLOBAL_BATCH_SIZE:=512}"
+: "${TOTAL_STEPS:=1000000}"
+: "${WARMUP_STEPS:=2000}"
+: "${MAX_LEN:=1024}"
+: "${NUM_WORKERS:=4}"
+: "${DATALOADER_PREFETCH_FACTOR:=2}"
+: "${LOG_EVERY:=100}"
+: "${SAVE_EVERY:=20000}"
+: "${LATEST_EVERY:=1000}"
+: "${EVAL_EVERY:=0}"
+: "${ENABLE_TORCH_COMPILE:=0}"
+: "${ALLOW_EXISTING_SAVE_DIR:=0}"
+: "${OPTIMIZER:=adamw}"
+: "${MUON_MOMENTUM:=0.95}"
+: "${MUON_NS_STEPS:=5}"
+: "${MUON_UPDATE_SCALE:=1.0}"
+: "${EMA_DECAY:=0.0}"
+: "${EMA_START_STEP:=0}"
+: "${ALLOW_TF32:=1}"
+: "${ACTIVATION_CHECKPOINTING:=0}"
+: "${ACTIVATION_CHECKPOINT_INTERVAL:=1}"
+: "${DDP_STATIC_GRAPH:=0}"
+: "${DDP_GRADIENT_AS_BUCKET_VIEW:=1}"
+: "${BLOCKING_DATA_TRANSFER:=0}"
+: "${FULL_TRAIN_STATS:=0}"
+: "${DATA_PATH:=/e2e-data/evad-tech-vla/wanghan58/data/small_benchmarks/langflow_2604_11748/openwebtext}"
+: "${TOKENIZER_PATH:=/e2e-data/evad-tech-vla/wanghan58/models/flowtext_scorers/gpt2-standard/tokenizer.json}"
+: "${TEXT_COLUMN:=text}"
+: "${OPENWEBTEXT_SPLIT:=train_minus_100k}"
+: "${DETOKENIZER:=auto}"
+# The default run name encodes the batch size, GPU count, and worker count chosen above.
+: "${RUN_NAME:=lta_owt_dirichlet_categorical_fullvocab_c1024_fullycoupled_gpt2_cached_chunks_len1024_gbs${GLOBAL_BATCH_SIZE}_${NPROC_PER_NODE}gpu_1m_nw${NUM_WORKERS}}"
+export OWT_CHUNK_CACHE_DIR OWT_CHUNK_CACHE_REBUILD
+export NPROC_PER_NODE PER_GPU_BATCH_SIZE GLOBAL_BATCH_SIZE TOTAL_STEPS WARMUP_STEPS MAX_LEN
+export NUM_WORKERS DATALOADER_PREFETCH_FACTOR
+export LOG_EVERY SAVE_EVERY LATEST_EVERY EVAL_EVERY
+export ENABLE_TORCH_COMPILE ALLOW_EXISTING_SAVE_DIR
+export OPTIMIZER MUON_MOMENTUM MUON_NS_STEPS MUON_UPDATE_SCALE
+export EMA_DECAY EMA_START_STEP ALLOW_TF32
+export ACTIVATION_CHECKPOINTING ACTIVATION_CHECKPOINT_INTERVAL
+export DDP_STATIC_GRAPH DDP_GRADIENT_AS_BUCKET_VIEW BLOCKING_DATA_TRANSFER FULL_TRAIN_STATS
+export DATA_PATH TOKENIZER_PATH TEXT_COLUMN OPENWEBTEXT_SPLIT DETOKENIZER RUN_NAME
+bash scripts/launch_lta_owt_categorical_fullvocab_c1024_fullycoupled_8gpu_small_1m.sh

LTA_openwebtext_dualt/scripts/launch_lta_owt_compact_gpt2bpe_v8192_stream1024_fullycoupled_mask1_wd0p1_fp32_8gpu.sh ADDED Viewed

	@@ -0,0 +1,39 @@

+#!/usr/bin/env bash
+# 8k compact GPT2-BPE variant of the v2048 fully-coupled mask=1 baseline.
+# The training recipe itself stays in the v2048 script; this wrapper only
+# swaps tokenizer, data and run labels.
+set -euo pipefail
+cd /e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt
+# Every setting keeps a caller-supplied value and otherwise falls back to the
+# default written here; all of them are exported for the delegated script.
+: "${VOCAB_SIZE:=8192}"
+: "${DATA_PATH:=/e2e-data/evad-tech-vla/wanghan58/data/embedded-language-flows/openwebtext-compact-gpt2bpe-v8192-stream1024-train-minus-100k}"
+: "${TOKENIZER_PATH:=/e2e-data/evad-tech-vla/wanghan58/models/lta_tokenizers/owt_compact_gpt2bpe_v8192/tokenizer.json}"
+: "${COMPACT_VARIANT_LABEL:=compact_gpt2bpe_v8192_stream1024_fullycoupled_mask0p1-1p0_wd0p1_fp32}"
+: "${T_SAMPLING_MODE:=uniform}"
+: "${MIN_MASK_RATIO:=0.1}"
+: "${MAX_MASK_RATIO:=1.0}"
+export VOCAB_SIZE DATA_PATH TOKENIZER_PATH COMPACT_VARIANT_LABEL
+export T_SAMPLING_MODE MIN_MASK_RATIO MAX_MASK_RATIO
+sanitize_label() {
+  printf "%s" "$1" | sed -e 's/-/m/g' -e 's/\./p/g'
+}
+T_SAMPLING_LOGIT_MEAN_FOR_NAME="${T_SAMPLING_LOGIT_MEAN:--1.5}"
+T_SAMPLING_LOGIT_STD_FOR_NAME="${T_SAMPLING_LOGIT_STD:-0.8}"
+MIN_MASK_RATIO_FOR_NAME="${MIN_MASK_RATIO:-1.0}"
+MAX_MASK_RATIO_FOR_NAME="${MAX_MASK_RATIO:-1.0}"
+T_LOGIT_MEAN_LABEL="$(sanitize_label "${T_SAMPLING_LOGIT_MEAN_FOR_NAME}")"
+T_LOGIT_STD_LABEL="$(sanitize_label "${T_SAMPLING_LOGIT_STD_FOR_NAME}")"
+MIN_MASK_RATIO_LABEL="$(sanitize_label "${MIN_MASK_RATIO_FOR_NAME}")"
+MAX_MASK_RATIO_LABEL="$(sanitize_label "${MAX_MASK_RATIO_FOR_NAME}")"
+if [[ "${T_SAMPLING_MODE}" == "logit_normal" ]]; then
+  T_SAMPLING_LABEL="logitnormal_${T_LOGIT_MEAN_LABEL}_s${T_LOGIT_STD_LABEL}"
+else
+  T_SAMPLING_LABEL="$(sanitize_label "${T_SAMPLING_MODE}")t"
+fi
+export RUN_NAME="${RUN_NAME:-lta_owt_compact_gpt2bpe_v8192_stream1024_fullycoupled_rmsnorm_nobias_adamw_wd0p1_${T_SAMPLING_LABEL}_hardce_mask${MIN_MASK_RATIO_LABEL}-${MAX_MASK_RATIO_LABEL}_fp32_ddit768x12_gbs512_8gpu_1m_$(date +%Y%m%d_%H%M%S)}"
+export LOG_DIR="${LOG_DIR:-logs/compact_gpt2bpe_v8192_stream1024_fullycoupled_mask1_wd0p1_fp32_8gpu}"
+bash scripts/launch_lta_owt_compact_gpt2bpe_v2048_stream1024_fullycoupled_mask1_wd0p1_fp32_8gpu.sh

LTA_openwebtext_dualt/scripts/launch_lta_owt_elfaligned_t5_logitnormal_8gpu.sh ADDED Viewed

	@@ -0,0 +1,209 @@

+#!/usr/bin/env bash
+# ELF-aligned simplex run (author's description of the recipe):
+#   architecture: ddit_elf = no adaLN, prefix time tokens, qk norm, RoPE, RMSNorm, SwiGLU
+#   tokenizer/data: T5-small tokenizer, one OWT record per example, pad/truncate to 1024
+#   optimizer: Muon, lr 0.002, wd 0, constant LR after 0.5 epoch warmup
+#   time sampling: sigmoid(N(T_LOGIT_MEAN, T_LOGIT_STD^2)); defaults match ELF
+# The old ddit path and GPT2 cached scripts are untouched.
+set -euo pipefail
+cd /e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt
+# --- process environment, exported to everything started from this script ---
+: "${CUDA_VISIBLE_DEVICES:=0,1,2,3,4,5,6,7}"
+: "${OMP_NUM_THREADS:=1}"
+: "${NCCL_DEBUG:=WARN}"
+: "${TORCH_DISTRIBUTED_TIMEOUT:=3600}"
+export CUDA_VISIBLE_DEVICES OMP_NUM_THREADS NCCL_DEBUG TORCH_DISTRIBUTED_TIMEOUT
+export PYTHONPATH="$(pwd)${PYTHONPATH:+:$PYTHONPATH}"
+export TOKENIZERS_PARALLELISM=false
+export PYTHONUNBUFFERED=1
+# --- data and tokenizer (script-local; overridable from the environment) ---
+: "${DATA_PATH:=/e2e-data/evad-tech-vla/wanghan58/data/small_benchmarks/langflow_2604_11748/openwebtext}"
+: "${TOKENIZER_PATH:=/e2e-data/evad-tech-vla/wanghan58/models/hf/t5-small/tokenizer.json}"
+# --- distributed launch topology ---
+: "${NNODES:=1}"
+: "${NPROC_PER_NODE:=8}"
+: "${NODE_RANK:=0}"
+: "${MASTER_ADDR:=127.0.0.1}"
+: "${MASTER_PORT:=32091}"
+# --- batch, schedule and logging cadence ---
+: "${PER_GPU_BATCH_SIZE:=32}"
+: "${GLOBAL_BATCH_SIZE:=512}"
+: "${EPOCHS:=5}"
+: "${NUM_WORKERS:=8}"
+: "${DATALOADER_PREFETCH_FACTOR:=4}"
+: "${LOG_EVERY:=100}"
+: "${LATEST_EVERY:=1000}"
+: "${EVAL_EVERY:=0}"
+: "${ALLOW_EXISTING_SAVE_DIR:=0}"
+: "${ALLOW_TF32:=1}"
+# --- optimizer and EMA ---
+: "${LR:=0.002}"
+: "${WEIGHT_DECAY:=0.0}"
+: "${ADAM_BETA1:=0.9}"
+: "${ADAM_BETA2:=0.999}"
+: "${ADAM_EPS:=1e-8}"
+: "${MUON_MOMENTUM:=0.95}"
+: "${MUON_NS_STEPS:=5}"
+: "${MUON_UPDATE_SCALE:=1.0}"
+: "${GRAD_CLIP:=1.0}"
+: "${EMA_DECAY:=0.9999}"
+: "${EMA_START_STEP:=0}"
+# --- time sampling, loss weighting and output init ---
+: "${T_LOGIT_MEAN:=-1.5}"
+: "${T_LOGIT_STD:=0.8}"
+: "${LOSS_T_WEIGHT_MODE:=none}"
+: "${LOSS_T_MIN_WEIGHT:=0.0}"
+: "${OUTPUT_INIT_STD:=0.0}"
+sanitize_label() {
+  printf "%s" "$1" | sed -e 's/-/m/g' -e 's/\./p/g'
+}
+T_LOGIT_MEAN_LABEL="$(sanitize_label "${T_LOGIT_MEAN}")"
+T_LOGIT_STD_LABEL="$(sanitize_label "${T_LOGIT_STD}")"
+LOSS_T_MIN_WEIGHT_LABEL="$(sanitize_label "${LOSS_T_MIN_WEIGHT}")"
+RUN_NAME="${RUN_NAME:-lta_owt_t5record_len1024_elfaligned_dditelf_muon_logitnormal_${T_LOGIT_MEAN_LABEL}_s${T_LOGIT_STD_LABEL}_${LOSS_T_WEIGHT_MODE}_floor${LOSS_T_MIN_WEIGHT_LABEL}_gbs512_8gpu_5epoch_$(date +%Y%m%d_%H%M%S)}"
+SAVE_DIR="${SAVE_DIR:-runs/${RUN_NAME}}"
+LOG_DIR="${LOG_DIR:-logs/elfaligned_t5record_8gpu}"
+LOG_FILE="${LOG_FILE:-${LOG_DIR}/${RUN_NAME}.log}"
+NUM_RECORDS=$(python - <<PY
+from pathlib import Path
+import pyarrow.parquet as pq
+root = Path("${DATA_PATH}")
+files = sorted(root.rglob("*.parquet")) if root.is_dir() else [root]
+rows = sum(pq.ParquetFile(str(p)).metadata.num_rows for p in files)
+print(max(0, rows - 100_000))
+PY
+)
+STEPS_PER_EPOCH=$(( (NUM_RECORDS + GLOBAL_BATCH_SIZE - 1) / GLOBAL_BATCH_SIZE ))
+SAVE_EVERY="${SAVE_EVERY:-${STEPS_PER_EPOCH}}"
+if [[ -f "${SAVE_DIR}/args.json" && "${ALLOW_EXISTING_SAVE_DIR}" != "1" ]]; then
+  echo "Refusing to start because SAVE_DIR already contains args.json: ${SAVE_DIR}" >&2
+  echo "Use a new RUN_NAME/SAVE_DIR or set ALLOW_EXISTING_SAVE_DIR=1 intentionally." >&2
+  exit 2
+fi
+mkdir -p "${LOG_DIR}" "${SAVE_DIR}"
+TF32_FLAG="--allow_tf32"
+TF32_LABEL="true"
+if [[ "${ALLOW_TF32}" == "0" || "${ALLOW_TF32}" == "false" || "${ALLOW_TF32}" == "False" ]]; then
+  TF32_FLAG="--no-allow_tf32"
+  TF32_LABEL="false"
+fi
+echo "[launch] method=owt_elfaligned_t5record_dditelf host=$(hostname) time=$(date -Iseconds)"
+echo "[launch] run_name=${RUN_NAME}"
+echo "[launch] save_dir=${SAVE_DIR}"
+echo "[launch] log_file=${LOG_FILE}"
+echo "[launch] data_path=${DATA_PATH}"
+echo "[launch] tokenizer=${TOKENIZER_PATH}"
+echo "[launch] records=${NUM_RECORDS} epochs=${EPOCHS} approx_steps_per_epoch=${STEPS_PER_EPOCH} save_every=${SAVE_EVERY}"
+echo "[launch] optimizer=muon_impl=optax grouping=hidden_2d lr=${LR} wd=${WEIGHT_DECAY} adam_fallback_wd=0 momentum=${MUON_MOMENTUM} ns=${MUON_NS_STEPS} nesterov=true width_scale=true adam_fallback_b2=${ADAM_BETA2} ema=${EMA_DECAY}"
+echo "[launch] model=ddit_elf rmsnorm qk_norm=true swiglu no_adaln output_bias=false output_init_std=${OUTPUT_INIT_STD} time_tokens=4 mode_tokens=0"
+echo "[launch] data=record_pad_truncate pad=pad add_special_tokens=false t5-small fp32=true bf16=false tf32=${TF32_LABEL}"
+echo "[launch] t_sampling=logit_normal mean=${T_LOGIT_MEAN} std=${T_LOGIT_STD} loss_t_weight=${LOSS_T_WEIGHT_MODE} loss_t_min_weight=${LOSS_T_MIN_WEIGHT} warmup_epochs=0.5"
+python -m torch.distributed.run \
+  --nnodes="${NNODES}" \
+  --nproc_per_node="${NPROC_PER_NODE}" \
+  --node_rank="${NODE_RANK}" \
+  --master_addr="${MASTER_ADDR}" \
+  --master_port="${MASTER_PORT}" \
+  train.py \
+  --data_path "${DATA_PATH}" \
+  --openwebtext_split train_minus_100k \
+  --text_column text \
+  --detokenizer auto \
+  --tokenizer_path "${TOKENIZER_PATH}" \
+  --save_dir "${SAVE_DIR}" \
+  --record_pad_truncate \
+  --record_pad_token pad \
+  --record_shuffle_buffer 10000 \
+  --max_len 1024 \
+  --batch_size "${PER_GPU_BATCH_SIZE}" \
+  --global_batch_size "${GLOBAL_BATCH_SIZE}" \
+  --num_workers "${NUM_WORKERS}" \
+  --dataloader_prefetch_factor "${DATALOADER_PREFETCH_FACTOR}" \
+  --epochs "${EPOCHS}" \
+  --total_steps 1 \
+  --warmup_epochs 0.5 \
+  --log_every "${LOG_EVERY}" \
+  --eval_every "${EVAL_EVERY}" \
+  --save_every "${SAVE_EVERY}" \
+  --latest_every "${LATEST_EVERY}" \
+  --optimizer muon \
+  --muon_impl optax \
+  --lr "${LR}" \
+  --lr_schedule constant_warmup \
+  --min_lr 0 \
+  --weight_decay "${WEIGHT_DECAY}" \
+  --adam_beta1 "${ADAM_BETA1}" \
+  --adam_beta2 "${ADAM_BETA2}" \
+  --adam_eps "${ADAM_EPS}" \
+  --muon_momentum "${MUON_MOMENTUM}" \
+  --muon_ns_steps "${MUON_NS_STEPS}" \
+  --muon_update_scale "${MUON_UPDATE_SCALE}" \
+  --muon_nesterov \
+  --muon_width_scale \
+  --ema_decay "${EMA_DECAY}" \
+  --ema_start_step "${EMA_START_STEP}" \
+  --grad_clip "${GRAD_CLIP}" \
+  --seed 42 \
+  --d_model 768 \
+  --cond_dim 128 \
+  --n_layers 12 \
+  --n_heads 12 \
+  --dim_ff 3072 \
+  --dropout 0.0 \
+  --no-output_bias \
+  --output_init_std "${OUTPUT_INIT_STD}" \
+  --norm_type rmsnorm \
+  --model_type ddit_elf \
+  --elf_num_time_tokens 4 \
+  --elf_num_model_mode_tokens 0 \
+  --qk_norm \
+  --state_format prob \
+  --bridge dirichlet \
+  --target_loss hard_ce \
+  --loss_t_weight_mode "${LOSS_T_WEIGHT_MODE}" \
+  --loss_t_min_weight "${LOSS_T_MIN_WEIGHT}" \
+  --target_prob 1.0 \
+  --min_t 0.0 \
+  --max_t 1.0 \
+  --t_sampling_mode logit_normal \
+  --t_sampling_logit_mean "${T_LOGIT_MEAN}" \
+  --t_sampling_logit_std "${T_LOGIT_STD}" \
+  --t_sampling_eps 1e-4 \
+  --dual_t \
+  --corrupt_t_mode same \
+  --corrupt_min_t 0.0 \
+  --corrupt_max_t 1.0 \
+  --min_mask_ratio 0.1 \
+  --max_mask_ratio 1.0 \
+  --wrong_token_replace_prob 1.0 \
+  --wrong_token_schedule linear_t \
+  --wrong_token_exp_k 1.0 \
+  --dirichlet_concentration_min 1.0 \
+  --dirichlet_concentration_max 1024 \
+  --dirichlet_endpoint_mode categorical_dual_t \
+  --dirichlet_semantic_t_mode same \
+  --dirichlet_semantic_t_value 0.0 \
+  --categorical_wrong_from_full_vocab \
+  --simplex_bridge_sampler dirichlet \
+  --eps 1e-8 \
+  --infer_steps 1024 \
+  --decode_damping 1.0 \
+  --max_gamma 1.0 \
+  --decode_solver flowmap \
+  --noise_init logistic_normal \
+  --bridge_noise_init logistic_normal \
+  --noise_sigma -1 \
+  "${TF32_FLAG}" \
+  --activation_checkpointing \
+  --activation_checkpoint_scope mlp \
+  --ddp_gradient_as_bucket_view \
+  2>&1 | tee -a "${LOG_FILE}"

LTA_openwebtext_dualt/scripts/launch_lta_owt_fullycoupled_outwd0p5_8gpu.sh ADDED Viewed

	@@ -0,0 +1,11 @@

+#!/usr/bin/env bash
+set -euo pipefail
+cd /e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt
+export OUTPUT_WEIGHT_DECAY="${OUTPUT_WEIGHT_DECAY:-0.5}"
+export WEIGHT_DECAY="${WEIGHT_DECAY:-0.1}"
+export RUN_NAME="${RUN_NAME:-lta_owt_gpt2cached_len1024_fullycoupled_rmsnorm_nobias_adamw_wd0p1_outwd0p5_nanogpt_tf32_ddit768x12_gbs512_8gpu_1m_$(date +%Y%m%d_%H%M%S)}"
+export LOG_DIR="${LOG_DIR:-logs/fullycoupled_outwd0p5_8gpu}"
+bash scripts/launch_lta_owt_fullycoupled_wd0p1_fp32_8gpu.sh

LTA_openwebtext_dualt/scripts/launch_lta_owt_t5_rollin_grad_k1_rho025_subset10k_4gpu_100k.sh ADDED Viewed

	@@ -0,0 +1,148 @@

+#!/usr/bin/env bash
+# Process environment for the 4-GPU T5 roll-in pilot; everything set here is
+# exported to the training processes.
+set -euo pipefail
+cd /e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt
+: "${CUDA_VISIBLE_DEVICES:=0,1,2,3}"
+: "${NPROC_PER_NODE:=4}"
+: "${NCCL_DEBUG:=WARN}"
+export CUDA_VISIBLE_DEVICES NPROC_PER_NODE NCCL_DEBUG
+# Put the repository root first on the import path, keeping any existing entries.
+export PYTHONPATH="$(pwd)${PYTHONPATH:+:$PYTHONPATH}"
+export TOKENIZERS_PARALLELISM=false
+export PYTHONUNBUFFERED=1
+free_port() {
+  python3 - <<'PY'
+import socket
+with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
+    s.bind(("127.0.0.1", 0))
+    print(s.getsockname()[1])
+PY
+}
+DATA_PATH="${DATA_PATH:-/e2e-data/evad-tech-vla/wanghan58/data/embedded-language-flows/openwebtext-t5}"
+TOKENIZER_PATH="${TOKENIZER_PATH:-/e2e-data/evad-tech-vla/wanghan58/models/hf/t5-small/tokenizer.json}"
+MAX_RECORDS="${MAX_RECORDS:-10000}"
+TOTAL_STEPS="${TOTAL_STEPS:-100000}"
+GLOBAL_BATCH_SIZE="${GLOBAL_BATCH_SIZE:-512}"
+PER_GPU_BATCH_SIZE="${PER_GPU_BATCH_SIZE:-16}"
+RUN_NAME="${RUN_NAME:-lta_owt_t5_3l_d256_rollin_grad_p50_k1_rho0_0p25_uniformt_maxrec10k_4gpu_100k_$(date +%Y%m%d_%H%M%S)}"
+MASTER_PORT="${MASTER_PORT:-$(free_port)}"
+LOG_DIR="${LOG_DIR:-logs/elfaligned_t5tokenized_4gpu}"
+mkdir -p "${LOG_DIR}" "runs/${RUN_NAME}"
+LOG_FILE="${LOG_DIR}/${RUN_NAME}.log"
+echo "[launch] run_name=${RUN_NAME}" | tee -a "${LOG_FILE}"
+echo "[launch] data=${DATA_PATH} max_records=${MAX_RECORDS} tokenizer=${TOKENIZER_PATH}" | tee -a "${LOG_FILE}"
+echo "[launch] cuda=${CUDA_VISIBLE_DEVICES} nproc=${NPROC_PER_NODE} gbs=${GLOBAL_BATCH_SIZE} per_gpu=${PER_GPU_BATCH_SIZE} total_steps=${TOTAL_STEPS}" | tee -a "${LOG_FILE}"
+torchrun \
+  --nproc_per_node="${NPROC_PER_NODE}" \
+  --master_port="${MASTER_PORT}" \
+  train.py \
+  --data_path "${DATA_PATH}" \
+  --max_records "${MAX_RECORDS}" \
+  --tokenized_hf \
+  --tokenized_pad_token pad \
+  --tokenizer_path "${TOKENIZER_PATH}" \
+  --save_dir "runs/${RUN_NAME}" \
+  --max_len 1024 \
+  --batch_size "${PER_GPU_BATCH_SIZE}" \
+  --global_batch_size "${GLOBAL_BATCH_SIZE}" \
+  --num_workers 0 \
+  --epochs 0 \
+  --total_steps "${TOTAL_STEPS}" \
+  --warmup_steps 1 \
+  --warmup_epochs 0.5 \
+  --log_every 100 \
+  --eval_every 0 \
+  --save_every 5000 \
+  --latest_every 1000 \
+  --optimizer muon \
+  --muon_impl optax \
+  --lr 0.002 \
+  --lr_schedule constant_warmup \
+  --min_lr 0.0 \
+  --weight_decay 0.1 \
+  --output_weight_decay -1 \
+  --adamw_param_groups nanogpt \
+  --adam_beta1 0.9 \
+  --adam_beta2 0.999 \
+  --adam_eps 1e-8 \
+  --ema_decay 0.9999 \
+  --ema_start_step 0 \
+  --grad_clip 1.0 \
+  --seed 42 \
+  --d_model 256 \
+  --cond_dim 128 \
+  --n_layers 3 \
+  --n_heads 4 \
+  --dim_ff 1024 \
+  --dropout 0.0 \
+  --no-output_bias \
+  --output_init_std 0 \
+  --norm_type rmsnorm \
+  --qk_norm \
+  --model_type ddit_elf \
+  --ddit_mlp_type gelu \
+  --state_format prob \
+  --bridge dirichlet \
+  --target_loss hard_ce \
+  --loss_t_weight_mode none \
+  --loss_t_min_weight 0.0 \
+  --rollout_train_prob 0.50 \
+  --rollout_train_time_mode sampled_path \
+  --rollout_train_steps 1 \
+  --rollout_train_steps_min -1 \
+  --rollout_train_infer_steps 1 \
+  --rollout_train_s_dist uniform \
+  --rollout_train_s_min_frac 0.0 \
+  --rollout_train_s_max_frac 0.25 \
+  --rollout_train_temp 1.0 \
+  --rollout_train_max_gamma 1.0 \
+  --rollout_train_corrupt_only \
+  --rollout_train_samplewise \
+  --rollout_train_selected_only \
+  --no-rollout_train_compute_always \
+  --rollout_train_keep_grad \
+  --rollout_train_sync_t \
+  --target_prob 1.0 \
+  --min_t 0.0 \
+  --max_t 1.0 \
+  --t_sampling_mode uniform \
+  --t_sampling_logit_mean -1.5 \
+  --t_sampling_logit_std 0.8 \
+  --t_sampling_eps 1e-4 \
+  --dual_t \
+  --corrupt_t_mode same \
+  --corrupt_min_t 0.0 \
+  --corrupt_max_t 1.0 \
+  --min_mask_ratio 1.0 \
+  --max_mask_ratio 1.0 \
+  --mask_mixture_original_prob 0.0 \
+  --mask_mixture_lowk_prob 0.0 \
+  --mask_mixture_lowcorrupt_prob 0.0 \
+  --mask_mixture_block_prob 0.0 \
+  --mask_mixture_all_prob 1.0 \
+  --wrong_token_replace_prob 1.0 \
+  --wrong_token_schedule linear_t \
+  --wrong_token_exp_k 1.0 \
+  --dirichlet_concentration_min 1.0 \
+  --dirichlet_concentration_max 1024 \
+  --dirichlet_endpoint_mode categorical_dual_t \
+  --dirichlet_semantic_t_mode same \
+  --dirichlet_semantic_t_value 0.0 \
+  --categorical_wrong_from_full_vocab \
+  --simplex_bridge_sampler dirichlet \
+  --eps 1e-8 \
+  --infer_steps 1024 \
+  --decode_damping 1.0 \
+  --max_gamma 1.0 \
+  --decode_solver flowmap \
+  --noise_init logistic_normal \
+  --bridge_noise_init logistic_normal \
+  --noise_sigma -1 \
+  --allow_tf32 \
+  --activation_checkpointing \
+  --activation_checkpoint_scope mlp \
+  --ddp_gradient_as_bucket_view \
+  2>&1 | tee -a "${LOG_FILE}"

LTA_openwebtext_dualt/scripts/run_lta_lm1b_dirichlet_len1024_Cv_to_2v_8gpu_1m_save10k.sh ADDED Viewed

	@@ -0,0 +1,34 @@

+#!/usr/bin/env bash
+set -euo pipefail
+cd /e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt
+export CUDA_VISIBLE_DEVICES="${CUDA_VISIBLE_DEVICES:-0,1,2,3,4,5,6,7}"
+export NPROC_PER_NODE="${NPROC_PER_NODE:-8}"
+export MASTER_PORT="${MASTER_PORT:-32682}"
+export GLOBAL_BATCH_SIZE="${GLOBAL_BATCH_SIZE:-512}"
+export PER_GPU_BATCH_SIZE="${PER_GPU_BATCH_SIZE:-32}"
+export TOTAL_STEPS="${TOTAL_STEPS:-1000000}"
+export WARMUP_STEPS="${WARMUP_STEPS:-2500}"
+export SAVE_EVERY="${SAVE_EVERY:-10000}"
+export LATEST_EVERY="${LATEST_EVERY:-1000}"
+export LOG_EVERY="${LOG_EVERY:-100}"
+export MAX_LEN="${MAX_LEN:-1024}"
+export VOCAB_SIZE="${VOCAB_SIZE:-30522}"
+export CMIN="${CMIN:-${VOCAB_SIZE}}"
+export CMAX="${CMAX:-61044}"
+export MIN_MASK_RATIO="${MIN_MASK_RATIO:-0.1}"
+export MAX_MASK_RATIO="${MAX_MASK_RATIO:-1.0}"
+export CATEGORICAL_WRONG_PROB_FLOOR="${CATEGORICAL_WRONG_PROB_FLOOR:-0.0}"
+# Keep watcher off by default for the 1M run; enable explicitly to avoid
+# competing with training GPUs on busy 8-card nodes.
+export WATCH_ENABLED="${WATCH_ENABLED:-0}"
+DATE_TAG="${DATE_TAG:-$(date +%Y%m%d)}"
+export RUN_NAME="${RUN_NAME:-lta_lm1b_dirichlet_len1024_Cv_to_2v_gbs512_b32_8gpu_1m_save10k_${DATE_TAG}}"
+bash scripts/run_lta_lm1b_dirichlet_len1024_Cv_to_2v_8gpu_save1k_with_gumbel_watch.sh

LTA_openwebtext_dualt/scripts/run_lta_owt_dirichlet_len1024_Cv_to_2v_8gpu_1m_save10k.sh ADDED Viewed

	@@ -0,0 +1,34 @@

+#!/usr/bin/env bash
+set -euo pipefail
+cd /e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt
+export CUDA_VISIBLE_DEVICES="${CUDA_VISIBLE_DEVICES:-0,1,2,3,4,5,6,7}"
+export NPROC_PER_NODE="${NPROC_PER_NODE:-8}"
+export MASTER_PORT="${MASTER_PORT:-32682}"
+export GLOBAL_BATCH_SIZE="${GLOBAL_BATCH_SIZE:-512}"
+export PER_GPU_BATCH_SIZE="${PER_GPU_BATCH_SIZE:-32}"
+export TOTAL_STEPS="${TOTAL_STEPS:-1000000}"
+export WARMUP_STEPS="${WARMUP_STEPS:-2500}"
+export SAVE_EVERY="${SAVE_EVERY:-10000}"
+export LATEST_EVERY="${LATEST_EVERY:-1000}"
+export LOG_EVERY="${LOG_EVERY:-100}"
+export MAX_LEN="${MAX_LEN:-1024}"
+export VOCAB_SIZE="${VOCAB_SIZE:-30522}"
+export CMIN="${CMIN:-${VOCAB_SIZE}}"
+export CMAX="${CMAX:-61044}"
+export MIN_MASK_RATIO="${MIN_MASK_RATIO:-0.1}"
+export MAX_MASK_RATIO="${MAX_MASK_RATIO:-1.0}"
+export CATEGORICAL_WRONG_PROB_FLOOR="${CATEGORICAL_WRONG_PROB_FLOOR:-0.0}"
+# Keep watcher off by default for the 1M run; enable explicitly to avoid
+# competing with training GPUs on busy 8-card nodes.
+export WATCH_ENABLED="${WATCH_ENABLED:-0}"
+DATE_TAG="${DATE_TAG:-$(date +%Y%m%d)}"
+export RUN_NAME="${RUN_NAME:-lta_owt_dirichlet_len1024_Cv_to_2v_gbs512_b32_8gpu_1m_save10k_${DATE_TAG}}"
+bash scripts/run_lta_owt_dirichlet_len1024_Cv_to_2v_8gpu_save1k_with_gumbel_watch.sh

LTA_openwebtext_dualt/scripts/run_lta_owt_t5_absrope_adaln_dirichlet_len1024_Cv_to_2v_8gpu_mask0p1_1p0_sameT_1m_save10k.sh ADDED Viewed

	@@ -0,0 +1,36 @@

+#!/usr/bin/env bash
+set -euo pipefail
+cd /e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt
+# T5-tokenized OWT, DDiT = RoPE + adaLN-zero, with learned absolute position
+# embeddings added before RoPE.  The bridge/model t is shared (sameT).
+# Every setting below is exported and can be overridden from the environment.
+export DATA_PATH="${DATA_PATH:-/e2e-data/evad-tech-vla/wanghan58/data/embedded-language-flows/openwebtext-t5}"
+export TOKENIZER_PATH="${TOKENIZER_PATH:-/e2e-data/evad-tech-vla/wanghan58/models/hf/t5-small/tokenizer.json}"
+export TOKENIZED_HF="${TOKENIZED_HF:-1}"
+export TOKENIZED_PAD_TOKEN="${TOKENIZED_PAD_TOKEN:-pad}"
+export VOCAB_SIZE="${VOCAB_SIZE:-32100}"
+# "Cv_to_2v": the default range runs from V to 2V. Both bounds are derived
+# from VOCAB_SIZE so that overriding the vocabulary keeps the range
+# consistent; with the default vocabulary this is still 32100 -> 64200.
+export CMIN="${CMIN:-${VOCAB_SIZE}}"
+export CMAX="${CMAX:-$((2 * VOCAB_SIZE))}"
+export ABS_POS_EMBED="${ABS_POS_EMBED:-1}"
+export CORRUPT_T_MODE="${CORRUPT_T_MODE:-same}"
+export MIN_MASK_RATIO="${MIN_MASK_RATIO:-0.1}"
+export MAX_MASK_RATIO="${MAX_MASK_RATIO:-1.0}"
+export MASK_MIXTURE_ORIGINAL_PROB="${MASK_MIXTURE_ORIGINAL_PROB:-0.0}"
+export MASK_MIXTURE_ALL_PROB="${MASK_MIXTURE_ALL_PROB:-0.0}"
+export DATE_TAG="${DATE_TAG:-$(date +%Y%m%d)}"
+export TOTAL_STEPS="${TOTAL_STEPS:-1000000}"
+export SAVE_EVERY="${SAVE_EVERY:-10000}"
+export LATEST_EVERY="${LATEST_EVERY:-1000}"
+# Unlike the plain 1M wrappers, the sample watcher is on by default here and
+# is given GPU 7 unless WATCH_CUDA_VISIBLE_DEVICES says otherwise.
+export WATCH_ENABLED="${WATCH_ENABLED:-1}"
+export WATCH_STEP_INTERVAL="${WATCH_STEP_INTERVAL:-10000}"
+export WATCH_N_SAMPLES="${WATCH_N_SAMPLES:-128}"
+export WATCH_CUDA_VISIBLE_DEVICES="${WATCH_CUDA_VISIBLE_DEVICES:-7}"
+export RUN_NAME="${RUN_NAME:-lta_owt_t5_absrope_adaln_dirichlet_len1024_Cv_to_2v_mask0p1_1p0_sameT_gbs512_b32_8gpu_1m_save10k_${DATE_TAG}}"
+# The watcher output path embeds the top-p / tau values used for labelling
+# (falling back to 0.95, 1.0 and 0.2) together with the CMIN/CMAX range.
+export WATCH_OUT_BASE="${WATCH_OUT_BASE:-docs/lta_samples/metrics_${DATE_TAG}/owt_t5_absrope_adaln_Cv_to_2v_mask0p1_1p0_sameT_sde_gumbel_topp${WATCH_ENDPOINT_TOP_P:-0.95}_tau${WATCH_GUMBEL_TAU_START:-1.0}_to_${WATCH_GUMBEL_TAU_END:-0.2}_blend_c${CMIN}_${CMAX}_n${WATCH_N_SAMPLES}/${RUN_NAME}}"
+bash scripts/run_lta_owt_dirichlet_len1024_Cv_to_2v_8gpu_save1k_with_gumbel_watch.sh

LTA_openwebtext_dualt/scripts/run_train8_wrong_floor_pilots_4gpu.sh ADDED Viewed

	@@ -0,0 +1,194 @@

+#!/usr/bin/env bash
+set -euo pipefail
+cd /e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt
+export PYTHONPATH="$(pwd)${PYTHONPATH:+:$PYTHONPATH}"
+export TOKENIZERS_PARALLELISM=false
+export PYTHONUNBUFFERED=1
+BASE_CACHE="${BASE_CACHE:-/e2e-data/evad-tech-vla/wanghan58/data/small_benchmarks/langflow_2604_11748/openwebtext_lta_cached_chunks}"
+TOKENIZER_PATH="${TOKENIZER_PATH:-/e2e-data/evad-tech-vla/wanghan58/models/flowtext_scorers/gpt2-standard/tokenizer.json}"
+MAX_LEN="${MAX_LEN:-256}"
+N_SAMPLES="${N_SAMPLES:-64}"
+INFER_STEPS="${INFER_STEPS:-128}"
+STEP_CHUNK="${STEP_CHUNK:-1000}"
+MAX_TOTAL_STEPS="${MAX_TOTAL_STEPS:-20000}"
+PER_GPU_BATCH_SIZE="${PER_GPU_BATCH_SIZE:-128}"
+GLOBAL_BATCH_SIZE="${GLOBAL_BATCH_SIZE:-512}"
+GROUP_STAMP="${GROUP_STAMP:-$(date +%Y%m%d_%H%M%S)}"
+OUT_ROOT="${OUT_ROOT:-docs/lta_samples/metrics_20260517/wrong_floor_pilots_len${MAX_LEN}_bs512_ode128_${GROUP_STAMP}}"
+DRIVER_LOG="${DRIVER_LOG:-logs/wrong_floor_pilots_4gpu/${GROUP_STAMP}.log}"
+CURVE_CSV="${CURVE_CSV:-${OUT_ROOT}/hit_ratio_curve.csv}"
+mkdir -p "$(dirname "${DRIVER_LOG}")" "${OUT_ROOT}"
+cache="${BASE_CACHE}/gpt2_len${MAX_LEN}_train8_compact_overfit"
+vocab_size="$(
+python - "$cache" <<'PY'
+import json
+import sys
+from pathlib import Path
+meta = json.loads((Path(sys.argv[1]) / "meta.json").read_text())
+print(int(meta.get("compact_vocab_size", meta.get("vocab_size"))))
+PY
+)"
+if [[ ! -f "${CURVE_CSV}" ]]; then
+  echo "config,ckpt_step,train_views_seen,train_tokens_seen,token_acc_mean,exact_count,exact_ref_count,exact_ref_hits" > "${CURVE_CSV}"
+fi
+latest_step() {
+  local run_name="$1"
+  python - "$run_name" <<'PY'
+import re
+import sys
+from pathlib import Path
+run = Path("runs") / sys.argv[1]
+steps = []
+for path in run.glob("step_*.pt"):
+    m = re.search(r"step_(\d+)\.pt$", path.name)
+    if m:
+        steps.append(int(m.group(1)))
+print(max(steps) if steps else 0)
+PY
+}
+free_port() {
+  python - <<'PY'
+import socket
+with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
+    s.bind(("127.0.0.1", 0))
+    print(s.getsockname()[1])
+PY
+}
+# Evaluate the newest checkpoint of one run and append the result to the
+# curve CSV.
+# Arguments:
+#   $1 - config label (names the output sub-directory and the CSV row)
+#   $2 - run name; the evaluator is pointed at runs/<run name> with --latest_only
+#   $3 - target step, used only to name the output directory step_<N>
+# Globals read: OUT_ROOT, cache, TOKENIZER_PATH, MAX_LEN, N_SAMPLES,
+#   INFER_STEPS, GLOBAL_BATCH_SIZE, CURVE_CSV, EVAL_CUDA_VISIBLE_DEVICES
+#   (GPU list for the evaluator, default 0).
+# Outputs: prints one "RESULT ..." line and appends one row to CURVE_CSV.
+eval_latest() {
+  local config="$1"
+  local run_name="$2"
+  local target_step="$3"
+  local out_dir="${OUT_ROOT}/${config}/step_${target_step}"
+  mkdir -p "${out_dir}"
+  # Decode N_SAMPLES sequences in a single batch with fixed decode settings.
+  CUDA_VISIBLE_DEVICES="${EVAL_CUDA_VISIBLE_DEVICES:-0}" python scripts/eval_train8_decode_acc.py \
+    --runs_glob "runs/${run_name}" \
+    --data_dir "${cache}" \
+    --tokenizer_path "${TOKENIZER_PATH}" \
+    --out_dir "${out_dir}" \
+    --max_len "${MAX_LEN}" \
+    --n_samples "${N_SAMPLES}" \
+    --batch_size "${N_SAMPLES}" \
+    --latest_only \
+    --endpoint_softenings none \
+    --steps "${INFER_STEPS}" \
+    --decode_rule flowmap \
+    --time_schedule logit_normal \
+    --time_logit_mean -1.5 \
+    --time_logit_std 0.8 \
+    --model_t_mode post \
+    --c_min 1 \
+    --c_max 512 \
+    --late_temp 1.0 \
+    --final_from state \
+    --final_decode argmax
+  # Summarise the last line of decode_token_acc.jsonl: views seen is
+  # ckpt_step * global batch size, tokens seen is views * max_len.
+  python - "$config" "$out_dir" "$N_SAMPLES" "$GLOBAL_BATCH_SIZE" "$MAX_LEN" "$CURVE_CSV" <<'PY'
+import json
+import sys
+from pathlib import Path
+config = sys.argv[1]
+out = Path(sys.argv[2])
+n = int(sys.argv[3])
+global_batch = int(sys.argv[4])
+max_len = int(sys.argv[5])
+curve = Path(sys.argv[6])
+row = json.loads((out / "decode_token_acc.jsonl").read_text().splitlines()[-1])
+views = int(row["ckpt_step"]) * global_batch
+tokens = views * max_len
+print(
+    "RESULT "
+    f"config={config} ckpt_step={row['ckpt_step']} views={views} "
+    f"token_acc={row['token_acc_mean']:.4f} exact={row['exact_count']}/{n} "
+    f"exact_refs={row['exact_ref_count']} hits={row['exact_ref_hits']}",
+    flush=True,
+)
+with curve.open("a", encoding="utf-8") as f:
+    f.write(
+        f"{config},{row['ckpt_step']},{views},{tokens},{row['token_acc_mean']},"
+        f"{row['exact_count']},{row['exact_ref_count']},\"{row['exact_ref_hits']}\"\n"
+    )
+PY
+}
+# Floors to sweep. The numeric value is decoded from the label inside the
+# loop by dropping the "wrongfloor" prefix and turning "p" into "."
+# (wrongfloor0p3 -> 0.3).
+configs=(
+  wrongfloor0p3
+  wrongfloor0p5
+  wrongfloor0p7
+)
+echo "[wrong-floor] start stamp=${GROUP_STAMP} len=${MAX_LEN} vocab=${vocab_size} out=${OUT_ROOT}" | tee -a "${DRIVER_LOG}"
+# Round-robin driver: every round advances each unfinished config by
+# STEP_CHUNK steps (clamped to MAX_TOTAL_STEPS) and then evaluates it. The
+# loop ends after a round in which every config already reports a checkpoint
+# step >= MAX_TOTAL_STEPS.
+round_idx=0
+while :; do
+  round_idx=$((round_idx + 1))
+  active=0
+  echo "[wrong-floor] round=${round_idx} $(date)" | tee -a "${DRIVER_LOG}"
+  for config in "${configs[@]}"; do
+    floor="${config#wrongfloor}"
+    floor="${floor//p/.}"
+    run_name="train8_wrongfloor_len${MAX_LEN}_${config}_${GROUP_STAMP}"
+    # Progress is read from the step_*.pt files on disk.
+    # NOTE(review): if the launched run does not write step_<target>.pt (for
+    # example when MAX_TOTAL_STEPS is not a multiple of STEP_CHUNK), this
+    # config may never be seen as capped — confirm against the launcher's
+    # save logic.
+    step_now="$(latest_step "${run_name}")"
+    if [[ "${step_now}" -ge "${MAX_TOTAL_STEPS}" ]]; then
+      echo "[wrong-floor] capped config=${config} step=${step_now}" | tee -a "${DRIVER_LOG}"
+      continue
+    fi
+    active=1
+    target_step=$((step_now + STEP_CHUNK))
+    if [[ "${target_step}" -gt "${MAX_TOTAL_STEPS}" ]]; then
+      target_step="${MAX_TOTAL_STEPS}"
+    fi
+    # Resume from latest.pt when it exists; otherwise RESUME_PATH is empty.
+    resume_path=""
+    if [[ -f "runs/${run_name}/latest.pt" ]]; then
+      resume_path="runs/${run_name}/latest.pt"
+    fi
+    echo "[wrong-floor] train config=${config} floor=${floor} from=${step_now} to=${target_step}" | tee -a "${DRIVER_LOG}"
+    # The launcher is configured purely through environment variables set for
+    # this one command; TOTAL_STEPS is the absolute target step, and both
+    # SAVE_EVERY and LATEST_EVERY are set to STEP_CHUNK.
+    CUDA_VISIBLE_DEVICES="${CUDA_VISIBLE_DEVICES:-0,1,2,3}" \
+    NPROC_PER_NODE="${NPROC_PER_NODE:-4}" \
+    MASTER_PORT="$(free_port)" \
+    OWT_CHUNK_CACHE_DIR="${cache}" \
+    OWT_EXACT_REPEAT_PER_CHUNK="${OWT_EXACT_REPEAT_PER_CHUNK:-64}" \
+    MAX_LEN="${MAX_LEN}" \
+    VOCAB_SIZE_OVERRIDE="${vocab_size}" \
+    D_MODEL="${D_MODEL:-192}" \
+    COND_DIM="${COND_DIM:-64}" \
+    N_LAYERS="${N_LAYERS:-3}" \
+    N_HEADS="${N_HEADS:-3}" \
+    DIM_FF="${DIM_FF:-768}" \
+    TOTAL_STEPS="${target_step}" \
+    PER_GPU_BATCH_SIZE="${PER_GPU_BATCH_SIZE}" \
+    GLOBAL_BATCH_SIZE="${GLOBAL_BATCH_SIZE}" \
+    NUM_WORKERS="${NUM_WORKERS:-0}" \
+    LOG_EVERY="${LOG_EVERY:-100}" \
+    SAVE_EVERY="${STEP_CHUNK}" \
+    LATEST_EVERY="${STEP_CHUNK}" \
+    WARMUP_STEPS="${WARMUP_STEPS:-10}" \
+    LEARNING_RATE="${LEARNING_RATE:-0.002}" \
+    WEIGHT_DECAY="${WEIGHT_DECAY:-0.1}" \
+    MUON_IMPL="${MUON_IMPL:-legacy}" \
+    OUTPUT_WEIGHT_DECAY="${OUTPUT_WEIGHT_DECAY:--1}" \
+    TARGET_LOSS=hard_ce \
+    MIN_MASK_RATIO=1.0 \
+    MAX_MASK_RATIO=1.0 \
+    MASK_MIXTURE_LOWK_PROB=0.0 \
+    MASK_MIXTURE_ALL_PROB=1.0 \
+    LOWK_CLEAN_TOKENS=0 \
+    CLEAN_STATE_MODE=onehot \
+    ROLLOUT_TRAIN_PROB=0.0 \
+    CATEGORICAL_WRONG_PROB_FLOOR="${floor}" \
+    RUN_NAME="${run_name}" \
+    RESUME_PATH="${resume_path}" \
+    bash scripts/launch_lta_owt_gpt2_softendpoint_mn_pilot_4gpu.sh
+    echo "[wrong-floor] eval config=${config} step=${target_step}" | tee -a "${DRIVER_LOG}"
+    eval_latest "${config}" "${run_name}" "${target_step}" | tee -a "${DRIVER_LOG}"
+  done
+  # No config needed training in this round: the sweep is complete.
+  if [[ "${active}" -eq 0 ]]; then
+    echo "[wrong-floor] all capped $(date)" | tee -a "${DRIVER_LOG}"
+    break
+  fi
+done

LTA_openwebtext_dualt/scripts/watch_infer_owt_classic_fullvocab_len1024_lr2e4_gbs2048_latest_every1k_t1p45.sh ADDED Viewed

	@@ -0,0 +1,158 @@

+#!/usr/bin/env bash
+set -euo pipefail
+cd /e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt
+export PYTHONPATH="$(pwd)${PYTHONPATH:+:$PYTHONPATH}"
+export TOKENIZERS_PARALLELISM=false
+export PYTHONUNBUFFERED=1
+# Watch the 16-GPU OWT classic full-vocab len1024/lr2e-4/GBS2048 run.
+# The training command saves step_*.pt every 10k but latest.pt every 1k, so this
+# watcher snapshots stable latest.pt at each new 1k step before running infer.
+RUN_GLOB="${RUN_GLOB:-runs/lta_owt_classic_fullvocab_bert_c1024_len1024_lr2e4_gbs2048_2node8gpu_1m_save10k_*}"
+RUN_DIR="${RUN_DIR:-}"
+TOKENIZER_PATH="${TOKENIZER_PATH:-/e2e-data/evad-tech-vla/wanghan58/workspace/imagenet_handoff_20260327/nlp_dts_light/assets/distilbert-base-uncased/tokenizer.json}"
+SCORER="${SCORER:-/e2e-data/evad-tech-vla/wanghan58/models/flowtext_scorers/gpt2-large-standard}"
+CUDA_VISIBLE_DEVICES="${CUDA_VISIBLE_DEVICES:-0}"
+N_SAMPLES="${N_SAMPLES:-1024}"
+STEPS="${STEPS:-128}"
+CMAX="${CMAX:-1024}"
+TEMP="${TEMP:-1.45}"
+MAX_LEN="${MAX_LEN:-1024}"
+DECODE_BATCH="${DECODE_BATCH:-1}"
+SCORE_BATCH="${SCORE_BATCH:-1}"
+SCORE_MAX_LENGTH="${SCORE_MAX_LENGTH:-1024}"
+SLEEP_SECONDS="${SLEEP_SECONDS:-60}"
+STEP_INTERVAL="${STEP_INTERVAL:-1000}"
+DATE_TAG="${DATE_TAG:-$(date +%Y%m%d)}"
+TEMP_TAG="${TEMP//./p}"
+LOG_DIR="${LOG_DIR:-logs/owt_classic_fullvocab_len1024_lr2e4_gbs2048_infer_watch}"
+OUT_ROOT="${OUT_ROOT:-docs/lta_samples/metrics_${DATE_TAG}/owt_classic_fullvocab_len1024_lr2e4_gbs2048_latest_every1k_normal_steps_state_t${TEMP_TAG}_c${CMAX}_n${N_SAMPLES}}"
+mkdir -p "${LOG_DIR}" "${OUT_ROOT}"
+find_run_dir() {
+  if [[ -n "${RUN_DIR}" ]]; then
+    if [[ -d "${RUN_DIR}" ]]; then
+      printf '%s\n' "${RUN_DIR}"
+      return 0
+    fi
+    return 1
+  fi
+  shopt -s nullglob
+  local matches=( ${RUN_GLOB} )
+  shopt -u nullglob
+  if (( ${#matches[@]} == 0 )); then
+    return 1
+  fi
+  ls -td "${matches[@]}" 2>/dev/null | head -1
+}
+wait_for_stable_file() {
+  local path="$1"
+  local stat_a stat_b
+  stat_a="$(stat -c '%s:%Y' "${path}" 2>/dev/null || echo missing)"
+  sleep 20
+  stat_b="$(stat -c '%s:%Y' "${path}" 2>/dev/null || echo changed)"
+  [[ "${stat_a}" == "${stat_b}" && "${stat_a}" != "missing" ]]
+}
+read_ckpt_step() {
+  local ckpt="$1"
+  python - "$ckpt" <<'PY'
+import sys
+import torch
+ckpt = torch.load(sys.argv[1], map_location="cpu", weights_only=False)
+step = ckpt.get("step")
+if step is None:
+    raise SystemExit("checkpoint has no step")
+print(int(step))
+PY
+}
+echo "[watch-owt-len1024-lr2e4] run_glob=${RUN_GLOB}"
+echo "[watch-owt-len1024-lr2e4] explicit_run_dir=${RUN_DIR:-<auto>}"
+echo "[watch-owt-len1024-lr2e4] out_root=${OUT_ROOT}"
+echo "[watch-owt-len1024-lr2e4] decode=normal_steps_sweep steps=${STEPS} cmax=${CMAX} temp=${TEMP} final_from=state n=${N_SAMPLES} max_len=${MAX_LEN}"
+echo "[watch-owt-len1024-lr2e4] source=latest.pt snapshot_each=${STEP_INTERVAL} decode_batch=${DECODE_BATCH} score_batch=${SCORE_BATCH}"
+# Watch loop. On every poll: locate the run directory, wait until latest.pt is
+# stable, and when its step is a positive multiple of STEP_INTERVAL that has not
+# been processed yet, snapshot it and run the normal-steps sweep on the snapshot.
+# A step is appended to the processed file only after the evaluation exits 0.
+while true; do
+  # find_run_dir returns non-zero while no run exists; keep polling instead of failing.
+  current_run_dir="$(find_run_dir || true)"
+  if [[ -z "${current_run_dir}" ]]; then
+    echo "[watch-owt-len1024-lr2e4] $(date +%F_%T) waiting for matching run: ${RUN_GLOB}"
+    sleep "${SLEEP_SECONDS}"
+    continue
+  fi
+  run_stem="$(basename "${current_run_dir}")"
+  latest_ckpt="${current_run_dir}/latest.pt"
+  out_base="${OUT_ROOT}/${run_stem}"
+  # One ledger per run and decode configuration; each line is "<run_dir>:step_<NNNNNNN>".
+  processed_file="${LOG_DIR}/processed_${run_stem}_steps${STEPS}_c${CMAX}_t${TEMP_TAG}_n${N_SAMPLES}.txt"
+  snapshot_dir="${current_run_dir}/latest_snapshots_1k"
+  mkdir -p "${out_base}" "${LOG_DIR}" "${snapshot_dir}"
+  touch "${processed_file}"
+  if [[ ! -f "${latest_ckpt}" ]]; then
+    echo "[watch-owt-len1024-lr2e4] $(date +%F_%T) run=${run_stem} no latest.pt yet"
+    sleep "${SLEEP_SECONDS}"
+    continue
+  fi
+  if ! wait_for_stable_file "${latest_ckpt}"; then
+    echo "[watch-owt-len1024-lr2e4] $(date +%F_%T) latest.pt not stable yet"
+    sleep "${SLEEP_SECONDS}"
+    continue
+  fi
+  # Reading the step can fail or print something that is not an integer; treat
+  # that as "try again later" rather than feeding it to the arithmetic below.
+  if ! step_num="$(read_ckpt_step "${latest_ckpt}")" || [[ ! "${step_num}" =~ ^-?[0-9]+$ ]]; then
+    echo "[watch-owt-len1024-lr2e4] $(date +%F_%T) could not read step from latest.pt; retrying" >&2
+    sleep "${SLEEP_SECONDS}"
+    continue
+  fi
+  if (( step_num <= 0 || step_num % STEP_INTERVAL != 0 )); then
+    echo "[watch-owt-len1024-lr2e4] $(date +%F_%T) latest step=${step_num}; waiting for multiple of ${STEP_INTERVAL}"
+    sleep "${SLEEP_SECONDS}"
+    continue
+  fi
+  step="$(printf '%07d' "${step_num}")"
+  snapshot="${snapshot_dir}/step_${step}.pt"
+  processed_key="${current_run_dir}:step_${step}"
+  if grep -Fxq -- "${processed_key}" "${processed_file}"; then
+    sleep "${SLEEP_SECONDS}"
+    continue
+  fi
+  if [[ ! -f "${snapshot}" ]]; then
+    # Copy to a temporary name first so a partial copy never appears under the
+    # final snapshot name.
+    tmp_snapshot="${snapshot}.tmp.$$"
+    echo "[watch-owt-len1024-lr2e4] $(date +%F_%T) snapshot latest step_${step} -> ${snapshot}"
+    if ! { cp --reflink=auto "${latest_ckpt}" "${tmp_snapshot}" 2>/dev/null || cp "${latest_ckpt}" "${tmp_snapshot}"; }; then
+      echo "[watch-owt-len1024-lr2e4] $(date +%F_%T) failed to copy latest.pt for step_${step}; retrying" >&2
+      rm -f -- "${tmp_snapshot}"
+      sleep "${SLEEP_SECONDS}"
+      continue
+    fi
+    # latest.pt may have been rewritten between reading its step and copying it.
+    # Re-read the step from the copy and discard the copy unless it matches.
+    snapshot_step="$(read_ckpt_step "${tmp_snapshot}" || true)"
+    if [[ "${snapshot_step}" != "${step_num}" ]]; then
+      echo "[watch-owt-len1024-lr2e4] $(date +%F_%T) latest.pt changed while snapshotting step_${step} (copy step=${snapshot_step:-unreadable}); retrying" >&2
+      rm -f -- "${tmp_snapshot}"
+      sleep "${SLEEP_SECONDS}"
+      continue
+    fi
+    mv "${tmp_snapshot}" "${snapshot}"
+  fi
+  out_dir="${out_base}/step_${step}"
+  log_file="${LOG_DIR}/infer_${run_stem}_step_${step}_t${TEMP_TAG}.log"
+  mkdir -p "${out_dir}"
+  echo "[watch-owt-len1024-lr2e4] $(date +%F_%T) infer ${snapshot} -> ${out_dir}" | tee -a "${log_file}"
+  # The pipeline's own status can be tee's, so take the evaluator's exit code from
+  # PIPESTATUS. Running it inside the `if` condition also keeps a failure from
+  # terminating the watcher when errexit is enabled.
+  if
+    CUDA_VISIBLE_DEVICES="${CUDA_VISIBLE_DEVICES}" python scripts/eval_owt_normal_steps_sweep_20260515.py \
+      --checkpoint "${snapshot}" \
+      --tokenizer_path "${TOKENIZER_PATH}" \
+      --scorer "${SCORER}" \
+      --out_dir "${out_dir}" \
+      --steps_list "${STEPS}" \
+      --cmax_list "${CMAX}" \
+      --endpoint_temps "${TEMP}" \
+      --n_samples "${N_SAMPLES}" \
+      --max_len "${MAX_LEN}" \
+      --decode_batch "${DECODE_BATCH}" \
+      --score_batch "${SCORE_BATCH}" \
+      --score_max_length "${SCORE_MAX_LENGTH}" \
+      --detokenizer none \
+      --seed 20260521 \
+      --save_samples 16 \
+      2>&1 | tee -a "${log_file}"
+    infer_status="${PIPESTATUS[0]}"
+    [[ "${infer_status}" -eq 0 ]]
+  then
+    echo "${processed_key}" >> "${processed_file}"
+    echo "[watch-owt-len1024-lr2e4] $(date +%F_%T) done step_${step}" | tee -a "${log_file}"
+  else
+    # Leave the step out of the ledger so it is attempted again while latest.pt
+    # still holds it.
+    echo "[watch-owt-len1024-lr2e4] $(date +%F_%T) inference failed for step_${step} (exit ${infer_status}); not marking processed" | tee -a "${log_file}" >&2
+  fi
+  sleep "${SLEEP_SECONDS}"
+done