JinghuiLuAstronaut commited on
Commit
0241b9f
·
verified ·
1 Parent(s): 36dad47

Add files using upload-large-folder tool

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. LTA_openwebtext_dualt/logs/basinbankmix_debug/lta_owt_basinbankmix_smoke4gpu_20260514_152205.log +111 -0
  2. LTA_openwebtext_dualt/logs/eval_lm1b_200k_methods_genppl_20260506_rerun.log +65 -0
  3. LTA_openwebtext_dualt/logs/infer_owt_t5_rollin450k_dualline_state_steps128_n16_20260522.log +51 -0
  4. LTA_openwebtext_dualt/logs/lta_lm1b_classic_dirichlet_len512_gbs512_4gpu_10k_save1k_20260523.watch.pid +1 -0
  5. LTA_openwebtext_dualt/logs/lta_lm1b_dirichlet_categorical_fullvocab_c4p0_dualt_flmpack_onehot_hardce_ddit_small_len128_gbs512_8gpu_1m_nw0.log +0 -0
  6. LTA_openwebtext_dualt/logs/lta_lm1b_duo_aligned_dirichlet_true_dualtline_flmpack_onehot_hardce_ddit_small_len128_gbs512_4xh20_1m.log +0 -0
  7. LTA_openwebtext_dualt/logs/lta_owt_c1024_len1024_gaussianflm_gbs128_4gpu_2k_resume.log +64 -0
  8. LTA_openwebtext_dualt/logs/lta_owt_gpt2cached_len1024_ddit768x12_elfopt_only_muon_ema_gbs512_8gpu_1m_20260513_025959.log +0 -0
  9. LTA_openwebtext_dualt/logs/lta_owt_gpt2cached_len1024_rollout1_p1_smoke4gpu_nofind_20260513_144753.log +225 -0
  10. LTA_openwebtext_dualt/logs/lta_owt_t5elf_absrope_time4_dirichlet_len1025_C1_to_1024_prebos_mask1_sameT_gbs512_b32_8gpu_1m_save10k_20260526_watcher.pid +1 -0
  11. LTA_openwebtext_dualt/logs/lta_owt_t5elf_dirichlet_len1024_Cv_to_2v_mask1_gbs512_b32_4gpu_20k_save1k_20260525.log +197 -0
  12. LTA_openwebtext_dualt/logs/owt_fully_categorical_gate_probe_step116k_n8.log +44 -0
  13. LTA_openwebtext_dualt/logs/softendpoint_mn_pilot_4gpu/pilot_softendpoint_mn_n128_m8_16_32_onehot_20260516_softendpoint_mn_pilot.log +286 -0
  14. LTA_openwebtext_dualt/logs/softendpoint_mn_pilot_4gpu/train8_combo_len256_logistic_unigram_shared_C1024_20260517_170456.log +423 -0
  15. LTA_openwebtext_dualt/logs/softendpoint_mn_pilot_4gpu/train8_ctx1024_core_p50_unif0_0p25_outwdm1_ctx1024_core_tradeoff_dual_20260517_230929.log +0 -0
  16. LTA_openwebtext_dualt/logs/softendpoint_mn_pilot_4gpu/train8_ctx1024_p50_unif0_0p25_outwdm1_ctx1024_sampleds_wide_20260517_220321.log +0 -0
  17. LTA_openwebtext_dualt/logs/softendpoint_mn_pilot_4gpu/train8_ctx1024_path_p50_path3_unif0_0p25_outwdm1_ctx1024_path_tradeoff_sde_20260517_232950.log +617 -0
  18. LTA_openwebtext_dualt/logs/softendpoint_mn_pilot_4gpu/train8_ctx1024_randk_p50_path3_rand1_3_unif0_0p25_ctx1024_uniformt_temp1_randk_20260518_010217.log +827 -0
  19. LTA_openwebtext_dualt/logs/softendpoint_mn_pilot_4gpu/train8_ctx1024_t5tok_p35_rand0_4_unif0_0p25_outwdm1_t5tok_ctx1024_randk_20260518_014800.log +1026 -0
  20. LTA_openwebtext_dualt/logs/softendpoint_mn_pilot_4gpu/train8_ctx1024_t5tok_p50_rand0_3_unif0_0p25_outwdm1_t5tok_ctx1024_k03_20260518_022728.log +620 -0
  21. LTA_openwebtext_dualt/logs/softendpoint_mn_pilot_4gpu/train8_ctx1024_t5tok_p50_rand0_4_unif0_0p25_outwdm1_t5tok_ctx1024_randk_20260518_014436.log +201 -0
  22. LTA_openwebtext_dualt/logs/softendpoint_mn_pilot_4gpu/train8_n1024_linear_soft_kl_onehot_20260517_train8_overfit.log +316 -0
  23. LTA_openwebtext_dualt/logs/softendpoint_mn_pilot_4gpu/train8_n128_compactv584_3l_bs512_hard_ce_onehot.log +0 -0
  24. LTA_openwebtext_dualt/logs/softendpoint_mn_pilot_4gpu/train8_n256_compactv969_3l_bs512_hard_ce_allcorrupt.log +0 -0
  25. LTA_openwebtext_dualt/logs/softendpoint_mn_pilot_4gpu/train8_n512_compactv1635_3l_bs512_hard_ce_onehot.log +0 -0
  26. LTA_openwebtext_dualt/logs/softendpoint_mn_pilot_4gpu/train8_n64_compactv335_3l_hard_ce_allcorrupt.log +0 -0
  27. LTA_openwebtext_dualt/logs/softendpoint_mn_pilot_4gpu/train8_n8_allcorrupt_linear_soft_kl_20260517_train8ctx8_allcorrupt.log +326 -0
  28. LTA_openwebtext_dualt/logs/softendpoint_mn_pilot_4gpu/train8_n8_compactv47_3l_linear_soft_kl_allcorrupt.log +0 -0
  29. LTA_openwebtext_dualt/logs/softendpoint_mn_pilot_4gpu/train8_n8_hard_ce_bridge_20260517_train8ctx8_overfit.log +326 -0
  30. LTA_openwebtext_dualt/logs/softendpoint_mn_pilot_4gpu/train8_noisegeo_len256_allcorrupt_fullvocab_dirC1_1024_20260517_163805.log +0 -0
  31. LTA_openwebtext_dualt/logs/softendpoint_mn_pilot_4gpu/train8_noisegeo_len256_allcorrupt_seqrand1_dirC1_1024_20260517_163805.log +989 -0
  32. LTA_openwebtext_dualt/logs/softendpoint_mn_pilot_4gpu/train8_noisegeo_len256_allcorrupt_seqrand1_highC64_4096_20260517_163805.log +791 -0
  33. LTA_openwebtext_dualt/logs/softendpoint_mn_pilot_4gpu/train8_rollin_len256_rollin_p10_s4_i32_20260517_171654.log +396 -0
  34. LTA_openwebtext_dualt/logs/softendpoint_mn_pilot_4gpu/train8_rollin_len256_rollin_p25_s8_i64_20260517_171654.log +198 -0
  35. LTA_openwebtext_dualt/logs/softendpoint_mn_pilot_4gpu/train8_rollin_synct_len256_synct_p50_s4_i32_20260517_1800synct.log +194 -0
  36. LTA_openwebtext_dualt/logs/wmt14_deen_elfofficial_4gpu_debug/debug_wmt14_deen_elfofficial_t5_len128_in64_dditelf_muon_logitnormal_m1p5_s0p8_none_floor0p0_zeroout_4gpu_20260516_020150.log +298 -0
  37. LTA_openwebtext_dualt/logs/wmt14_deen_elfofficial_4gpu_debug/debug_wmt14_deen_elfofficial_t5_len256_in128_dditelf_muon_logitnormal_m1p5_s0p8_none_floor0p0_zeroout_4gpu_20260516_014755.log +326 -0
  38. LTA_openwebtext_dualt/logs/wmt14_deen_elfofficial_4gpu_debug/debug_wmt14_deen_elfofficial_t5_len256_in128_dditelf_muon_logitnormal_m1p5_s0p8_none_floor0p0_zeroout_4gpu_20260516_015703.log +326 -0
  39. LTA_openwebtext_dualt/mini_owt_fit/model.py +121 -0
  40. LTA_openwebtext_dualt/mini_owt_fit/run_standard_owt_t5_8gpu.sh +60 -0
  41. LTA_openwebtext_dualt/mini_owt_logdirichlet/.venv_qwen35_uv/lib/python3.12/site-packages/hf_xet-1.5.0.dist-info/licenses/LICENSE +201 -0
  42. LTA_openwebtext_dualt/mini_owt_logdirichlet/.venv_qwen35_uv/lib/python3.12/site-packages/hf_xet-1.5.0.dist-info/sboms/hf_xet.cyclonedx.json +0 -0
  43. LTA_openwebtext_dualt/mini_owt_logdirichlet/.venv_qwen35_uv/lib/python3.12/site-packages/pygments-2.20.0.dist-info/WHEEL +4 -0
  44. LTA_openwebtext_dualt/mini_owt_logdirichlet/.venv_qwen35_uv/lib/python3.12/site-packages/typer/__init__.py +39 -0
  45. LTA_openwebtext_dualt/mini_owt_logdirichlet/.venv_qwen35_uv/lib/python3.12/site-packages/typer/__main__.py +3 -0
  46. LTA_openwebtext_dualt/mini_owt_logdirichlet/.venv_qwen35_uv/lib/python3.12/site-packages/typer/_completion_classes.py +199 -0
  47. LTA_openwebtext_dualt/mini_owt_logdirichlet/.venv_qwen35_uv/lib/python3.12/site-packages/typer/_completion_shared.py +252 -0
  48. LTA_openwebtext_dualt/mini_owt_logdirichlet/.venv_qwen35_uv/lib/python3.12/site-packages/typer/_types.py +27 -0
  49. LTA_openwebtext_dualt/mini_owt_logdirichlet/.venv_qwen35_uv/lib/python3.12/site-packages/typer/_typing.py +73 -0
  50. LTA_openwebtext_dualt/mini_owt_logdirichlet/.venv_qwen35_uv/lib/python3.12/site-packages/typer/cli.py +318 -0
LTA_openwebtext_dualt/logs/basinbankmix_debug/lta_owt_basinbankmix_smoke4gpu_20260514_152205.log ADDED
@@ -0,0 +1,111 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "device": "cuda:0",
3
+ "rank": 0,
4
+ "world_size": 4,
5
+ "samples": "owt_cached_chunks:8734897",
6
+ "vocab_size": 50257,
7
+ "tokenizer_vocab_size": 50257,
8
+ "save_dir": "runs_debug/lta_owt_basinbankmix_smoke4gpu_20260514_152205",
9
+ "batch_size": 4,
10
+ "grad_accum": 1,
11
+ "effective_batch_size": 16,
12
+ "global_batch_size": 16,
13
+ "lr_schedule": "constant_warmup",
14
+ "optimizer": "adamw",
15
+ "warmup_steps": 1,
16
+ "min_lr": 6e-05,
17
+ "weight_decay": 0.0,
18
+ "adamw_param_groups": "nanogpt",
19
+ "adam_beta1": 0.9,
20
+ "adam_beta2": 0.95,
21
+ "adam_eps": 1e-08,
22
+ "muon_momentum": 0.95,
23
+ "muon_ns_steps": 5,
24
+ "muon_update_scale": 1.0,
25
+ "ema_decay": 0.0,
26
+ "ema_start_step": 0,
27
+ "model_type": "ddit",
28
+ "dual_t": true,
29
+ "corrupt_t_mode": "same",
30
+ "corrupt_min_t": null,
31
+ "corrupt_max_t": null,
32
+ "prefix_block_prob": 0.0,
33
+ "prefix_block_len": 128,
34
+ "mask_ratio_floor_schedule": "none",
35
+ "dirichlet_endpoint_mode": "categorical_dual_t",
36
+ "dirichlet_semantic_t_mode": "same",
37
+ "dirichlet_semantic_t_value": 0.0,
38
+ "dirichlet_semantic_t_curve": "linear",
39
+ "dirichlet_semantic_t_power": 1.0,
40
+ "endpoint_sequence_random_prob_alpha": 0.0,
41
+ "categorical_wrong_from_full_vocab": false,
42
+ "categorical_wrong_from_batch_valid_tokens": false,
43
+ "categorical_wrong_basin_token_ids": "284,326,340,307,11,314,1949,262,2808,1295,318,12,423,13,2608,290,1842,198,356,2681,642,34,1011,2310,287,286,1577,3853,352,345,467,25,1596,2579,257,678,2555,4747,484,513,16,787,1679,606,460,1110,1542,18,1498,362,357,767,2107,1026,779,17,373,9752,2077,35,1271,532,860,1394",
44
+ "categorical_wrong_basin_prob": 0.0,
45
+ "categorical_wrong_unigram_prob": 0.3,
46
+ "categorical_wrong_uniform_prob": 0.6,
47
+ "categorical_wrong_basin_shared_prob": 0.1,
48
+ "mask_mixture_original_prob": 0.0,
49
+ "mask_mixture_lowk_prob": 0.0,
50
+ "mask_mixture_lowcorrupt_prob": 0.0,
51
+ "mask_mixture_block_prob": 0.0,
52
+ "mask_mixture_all_prob": 0.0,
53
+ "mask_mixture_lowk_clean_tokens": "1,2,4,8,16,32,64",
54
+ "mask_mixture_lowcorrupt_tokens": "1,2,4,8,16,32,64",
55
+ "mask_mixture_block_tokens": "64,128",
56
+ "simplex_bridge_sampler": "dirichlet",
57
+ "logistic_normal_sigma_min": 0.18,
58
+ "logistic_normal_sigma_max": 2.2,
59
+ "logistic_normal_tau_min": 0.65,
60
+ "logistic_normal_tau_max": 1.15,
61
+ "torch_compile": false,
62
+ "compile_mode": "max-autotune",
63
+ "state_format": "prob",
64
+ "target_loss": "hard_ce",
65
+ "meanflow_weight": 0.0,
66
+ "rollout_train_prob": 0.0,
67
+ "rollout_train_steps": 1,
68
+ "rollout_train_infer_steps": 64,
69
+ "rollout_train_temp": 1.45,
70
+ "rollout_train_max_gamma": 1.0,
71
+ "rollout_train_corrupt_only": true,
72
+ "rollout_train_samplewise": false,
73
+ "rollout_train_compute_always": false,
74
+ "bridge_noise_init": "logistic_normal",
75
+ "noise_sigma": -1.0,
76
+ "allow_tf32": true,
77
+ "activation_checkpointing": false,
78
+ "activation_checkpoint_interval": 1,
79
+ "activation_checkpoint_scope": "block",
80
+ "ddp_static_graph": false,
81
+ "ddp_gradient_as_bucket_view": true,
82
+ "blocking_data_transfer": false,
83
+ "dataloader_prefetch_factor": 2,
84
+ "full_train_stats": false,
85
+ "record_pad_truncate": false,
86
+ "record_add_eos": false,
87
+ "record_add_special_tokens": false,
88
+ "record_pad_token": "pad",
89
+ "record_shuffle_buffer": 10000,
90
+ "wrap": true,
91
+ "wrap_mode": "stream",
92
+ "wrap_record_buffer_size": 200,
93
+ "owt_cached_chunks": true,
94
+ "owt_chunk_cache_dir": "/e2e-data/evad-tech-vla/wanghan58/data/small_benchmarks/langflow_2604_11748/openwebtext_lta_cached_chunks/gpt2_len1024_train_minus_100k",
95
+ "owt_chunk_cache_rebuild": false,
96
+ "owt_chunk_cache_write_batch": 4096,
97
+ "owt_exact_repeat_per_chunk": 0,
98
+ "online_chunk_shuffle": false,
99
+ "online_chunk_shuffle_buffer": 10000,
100
+ "openwebtext_split": "train_minus_100k",
101
+ "detokenizer": "auto",
102
+ "resolved_detokenizer": null,
103
+ "num_workers": 0,
104
+ "latest_every": 1000,
105
+ "resume_path": ""
106
+ }
107
+ step=1 micro_steps=1 elapsed=1.3s lr=3.000000e-04 loss=10.8125 loss_recon=10.8125 loss_meanflow=0.0000 mean_model_t=0.2654 mean_corrupt_t=0.2654 mean_loss_t_weight=1.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 out_w_norm=0.0000 out_g_norm=0.8875 acc_all=0.0010 acc_corrupt=0.0008 corrupt_frac=0.5942 loss_all=10.8125 loss_corrupt=10.8125 acc_corrupt_t_0p0_0p2=0.0000 corrupt_frac_t_0p0_0p2=0.3365 acc_corrupt_t_0p2_0p4=0.0016 corrupt_frac_t_0p2_0p4=0.2494 acc_corrupt_t_0p6_0p8=0.0010 corrupt_frac_t_0p6_0p8=0.4141 wrong_frac=0.6315 init_acc_corrupt=0.3591 init_gold_top10=0.3632 init_gold_top100=0.3973
108
+ step=2 micro_steps=2 elapsed=0.1s lr=3.000000e-04 loss=10.7812 loss_recon=10.7812 loss_meanflow=0.0000 mean_model_t=0.3494 mean_corrupt_t=0.3494 mean_loss_t_weight=1.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 out_w_norm=1.7929 out_g_norm=1.1412 acc_all=0.2507 acc_corrupt=0.1279 corrupt_frac=0.5098 loss_all=10.7415 loss_corrupt=10.7812 acc_corrupt_t_0p0_0p2=0.0110 corrupt_frac_t_0p0_0p2=0.3491 acc_corrupt_t_0p2_0p4=0.1098 corrupt_frac_t_0p2_0p4=0.2486 acc_corrupt_t_0p6_0p8=0.2405 corrupt_frac_t_0p6_0p8=0.4023 wrong_frac=0.6140 init_acc_corrupt=0.3348 init_gold_top10=0.3740 init_gold_top100=0.4148
109
+ step=3 micro_steps=3 elapsed=0.1s lr=3.000000e-04 loss=10.7211 loss_recon=10.7211 loss_meanflow=0.0000 mean_model_t=0.2648 mean_corrupt_t=0.2648 mean_loss_t_weight=1.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 out_w_norm=3.2734 out_g_norm=1.3191 acc_all=0.4695 acc_corrupt=0.1768 corrupt_frac=0.3909 loss_all=10.6144 loss_corrupt=10.7211 acc_corrupt_t_0p0_0p2=0.0632 corrupt_frac_t_0p0_0p2=0.2274 acc_corrupt_t_0p2_0p4=0.1393 corrupt_frac_t_0p2_0p4=0.3810 acc_corrupt_t_0p4_0p6=0.2791 corrupt_frac_t_0p4_0p6=0.3916 wrong_frac=0.7046 init_acc_corrupt=0.2142 init_gold_top10=0.2911 init_gold_top100=0.3392
110
+ step=4 micro_steps=4 elapsed=0.1s lr=3.000000e-04 loss=10.4237 loss_recon=10.4237 loss_meanflow=0.0000 mean_model_t=0.5641 mean_corrupt_t=0.5641 mean_loss_t_weight=1.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 out_w_norm=4.6708 out_g_norm=1.8258 acc_all=0.4395 acc_corrupt=0.3441 corrupt_frac=0.6514 loss_all=10.3969 loss_corrupt=10.4237 acc_corrupt_t_0p0_0p2=0.0599 corrupt_frac_t_0p0_0p2=0.2691 acc_corrupt_t_0p4_0p6=0.2872 corrupt_frac_t_0p4_0p6=0.3171 acc_corrupt_t_0p6_0p8=0.4082 corrupt_frac_t_0p6_0p8=0.1001 acc_corrupt_t_0p8_1p0=0.6249 corrupt_frac_t_0p8_1p0=0.3137 wrong_frac=0.4258 init_acc_corrupt=0.5315 init_gold_top10=0.5645 init_gold_top100=0.6008
111
+ step=5 micro_steps=5 elapsed=0.1s lr=3.000000e-04 loss=10.2527 loss_recon=10.2527 loss_meanflow=0.0000 mean_model_t=0.6118 mean_corrupt_t=0.6118 mean_loss_t_weight=1.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 out_w_norm=5.9812 out_g_norm=2.4594 acc_all=0.2424 acc_corrupt=0.1392 corrupt_frac=0.5701 loss_all=10.2071 loss_corrupt=10.2527 acc_corrupt_t_0p2_0p4=0.0524 corrupt_frac_t_0p2_0p4=0.3842 acc_corrupt_t_0p6_0p8=0.1933 corrupt_frac_t_0p6_0p8=0.6158 wrong_frac=0.4313 init_acc_corrupt=0.5388 init_gold_top10=0.5687 init_gold_top100=0.5704
LTA_openwebtext_dualt/logs/eval_lm1b_200k_methods_genppl_20260506_rerun.log ADDED
@@ -0,0 +1,65 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [load] mdlm_200k step=200000 ckpt=runs/lm1b_mdlm_unified_ddit_small_len128_gbs512_8gpu_1m_20260505_repro/step_0200000.pt
2
+ [mdlm] generated 16/256
3
+ [mdlm] generated 32/256
4
+ [mdlm] generated 48/256
5
+ [mdlm] generated 64/256
6
+ [mdlm] generated 80/256
7
+ [mdlm] generated 96/256
8
+ [mdlm] generated 112/256
9
+ [mdlm] generated 128/256
10
+ [mdlm] generated 144/256
11
+ [mdlm] generated 160/256
12
+ [mdlm] generated 176/256
13
+ [mdlm] generated 192/256
14
+ [mdlm] generated 208/256
15
+ [mdlm] generated 224/256
16
+ [mdlm] generated 240/256
17
+ [mdlm] generated 256/256
18
+ [summary] {"type": "summary", "name": "mdlm_200k", "kind": "mdlm", "checkpoint": "runs/lm1b_mdlm_unified_ddit_small_len128_gbs512_8gpu_1m_20260505_repro/step_0200000.pt", "step": 200000, "decode": {"kind": "mdlm", "steps": 256, "decode_rule": "confidence_unmask", "start": "all_mask", "n_samples": 256, "seed": 20260506}, "raw_genppl": {"ppl": 7.612718874390907, "nll_per_token": 2.0298203845952183, "tokens": 38144, "kept_samples": 256, "total_samples": 256, "empty_rate": 0.0, "skipped_samples": 0}, "stripped_genppl": {"ppl": 9.005877695089472, "nll_per_token": 2.19787744140625, "tokens": 32000, "kept_samples": 256, "total_samples": 256, "empty_rate": 0.0, "skipped_samples": 0}, "diversity": {"sample_entropy": 3.3737493108092327, "unique_tokens": 50, "token_count": 32768, "distinct_1": 0.00152587890625, "distinct_2": 0.0022145669291338582, "top_token_mass": 0.1875}}
19
+ [load] duo_200k step=200000 ckpt=runs/lm1b_duo_unified_ddit_small_len128_gbs512_8gpu_1m_20260505_repro/step_0200000.pt
20
+ Traceback (most recent call last):
21
+ File "/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/scripts/eval_lm1b_200k_methods_genppl_20260506.py", line 587, in <module>
22
+ main()
23
+ File "/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/scripts/eval_lm1b_200k_methods_genppl_20260506.py", line 538, in main
24
+ ids, texts, decode = decode_dense_baseline(
25
+ ^^^^^^^^^^^^^^^^^^^^^^
26
+ File "/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/scripts/eval_lm1b_200k_methods_genppl_20260506.py", line 232, in decode_dense_baseline
27
+ logits = model(state_for_model(model, probs, eps), t, attn).float()
28
+ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
29
+ File "/usr/local/lib/python3.12/dist-packages/torch/nn/modules/module.py", line 1739, in _wrapped_call_impl
30
+ return self._call_impl(*args, **kwargs)
31
+ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
32
+ File "/usr/local/lib/python3.12/dist-packages/torch/nn/modules/module.py", line 1750, in _call_impl
33
+ return forward_call(*args, **kwargs)
34
+ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
35
+ File "/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/flowtext_lab/model.py", line 302, in forward
36
+ x = block(x, rotary_cos_sin, c)
37
+ ^^^^^^^^^^^^^^^^^^^^^^^^^^^
38
+ File "/usr/local/lib/python3.12/dist-packages/torch/nn/modules/module.py", line 1739, in _wrapped_call_impl
39
+ return self._call_impl(*args, **kwargs)
40
+ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
41
+ File "/usr/local/lib/python3.12/dist-packages/torch/nn/modules/module.py", line 1750, in _call_impl
42
+ return forward_call(*args, **kwargs)
43
+ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
44
+ File "/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/flowtext_lab/model.py", line 187, in forward
45
+ x = x + gate_mlp * F.dropout(self.mlp(h), p=self.dropout, training=self.training)
46
+ ^^^^^^^^^^^
47
+ File "/usr/local/lib/python3.12/dist-packages/torch/nn/modules/module.py", line 1739, in _wrapped_call_impl
48
+ return self._call_impl(*args, **kwargs)
49
+ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
50
+ File "/usr/local/lib/python3.12/dist-packages/torch/nn/modules/module.py", line 1750, in _call_impl
51
+ return forward_call(*args, **kwargs)
52
+ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
53
+ File "/usr/local/lib/python3.12/dist-packages/torch/nn/modules/container.py", line 250, in forward
54
+ input = module(input)
55
+ ^^^^^^^^^^^^^
56
+ File "/usr/local/lib/python3.12/dist-packages/torch/nn/modules/module.py", line 1739, in _wrapped_call_impl
57
+ return self._call_impl(*args, **kwargs)
58
+ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
59
+ File "/usr/local/lib/python3.12/dist-packages/torch/nn/modules/module.py", line 1750, in _call_impl
60
+ return forward_call(*args, **kwargs)
61
+ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
62
+ File "/usr/local/lib/python3.12/dist-packages/torch/nn/modules/linear.py", line 125, in forward
63
+ return F.linear(input, self.weight, self.bias)
64
+ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
65
+ torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 12.00 MiB. GPU 0 has a total capacity of 95.22 GiB of which 9.88 MiB is free. Process 746240 has 19.59 GiB memory in use. Process 1128627 has 75.61 GiB memory in use. Of the allocated memory 74.90 GiB is allocated by PyTorch, and 294.45 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
LTA_openwebtext_dualt/logs/infer_owt_t5_rollin450k_dualline_state_steps128_n16_20260522.log ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [ckpt] runs/lta_owt_t5_adaln_adamw_wd0p1_rollin_p50_randk0_3_uniformt_temp1_synct_gbs512_2node8gpu_1m_t-20260518224737-tftgw/step_0450000.pt step=450000
2
+ [decode-base] n=16 max_len=1024 steps=128 model_t=post
3
+ [decode-time] schedule=linear s=[0.0,0.25] gumbel=(2.2,0.8) force_final=True t0=0.000000 t_mid=0.500000 t_end=1.000000 dt_mean=0.007812 dt_max=0.007812
4
+ Traceback (most recent call last):
5
+ File "/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/scripts/standard_genppl_entropy_latest_decode.py", line 826, in <module>
6
+ main()
7
+ File "/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/scripts/standard_genppl_entropy_latest_decode.py", line 673, in main
8
+ ids, raw_texts = decode_samples(
9
+ ^^^^^^^^^^^^^^^
10
+ File "/usr/local/lib/python3.12/dist-packages/torch/utils/_contextlib.py", line 116, in decorate_context
11
+ return func(*args, **kwargs)
12
+ ^^^^^^^^^^^^^^^^^^^^^
13
+ File "/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/scripts/standard_genppl_entropy_latest_decode.py", line 354, in decode_samples
14
+ probs = make_started_state(
15
+ ^^^^^^^^^^^^^^^^^^^
16
+ File "/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/scripts/standard_genppl_entropy_latest_decode.py", line 264, in make_started_state
17
+ return sample_noise_simplex(
18
+ ^^^^^^^^^^^^^^^^^^^^^
19
+ File "/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/flowtext_lab/decode.py", line 32, in sample_noise_simplex
20
+ sigma = resolve_noise_sigma(vocab_size, target_prob, noise_sigma)
21
+ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
22
+ File "/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/flowtext_lab/decode.py", line 16, in resolve_noise_sigma
23
+ return float(margin_for_target_prob(vocab_size, target_prob))
24
+ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
25
+ File "/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/flowtext_lab/bridges.py", line 161, in margin_for_target_prob
26
+ raise ValueError(f"target_prob must be in (0, 1), got {q}")
27
+ ValueError: target_prob must be in (0, 1), got 1.0
28
+ [ckpt] runs/lta_owt_t5_adaln_adamw_wd0p1_rollin_p50_randk0_3_uniformt_temp1_synct_gbs512_2node8gpu_1m_t-20260518224737-tftgw/step_0450000.pt step=450000
29
+ [decode-base] n=16 max_len=1024 steps=128 model_t=post
30
+ [decode-time] schedule=linear s=[0.0,0.25] gumbel=(2.2,0.8) force_final=True t0=0.000000 t_mid=0.500000 t_end=1.000000 dt_mean=0.007812 dt_max=0.007812
31
+ [decode] temp=1.00 final=state rule=dual_line_resample support=1 semantic=1 anchor=state cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=argmax/1/k64/p0.95 freq_penalty=0/0/0 start_t=0 start_init=noise time_path=0.0000->1.0000 generated 4/16
32
+ [decode] temp=1.00 final=state rule=dual_line_resample support=1 semantic=1 anchor=state cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=argmax/1/k64/p0.95 freq_penalty=0/0/0 start_t=0 start_init=noise time_path=0.0000->1.0000 generated 8/16
33
+ [decode] temp=1.00 final=state rule=dual_line_resample support=1 semantic=1 anchor=state cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=argmax/1/k64/p0.95 freq_penalty=0/0/0 start_t=0 start_init=noise time_path=0.0000->1.0000 generated 12/16
34
+ [decode] temp=1.00 final=state rule=dual_line_resample support=1 semantic=1 anchor=state cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=argmax/1/k64/p0.95 freq_penalty=0/0/0 start_t=0 start_init=noise time_path=0.0000->1.0000 generated 16/16
35
+ [decode] temp=1.30 final=state rule=dual_line_resample support=1 semantic=1 anchor=state cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=argmax/1/k64/p0.95 freq_penalty=0/0/0 start_t=0 start_init=noise time_path=0.0000->1.0000 generated 4/16
36
+ [decode] temp=1.30 final=state rule=dual_line_resample support=1 semantic=1 anchor=state cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=argmax/1/k64/p0.95 freq_penalty=0/0/0 start_t=0 start_init=noise time_path=0.0000->1.0000 generated 8/16
37
+ [decode] temp=1.30 final=state rule=dual_line_resample support=1 semantic=1 anchor=state cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=argmax/1/k64/p0.95 freq_penalty=0/0/0 start_t=0 start_init=noise time_path=0.0000->1.0000 generated 12/16
38
+ [decode] temp=1.30 final=state rule=dual_line_resample support=1 semantic=1 anchor=state cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=argmax/1/k64/p0.95 freq_penalty=0/0/0 start_t=0 start_init=noise time_path=0.0000->1.0000 generated 16/16
39
+ [decode] temp=1.45 final=state rule=dual_line_resample support=1 semantic=1 anchor=state cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=argmax/1/k64/p0.95 freq_penalty=0/0/0 start_t=0 start_init=noise time_path=0.0000->1.0000 generated 4/16
40
+ [decode] temp=1.45 final=state rule=dual_line_resample support=1 semantic=1 anchor=state cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=argmax/1/k64/p0.95 freq_penalty=0/0/0 start_t=0 start_init=noise time_path=0.0000->1.0000 generated 8/16
41
+ [decode] temp=1.45 final=state rule=dual_line_resample support=1 semantic=1 anchor=state cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=argmax/1/k64/p0.95 freq_penalty=0/0/0 start_t=0 start_init=noise time_path=0.0000->1.0000 generated 12/16
42
+ [decode] temp=1.45 final=state rule=dual_line_resample support=1 semantic=1 anchor=state cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=argmax/1/k64/p0.95 freq_penalty=0/0/0 start_t=0 start_init=noise time_path=0.0000->1.0000 generated 16/16
43
+ [decode] temp=1.60 final=state rule=dual_line_resample support=1 semantic=1 anchor=state cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=argmax/1/k64/p0.95 freq_penalty=0/0/0 start_t=0 start_init=noise time_path=0.0000->1.0000 generated 4/16
44
+ [decode] temp=1.60 final=state rule=dual_line_resample support=1 semantic=1 anchor=state cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=argmax/1/k64/p0.95 freq_penalty=0/0/0 start_t=0 start_init=noise time_path=0.0000->1.0000 generated 8/16
45
+ [decode] temp=1.60 final=state rule=dual_line_resample support=1 semantic=1 anchor=state cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=argmax/1/k64/p0.95 freq_penalty=0/0/0 start_t=0 start_init=noise time_path=0.0000->1.0000 generated 12/16
46
+ [decode] temp=1.60 final=state rule=dual_line_resample support=1 semantic=1 anchor=state cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=argmax/1/k64/p0.95 freq_penalty=0/0/0 start_t=0 start_init=noise time_path=0.0000->1.0000 generated 16/16
47
+ [summary] {"type": "summary", "checkpoint": "runs/lta_owt_t5_adaln_adamw_wd0p1_rollin_p50_randk0_3_uniformt_temp1_synct_gbs512_2node8gpu_1m_t-20260518224737-tftgw/step_0450000.pt", "step": 450000, "decode": {"steps": 128, "model_t_mode": "post", "decode_time_schedule": "linear", "decode_s_min_frac": 0.0, "decode_s_max_frac": 0.25, "decode_force_final_t": true, "decode_time_grid": [0.0, 0.0078125, 0.015625, 0.0234375, 0.03125, 0.0390625, 0.046875, 0.0546875, 0.0625, 0.0703125, 0.078125, 0.0859375, 0.09375, 0.1015625, 0.109375, 0.1171875, 0.125, 0.1328125, 0.140625, 0.1484375, 0.15625, 0.1640625, 0.171875, 0.1796875, 0.1875, 0.1953125, 0.203125, 0.2109375, 0.21875, 0.2265625, 0.234375, 0.2421875, 0.25, 0.2578125, 0.265625, 0.2734375, 0.28125, 0.2890625, 0.296875, 0.3046875, 0.3125, 0.3203125, 0.328125, 0.3359375, 0.34375, 0.3515625, 0.359375, 0.3671875, 0.375, 0.3828125, 0.390625, 0.3984375, 0.40625, 0.4140625, 0.421875, 0.4296875, 0.4375, 0.4453125, 0.453125, 0.4609375, 0.46875, 0.4765625, 0.484375, 0.4921875, 0.5, 0.5078125, 0.515625, 0.5234375, 0.53125, 0.5390625, 0.546875, 0.5546875, 0.5625, 0.5703125, 0.578125, 0.5859375, 0.59375, 0.6015625, 0.609375, 0.6171875, 0.625, 0.6328125, 0.640625, 0.6484375, 0.65625, 0.6640625, 0.671875, 0.6796875, 0.6875, 0.6953125, 0.703125, 0.7109375, 0.71875, 0.7265625, 0.734375, 0.7421875, 0.75, 0.7578125, 0.765625, 0.7734375, 0.78125, 0.7890625, 0.796875, 0.8046875, 0.8125, 0.8203125, 0.828125, 0.8359375, 0.84375, 0.8515625, 0.859375, 0.8671875, 0.875, 0.8828125, 0.890625, 0.8984375, 0.90625, 0.9140625, 0.921875, 0.9296875, 0.9375, 0.9453125, 0.953125, 0.9609375, 0.96875, 0.9765625, 0.984375, 0.9921875, 1.0], "decode_rule": "dual_line_resample", "support_power": 1.0, "semantic_power": 1.0, "anchor_mode": "state", "cfg_scale": 0.0, "cfg_power": 1.0, "cfg_start": 0.0, "cfg_prior": "uniform", "decode_freq_penalty_alpha": 0.0, "decode_freq_penalty_beta": 0.0, "decode_freq_penalty_floor": 0.0, "decode_freq_penalty_start": 0.0, 
"decode_freq_penalty_end": 1.0, "decode_freq_penalty_power": 1.0, "start_t": 0.0, "start_init": "noise", "noise_init": "logistic_normal", "noise_sigma": -1.0, "dirichlet_concentration": 1.0, "concentration_min": 1.0, "concentration_max": 1024.0, "target_prob": 0.999, "endpoint_temp": 1.0, "final_from": "state", "final_sample_mode": "argmax", "final_sample_temp": 1.0, "final_top_k": 64, "final_top_p": 0.95, "final_freq_penalty_alpha": 0.0, "final_freq_penalty_beta": 0.0, "final_freq_penalty_floor": 0.0, "lock_bos": false, "n_samples": 16, "seed": 20260503}, "raw_genppl": {"ppl": 2.2326077476556088, "nll_per_token": 0.8031702958314813, "tokens": 8999, "kept_samples": 16, "total_samples": 16, "empty_rate": 0.0, "skipped_samples": 0}, "stripped_genppl": {"ppl": 2.2307195664959876, "nll_per_token": 0.8023242089822044, "tokens": 8998, "kept_samples": 16, "total_samples": 16, "empty_rate": 0.0, "skipped_samples": 0}, "diversity": {"sample_entropy": 1.0316432280294907, "unique_tokens": 41, "token_count": 16384, "distinct_1": 0.00250244140625, "distinct_2": 0.010202834799608993, "top_token_mass": 0.21685791015625}}
48
+ [summary] {"type": "summary", "checkpoint": "runs/lta_owt_t5_adaln_adamw_wd0p1_rollin_p50_randk0_3_uniformt_temp1_synct_gbs512_2node8gpu_1m_t-20260518224737-tftgw/step_0450000.pt", "step": 450000, "decode": {"steps": 128, "model_t_mode": "post", "decode_time_schedule": "linear", "decode_s_min_frac": 0.0, "decode_s_max_frac": 0.25, "decode_force_final_t": true, "decode_time_grid": [0.0, 0.0078125, 0.015625, 0.0234375, 0.03125, 0.0390625, 0.046875, 0.0546875, 0.0625, 0.0703125, 0.078125, 0.0859375, 0.09375, 0.1015625, 0.109375, 0.1171875, 0.125, 0.1328125, 0.140625, 0.1484375, 0.15625, 0.1640625, 0.171875, 0.1796875, 0.1875, 0.1953125, 0.203125, 0.2109375, 0.21875, 0.2265625, 0.234375, 0.2421875, 0.25, 0.2578125, 0.265625, 0.2734375, 0.28125, 0.2890625, 0.296875, 0.3046875, 0.3125, 0.3203125, 0.328125, 0.3359375, 0.34375, 0.3515625, 0.359375, 0.3671875, 0.375, 0.3828125, 0.390625, 0.3984375, 0.40625, 0.4140625, 0.421875, 0.4296875, 0.4375, 0.4453125, 0.453125, 0.4609375, 0.46875, 0.4765625, 0.484375, 0.4921875, 0.5, 0.5078125, 0.515625, 0.5234375, 0.53125, 0.5390625, 0.546875, 0.5546875, 0.5625, 0.5703125, 0.578125, 0.5859375, 0.59375, 0.6015625, 0.609375, 0.6171875, 0.625, 0.6328125, 0.640625, 0.6484375, 0.65625, 0.6640625, 0.671875, 0.6796875, 0.6875, 0.6953125, 0.703125, 0.7109375, 0.71875, 0.7265625, 0.734375, 0.7421875, 0.75, 0.7578125, 0.765625, 0.7734375, 0.78125, 0.7890625, 0.796875, 0.8046875, 0.8125, 0.8203125, 0.828125, 0.8359375, 0.84375, 0.8515625, 0.859375, 0.8671875, 0.875, 0.8828125, 0.890625, 0.8984375, 0.90625, 0.9140625, 0.921875, 0.9296875, 0.9375, 0.9453125, 0.953125, 0.9609375, 0.96875, 0.9765625, 0.984375, 0.9921875, 1.0], "decode_rule": "dual_line_resample", "support_power": 1.0, "semantic_power": 1.0, "anchor_mode": "state", "cfg_scale": 0.0, "cfg_power": 1.0, "cfg_start": 0.0, "cfg_prior": "uniform", "decode_freq_penalty_alpha": 0.0, "decode_freq_penalty_beta": 0.0, "decode_freq_penalty_floor": 0.0, "decode_freq_penalty_start": 0.0, 
"decode_freq_penalty_end": 1.0, "decode_freq_penalty_power": 1.0, "start_t": 0.0, "start_init": "noise", "noise_init": "logistic_normal", "noise_sigma": -1.0, "dirichlet_concentration": 1.0, "concentration_min": 1.0, "concentration_max": 1024.0, "target_prob": 0.999, "endpoint_temp": 1.3, "final_from": "state", "final_sample_mode": "argmax", "final_sample_temp": 1.0, "final_top_k": 64, "final_top_p": 0.95, "final_freq_penalty_alpha": 0.0, "final_freq_penalty_beta": 0.0, "final_freq_penalty_floor": 0.0, "lock_bos": false, "n_samples": 16, "seed": 20260503}, "raw_genppl": {"ppl": 7.064495313258387, "nll_per_token": 1.955081578801945, "tokens": 13572, "kept_samples": 16, "total_samples": 16, "empty_rate": 0.0, "skipped_samples": 0}, "stripped_genppl": {"ppl": 17.572733752302447, "nll_per_token": 2.866348482144905, "tokens": 5451, "kept_samples": 16, "total_samples": 16, "empty_rate": 0.0, "skipped_samples": 0}, "diversity": {"sample_entropy": 1.2441357657584844, "unique_tokens": 22, "token_count": 16384, "distinct_1": 0.0013427734375, "distinct_2": 0.007270283479960899, "top_token_mass": 0.6495361328125}}
49
+ [summary] {"type": "summary", "checkpoint": "runs/lta_owt_t5_adaln_adamw_wd0p1_rollin_p50_randk0_3_uniformt_temp1_synct_gbs512_2node8gpu_1m_t-20260518224737-tftgw/step_0450000.pt", "step": 450000, "decode": {"steps": 128, "model_t_mode": "post", "decode_time_schedule": "linear", "decode_s_min_frac": 0.0, "decode_s_max_frac": 0.25, "decode_force_final_t": true, "decode_time_grid": [0.0, 0.0078125, 0.015625, 0.0234375, 0.03125, 0.0390625, 0.046875, 0.0546875, 0.0625, 0.0703125, 0.078125, 0.0859375, 0.09375, 0.1015625, 0.109375, 0.1171875, 0.125, 0.1328125, 0.140625, 0.1484375, 0.15625, 0.1640625, 0.171875, 0.1796875, 0.1875, 0.1953125, 0.203125, 0.2109375, 0.21875, 0.2265625, 0.234375, 0.2421875, 0.25, 0.2578125, 0.265625, 0.2734375, 0.28125, 0.2890625, 0.296875, 0.3046875, 0.3125, 0.3203125, 0.328125, 0.3359375, 0.34375, 0.3515625, 0.359375, 0.3671875, 0.375, 0.3828125, 0.390625, 0.3984375, 0.40625, 0.4140625, 0.421875, 0.4296875, 0.4375, 0.4453125, 0.453125, 0.4609375, 0.46875, 0.4765625, 0.484375, 0.4921875, 0.5, 0.5078125, 0.515625, 0.5234375, 0.53125, 0.5390625, 0.546875, 0.5546875, 0.5625, 0.5703125, 0.578125, 0.5859375, 0.59375, 0.6015625, 0.609375, 0.6171875, 0.625, 0.6328125, 0.640625, 0.6484375, 0.65625, 0.6640625, 0.671875, 0.6796875, 0.6875, 0.6953125, 0.703125, 0.7109375, 0.71875, 0.7265625, 0.734375, 0.7421875, 0.75, 0.7578125, 0.765625, 0.7734375, 0.78125, 0.7890625, 0.796875, 0.8046875, 0.8125, 0.8203125, 0.828125, 0.8359375, 0.84375, 0.8515625, 0.859375, 0.8671875, 0.875, 0.8828125, 0.890625, 0.8984375, 0.90625, 0.9140625, 0.921875, 0.9296875, 0.9375, 0.9453125, 0.953125, 0.9609375, 0.96875, 0.9765625, 0.984375, 0.9921875, 1.0], "decode_rule": "dual_line_resample", "support_power": 1.0, "semantic_power": 1.0, "anchor_mode": "state", "cfg_scale": 0.0, "cfg_power": 1.0, "cfg_start": 0.0, "cfg_prior": "uniform", "decode_freq_penalty_alpha": 0.0, "decode_freq_penalty_beta": 0.0, "decode_freq_penalty_floor": 0.0, "decode_freq_penalty_start": 0.0, 
"decode_freq_penalty_end": 1.0, "decode_freq_penalty_power": 1.0, "start_t": 0.0, "start_init": "noise", "noise_init": "logistic_normal", "noise_sigma": -1.0, "dirichlet_concentration": 1.0, "concentration_min": 1.0, "concentration_max": 1024.0, "target_prob": 0.999, "endpoint_temp": 1.45, "final_from": "state", "final_sample_mode": "argmax", "final_sample_temp": 1.0, "final_top_k": 64, "final_top_p": 0.95, "final_freq_penalty_alpha": 0.0, "final_freq_penalty_beta": 0.0, "final_freq_penalty_floor": 0.0, "lock_bos": false, "n_samples": 16, "seed": 20260503}, "raw_genppl": {"ppl": 30.717215425545668, "nll_per_token": 3.4248232604713977, "tokens": 13513, "kept_samples": 16, "total_samples": 16, "empty_rate": 0.0, "skipped_samples": 0}, "stripped_genppl": {"ppl": 30.86467464857938, "nll_per_token": 3.429612314678342, "tokens": 11534, "kept_samples": 16, "total_samples": 16, "empty_rate": 0.0, "skipped_samples": 0}, "diversity": {"sample_entropy": 2.1715079167281877, "unique_tokens": 188, "token_count": 16384, "distinct_1": 0.011474609375, "distinct_2": 0.04233870967741935, "top_token_mass": 0.2110595703125}}
50
+ [summary] {"type": "summary", "checkpoint": "runs/lta_owt_t5_adaln_adamw_wd0p1_rollin_p50_randk0_3_uniformt_temp1_synct_gbs512_2node8gpu_1m_t-20260518224737-tftgw/step_0450000.pt", "step": 450000, "decode": {"steps": 128, "model_t_mode": "post", "decode_time_schedule": "linear", "decode_s_min_frac": 0.0, "decode_s_max_frac": 0.25, "decode_force_final_t": true, "decode_time_grid": [0.0, 0.0078125, 0.015625, 0.0234375, 0.03125, 0.0390625, 0.046875, 0.0546875, 0.0625, 0.0703125, 0.078125, 0.0859375, 0.09375, 0.1015625, 0.109375, 0.1171875, 0.125, 0.1328125, 0.140625, 0.1484375, 0.15625, 0.1640625, 0.171875, 0.1796875, 0.1875, 0.1953125, 0.203125, 0.2109375, 0.21875, 0.2265625, 0.234375, 0.2421875, 0.25, 0.2578125, 0.265625, 0.2734375, 0.28125, 0.2890625, 0.296875, 0.3046875, 0.3125, 0.3203125, 0.328125, 0.3359375, 0.34375, 0.3515625, 0.359375, 0.3671875, 0.375, 0.3828125, 0.390625, 0.3984375, 0.40625, 0.4140625, 0.421875, 0.4296875, 0.4375, 0.4453125, 0.453125, 0.4609375, 0.46875, 0.4765625, 0.484375, 0.4921875, 0.5, 0.5078125, 0.515625, 0.5234375, 0.53125, 0.5390625, 0.546875, 0.5546875, 0.5625, 0.5703125, 0.578125, 0.5859375, 0.59375, 0.6015625, 0.609375, 0.6171875, 0.625, 0.6328125, 0.640625, 0.6484375, 0.65625, 0.6640625, 0.671875, 0.6796875, 0.6875, 0.6953125, 0.703125, 0.7109375, 0.71875, 0.7265625, 0.734375, 0.7421875, 0.75, 0.7578125, 0.765625, 0.7734375, 0.78125, 0.7890625, 0.796875, 0.8046875, 0.8125, 0.8203125, 0.828125, 0.8359375, 0.84375, 0.8515625, 0.859375, 0.8671875, 0.875, 0.8828125, 0.890625, 0.8984375, 0.90625, 0.9140625, 0.921875, 0.9296875, 0.9375, 0.9453125, 0.953125, 0.9609375, 0.96875, 0.9765625, 0.984375, 0.9921875, 1.0], "decode_rule": "dual_line_resample", "support_power": 1.0, "semantic_power": 1.0, "anchor_mode": "state", "cfg_scale": 0.0, "cfg_power": 1.0, "cfg_start": 0.0, "cfg_prior": "uniform", "decode_freq_penalty_alpha": 0.0, "decode_freq_penalty_beta": 0.0, "decode_freq_penalty_floor": 0.0, "decode_freq_penalty_start": 0.0, 
"decode_freq_penalty_end": 1.0, "decode_freq_penalty_power": 1.0, "start_t": 0.0, "start_init": "noise", "noise_init": "logistic_normal", "noise_sigma": -1.0, "dirichlet_concentration": 1.0, "concentration_min": 1.0, "concentration_max": 1024.0, "target_prob": 0.999, "endpoint_temp": 1.6, "final_from": "state", "final_sample_mode": "argmax", "final_sample_temp": 1.0, "final_top_k": 64, "final_top_p": 0.95, "final_freq_penalty_alpha": 0.0, "final_freq_penalty_beta": 0.0, "final_freq_penalty_floor": 0.0, "lock_bos": false, "n_samples": 16, "seed": 20260503}, "raw_genppl": {"ppl": 197.5653078639758, "nll_per_token": 5.2860692024099905, "tokens": 14554, "kept_samples": 16, "total_samples": 16, "empty_rate": 0.0, "skipped_samples": 0}, "stripped_genppl": {"ppl": 232.80731535973388, "nll_per_token": 5.4502111387301335, "tokens": 12773, "kept_samples": 16, "total_samples": 16, "empty_rate": 0.0, "skipped_samples": 0}, "diversity": {"sample_entropy": 3.243602888805284, "unique_tokens": 2007, "token_count": 16384, "distinct_1": 0.12249755859375, "distinct_2": 0.31030058651026393, "top_token_mass": 0.17779541015625}}
51
+ [done] docs/lta_samples/metrics_20260522/owt_t5_rollin_p50_450k_dualline_state_steps128_n16/summary.jsonl
LTA_openwebtext_dualt/logs/lta_lm1b_classic_dirichlet_len512_gbs512_4gpu_10k_save1k_20260523.watch.pid ADDED
@@ -0,0 +1 @@
 
 
1
+ 993820
LTA_openwebtext_dualt/logs/lta_lm1b_dirichlet_categorical_fullvocab_c4p0_dualt_flmpack_onehot_hardce_ddit_small_len128_gbs512_8gpu_1m_nw0.log ADDED
The diff for this file is too large to render. See raw diff
 
LTA_openwebtext_dualt/logs/lta_lm1b_duo_aligned_dirichlet_true_dualtline_flmpack_onehot_hardce_ddit_small_len128_gbs512_4xh20_1m.log ADDED
The diff for this file is too large to render. See raw diff
 
LTA_openwebtext_dualt/logs/lta_owt_c1024_len1024_gaussianflm_gbs128_4gpu_2k_resume.log ADDED
@@ -0,0 +1,64 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ *****************************************
3
+ Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed.
4
+ *****************************************
5
+ NCCL version 2.25.1+cuda12.8
6
+ resumed_from=runs/lta_owt_c1024_len1024_gaussianflm_gbs128_4gpu_2k/step_0000500.pt start_step=501
7
+ {
8
+ "device": "cuda:0",
9
+ "rank": 0,
10
+ "world_size": 4,
11
+ "samples": "wrapped_stream_online_shuffle:1000",
12
+ "vocab_size": 50257,
13
+ "save_dir": "runs/lta_owt_c1024_len1024_gaussianflm_gbs128_4gpu_2k",
14
+ "batch_size": 16,
15
+ "grad_accum": 2,
16
+ "effective_batch_size": 128,
17
+ "global_batch_size": 128,
18
+ "lr_schedule": "constant_warmup",
19
+ "warmup_steps": 100,
20
+ "adam_beta1": 0.9,
21
+ "adam_beta2": 0.999,
22
+ "adam_eps": 1e-08,
23
+ "model_type": "ddit",
24
+ "dual_t": true,
25
+ "corrupt_t_mode": "same",
26
+ "corrupt_min_t": 0.0,
27
+ "corrupt_max_t": 1.0,
28
+ "dirichlet_endpoint_mode": "categorical_dual_t",
29
+ "dirichlet_semantic_t_mode": "same",
30
+ "dirichlet_semantic_t_value": 0.0,
31
+ "categorical_wrong_from_full_vocab": true,
32
+ "simplex_bridge_sampler": "dirichlet",
33
+ "logistic_normal_sigma_min": 0.18,
34
+ "logistic_normal_sigma_max": 2.2,
35
+ "logistic_normal_tau_min": 0.65,
36
+ "logistic_normal_tau_max": 1.15,
37
+ "torch_compile": false,
38
+ "compile_mode": "max-autotune",
39
+ "state_format": "prob",
40
+ "target_loss": "hard_ce",
41
+ "meanflow_weight": 0.0,
42
+ "bridge_noise_init": "logistic_normal",
43
+ "noise_sigma": -1.0,
44
+ "wrap": true,
45
+ "wrap_mode": "stream",
46
+ "wrap_record_buffer_size": 200,
47
+ "owt_cached_chunks": false,
48
+ "owt_chunk_cache_dir": "",
49
+ "owt_chunk_cache_rebuild": false,
50
+ "owt_chunk_cache_write_batch": 4096,
51
+ "online_chunk_shuffle": true,
52
+ "online_chunk_shuffle_buffer": 1000,
53
+ "openwebtext_split": "train_minus_100k",
54
+ "detokenizer": "auto",
55
+ "resolved_detokenizer": null,
56
+ "num_workers": 0,
57
+ "latest_every": 500,
58
+ "resume_path": "runs/lta_owt_c1024_len1024_gaussianflm_gbs128_4gpu_2k/step_0000500.pt"
59
+ }
60
+ step=600 micro_steps=1200 elapsed=81.0s lr=3.000000e-04 loss_all=7.5269 acc_all=0.0389 loss_corrupt=7.5269 acc_corrupt=0.0389 corrupt_frac=1.0000 loss=7.5269 loss_recon=7.5269 loss_meanflow=0.0000 mean_model_t=0.4969 mean_corrupt_t=0.4969 mean_loss_t_weight=1.0000 prior_center_loss_beta=0.0000 wrong_frac=0.0000 init_acc_corrupt=0.2011 init_gold_top10=0.2329 init_gold_top100=0.2787
61
+ step=700 micro_steps=1400 elapsed=77.3s lr=3.000000e-04 loss_all=7.4719 acc_all=0.0440 loss_corrupt=7.4719 acc_corrupt=0.0440 corrupt_frac=1.0000 loss=7.4719 loss_recon=7.4719 loss_meanflow=0.0000 mean_model_t=0.4972 mean_corrupt_t=0.4972 mean_loss_t_weight=1.0000 prior_center_loss_beta=0.0000 wrong_frac=0.0000 init_acc_corrupt=0.1970 init_gold_top10=0.2301 init_gold_top100=0.2770
62
+ step=800 micro_steps=1600 elapsed=77.9s lr=3.000000e-04 loss_all=7.4278 acc_all=0.0551 loss_corrupt=7.4278 acc_corrupt=0.0551 corrupt_frac=1.0000 loss=7.4278 loss_recon=7.4278 loss_meanflow=0.0000 mean_model_t=0.4958 mean_corrupt_t=0.4958 mean_loss_t_weight=1.0000 prior_center_loss_beta=0.0000 wrong_frac=0.0000 init_acc_corrupt=0.1973 init_gold_top10=0.2295 init_gold_top100=0.2756
63
+ step=900 micro_steps=1800 elapsed=77.6s lr=3.000000e-04 loss_all=7.3457 acc_all=0.0738 loss_corrupt=7.3457 acc_corrupt=0.0738 corrupt_frac=1.0000 loss=7.3457 loss_recon=7.3457 loss_meanflow=0.0000 mean_model_t=0.5016 mean_corrupt_t=0.5016 mean_loss_t_weight=1.0000 prior_center_loss_beta=0.0000 wrong_frac=0.0000 init_acc_corrupt=0.2068 init_gold_top10=0.2383 init_gold_top100=0.2841
64
+ step=1000 micro_steps=2000 elapsed=77.5s lr=3.000000e-04 loss_all=7.3143 acc_all=0.0879 loss_corrupt=7.3143 acc_corrupt=0.0879 corrupt_frac=1.0000 loss=7.3143 loss_recon=7.3143 loss_meanflow=0.0000 mean_model_t=0.5041 mean_corrupt_t=0.5041 mean_loss_t_weight=1.0000 prior_center_loss_beta=0.0000 wrong_frac=0.0000 init_acc_corrupt=0.1991 init_gold_top10=0.2344 init_gold_top100=0.2851
LTA_openwebtext_dualt/logs/lta_owt_gpt2cached_len1024_ddit768x12_elfopt_only_muon_ema_gbs512_8gpu_1m_20260513_025959.log ADDED
The diff for this file is too large to render. See raw diff
 
LTA_openwebtext_dualt/logs/lta_owt_gpt2cached_len1024_rollout1_p1_smoke4gpu_nofind_20260513_144753.log ADDED
@@ -0,0 +1,225 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ NCCL version 2.25.1+cuda12.8
2
+ {
3
+ "device": "cuda:0",
4
+ "rank": 0,
5
+ "world_size": 4,
6
+ "samples": "owt_cached_chunks:8734897",
7
+ "vocab_size": 50257,
8
+ "tokenizer_vocab_size": 50257,
9
+ "save_dir": "runs/lta_owt_gpt2cached_len1024_rollout1_p1_smoke4gpu_nofind_20260513_144753",
10
+ "batch_size": 2,
11
+ "grad_accum": 1,
12
+ "effective_batch_size": 8,
13
+ "global_batch_size": 8,
14
+ "lr_schedule": "cosine",
15
+ "optimizer": "adamw",
16
+ "warmup_steps": 1,
17
+ "min_lr": 6e-05,
18
+ "weight_decay": 0.1,
19
+ "adamw_param_groups": "nanogpt",
20
+ "adam_beta1": 0.9,
21
+ "adam_beta2": 0.95,
22
+ "adam_eps": 1e-08,
23
+ "muon_momentum": 0.95,
24
+ "muon_ns_steps": 5,
25
+ "muon_update_scale": 1.0,
26
+ "ema_decay": 0.0,
27
+ "ema_start_step": 0,
28
+ "model_type": "ddit",
29
+ "dual_t": true,
30
+ "corrupt_t_mode": "same",
31
+ "corrupt_min_t": 0.0,
32
+ "corrupt_max_t": 1.0,
33
+ "prefix_block_prob": 0.0,
34
+ "prefix_block_len": 128,
35
+ "dirichlet_endpoint_mode": "categorical_dual_t",
36
+ "dirichlet_semantic_t_mode": "same",
37
+ "dirichlet_semantic_t_value": 0.0,
38
+ "categorical_wrong_from_full_vocab": true,
39
+ "categorical_wrong_from_batch_valid_tokens": false,
40
+ "mask_mixture_original_prob": 0.0,
41
+ "mask_mixture_lowk_prob": 0.0,
42
+ "mask_mixture_lowcorrupt_prob": 0.0,
43
+ "mask_mixture_block_prob": 0.0,
44
+ "mask_mixture_all_prob": 0.0,
45
+ "mask_mixture_lowk_clean_tokens": "1,2,4,8,16,32,64",
46
+ "mask_mixture_lowcorrupt_tokens": "1,2,4,8,16,32,64",
47
+ "mask_mixture_block_tokens": "64,128",
48
+ "simplex_bridge_sampler": "dirichlet",
49
+ "logistic_normal_sigma_min": 0.18,
50
+ "logistic_normal_sigma_max": 2.2,
51
+ "logistic_normal_tau_min": 0.65,
52
+ "logistic_normal_tau_max": 1.15,
53
+ "torch_compile": false,
54
+ "compile_mode": "max-autotune",
55
+ "state_format": "prob",
56
+ "target_loss": "hard_ce",
57
+ "meanflow_weight": 0.0,
58
+ "rollout_train_prob": 1.0,
59
+ "rollout_train_steps": 1,
60
+ "rollout_train_infer_steps": 64,
61
+ "rollout_train_temp": 1.45,
62
+ "rollout_train_max_gamma": 1.0,
63
+ "rollout_train_corrupt_only": true,
64
+ "bridge_noise_init": "logistic_normal",
65
+ "noise_sigma": -1.0,
66
+ "allow_tf32": true,
67
+ "activation_checkpointing": false,
68
+ "activation_checkpoint_interval": 1,
69
+ "ddp_static_graph": false,
70
+ "ddp_gradient_as_bucket_view": true,
71
+ "blocking_data_transfer": false,
72
+ "dataloader_prefetch_factor": 2,
73
+ "full_train_stats": false,
74
+ "record_pad_truncate": false,
75
+ "record_add_eos": false,
76
+ "record_add_special_tokens": false,
77
+ "record_pad_token": "pad",
78
+ "record_shuffle_buffer": 10000,
79
+ "wrap": true,
80
+ "wrap_mode": "stream",
81
+ "wrap_record_buffer_size": 200,
82
+ "owt_cached_chunks": true,
83
+ "owt_chunk_cache_dir": "/e2e-data/evad-tech-vla/wanghan58/data/small_benchmarks/langflow_2604_11748/openwebtext_lta_cached_chunks/gpt2_len1024_train_minus_100k",
84
+ "owt_chunk_cache_rebuild": false,
85
+ "owt_chunk_cache_write_batch": 4096,
86
+ "owt_exact_repeat_per_chunk": 0,
87
+ "online_chunk_shuffle": false,
88
+ "online_chunk_shuffle_buffer": 10000,
89
+ "openwebtext_split": "train_minus_100k",
90
+ "detokenizer": "auto",
91
+ "resolved_detokenizer": null,
92
+ "num_workers": 0,
93
+ "latest_every": 100000,
94
+ "resume_path": ""
95
+ }
96
+ [rank1]: Traceback (most recent call last):
97
+ [rank1]: File "/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/train.py", line 1488, in <module>
98
+ [rank1]: main()
99
+ [rank1]: File "/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/train.py", line 1373, in main
100
+ [rank1]: logits = trainable_model(loss_state, model_t, bridge.attn_mask)
101
+ [rank1]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
102
+ [rank1]: File "/usr/local/lib/python3.12/dist-packages/torch/nn/modules/module.py", line 1739, in _wrapped_call_impl
103
+ [rank1]: return self._call_impl(*args, **kwargs)
104
+ [rank1]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
105
+ [rank1]: File "/usr/local/lib/python3.12/dist-packages/torch/nn/modules/module.py", line 1750, in _call_impl
106
+ [rank1]: return forward_call(*args, **kwargs)
107
+ [rank1]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
108
+ [rank1]: File "/usr/local/lib/python3.12/dist-packages/torch/nn/parallel/distributed.py", line 1650, in forward
109
+ [rank1]: inputs, kwargs = self._pre_forward(*inputs, **kwargs)
110
+ [rank1]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
111
+ [rank1]: File "/usr/local/lib/python3.12/dist-packages/torch/nn/parallel/distributed.py", line 1539, in _pre_forward
112
+ [rank1]: if torch.is_grad_enabled() and self.reducer._rebuild_buckets():
113
+ [rank1]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
114
+ [rank1]: RuntimeError: Expected to have finished reduction in the prior iteration before starting a new one. This error indicates that your module has parameters that were not used in producing loss. You can enable unused parameter detection by passing the keyword argument `find_unused_parameters=True` to `torch.nn.parallel.DistributedDataParallel`, and by
115
+ [rank1]: making sure all `forward` function outputs participate in calculating loss.
116
+ [rank1]: If you already have done the above, then the distributed data parallel module wasn't able to locate the output tensors in the return value of your module's `forward` function. Please include the loss function and the structure of the return value of `forward` of your module when reporting this issue (e.g. list, dict, iterable).
117
+ [rank1]: Parameter indices which did not receive grad for rank 1: 0 1 2 3 4 6 7 9 10 11 12 13 14 16 17 19 20 21 22 23 24 26 27 29 30 31 32 33 34 36 37 39 40 41 42 43 44 46 47 49 50 51 52 53 54 56 57 59 60 61 62 63 64 66 67 69 70 71 72 73 74 76 77 79 80 81 82 83 84 86 87 89 90 91 92 93 94 96 97 99 100 101 102 103 104 106 107 109 110 111 112 113 114 116 117 119 120 121 122 123 ...
118
+ [rank1]: In addition, you can set the environment variable TORCH_DISTRIBUTED_DEBUG to either INFO or DETAIL to print out information about which particular parameters did not receive gradient on this rank as part of this error
119
+ step=1 micro_steps=1 elapsed=1.1s lr=6.000000e-04 acc_all=0.0005 acc_corrupt=0.0011 corrupt_frac=0.4424 loss_all=10.8125 loss_corrupt=10.8125 acc_corrupt_t_0p0_0p2=0.0015 corrupt_frac_t_0p0_0p2=0.7163 acc_corrupt_t_0p8_1p0=0.0000 corrupt_frac_t_0p8_1p0=0.2837 loss=10.8125 loss_recon=10.8125 loss_meanflow=0.0000 mean_model_t=0.5059 mean_corrupt_t=0.5059 mean_loss_t_weight=1.0000 prior_center_loss_beta=0.0000 rollout_train_applied=1.0000 wrong_frac=0.6821 init_acc_corrupt=0.2815 init_gold_top10=0.3013 init_gold_top100=0.3918
120
+ [rank0]: Traceback (most recent call last):
121
+ [rank0]: File "/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/train.py", line 1488, in <module>
122
+ [rank0]: main()
123
+ [rank0]: File "/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/train.py", line 1373, in main
124
+ [rank0]: logits = trainable_model(loss_state, model_t, bridge.attn_mask)
125
+ [rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
126
+ [rank0]: File "/usr/local/lib/python3.12/dist-packages/torch/nn/modules/module.py", line 1739, in _wrapped_call_impl
127
+ [rank0]: return self._call_impl(*args, **kwargs)
128
+ [rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
129
+ [rank0]: File "/usr/local/lib/python3.12/dist-packages/torch/nn/modules/module.py", line 1750, in _call_impl
130
+ [rank0]: return forward_call(*args, **kwargs)
131
+ [rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
132
+ [rank0]: File "/usr/local/lib/python3.12/dist-packages/torch/nn/parallel/distributed.py", line 1650, in forward
133
+ [rank0]: inputs, kwargs = self._pre_forward(*inputs, **kwargs)
134
+ [rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
135
+ [rank0]: File "/usr/local/lib/python3.12/dist-packages/torch/nn/parallel/distributed.py", line 1539, in _pre_forward
136
+ [rank0]: if torch.is_grad_enabled() and self.reducer._rebuild_buckets():
137
+ [rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
138
+ [rank0]: RuntimeError: Expected to have finished reduction in the prior iteration before starting a new one. This error indicates that your module has parameters that were not used in producing loss. You can enable unused parameter detection by passing the keyword argument `find_unused_parameters=True` to `torch.nn.parallel.DistributedDataParallel`, and by
139
+ [rank0]: making sure all `forward` function outputs participate in calculating loss.
140
+ [rank0]: If you already have done the above, then the distributed data parallel module wasn't able to locate the output tensors in the return value of your module's `forward` function. Please include the loss function and the structure of the return value of `forward` of your module when reporting this issue (e.g. list, dict, iterable).
141
+ [rank0]: Parameter indices which did not receive grad for rank 0: 0 1 2 3 4 6 7 9 10 11 12 13 14 16 17 19 20 21 22 23 24 26 27 29 30 31 32 33 34 36 37 39 40 41 42 43 44 46 47 49 50 51 52 53 54 56 57 59 60 61 62 63 64 66 67 69 70 71 72 73 74 76 77 79 80 81 82 83 84 86 87 89 90 91 92 93 94 96 97 99 100 101 102 103 104 106 107 109 110 111 112 113 114 116 117 119 120 121 122 123 ...
142
+ [rank0]: In addition, you can set the environment variable TORCH_DISTRIBUTED_DEBUG to either INFO or DETAIL to print out information about which particular parameters did not receive gradient on this rank as part of this error
143
+ [rank3]: Traceback (most recent call last):
144
+ [rank3]: File "/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/train.py", line 1488, in <module>
145
+ [rank3]: main()
146
+ [rank3]: File "/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/train.py", line 1373, in main
147
+ [rank3]: logits = trainable_model(loss_state, model_t, bridge.attn_mask)
148
+ [rank3]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
149
+ [rank3]: File "/usr/local/lib/python3.12/dist-packages/torch/nn/modules/module.py", line 1739, in _wrapped_call_impl
150
+ [rank3]: return self._call_impl(*args, **kwargs)
151
+ [rank3]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
152
+ [rank3]: File "/usr/local/lib/python3.12/dist-packages/torch/nn/modules/module.py", line 1750, in _call_impl
153
+ [rank3]: return forward_call(*args, **kwargs)
154
+ [rank3]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
155
+ [rank3]: File "/usr/local/lib/python3.12/dist-packages/torch/nn/parallel/distributed.py", line 1650, in forward
156
+ [rank3]: inputs, kwargs = self._pre_forward(*inputs, **kwargs)
157
+ [rank3]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
158
+ [rank3]: File "/usr/local/lib/python3.12/dist-packages/torch/nn/parallel/distributed.py", line 1539, in _pre_forward
159
+ [rank3]: if torch.is_grad_enabled() and self.reducer._rebuild_buckets():
160
+ [rank3]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
161
+ [rank3]: RuntimeError: Expected to have finished reduction in the prior iteration before starting a new one. This error indicates that your module has parameters that were not used in producing loss. You can enable unused parameter detection by passing the keyword argument `find_unused_parameters=True` to `torch.nn.parallel.DistributedDataParallel`, and by
162
+ [rank3]: making sure all `forward` function outputs participate in calculating loss.
163
+ [rank3]: If you already have done the above, then the distributed data parallel module wasn't able to locate the output tensors in the return value of your module's `forward` function. Please include the loss function and the structure of the return value of `forward` of your module when reporting this issue (e.g. list, dict, iterable).
164
+ [rank3]: Parameter indices which did not receive grad for rank 3: 0 1 2 3 4 6 7 9 10 11 12 13 14 16 17 19 20 21 22 23 24 26 27 29 30 31 32 33 34 36 37 39 40 41 42 43 44 46 47 49 50 51 52 53 54 56 57 59 60 61 62 63 64 66 67 69 70 71 72 73 74 76 77 79 80 81 82 83 84 86 87 89 90 91 92 93 94 96 97 99 100 101 102 103 104 106 107 109 110 111 112 113 114 116 117 119 120 121 122 123 ...
165
+ [rank3]: In addition, you can set the environment variable TORCH_DISTRIBUTED_DEBUG to either INFO or DETAIL to print out information about which particular parameters did not receive gradient on this rank as part of this error
166
+ [rank2]: Traceback (most recent call last):
167
+ [rank2]: File "/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/train.py", line 1488, in <module>
168
+ [rank2]: main()
169
+ [rank2]: File "/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/train.py", line 1373, in main
170
+ [rank2]: logits = trainable_model(loss_state, model_t, bridge.attn_mask)
171
+ [rank2]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
172
+ [rank2]: File "/usr/local/lib/python3.12/dist-packages/torch/nn/modules/module.py", line 1739, in _wrapped_call_impl
173
+ [rank2]: return self._call_impl(*args, **kwargs)
174
+ [rank2]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
175
+ [rank2]: File "/usr/local/lib/python3.12/dist-packages/torch/nn/modules/module.py", line 1750, in _call_impl
176
+ [rank2]: return forward_call(*args, **kwargs)
177
+ [rank2]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
178
+ [rank2]: File "/usr/local/lib/python3.12/dist-packages/torch/nn/parallel/distributed.py", line 1650, in forward
179
+ [rank2]: inputs, kwargs = self._pre_forward(*inputs, **kwargs)
180
+ [rank2]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
181
+ [rank2]: File "/usr/local/lib/python3.12/dist-packages/torch/nn/parallel/distributed.py", line 1539, in _pre_forward
182
+ [rank2]: if torch.is_grad_enabled() and self.reducer._rebuild_buckets():
183
+ [rank2]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
184
+ [rank2]: RuntimeError: Expected to have finished reduction in the prior iteration before starting a new one. This error indicates that your module has parameters that were not used in producing loss. You can enable unused parameter detection by passing the keyword argument `find_unused_parameters=True` to `torch.nn.parallel.DistributedDataParallel`, and by
185
+ [rank2]: making sure all `forward` function outputs participate in calculating loss.
186
+ [rank2]: If you already have done the above, then the distributed data parallel module wasn't able to locate the output tensors in the return value of your module's `forward` function. Please include the loss function and the structure of the return value of `forward` of your module when reporting this issue (e.g. list, dict, iterable).
187
+ [rank2]: Parameter indices which did not receive grad for rank 2: 0 1 2 3 4 6 7 9 10 11 12 13 14 16 17 19 20 21 22 23 24 26 27 29 30 31 32 33 34 36 37 39 40 41 42 43 44 46 47 49 50 51 52 53 54 56 57 59 60 61 62 63 64 66 67 69 70 71 72 73 74 76 77 79 80 81 82 83 84 86 87 89 90 91 92 93 94 96 97 99 100 101 102 103 104 106 107 109 110 111 112 113 114 116 117 119 120 121 122 123 ...
188
+ [rank2]: In addition, you can set the environment variable TORCH_DISTRIBUTED_DEBUG to either INFO or DETAIL to print out information about which particular parameters did not receive gradient on this rank as part of this error
189
+ [rank0]:[W513 14:48:01.833475754 ProcessGroupNCCL.cpp:1487] Warning: WARNING: destroy_process_group() was not called before program exit, which can leak resources. For more info, please see https://pytorch.org/docs/stable/distributed.html#shutdown (function operator())
190
+ W0513 14:48:01.550000 624985 torch/distributed/elastic/multiprocessing/api.py:898] Sending process 624989 closing signal SIGTERM
191
+ W0513 14:48:01.550000 624985 torch/distributed/elastic/multiprocessing/api.py:898] Sending process 624991 closing signal SIGTERM
192
+ E0513 14:48:01.779000 624985 torch/distributed/elastic/multiprocessing/api.py:870] failed (exitcode: 1) local_rank: 1 (pid: 624990) of binary: /usr/bin/python
193
+ Traceback (most recent call last):
194
+ File "<frozen runpy>", line 198, in _run_module_as_main
195
+ File "<frozen runpy>", line 88, in _run_code
196
+ File "/usr/local/lib/python3.12/dist-packages/torch/distributed/run.py", line 922, in <module>
197
+ main()
198
+ File "/usr/local/lib/python3.12/dist-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 355, in wrapper
199
+ return f(*args, **kwargs)
200
+ ^^^^^^^^^^^^^^^^^^
201
+ File "/usr/local/lib/python3.12/dist-packages/torch/distributed/run.py", line 918, in main
202
+ run(args)
203
+ File "/usr/local/lib/python3.12/dist-packages/torch/distributed/run.py", line 909, in run
204
+ elastic_launch(
205
+ File "/usr/local/lib/python3.12/dist-packages/torch/distributed/launcher/api.py", line 139, in __call__
206
+ return launch_agent(self._config, self._entrypoint, list(args))
207
+ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
208
+ File "/usr/local/lib/python3.12/dist-packages/torch/distributed/launcher/api.py", line 270, in launch_agent
209
+ raise ChildFailedError(
210
+ torch.distributed.elastic.multiprocessing.errors.ChildFailedError:
211
+ ============================================================
212
+ train.py FAILED
213
+ ------------------------------------------------------------
214
+ Failures:
215
+ <NO_OTHER_FAILURES>
216
+ ------------------------------------------------------------
217
+ Root Cause (first observed failure):
218
+ [0]:
219
+ time : 2026-05-13_14:48:01
220
+ host : localhost
221
+ rank : 1 (local_rank: 1)
222
+ exitcode : 1 (pid: 624990)
223
+ error_file: <N/A>
224
+ traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
225
+ ============================================================
LTA_openwebtext_dualt/logs/lta_owt_t5elf_absrope_time4_dirichlet_len1025_C1_to_1024_prebos_mask1_sameT_gbs512_b32_8gpu_1m_save10k_20260526_watcher.pid ADDED
@@ -0,0 +1 @@
 
 
1
+ 10250
LTA_openwebtext_dualt/logs/lta_owt_t5elf_dirichlet_len1024_Cv_to_2v_mask1_gbs512_b32_4gpu_20k_save1k_20260525.log ADDED
@@ -0,0 +1,197 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ *****************************************
3
+ Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed.
4
+ *****************************************
5
+ NCCL version 2.25.1+cuda12.8
6
+ {
7
+ "device": "cuda:0",
8
+ "rank": 0,
9
+ "world_size": 4,
10
+ "samples": "tokenized_hf:9737184:pad=0",
11
+ "vocab_size": 32100,
12
+ "tokenizer_vocab_size": 32100,
13
+ "save_dir": "runs/lta_owt_t5elf_dirichlet_len1024_Cv_to_2v_mask1_gbs512_b32_4gpu_20k_save1k_20260525",
14
+ "max_len": 1024,
15
+ "effective_model_max_len": 1024,
16
+ "batch_size": 32,
17
+ "grad_accum": 4,
18
+ "effective_batch_size": 512,
19
+ "global_batch_size": 512,
20
+ "lr_schedule": "constant_warmup",
21
+ "optimizer": "adamw",
22
+ "epochs": 0.0,
23
+ "steps_per_epoch": 19018,
24
+ "total_steps": 20000,
25
+ "warmup_steps": 2500,
26
+ "warmup_epochs": -1.0,
27
+ "min_lr": 6e-05,
28
+ "weight_decay": 0.0,
29
+ "output_weight_decay": -1.0,
30
+ "adamw_param_groups": "nanogpt",
31
+ "adam_beta1": 0.9,
32
+ "adam_beta2": 0.999,
33
+ "adam_eps": 1e-08,
34
+ "muon_impl": "legacy",
35
+ "muon_momentum": 0.95,
36
+ "muon_ns_steps": 5,
37
+ "muon_update_scale": 1.0,
38
+ "muon_nesterov": false,
39
+ "muon_width_scale": false,
40
+ "muon_grouping": "",
41
+ "muon_param_count": 0,
42
+ "muon_adam_param_count": 0,
43
+ "muon_param_names": [],
44
+ "muon_adam_param_names": [],
45
+ "muon_effective_nesterov": false,
46
+ "muon_effective_width_scale": false,
47
+ "muon_effective_weight_decay": 0.0,
48
+ "muon_adam_fallback_nesterov": false,
49
+ "muon_adam_fallback_weight_decay": 0.0,
50
+ "ema_decay": 0.0,
51
+ "ema_start_step": 0,
52
+ "model_type": "ddit",
53
+ "ddit_mlp_type": "gelu",
54
+ "block_anchor_every": 0,
55
+ "block_anchor_init_std": 0.02,
56
+ "bos_anchor_every": 0,
57
+ "bos_anchor_token_id": -1,
58
+ "bos_anchor_extra_len": 0,
59
+ "abs_pos_embed": false,
60
+ "abs_pos_init_std": 0.02,
61
+ "elf_num_time_tokens": 4,
62
+ "elf_num_model_mode_tokens": 0,
63
+ "qk_norm": true,
64
+ "output_bias": false,
65
+ "output_init_std": -1.0,
66
+ "norm_type": "rmsnorm",
67
+ "target_loss": "hard_ce",
68
+ "linear_soft_target_power": 1.0,
69
+ "linear_soft_target_min_conf": 0.0,
70
+ "linear_soft_target_max_conf": 1.0,
71
+ "t_sampling_mode": "uniform",
72
+ "t_sampling_power": 1.0,
73
+ "t_sampling_eps": 0.0001,
74
+ "t_sampling_logit_mean": -1.5,
75
+ "t_sampling_logit_std": 0.8,
76
+ "t_sampling_gumbel_loc": 2.2,
77
+ "t_sampling_gumbel_scale": 0.8,
78
+ "dual_t": true,
79
+ "corrupt_t_mode": "independent",
80
+ "corrupt_min_t": 0.0,
81
+ "corrupt_max_t": 1.0,
82
+ "prefix_block_prob": 0.0,
83
+ "prefix_block_len": 128,
84
+ "block_ar_two_stream": false,
85
+ "block_ar_block_len": 128,
86
+ "mask_ratio_floor_schedule": "none",
87
+ "dirichlet_endpoint_mode": "categorical_dual_t",
88
+ "dirichlet_semantic_t_mode": "same",
89
+ "dirichlet_semantic_t_value": 0.0,
90
+ "dirichlet_semantic_t_curve": "linear",
91
+ "dirichlet_semantic_t_power": 1.0,
92
+ "dirichlet_support_t_curve": "linear",
93
+ "dirichlet_support_t_power": 1.0,
94
+ "endpoint_sequence_random_prob_alpha": 0.0,
95
+ "categorical_wrong_from_full_vocab": true,
96
+ "categorical_wrong_from_batch_valid_tokens": false,
97
+ "categorical_wrong_basin_token_ids": "",
98
+ "categorical_wrong_basin_prob": 0.0,
99
+ "categorical_wrong_unigram_prob": 0.0,
100
+ "categorical_wrong_uniform_prob": 0.0,
101
+ "categorical_wrong_prob_floor": 0.0,
102
+ "categorical_gold_prob_floor": 0.0,
103
+ "categorical_gold_prob_ceil": 1.0,
104
+ "categorical_wrong_corpus_unigram_path": "",
105
+ "categorical_wrong_corpus_unigram_alpha": 1.0,
106
+ "categorical_wrong_basin_shared_prob": 0.0,
107
+ "categorical_wrong_unigram_shared_prob": 0.0,
108
+ "mask_mixture_original_prob": 0.0,
109
+ "mask_mixture_lowk_prob": 0.0,
110
+ "mask_mixture_lowcorrupt_prob": 0.0,
111
+ "mask_mixture_block_prob": 0.0,
112
+ "mask_mixture_all_prob": 0.0,
113
+ "mask_mixture_lowk_clean_tokens": "1,2,4,8,16,32,64",
114
+ "mask_mixture_lowcorrupt_tokens": "1,2,4,8,16,32,64",
115
+ "mask_mixture_block_tokens": "64,128",
116
+ "simplex_bridge_sampler": "dirichlet",
117
+ "logistic_normal_sigma_min": 0.18,
118
+ "logistic_normal_sigma_max": 2.2,
119
+ "logistic_normal_tau_min": 0.65,
120
+ "logistic_normal_tau_max": 1.15,
121
+ "torch_compile": false,
122
+ "compile_mode": "max-autotune",
123
+ "state_format": "prob",
124
+ "meanflow_weight": 0.0,
125
+ "rollout_train_prob": 0.0,
126
+ "rollout_train_steps": 1,
127
+ "rollout_train_steps_min": -1,
128
+ "rollout_train_infer_steps": 64,
129
+ "rollout_train_time_mode": "fixed_steps",
130
+ "rollout_train_s_dist": "uniform",
131
+ "rollout_train_s_min_frac": 0.0,
132
+ "rollout_train_s_max_frac": 0.125,
133
+ "rollout_train_s_beta_alpha": 2.0,
134
+ "rollout_train_s_beta_beta": 6.0,
135
+ "rollout_train_temp": 1.0,
136
+ "rollout_train_max_gamma": 1.0,
137
+ "rollout_train_rule": "flowmap",
138
+ "rollout_train_corrupt_only": true,
139
+ "rollout_train_samplewise": false,
140
+ "rollout_train_compute_always": false,
141
+ "rollout_train_keep_grad": false,
142
+ "rollout_train_sync_t": false,
143
+ "rollout_train_state_mix_mode": "final",
144
+ "rollout_train_state_mix_alpha": 0.5,
145
+ "bridge_noise_init": "logistic_normal",
146
+ "noise_sigma": -1.0,
147
+ "allow_tf32": true,
148
+ "activation_checkpointing": false,
149
+ "activation_checkpoint_interval": 1,
150
+ "activation_checkpoint_scope": "block",
151
+ "ddp_static_graph": false,
152
+ "ddp_gradient_as_bucket_view": true,
153
+ "blocking_data_transfer": false,
154
+ "dataloader_prefetch_factor": 2,
155
+ "full_train_stats": false,
156
+ "tokenized_hf": true,
157
+ "tokenized_pad_token": "pad",
158
+ "elf_conditional_hf": false,
159
+ "record_pad_truncate": false,
160
+ "record_add_eos": false,
161
+ "record_add_special_tokens": false,
162
+ "record_pad_token": "pad",
163
+ "record_shuffle_buffer": 10000,
164
+ "wrap": false,
165
+ "wrap_mode": "stream",
166
+ "wrap_record_buffer_size": 200,
167
+ "owt_cached_chunks": false,
168
+ "owt_chunk_cache_dir": "",
169
+ "owt_chunk_cache_rebuild": false,
170
+ "owt_chunk_cache_write_batch": 4096,
171
+ "owt_exact_repeat_per_chunk": 0,
172
+ "online_chunk_shuffle": false,
173
+ "online_chunk_shuffle_buffer": 10000,
174
+ "openwebtext_split": "all",
175
+ "detokenizer": "auto",
176
+ "resolved_detokenizer": null,
177
+ "num_workers": 8,
178
+ "latest_every": 1000,
179
+ "resume_path": ""
180
+ }
181
+ step=100 epoch=1/2 epoch_step=100/19018 micro_steps=400 elapsed=200.7s lr=1.212000e-05 loss=10.1985 loss_recon=10.1985 loss_meanflow=0.0000 mean_model_t=0.5004 mean_corrupt_t=0.4966 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.3703 corrupt_frac=1.0000 acc_corrupt=0.3703 loss_corrupt=10.1985 wrong_frac=0.5036 init_acc_corrupt=0.4964 acc_corrupt_t_0p0_0p2=0.0746 corrupt_frac_t_0p0_0p2=0.2026 acc_corrupt_t_0p2_0p4=0.1967 corrupt_frac_t_0p2_0p4=0.2058 acc_corrupt_t_0p4_0p6=0.3447 corrupt_frac_t_0p4_0p6=0.1937 acc_corrupt_t_0p6_0p8=0.5246 corrupt_frac_t_0p6_0p8=0.2019 acc_corrupt_t_0p8_1p0=0.7248 corrupt_frac_t_0p8_1p0=0.1960 out_w_norm=1.0610 out_g_norm=1.1618 loss_all=9.7534 init_gold_top10=0.4680 init_gold_top100=0.4698
182
+ step=200 epoch=1/2 epoch_step=200/19018 micro_steps=800 elapsed=199.4s lr=2.412000e-05 loss=8.9228 loss_recon=8.9228 loss_meanflow=0.0000 mean_model_t=0.4969 mean_corrupt_t=0.5032 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.0558 corrupt_frac=1.0000 acc_corrupt=0.0558 loss_corrupt=8.9228 wrong_frac=0.4974 init_acc_corrupt=0.5026 acc_corrupt_t_0p0_0p2=0.0439 corrupt_frac_t_0p0_0p2=0.1932 acc_corrupt_t_0p2_0p4=0.0445 corrupt_frac_t_0p2_0p4=0.2028 acc_corrupt_t_0p4_0p6=0.0465 corrupt_frac_t_0p4_0p6=0.2034 acc_corrupt_t_0p6_0p8=0.0586 corrupt_frac_t_0p6_0p8=0.1996 acc_corrupt_t_0p8_1p0=0.0852 corrupt_frac_t_0p8_1p0=0.2009 out_w_norm=7.1043 out_g_norm=1.6873 loss_all=8.0208 init_gold_top10=0.4885 init_gold_top100=0.4902
183
+ step=300 epoch=1/2 epoch_step=300/19018 micro_steps=1200 elapsed=226.1s lr=3.612000e-05 loss=7.0822 loss_recon=7.0822 loss_meanflow=0.0000 mean_model_t=0.4995 mean_corrupt_t=0.4971 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.1080 corrupt_frac=1.0000 acc_corrupt=0.1080 loss_corrupt=7.0822 wrong_frac=0.5029 init_acc_corrupt=0.4971 acc_corrupt_t_0p0_0p2=0.0511 corrupt_frac_t_0p0_0p2=0.2049 acc_corrupt_t_0p2_0p4=0.0777 corrupt_frac_t_0p2_0p4=0.1954 acc_corrupt_t_0p4_0p6=0.1070 corrupt_frac_t_0p4_0p6=0.2034 acc_corrupt_t_0p6_0p8=0.1400 corrupt_frac_t_0p6_0p8=0.2009 acc_corrupt_t_0p8_1p0=0.1660 corrupt_frac_t_0p8_1p0=0.1954 out_w_norm=13.5033 out_g_norm=1.2325 loss_all=5.8280 init_gold_top10=0.5261 init_gold_top100=0.5270
184
+ step=400 epoch=1/2 epoch_step=400/19018 micro_steps=1600 elapsed=199.5s lr=4.812000e-05 loss=4.4781 loss_recon=4.4781 loss_meanflow=0.0000 mean_model_t=0.4998 mean_corrupt_t=0.4972 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.4401 corrupt_frac=1.0000 acc_corrupt=0.4401 loss_corrupt=4.4781 wrong_frac=0.5029 init_acc_corrupt=0.4971 acc_corrupt_t_0p0_0p2=0.1077 corrupt_frac_t_0p0_0p2=0.2051 acc_corrupt_t_0p2_0p4=0.2716 corrupt_frac_t_0p2_0p4=0.2005 acc_corrupt_t_0p4_0p6=0.4383 corrupt_frac_t_0p4_0p6=0.2012 acc_corrupt_t_0p6_0p8=0.6137 corrupt_frac_t_0p6_0p8=0.1928 acc_corrupt_t_0p8_1p0=0.7803 corrupt_frac_t_0p8_1p0=0.2018 out_w_norm=19.2735 out_g_norm=0.4549 loss_all=3.8788 init_gold_top10=0.5142 init_gold_top100=0.5154
185
+ step=500 epoch=1/2 epoch_step=500/19018 micro_steps=2000 elapsed=199.5s lr=6.012000e-05 loss=3.9170 loss_recon=3.9170 loss_meanflow=0.0000 mean_model_t=0.4974 mean_corrupt_t=0.5002 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.5027 corrupt_frac=1.0000 acc_corrupt=0.5027 loss_corrupt=3.9170 wrong_frac=0.5004 init_acc_corrupt=0.4996 acc_corrupt_t_0p0_0p2=0.1147 corrupt_frac_t_0p0_0p2=0.2013 acc_corrupt_t_0p2_0p4=0.3073 corrupt_frac_t_0p2_0p4=0.1991 acc_corrupt_t_0p4_0p6=0.5015 corrupt_frac_t_0p4_0p6=0.1994 acc_corrupt_t_0p6_0p8=0.6964 corrupt_frac_t_0p6_0p8=0.1978 acc_corrupt_t_0p8_1p0=0.8928 corrupt_frac_t_0p8_1p0=0.2025 out_w_norm=22.6140 out_g_norm=0.3801 loss_all=3.6717 init_gold_top10=0.5317 init_gold_top100=0.5332
186
+ step=600 epoch=1/2 epoch_step=600/19018 micro_steps=2400 elapsed=199.5s lr=7.212000e-05 loss=3.8422 loss_recon=3.8422 loss_meanflow=0.0000 mean_model_t=0.4983 mean_corrupt_t=0.5039 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.5099 corrupt_frac=1.0000 acc_corrupt=0.5099 loss_corrupt=3.8422 wrong_frac=0.4967 init_acc_corrupt=0.5033 acc_corrupt_t_0p0_0p2=0.1191 corrupt_frac_t_0p0_0p2=0.1962 acc_corrupt_t_0p2_0p4=0.3077 corrupt_frac_t_0p2_0p4=0.1950 acc_corrupt_t_0p4_0p6=0.5053 corrupt_frac_t_0p4_0p6=0.2069 acc_corrupt_t_0p6_0p8=0.7012 corrupt_frac_t_0p6_0p8=0.2041 acc_corrupt_t_0p8_1p0=0.9011 corrupt_frac_t_0p8_1p0=0.1993 out_w_norm=23.8739 out_g_norm=0.3429 loss_all=4.1737 init_gold_top10=0.4535 init_gold_top100=0.4548
187
+ step=700 epoch=1/2 epoch_step=700/19018 micro_steps=2800 elapsed=199.5s lr=8.412000e-05 loss=3.7273 loss_recon=3.7273 loss_meanflow=0.0000 mean_model_t=0.4997 mean_corrupt_t=0.5042 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.5165 corrupt_frac=1.0000 acc_corrupt=0.5165 loss_corrupt=3.7273 wrong_frac=0.4958 init_acc_corrupt=0.5042 acc_corrupt_t_0p0_0p2=0.1227 corrupt_frac_t_0p0_0p2=0.1949 acc_corrupt_t_0p2_0p4=0.3148 corrupt_frac_t_0p2_0p4=0.1977 acc_corrupt_t_0p4_0p6=0.5121 corrupt_frac_t_0p4_0p6=0.2004 acc_corrupt_t_0p6_0p8=0.7082 corrupt_frac_t_0p6_0p8=0.2050 acc_corrupt_t_0p8_1p0=0.9019 corrupt_frac_t_0p8_1p0=0.2030 out_w_norm=24.9630 out_g_norm=0.3548 loss_all=4.1789 init_gold_top10=0.4437 init_gold_top100=0.4450
188
+ step=800 epoch=1/2 epoch_step=800/19018 micro_steps=3200 elapsed=200.5s lr=9.612000e-05 loss=3.6741 loss_recon=3.6741 loss_meanflow=0.0000 mean_model_t=0.5031 mean_corrupt_t=0.4978 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.5160 corrupt_frac=1.0000 acc_corrupt=0.5160 loss_corrupt=3.6741 wrong_frac=0.5020 init_acc_corrupt=0.4980 acc_corrupt_t_0p0_0p2=0.1243 corrupt_frac_t_0p0_0p2=0.2044 acc_corrupt_t_0p2_0p4=0.3250 corrupt_frac_t_0p2_0p4=0.1984 acc_corrupt_t_0p4_0p6=0.5212 corrupt_frac_t_0p4_0p6=0.1988 acc_corrupt_t_0p6_0p8=0.7155 corrupt_frac_t_0p6_0p8=0.1989 acc_corrupt_t_0p8_1p0=0.9035 corrupt_frac_t_0p8_1p0=0.1999 out_w_norm=26.0897 out_g_norm=0.3698 loss_all=3.3709 init_gold_top10=0.5284 init_gold_top100=0.5297
189
+ step=900 epoch=1/2 epoch_step=900/19018 micro_steps=3600 elapsed=200.5s lr=1.081200e-04 loss=3.5740 loss_recon=3.5740 loss_meanflow=0.0000 mean_model_t=0.4997 mean_corrupt_t=0.5014 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.5228 corrupt_frac=1.0000 acc_corrupt=0.5228 loss_corrupt=3.5740 wrong_frac=0.4989 init_acc_corrupt=0.5012 acc_corrupt_t_0p0_0p2=0.1279 corrupt_frac_t_0p0_0p2=0.1955 acc_corrupt_t_0p2_0p4=0.3258 corrupt_frac_t_0p2_0p4=0.2058 acc_corrupt_t_0p4_0p6=0.5272 corrupt_frac_t_0p4_0p6=0.1979 acc_corrupt_t_0p6_0p8=0.7187 corrupt_frac_t_0p6_0p8=0.1978 acc_corrupt_t_0p8_1p0=0.9052 corrupt_frac_t_0p8_1p0=0.2040 out_w_norm=27.1010 out_g_norm=0.3675 loss_all=3.9080 init_gold_top10=0.4464 init_gold_top100=0.4482
190
+ step=1000 epoch=1/2 epoch_step=1000/19018 micro_steps=4000 elapsed=210.6s lr=1.201200e-04 loss=3.5327 loss_recon=3.5327 loss_meanflow=0.0000 mean_model_t=0.5026 mean_corrupt_t=0.4997 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.5239 corrupt_frac=1.0000 acc_corrupt=0.5239 loss_corrupt=3.5327 wrong_frac=0.5001 init_acc_corrupt=0.4999 acc_corrupt_t_0p0_0p2=0.1297 corrupt_frac_t_0p0_0p2=0.2010 acc_corrupt_t_0p2_0p4=0.3304 corrupt_frac_t_0p2_0p4=0.2006 acc_corrupt_t_0p4_0p6=0.5322 corrupt_frac_t_0p4_0p6=0.1958 acc_corrupt_t_0p6_0p8=0.7208 corrupt_frac_t_0p6_0p8=0.2033 acc_corrupt_t_0p8_1p0=0.9072 corrupt_frac_t_0p8_1p0=0.1993 out_w_norm=28.0792 out_g_norm=0.3662 loss_all=3.1546 init_gold_top10=0.5438 init_gold_top100=0.5452
191
+ step=1100 epoch=1/2 epoch_step=1100/19018 micro_steps=4400 elapsed=225.3s lr=1.321200e-04 loss=3.4852 loss_recon=3.4852 loss_meanflow=0.0000 mean_model_t=0.5014 mean_corrupt_t=0.5007 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.5269 corrupt_frac=1.0000 acc_corrupt=0.5269 loss_corrupt=3.4852 wrong_frac=0.4993 init_acc_corrupt=0.5008 acc_corrupt_t_0p0_0p2=0.1325 corrupt_frac_t_0p0_0p2=0.1999 acc_corrupt_t_0p2_0p4=0.3354 corrupt_frac_t_0p2_0p4=0.1979 acc_corrupt_t_0p4_0p6=0.5330 corrupt_frac_t_0p4_0p6=0.2044 acc_corrupt_t_0p6_0p8=0.7262 corrupt_frac_t_0p6_0p8=0.1983 acc_corrupt_t_0p8_1p0=0.9077 corrupt_frac_t_0p8_1p0=0.1995 out_w_norm=29.1077 out_g_norm=0.3601 loss_all=3.1611 init_gold_top10=0.5427 init_gold_top100=0.5437
192
+ step=1200 epoch=1/2 epoch_step=1200/19018 micro_steps=4800 elapsed=199.5s lr=1.441200e-04 loss=3.4507 loss_recon=3.4507 loss_meanflow=0.0000 mean_model_t=0.5030 mean_corrupt_t=0.5005 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.5286 corrupt_frac=1.0000 acc_corrupt=0.5286 loss_corrupt=3.4507 wrong_frac=0.4995 init_acc_corrupt=0.5005 acc_corrupt_t_0p0_0p2=0.1307 corrupt_frac_t_0p0_0p2=0.1980 acc_corrupt_t_0p2_0p4=0.3380 corrupt_frac_t_0p2_0p4=0.2011 acc_corrupt_t_0p4_0p6=0.5372 corrupt_frac_t_0p4_0p6=0.2018 acc_corrupt_t_0p6_0p8=0.7286 corrupt_frac_t_0p6_0p8=0.2042 acc_corrupt_t_0p8_1p0=0.9088 corrupt_frac_t_0p8_1p0=0.1959 out_w_norm=30.2570 out_g_norm=0.3400 loss_all=3.7855 init_gold_top10=0.4513 init_gold_top100=0.4527
193
+ step=1300 epoch=1/2 epoch_step=1300/19018 micro_steps=5200 elapsed=199.5s lr=1.561200e-04 loss=3.4260 loss_recon=3.4260 loss_meanflow=0.0000 mean_model_t=0.5031 mean_corrupt_t=0.5000 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.5299 corrupt_frac=1.0000 acc_corrupt=0.5299 loss_corrupt=3.4260 wrong_frac=0.4997 init_acc_corrupt=0.5003 acc_corrupt_t_0p0_0p2=0.1330 corrupt_frac_t_0p0_0p2=0.1987 acc_corrupt_t_0p2_0p4=0.3374 corrupt_frac_t_0p2_0p4=0.1973 acc_corrupt_t_0p4_0p6=0.5402 corrupt_frac_t_0p4_0p6=0.2074 acc_corrupt_t_0p6_0p8=0.7292 corrupt_frac_t_0p6_0p8=0.2066 acc_corrupt_t_0p8_1p0=0.9107 corrupt_frac_t_0p8_1p0=0.1925 out_w_norm=31.5035 out_g_norm=0.3191 loss_all=2.8906 init_gold_top10=0.5694 init_gold_top100=0.5705
194
+ step=1400 epoch=1/2 epoch_step=1400/19018 micro_steps=5600 elapsed=199.5s lr=1.681200e-04 loss=3.4040 loss_recon=3.4040 loss_meanflow=0.0000 mean_model_t=0.4973 mean_corrupt_t=0.5001 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.5313 corrupt_frac=1.0000 acc_corrupt=0.5313 loss_corrupt=3.4040 wrong_frac=0.4995 init_acc_corrupt=0.5005 acc_corrupt_t_0p0_0p2=0.1332 corrupt_frac_t_0p0_0p2=0.1993 acc_corrupt_t_0p2_0p4=0.3377 corrupt_frac_t_0p2_0p4=0.1990 acc_corrupt_t_0p4_0p6=0.5388 corrupt_frac_t_0p4_0p6=0.2007 acc_corrupt_t_0p6_0p8=0.7315 corrupt_frac_t_0p6_0p8=0.2052 acc_corrupt_t_0p8_1p0=0.9111 corrupt_frac_t_0p8_1p0=0.1983 out_w_norm=32.9606 out_g_norm=0.2942 loss_all=2.8138 init_gold_top10=0.5705 init_gold_top100=0.5716
195
+ step=1500 epoch=1/2 epoch_step=1500/19018 micro_steps=6000 elapsed=199.6s lr=1.801200e-04 loss=3.3915 loss_recon=3.3915 loss_meanflow=0.0000 mean_model_t=0.5006 mean_corrupt_t=0.4996 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.5313 corrupt_frac=1.0000 acc_corrupt=0.5313 loss_corrupt=3.3915 wrong_frac=0.5006 init_acc_corrupt=0.4994 acc_corrupt_t_0p0_0p2=0.1336 corrupt_frac_t_0p0_0p2=0.1960 acc_corrupt_t_0p2_0p4=0.3407 corrupt_frac_t_0p2_0p4=0.2036 acc_corrupt_t_0p4_0p6=0.5395 corrupt_frac_t_0p4_0p6=0.2020 acc_corrupt_t_0p6_0p8=0.7314 corrupt_frac_t_0p6_0p8=0.2017 acc_corrupt_t_0p8_1p0=0.9111 corrupt_frac_t_0p8_1p0=0.1967 out_w_norm=34.6646 out_g_norm=0.2818 loss_all=3.3737 init_gold_top10=0.5000 init_gold_top100=0.5015
196
+ step=1600 epoch=1/2 epoch_step=1600/19018 micro_steps=6400 elapsed=199.5s lr=1.921200e-04 loss=3.3626 loss_recon=3.3626 loss_meanflow=0.0000 mean_model_t=0.4998 mean_corrupt_t=0.5003 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.5335 corrupt_frac=1.0000 acc_corrupt=0.5335 loss_corrupt=3.3626 wrong_frac=0.4994 init_acc_corrupt=0.5006 acc_corrupt_t_0p0_0p2=0.1365 corrupt_frac_t_0p0_0p2=0.2023 acc_corrupt_t_0p2_0p4=0.3397 corrupt_frac_t_0p2_0p4=0.1965 acc_corrupt_t_0p4_0p6=0.5408 corrupt_frac_t_0p4_0p6=0.1999 acc_corrupt_t_0p6_0p8=0.7328 corrupt_frac_t_0p6_0p8=0.1957 acc_corrupt_t_0p8_1p0=0.9115 corrupt_frac_t_0p8_1p0=0.2065 out_w_norm=36.5363 out_g_norm=0.2637 loss_all=3.5901 init_gold_top10=0.4688 init_gold_top100=0.4702
197
+ step=1700 epoch=1/2 epoch_step=1700/19018 micro_steps=6800 elapsed=199.6s lr=2.041200e-04 loss=3.3443 loss_recon=3.3443 loss_meanflow=0.0000 mean_model_t=0.4981 mean_corrupt_t=0.4997 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.5344 corrupt_frac=1.0000 acc_corrupt=0.5344 loss_corrupt=3.3443 wrong_frac=0.4994 init_acc_corrupt=0.5006 acc_corrupt_t_0p0_0p2=0.1348 corrupt_frac_t_0p0_0p2=0.2017 acc_corrupt_t_0p2_0p4=0.3426 corrupt_frac_t_0p2_0p4=0.1997 acc_corrupt_t_0p4_0p6=0.5449 corrupt_frac_t_0p4_0p6=0.1972 acc_corrupt_t_0p6_0p8=0.7340 corrupt_frac_t_0p6_0p8=0.1989 acc_corrupt_t_0p8_1p0=0.9131 corrupt_frac_t_0p8_1p0=0.2034 out_w_norm=38.5982 out_g_norm=0.2560 loss_all=3.2181 init_gold_top10=0.5173 init_gold_top100=0.5193
LTA_openwebtext_dualt/logs/owt_fully_categorical_gate_probe_step116k_n8.log ADDED
@@ -0,0 +1,44 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [forbid_endpoint_ids] n=352 first=[94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125]
2
+ [decode] steps32_c128_mtpre_t0p9_tpow1p0_noise0_state_categorical_argmax
3
+ [summary] {"name": "steps32_c128_mtpre_t0p9_tpow1p0_noise0_state_categorical_argmax", "step": 118000, "n_samples": 8, "steps": 32, "concentration_max": 128.0, "temp_start": 0.9, "temp_end": 0.9, "temp_schedule": "const", "t_power": 1.0, "eta0": 0.0, "eta_schedule": "none", "noise_conc": 1.0, "final_from": "state", "final_decode": "argmax", "final_temp": 1.0, "final_top_k": 16, "final_uncertain_threshold": 0.85, "update_rule": "categorical_argmax", "model_t_mode": "pre", "lock_bos": true, "lock_final_eos": false, "detok_genppl": NaN, "sample_entropy": 1.7124667587101636, "distinct_1": 0.2037353515625, "distinct_2": 0.36779081133919844, "top_token_mass": 0.3330078125, "tokens_scored": 0, "readability_score": 2.0797317483616076, "mean_chars": 2024.625, "replacement_chars": 0.0}
4
+ [decode] steps32_c128_mtpost_t0p9_tpow1p0_noise0_state_categorical_argmax
5
+ [summary] {"name": "steps32_c128_mtpost_t0p9_tpow1p0_noise0_state_categorical_argmax", "step": 118000, "n_samples": 8, "steps": 32, "concentration_max": 128.0, "temp_start": 0.9, "temp_end": 0.9, "temp_schedule": "const", "t_power": 1.0, "eta0": 0.0, "eta_schedule": "none", "noise_conc": 1.0, "final_from": "state", "final_decode": "argmax", "final_temp": 1.0, "final_top_k": 16, "final_uncertain_threshold": 0.85, "update_rule": "categorical_argmax", "model_t_mode": "post", "lock_bos": true, "lock_final_eos": false, "detok_genppl": NaN, "sample_entropy": 2.072406150304612, "distinct_1": 0.2462158203125, "distinct_2": 0.4319403714565005, "top_token_mass": 0.363525390625, "tokens_scored": 0, "readability_score": 3.8781453333859375, "mean_chars": 2446.625, "replacement_chars": 0.0}
6
+ [decode] steps32_c128_mtpre_t0p9_tpow1p0_noise0_state_catstep_top16_categorical_sample
7
+ [summary] {"name": "steps32_c128_mtpre_t0p9_tpow1p0_noise0_state_catstep_top16_categorical_sample", "step": 118000, "n_samples": 8, "steps": 32, "concentration_max": 128.0, "temp_start": 0.9, "temp_end": 0.9, "temp_schedule": "const", "t_power": 1.0, "eta0": 0.0, "eta_schedule": "none", "noise_conc": 1.0, "final_from": "state", "final_decode": "argmax", "final_temp": 1.0, "final_top_k": 16, "final_uncertain_threshold": 0.85, "update_rule": "categorical_sample", "model_t_mode": "pre", "lock_bos": true, "lock_final_eos": false, "detok_genppl": NaN, "sample_entropy": 1.4362033482302652, "distinct_1": 0.0692138671875, "distinct_2": 0.14442815249266863, "top_token_mass": 0.4456787109375, "tokens_scored": 0, "readability_score": 1.912107983179559, "mean_chars": 1382.875, "replacement_chars": 0.0}
8
+ [decode] steps32_c128_mtpost_t0p9_tpow1p0_noise0_state_catstep_top16_categorical_sample
9
+ [summary] {"name": "steps32_c128_mtpost_t0p9_tpow1p0_noise0_state_catstep_top16_categorical_sample", "step": 118000, "n_samples": 8, "steps": 32, "concentration_max": 128.0, "temp_start": 0.9, "temp_end": 0.9, "temp_schedule": "const", "t_power": 1.0, "eta0": 0.0, "eta_schedule": "none", "noise_conc": 1.0, "final_from": "state", "final_decode": "argmax", "final_temp": 1.0, "final_top_k": 16, "final_uncertain_threshold": 0.85, "update_rule": "categorical_sample", "model_t_mode": "post", "lock_bos": true, "lock_final_eos": false, "detok_genppl": NaN, "sample_entropy": 1.3343823647761228, "distinct_1": 0.055908203125, "distinct_2": 0.11632453567937438, "top_token_mass": 0.48876953125, "tokens_scored": 0, "readability_score": 2.46729427930036, "mean_chars": 1457.375, "replacement_chars": 0.0}
10
+ [decode] steps32_c128_mtpre_t0p9_tpow1p0_noise0_state_categorical_argmax
11
+ [summary] {"name": "steps32_c128_mtpre_t0p9_tpow1p0_noise0_state_categorical_argmax", "step": 118000, "n_samples": 8, "steps": 32, "concentration_max": 128.0, "temp_start": 0.9, "temp_end": 0.9, "temp_schedule": "const", "t_power": 1.0, "eta0": 0.0, "eta_schedule": "none", "noise_conc": 1.0, "final_from": "state", "final_decode": "argmax", "final_temp": 1.0, "final_top_k": 64, "final_uncertain_threshold": 0.85, "update_rule": "categorical_argmax", "model_t_mode": "pre", "lock_bos": true, "lock_final_eos": false, "detok_genppl": NaN, "sample_entropy": 1.7124667587101636, "distinct_1": 0.2037353515625, "distinct_2": 0.36779081133919844, "top_token_mass": 0.3330078125, "tokens_scored": 0, "readability_score": 2.0797317483616076, "mean_chars": 2024.625, "replacement_chars": 0.0}
12
+ [decode] steps32_c128_mtpost_t0p9_tpow1p0_noise0_state_categorical_argmax
13
+ [summary] {"name": "steps32_c128_mtpost_t0p9_tpow1p0_noise0_state_categorical_argmax", "step": 118000, "n_samples": 8, "steps": 32, "concentration_max": 128.0, "temp_start": 0.9, "temp_end": 0.9, "temp_schedule": "const", "t_power": 1.0, "eta0": 0.0, "eta_schedule": "none", "noise_conc": 1.0, "final_from": "state", "final_decode": "argmax", "final_temp": 1.0, "final_top_k": 64, "final_uncertain_threshold": 0.85, "update_rule": "categorical_argmax", "model_t_mode": "post", "lock_bos": true, "lock_final_eos": false, "detok_genppl": NaN, "sample_entropy": 2.072406150304612, "distinct_1": 0.2462158203125, "distinct_2": 0.4319403714565005, "top_token_mass": 0.363525390625, "tokens_scored": 0, "readability_score": 3.8781453333859375, "mean_chars": 2446.625, "replacement_chars": 0.0}
14
+ [decode] steps32_c128_mtpre_t0p9_tpow1p0_noise0_state_catstep_top64_categorical_sample
15
+ [summary] {"name": "steps32_c128_mtpre_t0p9_tpow1p0_noise0_state_catstep_top64_categorical_sample", "step": 118000, "n_samples": 8, "steps": 32, "concentration_max": 128.0, "temp_start": 0.9, "temp_end": 0.9, "temp_schedule": "const", "t_power": 1.0, "eta0": 0.0, "eta_schedule": "none", "noise_conc": 1.0, "final_from": "state", "final_decode": "argmax", "final_temp": 1.0, "final_top_k": 64, "final_uncertain_threshold": 0.85, "update_rule": "categorical_sample", "model_t_mode": "pre", "lock_bos": true, "lock_final_eos": false, "detok_genppl": NaN, "sample_entropy": 1.800012647460023, "distinct_1": 0.0850830078125, "distinct_2": 0.19293743890518084, "top_token_mass": 0.471923828125, "tokens_scored": 0, "readability_score": 3.803961697268313, "mean_chars": 1491.875, "replacement_chars": 0.0}
16
+ [decode] steps32_c128_mtpost_t0p9_tpow1p0_noise0_state_catstep_top64_categorical_sample
17
+ [summary] {"name": "steps32_c128_mtpost_t0p9_tpow1p0_noise0_state_catstep_top64_categorical_sample", "step": 118000, "n_samples": 8, "steps": 32, "concentration_max": 128.0, "temp_start": 0.9, "temp_end": 0.9, "temp_schedule": "const", "t_power": 1.0, "eta0": 0.0, "eta_schedule": "none", "noise_conc": 1.0, "final_from": "state", "final_decode": "argmax", "final_temp": 1.0, "final_top_k": 64, "final_uncertain_threshold": 0.85, "update_rule": "categorical_sample", "model_t_mode": "post", "lock_bos": true, "lock_final_eos": false, "detok_genppl": NaN, "sample_entropy": 1.9604407576571412, "distinct_1": 0.1053466796875, "distinct_2": 0.22886119257086998, "top_token_mass": 0.3310546875, "tokens_scored": 0, "readability_score": 3.1418669071837146, "mean_chars": 2092.5, "replacement_chars": 0.0}
18
+ [decode] steps32_c128_mtpre_t0p9_tpow1p0_noise0_endpoint_categorical_argmax
19
+ [summary] {"name": "steps32_c128_mtpre_t0p9_tpow1p0_noise0_endpoint_categorical_argmax", "step": 118000, "n_samples": 8, "steps": 32, "concentration_max": 128.0, "temp_start": 0.9, "temp_end": 0.9, "temp_schedule": "const", "t_power": 1.0, "eta0": 0.0, "eta_schedule": "none", "noise_conc": 1.0, "final_from": "endpoint", "final_decode": "argmax", "final_temp": 1.0, "final_top_k": 16, "final_uncertain_threshold": 0.85, "update_rule": "categorical_argmax", "model_t_mode": "pre", "lock_bos": true, "lock_final_eos": false, "detok_genppl": NaN, "sample_entropy": 1.7124667587101636, "distinct_1": 0.2037353515625, "distinct_2": 0.36779081133919844, "top_token_mass": 0.3330078125, "tokens_scored": 0, "readability_score": 2.0797317483616076, "mean_chars": 2024.625, "replacement_chars": 0.0}
20
+ [decode] steps32_c128_mtpost_t0p9_tpow1p0_noise0_endpoint_categorical_argmax
21
+ [summary] {"name": "steps32_c128_mtpost_t0p9_tpow1p0_noise0_endpoint_categorical_argmax", "step": 118000, "n_samples": 8, "steps": 32, "concentration_max": 128.0, "temp_start": 0.9, "temp_end": 0.9, "temp_schedule": "const", "t_power": 1.0, "eta0": 0.0, "eta_schedule": "none", "noise_conc": 1.0, "final_from": "endpoint", "final_decode": "argmax", "final_temp": 1.0, "final_top_k": 16, "final_uncertain_threshold": 0.85, "update_rule": "categorical_argmax", "model_t_mode": "post", "lock_bos": true, "lock_final_eos": false, "detok_genppl": NaN, "sample_entropy": 2.072406150304612, "distinct_1": 0.2462158203125, "distinct_2": 0.4319403714565005, "top_token_mass": 0.363525390625, "tokens_scored": 0, "readability_score": 3.8781453333859375, "mean_chars": 2446.625, "replacement_chars": 0.0}
22
+ [decode] steps32_c128_mtpre_t0p9_tpow1p0_noise0_endpoint_catstep_top16_categorical_sample
23
+ [summary] {"name": "steps32_c128_mtpre_t0p9_tpow1p0_noise0_endpoint_catstep_top16_categorical_sample", "step": 118000, "n_samples": 8, "steps": 32, "concentration_max": 128.0, "temp_start": 0.9, "temp_end": 0.9, "temp_schedule": "const", "t_power": 1.0, "eta0": 0.0, "eta_schedule": "none", "noise_conc": 1.0, "final_from": "endpoint", "final_decode": "argmax", "final_temp": 1.0, "final_top_k": 16, "final_uncertain_threshold": 0.85, "update_rule": "categorical_sample", "model_t_mode": "pre", "lock_bos": true, "lock_final_eos": false, "detok_genppl": NaN, "sample_entropy": 1.4495944570112436, "distinct_1": 0.0716552734375, "distinct_2": 0.14821603128054742, "top_token_mass": 0.4456787109375, "tokens_scored": 0, "readability_score": 1.9542281177140906, "mean_chars": 1389.25, "replacement_chars": 0.0}
24
+ [decode] steps32_c128_mtpost_t0p9_tpow1p0_noise0_endpoint_catstep_top16_categorical_sample
25
+ [summary] {"name": "steps32_c128_mtpost_t0p9_tpow1p0_noise0_endpoint_catstep_top16_categorical_sample", "step": 118000, "n_samples": 8, "steps": 32, "concentration_max": 128.0, "temp_start": 0.9, "temp_end": 0.9, "temp_schedule": "const", "t_power": 1.0, "eta0": 0.0, "eta_schedule": "none", "noise_conc": 1.0, "final_from": "endpoint", "final_decode": "argmax", "final_temp": 1.0, "final_top_k": 16, "final_uncertain_threshold": 0.85, "update_rule": "categorical_sample", "model_t_mode": "post", "lock_bos": true, "lock_final_eos": false, "detok_genppl": NaN, "sample_entropy": 1.3316715807580835, "distinct_1": 0.05615234375, "distinct_2": 0.1170576735092864, "top_token_mass": 0.4906005859375, "tokens_scored": 0, "readability_score": 2.485367356707624, "mean_chars": 1454.875, "replacement_chars": 0.0}
26
+ [decode] steps32_c128_mtpre_t0p9_tpow1p0_noise0_endpoint_categorical_argmax
27
+ [summary] {"name": "steps32_c128_mtpre_t0p9_tpow1p0_noise0_endpoint_categorical_argmax", "step": 118000, "n_samples": 8, "steps": 32, "concentration_max": 128.0, "temp_start": 0.9, "temp_end": 0.9, "temp_schedule": "const", "t_power": 1.0, "eta0": 0.0, "eta_schedule": "none", "noise_conc": 1.0, "final_from": "endpoint", "final_decode": "argmax", "final_temp": 1.0, "final_top_k": 64, "final_uncertain_threshold": 0.85, "update_rule": "categorical_argmax", "model_t_mode": "pre", "lock_bos": true, "lock_final_eos": false, "detok_genppl": NaN, "sample_entropy": 1.7124667587101636, "distinct_1": 0.2037353515625, "distinct_2": 0.36779081133919844, "top_token_mass": 0.3330078125, "tokens_scored": 0, "readability_score": 2.0797317483616076, "mean_chars": 2024.625, "replacement_chars": 0.0}
28
+ [decode] steps32_c128_mtpost_t0p9_tpow1p0_noise0_endpoint_categorical_argmax
29
+ [summary] {"name": "steps32_c128_mtpost_t0p9_tpow1p0_noise0_endpoint_categorical_argmax", "step": 118000, "n_samples": 8, "steps": 32, "concentration_max": 128.0, "temp_start": 0.9, "temp_end": 0.9, "temp_schedule": "const", "t_power": 1.0, "eta0": 0.0, "eta_schedule": "none", "noise_conc": 1.0, "final_from": "endpoint", "final_decode": "argmax", "final_temp": 1.0, "final_top_k": 64, "final_uncertain_threshold": 0.85, "update_rule": "categorical_argmax", "model_t_mode": "post", "lock_bos": true, "lock_final_eos": false, "detok_genppl": NaN, "sample_entropy": 2.072406150304612, "distinct_1": 0.2462158203125, "distinct_2": 0.4319403714565005, "top_token_mass": 0.363525390625, "tokens_scored": 0, "readability_score": 3.8781453333859375, "mean_chars": 2446.625, "replacement_chars": 0.0}
30
+ [decode] steps32_c128_mtpre_t0p9_tpow1p0_noise0_endpoint_catstep_top64_categorical_sample
31
+ [summary] {"name": "steps32_c128_mtpre_t0p9_tpow1p0_noise0_endpoint_catstep_top64_categorical_sample", "step": 118000, "n_samples": 8, "steps": 32, "concentration_max": 128.0, "temp_start": 0.9, "temp_end": 0.9, "temp_schedule": "const", "t_power": 1.0, "eta0": 0.0, "eta_schedule": "none", "noise_conc": 1.0, "final_from": "endpoint", "final_decode": "argmax", "final_temp": 1.0, "final_top_k": 64, "final_uncertain_threshold": 0.85, "update_rule": "categorical_sample", "model_t_mode": "pre", "lock_bos": true, "lock_final_eos": false, "detok_genppl": NaN, "sample_entropy": 1.8150410584869763, "distinct_1": 0.08837890625, "distinct_2": 0.19709188660801563, "top_token_mass": 0.4718017578125, "tokens_scored": 0, "readability_score": 3.791988656340701, "mean_chars": 1499.0, "replacement_chars": 0.0}
32
+ [decode] steps32_c128_mtpost_t0p9_tpow1p0_noise0_endpoint_catstep_top64_categorical_sample
33
+ [summary] {"name": "steps32_c128_mtpost_t0p9_tpow1p0_noise0_endpoint_catstep_top64_categorical_sample", "step": 118000, "n_samples": 8, "steps": 32, "concentration_max": 128.0, "temp_start": 0.9, "temp_end": 0.9, "temp_schedule": "const", "t_power": 1.0, "eta0": 0.0, "eta_schedule": "none", "noise_conc": 1.0, "final_from": "endpoint", "final_decode": "argmax", "final_temp": 1.0, "final_top_k": 64, "final_uncertain_threshold": 0.85, "update_rule": "categorical_sample", "model_t_mode": "post", "lock_bos": true, "lock_final_eos": false, "detok_genppl": NaN, "sample_entropy": 1.9645514611795167, "distinct_1": 0.1075439453125, "distinct_2": 0.2308162267839687, "top_token_mass": 0.331787109375, "tokens_scored": 0, "readability_score": 3.1571948202657896, "mean_chars": 2094.25, "replacement_chars": 0.0}
34
+ [decode] steps32_c128_mtpre_t1p0_tpow1p0_noise0_state_categorical_argmax
35
+ [summary] {"name": "steps32_c128_mtpre_t1p0_tpow1p0_noise0_state_categorical_argmax", "step": 118000, "n_samples": 8, "steps": 32, "concentration_max": 128.0, "temp_start": 1.0, "temp_end": 1.0, "temp_schedule": "const", "t_power": 1.0, "eta0": 0.0, "eta_schedule": "none", "noise_conc": 1.0, "final_from": "state", "final_decode": "argmax", "final_temp": 1.0, "final_top_k": 16, "final_uncertain_threshold": 0.85, "update_rule": "categorical_argmax", "model_t_mode": "pre", "lock_bos": true, "lock_final_eos": false, "detok_genppl": NaN, "sample_entropy": 1.7124667587101636, "distinct_1": 0.2037353515625, "distinct_2": 0.36779081133919844, "top_token_mass": 0.3330078125, "tokens_scored": 0, "readability_score": 2.0797317483616076, "mean_chars": 2024.625, "replacement_chars": 0.0}
36
+ [decode] steps32_c128_mtpost_t1p0_tpow1p0_noise0_state_categorical_argmax
37
+ [summary] {"name": "steps32_c128_mtpost_t1p0_tpow1p0_noise0_state_categorical_argmax", "step": 118000, "n_samples": 8, "steps": 32, "concentration_max": 128.0, "temp_start": 1.0, "temp_end": 1.0, "temp_schedule": "const", "t_power": 1.0, "eta0": 0.0, "eta_schedule": "none", "noise_conc": 1.0, "final_from": "state", "final_decode": "argmax", "final_temp": 1.0, "final_top_k": 16, "final_uncertain_threshold": 0.85, "update_rule": "categorical_argmax", "model_t_mode": "post", "lock_bos": true, "lock_final_eos": false, "detok_genppl": NaN, "sample_entropy": 2.072406150304612, "distinct_1": 0.2462158203125, "distinct_2": 0.4319403714565005, "top_token_mass": 0.363525390625, "tokens_scored": 0, "readability_score": 3.8781453333859375, "mean_chars": 2446.625, "replacement_chars": 0.0}
38
+ [decode] steps32_c128_mtpre_t1p0_tpow1p0_noise0_state_catstep_top16_categorical_sample
39
+ [summary] {"name": "steps32_c128_mtpre_t1p0_tpow1p0_noise0_state_catstep_top16_categorical_sample", "step": 118000, "n_samples": 8, "steps": 32, "concentration_max": 128.0, "temp_start": 1.0, "temp_end": 1.0, "temp_schedule": "const", "t_power": 1.0, "eta0": 0.0, "eta_schedule": "none", "noise_conc": 1.0, "final_from": "state", "final_decode": "argmax", "final_temp": 1.0, "final_top_k": 16, "final_uncertain_threshold": 0.85, "update_rule": "categorical_sample", "model_t_mode": "pre", "lock_bos": true, "lock_final_eos": false, "detok_genppl": NaN, "sample_entropy": 1.256154834660427, "distinct_1": 0.0556640625, "distinct_2": 0.11925708699902249, "top_token_mass": 0.390380859375, "tokens_scored": 0, "readability_score": 2.1900031563523, "mean_chars": 1464.75, "replacement_chars": 0.0}
40
+ [decode] steps32_c128_mtpost_t1p0_tpow1p0_noise0_state_catstep_top16_categorical_sample
41
+ [summary] {"name": "steps32_c128_mtpost_t1p0_tpow1p0_noise0_state_catstep_top16_categorical_sample", "step": 118000, "n_samples": 8, "steps": 32, "concentration_max": 128.0, "temp_start": 1.0, "temp_end": 1.0, "temp_schedule": "const", "t_power": 1.0, "eta0": 0.0, "eta_schedule": "none", "noise_conc": 1.0, "final_from": "state", "final_decode": "argmax", "final_temp": 1.0, "final_top_k": 16, "final_uncertain_threshold": 0.85, "update_rule": "categorical_sample", "model_t_mode": "post", "lock_bos": true, "lock_final_eos": false, "detok_genppl": NaN, "sample_entropy": 1.3816844647383557, "distinct_1": 0.0838623046875, "distinct_2": 0.16678885630498533, "top_token_mass": 0.3951416015625, "tokens_scored": 0, "readability_score": 3.33190362211042, "mean_chars": 1984.875, "replacement_chars": 0.0}
42
+ [decode] steps32_c128_mtpre_t1p0_tpow1p0_noise0_state_categorical_argmax
43
+ [summary] {"name": "steps32_c128_mtpre_t1p0_tpow1p0_noise0_state_categorical_argmax", "step": 118000, "n_samples": 8, "steps": 32, "concentration_max": 128.0, "temp_start": 1.0, "temp_end": 1.0, "temp_schedule": "const", "t_power": 1.0, "eta0": 0.0, "eta_schedule": "none", "noise_conc": 1.0, "final_from": "state", "final_decode": "argmax", "final_temp": 1.0, "final_top_k": 64, "final_uncertain_threshold": 0.85, "update_rule": "categorical_argmax", "model_t_mode": "pre", "lock_bos": true, "lock_final_eos": false, "detok_genppl": NaN, "sample_entropy": 1.7124667587101636, "distinct_1": 0.2037353515625, "distinct_2": 0.36779081133919844, "top_token_mass": 0.3330078125, "tokens_scored": 0, "readability_score": 2.0797317483616076, "mean_chars": 2024.625, "replacement_chars": 0.0}
44
+ [decode] steps32_c128_mtpost_t1p0_tpow1p0_noise0_state_categorical_argmax
LTA_openwebtext_dualt/logs/softendpoint_mn_pilot_4gpu/pilot_softendpoint_mn_n128_m8_16_32_onehot_20260516_softendpoint_mn_pilot.log ADDED
@@ -0,0 +1,286 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ NCCL version 2.25.1+cuda12.8
2
+ {
3
+ "device": "cuda:0",
4
+ "rank": 0,
5
+ "world_size": 4,
6
+ "samples": "owt_cached_chunks:8942",
7
+ "vocab_size": 50257,
8
+ "tokenizer_vocab_size": 50257,
9
+ "save_dir": "runs/pilot_softendpoint_mn_n128_m8_16_32_onehot_20260516_softendpoint_mn_pilot",
10
+ "batch_size": 32,
11
+ "grad_accum": 4,
12
+ "effective_batch_size": 512,
13
+ "global_batch_size": 512,
14
+ "lr_schedule": "constant_warmup",
15
+ "optimizer": "muon",
16
+ "epochs": 0.0,
17
+ "steps_per_epoch": 18,
18
+ "total_steps": 500,
19
+ "warmup_steps": 100,
20
+ "warmup_epochs": -1.0,
21
+ "min_lr": 0.0,
22
+ "weight_decay": 0.1,
23
+ "output_weight_decay": -1.0,
24
+ "adamw_param_groups": "nanogpt",
25
+ "adam_beta1": 0.9,
26
+ "adam_beta2": 0.95,
27
+ "adam_eps": 1e-08,
28
+ "muon_impl": "legacy",
29
+ "muon_momentum": 0.95,
30
+ "muon_ns_steps": 5,
31
+ "muon_update_scale": 1.0,
32
+ "muon_nesterov": false,
33
+ "muon_width_scale": false,
34
+ "muon_grouping": "legacy_dim_ge_2",
35
+ "muon_param_count": 169453056,
36
+ "muon_adam_param_count": 122368,
37
+ "muon_param_names": [
38
+ "vocab_embed.embedding",
39
+ "sigma_map.net.0.weight",
40
+ "sigma_map.net.2.weight",
41
+ "blocks.0.attn_qkv.weight",
42
+ "blocks.0.attn_out.weight",
43
+ "blocks.0.mlp.0.weight",
44
+ "blocks.0.mlp.2.weight",
45
+ "blocks.0.adaLN_modulation.weight",
46
+ "blocks.1.attn_qkv.weight",
47
+ "blocks.1.attn_out.weight",
48
+ "blocks.1.mlp.0.weight",
49
+ "blocks.1.mlp.2.weight",
50
+ "blocks.1.adaLN_modulation.weight",
51
+ "blocks.2.attn_qkv.weight",
52
+ "blocks.2.attn_out.weight",
53
+ "blocks.2.mlp.0.weight",
54
+ "blocks.2.mlp.2.weight",
55
+ "blocks.2.adaLN_modulation.weight",
56
+ "blocks.3.attn_qkv.weight",
57
+ "blocks.3.attn_out.weight",
58
+ "blocks.3.mlp.0.weight",
59
+ "blocks.3.mlp.2.weight",
60
+ "blocks.3.adaLN_modulation.weight",
61
+ "blocks.4.attn_qkv.weight",
62
+ "blocks.4.attn_out.weight",
63
+ "blocks.4.mlp.0.weight",
64
+ "blocks.4.mlp.2.weight",
65
+ "blocks.4.adaLN_modulation.weight",
66
+ "blocks.5.attn_qkv.weight",
67
+ "blocks.5.attn_out.weight",
68
+ "blocks.5.mlp.0.weight",
69
+ "blocks.5.mlp.2.weight",
70
+ "blocks.5.adaLN_modulation.weight",
71
+ "blocks.6.attn_qkv.weight",
72
+ "blocks.6.attn_out.weight",
73
+ "blocks.6.mlp.0.weight",
74
+ "blocks.6.mlp.2.weight",
75
+ "blocks.6.adaLN_modulation.weight",
76
+ "blocks.7.attn_qkv.weight",
77
+ "blocks.7.attn_out.weight",
78
+ "blocks.7.mlp.0.weight",
79
+ "blocks.7.mlp.2.weight",
80
+ "blocks.7.adaLN_modulation.weight",
81
+ "blocks.8.attn_qkv.weight",
82
+ "blocks.8.attn_out.weight",
83
+ "blocks.8.mlp.0.weight",
84
+ "blocks.8.mlp.2.weight",
85
+ "blocks.8.adaLN_modulation.weight",
86
+ "blocks.9.attn_qkv.weight",
87
+ "blocks.9.attn_out.weight",
88
+ "blocks.9.mlp.0.weight",
89
+ "blocks.9.mlp.2.weight",
90
+ "blocks.9.adaLN_modulation.weight",
91
+ "blocks.10.attn_qkv.weight",
92
+ "blocks.10.attn_out.weight",
93
+ "blocks.10.mlp.0.weight",
94
+ "blocks.10.mlp.2.weight",
95
+ "blocks.10.adaLN_modulation.weight",
96
+ "blocks.11.attn_qkv.weight",
97
+ "blocks.11.attn_out.weight",
98
+ "blocks.11.mlp.0.weight",
99
+ "blocks.11.mlp.2.weight",
100
+ "blocks.11.adaLN_modulation.weight",
101
+ "output_layer.linear.weight",
102
+ "output_layer.adaLN_modulation.weight"
103
+ ],
104
+ "muon_adam_param_names": [
105
+ "sigma_map.net.0.bias",
106
+ "sigma_map.net.2.bias",
107
+ "blocks.0.norm1.weight",
108
+ "blocks.0.norm2.weight",
109
+ "blocks.0.mlp.0.bias",
110
+ "blocks.0.mlp.2.bias",
111
+ "blocks.0.adaLN_modulation.bias",
112
+ "blocks.1.norm1.weight",
113
+ "blocks.1.norm2.weight",
114
+ "blocks.1.mlp.0.bias",
115
+ "blocks.1.mlp.2.bias",
116
+ "blocks.1.adaLN_modulation.bias",
117
+ "blocks.2.norm1.weight",
118
+ "blocks.2.norm2.weight",
119
+ "blocks.2.mlp.0.bias",
120
+ "blocks.2.mlp.2.bias",
121
+ "blocks.2.adaLN_modulation.bias",
122
+ "blocks.3.norm1.weight",
123
+ "blocks.3.norm2.weight",
124
+ "blocks.3.mlp.0.bias",
125
+ "blocks.3.mlp.2.bias",
126
+ "blocks.3.adaLN_modulation.bias",
127
+ "blocks.4.norm1.weight",
128
+ "blocks.4.norm2.weight",
129
+ "blocks.4.mlp.0.bias",
130
+ "blocks.4.mlp.2.bias",
131
+ "blocks.4.adaLN_modulation.bias",
132
+ "blocks.5.norm1.weight",
133
+ "blocks.5.norm2.weight",
134
+ "blocks.5.mlp.0.bias",
135
+ "blocks.5.mlp.2.bias",
136
+ "blocks.5.adaLN_modulation.bias",
137
+ "blocks.6.norm1.weight",
138
+ "blocks.6.norm2.weight",
139
+ "blocks.6.mlp.0.bias",
140
+ "blocks.6.mlp.2.bias",
141
+ "blocks.6.adaLN_modulation.bias",
142
+ "blocks.7.norm1.weight",
143
+ "blocks.7.norm2.weight",
144
+ "blocks.7.mlp.0.bias",
145
+ "blocks.7.mlp.2.bias",
146
+ "blocks.7.adaLN_modulation.bias",
147
+ "blocks.8.norm1.weight",
148
+ "blocks.8.norm2.weight",
149
+ "blocks.8.mlp.0.bias",
150
+ "blocks.8.mlp.2.bias",
151
+ "blocks.8.adaLN_modulation.bias",
152
+ "blocks.9.norm1.weight",
153
+ "blocks.9.norm2.weight",
154
+ "blocks.9.mlp.0.bias",
155
+ "blocks.9.mlp.2.bias",
156
+ "blocks.9.adaLN_modulation.bias",
157
+ "blocks.10.norm1.weight",
158
+ "blocks.10.norm2.weight",
159
+ "blocks.10.mlp.0.bias",
160
+ "blocks.10.mlp.2.bias",
161
+ "blocks.10.adaLN_modulation.bias",
162
+ "blocks.11.norm1.weight",
163
+ "blocks.11.norm2.weight",
164
+ "blocks.11.mlp.0.bias",
165
+ "blocks.11.mlp.2.bias",
166
+ "blocks.11.adaLN_modulation.bias",
167
+ "output_layer.norm_final.weight",
168
+ "output_layer.adaLN_modulation.bias"
169
+ ],
170
+ "muon_effective_nesterov": false,
171
+ "muon_effective_width_scale": false,
172
+ "muon_effective_weight_decay": 0.1,
173
+ "muon_adam_fallback_nesterov": false,
174
+ "muon_adam_fallback_weight_decay": 0.1,
175
+ "ema_decay": 0.9999,
176
+ "ema_start_step": 0,
177
+ "model_type": "ddit",
178
+ "elf_num_time_tokens": 4,
179
+ "elf_num_model_mode_tokens": 0,
180
+ "qk_norm": true,
181
+ "output_bias": false,
182
+ "output_init_std": -1.0,
183
+ "norm_type": "rmsnorm",
184
+ "target_loss": "linear_soft_kl",
185
+ "linear_soft_target_power": 1.0,
186
+ "linear_soft_target_min_conf": 0.0,
187
+ "linear_soft_target_max_conf": 1.0,
188
+ "t_sampling_mode": "logit_normal",
189
+ "t_sampling_power": 1.0,
190
+ "t_sampling_eps": 0.0001,
191
+ "t_sampling_logit_mean": -1.5,
192
+ "t_sampling_logit_std": 0.8,
193
+ "dual_t": true,
194
+ "corrupt_t_mode": "same",
195
+ "corrupt_min_t": 0.0,
196
+ "corrupt_max_t": 1.0,
197
+ "prefix_block_prob": 0.0,
198
+ "prefix_block_len": 128,
199
+ "mask_ratio_floor_schedule": "none",
200
+ "dirichlet_endpoint_mode": "categorical_dual_t",
201
+ "dirichlet_semantic_t_mode": "same",
202
+ "dirichlet_semantic_t_value": 0.0,
203
+ "dirichlet_semantic_t_curve": "linear",
204
+ "dirichlet_semantic_t_power": 1.0,
205
+ "endpoint_sequence_random_prob_alpha": 0.0,
206
+ "categorical_wrong_from_full_vocab": true,
207
+ "categorical_wrong_from_batch_valid_tokens": false,
208
+ "categorical_wrong_basin_token_ids": "",
209
+ "categorical_wrong_basin_prob": 0.0,
210
+ "categorical_wrong_unigram_prob": 0.0,
211
+ "categorical_wrong_uniform_prob": 0.0,
212
+ "categorical_wrong_corpus_unigram_path": "",
213
+ "categorical_wrong_corpus_unigram_alpha": 1.0,
214
+ "categorical_wrong_basin_shared_prob": 0.0,
215
+ "categorical_wrong_unigram_shared_prob": 0.0,
216
+ "mask_mixture_original_prob": 0.0,
217
+ "mask_mixture_lowk_prob": 1.0,
218
+ "mask_mixture_lowcorrupt_prob": 0.0,
219
+ "mask_mixture_block_prob": 0.0,
220
+ "mask_mixture_all_prob": 0.0,
221
+ "mask_mixture_lowk_clean_tokens": "8,16,32",
222
+ "mask_mixture_lowcorrupt_tokens": "1,2,4,8,16,32,64",
223
+ "mask_mixture_block_tokens": "64,128",
224
+ "simplex_bridge_sampler": "dirichlet",
225
+ "logistic_normal_sigma_min": 0.18,
226
+ "logistic_normal_sigma_max": 2.2,
227
+ "logistic_normal_tau_min": 0.65,
228
+ "logistic_normal_tau_max": 1.15,
229
+ "torch_compile": false,
230
+ "compile_mode": "max-autotune",
231
+ "state_format": "prob",
232
+ "meanflow_weight": 0.0,
233
+ "rollout_train_prob": 0.0,
234
+ "rollout_train_steps": 1,
235
+ "rollout_train_infer_steps": 64,
236
+ "rollout_train_temp": 1.45,
237
+ "rollout_train_max_gamma": 1.0,
238
+ "rollout_train_corrupt_only": true,
239
+ "rollout_train_samplewise": false,
240
+ "rollout_train_compute_always": false,
241
+ "bridge_noise_init": "logistic_normal",
242
+ "noise_sigma": -1.0,
243
+ "allow_tf32": true,
244
+ "activation_checkpointing": false,
245
+ "activation_checkpoint_interval": 1,
246
+ "activation_checkpoint_scope": "block",
247
+ "ddp_static_graph": false,
248
+ "ddp_gradient_as_bucket_view": true,
249
+ "blocking_data_transfer": false,
250
+ "dataloader_prefetch_factor": 4,
251
+ "full_train_stats": false,
252
+ "tokenized_hf": false,
253
+ "tokenized_pad_token": "pad",
254
+ "elf_conditional_hf": false,
255
+ "record_pad_truncate": false,
256
+ "record_add_eos": false,
257
+ "record_add_special_tokens": false,
258
+ "record_pad_token": "pad",
259
+ "record_shuffle_buffer": 10000,
260
+ "wrap": true,
261
+ "wrap_mode": "stream",
262
+ "wrap_record_buffer_size": 200,
263
+ "owt_cached_chunks": true,
264
+ "owt_chunk_cache_dir": "/e2e-data/evad-tech-vla/wanghan58/data/small_benchmarks/langflow_2604_11748/openwebtext_lta_cached_chunks/gpt2_len128_train_minus_100k_fast1000",
265
+ "owt_chunk_cache_rebuild": false,
266
+ "owt_chunk_cache_write_batch": 4096,
267
+ "owt_exact_repeat_per_chunk": 0,
268
+ "online_chunk_shuffle": false,
269
+ "online_chunk_shuffle_buffer": 10000,
270
+ "openwebtext_split": "train_minus_100k",
271
+ "detokenizer": "auto",
272
+ "resolved_detokenizer": null,
273
+ "num_workers": 4,
274
+ "latest_every": 250,
275
+ "resume_path": ""
276
+ }
277
+ step=50 epoch=3/28 epoch_step=14/18 micro_steps=200 elapsed=23.0s lr=1.020000e-03 loss=1.7917 loss_recon=1.7917 loss_meanflow=0.0000 mean_model_t=0.2093 mean_corrupt_t=0.2093 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.2093 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.2120 corrupt_frac=0.8538 acc_corrupt=0.1234 loss_corrupt=1.8349 wrong_frac=0.7911 init_acc_corrupt=0.1163 acc_corrupt_t_0p0_0p2=0.0581 corrupt_frac_t_0p0_0p2=0.5584 acc_corrupt_t_0p2_0p4=0.1713 corrupt_frac_t_0p2_0p4=0.3540 acc_corrupt_t_0p4_0p6=0.3341 corrupt_frac_t_0p4_0p6=0.0868 acc_corrupt_t_0p6_0p8=0.4550 corrupt_frac_t_0p6_0p8=0.0373 out_w_norm=0.1776 out_g_norm=0.1567 acc_corrupt_t_0p8_1p0=0.6161 corrupt_frac_t_0p8_1p0=0.0328 loss_all=10.7031 init_gold_top10=0.1862 init_gold_top100=0.2651
278
+ step=100 epoch=6/28 epoch_step=10/18 micro_steps=400 elapsed=23.1s lr=2.000000e-03 loss=1.7514 loss_recon=1.7514 loss_meanflow=0.0000 mean_model_t=0.2104 mean_corrupt_t=0.2104 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.2104 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.1905 corrupt_frac=0.8546 acc_corrupt=0.1165 loss_corrupt=1.7954 wrong_frac=0.7896 init_acc_corrupt=0.1178 acc_corrupt_t_0p0_0p2=0.0612 corrupt_frac_t_0p0_0p2=0.5503 acc_corrupt_t_0p2_0p4=0.1555 corrupt_frac_t_0p2_0p4=0.3588 acc_corrupt_t_0p4_0p6=0.2880 corrupt_frac_t_0p4_0p6=0.0906 out_w_norm=0.9471 out_g_norm=0.2692 acc_corrupt_t_0p6_0p8=0.4150 corrupt_frac_t_0p6_0p8=0.0369 loss_all=10.2155 init_gold_top10=0.1995 init_gold_top100=0.2771
279
+ step=150 epoch=9/28 epoch_step=6/18 micro_steps=600 elapsed=22.6s lr=2.000000e-03 loss=1.5999 loss_recon=1.5999 loss_meanflow=0.0000 mean_model_t=0.2087 mean_corrupt_t=0.2087 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.2087 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.1839 corrupt_frac=0.8538 acc_corrupt=0.1177 loss_corrupt=1.6417 wrong_frac=0.7919 init_acc_corrupt=0.1155 acc_corrupt_t_0p0_0p2=0.0637 corrupt_frac_t_0p0_0p2=0.5617 acc_corrupt_t_0p2_0p4=0.1614 corrupt_frac_t_0p2_0p4=0.3511 acc_corrupt_t_0p4_0p6=0.2794 corrupt_frac_t_0p4_0p6=0.0844 out_w_norm=2.1097 out_g_norm=0.3814 acc_corrupt_t_0p6_0p8=0.3666 corrupt_frac_t_0p6_0p8=0.0370 acc_corrupt_t_0p8_1p0=0.5583 corrupt_frac_t_0p8_1p0=0.0345 loss_all=9.3504 init_gold_top10=0.1673 init_gold_top100=0.2515
280
+ step=200 epoch=12/28 epoch_step=2/18 micro_steps=800 elapsed=22.0s lr=2.000000e-03 loss=1.4210 loss_recon=1.4210 loss_meanflow=0.0000 mean_model_t=0.2072 mean_corrupt_t=0.2072 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.2072 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.1925 corrupt_frac=0.8542 acc_corrupt=0.1254 loss_corrupt=1.4600 wrong_frac=0.7919 init_acc_corrupt=0.1142 acc_corrupt_t_0p0_0p2=0.0679 corrupt_frac_t_0p0_0p2=0.5621 acc_corrupt_t_0p2_0p4=0.1727 corrupt_frac_t_0p2_0p4=0.3544 acc_corrupt_t_0p4_0p6=0.3009 corrupt_frac_t_0p4_0p6=0.0847 out_w_norm=3.2409 out_g_norm=0.4034 acc_corrupt_t_0p6_0p8=0.3879 corrupt_frac_t_0p6_0p8=0.0370 acc_corrupt_t_0p8_1p0=0.4833 corrupt_frac_t_0p8_1p0=0.0339 loss_all=8.4330 init_gold_top10=0.1778 init_gold_top100=0.2627
281
+ step=250 epoch=14/28 epoch_step=16/18 micro_steps=1000 elapsed=21.9s lr=2.000000e-03 loss=1.3332 loss_recon=1.3332 loss_meanflow=0.0000 mean_model_t=0.2082 mean_corrupt_t=0.2082 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.2082 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.1970 corrupt_frac=0.8524 acc_corrupt=0.1295 loss_corrupt=1.3671 wrong_frac=0.7918 init_acc_corrupt=0.1144 acc_corrupt_t_0p0_0p2=0.0696 corrupt_frac_t_0p0_0p2=0.5532 acc_corrupt_t_0p2_0p4=0.1783 corrupt_frac_t_0p2_0p4=0.3651 acc_corrupt_t_0p4_0p6=0.3076 corrupt_frac_t_0p4_0p6=0.0790 out_w_norm=3.9107 out_g_norm=0.3331 acc_corrupt_t_0p6_0p8=0.4125 corrupt_frac_t_0p6_0p8=0.0348 loss_all=7.8325 init_gold_top10=0.2347 init_gold_top100=0.2905
282
+ step=300 epoch=17/28 epoch_step=12/18 micro_steps=1200 elapsed=24.1s lr=2.000000e-03 loss=1.2538 loss_recon=1.2538 loss_meanflow=0.0000 mean_model_t=0.2086 mean_corrupt_t=0.2086 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.2086 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.2131 corrupt_frac=0.8535 acc_corrupt=0.1470 loss_corrupt=1.2905 wrong_frac=0.7917 init_acc_corrupt=0.1162 acc_corrupt_t_0p0_0p2=0.0852 corrupt_frac_t_0p0_0p2=0.5643 acc_corrupt_t_0p2_0p4=0.1973 corrupt_frac_t_0p2_0p4=0.3468 acc_corrupt_t_0p4_0p6=0.3338 corrupt_frac_t_0p4_0p6=0.0857 out_w_norm=4.4521 out_g_norm=0.2184 acc_corrupt_t_0p6_0p8=0.4296 corrupt_frac_t_0p6_0p8=0.0347 loss_all=7.9047 init_gold_top10=0.1832 init_gold_top100=0.2668
283
+ step=350 epoch=20/28 epoch_step=8/18 micro_steps=1400 elapsed=21.9s lr=2.000000e-03 loss=1.1992 loss_recon=1.1992 loss_meanflow=0.0000 mean_model_t=0.2096 mean_corrupt_t=0.2096 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.2096 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.2328 corrupt_frac=0.8532 acc_corrupt=0.1650 loss_corrupt=1.2288 wrong_frac=0.7900 init_acc_corrupt=0.1174 acc_corrupt_t_0p0_0p2=0.0993 corrupt_frac_t_0p0_0p2=0.5586 acc_corrupt_t_0p2_0p4=0.2181 corrupt_frac_t_0p2_0p4=0.3502 acc_corrupt_t_0p4_0p6=0.3540 corrupt_frac_t_0p4_0p6=0.0890 out_w_norm=5.1527 out_g_norm=0.1618 acc_corrupt_t_0p6_0p8=0.4492 corrupt_frac_t_0p6_0p8=0.0340 acc_corrupt_t_0p8_1p0=0.5833 corrupt_frac_t_0p8_1p0=0.0276 loss_all=7.4018 init_gold_top10=0.1712 init_gold_top100=0.2596
284
+ step=400 epoch=23/28 epoch_step=4/18 micro_steps=1600 elapsed=21.9s lr=2.000000e-03 loss=1.1396 loss_recon=1.1396 loss_meanflow=0.0000 mean_model_t=0.2076 mean_corrupt_t=0.2076 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.2076 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.2469 corrupt_frac=0.8556 acc_corrupt=0.1781 loss_corrupt=1.1718 wrong_frac=0.7928 init_acc_corrupt=0.1136 acc_corrupt_t_0p0_0p2=0.1103 corrupt_frac_t_0p0_0p2=0.5638 acc_corrupt_t_0p2_0p4=0.2365 corrupt_frac_t_0p2_0p4=0.3519 acc_corrupt_t_0p4_0p6=0.3769 corrupt_frac_t_0p4_0p6=0.0817 out_w_norm=5.9267 out_g_norm=0.1555 acc_corrupt_t_0p6_0p8=0.4965 corrupt_frac_t_0p6_0p8=0.0335 acc_corrupt_t_0p8_1p0=0.6071 corrupt_frac_t_0p8_1p0=0.0317 loss_all=7.1084 init_gold_top10=0.1902 init_gold_top100=0.2700
285
+ step=450 epoch=25/28 epoch_step=18/18 micro_steps=1800 elapsed=22.0s lr=2.000000e-03 loss=1.1084 loss_recon=1.1084 loss_meanflow=0.0000 mean_model_t=0.2094 mean_corrupt_t=0.2094 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.2094 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.2634 corrupt_frac=0.8545 acc_corrupt=0.1916 loss_corrupt=1.1423 wrong_frac=0.7907 init_acc_corrupt=0.1168 acc_corrupt_t_0p0_0p2=0.1172 corrupt_frac_t_0p0_0p2=0.5565 acc_corrupt_t_0p2_0p4=0.2523 corrupt_frac_t_0p2_0p4=0.3519 acc_corrupt_t_0p4_0p6=0.4017 corrupt_frac_t_0p4_0p6=0.0885 out_w_norm=6.6910 out_g_norm=0.1516 acc_corrupt_t_0p6_0p8=0.5167 corrupt_frac_t_0p6_0p8=0.0365 loss_all=7.1996 init_gold_top10=0.1661 init_gold_top100=0.2604
286
+ step=500 epoch=28/28 epoch_step=14/18 micro_steps=2000 elapsed=22.0s lr=2.000000e-03 loss=1.0744 loss_recon=1.0744 loss_meanflow=0.0000 mean_model_t=0.2095 mean_corrupt_t=0.2095 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.2095 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.2768 corrupt_frac=0.8538 acc_corrupt=0.2030 loss_corrupt=1.1074 wrong_frac=0.7903 init_acc_corrupt=0.1164 acc_corrupt_t_0p0_0p2=0.1256 corrupt_frac_t_0p0_0p2=0.5560 acc_corrupt_t_0p2_0p4=0.2668 corrupt_frac_t_0p2_0p4=0.3545 acc_corrupt_t_0p4_0p6=0.4203 corrupt_frac_t_0p4_0p6=0.0856 out_w_norm=7.4904 out_g_norm=0.1582 acc_corrupt_t_0p6_0p8=0.5357 corrupt_frac_t_0p6_0p8=0.0339 acc_corrupt_t_0p8_1p0=0.6667 corrupt_frac_t_0p8_1p0=0.0275 loss_all=7.0227 init_gold_top10=0.1576 init_gold_top100=0.2480
LTA_openwebtext_dualt/logs/softendpoint_mn_pilot_4gpu/train8_combo_len256_logistic_unigram_shared_C1024_20260517_170456.log ADDED
@@ -0,0 +1,423 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ NCCL version 2.25.1+cuda12.8
2
+ {
3
+ "device": "cuda:0",
4
+ "rank": 0,
5
+ "world_size": 4,
6
+ "samples": "owt_cached_chunks:8",
7
+ "vocab_size": 969,
8
+ "tokenizer_vocab_size": 50257,
9
+ "save_dir": "runs/train8_combo_len256_logistic_unigram_shared_C1024_20260517_170456",
10
+ "batch_size": 128,
11
+ "grad_accum": 1,
12
+ "effective_batch_size": 512,
13
+ "global_batch_size": 512,
14
+ "lr_schedule": "constant_warmup",
15
+ "optimizer": "muon",
16
+ "epochs": 0.0,
17
+ "steps_per_epoch": 1,
18
+ "total_steps": 1000,
19
+ "warmup_steps": 10,
20
+ "warmup_epochs": -1.0,
21
+ "min_lr": 0.0,
22
+ "weight_decay": 0.1,
23
+ "output_weight_decay": -1.0,
24
+ "adamw_param_groups": "nanogpt",
25
+ "adam_beta1": 0.9,
26
+ "adam_beta2": 0.95,
27
+ "adam_eps": 1e-08,
28
+ "muon_impl": "legacy",
29
+ "muon_momentum": 0.95,
30
+ "muon_ns_steps": 5,
31
+ "muon_update_scale": 1.0,
32
+ "muon_nesterov": false,
33
+ "muon_width_scale": false,
34
+ "muon_grouping": "legacy_dim_ge_2",
35
+ "muon_param_count": 1965440,
36
+ "muon_adam_param_count": 8192,
37
+ "muon_param_names": [
38
+ "vocab_embed.embedding",
39
+ "sigma_map.net.0.weight",
40
+ "sigma_map.net.2.weight",
41
+ "blocks.0.attn_qkv.weight",
42
+ "blocks.0.attn_out.weight",
43
+ "blocks.0.mlp.0.weight",
44
+ "blocks.0.mlp.2.weight",
45
+ "blocks.0.adaLN_modulation.weight",
46
+ "blocks.1.attn_qkv.weight",
47
+ "blocks.1.attn_out.weight",
48
+ "blocks.1.mlp.0.weight",
49
+ "blocks.1.mlp.2.weight",
50
+ "blocks.1.adaLN_modulation.weight",
51
+ "blocks.2.attn_qkv.weight",
52
+ "blocks.2.attn_out.weight",
53
+ "blocks.2.mlp.0.weight",
54
+ "blocks.2.mlp.2.weight",
55
+ "blocks.2.adaLN_modulation.weight",
56
+ "output_layer.linear.weight",
57
+ "output_layer.adaLN_modulation.weight"
58
+ ],
59
+ "muon_adam_param_names": [
60
+ "sigma_map.net.0.bias",
61
+ "sigma_map.net.2.bias",
62
+ "blocks.0.norm1.weight",
63
+ "blocks.0.norm2.weight",
64
+ "blocks.0.mlp.0.bias",
65
+ "blocks.0.mlp.2.bias",
66
+ "blocks.0.adaLN_modulation.bias",
67
+ "blocks.1.norm1.weight",
68
+ "blocks.1.norm2.weight",
69
+ "blocks.1.mlp.0.bias",
70
+ "blocks.1.mlp.2.bias",
71
+ "blocks.1.adaLN_modulation.bias",
72
+ "blocks.2.norm1.weight",
73
+ "blocks.2.norm2.weight",
74
+ "blocks.2.mlp.0.bias",
75
+ "blocks.2.mlp.2.bias",
76
+ "blocks.2.adaLN_modulation.bias",
77
+ "output_layer.norm_final.weight",
78
+ "output_layer.adaLN_modulation.bias"
79
+ ],
80
+ "muon_effective_nesterov": false,
81
+ "muon_effective_width_scale": false,
82
+ "muon_effective_weight_decay": 0.1,
83
+ "muon_adam_fallback_nesterov": false,
84
+ "muon_adam_fallback_weight_decay": 0.1,
85
+ "ema_decay": 0.9999,
86
+ "ema_start_step": 0,
87
+ "model_type": "ddit",
88
+ "ddit_mlp_type": "gelu",
89
+ "elf_num_time_tokens": 4,
90
+ "elf_num_model_mode_tokens": 0,
91
+ "qk_norm": true,
92
+ "output_bias": false,
93
+ "output_init_std": -1.0,
94
+ "norm_type": "rmsnorm",
95
+ "target_loss": "hard_ce",
96
+ "linear_soft_target_power": 1.0,
97
+ "linear_soft_target_min_conf": 0.0,
98
+ "linear_soft_target_max_conf": 1.0,
99
+ "t_sampling_mode": "logit_normal",
100
+ "t_sampling_power": 1.0,
101
+ "t_sampling_eps": 0.0001,
102
+ "t_sampling_logit_mean": -1.5,
103
+ "t_sampling_logit_std": 0.8,
104
+ "dual_t": true,
105
+ "corrupt_t_mode": "same",
106
+ "corrupt_min_t": 0.0,
107
+ "corrupt_max_t": 1.0,
108
+ "prefix_block_prob": 0.0,
109
+ "prefix_block_len": 128,
110
+ "mask_ratio_floor_schedule": "none",
111
+ "dirichlet_endpoint_mode": "categorical_dual_t",
112
+ "dirichlet_semantic_t_mode": "same",
113
+ "dirichlet_semantic_t_value": 0.0,
114
+ "dirichlet_semantic_t_curve": "linear",
115
+ "dirichlet_semantic_t_power": 1.0,
116
+ "endpoint_sequence_random_prob_alpha": 0.0,
117
+ "categorical_wrong_from_full_vocab": true,
118
+ "categorical_wrong_from_batch_valid_tokens": false,
119
+ "categorical_wrong_basin_token_ids": "",
120
+ "categorical_wrong_basin_prob": 0.0,
121
+ "categorical_wrong_unigram_prob": 1.0,
122
+ "categorical_wrong_uniform_prob": 0.0,
123
+ "categorical_wrong_corpus_unigram_path": "",
124
+ "categorical_wrong_corpus_unigram_alpha": 1.0,
125
+ "categorical_wrong_basin_shared_prob": 0.0,
126
+ "categorical_wrong_unigram_shared_prob": 0.5,
127
+ "mask_mixture_original_prob": 0.0,
128
+ "mask_mixture_lowk_prob": 0.0,
129
+ "mask_mixture_lowcorrupt_prob": 0.0,
130
+ "mask_mixture_block_prob": 0.0,
131
+ "mask_mixture_all_prob": 1.0,
132
+ "mask_mixture_lowk_clean_tokens": "0",
133
+ "mask_mixture_lowcorrupt_tokens": "1,2,4,8,16,32,64",
134
+ "mask_mixture_block_tokens": "64,128",
135
+ "simplex_bridge_sampler": "logistic_normal_linear_mean",
136
+ "logistic_normal_sigma_min": 0.03,
137
+ "logistic_normal_sigma_max": 0.4,
138
+ "logistic_normal_tau_min": 1.0,
139
+ "logistic_normal_tau_max": 1.0,
140
+ "torch_compile": false,
141
+ "compile_mode": "max-autotune",
142
+ "state_format": "prob",
143
+ "meanflow_weight": 0.0,
144
+ "rollout_train_prob": 0.0,
145
+ "rollout_train_steps": 1,
146
+ "rollout_train_infer_steps": 64,
147
+ "rollout_train_temp": 1.45,
148
+ "rollout_train_max_gamma": 1.0,
149
+ "rollout_train_corrupt_only": true,
150
+ "rollout_train_samplewise": false,
151
+ "rollout_train_compute_always": false,
152
+ "bridge_noise_init": "logistic_normal",
153
+ "noise_sigma": -1.0,
154
+ "allow_tf32": true,
155
+ "activation_checkpointing": false,
156
+ "activation_checkpoint_interval": 1,
157
+ "activation_checkpoint_scope": "block",
158
+ "ddp_static_graph": false,
159
+ "ddp_gradient_as_bucket_view": true,
160
+ "blocking_data_transfer": false,
161
+ "dataloader_prefetch_factor": 4,
162
+ "full_train_stats": false,
163
+ "tokenized_hf": false,
164
+ "tokenized_pad_token": "pad",
165
+ "elf_conditional_hf": false,
166
+ "record_pad_truncate": false,
167
+ "record_add_eos": false,
168
+ "record_add_special_tokens": false,
169
+ "record_pad_token": "pad",
170
+ "record_shuffle_buffer": 10000,
171
+ "wrap": true,
172
+ "wrap_mode": "stream",
173
+ "wrap_record_buffer_size": 200,
174
+ "owt_cached_chunks": true,
175
+ "owt_chunk_cache_dir": "/e2e-data/evad-tech-vla/wanghan58/data/small_benchmarks/langflow_2604_11748/openwebtext_lta_cached_chunks/gpt2_len256_train8_compact_overfit",
176
+ "owt_chunk_cache_rebuild": false,
177
+ "owt_chunk_cache_write_batch": 4096,
178
+ "owt_exact_repeat_per_chunk": 64,
179
+ "online_chunk_shuffle": false,
180
+ "online_chunk_shuffle_buffer": 10000,
181
+ "openwebtext_split": "train_minus_100k",
182
+ "detokenizer": "auto",
183
+ "resolved_detokenizer": null,
184
+ "num_workers": 0,
185
+ "latest_every": 1000,
186
+ "resume_path": ""
187
+ }
188
+ step=100 epoch=100/1000 epoch_step=1/1 micro_steps=100 elapsed=4.7s lr=2.000000e-03 loss=6.6554 loss_recon=6.6554 loss_meanflow=0.0000 mean_model_t=0.2098 mean_corrupt_t=0.2098 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.1884 corrupt_frac=1.0000 acc_corrupt=0.1884 loss_corrupt=6.6554 wrong_frac=0.7900 init_acc_corrupt=0.2100 acc_corrupt_t_0p0_0p2=0.0937 corrupt_frac_t_0p0_0p2=0.5564 acc_corrupt_t_0p2_0p4=0.2650 corrupt_frac_t_0p2_0p4=0.3581 acc_corrupt_t_0p4_0p6=0.4636 corrupt_frac_t_0p4_0p6=0.0759 acc_corrupt_t_0p6_0p8=0.6394 corrupt_frac_t_0p6_0p8=0.0137 out_w_norm=1.2294 out_g_norm=0.9309 acc_corrupt_t_0p8_1p0=0.8184 corrupt_frac_t_0p8_1p0=0.0078 loss_all=6.3647 init_gold_top10=0.2083 init_gold_top100=0.2807
189
+ step=200 epoch=200/1000 epoch_step=1/1 micro_steps=200 elapsed=4.0s lr=2.000000e-03 loss=5.9356 loss_recon=5.9356 loss_meanflow=0.0000 mean_model_t=0.2086 mean_corrupt_t=0.2086 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.1916 corrupt_frac=1.0000 acc_corrupt=0.1916 loss_corrupt=5.9356 wrong_frac=0.7913 init_acc_corrupt=0.2087 acc_corrupt_t_0p0_0p2=0.1036 corrupt_frac_t_0p0_0p2=0.5588 acc_corrupt_t_0p2_0p4=0.2602 corrupt_frac_t_0p2_0p4=0.3575 acc_corrupt_t_0p4_0p6=0.4665 corrupt_frac_t_0p4_0p6=0.0753 acc_corrupt_t_0p6_0p8=0.6580 corrupt_frac_t_0p6_0p8=0.0130 out_w_norm=4.0673 out_g_norm=1.2994 acc_corrupt_t_0p8_1p0=0.8477 corrupt_frac_t_0p8_1p0=0.0078 loss_all=5.5045 init_gold_top10=0.2240 init_gold_top100=0.2987
190
+ step=300 epoch=300/1000 epoch_step=1/1 micro_steps=300 elapsed=4.0s lr=2.000000e-03 loss=5.2619 loss_recon=5.2619 loss_meanflow=0.0000 mean_model_t=0.2113 mean_corrupt_t=0.2113 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.2186 corrupt_frac=1.0000 acc_corrupt=0.2186 loss_corrupt=5.2619 wrong_frac=0.7886 init_acc_corrupt=0.2114 acc_corrupt_t_0p0_0p2=0.1285 corrupt_frac_t_0p0_0p2=0.5517 acc_corrupt_t_0p2_0p4=0.2875 corrupt_frac_t_0p2_0p4=0.3592 acc_corrupt_t_0p4_0p6=0.4807 corrupt_frac_t_0p4_0p6=0.0802 acc_corrupt_t_0p6_0p8=0.6575 corrupt_frac_t_0p6_0p8=0.0121 out_w_norm=6.7067 out_g_norm=0.5999 acc_corrupt_t_0p8_1p0=0.8438 corrupt_frac_t_0p8_1p0=0.0078 loss_all=5.1481 init_gold_top10=0.2068 init_gold_top100=0.2804
191
+ step=400 epoch=400/1000 epoch_step=1/1 micro_steps=400 elapsed=4.0s lr=2.000000e-03 loss=4.9796 loss_recon=4.9796 loss_meanflow=0.0000 mean_model_t=0.2095 mean_corrupt_t=0.2095 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.2246 corrupt_frac=1.0000 acc_corrupt=0.2246 loss_corrupt=4.9796 wrong_frac=0.7907 init_acc_corrupt=0.2093 acc_corrupt_t_0p0_0p2=0.1368 corrupt_frac_t_0p0_0p2=0.5551 acc_corrupt_t_0p2_0p4=0.2930 corrupt_frac_t_0p2_0p4=0.3560 acc_corrupt_t_0p4_0p6=0.4809 corrupt_frac_t_0p4_0p6=0.0795 acc_corrupt_t_0p6_0p8=0.6536 corrupt_frac_t_0p6_0p8=0.0129 out_w_norm=8.3842 out_g_norm=0.3521 acc_corrupt_t_0p8_1p0=0.7969 corrupt_frac_t_0p8_1p0=0.0078 loss_all=4.8480 init_gold_top10=0.2126 init_gold_top100=0.2850
192
+ step=500 epoch=500/1000 epoch_step=1/1 micro_steps=500 elapsed=4.0s lr=2.000000e-03 loss=4.4947 loss_recon=4.4947 loss_meanflow=0.0000 mean_model_t=0.2074 mean_corrupt_t=0.2074 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.2309 corrupt_frac=1.0000 acc_corrupt=0.2309 loss_corrupt=4.4947 wrong_frac=0.7926 init_acc_corrupt=0.2074 acc_corrupt_t_0p0_0p2=0.1447 corrupt_frac_t_0p0_0p2=0.5591 acc_corrupt_t_0p2_0p4=0.3020 corrupt_frac_t_0p2_0p4=0.3566 acc_corrupt_t_0p4_0p6=0.4839 corrupt_frac_t_0p4_0p6=0.0758 out_w_norm=9.5633 out_g_norm=0.4817 acc_corrupt_t_0p6_0p8=0.6634 corrupt_frac_t_0p6_0p8=0.0129 acc_corrupt_t_0p8_1p0=0.8516 corrupt_frac_t_0p8_1p0=0.0078 loss_all=4.1750 init_gold_top10=0.1929 init_gold_top100=0.2686
193
+ step=600 epoch=600/1000 epoch_step=1/1 micro_steps=600 elapsed=4.0s lr=2.000000e-03 loss=3.6893 loss_recon=3.6893 loss_meanflow=0.0000 mean_model_t=0.2080 mean_corrupt_t=0.2080 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.2459 corrupt_frac=1.0000 acc_corrupt=0.2459 loss_corrupt=3.6893 wrong_frac=0.7923 init_acc_corrupt=0.2077 acc_corrupt_t_0p0_0p2=0.1584 corrupt_frac_t_0p0_0p2=0.5596 acc_corrupt_t_0p2_0p4=0.3192 corrupt_frac_t_0p2_0p4=0.3571 acc_corrupt_t_0p4_0p6=0.5011 corrupt_frac_t_0p4_0p6=0.0742 acc_corrupt_t_0p6_0p8=0.6711 corrupt_frac_t_0p6_0p8=0.0128 out_w_norm=9.9203 out_g_norm=0.4404 acc_corrupt_t_0p8_1p0=0.7656 corrupt_frac_t_0p8_1p0=0.0078 loss_all=3.1350 init_gold_top10=0.2306 init_gold_top100=0.3021
194
+ step=700 epoch=700/1000 epoch_step=1/1 micro_steps=700 elapsed=4.1s lr=2.000000e-03 loss=2.6742 loss_recon=2.6742 loss_meanflow=0.0000 mean_model_t=0.2109 mean_corrupt_t=0.2109 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.3112 corrupt_frac=1.0000 acc_corrupt=0.3112 loss_corrupt=2.6742 wrong_frac=0.7892 init_acc_corrupt=0.2108 acc_corrupt_t_0p0_0p2=0.2056 corrupt_frac_t_0p0_0p2=0.5513 acc_corrupt_t_0p2_0p4=0.4036 corrupt_frac_t_0p2_0p4=0.3610 acc_corrupt_t_0p4_0p6=0.5805 corrupt_frac_t_0p4_0p6=0.0786 out_w_norm=10.1693 out_g_norm=0.4827 acc_corrupt_t_0p6_0p8=0.7189 corrupt_frac_t_0p6_0p8=0.0134 acc_corrupt_t_0p8_1p0=0.8789 corrupt_frac_t_0p8_1p0=0.0078 loss_all=2.1149 init_gold_top10=0.2158 init_gold_top100=0.2894
195
+ step=800 epoch=800/1000 epoch_step=1/1 micro_steps=800 elapsed=4.0s lr=2.000000e-03 loss=1.5900 loss_recon=1.5900 loss_meanflow=0.0000 mean_model_t=0.2097 mean_corrupt_t=0.2097 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.5102 corrupt_frac=1.0000 acc_corrupt=0.5102 loss_corrupt=1.5900 wrong_frac=0.7911 init_acc_corrupt=0.2089 acc_corrupt_t_0p0_0p2=0.3754 corrupt_frac_t_0p0_0p2=0.5545 acc_corrupt_t_0p2_0p4=0.6481 corrupt_frac_t_0p2_0p4=0.3613 acc_corrupt_t_0p4_0p6=0.7977 corrupt_frac_t_0p4_0p6=0.0755 acc_corrupt_t_0p6_0p8=0.8729 corrupt_frac_t_0p6_0p8=0.0125 out_w_norm=10.4756 out_g_norm=0.5089 acc_corrupt_t_0p8_1p0=0.9342 corrupt_frac_t_0p8_1p0=0.0094 loss_all=1.0926 init_gold_top10=0.2185 init_gold_top100=0.2932
196
+ step=900 epoch=900/1000 epoch_step=1/1 micro_steps=900 elapsed=4.0s lr=2.000000e-03 loss=0.7840 loss_recon=0.7840 loss_meanflow=0.0000 mean_model_t=0.2101 mean_corrupt_t=0.2101 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.7687 corrupt_frac=1.0000 acc_corrupt=0.7687 loss_corrupt=0.7840 wrong_frac=0.7898 init_acc_corrupt=0.2102 acc_corrupt_t_0p0_0p2=0.6409 corrupt_frac_t_0p0_0p2=0.5542 acc_corrupt_t_0p2_0p4=0.9159 corrupt_frac_t_0p2_0p4=0.3577 acc_corrupt_t_0p4_0p6=0.9730 corrupt_frac_t_0p4_0p6=0.0784 out_w_norm=10.7803 out_g_norm=0.4530 acc_corrupt_t_0p6_0p8=0.9885 corrupt_frac_t_0p6_0p8=0.0141 loss_all=0.4419 init_gold_top10=0.2162 init_gold_top100=0.2884
197
+ step=1000 epoch=1000/1000 epoch_step=1/1 micro_steps=1000 elapsed=4.0s lr=2.000000e-03 loss=0.3991 loss_recon=0.3991 loss_meanflow=0.0000 mean_model_t=0.2095 mean_corrupt_t=0.2095 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.8878 corrupt_frac=1.0000 acc_corrupt=0.8878 loss_corrupt=0.3991 wrong_frac=0.7906 init_acc_corrupt=0.2094 acc_corrupt_t_0p0_0p2=0.8048 corrupt_frac_t_0p0_0p2=0.5538 acc_corrupt_t_0p2_0p4=0.9889 corrupt_frac_t_0p2_0p4=0.3596 acc_corrupt_t_0p4_0p6=0.9981 corrupt_frac_t_0p4_0p6=0.0773 out_w_norm=11.1175 out_g_norm=0.3625 acc_corrupt_t_0p6_0p8=0.9988 corrupt_frac_t_0p6_0p8=0.0140 acc_corrupt_t_0p8_1p0=1.0000 corrupt_frac_t_0p8_1p0=0.0078 loss_all=0.3601 init_gold_top10=0.2185 init_gold_top100=0.2934
198
+ NCCL version 2.25.1+cuda12.8
199
+ resumed_from=runs/train8_combo_len256_logistic_unigram_shared_C1024_20260517_170456/latest.pt start_step=1001
200
+ {
201
+ "device": "cuda:0",
202
+ "rank": 0,
203
+ "world_size": 4,
204
+ "samples": "owt_cached_chunks:8",
205
+ "vocab_size": 969,
206
+ "tokenizer_vocab_size": 50257,
207
+ "save_dir": "runs/train8_combo_len256_logistic_unigram_shared_C1024_20260517_170456",
208
+ "batch_size": 128,
209
+ "grad_accum": 1,
210
+ "effective_batch_size": 512,
211
+ "global_batch_size": 512,
212
+ "lr_schedule": "constant_warmup",
213
+ "optimizer": "muon",
214
+ "epochs": 0.0,
215
+ "steps_per_epoch": 1,
216
+ "total_steps": 2000,
217
+ "warmup_steps": 10,
218
+ "warmup_epochs": -1.0,
219
+ "min_lr": 0.0,
220
+ "weight_decay": 0.1,
221
+ "output_weight_decay": -1.0,
222
+ "adamw_param_groups": "nanogpt",
223
+ "adam_beta1": 0.9,
224
+ "adam_beta2": 0.95,
225
+ "adam_eps": 1e-08,
226
+ "muon_impl": "legacy",
227
+ "muon_momentum": 0.95,
228
+ "muon_ns_steps": 5,
229
+ "muon_update_scale": 1.0,
230
+ "muon_nesterov": false,
231
+ "muon_width_scale": false,
232
+ "muon_grouping": "legacy_dim_ge_2",
233
+ "muon_param_count": 1965440,
234
+ "muon_adam_param_count": 8192,
235
+ "muon_param_names": [
236
+ "vocab_embed.embedding",
237
+ "sigma_map.net.0.weight",
238
+ "sigma_map.net.2.weight",
239
+ "blocks.0.attn_qkv.weight",
240
+ "blocks.0.attn_out.weight",
241
+ "blocks.0.mlp.0.weight",
242
+ "blocks.0.mlp.2.weight",
243
+ "blocks.0.adaLN_modulation.weight",
244
+ "blocks.1.attn_qkv.weight",
245
+ "blocks.1.attn_out.weight",
246
+ "blocks.1.mlp.0.weight",
247
+ "blocks.1.mlp.2.weight",
248
+ "blocks.1.adaLN_modulation.weight",
249
+ "blocks.2.attn_qkv.weight",
250
+ "blocks.2.attn_out.weight",
251
+ "blocks.2.mlp.0.weight",
252
+ "blocks.2.mlp.2.weight",
253
+ "blocks.2.adaLN_modulation.weight",
254
+ "output_layer.linear.weight",
255
+ "output_layer.adaLN_modulation.weight"
256
+ ],
257
+ "muon_adam_param_names": [
258
+ "sigma_map.net.0.bias",
259
+ "sigma_map.net.2.bias",
260
+ "blocks.0.norm1.weight",
261
+ "blocks.0.norm2.weight",
262
+ "blocks.0.mlp.0.bias",
263
+ "blocks.0.mlp.2.bias",
264
+ "blocks.0.adaLN_modulation.bias",
265
+ "blocks.1.norm1.weight",
266
+ "blocks.1.norm2.weight",
267
+ "blocks.1.mlp.0.bias",
268
+ "blocks.1.mlp.2.bias",
269
+ "blocks.1.adaLN_modulation.bias",
270
+ "blocks.2.norm1.weight",
271
+ "blocks.2.norm2.weight",
272
+ "blocks.2.mlp.0.bias",
273
+ "blocks.2.mlp.2.bias",
274
+ "blocks.2.adaLN_modulation.bias",
275
+ "output_layer.norm_final.weight",
276
+ "output_layer.adaLN_modulation.bias"
277
+ ],
278
+ "muon_effective_nesterov": false,
279
+ "muon_effective_width_scale": false,
280
+ "muon_effective_weight_decay": 0.1,
281
+ "muon_adam_fallback_nesterov": false,
282
+ "muon_adam_fallback_weight_decay": 0.1,
283
+ "ema_decay": 0.9999,
284
+ "ema_start_step": 0,
285
+ "model_type": "ddit",
286
+ "ddit_mlp_type": "gelu",
287
+ "elf_num_time_tokens": 4,
288
+ "elf_num_model_mode_tokens": 0,
289
+ "qk_norm": true,
290
+ "output_bias": false,
291
+ "output_init_std": -1.0,
292
+ "norm_type": "rmsnorm",
293
+ "target_loss": "hard_ce",
294
+ "linear_soft_target_power": 1.0,
295
+ "linear_soft_target_min_conf": 0.0,
296
+ "linear_soft_target_max_conf": 1.0,
297
+ "t_sampling_mode": "logit_normal",
298
+ "t_sampling_power": 1.0,
299
+ "t_sampling_eps": 0.0001,
300
+ "t_sampling_logit_mean": -1.5,
301
+ "t_sampling_logit_std": 0.8,
302
+ "dual_t": true,
303
+ "corrupt_t_mode": "same",
304
+ "corrupt_min_t": 0.0,
305
+ "corrupt_max_t": 1.0,
306
+ "prefix_block_prob": 0.0,
307
+ "prefix_block_len": 128,
308
+ "mask_ratio_floor_schedule": "none",
309
+ "dirichlet_endpoint_mode": "categorical_dual_t",
310
+ "dirichlet_semantic_t_mode": "same",
311
+ "dirichlet_semantic_t_value": 0.0,
312
+ "dirichlet_semantic_t_curve": "linear",
313
+ "dirichlet_semantic_t_power": 1.0,
314
+ "endpoint_sequence_random_prob_alpha": 0.0,
315
+ "categorical_wrong_from_full_vocab": true,
316
+ "categorical_wrong_from_batch_valid_tokens": false,
317
+ "categorical_wrong_basin_token_ids": "",
318
+ "categorical_wrong_basin_prob": 0.0,
319
+ "categorical_wrong_unigram_prob": 1.0,
320
+ "categorical_wrong_uniform_prob": 0.0,
321
+ "categorical_wrong_corpus_unigram_path": "",
322
+ "categorical_wrong_corpus_unigram_alpha": 1.0,
323
+ "categorical_wrong_basin_shared_prob": 0.0,
324
+ "categorical_wrong_unigram_shared_prob": 0.5,
325
+ "mask_mixture_original_prob": 0.0,
326
+ "mask_mixture_lowk_prob": 0.0,
327
+ "mask_mixture_lowcorrupt_prob": 0.0,
328
+ "mask_mixture_block_prob": 0.0,
329
+ "mask_mixture_all_prob": 1.0,
330
+ "mask_mixture_lowk_clean_tokens": "0",
331
+ "mask_mixture_lowcorrupt_tokens": "1,2,4,8,16,32,64",
332
+ "mask_mixture_block_tokens": "64,128",
333
+ "simplex_bridge_sampler": "logistic_normal_linear_mean",
334
+ "logistic_normal_sigma_min": 0.03,
335
+ "logistic_normal_sigma_max": 0.4,
336
+ "logistic_normal_tau_min": 1.0,
337
+ "logistic_normal_tau_max": 1.0,
338
+ "torch_compile": false,
339
+ "compile_mode": "max-autotune",
340
+ "state_format": "prob",
341
+ "meanflow_weight": 0.0,
342
+ "rollout_train_prob": 0.0,
343
+ "rollout_train_steps": 1,
344
+ "rollout_train_infer_steps": 64,
345
+ "rollout_train_temp": 1.45,
346
+ "rollout_train_max_gamma": 1.0,
347
+ "rollout_train_corrupt_only": true,
348
+ "rollout_train_samplewise": false,
349
+ "rollout_train_compute_always": false,
350
+ "bridge_noise_init": "logistic_normal",
351
+ "noise_sigma": -1.0,
352
+ "allow_tf32": true,
353
+ "activation_checkpointing": false,
354
+ "activation_checkpoint_interval": 1,
355
+ "activation_checkpoint_scope": "block",
356
+ "ddp_static_graph": false,
357
+ "ddp_gradient_as_bucket_view": true,
358
+ "blocking_data_transfer": false,
359
+ "dataloader_prefetch_factor": 4,
360
+ "full_train_stats": false,
361
+ "tokenized_hf": false,
362
+ "tokenized_pad_token": "pad",
363
+ "elf_conditional_hf": false,
364
+ "record_pad_truncate": false,
365
+ "record_add_eos": false,
366
+ "record_add_special_tokens": false,
367
+ "record_pad_token": "pad",
368
+ "record_shuffle_buffer": 10000,
369
+ "wrap": true,
370
+ "wrap_mode": "stream",
371
+ "wrap_record_buffer_size": 200,
372
+ "owt_cached_chunks": true,
373
+ "owt_chunk_cache_dir": "/e2e-data/evad-tech-vla/wanghan58/data/small_benchmarks/langflow_2604_11748/openwebtext_lta_cached_chunks/gpt2_len256_train8_compact_overfit",
374
+ "owt_chunk_cache_rebuild": false,
375
+ "owt_chunk_cache_write_batch": 4096,
376
+ "owt_exact_repeat_per_chunk": 64,
377
+ "online_chunk_shuffle": false,
378
+ "online_chunk_shuffle_buffer": 10000,
379
+ "openwebtext_split": "train_minus_100k",
380
+ "detokenizer": "auto",
381
+ "resolved_detokenizer": null,
382
+ "num_workers": 0,
383
+ "latest_every": 1000,
384
+ "resume_path": "runs/train8_combo_len256_logistic_unigram_shared_C1024_20260517_170456/latest.pt"
385
+ }
386
+ step=1100 epoch=1100/2000 epoch_step=1/1 micro_steps=1100 elapsed=4.4s lr=2.000000e-03 loss=0.2548 loss_recon=0.2548 loss_meanflow=0.0000 mean_model_t=0.2098 mean_corrupt_t=0.2098 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.9283 corrupt_frac=1.0000 acc_corrupt=0.9283 loss_corrupt=0.2548 wrong_frac=0.7900 init_acc_corrupt=0.2100 acc_corrupt_t_0p0_0p2=0.8728 corrupt_frac_t_0p0_0p2=0.5564 acc_corrupt_t_0p2_0p4=0.9975 corrupt_frac_t_0p2_0p4=0.3581 acc_corrupt_t_0p4_0p6=0.9995 corrupt_frac_t_0p4_0p6=0.0759 acc_corrupt_t_0p6_0p8=0.9996 corrupt_frac_t_0p6_0p8=0.0137 out_w_norm=11.3638 out_g_norm=0.3115 acc_corrupt_t_0p8_1p0=1.0000 corrupt_frac_t_0p8_1p0=0.0078 loss_all=0.2897 init_gold_top10=0.2083 init_gold_top100=0.2807
387
+ step=1200 epoch=1200/2000 epoch_step=1/1 micro_steps=1200 elapsed=3.8s lr=2.000000e-03 loss=0.1933 loss_recon=0.1933 loss_meanflow=0.0000 mean_model_t=0.2086 mean_corrupt_t=0.2086 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.9462 corrupt_frac=1.0000 acc_corrupt=0.9462 loss_corrupt=0.1933 wrong_frac=0.7913 init_acc_corrupt=0.2087 acc_corrupt_t_0p0_0p2=0.9044 corrupt_frac_t_0p0_0p2=0.5588 acc_corrupt_t_0p2_0p4=0.9990 corrupt_frac_t_0p2_0p4=0.3575 acc_corrupt_t_0p4_0p6=0.9998 corrupt_frac_t_0p4_0p6=0.0753 acc_corrupt_t_0p6_0p8=0.9997 corrupt_frac_t_0p6_0p8=0.0130 out_w_norm=11.4992 out_g_norm=0.2630 acc_corrupt_t_0p8_1p0=1.0000 corrupt_frac_t_0p8_1p0=0.0078 loss_all=0.2213 init_gold_top10=0.2240 init_gold_top100=0.2987
388
+ step=1300 epoch=1300/2000 epoch_step=1/1 micro_steps=1300 elapsed=3.8s lr=2.000000e-03 loss=0.1427 loss_recon=0.1427 loss_meanflow=0.0000 mean_model_t=0.2113 mean_corrupt_t=0.2113 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.9602 corrupt_frac=1.0000 acc_corrupt=0.9602 loss_corrupt=0.1427 wrong_frac=0.7886 init_acc_corrupt=0.2114 acc_corrupt_t_0p0_0p2=0.9283 corrupt_frac_t_0p0_0p2=0.5517 acc_corrupt_t_0p2_0p4=0.9994 corrupt_frac_t_0p2_0p4=0.3592 acc_corrupt_t_0p4_0p6=0.9997 corrupt_frac_t_0p4_0p6=0.0802 acc_corrupt_t_0p6_0p8=0.9995 corrupt_frac_t_0p6_0p8=0.0121 out_w_norm=11.5852 out_g_norm=0.2256 acc_corrupt_t_0p8_1p0=1.0000 corrupt_frac_t_0p8_1p0=0.0078 loss_all=0.1299 init_gold_top10=0.2068 init_gold_top100=0.2804
389
+ step=1400 epoch=1400/2000 epoch_step=1/1 micro_steps=1400 elapsed=3.8s lr=2.000000e-03 loss=0.1327 loss_recon=0.1327 loss_meanflow=0.0000 mean_model_t=0.2095 mean_corrupt_t=0.2095 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.9637 corrupt_frac=1.0000 acc_corrupt=0.9637 loss_corrupt=0.1327 wrong_frac=0.7907 init_acc_corrupt=0.2093 acc_corrupt_t_0p0_0p2=0.9349 corrupt_frac_t_0p0_0p2=0.5551 acc_corrupt_t_0p2_0p4=0.9997 corrupt_frac_t_0p2_0p4=0.3560 acc_corrupt_t_0p4_0p6=0.9999 corrupt_frac_t_0p4_0p6=0.0795 acc_corrupt_t_0p6_0p8=0.9997 corrupt_frac_t_0p6_0p8=0.0129 out_w_norm=11.6459 out_g_norm=0.2093 acc_corrupt_t_0p8_1p0=1.0000 corrupt_frac_t_0p8_1p0=0.0078 loss_all=0.0916 init_gold_top10=0.2126 init_gold_top100=0.2850
390
+ W0517 17:10:36.514000 234504 torch/distributed/elastic/agent/server/api.py:719] Received 15 death signal, shutting down workers
391
+ W0517 17:10:36.515000 234504 torch/distributed/elastic/multiprocessing/api.py:898] Sending process 234508 closing signal SIGTERM
392
+ W0517 17:10:36.515000 234504 torch/distributed/elastic/multiprocessing/api.py:898] Sending process 234509 closing signal SIGTERM
393
+ W0517 17:10:36.516000 234504 torch/distributed/elastic/multiprocessing/api.py:898] Sending process 234510 closing signal SIGTERM
394
+ W0517 17:10:36.516000 234504 torch/distributed/elastic/multiprocessing/api.py:898] Sending process 234511 closing signal SIGTERM
395
+ Traceback (most recent call last):
396
+ File "<frozen runpy>", line 198, in _run_module_as_main
397
+ File "<frozen runpy>", line 88, in _run_code
398
+ File "/usr/local/lib/python3.12/dist-packages/torch/distributed/run.py", line 922, in <module>
399
+ main()
400
+ File "/usr/local/lib/python3.12/dist-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 355, in wrapper
401
+ return f(*args, **kwargs)
402
+ ^^^^^^^^^^^^^^^^^^
403
+ File "/usr/local/lib/python3.12/dist-packages/torch/distributed/run.py", line 918, in main
404
+ run(args)
405
+ File "/usr/local/lib/python3.12/dist-packages/torch/distributed/run.py", line 909, in run
406
+ elastic_launch(
407
+ File "/usr/local/lib/python3.12/dist-packages/torch/distributed/launcher/api.py", line 139, in __call__
408
+ return launch_agent(self._config, self._entrypoint, list(args))
409
+ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
410
+ File "/usr/local/lib/python3.12/dist-packages/torch/distributed/launcher/api.py", line 261, in launch_agent
411
+ result = agent.run()
412
+ ^^^^^^^^^^^
413
+ File "/usr/local/lib/python3.12/dist-packages/torch/distributed/elastic/metrics/api.py", line 137, in wrapper
414
+ result = f(*args, **kwargs)
415
+ ^^^^^^^^^^^^^^^^^^
416
+ File "/usr/local/lib/python3.12/dist-packages/torch/distributed/elastic/agent/server/api.py", line 711, in run
417
+ result = self._invoke_run(role)
418
+ ^^^^^^^^^^^^^^^^^^^^^^
419
+ File "/usr/local/lib/python3.12/dist-packages/torch/distributed/elastic/agent/server/api.py", line 870, in _invoke_run
420
+ time.sleep(monitor_interval)
421
+ File "/usr/local/lib/python3.12/dist-packages/torch/distributed/elastic/multiprocessing/api.py", line 84, in _terminate_process_handler
422
+ raise SignalException(f"Process {os.getpid()} got signal: {sigval}", sigval=sigval)
423
+ torch.distributed.elastic.multiprocessing.api.SignalException: Process 234504 got signal: 15
LTA_openwebtext_dualt/logs/softendpoint_mn_pilot_4gpu/train8_ctx1024_core_p50_unif0_0p25_outwdm1_ctx1024_core_tradeoff_dual_20260517_230929.log ADDED
The diff for this file is too large to render. See raw diff
 
LTA_openwebtext_dualt/logs/softendpoint_mn_pilot_4gpu/train8_ctx1024_p50_unif0_0p25_outwdm1_ctx1024_sampleds_wide_20260517_220321.log ADDED
The diff for this file is too large to render. See raw diff
 
LTA_openwebtext_dualt/logs/softendpoint_mn_pilot_4gpu/train8_ctx1024_path_p50_path3_unif0_0p25_outwdm1_ctx1024_path_tradeoff_sde_20260517_232950.log ADDED
@@ -0,0 +1,617 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ NCCL version 2.25.1+cuda12.8
2
+ {
3
+ "device": "cuda:0",
4
+ "rank": 0,
5
+ "world_size": 4,
6
+ "samples": "owt_cached_chunks:8",
7
+ "vocab_size": 2664,
8
+ "tokenizer_vocab_size": 50257,
9
+ "save_dir": "runs/train8_ctx1024_path_p50_path3_unif0_0p25_outwdm1_ctx1024_path_tradeoff_sde_20260517_232950",
10
+ "batch_size": 128,
11
+ "grad_accum": 1,
12
+ "effective_batch_size": 512,
13
+ "global_batch_size": 512,
14
+ "lr_schedule": "constant_warmup",
15
+ "optimizer": "muon",
16
+ "epochs": 0.0,
17
+ "steps_per_epoch": 1,
18
+ "total_steps": 1000,
19
+ "warmup_steps": 10,
20
+ "warmup_epochs": -1.0,
21
+ "min_lr": 0.0,
22
+ "weight_decay": 0.1,
23
+ "output_weight_decay": -1.0,
24
+ "adamw_param_groups": "nanogpt",
25
+ "adam_beta1": 0.9,
26
+ "adam_beta2": 0.95,
27
+ "adam_eps": 1e-08,
28
+ "muon_impl": "legacy",
29
+ "muon_momentum": 0.95,
30
+ "muon_ns_steps": 5,
31
+ "muon_update_scale": 1.0,
32
+ "muon_nesterov": false,
33
+ "muon_width_scale": false,
34
+ "muon_grouping": "legacy_dim_ge_2",
35
+ "muon_param_count": 2616320,
36
+ "muon_adam_param_count": 8192,
37
+ "muon_param_names": [
38
+ "vocab_embed.embedding",
39
+ "sigma_map.net.0.weight",
40
+ "sigma_map.net.2.weight",
41
+ "blocks.0.attn_qkv.weight",
42
+ "blocks.0.attn_out.weight",
43
+ "blocks.0.mlp.0.weight",
44
+ "blocks.0.mlp.2.weight",
45
+ "blocks.0.adaLN_modulation.weight",
46
+ "blocks.1.attn_qkv.weight",
47
+ "blocks.1.attn_out.weight",
48
+ "blocks.1.mlp.0.weight",
49
+ "blocks.1.mlp.2.weight",
50
+ "blocks.1.adaLN_modulation.weight",
51
+ "blocks.2.attn_qkv.weight",
52
+ "blocks.2.attn_out.weight",
53
+ "blocks.2.mlp.0.weight",
54
+ "blocks.2.mlp.2.weight",
55
+ "blocks.2.adaLN_modulation.weight",
56
+ "output_layer.linear.weight",
57
+ "output_layer.adaLN_modulation.weight"
58
+ ],
59
+ "muon_adam_param_names": [
60
+ "sigma_map.net.0.bias",
61
+ "sigma_map.net.2.bias",
62
+ "blocks.0.norm1.weight",
63
+ "blocks.0.norm2.weight",
64
+ "blocks.0.mlp.0.bias",
65
+ "blocks.0.mlp.2.bias",
66
+ "blocks.0.adaLN_modulation.bias",
67
+ "blocks.1.norm1.weight",
68
+ "blocks.1.norm2.weight",
69
+ "blocks.1.mlp.0.bias",
70
+ "blocks.1.mlp.2.bias",
71
+ "blocks.1.adaLN_modulation.bias",
72
+ "blocks.2.norm1.weight",
73
+ "blocks.2.norm2.weight",
74
+ "blocks.2.mlp.0.bias",
75
+ "blocks.2.mlp.2.bias",
76
+ "blocks.2.adaLN_modulation.bias",
77
+ "output_layer.norm_final.weight",
78
+ "output_layer.adaLN_modulation.bias"
79
+ ],
80
+ "muon_effective_nesterov": false,
81
+ "muon_effective_width_scale": false,
82
+ "muon_effective_weight_decay": 0.1,
83
+ "muon_adam_fallback_nesterov": false,
84
+ "muon_adam_fallback_weight_decay": 0.1,
85
+ "ema_decay": 0.9999,
86
+ "ema_start_step": 0,
87
+ "model_type": "ddit",
88
+ "ddit_mlp_type": "gelu",
89
+ "elf_num_time_tokens": 4,
90
+ "elf_num_model_mode_tokens": 0,
91
+ "qk_norm": true,
92
+ "output_bias": false,
93
+ "output_init_std": -1.0,
94
+ "norm_type": "rmsnorm",
95
+ "target_loss": "hard_ce",
96
+ "linear_soft_target_power": 1.0,
97
+ "linear_soft_target_min_conf": 0.0,
98
+ "linear_soft_target_max_conf": 1.0,
99
+ "t_sampling_mode": "logit_normal",
100
+ "t_sampling_power": 1.0,
101
+ "t_sampling_eps": 0.0001,
102
+ "t_sampling_logit_mean": -1.5,
103
+ "t_sampling_logit_std": 0.8,
104
+ "dual_t": true,
105
+ "corrupt_t_mode": "same",
106
+ "corrupt_min_t": 0.0,
107
+ "corrupt_max_t": 1.0,
108
+ "prefix_block_prob": 0.0,
109
+ "prefix_block_len": 128,
110
+ "mask_ratio_floor_schedule": "none",
111
+ "dirichlet_endpoint_mode": "categorical_dual_t",
112
+ "dirichlet_semantic_t_mode": "same",
113
+ "dirichlet_semantic_t_value": 0.0,
114
+ "dirichlet_semantic_t_curve": "linear",
115
+ "dirichlet_semantic_t_power": 1.0,
116
+ "endpoint_sequence_random_prob_alpha": 0.0,
117
+ "categorical_wrong_from_full_vocab": true,
118
+ "categorical_wrong_from_batch_valid_tokens": false,
119
+ "categorical_wrong_basin_token_ids": "",
120
+ "categorical_wrong_basin_prob": 0.0,
121
+ "categorical_wrong_unigram_prob": 0.0,
122
+ "categorical_wrong_uniform_prob": 0.0,
123
+ "categorical_wrong_prob_floor": 0.0,
124
+ "categorical_wrong_corpus_unigram_path": "",
125
+ "categorical_wrong_corpus_unigram_alpha": 1.0,
126
+ "categorical_wrong_basin_shared_prob": 0.0,
127
+ "categorical_wrong_unigram_shared_prob": 0.0,
128
+ "mask_mixture_original_prob": 0.0,
129
+ "mask_mixture_lowk_prob": 0.0,
130
+ "mask_mixture_lowcorrupt_prob": 0.0,
131
+ "mask_mixture_block_prob": 0.0,
132
+ "mask_mixture_all_prob": 1.0,
133
+ "mask_mixture_lowk_clean_tokens": "0",
134
+ "mask_mixture_lowcorrupt_tokens": "1,2,4,8,16,32,64",
135
+ "mask_mixture_block_tokens": "64,128",
136
+ "simplex_bridge_sampler": "dirichlet",
137
+ "logistic_normal_sigma_min": 0.1,
138
+ "logistic_normal_sigma_max": 1.0,
139
+ "logistic_normal_tau_min": 1.0,
140
+ "logistic_normal_tau_max": 1.0,
141
+ "torch_compile": false,
142
+ "compile_mode": "max-autotune",
143
+ "state_format": "prob",
144
+ "meanflow_weight": 0.0,
145
+ "rollout_train_prob": 0.5,
146
+ "rollout_train_steps": 3,
147
+ "rollout_train_infer_steps": 1,
148
+ "rollout_train_time_mode": "sampled_path",
149
+ "rollout_train_s_dist": "uniform",
150
+ "rollout_train_s_min_frac": 0.0,
151
+ "rollout_train_s_max_frac": 0.25,
152
+ "rollout_train_s_beta_alpha": 2.0,
153
+ "rollout_train_s_beta_beta": 6.0,
154
+ "rollout_train_temp": 1.45,
155
+ "rollout_train_max_gamma": 1.0,
156
+ "rollout_train_corrupt_only": true,
157
+ "rollout_train_samplewise": true,
158
+ "rollout_train_compute_always": false,
159
+ "rollout_train_sync_t": true,
160
+ "bridge_noise_init": "logistic_normal",
161
+ "noise_sigma": -1.0,
162
+ "allow_tf32": true,
163
+ "activation_checkpointing": false,
164
+ "activation_checkpoint_interval": 1,
165
+ "activation_checkpoint_scope": "block",
166
+ "ddp_static_graph": false,
167
+ "ddp_gradient_as_bucket_view": true,
168
+ "blocking_data_transfer": false,
169
+ "dataloader_prefetch_factor": 4,
170
+ "full_train_stats": false,
171
+ "tokenized_hf": false,
172
+ "tokenized_pad_token": "pad",
173
+ "elf_conditional_hf": false,
174
+ "record_pad_truncate": false,
175
+ "record_add_eos": false,
176
+ "record_add_special_tokens": false,
177
+ "record_pad_token": "pad",
178
+ "record_shuffle_buffer": 10000,
179
+ "wrap": true,
180
+ "wrap_mode": "stream",
181
+ "wrap_record_buffer_size": 200,
182
+ "owt_cached_chunks": true,
183
+ "owt_chunk_cache_dir": "/e2e-data/evad-tech-vla/wanghan58/data/small_benchmarks/langflow_2604_11748/openwebtext_lta_cached_chunks/gpt2_len1024_train8_compact_overfit",
184
+ "owt_chunk_cache_rebuild": false,
185
+ "owt_chunk_cache_write_batch": 4096,
186
+ "owt_exact_repeat_per_chunk": 64,
187
+ "online_chunk_shuffle": false,
188
+ "online_chunk_shuffle_buffer": 10000,
189
+ "openwebtext_split": "train_minus_100k",
190
+ "detokenizer": "auto",
191
+ "resolved_detokenizer": null,
192
+ "num_workers": 0,
193
+ "latest_every": 1000,
194
+ "resume_path": ""
195
+ }
196
+ step=100 epoch=100/1000 epoch_step=1/1 micro_steps=100 elapsed=21.5s lr=2.000000e-03 loss=7.7229 loss_recon=7.7229 loss_meanflow=0.0000 mean_model_t=0.2076 mean_corrupt_t=0.2076 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.5055 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.0942 corrupt_frac=1.0000 acc_corrupt=0.0942 loss_corrupt=7.7229 wrong_frac=0.7923 init_acc_corrupt=0.1148 acc_corrupt_t_0p0_0p2=0.0501 corrupt_frac_t_0p0_0p2=0.5602 acc_corrupt_t_0p2_0p4=0.1248 corrupt_frac_t_0p2_0p4=0.3545 acc_corrupt_t_0p4_0p6=0.2468 corrupt_frac_t_0p4_0p6=0.0781 acc_corrupt_t_0p6_0p8=0.3541 corrupt_frac_t_0p6_0p8=0.0123 out_w_norm=0.9887 out_g_norm=1.1051 acc_corrupt_t_0p8_1p0=0.4805 corrupt_frac_t_0p8_1p0=0.0078 loss_all=7.4886 init_gold_top10=0.1877 init_gold_top100=0.4027 rollout_applied_pos_frac=0.4688 init_acc_rollout_applied=0.1012 init_acc_rollout_kept=0.1215 logit_acc_rollout_applied=0.0875 logit_acc_rollout_kept=0.1014
197
+ step=200 epoch=200/1000 epoch_step=1/1 micro_steps=200 elapsed=20.4s lr=2.000000e-03 loss=7.0910 loss_recon=7.0910 loss_meanflow=0.0000 mean_model_t=0.2097 mean_corrupt_t=0.2097 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.4901 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.1032 corrupt_frac=1.0000 acc_corrupt=0.1032 loss_corrupt=7.0910 wrong_frac=0.7903 init_acc_corrupt=0.1175 acc_corrupt_t_0p0_0p2=0.0559 corrupt_frac_t_0p0_0p2=0.5515 acc_corrupt_t_0p2_0p4=0.1372 corrupt_frac_t_0p2_0p4=0.3636 acc_corrupt_t_0p4_0p6=0.2539 corrupt_frac_t_0p4_0p6=0.0764 out_w_norm=2.8096 out_g_norm=1.7834 acc_corrupt_t_0p6_0p8=0.3611 corrupt_frac_t_0p6_0p8=0.0139 acc_corrupt_t_0p8_1p0=0.3809 corrupt_frac_t_0p8_1p0=0.0078 loss_all=6.6724 init_gold_top10=0.2130 init_gold_top100=0.4321 rollout_applied_pos_frac=0.4531 init_acc_rollout_applied=0.1605 init_acc_rollout_kept=0.1231 logit_acc_rollout_applied=0.1265 logit_acc_rollout_kept=0.1099
198
+ step=300 epoch=300/1000 epoch_step=1/1 micro_steps=300 elapsed=20.6s lr=2.000000e-03 loss=6.4650 loss_recon=6.4650 loss_meanflow=0.0000 mean_model_t=0.2087 mean_corrupt_t=0.2087 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.4968 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.1123 corrupt_frac=1.0000 acc_corrupt=0.1123 loss_corrupt=6.4650 wrong_frac=0.7912 init_acc_corrupt=0.1185 acc_corrupt_t_0p0_0p2=0.0588 corrupt_frac_t_0p0_0p2=0.5584 acc_corrupt_t_0p2_0p4=0.1531 corrupt_frac_t_0p2_0p4=0.3549 acc_corrupt_t_0p4_0p6=0.2802 corrupt_frac_t_0p4_0p6=0.0783 acc_corrupt_t_0p6_0p8=0.3817 corrupt_frac_t_0p6_0p8=0.0124 out_w_norm=4.2736 out_g_norm=1.3202 acc_corrupt_t_0p8_1p0=0.4082 corrupt_frac_t_0p8_1p0=0.0078 loss_all=6.1804 init_gold_top10=0.2178 init_gold_top100=0.4494 rollout_applied_pos_frac=0.4609 init_acc_rollout_applied=0.1468 init_acc_rollout_kept=0.1181 logit_acc_rollout_applied=0.1312 logit_acc_rollout_kept=0.1208
199
+ step=400 epoch=400/1000 epoch_step=1/1 micro_steps=400 elapsed=20.6s lr=2.000000e-03 loss=5.9949 loss_recon=5.9949 loss_meanflow=0.0000 mean_model_t=0.2086 mean_corrupt_t=0.2086 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.4985 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.1232 corrupt_frac=1.0000 acc_corrupt=0.1232 loss_corrupt=5.9949 wrong_frac=0.7913 init_acc_corrupt=0.1187 acc_corrupt_t_0p0_0p2=0.0631 corrupt_frac_t_0p0_0p2=0.5577 acc_corrupt_t_0p2_0p4=0.1693 corrupt_frac_t_0p2_0p4=0.3548 acc_corrupt_t_0p4_0p6=0.3079 corrupt_frac_t_0p4_0p6=0.0790 acc_corrupt_t_0p6_0p8=0.4315 corrupt_frac_t_0p6_0p8=0.0131 out_w_norm=5.3865 out_g_norm=0.5134 acc_corrupt_t_0p8_1p0=0.4951 corrupt_frac_t_0p8_1p0=0.0078 loss_all=5.7900 init_gold_top10=0.2070 init_gold_top100=0.4617 rollout_applied_pos_frac=0.4844 init_acc_rollout_applied=0.1433 init_acc_rollout_kept=0.0884 logit_acc_rollout_applied=0.1405 logit_acc_rollout_kept=0.1180
200
+ step=500 epoch=500/1000 epoch_step=1/1 micro_steps=500 elapsed=20.7s lr=2.000000e-03 loss=5.4954 loss_recon=5.4954 loss_meanflow=0.0000 mean_model_t=0.2100 mean_corrupt_t=0.2100 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.5038 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.1370 corrupt_frac=1.0000 acc_corrupt=0.1370 loss_corrupt=5.4954 wrong_frac=0.7902 init_acc_corrupt=0.1205 acc_corrupt_t_0p0_0p2=0.0677 corrupt_frac_t_0p0_0p2=0.5514 acc_corrupt_t_0p2_0p4=0.1895 corrupt_frac_t_0p2_0p4=0.3640 acc_corrupt_t_0p4_0p6=0.3484 corrupt_frac_t_0p4_0p6=0.0755 acc_corrupt_t_0p6_0p8=0.4805 corrupt_frac_t_0p6_0p8=0.0128 out_w_norm=6.5907 out_g_norm=0.3970 loss_all=5.1394 init_gold_top10=0.2261 init_gold_top100=0.5374 rollout_applied_pos_frac=0.5547 init_acc_rollout_applied=0.1467 init_acc_rollout_kept=0.1158 logit_acc_rollout_applied=0.1596 logit_acc_rollout_kept=0.1415
201
+ step=600 epoch=600/1000 epoch_step=1/1 micro_steps=600 elapsed=20.6s lr=2.000000e-03 loss=4.9137 loss_recon=4.9137 loss_meanflow=0.0000 mean_model_t=0.2076 mean_corrupt_t=0.2076 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.4995 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.1505 corrupt_frac=1.0000 acc_corrupt=0.1505 loss_corrupt=4.9137 wrong_frac=0.7925 init_acc_corrupt=0.1184 acc_corrupt_t_0p0_0p2=0.0716 corrupt_frac_t_0p0_0p2=0.5603 acc_corrupt_t_0p2_0p4=0.2125 corrupt_frac_t_0p2_0p4=0.3559 acc_corrupt_t_0p4_0p6=0.3959 corrupt_frac_t_0p4_0p6=0.0753 acc_corrupt_t_0p6_0p8=0.5692 corrupt_frac_t_0p6_0p8=0.0121 out_w_norm=7.8665 out_g_norm=0.4388 acc_corrupt_t_0p8_1p0=0.7359 corrupt_frac_t_0p8_1p0=0.0078 loss_all=4.5371 init_gold_top10=0.2243 init_gold_top100=0.5435 rollout_applied_pos_frac=0.4453 init_acc_rollout_applied=0.1610 init_acc_rollout_kept=0.1132 logit_acc_rollout_applied=0.1869 logit_acc_rollout_kept=0.1600
202
+ step=700 epoch=700/1000 epoch_step=1/1 micro_steps=700 elapsed=20.5s lr=2.000000e-03 loss=4.3335 loss_recon=4.3335 loss_meanflow=0.0000 mean_model_t=0.2077 mean_corrupt_t=0.2077 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.4981 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.1727 corrupt_frac=1.0000 acc_corrupt=0.1727 loss_corrupt=4.3335 wrong_frac=0.7923 init_acc_corrupt=0.1192 acc_corrupt_t_0p0_0p2=0.0768 corrupt_frac_t_0p0_0p2=0.5594 acc_corrupt_t_0p2_0p4=0.2471 corrupt_frac_t_0p2_0p4=0.3580 acc_corrupt_t_0p4_0p6=0.4819 corrupt_frac_t_0p4_0p6=0.0748 out_w_norm=9.0569 out_g_norm=0.5927 acc_corrupt_t_0p6_0p8=0.6675 corrupt_frac_t_0p6_0p8=0.0123 acc_corrupt_t_0p8_1p0=0.8350 corrupt_frac_t_0p8_1p0=0.0078 loss_all=4.1192 init_gold_top10=0.2113 init_gold_top100=0.5984 rollout_applied_pos_frac=0.4844 init_acc_rollout_applied=0.1299 init_acc_rollout_kept=0.0787 logit_acc_rollout_applied=0.1912 logit_acc_rollout_kept=0.1465
203
+ step=800 epoch=800/1000 epoch_step=1/1 micro_steps=800 elapsed=20.5s lr=2.000000e-03 loss=3.8239 loss_recon=3.8239 loss_meanflow=0.0000 mean_model_t=0.2107 mean_corrupt_t=0.2107 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.5004 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.1981 corrupt_frac=1.0000 acc_corrupt=0.1981 loss_corrupt=3.8239 wrong_frac=0.7893 init_acc_corrupt=0.1249 acc_corrupt_t_0p0_0p2=0.0851 corrupt_frac_t_0p0_0p2=0.5520 acc_corrupt_t_0p2_0p4=0.2899 corrupt_frac_t_0p2_0p4=0.3615 acc_corrupt_t_0p4_0p6=0.5214 corrupt_frac_t_0p4_0p6=0.0795 acc_corrupt_t_0p6_0p8=0.6915 corrupt_frac_t_0p6_0p8=0.0119 out_w_norm=9.9110 out_g_norm=0.6682 acc_corrupt_t_0p8_1p0=0.8662 corrupt_frac_t_0p8_1p0=0.0078 loss_all=3.5685 init_gold_top10=0.2574 init_gold_top100=0.6007 rollout_applied_pos_frac=0.4531 init_acc_rollout_applied=0.1424 init_acc_rollout_kept=0.1156 logit_acc_rollout_applied=0.2235 logit_acc_rollout_kept=0.2104
204
+ step=900 epoch=900/1000 epoch_step=1/1 micro_steps=900 elapsed=20.6s lr=2.000000e-03 loss=3.4022 loss_recon=3.4022 loss_meanflow=0.0000 mean_model_t=0.2116 mean_corrupt_t=0.2116 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.4984 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.2315 corrupt_frac=1.0000 acc_corrupt=0.2315 loss_corrupt=3.4022 wrong_frac=0.7884 init_acc_corrupt=0.1278 acc_corrupt_t_0p0_0p2=0.0982 corrupt_frac_t_0p0_0p2=0.5516 acc_corrupt_t_0p2_0p4=0.3450 corrupt_frac_t_0p2_0p4=0.3591 acc_corrupt_t_0p4_0p6=0.5803 corrupt_frac_t_0p4_0p6=0.0791 out_w_norm=10.4178 out_g_norm=0.8631 acc_corrupt_t_0p6_0p8=0.7328 corrupt_frac_t_0p6_0p8=0.0136 acc_corrupt_t_0p8_1p0=0.8910 corrupt_frac_t_0p8_1p0=0.0078 loss_all=3.2329 init_gold_top10=0.3245 init_gold_top100=0.6804 rollout_applied_pos_frac=0.5781 init_acc_rollout_applied=0.1320 init_acc_rollout_kept=0.1357 logit_acc_rollout_applied=0.2549 logit_acc_rollout_kept=0.2607
205
+ step=1000 epoch=1000/1000 epoch_step=1/1 micro_steps=1000 elapsed=20.7s lr=2.000000e-03 loss=3.0118 loss_recon=3.0118 loss_meanflow=0.0000 mean_model_t=0.2093 mean_corrupt_t=0.2093 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.5052 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.2798 corrupt_frac=1.0000 acc_corrupt=0.2798 loss_corrupt=3.0118 wrong_frac=0.7908 init_acc_corrupt=0.1282 acc_corrupt_t_0p0_0p2=0.1208 corrupt_frac_t_0p0_0p2=0.5602 acc_corrupt_t_0p2_0p4=0.4317 corrupt_frac_t_0p2_0p4=0.3545 acc_corrupt_t_0p4_0p6=0.6799 corrupt_frac_t_0p4_0p6=0.0764 acc_corrupt_t_0p6_0p8=0.8054 corrupt_frac_t_0p6_0p8=0.0119 out_w_norm=10.7548 out_g_norm=1.0637 acc_corrupt_t_0p8_1p0=0.8809 corrupt_frac_t_0p8_1p0=0.0078 loss_all=2.7119 init_gold_top10=0.3879 init_gold_top100=0.6551 rollout_applied_pos_frac=0.5312 init_acc_rollout_applied=0.1689 init_acc_rollout_kept=0.1305 logit_acc_rollout_applied=0.3542 logit_acc_rollout_kept=0.3334
206
+ NCCL version 2.25.1+cuda12.8
207
+ resumed_from=runs/train8_ctx1024_path_p50_path3_unif0_0p25_outwdm1_ctx1024_path_tradeoff_sde_20260517_232950/latest.pt start_step=1001
208
+ {
209
+ "device": "cuda:0",
210
+ "rank": 0,
211
+ "world_size": 4,
212
+ "samples": "owt_cached_chunks:8",
213
+ "vocab_size": 2664,
214
+ "tokenizer_vocab_size": 50257,
215
+ "save_dir": "runs/train8_ctx1024_path_p50_path3_unif0_0p25_outwdm1_ctx1024_path_tradeoff_sde_20260517_232950",
216
+ "batch_size": 128,
217
+ "grad_accum": 1,
218
+ "effective_batch_size": 512,
219
+ "global_batch_size": 512,
220
+ "lr_schedule": "constant_warmup",
221
+ "optimizer": "muon",
222
+ "epochs": 0.0,
223
+ "steps_per_epoch": 1,
224
+ "total_steps": 2000,
225
+ "warmup_steps": 10,
226
+ "warmup_epochs": -1.0,
227
+ "min_lr": 0.0,
228
+ "weight_decay": 0.1,
229
+ "output_weight_decay": -1.0,
230
+ "adamw_param_groups": "nanogpt",
231
+ "adam_beta1": 0.9,
232
+ "adam_beta2": 0.95,
233
+ "adam_eps": 1e-08,
234
+ "muon_impl": "legacy",
235
+ "muon_momentum": 0.95,
236
+ "muon_ns_steps": 5,
237
+ "muon_update_scale": 1.0,
238
+ "muon_nesterov": false,
239
+ "muon_width_scale": false,
240
+ "muon_grouping": "legacy_dim_ge_2",
241
+ "muon_param_count": 2616320,
242
+ "muon_adam_param_count": 8192,
243
+ "muon_param_names": [
244
+ "vocab_embed.embedding",
245
+ "sigma_map.net.0.weight",
246
+ "sigma_map.net.2.weight",
247
+ "blocks.0.attn_qkv.weight",
248
+ "blocks.0.attn_out.weight",
249
+ "blocks.0.mlp.0.weight",
250
+ "blocks.0.mlp.2.weight",
251
+ "blocks.0.adaLN_modulation.weight",
252
+ "blocks.1.attn_qkv.weight",
253
+ "blocks.1.attn_out.weight",
254
+ "blocks.1.mlp.0.weight",
255
+ "blocks.1.mlp.2.weight",
256
+ "blocks.1.adaLN_modulation.weight",
257
+ "blocks.2.attn_qkv.weight",
258
+ "blocks.2.attn_out.weight",
259
+ "blocks.2.mlp.0.weight",
260
+ "blocks.2.mlp.2.weight",
261
+ "blocks.2.adaLN_modulation.weight",
262
+ "output_layer.linear.weight",
263
+ "output_layer.adaLN_modulation.weight"
264
+ ],
265
+ "muon_adam_param_names": [
266
+ "sigma_map.net.0.bias",
267
+ "sigma_map.net.2.bias",
268
+ "blocks.0.norm1.weight",
269
+ "blocks.0.norm2.weight",
270
+ "blocks.0.mlp.0.bias",
271
+ "blocks.0.mlp.2.bias",
272
+ "blocks.0.adaLN_modulation.bias",
273
+ "blocks.1.norm1.weight",
274
+ "blocks.1.norm2.weight",
275
+ "blocks.1.mlp.0.bias",
276
+ "blocks.1.mlp.2.bias",
277
+ "blocks.1.adaLN_modulation.bias",
278
+ "blocks.2.norm1.weight",
279
+ "blocks.2.norm2.weight",
280
+ "blocks.2.mlp.0.bias",
281
+ "blocks.2.mlp.2.bias",
282
+ "blocks.2.adaLN_modulation.bias",
283
+ "output_layer.norm_final.weight",
284
+ "output_layer.adaLN_modulation.bias"
285
+ ],
286
+ "muon_effective_nesterov": false,
287
+ "muon_effective_width_scale": false,
288
+ "muon_effective_weight_decay": 0.1,
289
+ "muon_adam_fallback_nesterov": false,
290
+ "muon_adam_fallback_weight_decay": 0.1,
291
+ "ema_decay": 0.9999,
292
+ "ema_start_step": 0,
293
+ "model_type": "ddit",
294
+ "ddit_mlp_type": "gelu",
295
+ "elf_num_time_tokens": 4,
296
+ "elf_num_model_mode_tokens": 0,
297
+ "qk_norm": true,
298
+ "output_bias": false,
299
+ "output_init_std": -1.0,
300
+ "norm_type": "rmsnorm",
301
+ "target_loss": "hard_ce",
302
+ "linear_soft_target_power": 1.0,
303
+ "linear_soft_target_min_conf": 0.0,
304
+ "linear_soft_target_max_conf": 1.0,
305
+ "t_sampling_mode": "logit_normal",
306
+ "t_sampling_power": 1.0,
307
+ "t_sampling_eps": 0.0001,
308
+ "t_sampling_logit_mean": -1.5,
309
+ "t_sampling_logit_std": 0.8,
310
+ "dual_t": true,
311
+ "corrupt_t_mode": "same",
312
+ "corrupt_min_t": 0.0,
313
+ "corrupt_max_t": 1.0,
314
+ "prefix_block_prob": 0.0,
315
+ "prefix_block_len": 128,
316
+ "mask_ratio_floor_schedule": "none",
317
+ "dirichlet_endpoint_mode": "categorical_dual_t",
318
+ "dirichlet_semantic_t_mode": "same",
319
+ "dirichlet_semantic_t_value": 0.0,
320
+ "dirichlet_semantic_t_curve": "linear",
321
+ "dirichlet_semantic_t_power": 1.0,
322
+ "endpoint_sequence_random_prob_alpha": 0.0,
323
+ "categorical_wrong_from_full_vocab": true,
324
+ "categorical_wrong_from_batch_valid_tokens": false,
325
+ "categorical_wrong_basin_token_ids": "",
326
+ "categorical_wrong_basin_prob": 0.0,
327
+ "categorical_wrong_unigram_prob": 0.0,
328
+ "categorical_wrong_uniform_prob": 0.0,
329
+ "categorical_wrong_prob_floor": 0.0,
330
+ "categorical_wrong_corpus_unigram_path": "",
331
+ "categorical_wrong_corpus_unigram_alpha": 1.0,
332
+ "categorical_wrong_basin_shared_prob": 0.0,
333
+ "categorical_wrong_unigram_shared_prob": 0.0,
334
+ "mask_mixture_original_prob": 0.0,
335
+ "mask_mixture_lowk_prob": 0.0,
336
+ "mask_mixture_lowcorrupt_prob": 0.0,
337
+ "mask_mixture_block_prob": 0.0,
338
+ "mask_mixture_all_prob": 1.0,
339
+ "mask_mixture_lowk_clean_tokens": "0",
340
+ "mask_mixture_lowcorrupt_tokens": "1,2,4,8,16,32,64",
341
+ "mask_mixture_block_tokens": "64,128",
342
+ "simplex_bridge_sampler": "dirichlet",
343
+ "logistic_normal_sigma_min": 0.1,
344
+ "logistic_normal_sigma_max": 1.0,
345
+ "logistic_normal_tau_min": 1.0,
346
+ "logistic_normal_tau_max": 1.0,
347
+ "torch_compile": false,
348
+ "compile_mode": "max-autotune",
349
+ "state_format": "prob",
350
+ "meanflow_weight": 0.0,
351
+ "rollout_train_prob": 0.5,
352
+ "rollout_train_steps": 3,
353
+ "rollout_train_infer_steps": 1,
354
+ "rollout_train_time_mode": "sampled_path",
355
+ "rollout_train_s_dist": "uniform",
356
+ "rollout_train_s_min_frac": 0.0,
357
+ "rollout_train_s_max_frac": 0.25,
358
+ "rollout_train_s_beta_alpha": 2.0,
359
+ "rollout_train_s_beta_beta": 6.0,
360
+ "rollout_train_temp": 1.45,
361
+ "rollout_train_max_gamma": 1.0,
362
+ "rollout_train_corrupt_only": true,
363
+ "rollout_train_samplewise": true,
364
+ "rollout_train_compute_always": false,
365
+ "rollout_train_sync_t": true,
366
+ "bridge_noise_init": "logistic_normal",
367
+ "noise_sigma": -1.0,
368
+ "allow_tf32": true,
369
+ "activation_checkpointing": false,
370
+ "activation_checkpoint_interval": 1,
371
+ "activation_checkpoint_scope": "block",
372
+ "ddp_static_graph": false,
373
+ "ddp_gradient_as_bucket_view": true,
374
+ "blocking_data_transfer": false,
375
+ "dataloader_prefetch_factor": 4,
376
+ "full_train_stats": false,
377
+ "tokenized_hf": false,
378
+ "tokenized_pad_token": "pad",
379
+ "elf_conditional_hf": false,
380
+ "record_pad_truncate": false,
381
+ "record_add_eos": false,
382
+ "record_add_special_tokens": false,
383
+ "record_pad_token": "pad",
384
+ "record_shuffle_buffer": 10000,
385
+ "wrap": true,
386
+ "wrap_mode": "stream",
387
+ "wrap_record_buffer_size": 200,
388
+ "owt_cached_chunks": true,
389
+ "owt_chunk_cache_dir": "/e2e-data/evad-tech-vla/wanghan58/data/small_benchmarks/langflow_2604_11748/openwebtext_lta_cached_chunks/gpt2_len1024_train8_compact_overfit",
390
+ "owt_chunk_cache_rebuild": false,
391
+ "owt_chunk_cache_write_batch": 4096,
392
+ "owt_exact_repeat_per_chunk": 64,
393
+ "online_chunk_shuffle": false,
394
+ "online_chunk_shuffle_buffer": 10000,
395
+ "openwebtext_split": "train_minus_100k",
396
+ "detokenizer": "auto",
397
+ "resolved_detokenizer": null,
398
+ "num_workers": 0,
399
+ "latest_every": 1000,
400
+ "resume_path": "runs/train8_ctx1024_path_p50_path3_unif0_0p25_outwdm1_ctx1024_path_tradeoff_sde_20260517_232950/latest.pt"
401
+ }
402
+ step=1100 epoch=1100/2000 epoch_step=1/1 micro_steps=1100 elapsed=21.5s lr=2.000000e-03 loss=2.6007 loss_recon=2.6007 loss_meanflow=0.0000 mean_model_t=0.2076 mean_corrupt_t=0.2076 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.5055 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.3519 corrupt_frac=1.0000 acc_corrupt=0.3519 loss_corrupt=2.6007 wrong_frac=0.7923 init_acc_corrupt=0.1354 acc_corrupt_t_0p0_0p2=0.1544 corrupt_frac_t_0p0_0p2=0.5602 acc_corrupt_t_0p2_0p4=0.5582 corrupt_frac_t_0p2_0p4=0.3545 acc_corrupt_t_0p4_0p6=0.7834 corrupt_frac_t_0p4_0p6=0.0781 acc_corrupt_t_0p6_0p8=0.8669 corrupt_frac_t_0p6_0p8=0.0123 out_w_norm=11.0101 out_g_norm=1.2415 acc_corrupt_t_0p8_1p0=0.9233 corrupt_frac_t_0p8_1p0=0.0078 loss_all=2.4227 init_gold_top10=0.3863 init_gold_top100=0.6320 rollout_applied_pos_frac=0.4688 init_acc_rollout_applied=0.1754 init_acc_rollout_kept=0.1215 logit_acc_rollout_applied=0.4194 logit_acc_rollout_kept=0.3710
403
+ step=1200 epoch=1200/2000 epoch_step=1/1 micro_steps=1200 elapsed=20.4s lr=2.000000e-03 loss=2.1852 loss_recon=2.1852 loss_meanflow=0.0000 mean_model_t=0.2097 mean_corrupt_t=0.2097 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.4901 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.4409 corrupt_frac=1.0000 acc_corrupt=0.4409 loss_corrupt=2.1852 wrong_frac=0.7903 init_acc_corrupt=0.1584 acc_corrupt_t_0p0_0p2=0.2066 corrupt_frac_t_0p0_0p2=0.5515 acc_corrupt_t_0p2_0p4=0.6935 corrupt_frac_t_0p2_0p4=0.3636 acc_corrupt_t_0p4_0p6=0.8762 corrupt_frac_t_0p4_0p6=0.0764 out_w_norm=11.2251 out_g_norm=1.4497 acc_corrupt_t_0p6_0p8=0.9189 corrupt_frac_t_0p6_0p8=0.0139 acc_corrupt_t_0p8_1p0=0.9497 corrupt_frac_t_0p8_1p0=0.0078 loss_all=2.1440 init_gold_top10=0.4227 init_gold_top100=0.6286 rollout_applied_pos_frac=0.4531 init_acc_rollout_applied=0.2945 init_acc_rollout_kept=0.1231 logit_acc_rollout_applied=0.5639 logit_acc_rollout_kept=0.4072
404
+ step=1300 epoch=1300/2000 epoch_step=1/1 micro_steps=1300 elapsed=20.6s lr=2.000000e-03 loss=1.8483 loss_recon=1.8483 loss_meanflow=0.0000 mean_model_t=0.2087 mean_corrupt_t=0.2087 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.4968 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.5183 corrupt_frac=1.0000 acc_corrupt=0.5183 loss_corrupt=1.8483 wrong_frac=0.7912 init_acc_corrupt=0.1841 acc_corrupt_t_0p0_0p2=0.2702 corrupt_frac_t_0p0_0p2=0.5584 acc_corrupt_t_0p2_0p4=0.8046 corrupt_frac_t_0p2_0p4=0.3549 acc_corrupt_t_0p4_0p6=0.9431 corrupt_frac_t_0p4_0p6=0.0783 acc_corrupt_t_0p6_0p8=0.9620 corrupt_frac_t_0p6_0p8=0.0124 out_w_norm=11.3898 out_g_norm=1.5154 acc_corrupt_t_0p8_1p0=0.9551 corrupt_frac_t_0p8_1p0=0.0078 loss_all=1.7327 init_gold_top10=0.4682 init_gold_top100=0.6342 rollout_applied_pos_frac=0.4609 init_acc_rollout_applied=0.3313 init_acc_rollout_kept=0.1181 logit_acc_rollout_applied=0.6339 logit_acc_rollout_kept=0.4920
405
+ step=1400 epoch=1400/2000 epoch_step=1/1 micro_steps=1400 elapsed=20.6s lr=2.000000e-03 loss=1.5286 loss_recon=1.5286 loss_meanflow=0.0000 mean_model_t=0.2086 mean_corrupt_t=0.2086 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.4985 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.5926 corrupt_frac=1.0000 acc_corrupt=0.5926 loss_corrupt=1.5286 wrong_frac=0.7913 init_acc_corrupt=0.2132 acc_corrupt_t_0p0_0p2=0.3484 corrupt_frac_t_0p0_0p2=0.5577 acc_corrupt_t_0p2_0p4=0.8821 corrupt_frac_t_0p2_0p4=0.3548 acc_corrupt_t_0p4_0p6=0.9749 corrupt_frac_t_0p4_0p6=0.0790 acc_corrupt_t_0p6_0p8=0.9795 corrupt_frac_t_0p6_0p8=0.0131 out_w_norm=11.5103 out_g_norm=1.5472 acc_corrupt_t_0p8_1p0=0.9922 corrupt_frac_t_0p8_1p0=0.0078 loss_all=1.5544 init_gold_top10=0.4753 init_gold_top100=0.6428 rollout_applied_pos_frac=0.4844 init_acc_rollout_applied=0.3921 init_acc_rollout_kept=0.0884 logit_acc_rollout_applied=0.6677 logit_acc_rollout_kept=0.4791
406
+ step=1500 epoch=1500/2000 epoch_step=1/1 micro_steps=1500 elapsed=20.7s lr=2.000000e-03 loss=1.2701 loss_recon=1.2701 loss_meanflow=0.0000 mean_model_t=0.2100 mean_corrupt_t=0.2100 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.5038 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.6534 corrupt_frac=1.0000 acc_corrupt=0.6534 loss_corrupt=1.2701 wrong_frac=0.7902 init_acc_corrupt=0.2420 acc_corrupt_t_0p0_0p2=0.4187 corrupt_frac_t_0p0_0p2=0.5514 acc_corrupt_t_0p2_0p4=0.9310 corrupt_frac_t_0p2_0p4=0.3640 acc_corrupt_t_0p4_0p6=0.9888 corrupt_frac_t_0p4_0p6=0.0755 acc_corrupt_t_0p6_0p8=0.9901 corrupt_frac_t_0p6_0p8=0.0128 out_w_norm=11.6102 out_g_norm=1.4650 loss_all=1.1326 init_gold_top10=0.5749 init_gold_top100=0.6964 rollout_applied_pos_frac=0.5547 init_acc_rollout_applied=0.4505 init_acc_rollout_kept=0.1158 logit_acc_rollout_applied=0.8114 logit_acc_rollout_kept=0.6025
407
+ step=1600 epoch=1600/2000 epoch_step=1/1 micro_steps=1600 elapsed=20.6s lr=2.000000e-03 loss=1.1130 loss_recon=1.1130 loss_meanflow=0.0000 mean_model_t=0.2076 mean_corrupt_t=0.2076 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.4995 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.6901 corrupt_frac=1.0000 acc_corrupt=0.6901 loss_corrupt=1.1130 wrong_frac=0.7925 init_acc_corrupt=0.2551 acc_corrupt_t_0p0_0p2=0.4756 corrupt_frac_t_0p0_0p2=0.5603 acc_corrupt_t_0p2_0p4=0.9562 corrupt_frac_t_0p2_0p4=0.3559 acc_corrupt_t_0p4_0p6=0.9942 corrupt_frac_t_0p4_0p6=0.0753 acc_corrupt_t_0p6_0p8=0.9937 corrupt_frac_t_0p6_0p8=0.0121 out_w_norm=11.6878 out_g_norm=1.3594 acc_corrupt_t_0p8_1p0=0.9920 corrupt_frac_t_0p8_1p0=0.0078 loss_all=1.0690 init_gold_top10=0.5011 init_gold_top100=0.6227 rollout_applied_pos_frac=0.4453 init_acc_rollout_applied=0.5025 init_acc_rollout_kept=0.1132 logit_acc_rollout_applied=0.7692 logit_acc_rollout_kept=0.6391
408
+ step=1700 epoch=1700/2000 epoch_step=1/1 micro_steps=1700 elapsed=20.5s lr=2.000000e-03 loss=0.9917 loss_recon=0.9917 loss_meanflow=0.0000 mean_model_t=0.2077 mean_corrupt_t=0.2077 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.4981 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.7188 corrupt_frac=1.0000 acc_corrupt=0.7188 loss_corrupt=0.9917 wrong_frac=0.7923 init_acc_corrupt=0.2721 acc_corrupt_t_0p0_0p2=0.5160 corrupt_frac_t_0p0_0p2=0.5594 acc_corrupt_t_0p2_0p4=0.9715 corrupt_frac_t_0p2_0p4=0.3580 acc_corrupt_t_0p4_0p6=0.9969 corrupt_frac_t_0p4_0p6=0.0748 out_w_norm=11.7412 out_g_norm=1.3206 acc_corrupt_t_0p6_0p8=0.9963 corrupt_frac_t_0p6_0p8=0.0123 acc_corrupt_t_0p8_1p0=0.9922 corrupt_frac_t_0p8_1p0=0.0078 loss_all=0.9401 init_gold_top10=0.5248 init_gold_top100=0.6422 rollout_applied_pos_frac=0.4844 init_acc_rollout_applied=0.4452 init_acc_rollout_kept=0.0787 logit_acc_rollout_applied=0.8077 logit_acc_rollout_kept=0.6293
409
+ step=1800 epoch=1800/2000 epoch_step=1/1 micro_steps=1800 elapsed=20.6s lr=2.000000e-03 loss=0.8416 loss_recon=0.8416 loss_meanflow=0.0000 mean_model_t=0.2107 mean_corrupt_t=0.2107 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.5004 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.7575 corrupt_frac=1.0000 acc_corrupt=0.7575 loss_corrupt=0.8416 wrong_frac=0.7893 init_acc_corrupt=0.2866 acc_corrupt_t_0p0_0p2=0.5724 corrupt_frac_t_0p0_0p2=0.5520 acc_corrupt_t_0p2_0p4=0.9825 corrupt_frac_t_0p2_0p4=0.3615 acc_corrupt_t_0p4_0p6=0.9979 corrupt_frac_t_0p4_0p6=0.0795 acc_corrupt_t_0p6_0p8=0.9968 corrupt_frac_t_0p6_0p8=0.0119 out_w_norm=11.7821 out_g_norm=1.3048 acc_corrupt_t_0p8_1p0=0.9902 corrupt_frac_t_0p8_1p0=0.0078 loss_all=0.7426 init_gold_top10=0.5374 init_gold_top100=0.6283 rollout_applied_pos_frac=0.4531 init_acc_rollout_applied=0.4870 init_acc_rollout_kept=0.1156 logit_acc_rollout_applied=0.8416 logit_acc_rollout_kept=0.7231
410
+ step=1900 epoch=1900/2000 epoch_step=1/1 micro_steps=1900 elapsed=20.6s lr=2.000000e-03 loss=0.7291 loss_recon=0.7291 loss_meanflow=0.0000 mean_model_t=0.2116 mean_corrupt_t=0.2116 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.4984 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.7818 corrupt_frac=1.0000 acc_corrupt=0.7818 loss_corrupt=0.7291 wrong_frac=0.7884 init_acc_corrupt=0.2983 acc_corrupt_t_0p0_0p2=0.6120 corrupt_frac_t_0p0_0p2=0.5516 acc_corrupt_t_0p2_0p4=0.9889 corrupt_frac_t_0p2_0p4=0.3591 acc_corrupt_t_0p4_0p6=0.9984 corrupt_frac_t_0p4_0p6=0.0791 out_w_norm=11.7959 out_g_norm=1.1896 acc_corrupt_t_0p6_0p8=0.9976 corrupt_frac_t_0p6_0p8=0.0136 acc_corrupt_t_0p8_1p0=0.9967 corrupt_frac_t_0p8_1p0=0.0078 loss_all=0.6827 init_gold_top10=0.6266 init_gold_top100=0.7165 rollout_applied_pos_frac=0.5781 init_acc_rollout_applied=0.4913 init_acc_rollout_kept=0.1357 logit_acc_rollout_applied=0.8069 logit_acc_rollout_kept=0.7395
411
+ step=2000 epoch=2000/2000 epoch_step=1/1 micro_steps=2000 elapsed=20.7s lr=2.000000e-03 loss=0.6577 loss_recon=0.6577 loss_meanflow=0.0000 mean_model_t=0.2093 mean_corrupt_t=0.2093 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.5052 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.7984 corrupt_frac=1.0000 acc_corrupt=0.7984 loss_corrupt=0.6577 wrong_frac=0.7908 init_acc_corrupt=0.3061 acc_corrupt_t_0p0_0p2=0.6442 corrupt_frac_t_0p0_0p2=0.5602 acc_corrupt_t_0p2_0p4=0.9938 corrupt_frac_t_0p2_0p4=0.3545 acc_corrupt_t_0p4_0p6=0.9989 corrupt_frac_t_0p4_0p6=0.0764 acc_corrupt_t_0p6_0p8=0.9981 corrupt_frac_t_0p6_0p8=0.0119 out_w_norm=11.8116 out_g_norm=1.0515 acc_corrupt_t_0p8_1p0=0.9990 corrupt_frac_t_0p8_1p0=0.0078 loss_all=0.5439 init_gold_top10=0.6001 init_gold_top100=0.6846 rollout_applied_pos_frac=0.5312 init_acc_rollout_applied=0.5206 init_acc_rollout_kept=0.1305 logit_acc_rollout_applied=0.8481 logit_acc_rollout_kept=0.8096
412
+ NCCL version 2.25.1+cuda12.8
413
+ resumed_from=runs/train8_ctx1024_path_p50_path3_unif0_0p25_outwdm1_ctx1024_path_tradeoff_sde_20260517_232950/latest.pt start_step=2001
414
+ {
415
+ "device": "cuda:0",
416
+ "rank": 0,
417
+ "world_size": 4,
418
+ "samples": "owt_cached_chunks:8",
419
+ "vocab_size": 2664,
420
+ "tokenizer_vocab_size": 50257,
421
+ "save_dir": "runs/train8_ctx1024_path_p50_path3_unif0_0p25_outwdm1_ctx1024_path_tradeoff_sde_20260517_232950",
422
+ "batch_size": 128,
423
+ "grad_accum": 1,
424
+ "effective_batch_size": 512,
425
+ "global_batch_size": 512,
426
+ "lr_schedule": "constant_warmup",
427
+ "optimizer": "muon",
428
+ "epochs": 0.0,
429
+ "steps_per_epoch": 1,
430
+ "total_steps": 3000,
431
+ "warmup_steps": 10,
432
+ "warmup_epochs": -1.0,
433
+ "min_lr": 0.0,
434
+ "weight_decay": 0.1,
435
+ "output_weight_decay": -1.0,
436
+ "adamw_param_groups": "nanogpt",
437
+ "adam_beta1": 0.9,
438
+ "adam_beta2": 0.95,
439
+ "adam_eps": 1e-08,
440
+ "muon_impl": "legacy",
441
+ "muon_momentum": 0.95,
442
+ "muon_ns_steps": 5,
443
+ "muon_update_scale": 1.0,
444
+ "muon_nesterov": false,
445
+ "muon_width_scale": false,
446
+ "muon_grouping": "legacy_dim_ge_2",
447
+ "muon_param_count": 2616320,
448
+ "muon_adam_param_count": 8192,
449
+ "muon_param_names": [
450
+ "vocab_embed.embedding",
451
+ "sigma_map.net.0.weight",
452
+ "sigma_map.net.2.weight",
453
+ "blocks.0.attn_qkv.weight",
454
+ "blocks.0.attn_out.weight",
455
+ "blocks.0.mlp.0.weight",
456
+ "blocks.0.mlp.2.weight",
457
+ "blocks.0.adaLN_modulation.weight",
458
+ "blocks.1.attn_qkv.weight",
459
+ "blocks.1.attn_out.weight",
460
+ "blocks.1.mlp.0.weight",
461
+ "blocks.1.mlp.2.weight",
462
+ "blocks.1.adaLN_modulation.weight",
463
+ "blocks.2.attn_qkv.weight",
464
+ "blocks.2.attn_out.weight",
465
+ "blocks.2.mlp.0.weight",
466
+ "blocks.2.mlp.2.weight",
467
+ "blocks.2.adaLN_modulation.weight",
468
+ "output_layer.linear.weight",
469
+ "output_layer.adaLN_modulation.weight"
470
+ ],
471
+ "muon_adam_param_names": [
472
+ "sigma_map.net.0.bias",
473
+ "sigma_map.net.2.bias",
474
+ "blocks.0.norm1.weight",
475
+ "blocks.0.norm2.weight",
476
+ "blocks.0.mlp.0.bias",
477
+ "blocks.0.mlp.2.bias",
478
+ "blocks.0.adaLN_modulation.bias",
479
+ "blocks.1.norm1.weight",
480
+ "blocks.1.norm2.weight",
481
+ "blocks.1.mlp.0.bias",
482
+ "blocks.1.mlp.2.bias",
483
+ "blocks.1.adaLN_modulation.bias",
484
+ "blocks.2.norm1.weight",
485
+ "blocks.2.norm2.weight",
486
+ "blocks.2.mlp.0.bias",
487
+ "blocks.2.mlp.2.bias",
488
+ "blocks.2.adaLN_modulation.bias",
489
+ "output_layer.norm_final.weight",
490
+ "output_layer.adaLN_modulation.bias"
491
+ ],
492
+ "muon_effective_nesterov": false,
493
+ "muon_effective_width_scale": false,
494
+ "muon_effective_weight_decay": 0.1,
495
+ "muon_adam_fallback_nesterov": false,
496
+ "muon_adam_fallback_weight_decay": 0.1,
497
+ "ema_decay": 0.9999,
498
+ "ema_start_step": 0,
499
+ "model_type": "ddit",
500
+ "ddit_mlp_type": "gelu",
501
+ "elf_num_time_tokens": 4,
502
+ "elf_num_model_mode_tokens": 0,
503
+ "qk_norm": true,
504
+ "output_bias": false,
505
+ "output_init_std": -1.0,
506
+ "norm_type": "rmsnorm",
507
+ "target_loss": "hard_ce",
508
+ "linear_soft_target_power": 1.0,
509
+ "linear_soft_target_min_conf": 0.0,
510
+ "linear_soft_target_max_conf": 1.0,
511
+ "t_sampling_mode": "logit_normal",
512
+ "t_sampling_power": 1.0,
513
+ "t_sampling_eps": 0.0001,
514
+ "t_sampling_logit_mean": -1.5,
515
+ "t_sampling_logit_std": 0.8,
516
+ "dual_t": true,
517
+ "corrupt_t_mode": "same",
518
+ "corrupt_min_t": 0.0,
519
+ "corrupt_max_t": 1.0,
520
+ "prefix_block_prob": 0.0,
521
+ "prefix_block_len": 128,
522
+ "mask_ratio_floor_schedule": "none",
523
+ "dirichlet_endpoint_mode": "categorical_dual_t",
524
+ "dirichlet_semantic_t_mode": "same",
525
+ "dirichlet_semantic_t_value": 0.0,
526
+ "dirichlet_semantic_t_curve": "linear",
527
+ "dirichlet_semantic_t_power": 1.0,
528
+ "endpoint_sequence_random_prob_alpha": 0.0,
529
+ "categorical_wrong_from_full_vocab": true,
530
+ "categorical_wrong_from_batch_valid_tokens": false,
531
+ "categorical_wrong_basin_token_ids": "",
532
+ "categorical_wrong_basin_prob": 0.0,
533
+ "categorical_wrong_unigram_prob": 0.0,
534
+ "categorical_wrong_uniform_prob": 0.0,
535
+ "categorical_wrong_prob_floor": 0.0,
536
+ "categorical_wrong_corpus_unigram_path": "",
537
+ "categorical_wrong_corpus_unigram_alpha": 1.0,
538
+ "categorical_wrong_basin_shared_prob": 0.0,
539
+ "categorical_wrong_unigram_shared_prob": 0.0,
540
+ "mask_mixture_original_prob": 0.0,
541
+ "mask_mixture_lowk_prob": 0.0,
542
+ "mask_mixture_lowcorrupt_prob": 0.0,
543
+ "mask_mixture_block_prob": 0.0,
544
+ "mask_mixture_all_prob": 1.0,
545
+ "mask_mixture_lowk_clean_tokens": "0",
546
+ "mask_mixture_lowcorrupt_tokens": "1,2,4,8,16,32,64",
547
+ "mask_mixture_block_tokens": "64,128",
548
+ "simplex_bridge_sampler": "dirichlet",
549
+ "logistic_normal_sigma_min": 0.1,
550
+ "logistic_normal_sigma_max": 1.0,
551
+ "logistic_normal_tau_min": 1.0,
552
+ "logistic_normal_tau_max": 1.0,
553
+ "torch_compile": false,
554
+ "compile_mode": "max-autotune",
555
+ "state_format": "prob",
556
+ "meanflow_weight": 0.0,
557
+ "rollout_train_prob": 0.5,
558
+ "rollout_train_steps": 3,
559
+ "rollout_train_infer_steps": 1,
560
+ "rollout_train_time_mode": "sampled_path",
561
+ "rollout_train_s_dist": "uniform",
562
+ "rollout_train_s_min_frac": 0.0,
563
+ "rollout_train_s_max_frac": 0.25,
564
+ "rollout_train_s_beta_alpha": 2.0,
565
+ "rollout_train_s_beta_beta": 6.0,
566
+ "rollout_train_temp": 1.45,
567
+ "rollout_train_max_gamma": 1.0,
568
+ "rollout_train_corrupt_only": true,
569
+ "rollout_train_samplewise": true,
570
+ "rollout_train_compute_always": false,
571
+ "rollout_train_sync_t": true,
572
+ "bridge_noise_init": "logistic_normal",
573
+ "noise_sigma": -1.0,
574
+ "allow_tf32": true,
575
+ "activation_checkpointing": false,
576
+ "activation_checkpoint_interval": 1,
577
+ "activation_checkpoint_scope": "block",
578
+ "ddp_static_graph": false,
579
+ "ddp_gradient_as_bucket_view": true,
580
+ "blocking_data_transfer": false,
581
+ "dataloader_prefetch_factor": 4,
582
+ "full_train_stats": false,
583
+ "tokenized_hf": false,
584
+ "tokenized_pad_token": "pad",
585
+ "elf_conditional_hf": false,
586
+ "record_pad_truncate": false,
587
+ "record_add_eos": false,
588
+ "record_add_special_tokens": false,
589
+ "record_pad_token": "pad",
590
+ "record_shuffle_buffer": 10000,
591
+ "wrap": true,
592
+ "wrap_mode": "stream",
593
+ "wrap_record_buffer_size": 200,
594
+ "owt_cached_chunks": true,
595
+ "owt_chunk_cache_dir": "/e2e-data/evad-tech-vla/wanghan58/data/small_benchmarks/langflow_2604_11748/openwebtext_lta_cached_chunks/gpt2_len1024_train8_compact_overfit",
596
+ "owt_chunk_cache_rebuild": false,
597
+ "owt_chunk_cache_write_batch": 4096,
598
+ "owt_exact_repeat_per_chunk": 64,
599
+ "online_chunk_shuffle": false,
600
+ "online_chunk_shuffle_buffer": 10000,
601
+ "openwebtext_split": "train_minus_100k",
602
+ "detokenizer": "auto",
603
+ "resolved_detokenizer": null,
604
+ "num_workers": 0,
605
+ "latest_every": 1000,
606
+ "resume_path": "runs/train8_ctx1024_path_p50_path3_unif0_0p25_outwdm1_ctx1024_path_tradeoff_sde_20260517_232950/latest.pt"
607
+ }
608
+ step=2100 epoch=2100/3000 epoch_step=1/1 micro_steps=2100 elapsed=21.6s lr=2.000000e-03 loss=0.6116 loss_recon=0.6116 loss_meanflow=0.0000 mean_model_t=0.2076 mean_corrupt_t=0.2076 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.5055 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.8095 corrupt_frac=1.0000 acc_corrupt=0.8095 loss_corrupt=0.6116 wrong_frac=0.7923 init_acc_corrupt=0.3060 acc_corrupt_t_0p0_0p2=0.6630 corrupt_frac_t_0p0_0p2=0.5602 acc_corrupt_t_0p2_0p4=0.9954 corrupt_frac_t_0p2_0p4=0.3545 acc_corrupt_t_0p4_0p6=0.9991 corrupt_frac_t_0p4_0p6=0.0781 acc_corrupt_t_0p6_0p8=0.9986 corrupt_frac_t_0p6_0p8=0.0123 out_w_norm=11.8015 out_g_norm=1.0953 acc_corrupt_t_0p8_1p0=0.9941 corrupt_frac_t_0p8_1p0=0.0078 loss_all=0.5711 init_gold_top10=0.5506 init_gold_top100=0.6400 rollout_applied_pos_frac=0.4688 init_acc_rollout_applied=0.4890 init_acc_rollout_kept=0.1215 logit_acc_rollout_applied=0.8333 logit_acc_rollout_kept=0.8067
609
+ step=2200 epoch=2200/3000 epoch_step=1/1 micro_steps=2200 elapsed=20.6s lr=2.000000e-03 loss=0.5650 loss_recon=0.5650 loss_meanflow=0.0000 mean_model_t=0.2097 mean_corrupt_t=0.2097 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.4901 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.8231 corrupt_frac=1.0000 acc_corrupt=0.8231 loss_corrupt=0.5650 wrong_frac=0.7903 init_acc_corrupt=0.3110 acc_corrupt_t_0p0_0p2=0.6814 corrupt_frac_t_0p0_0p2=0.5515 acc_corrupt_t_0p2_0p4=0.9970 corrupt_frac_t_0p2_0p4=0.3636 acc_corrupt_t_0p4_0p6=0.9993 corrupt_frac_t_0p4_0p6=0.0764 out_w_norm=11.7909 out_g_norm=1.0084 acc_corrupt_t_0p6_0p8=0.9983 corrupt_frac_t_0p6_0p8=0.0139 acc_corrupt_t_0p8_1p0=0.9990 corrupt_frac_t_0p8_1p0=0.0078 loss_all=0.7012 init_gold_top10=0.5394 init_gold_top100=0.6326 rollout_applied_pos_frac=0.4531 init_acc_rollout_applied=0.5477 init_acc_rollout_kept=0.1231 logit_acc_rollout_applied=0.8061 logit_acc_rollout_kept=0.7533
610
+ step=2300 epoch=2300/3000 epoch_step=1/1 micro_steps=2300 elapsed=20.7s lr=2.000000e-03 loss=0.5565 loss_recon=0.5565 loss_meanflow=0.0000 mean_model_t=0.2087 mean_corrupt_t=0.2087 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.4968 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.8242 corrupt_frac=1.0000 acc_corrupt=0.8242 loss_corrupt=0.5565 wrong_frac=0.7912 init_acc_corrupt=0.3154 acc_corrupt_t_0p0_0p2=0.6866 corrupt_frac_t_0p0_0p2=0.5584 acc_corrupt_t_0p2_0p4=0.9980 corrupt_frac_t_0p2_0p4=0.3549 acc_corrupt_t_0p4_0p6=0.9993 corrupt_frac_t_0p4_0p6=0.0783 acc_corrupt_t_0p6_0p8=0.9989 corrupt_frac_t_0p6_0p8=0.0124 out_w_norm=11.7780 out_g_norm=0.9346 acc_corrupt_t_0p8_1p0=0.9883 corrupt_frac_t_0p8_1p0=0.0078 loss_all=0.4615 init_gold_top10=0.5515 init_gold_top100=0.6347 rollout_applied_pos_frac=0.4609 init_acc_rollout_applied=0.6219 init_acc_rollout_kept=0.1181 logit_acc_rollout_applied=0.8899 logit_acc_rollout_kept=0.8108
611
+ step=2400 epoch=2400/3000 epoch_step=1/1 micro_steps=2400 elapsed=20.8s lr=2.000000e-03 loss=0.5106 loss_recon=0.5106 loss_meanflow=0.0000 mean_model_t=0.2086 mean_corrupt_t=0.2086 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.4985 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.8376 corrupt_frac=1.0000 acc_corrupt=0.8376 loss_corrupt=0.5106 wrong_frac=0.7913 init_acc_corrupt=0.3201 acc_corrupt_t_0p0_0p2=0.7097 corrupt_frac_t_0p0_0p2=0.5577 acc_corrupt_t_0p2_0p4=0.9987 corrupt_frac_t_0p2_0p4=0.3548 acc_corrupt_t_0p4_0p6=0.9996 corrupt_frac_t_0p4_0p6=0.0790 acc_corrupt_t_0p6_0p8=0.9983 corrupt_frac_t_0p6_0p8=0.0131 out_w_norm=11.7673 out_g_norm=0.8512 acc_corrupt_t_0p8_1p0=1.0000 corrupt_frac_t_0p8_1p0=0.0078 loss_all=0.5876 init_gold_top10=0.5469 init_gold_top100=0.6429 rollout_applied_pos_frac=0.4844 init_acc_rollout_applied=0.5571 init_acc_rollout_kept=0.0884 logit_acc_rollout_applied=0.8035 logit_acc_rollout_kept=0.8121
612
+ step=2500 epoch=2500/3000 epoch_step=1/1 micro_steps=2500 elapsed=20.9s lr=2.000000e-03 loss=0.4941 loss_recon=0.4941 loss_meanflow=0.0000 mean_model_t=0.2100 mean_corrupt_t=0.2100 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.5038 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.8428 corrupt_frac=1.0000 acc_corrupt=0.8428 loss_corrupt=0.4941 wrong_frac=0.7902 init_acc_corrupt=0.3264 acc_corrupt_t_0p0_0p2=0.7156 corrupt_frac_t_0p0_0p2=0.5514 acc_corrupt_t_0p2_0p4=0.9990 corrupt_frac_t_0p2_0p4=0.3640 acc_corrupt_t_0p4_0p6=0.9996 corrupt_frac_t_0p4_0p6=0.0755 acc_corrupt_t_0p6_0p8=0.9992 corrupt_frac_t_0p6_0p8=0.0128 out_w_norm=11.7640 out_g_norm=0.7915 loss_all=0.5320 init_gold_top10=0.6219 init_gold_top100=0.6965 rollout_applied_pos_frac=0.5547 init_acc_rollout_applied=0.5813 init_acc_rollout_kept=0.1158 logit_acc_rollout_applied=0.8649 logit_acc_rollout_kept=0.8050
613
+ step=2600 epoch=2600/3000 epoch_step=1/1 micro_steps=2600 elapsed=20.8s lr=2.000000e-03 loss=0.4740 loss_recon=0.4740 loss_meanflow=0.0000 mean_model_t=0.2076 mean_corrupt_t=0.2076 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.4995 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.8466 corrupt_frac=1.0000 acc_corrupt=0.8466 loss_corrupt=0.4740 wrong_frac=0.7925 init_acc_corrupt=0.3236 acc_corrupt_t_0p0_0p2=0.7268 corrupt_frac_t_0p0_0p2=0.5603 acc_corrupt_t_0p2_0p4=0.9992 corrupt_frac_t_0p2_0p4=0.3559 acc_corrupt_t_0p4_0p6=0.9997 corrupt_frac_t_0p4_0p6=0.0753 acc_corrupt_t_0p6_0p8=0.9990 corrupt_frac_t_0p6_0p8=0.0121 out_w_norm=11.7541 out_g_norm=0.7300 acc_corrupt_t_0p8_1p0=0.9988 corrupt_frac_t_0p8_1p0=0.0078 loss_all=0.5310 init_gold_top10=0.5434 init_gold_top100=0.6227 rollout_applied_pos_frac=0.4453 init_acc_rollout_applied=0.5719 init_acc_rollout_kept=0.1132 logit_acc_rollout_applied=0.8330 logit_acc_rollout_kept=0.8179
614
+ step=2700 epoch=2700/3000 epoch_step=1/1 micro_steps=2700 elapsed=20.7s lr=2.000000e-03 loss=0.4484 loss_recon=0.4484 loss_meanflow=0.0000 mean_model_t=0.2077 mean_corrupt_t=0.2077 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.4981 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.8557 corrupt_frac=1.0000 acc_corrupt=0.8557 loss_corrupt=0.4484 wrong_frac=0.7923 init_acc_corrupt=0.3287 acc_corrupt_t_0p0_0p2=0.7426 corrupt_frac_t_0p0_0p2=0.5594 acc_corrupt_t_0p2_0p4=0.9992 corrupt_frac_t_0p2_0p4=0.3580 acc_corrupt_t_0p4_0p6=0.9995 corrupt_frac_t_0p4_0p6=0.0748 out_w_norm=11.7655 out_g_norm=0.7211 acc_corrupt_t_0p6_0p8=0.9993 corrupt_frac_t_0p6_0p8=0.0123 acc_corrupt_t_0p8_1p0=0.9932 corrupt_frac_t_0p8_1p0=0.0078 loss_all=0.3845 init_gold_top10=0.5591 init_gold_top100=0.6422 rollout_applied_pos_frac=0.4844 init_acc_rollout_applied=0.5483 init_acc_rollout_kept=0.0787 logit_acc_rollout_applied=0.8704 logit_acc_rollout_kept=0.8799
615
+ step=2800 epoch=2800/3000 epoch_step=1/1 micro_steps=2800 elapsed=20.8s lr=2.000000e-03 loss=0.4162 loss_recon=0.4162 loss_meanflow=0.0000 mean_model_t=0.2107 mean_corrupt_t=0.2107 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.5004 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.8651 corrupt_frac=1.0000 acc_corrupt=0.8651 loss_corrupt=0.4162 wrong_frac=0.7893 init_acc_corrupt=0.3343 acc_corrupt_t_0p0_0p2=0.7560 corrupt_frac_t_0p0_0p2=0.5520 acc_corrupt_t_0p2_0p4=0.9996 corrupt_frac_t_0p2_0p4=0.3615 acc_corrupt_t_0p4_0p6=0.9996 corrupt_frac_t_0p4_0p6=0.0795 acc_corrupt_t_0p6_0p8=0.9993 corrupt_frac_t_0p6_0p8=0.0119 out_w_norm=11.7847 out_g_norm=0.6821 acc_corrupt_t_0p8_1p0=0.9987 corrupt_frac_t_0p8_1p0=0.0078 loss_all=0.3874 init_gold_top10=0.5517 init_gold_top100=0.6283 rollout_applied_pos_frac=0.4531 init_acc_rollout_applied=0.5704 init_acc_rollout_kept=0.1156 logit_acc_rollout_applied=0.8763 logit_acc_rollout_kept=0.8778
616
+ step=2900 epoch=2900/3000 epoch_step=1/1 micro_steps=2900 elapsed=20.7s lr=2.000000e-03 loss=0.4102 loss_recon=0.4102 loss_meanflow=0.0000 mean_model_t=0.2116 mean_corrupt_t=0.2116 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.4984 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.8688 corrupt_frac=1.0000 acc_corrupt=0.8688 loss_corrupt=0.4102 wrong_frac=0.7884 init_acc_corrupt=0.3386 acc_corrupt_t_0p0_0p2=0.7625 corrupt_frac_t_0p0_0p2=0.5516 acc_corrupt_t_0p2_0p4=0.9995 corrupt_frac_t_0p2_0p4=0.3591 acc_corrupt_t_0p4_0p6=0.9997 corrupt_frac_t_0p4_0p6=0.0791 out_w_norm=11.7949 out_g_norm=0.6817 acc_corrupt_t_0p6_0p8=0.9994 corrupt_frac_t_0p6_0p8=0.0136 acc_corrupt_t_0p8_1p0=0.9990 corrupt_frac_t_0p8_1p0=0.0078 loss_all=0.4163 init_gold_top10=0.6542 init_gold_top100=0.7165 rollout_applied_pos_frac=0.5781 init_acc_rollout_applied=0.5628 init_acc_rollout_kept=0.1357 logit_acc_rollout_applied=0.8776 logit_acc_rollout_kept=0.8271
617
+ step=3000 epoch=3000/3000 epoch_step=1/1 micro_steps=3000 elapsed=20.9s lr=2.000000e-03 loss=0.4047 loss_recon=0.4047 loss_meanflow=0.0000 mean_model_t=0.2093 mean_corrupt_t=0.2093 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.5052 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.8704 corrupt_frac=1.0000 acc_corrupt=0.8704 loss_corrupt=0.4047 wrong_frac=0.7908 init_acc_corrupt=0.3397 acc_corrupt_t_0p0_0p2=0.7689 corrupt_frac_t_0p0_0p2=0.5602 acc_corrupt_t_0p2_0p4=0.9996 corrupt_frac_t_0p2_0p4=0.3545 acc_corrupt_t_0p4_0p6=0.9998 corrupt_frac_t_0p4_0p6=0.0764 acc_corrupt_t_0p6_0p8=0.9991 corrupt_frac_t_0p6_0p8=0.0119 out_w_norm=11.8225 out_g_norm=0.6293 acc_corrupt_t_0p8_1p0=1.0000 corrupt_frac_t_0p8_1p0=0.0078 loss_all=0.3669 init_gold_top10=0.6152 init_gold_top100=0.6846 rollout_applied_pos_frac=0.5312 init_acc_rollout_applied=0.5702 init_acc_rollout_kept=0.1305 logit_acc_rollout_applied=0.8638 logit_acc_rollout_kept=0.8817
LTA_openwebtext_dualt/logs/softendpoint_mn_pilot_4gpu/train8_ctx1024_randk_p50_path3_rand1_3_unif0_0p25_ctx1024_uniformt_temp1_randk_20260518_010217.log ADDED
@@ -0,0 +1,827 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ NCCL version 2.25.1+cuda12.8
2
+ {
3
+ "device": "cuda:0",
4
+ "rank": 0,
5
+ "world_size": 4,
6
+ "samples": "owt_cached_chunks:8",
7
+ "vocab_size": 2664,
8
+ "tokenizer_vocab_size": 50257,
9
+ "save_dir": "runs/train8_ctx1024_randk_p50_path3_rand1_3_unif0_0p25_ctx1024_uniformt_temp1_randk_20260518_010217",
10
+ "batch_size": 128,
11
+ "grad_accum": 1,
12
+ "effective_batch_size": 512,
13
+ "global_batch_size": 512,
14
+ "lr_schedule": "constant_warmup",
15
+ "optimizer": "muon",
16
+ "epochs": 0.0,
17
+ "steps_per_epoch": 1,
18
+ "total_steps": 1000,
19
+ "warmup_steps": 10,
20
+ "warmup_epochs": -1.0,
21
+ "min_lr": 0.0,
22
+ "weight_decay": 0.1,
23
+ "output_weight_decay": -1.0,
24
+ "adamw_param_groups": "nanogpt",
25
+ "adam_beta1": 0.9,
26
+ "adam_beta2": 0.95,
27
+ "adam_eps": 1e-08,
28
+ "muon_impl": "legacy",
29
+ "muon_momentum": 0.95,
30
+ "muon_ns_steps": 5,
31
+ "muon_update_scale": 1.0,
32
+ "muon_nesterov": false,
33
+ "muon_width_scale": false,
34
+ "muon_grouping": "legacy_dim_ge_2",
35
+ "muon_param_count": 2616320,
36
+ "muon_adam_param_count": 8192,
37
+ "muon_param_names": [
38
+ "vocab_embed.embedding",
39
+ "sigma_map.net.0.weight",
40
+ "sigma_map.net.2.weight",
41
+ "blocks.0.attn_qkv.weight",
42
+ "blocks.0.attn_out.weight",
43
+ "blocks.0.mlp.0.weight",
44
+ "blocks.0.mlp.2.weight",
45
+ "blocks.0.adaLN_modulation.weight",
46
+ "blocks.1.attn_qkv.weight",
47
+ "blocks.1.attn_out.weight",
48
+ "blocks.1.mlp.0.weight",
49
+ "blocks.1.mlp.2.weight",
50
+ "blocks.1.adaLN_modulation.weight",
51
+ "blocks.2.attn_qkv.weight",
52
+ "blocks.2.attn_out.weight",
53
+ "blocks.2.mlp.0.weight",
54
+ "blocks.2.mlp.2.weight",
55
+ "blocks.2.adaLN_modulation.weight",
56
+ "output_layer.linear.weight",
57
+ "output_layer.adaLN_modulation.weight"
58
+ ],
59
+ "muon_adam_param_names": [
60
+ "sigma_map.net.0.bias",
61
+ "sigma_map.net.2.bias",
62
+ "blocks.0.norm1.weight",
63
+ "blocks.0.norm2.weight",
64
+ "blocks.0.mlp.0.bias",
65
+ "blocks.0.mlp.2.bias",
66
+ "blocks.0.adaLN_modulation.bias",
67
+ "blocks.1.norm1.weight",
68
+ "blocks.1.norm2.weight",
69
+ "blocks.1.mlp.0.bias",
70
+ "blocks.1.mlp.2.bias",
71
+ "blocks.1.adaLN_modulation.bias",
72
+ "blocks.2.norm1.weight",
73
+ "blocks.2.norm2.weight",
74
+ "blocks.2.mlp.0.bias",
75
+ "blocks.2.mlp.2.bias",
76
+ "blocks.2.adaLN_modulation.bias",
77
+ "output_layer.norm_final.weight",
78
+ "output_layer.adaLN_modulation.bias"
79
+ ],
80
+ "muon_effective_nesterov": false,
81
+ "muon_effective_width_scale": false,
82
+ "muon_effective_weight_decay": 0.1,
83
+ "muon_adam_fallback_nesterov": false,
84
+ "muon_adam_fallback_weight_decay": 0.1,
85
+ "ema_decay": 0.9999,
86
+ "ema_start_step": 0,
87
+ "model_type": "ddit",
88
+ "ddit_mlp_type": "gelu",
89
+ "elf_num_time_tokens": 4,
90
+ "elf_num_model_mode_tokens": 0,
91
+ "qk_norm": true,
92
+ "output_bias": false,
93
+ "output_init_std": -1.0,
94
+ "norm_type": "rmsnorm",
95
+ "target_loss": "hard_ce",
96
+ "linear_soft_target_power": 1.0,
97
+ "linear_soft_target_min_conf": 0.0,
98
+ "linear_soft_target_max_conf": 1.0,
99
+ "t_sampling_mode": "uniform",
100
+ "t_sampling_power": 1.0,
101
+ "t_sampling_eps": 0.0001,
102
+ "t_sampling_logit_mean": -1.5,
103
+ "t_sampling_logit_std": 0.8,
104
+ "dual_t": true,
105
+ "corrupt_t_mode": "same",
106
+ "corrupt_min_t": 0.0,
107
+ "corrupt_max_t": 1.0,
108
+ "prefix_block_prob": 0.0,
109
+ "prefix_block_len": 128,
110
+ "mask_ratio_floor_schedule": "none",
111
+ "dirichlet_endpoint_mode": "categorical_dual_t",
112
+ "dirichlet_semantic_t_mode": "same",
113
+ "dirichlet_semantic_t_value": 0.0,
114
+ "dirichlet_semantic_t_curve": "linear",
115
+ "dirichlet_semantic_t_power": 1.0,
116
+ "endpoint_sequence_random_prob_alpha": 0.0,
117
+ "categorical_wrong_from_full_vocab": true,
118
+ "categorical_wrong_from_batch_valid_tokens": false,
119
+ "categorical_wrong_basin_token_ids": "",
120
+ "categorical_wrong_basin_prob": 0.0,
121
+ "categorical_wrong_unigram_prob": 0.0,
122
+ "categorical_wrong_uniform_prob": 0.0,
123
+ "categorical_wrong_prob_floor": 0.0,
124
+ "categorical_wrong_corpus_unigram_path": "",
125
+ "categorical_wrong_corpus_unigram_alpha": 1.0,
126
+ "categorical_wrong_basin_shared_prob": 0.0,
127
+ "categorical_wrong_unigram_shared_prob": 0.0,
128
+ "mask_mixture_original_prob": 0.0,
129
+ "mask_mixture_lowk_prob": 0.0,
130
+ "mask_mixture_lowcorrupt_prob": 0.0,
131
+ "mask_mixture_block_prob": 0.0,
132
+ "mask_mixture_all_prob": 1.0,
133
+ "mask_mixture_lowk_clean_tokens": "0",
134
+ "mask_mixture_lowcorrupt_tokens": "1,2,4,8,16,32,64",
135
+ "mask_mixture_block_tokens": "64,128",
136
+ "simplex_bridge_sampler": "dirichlet",
137
+ "logistic_normal_sigma_min": 0.1,
138
+ "logistic_normal_sigma_max": 1.0,
139
+ "logistic_normal_tau_min": 1.0,
140
+ "logistic_normal_tau_max": 1.0,
141
+ "torch_compile": false,
142
+ "compile_mode": "max-autotune",
143
+ "state_format": "prob",
144
+ "meanflow_weight": 0.0,
145
+ "rollout_train_prob": 0.5,
146
+ "rollout_train_steps": 3,
147
+ "rollout_train_steps_min": 1,
148
+ "rollout_train_infer_steps": 1,
149
+ "rollout_train_time_mode": "sampled_path",
150
+ "rollout_train_s_dist": "uniform",
151
+ "rollout_train_s_min_frac": 0.0,
152
+ "rollout_train_s_max_frac": 0.25,
153
+ "rollout_train_s_beta_alpha": 2.0,
154
+ "rollout_train_s_beta_beta": 6.0,
155
+ "rollout_train_temp": 1.0,
156
+ "rollout_train_max_gamma": 1.0,
157
+ "rollout_train_corrupt_only": true,
158
+ "rollout_train_samplewise": true,
159
+ "rollout_train_compute_always": false,
160
+ "rollout_train_sync_t": true,
161
+ "bridge_noise_init": "logistic_normal",
162
+ "noise_sigma": -1.0,
163
+ "allow_tf32": true,
164
+ "activation_checkpointing": false,
165
+ "activation_checkpoint_interval": 1,
166
+ "activation_checkpoint_scope": "block",
167
+ "ddp_static_graph": false,
168
+ "ddp_gradient_as_bucket_view": true,
169
+ "blocking_data_transfer": false,
170
+ "dataloader_prefetch_factor": 4,
171
+ "full_train_stats": false,
172
+ "tokenized_hf": false,
173
+ "tokenized_pad_token": "pad",
174
+ "elf_conditional_hf": false,
175
+ "record_pad_truncate": false,
176
+ "record_add_eos": false,
177
+ "record_add_special_tokens": false,
178
+ "record_pad_token": "pad",
179
+ "record_shuffle_buffer": 10000,
180
+ "wrap": true,
181
+ "wrap_mode": "stream",
182
+ "wrap_record_buffer_size": 200,
183
+ "owt_cached_chunks": true,
184
+ "owt_chunk_cache_dir": "/e2e-data/evad-tech-vla/wanghan58/data/small_benchmarks/langflow_2604_11748/openwebtext_lta_cached_chunks/gpt2_len1024_train8_compact_overfit",
185
+ "owt_chunk_cache_rebuild": false,
186
+ "owt_chunk_cache_write_batch": 4096,
187
+ "owt_exact_repeat_per_chunk": 64,
188
+ "online_chunk_shuffle": false,
189
+ "online_chunk_shuffle_buffer": 10000,
190
+ "openwebtext_split": "train_minus_100k",
191
+ "detokenizer": "auto",
192
+ "resolved_detokenizer": null,
193
+ "num_workers": 0,
194
+ "latest_every": 1000,
195
+ "resume_path": ""
196
+ }
197
+ step=100 epoch=100/1000 epoch_step=1/1 micro_steps=100 elapsed=21.9s lr=2.000000e-03 loss=7.4577 loss_recon=7.4577 loss_meanflow=0.0000 mean_model_t=0.4971 mean_corrupt_t=0.4971 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.5052 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.3137 corrupt_frac=1.0000 acc_corrupt=0.3137 loss_corrupt=7.4577 wrong_frac=0.5028 init_acc_corrupt=0.4627 acc_corrupt_t_0p0_0p2=0.0440 corrupt_frac_t_0p0_0p2=0.2071 acc_corrupt_t_0p2_0p4=0.1521 corrupt_frac_t_0p2_0p4=0.1988 acc_corrupt_t_0p4_0p6=0.3120 corrupt_frac_t_0p4_0p6=0.1976 acc_corrupt_t_0p6_0p8=0.4623 corrupt_frac_t_0p6_0p8=0.1951 acc_corrupt_t_0p8_1p0=0.6081 corrupt_frac_t_0p8_1p0=0.2015 out_w_norm=1.0709 out_g_norm=0.9533 loss_all=6.9234 init_gold_top10=0.4675 init_gold_top100=0.5761 rollout_applied_pos_frac=0.4688 init_acc_rollout_applied=0.4066 init_acc_rollout_kept=0.4381 logit_acc_rollout_applied=0.2648 logit_acc_rollout_kept=0.2797
198
+ step=200 epoch=200/1000 epoch_step=1/1 micro_steps=200 elapsed=21.0s lr=2.000000e-03 loss=6.0049 loss_recon=6.0049 loss_meanflow=0.0000 mean_model_t=0.5015 mean_corrupt_t=0.5015 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.5026 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.3121 corrupt_frac=1.0000 acc_corrupt=0.3121 loss_corrupt=6.0049 wrong_frac=0.4984 init_acc_corrupt=0.4679 acc_corrupt_t_0p0_0p2=0.0493 corrupt_frac_t_0p0_0p2=0.2005 acc_corrupt_t_0p2_0p4=0.1498 corrupt_frac_t_0p2_0p4=0.1979 acc_corrupt_t_0p4_0p6=0.3102 corrupt_frac_t_0p4_0p6=0.1994 acc_corrupt_t_0p6_0p8=0.4522 corrupt_frac_t_0p6_0p8=0.2007 acc_corrupt_t_0p8_1p0=0.5953 corrupt_frac_t_0p8_1p0=0.2015 out_w_norm=3.4208 out_g_norm=1.2518 loss_all=5.2404 init_gold_top10=0.5095 init_gold_top100=0.6191 rollout_applied_pos_frac=0.5391 init_acc_rollout_applied=0.4932 init_acc_rollout_kept=0.4670 logit_acc_rollout_applied=0.3495 logit_acc_rollout_kept=0.3428
199
+ step=300 epoch=300/1000 epoch_step=1/1 micro_steps=300 elapsed=21.0s lr=2.000000e-03 loss=4.9228 loss_recon=4.9228 loss_meanflow=0.0000 mean_model_t=0.5015 mean_corrupt_t=0.5015 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.5045 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.3544 corrupt_frac=1.0000 acc_corrupt=0.3544 loss_corrupt=4.9228 wrong_frac=0.4985 init_acc_corrupt=0.4691 acc_corrupt_t_0p0_0p2=0.0521 corrupt_frac_t_0p0_0p2=0.1947 acc_corrupt_t_0p2_0p4=0.1786 corrupt_frac_t_0p2_0p4=0.2030 acc_corrupt_t_0p4_0p6=0.3490 corrupt_frac_t_0p4_0p6=0.1991 acc_corrupt_t_0p6_0p8=0.5087 corrupt_frac_t_0p6_0p8=0.2045 acc_corrupt_t_0p8_1p0=0.6764 corrupt_frac_t_0p8_1p0=0.1988 out_w_norm=5.5235 out_g_norm=0.5182 loss_all=4.5675 init_gold_top10=0.5196 init_gold_top100=0.6338 rollout_applied_pos_frac=0.5391 init_acc_rollout_applied=0.4711 init_acc_rollout_kept=0.4756 logit_acc_rollout_applied=0.3717 logit_acc_rollout_kept=0.3857
200
+ step=400 epoch=400/1000 epoch_step=1/1 micro_steps=400 elapsed=20.9s lr=2.000000e-03 loss=4.2847 loss_recon=4.2847 loss_meanflow=0.0000 mean_model_t=0.4986 mean_corrupt_t=0.4986 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.4976 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.4073 corrupt_frac=1.0000 acc_corrupt=0.4073 loss_corrupt=4.2847 wrong_frac=0.5016 init_acc_corrupt=0.4658 acc_corrupt_t_0p0_0p2=0.0555 corrupt_frac_t_0p0_0p2=0.2005 acc_corrupt_t_0p2_0p4=0.2009 corrupt_frac_t_0p2_0p4=0.2001 acc_corrupt_t_0p4_0p6=0.4027 corrupt_frac_t_0p4_0p6=0.2023 acc_corrupt_t_0p6_0p8=0.5906 corrupt_frac_t_0p6_0p8=0.1992 acc_corrupt_t_0p8_1p0=0.7928 corrupt_frac_t_0p8_1p0=0.1979 out_w_norm=7.1205 out_g_norm=0.2698 loss_all=4.0941 init_gold_top10=0.4926 init_gold_top100=0.6325 rollout_applied_pos_frac=0.5469 init_acc_rollout_applied=0.4727 init_acc_rollout_kept=0.4091 logit_acc_rollout_applied=0.4534 logit_acc_rollout_kept=0.4036
201
+ step=500 epoch=500/1000 epoch_step=1/1 micro_steps=500 elapsed=21.0s lr=2.000000e-03 loss=3.6516 loss_recon=3.6516 loss_meanflow=0.0000 mean_model_t=0.4977 mean_corrupt_t=0.4977 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.4981 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.4776 corrupt_frac=1.0000 acc_corrupt=0.4776 loss_corrupt=3.6516 wrong_frac=0.5023 init_acc_corrupt=0.4652 acc_corrupt_t_0p0_0p2=0.0567 corrupt_frac_t_0p0_0p2=0.2059 acc_corrupt_t_0p2_0p4=0.2329 corrupt_frac_t_0p2_0p4=0.1985 acc_corrupt_t_0p4_0p6=0.4959 corrupt_frac_t_0p4_0p6=0.1969 acc_corrupt_t_0p6_0p8=0.7067 corrupt_frac_t_0p6_0p8=0.1927 acc_corrupt_t_0p8_1p0=0.9025 corrupt_frac_t_0p8_1p0=0.2059 out_w_norm=8.4567 out_g_norm=0.2542 loss_all=3.4269 init_gold_top10=0.5004 init_gold_top100=0.6532 rollout_applied_pos_frac=0.5234 init_acc_rollout_applied=0.4284 init_acc_rollout_kept=0.4792 logit_acc_rollout_applied=0.4457 logit_acc_rollout_kept=0.5015
202
+ step=600 epoch=600/1000 epoch_step=1/1 micro_steps=600 elapsed=21.0s lr=2.000000e-03 loss=3.1363 loss_recon=3.1363 loss_meanflow=0.0000 mean_model_t=0.5012 mean_corrupt_t=0.5012 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.5030 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.4935 corrupt_frac=1.0000 acc_corrupt=0.4935 loss_corrupt=3.1363 wrong_frac=0.4987 init_acc_corrupt=0.4692 acc_corrupt_t_0p0_0p2=0.0594 corrupt_frac_t_0p0_0p2=0.1972 acc_corrupt_t_0p2_0p4=0.2643 corrupt_frac_t_0p2_0p4=0.2024 acc_corrupt_t_0p4_0p6=0.5216 corrupt_frac_t_0p4_0p6=0.1990 acc_corrupt_t_0p6_0p8=0.7136 corrupt_frac_t_0p6_0p8=0.2017 acc_corrupt_t_0p8_1p0=0.9043 corrupt_frac_t_0p8_1p0=0.1997 out_w_norm=9.7279 out_g_norm=0.2716 loss_all=2.5213 init_gold_top10=0.5964 init_gold_top100=0.7261 rollout_applied_pos_frac=0.4922 init_acc_rollout_applied=0.5189 init_acc_rollout_kept=0.5812 logit_acc_rollout_applied=0.5407 logit_acc_rollout_kept=0.6049
203
+ step=700 epoch=700/1000 epoch_step=1/1 micro_steps=700 elapsed=20.9s lr=2.000000e-03 loss=2.8057 loss_recon=2.8057 loss_meanflow=0.0000 mean_model_t=0.5008 mean_corrupt_t=0.5008 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.4924 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.5005 corrupt_frac=1.0000 acc_corrupt=0.5005 loss_corrupt=2.8057 wrong_frac=0.4994 init_acc_corrupt=0.4690 acc_corrupt_t_0p0_0p2=0.0608 corrupt_frac_t_0p0_0p2=0.2030 acc_corrupt_t_0p2_0p4=0.2768 corrupt_frac_t_0p2_0p4=0.1915 acc_corrupt_t_0p4_0p6=0.5293 corrupt_frac_t_0p4_0p6=0.2023 acc_corrupt_t_0p6_0p8=0.7205 corrupt_frac_t_0p6_0p8=0.2001 acc_corrupt_t_0p8_1p0=0.9054 corrupt_frac_t_0p8_1p0=0.2032 out_w_norm=10.7412 out_g_norm=0.3097 loss_all=2.6672 init_gold_top10=0.5354 init_gold_top100=0.7197 rollout_applied_pos_frac=0.5000 init_acc_rollout_applied=0.4357 init_acc_rollout_kept=0.5183 logit_acc_rollout_applied=0.4733 logit_acc_rollout_kept=0.5519
204
+ step=800 epoch=800/1000 epoch_step=1/1 micro_steps=800 elapsed=21.0s lr=2.000000e-03 loss=2.3716 loss_recon=2.3716 loss_meanflow=0.0000 mean_model_t=0.4962 mean_corrupt_t=0.4962 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.4930 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.5263 corrupt_frac=1.0000 acc_corrupt=0.5263 loss_corrupt=2.3716 wrong_frac=0.5037 init_acc_corrupt=0.4648 acc_corrupt_t_0p0_0p2=0.0622 corrupt_frac_t_0p0_0p2=0.2026 acc_corrupt_t_0p2_0p4=0.3062 corrupt_frac_t_0p2_0p4=0.2020 acc_corrupt_t_0p4_0p6=0.5802 corrupt_frac_t_0p4_0p6=0.2005 acc_corrupt_t_0p6_0p8=0.7709 corrupt_frac_t_0p6_0p8=0.1965 acc_corrupt_t_0p8_1p0=0.9275 corrupt_frac_t_0p8_1p0=0.1984 out_w_norm=11.2531 out_g_norm=0.3908 loss_all=2.2379 init_gold_top10=0.5527 init_gold_top100=0.7075 rollout_applied_pos_frac=0.4609 init_acc_rollout_applied=0.5008 init_acc_rollout_kept=0.4082 logit_acc_rollout_applied=0.5964 logit_acc_rollout_kept=0.4949
205
+ step=900 epoch=900/1000 epoch_step=1/1 micro_steps=900 elapsed=21.0s lr=2.000000e-03 loss=1.7996 loss_recon=1.7996 loss_meanflow=0.0000 mean_model_t=0.5040 mean_corrupt_t=0.5040 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.5050 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.6185 corrupt_frac=1.0000 acc_corrupt=0.6185 loss_corrupt=1.7996 wrong_frac=0.4960 init_acc_corrupt=0.4768 acc_corrupt_t_0p0_0p2=0.0646 corrupt_frac_t_0p0_0p2=0.1931 acc_corrupt_t_0p2_0p4=0.3911 corrupt_frac_t_0p2_0p4=0.2013 acc_corrupt_t_0p4_0p6=0.7430 corrupt_frac_t_0p4_0p6=0.2019 acc_corrupt_t_0p6_0p8=0.8942 corrupt_frac_t_0p6_0p8=0.1981 acc_corrupt_t_0p8_1p0=0.9734 corrupt_frac_t_0p8_1p0=0.2056 out_w_norm=11.6784 out_g_norm=0.4833 loss_all=1.4855 init_gold_top10=0.6236 init_gold_top100=0.7438 rollout_applied_pos_frac=0.4297 init_acc_rollout_applied=0.4757 init_acc_rollout_kept=0.5220 logit_acc_rollout_applied=0.6405 logit_acc_rollout_kept=0.6997
206
+ step=1000 epoch=1000/1000 epoch_step=1/1 micro_steps=1000 elapsed=21.0s lr=2.000000e-03 loss=1.4411 loss_recon=1.4411 loss_meanflow=0.0000 mean_model_t=0.5008 mean_corrupt_t=0.5008 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.4999 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.6921 corrupt_frac=1.0000 acc_corrupt=0.6921 loss_corrupt=1.4411 wrong_frac=0.4993 init_acc_corrupt=0.4821 acc_corrupt_t_0p0_0p2=0.0744 corrupt_frac_t_0p0_0p2=0.2034 acc_corrupt_t_0p2_0p4=0.5412 corrupt_frac_t_0p2_0p4=0.1967 acc_corrupt_t_0p4_0p6=0.8866 corrupt_frac_t_0p4_0p6=0.2010 acc_corrupt_t_0p6_0p8=0.9712 corrupt_frac_t_0p6_0p8=0.1933 acc_corrupt_t_0p8_1p0=0.9954 corrupt_frac_t_0p8_1p0=0.2055 out_w_norm=12.0185 out_g_norm=0.5595 loss_all=1.5013 init_gold_top10=0.6330 init_gold_top100=0.7566 rollout_applied_pos_frac=0.5547 init_acc_rollout_applied=0.5527 init_acc_rollout_kept=0.4294 logit_acc_rollout_applied=0.7367 logit_acc_rollout_kept=0.6481
207
+ NCCL version 2.25.1+cuda12.8
208
+ resumed_from=runs/train8_ctx1024_randk_p50_path3_rand1_3_unif0_0p25_ctx1024_uniformt_temp1_randk_20260518_010217/latest.pt start_step=1001
209
+ {
210
+ "device": "cuda:0",
211
+ "rank": 0,
212
+ "world_size": 4,
213
+ "samples": "owt_cached_chunks:8",
214
+ "vocab_size": 2664,
215
+ "tokenizer_vocab_size": 50257,
216
+ "save_dir": "runs/train8_ctx1024_randk_p50_path3_rand1_3_unif0_0p25_ctx1024_uniformt_temp1_randk_20260518_010217",
217
+ "batch_size": 128,
218
+ "grad_accum": 1,
219
+ "effective_batch_size": 512,
220
+ "global_batch_size": 512,
221
+ "lr_schedule": "constant_warmup",
222
+ "optimizer": "muon",
223
+ "epochs": 0.0,
224
+ "steps_per_epoch": 1,
225
+ "total_steps": 2000,
226
+ "warmup_steps": 10,
227
+ "warmup_epochs": -1.0,
228
+ "min_lr": 0.0,
229
+ "weight_decay": 0.1,
230
+ "output_weight_decay": -1.0,
231
+ "adamw_param_groups": "nanogpt",
232
+ "adam_beta1": 0.9,
233
+ "adam_beta2": 0.95,
234
+ "adam_eps": 1e-08,
235
+ "muon_impl": "legacy",
236
+ "muon_momentum": 0.95,
237
+ "muon_ns_steps": 5,
238
+ "muon_update_scale": 1.0,
239
+ "muon_nesterov": false,
240
+ "muon_width_scale": false,
241
+ "muon_grouping": "legacy_dim_ge_2",
242
+ "muon_param_count": 2616320,
243
+ "muon_adam_param_count": 8192,
244
+ "muon_param_names": [
245
+ "vocab_embed.embedding",
246
+ "sigma_map.net.0.weight",
247
+ "sigma_map.net.2.weight",
248
+ "blocks.0.attn_qkv.weight",
249
+ "blocks.0.attn_out.weight",
250
+ "blocks.0.mlp.0.weight",
251
+ "blocks.0.mlp.2.weight",
252
+ "blocks.0.adaLN_modulation.weight",
253
+ "blocks.1.attn_qkv.weight",
254
+ "blocks.1.attn_out.weight",
255
+ "blocks.1.mlp.0.weight",
256
+ "blocks.1.mlp.2.weight",
257
+ "blocks.1.adaLN_modulation.weight",
258
+ "blocks.2.attn_qkv.weight",
259
+ "blocks.2.attn_out.weight",
260
+ "blocks.2.mlp.0.weight",
261
+ "blocks.2.mlp.2.weight",
262
+ "blocks.2.adaLN_modulation.weight",
263
+ "output_layer.linear.weight",
264
+ "output_layer.adaLN_modulation.weight"
265
+ ],
266
+ "muon_adam_param_names": [
267
+ "sigma_map.net.0.bias",
268
+ "sigma_map.net.2.bias",
269
+ "blocks.0.norm1.weight",
270
+ "blocks.0.norm2.weight",
271
+ "blocks.0.mlp.0.bias",
272
+ "blocks.0.mlp.2.bias",
273
+ "blocks.0.adaLN_modulation.bias",
274
+ "blocks.1.norm1.weight",
275
+ "blocks.1.norm2.weight",
276
+ "blocks.1.mlp.0.bias",
277
+ "blocks.1.mlp.2.bias",
278
+ "blocks.1.adaLN_modulation.bias",
279
+ "blocks.2.norm1.weight",
280
+ "blocks.2.norm2.weight",
281
+ "blocks.2.mlp.0.bias",
282
+ "blocks.2.mlp.2.bias",
283
+ "blocks.2.adaLN_modulation.bias",
284
+ "output_layer.norm_final.weight",
285
+ "output_layer.adaLN_modulation.bias"
286
+ ],
287
+ "muon_effective_nesterov": false,
288
+ "muon_effective_width_scale": false,
289
+ "muon_effective_weight_decay": 0.1,
290
+ "muon_adam_fallback_nesterov": false,
291
+ "muon_adam_fallback_weight_decay": 0.1,
292
+ "ema_decay": 0.9999,
293
+ "ema_start_step": 0,
294
+ "model_type": "ddit",
295
+ "ddit_mlp_type": "gelu",
296
+ "elf_num_time_tokens": 4,
297
+ "elf_num_model_mode_tokens": 0,
298
+ "qk_norm": true,
299
+ "output_bias": false,
300
+ "output_init_std": -1.0,
301
+ "norm_type": "rmsnorm",
302
+ "target_loss": "hard_ce",
303
+ "linear_soft_target_power": 1.0,
304
+ "linear_soft_target_min_conf": 0.0,
305
+ "linear_soft_target_max_conf": 1.0,
306
+ "t_sampling_mode": "uniform",
307
+ "t_sampling_power": 1.0,
308
+ "t_sampling_eps": 0.0001,
309
+ "t_sampling_logit_mean": -1.5,
310
+ "t_sampling_logit_std": 0.8,
311
+ "dual_t": true,
312
+ "corrupt_t_mode": "same",
313
+ "corrupt_min_t": 0.0,
314
+ "corrupt_max_t": 1.0,
315
+ "prefix_block_prob": 0.0,
316
+ "prefix_block_len": 128,
317
+ "mask_ratio_floor_schedule": "none",
318
+ "dirichlet_endpoint_mode": "categorical_dual_t",
319
+ "dirichlet_semantic_t_mode": "same",
320
+ "dirichlet_semantic_t_value": 0.0,
321
+ "dirichlet_semantic_t_curve": "linear",
322
+ "dirichlet_semantic_t_power": 1.0,
323
+ "endpoint_sequence_random_prob_alpha": 0.0,
324
+ "categorical_wrong_from_full_vocab": true,
325
+ "categorical_wrong_from_batch_valid_tokens": false,
326
+ "categorical_wrong_basin_token_ids": "",
327
+ "categorical_wrong_basin_prob": 0.0,
328
+ "categorical_wrong_unigram_prob": 0.0,
329
+ "categorical_wrong_uniform_prob": 0.0,
330
+ "categorical_wrong_prob_floor": 0.0,
331
+ "categorical_wrong_corpus_unigram_path": "",
332
+ "categorical_wrong_corpus_unigram_alpha": 1.0,
333
+ "categorical_wrong_basin_shared_prob": 0.0,
334
+ "categorical_wrong_unigram_shared_prob": 0.0,
335
+ "mask_mixture_original_prob": 0.0,
336
+ "mask_mixture_lowk_prob": 0.0,
337
+ "mask_mixture_lowcorrupt_prob": 0.0,
338
+ "mask_mixture_block_prob": 0.0,
339
+ "mask_mixture_all_prob": 1.0,
340
+ "mask_mixture_lowk_clean_tokens": "0",
341
+ "mask_mixture_lowcorrupt_tokens": "1,2,4,8,16,32,64",
342
+ "mask_mixture_block_tokens": "64,128",
343
+ "simplex_bridge_sampler": "dirichlet",
344
+ "logistic_normal_sigma_min": 0.1,
345
+ "logistic_normal_sigma_max": 1.0,
346
+ "logistic_normal_tau_min": 1.0,
347
+ "logistic_normal_tau_max": 1.0,
348
+ "torch_compile": false,
349
+ "compile_mode": "max-autotune",
350
+ "state_format": "prob",
351
+ "meanflow_weight": 0.0,
352
+ "rollout_train_prob": 0.5,
353
+ "rollout_train_steps": 3,
354
+ "rollout_train_steps_min": 1,
355
+ "rollout_train_infer_steps": 1,
356
+ "rollout_train_time_mode": "sampled_path",
357
+ "rollout_train_s_dist": "uniform",
358
+ "rollout_train_s_min_frac": 0.0,
359
+ "rollout_train_s_max_frac": 0.25,
360
+ "rollout_train_s_beta_alpha": 2.0,
361
+ "rollout_train_s_beta_beta": 6.0,
362
+ "rollout_train_temp": 1.0,
363
+ "rollout_train_max_gamma": 1.0,
364
+ "rollout_train_corrupt_only": true,
365
+ "rollout_train_samplewise": true,
366
+ "rollout_train_compute_always": false,
367
+ "rollout_train_sync_t": true,
368
+ "bridge_noise_init": "logistic_normal",
369
+ "noise_sigma": -1.0,
370
+ "allow_tf32": true,
371
+ "activation_checkpointing": false,
372
+ "activation_checkpoint_interval": 1,
373
+ "activation_checkpoint_scope": "block",
374
+ "ddp_static_graph": false,
375
+ "ddp_gradient_as_bucket_view": true,
376
+ "blocking_data_transfer": false,
377
+ "dataloader_prefetch_factor": 4,
378
+ "full_train_stats": false,
379
+ "tokenized_hf": false,
380
+ "tokenized_pad_token": "pad",
381
+ "elf_conditional_hf": false,
382
+ "record_pad_truncate": false,
383
+ "record_add_eos": false,
384
+ "record_add_special_tokens": false,
385
+ "record_pad_token": "pad",
386
+ "record_shuffle_buffer": 10000,
387
+ "wrap": true,
388
+ "wrap_mode": "stream",
389
+ "wrap_record_buffer_size": 200,
390
+ "owt_cached_chunks": true,
391
+ "owt_chunk_cache_dir": "/e2e-data/evad-tech-vla/wanghan58/data/small_benchmarks/langflow_2604_11748/openwebtext_lta_cached_chunks/gpt2_len1024_train8_compact_overfit",
392
+ "owt_chunk_cache_rebuild": false,
393
+ "owt_chunk_cache_write_batch": 4096,
394
+ "owt_exact_repeat_per_chunk": 64,
395
+ "online_chunk_shuffle": false,
396
+ "online_chunk_shuffle_buffer": 10000,
397
+ "openwebtext_split": "train_minus_100k",
398
+ "detokenizer": "auto",
399
+ "resolved_detokenizer": null,
400
+ "num_workers": 0,
401
+ "latest_every": 1000,
402
+ "resume_path": "runs/train8_ctx1024_randk_p50_path3_rand1_3_unif0_0p25_ctx1024_uniformt_temp1_randk_20260518_010217/latest.pt"
403
+ }
404
+ step=1100 epoch=1100/2000 epoch_step=1/1 micro_steps=1100 elapsed=21.9s lr=2.000000e-03 loss=1.1828 loss_recon=1.1828 loss_meanflow=0.0000 mean_model_t=0.4971 mean_corrupt_t=0.4971 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.5052 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.7436 corrupt_frac=1.0000 acc_corrupt=0.7436 loss_corrupt=1.1828 wrong_frac=0.5028 init_acc_corrupt=0.4897 acc_corrupt_t_0p0_0p2=0.1001 corrupt_frac_t_0p0_0p2=0.2071 acc_corrupt_t_0p2_0p4=0.6997 corrupt_frac_t_0p2_0p4=0.1988 acc_corrupt_t_0p4_0p6=0.9562 corrupt_frac_t_0p4_0p6=0.1976 acc_corrupt_t_0p6_0p8=0.9923 corrupt_frac_t_0p6_0p8=0.1951 acc_corrupt_t_0p8_1p0=0.9989 corrupt_frac_t_0p8_1p0=0.2015 out_w_norm=12.2683 out_g_norm=0.5843 loss_all=0.9598 init_gold_top10=0.6373 init_gold_top100=0.7402 rollout_applied_pos_frac=0.4688 init_acc_rollout_applied=0.5113 init_acc_rollout_kept=0.4381 logit_acc_rollout_applied=0.7818 logit_acc_rollout_kept=0.7686
405
+ step=1200 epoch=1200/2000 epoch_step=1/1 micro_steps=1200 elapsed=21.0s lr=2.000000e-03 loss=0.9520 loss_recon=0.9520 loss_meanflow=0.0000 mean_model_t=0.5015 mean_corrupt_t=0.5015 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.5026 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.7891 corrupt_frac=1.0000 acc_corrupt=0.7891 loss_corrupt=0.9520 wrong_frac=0.4984 init_acc_corrupt=0.5036 acc_corrupt_t_0p0_0p2=0.1409 corrupt_frac_t_0p0_0p2=0.2005 acc_corrupt_t_0p2_0p4=0.8245 corrupt_frac_t_0p2_0p4=0.1979 acc_corrupt_t_0p4_0p6=0.9834 corrupt_frac_t_0p4_0p6=0.1994 acc_corrupt_t_0p6_0p8=0.9974 corrupt_frac_t_0p6_0p8=0.2007 acc_corrupt_t_0p8_1p0=0.9996 corrupt_frac_t_0p8_1p0=0.2015 out_w_norm=12.4616 out_g_norm=0.5938 loss_all=0.8206 init_gold_top10=0.7031 init_gold_top100=0.7861 rollout_applied_pos_frac=0.5391 init_acc_rollout_applied=0.6005 init_acc_rollout_kept=0.4670 logit_acc_rollout_applied=0.8755 logit_acc_rollout_kept=0.7547
406
+ step=1300 epoch=1300/2000 epoch_step=1/1 micro_steps=1300 elapsed=21.0s lr=2.000000e-03 loss=0.7780 loss_recon=0.7780 loss_meanflow=0.0000 mean_model_t=0.5015 mean_corrupt_t=0.5015 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.5045 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.8234 corrupt_frac=1.0000 acc_corrupt=0.8234 loss_corrupt=0.7780 wrong_frac=0.4985 init_acc_corrupt=0.5140 acc_corrupt_t_0p0_0p2=0.1952 corrupt_frac_t_0p0_0p2=0.1947 acc_corrupt_t_0p2_0p4=0.9095 corrupt_frac_t_0p2_0p4=0.2030 acc_corrupt_t_0p4_0p6=0.9936 corrupt_frac_t_0p4_0p6=0.1991 acc_corrupt_t_0p6_0p8=0.9989 corrupt_frac_t_0p6_0p8=0.2045 acc_corrupt_t_0p8_1p0=0.9998 corrupt_frac_t_0p8_1p0=0.1988 out_w_norm=12.6249 out_g_norm=0.6038 loss_all=0.7202 init_gold_top10=0.6836 init_gold_top100=0.7906 rollout_applied_pos_frac=0.5391 init_acc_rollout_applied=0.5468 init_acc_rollout_kept=0.4756 logit_acc_rollout_applied=0.7984 logit_acc_rollout_kept=0.8757
407
+ step=1400 epoch=1400/2000 epoch_step=1/1 micro_steps=1400 elapsed=20.9s lr=2.000000e-03 loss=0.6907 loss_recon=0.6907 loss_meanflow=0.0000 mean_model_t=0.4986 mean_corrupt_t=0.4986 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.4976 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.8375 corrupt_frac=1.0000 acc_corrupt=0.8375 loss_corrupt=0.6907 wrong_frac=0.5016 init_acc_corrupt=0.5133 acc_corrupt_t_0p0_0p2=0.2442 corrupt_frac_t_0p0_0p2=0.2005 acc_corrupt_t_0p2_0p4=0.9484 corrupt_frac_t_0p2_0p4=0.2001 acc_corrupt_t_0p4_0p6=0.9973 corrupt_frac_t_0p4_0p6=0.2023 acc_corrupt_t_0p6_0p8=0.9994 corrupt_frac_t_0p6_0p8=0.1992 acc_corrupt_t_0p8_1p0=0.9998 corrupt_frac_t_0p8_1p0=0.1979 out_w_norm=12.7397 out_g_norm=0.5999 loss_all=0.6364 init_gold_top10=0.6986 init_gold_top100=0.7792 rollout_applied_pos_frac=0.5469 init_acc_rollout_applied=0.6127 init_acc_rollout_kept=0.4091 logit_acc_rollout_applied=0.8890 logit_acc_rollout_kept=0.7844
408
+ step=1500 epoch=1500/2000 epoch_step=1/1 micro_steps=1500 elapsed=21.0s lr=2.000000e-03 loss=0.6199 loss_recon=0.6199 loss_meanflow=0.0000 mean_model_t=0.4977 mean_corrupt_t=0.4977 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.4981 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.8481 corrupt_frac=1.0000 acc_corrupt=0.8481 loss_corrupt=0.6199 wrong_frac=0.5023 init_acc_corrupt=0.5157 acc_corrupt_t_0p0_0p2=0.2895 corrupt_frac_t_0p0_0p2=0.2059 acc_corrupt_t_0p2_0p4=0.9737 corrupt_frac_t_0p2_0p4=0.1985 acc_corrupt_t_0p4_0p6=0.9988 corrupt_frac_t_0p4_0p6=0.1969 acc_corrupt_t_0p6_0p8=0.9997 corrupt_frac_t_0p6_0p8=0.1927 acc_corrupt_t_0p8_1p0=0.9999 corrupt_frac_t_0p8_1p0=0.2059 out_w_norm=12.8201 out_g_norm=0.5816 loss_all=0.5766 init_gold_top10=0.7126 init_gold_top100=0.7874 rollout_applied_pos_frac=0.5234 init_acc_rollout_applied=0.5788 init_acc_rollout_kept=0.4792 logit_acc_rollout_applied=0.8811 logit_acc_rollout_kept=0.8343
409
+ step=1600 epoch=1600/2000 epoch_step=1/1 micro_steps=1600 elapsed=21.0s lr=2.000000e-03 loss=0.5101 loss_recon=0.5101 loss_meanflow=0.0000 mean_model_t=0.5012 mean_corrupt_t=0.5012 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.5030 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.8690 corrupt_frac=1.0000 acc_corrupt=0.8690 loss_corrupt=0.5101 wrong_frac=0.4987 init_acc_corrupt=0.5235 acc_corrupt_t_0p0_0p2=0.3524 corrupt_frac_t_0p0_0p2=0.1972 acc_corrupt_t_0p2_0p4=0.9847 corrupt_frac_t_0p2_0p4=0.2024 acc_corrupt_t_0p4_0p6=0.9994 corrupt_frac_t_0p4_0p6=0.1990 acc_corrupt_t_0p6_0p8=0.9998 corrupt_frac_t_0p6_0p8=0.2017 acc_corrupt_t_0p8_1p0=0.9999 corrupt_frac_t_0p8_1p0=0.1997 out_w_norm=12.8723 out_g_norm=0.5502 loss_all=0.3946 init_gold_top10=0.7601 init_gold_top100=0.8180 rollout_applied_pos_frac=0.4922 init_acc_rollout_applied=0.6475 init_acc_rollout_kept=0.5812 logit_acc_rollout_applied=0.8885 logit_acc_rollout_kept=0.9108
410
+ step=1700 epoch=1700/2000 epoch_step=1/1 micro_steps=1700 elapsed=20.9s lr=2.000000e-03 loss=0.4813 loss_recon=0.4813 loss_meanflow=0.0000 mean_model_t=0.5008 mean_corrupt_t=0.5008 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.4924 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.8720 corrupt_frac=1.0000 acc_corrupt=0.8720 loss_corrupt=0.4813 wrong_frac=0.4994 init_acc_corrupt=0.5224 acc_corrupt_t_0p0_0p2=0.3792 corrupt_frac_t_0p0_0p2=0.2030 acc_corrupt_t_0p2_0p4=0.9900 corrupt_frac_t_0p2_0p4=0.1915 acc_corrupt_t_0p4_0p6=0.9997 corrupt_frac_t_0p4_0p6=0.2023 acc_corrupt_t_0p6_0p8=0.9999 corrupt_frac_t_0p6_0p8=0.2001 acc_corrupt_t_0p8_1p0=0.9999 corrupt_frac_t_0p8_1p0=0.2032 out_w_norm=12.9038 out_g_norm=0.5105 loss_all=0.6447 init_gold_top10=0.6933 init_gold_top100=0.7971 rollout_applied_pos_frac=0.5000 init_acc_rollout_applied=0.4862 init_acc_rollout_kept=0.5183 logit_acc_rollout_applied=0.8161 logit_acc_rollout_kept=0.8284
411
+ step=1800 epoch=1800/2000 epoch_step=1/1 micro_steps=1800 elapsed=21.0s lr=2.000000e-03 loss=0.4209 loss_recon=0.4209 loss_meanflow=0.0000 mean_model_t=0.4962 mean_corrupt_t=0.4962 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.4930 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.8804 corrupt_frac=1.0000 acc_corrupt=0.8804 loss_corrupt=0.4209 wrong_frac=0.5037 init_acc_corrupt=0.5205 acc_corrupt_t_0p0_0p2=0.4165 corrupt_frac_t_0p0_0p2=0.2026 acc_corrupt_t_0p2_0p4=0.9935 corrupt_frac_t_0p2_0p4=0.2020 acc_corrupt_t_0p4_0p6=0.9998 corrupt_frac_t_0p4_0p6=0.2005 acc_corrupt_t_0p6_0p8=0.9999 corrupt_frac_t_0p6_0p8=0.1965 acc_corrupt_t_0p8_1p0=0.9999 corrupt_frac_t_0p8_1p0=0.1984 out_w_norm=12.9067 out_g_norm=0.4878 loss_all=0.4171 init_gold_top10=0.6790 init_gold_top100=0.7384 rollout_applied_pos_frac=0.4609 init_acc_rollout_applied=0.6274 init_acc_rollout_kept=0.4082 logit_acc_rollout_applied=0.9365 logit_acc_rollout_kept=0.8279
412
+ step=1900 epoch=1900/2000 epoch_step=1/1 micro_steps=1900 elapsed=21.0s lr=2.000000e-03 loss=0.3533 loss_recon=0.3533 loss_meanflow=0.0000 mean_model_t=0.5040 mean_corrupt_t=0.5040 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.5050 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.8944 corrupt_frac=1.0000 acc_corrupt=0.8944 loss_corrupt=0.3533 wrong_frac=0.4960 init_acc_corrupt=0.5324 acc_corrupt_t_0p0_0p2=0.4578 corrupt_frac_t_0p0_0p2=0.1931 acc_corrupt_t_0p2_0p4=0.9961 corrupt_frac_t_0p2_0p4=0.2013 acc_corrupt_t_0p4_0p6=0.9999 corrupt_frac_t_0p4_0p6=0.2019 acc_corrupt_t_0p6_0p8=0.9999 corrupt_frac_t_0p6_0p8=0.1981 acc_corrupt_t_0p8_1p0=1.0000 corrupt_frac_t_0p8_1p0=0.2056 out_w_norm=12.8946 out_g_norm=0.4670 loss_all=0.2855 init_gold_top10=0.7142 init_gold_top100=0.7687 rollout_applied_pos_frac=0.4297 init_acc_rollout_applied=0.6093 init_acc_rollout_kept=0.5220 logit_acc_rollout_applied=0.9011 logit_acc_rollout_kept=0.9306
413
+ step=2000 epoch=2000/2000 epoch_step=1/1 micro_steps=2000 elapsed=21.0s lr=2.000000e-03 loss=0.3333 loss_recon=0.3333 loss_meanflow=0.0000 mean_model_t=0.5008 mean_corrupt_t=0.5008 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.4999 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.8962 corrupt_frac=1.0000 acc_corrupt=0.8962 loss_corrupt=0.3333 wrong_frac=0.4993 init_acc_corrupt=0.5289 acc_corrupt_t_0p0_0p2=0.4924 corrupt_frac_t_0p0_0p2=0.2034 acc_corrupt_t_0p2_0p4=0.9974 corrupt_frac_t_0p2_0p4=0.1967 acc_corrupt_t_0p4_0p6=0.9999 corrupt_frac_t_0p4_0p6=0.2010 acc_corrupt_t_0p6_0p8=0.9999 corrupt_frac_t_0p6_0p8=0.1933 acc_corrupt_t_0p8_1p0=1.0000 corrupt_frac_t_0p8_1p0=0.2055 out_w_norm=12.8551 out_g_norm=0.4372 loss_all=0.4913 init_gold_top10=0.7266 init_gold_top100=0.7923 rollout_applied_pos_frac=0.5547 init_acc_rollout_applied=0.6145 init_acc_rollout_kept=0.4294 logit_acc_rollout_applied=0.8696 logit_acc_rollout_kept=0.8036
414
+ NCCL version 2.25.1+cuda12.8
415
+ resumed_from=runs/train8_ctx1024_randk_p50_path3_rand1_3_unif0_0p25_ctx1024_uniformt_temp1_randk_20260518_010217/latest.pt start_step=2001
416
+ {
417
+ "device": "cuda:0",
418
+ "rank": 0,
419
+ "world_size": 4,
420
+ "samples": "owt_cached_chunks:8",
421
+ "vocab_size": 2664,
422
+ "tokenizer_vocab_size": 50257,
423
+ "save_dir": "runs/train8_ctx1024_randk_p50_path3_rand1_3_unif0_0p25_ctx1024_uniformt_temp1_randk_20260518_010217",
424
+ "batch_size": 128,
425
+ "grad_accum": 1,
426
+ "effective_batch_size": 512,
427
+ "global_batch_size": 512,
428
+ "lr_schedule": "constant_warmup",
429
+ "optimizer": "muon",
430
+ "epochs": 0.0,
431
+ "steps_per_epoch": 1,
432
+ "total_steps": 3000,
433
+ "warmup_steps": 10,
434
+ "warmup_epochs": -1.0,
435
+ "min_lr": 0.0,
436
+ "weight_decay": 0.1,
437
+ "output_weight_decay": -1.0,
438
+ "adamw_param_groups": "nanogpt",
439
+ "adam_beta1": 0.9,
440
+ "adam_beta2": 0.95,
441
+ "adam_eps": 1e-08,
442
+ "muon_impl": "legacy",
443
+ "muon_momentum": 0.95,
444
+ "muon_ns_steps": 5,
445
+ "muon_update_scale": 1.0,
446
+ "muon_nesterov": false,
447
+ "muon_width_scale": false,
448
+ "muon_grouping": "legacy_dim_ge_2",
449
+ "muon_param_count": 2616320,
450
+ "muon_adam_param_count": 8192,
451
+ "muon_param_names": [
452
+ "vocab_embed.embedding",
453
+ "sigma_map.net.0.weight",
454
+ "sigma_map.net.2.weight",
455
+ "blocks.0.attn_qkv.weight",
456
+ "blocks.0.attn_out.weight",
457
+ "blocks.0.mlp.0.weight",
458
+ "blocks.0.mlp.2.weight",
459
+ "blocks.0.adaLN_modulation.weight",
460
+ "blocks.1.attn_qkv.weight",
461
+ "blocks.1.attn_out.weight",
462
+ "blocks.1.mlp.0.weight",
463
+ "blocks.1.mlp.2.weight",
464
+ "blocks.1.adaLN_modulation.weight",
465
+ "blocks.2.attn_qkv.weight",
466
+ "blocks.2.attn_out.weight",
467
+ "blocks.2.mlp.0.weight",
468
+ "blocks.2.mlp.2.weight",
469
+ "blocks.2.adaLN_modulation.weight",
470
+ "output_layer.linear.weight",
471
+ "output_layer.adaLN_modulation.weight"
472
+ ],
473
+ "muon_adam_param_names": [
474
+ "sigma_map.net.0.bias",
475
+ "sigma_map.net.2.bias",
476
+ "blocks.0.norm1.weight",
477
+ "blocks.0.norm2.weight",
478
+ "blocks.0.mlp.0.bias",
479
+ "blocks.0.mlp.2.bias",
480
+ "blocks.0.adaLN_modulation.bias",
481
+ "blocks.1.norm1.weight",
482
+ "blocks.1.norm2.weight",
483
+ "blocks.1.mlp.0.bias",
484
+ "blocks.1.mlp.2.bias",
485
+ "blocks.1.adaLN_modulation.bias",
486
+ "blocks.2.norm1.weight",
487
+ "blocks.2.norm2.weight",
488
+ "blocks.2.mlp.0.bias",
489
+ "blocks.2.mlp.2.bias",
490
+ "blocks.2.adaLN_modulation.bias",
491
+ "output_layer.norm_final.weight",
492
+ "output_layer.adaLN_modulation.bias"
493
+ ],
494
+ "muon_effective_nesterov": false,
495
+ "muon_effective_width_scale": false,
496
+ "muon_effective_weight_decay": 0.1,
497
+ "muon_adam_fallback_nesterov": false,
498
+ "muon_adam_fallback_weight_decay": 0.1,
499
+ "ema_decay": 0.9999,
500
+ "ema_start_step": 0,
501
+ "model_type": "ddit",
502
+ "ddit_mlp_type": "gelu",
503
+ "elf_num_time_tokens": 4,
504
+ "elf_num_model_mode_tokens": 0,
505
+ "qk_norm": true,
506
+ "output_bias": false,
507
+ "output_init_std": -1.0,
508
+ "norm_type": "rmsnorm",
509
+ "target_loss": "hard_ce",
510
+ "linear_soft_target_power": 1.0,
511
+ "linear_soft_target_min_conf": 0.0,
512
+ "linear_soft_target_max_conf": 1.0,
513
+ "t_sampling_mode": "uniform",
514
+ "t_sampling_power": 1.0,
515
+ "t_sampling_eps": 0.0001,
516
+ "t_sampling_logit_mean": -1.5,
517
+ "t_sampling_logit_std": 0.8,
518
+ "dual_t": true,
519
+ "corrupt_t_mode": "same",
520
+ "corrupt_min_t": 0.0,
521
+ "corrupt_max_t": 1.0,
522
+ "prefix_block_prob": 0.0,
523
+ "prefix_block_len": 128,
524
+ "mask_ratio_floor_schedule": "none",
525
+ "dirichlet_endpoint_mode": "categorical_dual_t",
526
+ "dirichlet_semantic_t_mode": "same",
527
+ "dirichlet_semantic_t_value": 0.0,
528
+ "dirichlet_semantic_t_curve": "linear",
529
+ "dirichlet_semantic_t_power": 1.0,
530
+ "endpoint_sequence_random_prob_alpha": 0.0,
531
+ "categorical_wrong_from_full_vocab": true,
532
+ "categorical_wrong_from_batch_valid_tokens": false,
533
+ "categorical_wrong_basin_token_ids": "",
534
+ "categorical_wrong_basin_prob": 0.0,
535
+ "categorical_wrong_unigram_prob": 0.0,
536
+ "categorical_wrong_uniform_prob": 0.0,
537
+ "categorical_wrong_prob_floor": 0.0,
538
+ "categorical_wrong_corpus_unigram_path": "",
539
+ "categorical_wrong_corpus_unigram_alpha": 1.0,
540
+ "categorical_wrong_basin_shared_prob": 0.0,
541
+ "categorical_wrong_unigram_shared_prob": 0.0,
542
+ "mask_mixture_original_prob": 0.0,
543
+ "mask_mixture_lowk_prob": 0.0,
544
+ "mask_mixture_lowcorrupt_prob": 0.0,
545
+ "mask_mixture_block_prob": 0.0,
546
+ "mask_mixture_all_prob": 1.0,
547
+ "mask_mixture_lowk_clean_tokens": "0",
548
+ "mask_mixture_lowcorrupt_tokens": "1,2,4,8,16,32,64",
549
+ "mask_mixture_block_tokens": "64,128",
550
+ "simplex_bridge_sampler": "dirichlet",
551
+ "logistic_normal_sigma_min": 0.1,
552
+ "logistic_normal_sigma_max": 1.0,
553
+ "logistic_normal_tau_min": 1.0,
554
+ "logistic_normal_tau_max": 1.0,
555
+ "torch_compile": false,
556
+ "compile_mode": "max-autotune",
557
+ "state_format": "prob",
558
+ "meanflow_weight": 0.0,
559
+ "rollout_train_prob": 0.5,
560
+ "rollout_train_steps": 3,
561
+ "rollout_train_steps_min": 1,
562
+ "rollout_train_infer_steps": 1,
563
+ "rollout_train_time_mode": "sampled_path",
564
+ "rollout_train_s_dist": "uniform",
565
+ "rollout_train_s_min_frac": 0.0,
566
+ "rollout_train_s_max_frac": 0.25,
567
+ "rollout_train_s_beta_alpha": 2.0,
568
+ "rollout_train_s_beta_beta": 6.0,
569
+ "rollout_train_temp": 1.0,
570
+ "rollout_train_max_gamma": 1.0,
571
+ "rollout_train_corrupt_only": true,
572
+ "rollout_train_samplewise": true,
573
+ "rollout_train_compute_always": false,
574
+ "rollout_train_sync_t": true,
575
+ "bridge_noise_init": "logistic_normal",
576
+ "noise_sigma": -1.0,
577
+ "allow_tf32": true,
578
+ "activation_checkpointing": false,
579
+ "activation_checkpoint_interval": 1,
580
+ "activation_checkpoint_scope": "block",
581
+ "ddp_static_graph": false,
582
+ "ddp_gradient_as_bucket_view": true,
583
+ "blocking_data_transfer": false,
584
+ "dataloader_prefetch_factor": 4,
585
+ "full_train_stats": false,
586
+ "tokenized_hf": false,
587
+ "tokenized_pad_token": "pad",
588
+ "elf_conditional_hf": false,
589
+ "record_pad_truncate": false,
590
+ "record_add_eos": false,
591
+ "record_add_special_tokens": false,
592
+ "record_pad_token": "pad",
593
+ "record_shuffle_buffer": 10000,
594
+ "wrap": true,
595
+ "wrap_mode": "stream",
596
+ "wrap_record_buffer_size": 200,
597
+ "owt_cached_chunks": true,
598
+ "owt_chunk_cache_dir": "/e2e-data/evad-tech-vla/wanghan58/data/small_benchmarks/langflow_2604_11748/openwebtext_lta_cached_chunks/gpt2_len1024_train8_compact_overfit",
599
+ "owt_chunk_cache_rebuild": false,
600
+ "owt_chunk_cache_write_batch": 4096,
601
+ "owt_exact_repeat_per_chunk": 64,
602
+ "online_chunk_shuffle": false,
603
+ "online_chunk_shuffle_buffer": 10000,
604
+ "openwebtext_split": "train_minus_100k",
605
+ "detokenizer": "auto",
606
+ "resolved_detokenizer": null,
607
+ "num_workers": 0,
608
+ "latest_every": 1000,
609
+ "resume_path": "runs/train8_ctx1024_randk_p50_path3_rand1_3_unif0_0p25_ctx1024_uniformt_temp1_randk_20260518_010217/latest.pt"
610
+ }
611
+ step=2100 epoch=2100/3000 epoch_step=1/1 micro_steps=2100 elapsed=21.9s lr=2.000000e-03 loss=0.3161 loss_recon=0.3161 loss_meanflow=0.0000 mean_model_t=0.4971 mean_corrupt_t=0.4971 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.5052 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.8973 corrupt_frac=1.0000 acc_corrupt=0.8973 loss_corrupt=0.3161 wrong_frac=0.5028 init_acc_corrupt=0.5262 acc_corrupt_t_0p0_0p2=0.5060 corrupt_frac_t_0p0_0p2=0.2071 acc_corrupt_t_0p2_0p4=0.9982 corrupt_frac_t_0p2_0p4=0.1988 acc_corrupt_t_0p4_0p6=0.9999 corrupt_frac_t_0p4_0p6=0.1976 acc_corrupt_t_0p6_0p8=1.0000 corrupt_frac_t_0p6_0p8=0.1951 acc_corrupt_t_0p8_1p0=1.0000 corrupt_frac_t_0p8_1p0=0.2015 out_w_norm=12.8108 out_g_norm=0.4501 loss_all=0.2830 init_gold_top10=0.7065 init_gold_top100=0.7509 rollout_applied_pos_frac=0.4688 init_acc_rollout_applied=0.5977 init_acc_rollout_kept=0.4381 logit_acc_rollout_applied=0.9050 logit_acc_rollout_kept=0.9239
612
+ step=2200 epoch=2200/3000 epoch_step=1/1 micro_steps=2200 elapsed=21.0s lr=2.000000e-03 loss=0.2827 loss_recon=0.2827 loss_meanflow=0.0000 mean_model_t=0.5015 mean_corrupt_t=0.5015 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.5026 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.9058 corrupt_frac=1.0000 acc_corrupt=0.9058 loss_corrupt=0.2827 wrong_frac=0.4984 init_acc_corrupt=0.5310 acc_corrupt_t_0p0_0p2=0.5314 corrupt_frac_t_0p0_0p2=0.2005 acc_corrupt_t_0p2_0p4=0.9989 corrupt_frac_t_0p2_0p4=0.1979 acc_corrupt_t_0p4_0p6=0.9999 corrupt_frac_t_0p4_0p6=0.1994 acc_corrupt_t_0p6_0p8=1.0000 corrupt_frac_t_0p6_0p8=0.2007 acc_corrupt_t_0p8_1p0=1.0000 corrupt_frac_t_0p8_1p0=0.2015 out_w_norm=12.7548 out_g_norm=0.4159 loss_all=0.3139 init_gold_top10=0.7399 init_gold_top100=0.7944 rollout_applied_pos_frac=0.5391 init_acc_rollout_applied=0.6434 init_acc_rollout_kept=0.4670 logit_acc_rollout_applied=0.9194 logit_acc_rollout_kept=0.8570
613
+ step=2300 epoch=2300/3000 epoch_step=1/1 micro_steps=2300 elapsed=21.0s lr=2.000000e-03 loss=0.2735 loss_recon=0.2735 loss_meanflow=0.0000 mean_model_t=0.5015 mean_corrupt_t=0.5015 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.5045 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.9076 corrupt_frac=1.0000 acc_corrupt=0.9076 loss_corrupt=0.2735 wrong_frac=0.4985 init_acc_corrupt=0.5331 acc_corrupt_t_0p0_0p2=0.5262 corrupt_frac_t_0p0_0p2=0.1947 acc_corrupt_t_0p2_0p4=0.9991 corrupt_frac_t_0p2_0p4=0.2030 acc_corrupt_t_0p4_0p6=0.9999 corrupt_frac_t_0p4_0p6=0.1991 acc_corrupt_t_0p6_0p8=1.0000 corrupt_frac_t_0p6_0p8=0.2045 acc_corrupt_t_0p8_1p0=1.0000 corrupt_frac_t_0p8_1p0=0.1988 out_w_norm=12.6923 out_g_norm=0.3854 loss_all=0.2540 init_gold_top10=0.7547 init_gold_top100=0.7958 rollout_applied_pos_frac=0.5391 init_acc_rollout_applied=0.5763 init_acc_rollout_kept=0.4756 logit_acc_rollout_applied=0.8630 logit_acc_rollout_kept=0.9677
614
+ step=2400 epoch=2400/3000 epoch_step=1/1 micro_steps=2400 elapsed=20.9s lr=2.000000e-03 loss=0.2624 loss_recon=0.2624 loss_meanflow=0.0000 mean_model_t=0.4986 mean_corrupt_t=0.4986 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.4976 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.9102 corrupt_frac=1.0000 acc_corrupt=0.9102 loss_corrupt=0.2624 wrong_frac=0.5016 init_acc_corrupt=0.5291 acc_corrupt_t_0p0_0p2=0.5527 corrupt_frac_t_0p0_0p2=0.2005 acc_corrupt_t_0p2_0p4=0.9993 corrupt_frac_t_0p2_0p4=0.2001 acc_corrupt_t_0p4_0p6=1.0000 corrupt_frac_t_0p4_0p6=0.2023 acc_corrupt_t_0p6_0p8=1.0000 corrupt_frac_t_0p6_0p8=0.1992 acc_corrupt_t_0p8_1p0=1.0000 corrupt_frac_t_0p8_1p0=0.1979 out_w_norm=12.6358 out_g_norm=0.3459 loss_all=0.2486 init_gold_top10=0.7335 init_gold_top100=0.7797 rollout_applied_pos_frac=0.5469 init_acc_rollout_applied=0.6508 init_acc_rollout_kept=0.4091 logit_acc_rollout_applied=0.9490 logit_acc_rollout_kept=0.8727
615
+ step=2500 epoch=2500/3000 epoch_step=1/1 micro_steps=2500 elapsed=21.0s lr=2.000000e-03 loss=0.2686 loss_recon=0.2686 loss_meanflow=0.0000 mean_model_t=0.4977 mean_corrupt_t=0.4977 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.4981 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.9081 corrupt_frac=1.0000 acc_corrupt=0.9081 loss_corrupt=0.2686 wrong_frac=0.5023 init_acc_corrupt=0.5270 acc_corrupt_t_0p0_0p2=0.5543 corrupt_frac_t_0p0_0p2=0.2059 acc_corrupt_t_0p2_0p4=0.9995 corrupt_frac_t_0p2_0p4=0.1985 acc_corrupt_t_0p4_0p6=1.0000 corrupt_frac_t_0p4_0p6=0.1969 acc_corrupt_t_0p6_0p8=1.0000 corrupt_frac_t_0p6_0p8=0.1927 acc_corrupt_t_0p8_1p0=1.0000 corrupt_frac_t_0p8_1p0=0.2059 out_w_norm=12.5776 out_g_norm=0.3313 loss_all=0.2582 init_gold_top10=0.7440 init_gold_top100=0.7876 rollout_applied_pos_frac=0.5234 init_acc_rollout_applied=0.5948 init_acc_rollout_kept=0.4792 logit_acc_rollout_applied=0.9056 logit_acc_rollout_kept=0.9203
616
+ step=2600 epoch=2600/3000 epoch_step=1/1 micro_steps=2600 elapsed=21.0s lr=2.000000e-03 loss=0.2444 loss_recon=0.2444 loss_meanflow=0.0000 mean_model_t=0.5012 mean_corrupt_t=0.5012 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.5030 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.9163 corrupt_frac=1.0000 acc_corrupt=0.9163 loss_corrupt=0.2444 wrong_frac=0.4987 init_acc_corrupt=0.5327 acc_corrupt_t_0p0_0p2=0.5762 corrupt_frac_t_0p0_0p2=0.1972 acc_corrupt_t_0p2_0p4=0.9996 corrupt_frac_t_0p2_0p4=0.2024 acc_corrupt_t_0p4_0p6=1.0000 corrupt_frac_t_0p4_0p6=0.1990 acc_corrupt_t_0p6_0p8=1.0000 corrupt_frac_t_0p6_0p8=0.2017 acc_corrupt_t_0p8_1p0=1.0000 corrupt_frac_t_0p8_1p0=0.1997 out_w_norm=12.5266 out_g_norm=0.3170 loss_all=0.1803 init_gold_top10=0.7868 init_gold_top100=0.8180 rollout_applied_pos_frac=0.4922 init_acc_rollout_applied=0.6804 init_acc_rollout_kept=0.5812 logit_acc_rollout_applied=0.9321 logit_acc_rollout_kept=0.9435
617
+ step=2700 epoch=2700/3000 epoch_step=1/1 micro_steps=2700 elapsed=20.9s lr=2.000000e-03 loss=0.2527 loss_recon=0.2527 loss_meanflow=0.0000 mean_model_t=0.5008 mean_corrupt_t=0.5008 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.4924 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.9135 corrupt_frac=1.0000 acc_corrupt=0.9135 loss_corrupt=0.2527 wrong_frac=0.4994 init_acc_corrupt=0.5296 acc_corrupt_t_0p0_0p2=0.5743 corrupt_frac_t_0p0_0p2=0.2030 acc_corrupt_t_0p2_0p4=0.9997 corrupt_frac_t_0p2_0p4=0.1915 acc_corrupt_t_0p4_0p6=0.9999 corrupt_frac_t_0p4_0p6=0.2023 acc_corrupt_t_0p6_0p8=1.0000 corrupt_frac_t_0p6_0p8=0.2001 acc_corrupt_t_0p8_1p0=1.0000 corrupt_frac_t_0p8_1p0=0.2032 out_w_norm=12.4752 out_g_norm=0.3100 loss_all=0.3246 init_gold_top10=0.7333 init_gold_top100=0.7971 rollout_applied_pos_frac=0.5000 init_acc_rollout_applied=0.5154 init_acc_rollout_kept=0.5183 logit_acc_rollout_applied=0.8861 logit_acc_rollout_kept=0.8862
618
+ step=2800 epoch=2800/3000 epoch_step=1/1 micro_steps=2800 elapsed=21.0s lr=2.000000e-03 loss=0.2473 loss_recon=0.2473 loss_meanflow=0.0000 mean_model_t=0.4962 mean_corrupt_t=0.4962 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.4930 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.9143 corrupt_frac=1.0000 acc_corrupt=0.9143 loss_corrupt=0.2473 wrong_frac=0.5037 init_acc_corrupt=0.5270 acc_corrupt_t_0p0_0p2=0.5771 corrupt_frac_t_0p0_0p2=0.2026 acc_corrupt_t_0p2_0p4=0.9998 corrupt_frac_t_0p2_0p4=0.2020 acc_corrupt_t_0p4_0p6=1.0000 corrupt_frac_t_0p4_0p6=0.2005 acc_corrupt_t_0p6_0p8=1.0000 corrupt_frac_t_0p6_0p8=0.1965 acc_corrupt_t_0p8_1p0=1.0000 corrupt_frac_t_0p8_1p0=0.1984 out_w_norm=12.4323 out_g_norm=0.2977 loss_all=0.2093 init_gold_top10=0.6922 init_gold_top100=0.7384 rollout_applied_pos_frac=0.4609 init_acc_rollout_applied=0.6469 init_acc_rollout_kept=0.4082 logit_acc_rollout_applied=0.9627 logit_acc_rollout_kept=0.8979
619
+ step=2900 epoch=2900/3000 epoch_step=1/1 micro_steps=2900 elapsed=21.1s lr=2.000000e-03 loss=0.2302 loss_recon=0.2302 loss_meanflow=0.0000 mean_model_t=0.5040 mean_corrupt_t=0.5040 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.5050 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.9204 corrupt_frac=1.0000 acc_corrupt=0.9204 loss_corrupt=0.2302 wrong_frac=0.4960 init_acc_corrupt=0.5370 acc_corrupt_t_0p0_0p2=0.5886 corrupt_frac_t_0p0_0p2=0.1931 acc_corrupt_t_0p2_0p4=0.9995 corrupt_frac_t_0p2_0p4=0.2013 acc_corrupt_t_0p4_0p6=1.0000 corrupt_frac_t_0p4_0p6=0.2019 acc_corrupt_t_0p6_0p8=1.0000 corrupt_frac_t_0p6_0p8=0.1981 acc_corrupt_t_0p8_1p0=1.0000 corrupt_frac_t_0p8_1p0=0.2056 out_w_norm=12.4016 out_g_norm=0.2660 loss_all=0.1783 init_gold_top10=0.7274 init_gold_top100=0.7687 rollout_applied_pos_frac=0.4297 init_acc_rollout_applied=0.6324 init_acc_rollout_kept=0.5220 logit_acc_rollout_applied=0.9225 logit_acc_rollout_kept=0.9457
620
+ step=3000 epoch=3000/3000 epoch_step=1/1 micro_steps=3000 elapsed=21.0s lr=2.000000e-03 loss=0.2408 loss_recon=0.2408 loss_meanflow=0.0000 mean_model_t=0.5008 mean_corrupt_t=0.5008 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.4999 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.9173 corrupt_frac=1.0000 acc_corrupt=0.9173 loss_corrupt=0.2408 wrong_frac=0.4993 init_acc_corrupt=0.5327 acc_corrupt_t_0p0_0p2=0.5937 corrupt_frac_t_0p0_0p2=0.2034 acc_corrupt_t_0p2_0p4=0.9998 corrupt_frac_t_0p2_0p4=0.1967 acc_corrupt_t_0p4_0p6=1.0000 corrupt_frac_t_0p4_0p6=0.2010 acc_corrupt_t_0p6_0p8=1.0000 corrupt_frac_t_0p6_0p8=0.1933 acc_corrupt_t_0p8_1p0=1.0000 corrupt_frac_t_0p8_1p0=0.2055 out_w_norm=12.3836 out_g_norm=0.2845 loss_all=0.4039 init_gold_top10=0.7350 init_gold_top100=0.7923 rollout_applied_pos_frac=0.5547 init_acc_rollout_applied=0.6142 init_acc_rollout_kept=0.4294 logit_acc_rollout_applied=0.8752 logit_acc_rollout_kept=0.8304
621
+ NCCL version 2.25.1+cuda12.8
622
+ resumed_from=runs/train8_ctx1024_randk_p50_path3_rand1_3_unif0_0p25_ctx1024_uniformt_temp1_randk_20260518_010217/latest.pt start_step=3001
623
+ {
624
+ "device": "cuda:0",
625
+ "rank": 0,
626
+ "world_size": 4,
627
+ "samples": "owt_cached_chunks:8",
628
+ "vocab_size": 2664,
629
+ "tokenizer_vocab_size": 50257,
630
+ "save_dir": "runs/train8_ctx1024_randk_p50_path3_rand1_3_unif0_0p25_ctx1024_uniformt_temp1_randk_20260518_010217",
631
+ "batch_size": 128,
632
+ "grad_accum": 1,
633
+ "effective_batch_size": 512,
634
+ "global_batch_size": 512,
635
+ "lr_schedule": "constant_warmup",
636
+ "optimizer": "muon",
637
+ "epochs": 0.0,
638
+ "steps_per_epoch": 1,
639
+ "total_steps": 4000,
640
+ "warmup_steps": 10,
641
+ "warmup_epochs": -1.0,
642
+ "min_lr": 0.0,
643
+ "weight_decay": 0.1,
644
+ "output_weight_decay": -1.0,
645
+ "adamw_param_groups": "nanogpt",
646
+ "adam_beta1": 0.9,
647
+ "adam_beta2": 0.95,
648
+ "adam_eps": 1e-08,
649
+ "muon_impl": "legacy",
650
+ "muon_momentum": 0.95,
651
+ "muon_ns_steps": 5,
652
+ "muon_update_scale": 1.0,
653
+ "muon_nesterov": false,
654
+ "muon_width_scale": false,
655
+ "muon_grouping": "legacy_dim_ge_2",
656
+ "muon_param_count": 2616320,
657
+ "muon_adam_param_count": 8192,
658
+ "muon_param_names": [
659
+ "vocab_embed.embedding",
660
+ "sigma_map.net.0.weight",
661
+ "sigma_map.net.2.weight",
662
+ "blocks.0.attn_qkv.weight",
663
+ "blocks.0.attn_out.weight",
664
+ "blocks.0.mlp.0.weight",
665
+ "blocks.0.mlp.2.weight",
666
+ "blocks.0.adaLN_modulation.weight",
667
+ "blocks.1.attn_qkv.weight",
668
+ "blocks.1.attn_out.weight",
669
+ "blocks.1.mlp.0.weight",
670
+ "blocks.1.mlp.2.weight",
671
+ "blocks.1.adaLN_modulation.weight",
672
+ "blocks.2.attn_qkv.weight",
673
+ "blocks.2.attn_out.weight",
674
+ "blocks.2.mlp.0.weight",
675
+ "blocks.2.mlp.2.weight",
676
+ "blocks.2.adaLN_modulation.weight",
677
+ "output_layer.linear.weight",
678
+ "output_layer.adaLN_modulation.weight"
679
+ ],
680
+ "muon_adam_param_names": [
681
+ "sigma_map.net.0.bias",
682
+ "sigma_map.net.2.bias",
683
+ "blocks.0.norm1.weight",
684
+ "blocks.0.norm2.weight",
685
+ "blocks.0.mlp.0.bias",
686
+ "blocks.0.mlp.2.bias",
687
+ "blocks.0.adaLN_modulation.bias",
688
+ "blocks.1.norm1.weight",
689
+ "blocks.1.norm2.weight",
690
+ "blocks.1.mlp.0.bias",
691
+ "blocks.1.mlp.2.bias",
692
+ "blocks.1.adaLN_modulation.bias",
693
+ "blocks.2.norm1.weight",
694
+ "blocks.2.norm2.weight",
695
+ "blocks.2.mlp.0.bias",
696
+ "blocks.2.mlp.2.bias",
697
+ "blocks.2.adaLN_modulation.bias",
698
+ "output_layer.norm_final.weight",
699
+ "output_layer.adaLN_modulation.bias"
700
+ ],
701
+ "muon_effective_nesterov": false,
702
+ "muon_effective_width_scale": false,
703
+ "muon_effective_weight_decay": 0.1,
704
+ "muon_adam_fallback_nesterov": false,
705
+ "muon_adam_fallback_weight_decay": 0.1,
706
+ "ema_decay": 0.9999,
707
+ "ema_start_step": 0,
708
+ "model_type": "ddit",
709
+ "ddit_mlp_type": "gelu",
710
+ "elf_num_time_tokens": 4,
711
+ "elf_num_model_mode_tokens": 0,
712
+ "qk_norm": true,
713
+ "output_bias": false,
714
+ "output_init_std": -1.0,
715
+ "norm_type": "rmsnorm",
716
+ "target_loss": "hard_ce",
717
+ "linear_soft_target_power": 1.0,
718
+ "linear_soft_target_min_conf": 0.0,
719
+ "linear_soft_target_max_conf": 1.0,
720
+ "t_sampling_mode": "uniform",
721
+ "t_sampling_power": 1.0,
722
+ "t_sampling_eps": 0.0001,
723
+ "t_sampling_logit_mean": -1.5,
724
+ "t_sampling_logit_std": 0.8,
725
+ "dual_t": true,
726
+ "corrupt_t_mode": "same",
727
+ "corrupt_min_t": 0.0,
728
+ "corrupt_max_t": 1.0,
729
+ "prefix_block_prob": 0.0,
730
+ "prefix_block_len": 128,
731
+ "mask_ratio_floor_schedule": "none",
732
+ "dirichlet_endpoint_mode": "categorical_dual_t",
733
+ "dirichlet_semantic_t_mode": "same",
734
+ "dirichlet_semantic_t_value": 0.0,
735
+ "dirichlet_semantic_t_curve": "linear",
736
+ "dirichlet_semantic_t_power": 1.0,
737
+ "endpoint_sequence_random_prob_alpha": 0.0,
738
+ "categorical_wrong_from_full_vocab": true,
739
+ "categorical_wrong_from_batch_valid_tokens": false,
740
+ "categorical_wrong_basin_token_ids": "",
741
+ "categorical_wrong_basin_prob": 0.0,
742
+ "categorical_wrong_unigram_prob": 0.0,
743
+ "categorical_wrong_uniform_prob": 0.0,
744
+ "categorical_wrong_prob_floor": 0.0,
745
+ "categorical_wrong_corpus_unigram_path": "",
746
+ "categorical_wrong_corpus_unigram_alpha": 1.0,
747
+ "categorical_wrong_basin_shared_prob": 0.0,
748
+ "categorical_wrong_unigram_shared_prob": 0.0,
749
+ "mask_mixture_original_prob": 0.0,
750
+ "mask_mixture_lowk_prob": 0.0,
751
+ "mask_mixture_lowcorrupt_prob": 0.0,
752
+ "mask_mixture_block_prob": 0.0,
753
+ "mask_mixture_all_prob": 1.0,
754
+ "mask_mixture_lowk_clean_tokens": "0",
755
+ "mask_mixture_lowcorrupt_tokens": "1,2,4,8,16,32,64",
756
+ "mask_mixture_block_tokens": "64,128",
757
+ "simplex_bridge_sampler": "dirichlet",
758
+ "logistic_normal_sigma_min": 0.1,
759
+ "logistic_normal_sigma_max": 1.0,
760
+ "logistic_normal_tau_min": 1.0,
761
+ "logistic_normal_tau_max": 1.0,
762
+ "torch_compile": false,
763
+ "compile_mode": "max-autotune",
764
+ "state_format": "prob",
765
+ "meanflow_weight": 0.0,
766
+ "rollout_train_prob": 0.5,
767
+ "rollout_train_steps": 3,
768
+ "rollout_train_steps_min": 1,
769
+ "rollout_train_infer_steps": 1,
770
+ "rollout_train_time_mode": "sampled_path",
771
+ "rollout_train_s_dist": "uniform",
772
+ "rollout_train_s_min_frac": 0.0,
773
+ "rollout_train_s_max_frac": 0.25,
774
+ "rollout_train_s_beta_alpha": 2.0,
775
+ "rollout_train_s_beta_beta": 6.0,
776
+ "rollout_train_temp": 1.0,
777
+ "rollout_train_max_gamma": 1.0,
778
+ "rollout_train_corrupt_only": true,
779
+ "rollout_train_samplewise": true,
780
+ "rollout_train_compute_always": false,
781
+ "rollout_train_sync_t": true,
782
+ "bridge_noise_init": "logistic_normal",
783
+ "noise_sigma": -1.0,
784
+ "allow_tf32": true,
785
+ "activation_checkpointing": false,
786
+ "activation_checkpoint_interval": 1,
787
+ "activation_checkpoint_scope": "block",
788
+ "ddp_static_graph": false,
789
+ "ddp_gradient_as_bucket_view": true,
790
+ "blocking_data_transfer": false,
791
+ "dataloader_prefetch_factor": 4,
792
+ "full_train_stats": false,
793
+ "tokenized_hf": false,
794
+ "tokenized_pad_token": "pad",
795
+ "elf_conditional_hf": false,
796
+ "record_pad_truncate": false,
797
+ "record_add_eos": false,
798
+ "record_add_special_tokens": false,
799
+ "record_pad_token": "pad",
800
+ "record_shuffle_buffer": 10000,
801
+ "wrap": true,
802
+ "wrap_mode": "stream",
803
+ "wrap_record_buffer_size": 200,
804
+ "owt_cached_chunks": true,
805
+ "owt_chunk_cache_dir": "/e2e-data/evad-tech-vla/wanghan58/data/small_benchmarks/langflow_2604_11748/openwebtext_lta_cached_chunks/gpt2_len1024_train8_compact_overfit",
806
+ "owt_chunk_cache_rebuild": false,
807
+ "owt_chunk_cache_write_batch": 4096,
808
+ "owt_exact_repeat_per_chunk": 64,
809
+ "online_chunk_shuffle": false,
810
+ "online_chunk_shuffle_buffer": 10000,
811
+ "openwebtext_split": "train_minus_100k",
812
+ "detokenizer": "auto",
813
+ "resolved_detokenizer": null,
814
+ "num_workers": 0,
815
+ "latest_every": 1000,
816
+ "resume_path": "runs/train8_ctx1024_randk_p50_path3_rand1_3_unif0_0p25_ctx1024_uniformt_temp1_randk_20260518_010217/latest.pt"
817
+ }
818
+ step=3100 epoch=3100/4000 epoch_step=1/1 micro_steps=3100 elapsed=22.1s lr=2.000000e-03 loss=0.2433 loss_recon=0.2433 loss_meanflow=0.0000 mean_model_t=0.4971 mean_corrupt_t=0.4971 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.5052 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.9151 corrupt_frac=1.0000 acc_corrupt=0.9151 loss_corrupt=0.2433 wrong_frac=0.5028 init_acc_corrupt=0.5289 acc_corrupt_t_0p0_0p2=0.5905 corrupt_frac_t_0p0_0p2=0.2071 acc_corrupt_t_0p2_0p4=0.9997 corrupt_frac_t_0p2_0p4=0.1988 acc_corrupt_t_0p4_0p6=1.0000 corrupt_frac_t_0p4_0p6=0.1976 acc_corrupt_t_0p6_0p8=1.0000 corrupt_frac_t_0p6_0p8=0.1951 acc_corrupt_t_0p8_1p0=1.0000 corrupt_frac_t_0p8_1p0=0.2015 out_w_norm=12.3632 out_g_norm=0.2843 loss_all=0.1842 init_gold_top10=0.7116 init_gold_top100=0.7509 rollout_applied_pos_frac=0.4688 init_acc_rollout_applied=0.6123 init_acc_rollout_kept=0.4381 logit_acc_rollout_applied=0.9172 logit_acc_rollout_kept=0.9454
819
+ step=3200 epoch=3200/4000 epoch_step=1/1 micro_steps=3200 elapsed=21.1s lr=2.000000e-03 loss=0.2336 loss_recon=0.2336 loss_meanflow=0.0000 mean_model_t=0.5015 mean_corrupt_t=0.5015 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.5026 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.9201 corrupt_frac=1.0000 acc_corrupt=0.9201 loss_corrupt=0.2336 wrong_frac=0.4984 init_acc_corrupt=0.5333 acc_corrupt_t_0p0_0p2=0.6020 corrupt_frac_t_0p0_0p2=0.2005 acc_corrupt_t_0p2_0p4=0.9998 corrupt_frac_t_0p2_0p4=0.1979 acc_corrupt_t_0p4_0p6=1.0000 corrupt_frac_t_0p4_0p6=0.1994 acc_corrupt_t_0p6_0p8=1.0000 corrupt_frac_t_0p6_0p8=0.2007 acc_corrupt_t_0p8_1p0=1.0000 corrupt_frac_t_0p8_1p0=0.2015 out_w_norm=12.3521 out_g_norm=0.2786 loss_all=0.2783 init_gold_top10=0.7430 init_gold_top100=0.7944 rollout_applied_pos_frac=0.5391 init_acc_rollout_applied=0.6480 init_acc_rollout_kept=0.4670 logit_acc_rollout_applied=0.9318 logit_acc_rollout_kept=0.8587
820
+ step=3300 epoch=3300/4000 epoch_step=1/1 micro_steps=3300 elapsed=21.1s lr=2.000000e-03 loss=0.2347 loss_recon=0.2347 loss_meanflow=0.0000 mean_model_t=0.5015 mean_corrupt_t=0.5015 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.5045 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.9199 corrupt_frac=1.0000 acc_corrupt=0.9199 loss_corrupt=0.2347 wrong_frac=0.4985 init_acc_corrupt=0.5346 acc_corrupt_t_0p0_0p2=0.5887 corrupt_frac_t_0p0_0p2=0.1947 acc_corrupt_t_0p2_0p4=0.9998 corrupt_frac_t_0p2_0p4=0.2030 acc_corrupt_t_0p4_0p6=1.0000 corrupt_frac_t_0p4_0p6=0.1991 acc_corrupt_t_0p6_0p8=1.0000 corrupt_frac_t_0p6_0p8=0.2045 acc_corrupt_t_0p8_1p0=1.0000 corrupt_frac_t_0p8_1p0=0.1988 out_w_norm=12.3468 out_g_norm=0.2393 loss_all=0.2737 init_gold_top10=0.7531 init_gold_top100=0.7958 rollout_applied_pos_frac=0.5391 init_acc_rollout_applied=0.5829 init_acc_rollout_kept=0.4756 logit_acc_rollout_applied=0.8521 logit_acc_rollout_kept=0.9751
821
+ step=3400 epoch=3400/4000 epoch_step=1/1 micro_steps=3400 elapsed=21.0s lr=2.000000e-03 loss=0.2271 loss_recon=0.2271 loss_meanflow=0.0000 mean_model_t=0.4986 mean_corrupt_t=0.4986 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.4976 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.9210 corrupt_frac=1.0000 acc_corrupt=0.9210 loss_corrupt=0.2271 wrong_frac=0.5016 init_acc_corrupt=0.5301 acc_corrupt_t_0p0_0p2=0.6062 corrupt_frac_t_0p0_0p2=0.2005 acc_corrupt_t_0p2_0p4=0.9999 corrupt_frac_t_0p2_0p4=0.2001 acc_corrupt_t_0p4_0p6=1.0000 corrupt_frac_t_0p4_0p6=0.2023 acc_corrupt_t_0p6_0p8=1.0000 corrupt_frac_t_0p6_0p8=0.1992 acc_corrupt_t_0p8_1p0=1.0000 corrupt_frac_t_0p8_1p0=0.1979 out_w_norm=12.3537 out_g_norm=0.2470 loss_all=0.1980 init_gold_top10=0.7329 init_gold_top100=0.7797 rollout_applied_pos_frac=0.5469 init_acc_rollout_applied=0.6524 init_acc_rollout_kept=0.4091 logit_acc_rollout_applied=0.9412 logit_acc_rollout_kept=0.9176
822
+ step=3500 epoch=3500/4000 epoch_step=1/1 micro_steps=3500 elapsed=21.1s lr=2.000000e-03 loss=0.2349 loss_recon=0.2349 loss_meanflow=0.0000 mean_model_t=0.4977 mean_corrupt_t=0.4977 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.4981 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.9185 corrupt_frac=1.0000 acc_corrupt=0.9185 loss_corrupt=0.2349 wrong_frac=0.5023 init_acc_corrupt=0.5280 acc_corrupt_t_0p0_0p2=0.6042 corrupt_frac_t_0p0_0p2=0.2059 acc_corrupt_t_0p2_0p4=0.9999 corrupt_frac_t_0p2_0p4=0.1985 acc_corrupt_t_0p4_0p6=1.0000 corrupt_frac_t_0p4_0p6=0.1969 acc_corrupt_t_0p6_0p8=1.0000 corrupt_frac_t_0p6_0p8=0.1927 acc_corrupt_t_0p8_1p0=1.0000 corrupt_frac_t_0p8_1p0=0.2059 out_w_norm=12.3571 out_g_norm=0.2425 loss_all=0.1466 init_gold_top10=0.7489 init_gold_top100=0.7876 rollout_applied_pos_frac=0.5234 init_acc_rollout_applied=0.6015 init_acc_rollout_kept=0.4792 logit_acc_rollout_applied=0.9383 logit_acc_rollout_kept=0.9684
823
+ step=3600 epoch=3600/4000 epoch_step=1/1 micro_steps=3600 elapsed=21.1s lr=2.000000e-03 loss=0.2125 loss_recon=0.2125 loss_meanflow=0.0000 mean_model_t=0.5012 mean_corrupt_t=0.5012 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.5030 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.9255 corrupt_frac=1.0000 acc_corrupt=0.9255 loss_corrupt=0.2125 wrong_frac=0.4987 init_acc_corrupt=0.5329 acc_corrupt_t_0p0_0p2=0.6225 corrupt_frac_t_0p0_0p2=0.1972 acc_corrupt_t_0p2_0p4=0.9999 corrupt_frac_t_0p2_0p4=0.2024 acc_corrupt_t_0p4_0p6=1.0000 corrupt_frac_t_0p4_0p6=0.1990 acc_corrupt_t_0p6_0p8=1.0000 corrupt_frac_t_0p6_0p8=0.2017 acc_corrupt_t_0p8_1p0=1.0000 corrupt_frac_t_0p8_1p0=0.1997 out_w_norm=12.3715 out_g_norm=0.2083 loss_all=0.1811 init_gold_top10=0.7849 init_gold_top100=0.8180 rollout_applied_pos_frac=0.4922 init_acc_rollout_applied=0.6842 init_acc_rollout_kept=0.5812 logit_acc_rollout_applied=0.9273 logit_acc_rollout_kept=0.9449
824
+ step=3700 epoch=3700/4000 epoch_step=1/1 micro_steps=3700 elapsed=21.0s lr=2.000000e-03 loss=0.2240 loss_recon=0.2240 loss_meanflow=0.0000 mean_model_t=0.5008 mean_corrupt_t=0.5008 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.4924 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.9226 corrupt_frac=1.0000 acc_corrupt=0.9226 loss_corrupt=0.2240 wrong_frac=0.4994 init_acc_corrupt=0.5301 acc_corrupt_t_0p0_0p2=0.6189 corrupt_frac_t_0p0_0p2=0.2030 acc_corrupt_t_0p2_0p4=0.9999 corrupt_frac_t_0p2_0p4=0.1915 acc_corrupt_t_0p4_0p6=1.0000 corrupt_frac_t_0p4_0p6=0.2023 acc_corrupt_t_0p6_0p8=1.0000 corrupt_frac_t_0p6_0p8=0.2001 acc_corrupt_t_0p8_1p0=1.0000 corrupt_frac_t_0p8_1p0=0.2032 out_w_norm=12.3829 out_g_norm=0.2166 loss_all=0.2939 init_gold_top10=0.7362 init_gold_top100=0.7971 rollout_applied_pos_frac=0.5000 init_acc_rollout_applied=0.5125 init_acc_rollout_kept=0.5183 logit_acc_rollout_applied=0.8871 logit_acc_rollout_kept=0.9070
825
+ step=3800 epoch=3800/4000 epoch_step=1/1 micro_steps=3800 elapsed=21.1s lr=2.000000e-03 loss=0.2217 loss_recon=0.2217 loss_meanflow=0.0000 mean_model_t=0.4962 mean_corrupt_t=0.4962 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.4930 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.9224 corrupt_frac=1.0000 acc_corrupt=0.9224 loss_corrupt=0.2217 wrong_frac=0.5037 init_acc_corrupt=0.5277 acc_corrupt_t_0p0_0p2=0.6173 corrupt_frac_t_0p0_0p2=0.2026 acc_corrupt_t_0p2_0p4=0.9999 corrupt_frac_t_0p2_0p4=0.2020 acc_corrupt_t_0p4_0p6=1.0000 corrupt_frac_t_0p4_0p6=0.2005 acc_corrupt_t_0p6_0p8=1.0000 corrupt_frac_t_0p6_0p8=0.1965 acc_corrupt_t_0p8_1p0=1.0000 corrupt_frac_t_0p8_1p0=0.1984 out_w_norm=12.3995 out_g_norm=0.2179 loss_all=0.1585 init_gold_top10=0.6908 init_gold_top100=0.7384 rollout_applied_pos_frac=0.4609 init_acc_rollout_applied=0.6480 init_acc_rollout_kept=0.4082 logit_acc_rollout_applied=0.9687 logit_acc_rollout_kept=0.9203
826
+ step=3900 epoch=3900/4000 epoch_step=1/1 micro_steps=3900 elapsed=21.2s lr=2.000000e-03 loss=0.2028 loss_recon=0.2028 loss_meanflow=0.0000 mean_model_t=0.5040 mean_corrupt_t=0.5040 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.5050 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.9292 corrupt_frac=1.0000 acc_corrupt=0.9292 loss_corrupt=0.2028 wrong_frac=0.4960 init_acc_corrupt=0.5374 acc_corrupt_t_0p0_0p2=0.6333 corrupt_frac_t_0p0_0p2=0.1931 acc_corrupt_t_0p2_0p4=0.9999 corrupt_frac_t_0p2_0p4=0.2013 acc_corrupt_t_0p4_0p6=1.0000 corrupt_frac_t_0p4_0p6=0.2019 acc_corrupt_t_0p6_0p8=1.0000 corrupt_frac_t_0p6_0p8=0.1981 acc_corrupt_t_0p8_1p0=1.0000 corrupt_frac_t_0p8_1p0=0.2056 out_w_norm=12.4209 out_g_norm=0.1907 loss_all=0.1519 init_gold_top10=0.7240 init_gold_top100=0.7687 rollout_applied_pos_frac=0.4297 init_acc_rollout_applied=0.6257 init_acc_rollout_kept=0.5220 logit_acc_rollout_applied=0.9088 logit_acc_rollout_kept=0.9743
827
+ step=4000 epoch=4000/4000 epoch_step=1/1 micro_steps=4000 elapsed=21.1s lr=2.000000e-03 loss=0.2196 loss_recon=0.2196 loss_meanflow=0.0000 mean_model_t=0.5008 mean_corrupt_t=0.5008 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.4999 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.9237 corrupt_frac=1.0000 acc_corrupt=0.9237 loss_corrupt=0.2196 wrong_frac=0.4993 init_acc_corrupt=0.5335 acc_corrupt_t_0p0_0p2=0.6254 corrupt_frac_t_0p0_0p2=0.2034 acc_corrupt_t_0p2_0p4=0.9998 corrupt_frac_t_0p2_0p4=0.1967 acc_corrupt_t_0p4_0p6=1.0000 corrupt_frac_t_0p4_0p6=0.2010 acc_corrupt_t_0p6_0p8=1.0000 corrupt_frac_t_0p6_0p8=0.1933 acc_corrupt_t_0p8_1p0=1.0000 corrupt_frac_t_0p8_1p0=0.2055 out_w_norm=12.4520 out_g_norm=0.1884 loss_all=0.4319 init_gold_top10=0.7309 init_gold_top100=0.7923 rollout_applied_pos_frac=0.5547 init_acc_rollout_applied=0.6149 init_acc_rollout_kept=0.4294 logit_acc_rollout_applied=0.8693 logit_acc_rollout_kept=0.8358
LTA_openwebtext_dualt/logs/softendpoint_mn_pilot_4gpu/train8_ctx1024_t5tok_p35_rand0_4_unif0_0p25_outwdm1_t5tok_ctx1024_randk_20260518_014800.log ADDED
@@ -0,0 +1,1026 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ NCCL version 2.25.1+cuda12.8
2
+ {
3
+ "device": "cuda:0",
4
+ "rank": 0,
5
+ "world_size": 4,
6
+ "samples": "owt_cached_chunks:8",
7
+ "vocab_size": 2423,
8
+ "tokenizer_vocab_size": 32100,
9
+ "save_dir": "runs/train8_ctx1024_t5tok_p35_rand0_4_unif0_0p25_outwdm1_t5tok_ctx1024_randk_20260518_014800",
10
+ "batch_size": 128,
11
+ "grad_accum": 1,
12
+ "effective_batch_size": 512,
13
+ "global_batch_size": 512,
14
+ "lr_schedule": "constant_warmup",
15
+ "optimizer": "muon",
16
+ "epochs": 0.0,
17
+ "steps_per_epoch": 1,
18
+ "total_steps": 1000,
19
+ "warmup_steps": 10,
20
+ "warmup_epochs": -1.0,
21
+ "min_lr": 0.0,
22
+ "weight_decay": 0.1,
23
+ "output_weight_decay": -1.0,
24
+ "adamw_param_groups": "nanogpt",
25
+ "adam_beta1": 0.9,
26
+ "adam_beta2": 0.95,
27
+ "adam_eps": 1e-08,
28
+ "muon_impl": "legacy",
29
+ "muon_momentum": 0.95,
30
+ "muon_ns_steps": 5,
31
+ "muon_update_scale": 1.0,
32
+ "muon_nesterov": false,
33
+ "muon_width_scale": false,
34
+ "muon_grouping": "legacy_dim_ge_2",
35
+ "muon_param_count": 2523776,
36
+ "muon_adam_param_count": 8192,
37
+ "muon_param_names": [
38
+ "vocab_embed.embedding",
39
+ "sigma_map.net.0.weight",
40
+ "sigma_map.net.2.weight",
41
+ "blocks.0.attn_qkv.weight",
42
+ "blocks.0.attn_out.weight",
43
+ "blocks.0.mlp.0.weight",
44
+ "blocks.0.mlp.2.weight",
45
+ "blocks.0.adaLN_modulation.weight",
46
+ "blocks.1.attn_qkv.weight",
47
+ "blocks.1.attn_out.weight",
48
+ "blocks.1.mlp.0.weight",
49
+ "blocks.1.mlp.2.weight",
50
+ "blocks.1.adaLN_modulation.weight",
51
+ "blocks.2.attn_qkv.weight",
52
+ "blocks.2.attn_out.weight",
53
+ "blocks.2.mlp.0.weight",
54
+ "blocks.2.mlp.2.weight",
55
+ "blocks.2.adaLN_modulation.weight",
56
+ "output_layer.linear.weight",
57
+ "output_layer.adaLN_modulation.weight"
58
+ ],
59
+ "muon_adam_param_names": [
60
+ "sigma_map.net.0.bias",
61
+ "sigma_map.net.2.bias",
62
+ "blocks.0.norm1.weight",
63
+ "blocks.0.norm2.weight",
64
+ "blocks.0.mlp.0.bias",
65
+ "blocks.0.mlp.2.bias",
66
+ "blocks.0.adaLN_modulation.bias",
67
+ "blocks.1.norm1.weight",
68
+ "blocks.1.norm2.weight",
69
+ "blocks.1.mlp.0.bias",
70
+ "blocks.1.mlp.2.bias",
71
+ "blocks.1.adaLN_modulation.bias",
72
+ "blocks.2.norm1.weight",
73
+ "blocks.2.norm2.weight",
74
+ "blocks.2.mlp.0.bias",
75
+ "blocks.2.mlp.2.bias",
76
+ "blocks.2.adaLN_modulation.bias",
77
+ "output_layer.norm_final.weight",
78
+ "output_layer.adaLN_modulation.bias"
79
+ ],
80
+ "muon_effective_nesterov": false,
81
+ "muon_effective_width_scale": false,
82
+ "muon_effective_weight_decay": 0.1,
83
+ "muon_adam_fallback_nesterov": false,
84
+ "muon_adam_fallback_weight_decay": 0.1,
85
+ "ema_decay": 0.9999,
86
+ "ema_start_step": 0,
87
+ "model_type": "ddit",
88
+ "ddit_mlp_type": "gelu",
89
+ "elf_num_time_tokens": 4,
90
+ "elf_num_model_mode_tokens": 0,
91
+ "qk_norm": true,
92
+ "output_bias": false,
93
+ "output_init_std": -1.0,
94
+ "norm_type": "rmsnorm",
95
+ "target_loss": "hard_ce",
96
+ "linear_soft_target_power": 1.0,
97
+ "linear_soft_target_min_conf": 0.0,
98
+ "linear_soft_target_max_conf": 1.0,
99
+ "t_sampling_mode": "uniform",
100
+ "t_sampling_power": 1.0,
101
+ "t_sampling_eps": 0.0001,
102
+ "t_sampling_logit_mean": -1.5,
103
+ "t_sampling_logit_std": 0.8,
104
+ "dual_t": true,
105
+ "corrupt_t_mode": "same",
106
+ "corrupt_min_t": 0.0,
107
+ "corrupt_max_t": 1.0,
108
+ "prefix_block_prob": 0.0,
109
+ "prefix_block_len": 128,
110
+ "mask_ratio_floor_schedule": "none",
111
+ "dirichlet_endpoint_mode": "categorical_dual_t",
112
+ "dirichlet_semantic_t_mode": "same",
113
+ "dirichlet_semantic_t_value": 0.0,
114
+ "dirichlet_semantic_t_curve": "linear",
115
+ "dirichlet_semantic_t_power": 1.0,
116
+ "endpoint_sequence_random_prob_alpha": 0.0,
117
+ "categorical_wrong_from_full_vocab": true,
118
+ "categorical_wrong_from_batch_valid_tokens": false,
119
+ "categorical_wrong_basin_token_ids": "",
120
+ "categorical_wrong_basin_prob": 0.0,
121
+ "categorical_wrong_unigram_prob": 0.0,
122
+ "categorical_wrong_uniform_prob": 0.0,
123
+ "categorical_wrong_prob_floor": 0.0,
124
+ "categorical_wrong_corpus_unigram_path": "",
125
+ "categorical_wrong_corpus_unigram_alpha": 1.0,
126
+ "categorical_wrong_basin_shared_prob": 0.0,
127
+ "categorical_wrong_unigram_shared_prob": 0.0,
128
+ "mask_mixture_original_prob": 0.0,
129
+ "mask_mixture_lowk_prob": 0.0,
130
+ "mask_mixture_lowcorrupt_prob": 0.0,
131
+ "mask_mixture_block_prob": 0.0,
132
+ "mask_mixture_all_prob": 1.0,
133
+ "mask_mixture_lowk_clean_tokens": "0",
134
+ "mask_mixture_lowcorrupt_tokens": "1,2,4,8,16,32,64",
135
+ "mask_mixture_block_tokens": "64,128",
136
+ "simplex_bridge_sampler": "dirichlet",
137
+ "logistic_normal_sigma_min": 0.1,
138
+ "logistic_normal_sigma_max": 1.0,
139
+ "logistic_normal_tau_min": 1.0,
140
+ "logistic_normal_tau_max": 1.0,
141
+ "torch_compile": false,
142
+ "compile_mode": "max-autotune",
143
+ "state_format": "prob",
144
+ "meanflow_weight": 0.0,
145
+ "rollout_train_prob": 0.35,
146
+ "rollout_train_steps": 4,
147
+ "rollout_train_steps_min": 0,
148
+ "rollout_train_infer_steps": 1,
149
+ "rollout_train_time_mode": "sampled_path",
150
+ "rollout_train_s_dist": "uniform",
151
+ "rollout_train_s_min_frac": 0.0,
152
+ "rollout_train_s_max_frac": 0.25,
153
+ "rollout_train_s_beta_alpha": 2.0,
154
+ "rollout_train_s_beta_beta": 6.0,
155
+ "rollout_train_temp": 1.0,
156
+ "rollout_train_max_gamma": 1.0,
157
+ "rollout_train_corrupt_only": true,
158
+ "rollout_train_samplewise": true,
159
+ "rollout_train_compute_always": false,
160
+ "rollout_train_sync_t": true,
161
+ "bridge_noise_init": "logistic_normal",
162
+ "noise_sigma": -1.0,
163
+ "allow_tf32": true,
164
+ "activation_checkpointing": false,
165
+ "activation_checkpoint_interval": 1,
166
+ "activation_checkpoint_scope": "block",
167
+ "ddp_static_graph": false,
168
+ "ddp_gradient_as_bucket_view": true,
169
+ "blocking_data_transfer": false,
170
+ "dataloader_prefetch_factor": 4,
171
+ "full_train_stats": false,
172
+ "tokenized_hf": false,
173
+ "tokenized_pad_token": "pad",
174
+ "elf_conditional_hf": false,
175
+ "record_pad_truncate": false,
176
+ "record_add_eos": false,
177
+ "record_add_special_tokens": false,
178
+ "record_pad_token": "pad",
179
+ "record_shuffle_buffer": 10000,
180
+ "wrap": true,
181
+ "wrap_mode": "stream",
182
+ "wrap_record_buffer_size": 200,
183
+ "owt_cached_chunks": true,
184
+ "owt_chunk_cache_dir": "/e2e-data/evad-tech-vla/wanghan58/data/small_benchmarks/langflow_2604_11748/openwebtext_lta_cached_chunks/t5_len1024_train8_compact_overfit",
185
+ "owt_chunk_cache_rebuild": false,
186
+ "owt_chunk_cache_write_batch": 4096,
187
+ "owt_exact_repeat_per_chunk": 64,
188
+ "online_chunk_shuffle": false,
189
+ "online_chunk_shuffle_buffer": 10000,
190
+ "openwebtext_split": "train_minus_100k",
191
+ "detokenizer": "auto",
192
+ "resolved_detokenizer": null,
193
+ "num_workers": 0,
194
+ "latest_every": 1000,
195
+ "resume_path": ""
196
+ }
197
+ step=100 epoch=100/1000 epoch_step=1/1 micro_steps=100 elapsed=22.3s lr=2.000000e-03 loss=7.3398 loss_recon=7.3398 loss_meanflow=0.0000 mean_model_t=0.5013 mean_corrupt_t=0.5013 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.3505 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.3321 corrupt_frac=1.0000 acc_corrupt=0.3321 loss_corrupt=7.3398 wrong_frac=0.4986 init_acc_corrupt=0.4674 acc_corrupt_t_0p0_0p2=0.0458 corrupt_frac_t_0p0_0p2=0.1952 acc_corrupt_t_0p2_0p4=0.1644 corrupt_frac_t_0p2_0p4=0.2063 acc_corrupt_t_0p4_0p6=0.3261 corrupt_frac_t_0p4_0p6=0.1973 acc_corrupt_t_0p6_0p8=0.4811 corrupt_frac_t_0p6_0p8=0.1976 acc_corrupt_t_0p8_1p0=0.6381 corrupt_frac_t_0p8_1p0=0.2036 out_w_norm=1.0907 out_g_norm=1.0054 loss_all=6.6983 init_gold_top10=0.5036 init_gold_top100=0.6154 rollout_applied_pos_frac=0.3672 init_acc_rollout_applied=0.5092 init_acc_rollout_kept=0.4438 logit_acc_rollout_applied=0.3436 logit_acc_rollout_kept=0.3040
198
+ step=200 epoch=200/1000 epoch_step=1/1 micro_steps=200 elapsed=21.4s lr=2.000000e-03 loss=5.8158 loss_recon=5.8158 loss_meanflow=0.0000 mean_model_t=0.4985 mean_corrupt_t=0.4985 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.3512 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.3270 corrupt_frac=1.0000 acc_corrupt=0.3270 loss_corrupt=5.8158 wrong_frac=0.5014 init_acc_corrupt=0.4644 acc_corrupt_t_0p0_0p2=0.0525 corrupt_frac_t_0p0_0p2=0.2037 acc_corrupt_t_0p2_0p4=0.1627 corrupt_frac_t_0p2_0p4=0.1982 acc_corrupt_t_0p4_0p6=0.3281 corrupt_frac_t_0p4_0p6=0.1956 acc_corrupt_t_0p6_0p8=0.4717 corrupt_frac_t_0p6_0p8=0.2056 acc_corrupt_t_0p8_1p0=0.6239 corrupt_frac_t_0p8_1p0=0.1969 out_w_norm=3.4902 out_g_norm=1.3276 loss_all=5.0556 init_gold_top10=0.5033 init_gold_top100=0.6302 rollout_applied_pos_frac=0.3438 init_acc_rollout_applied=0.5130 init_acc_rollout_kept=0.4511 logit_acc_rollout_applied=0.3732 logit_acc_rollout_kept=0.3346
199
+ step=300 epoch=300/1000 epoch_step=1/1 micro_steps=300 elapsed=21.5s lr=2.000000e-03 loss=4.7529 loss_recon=4.7529 loss_meanflow=0.0000 mean_model_t=0.4953 mean_corrupt_t=0.4953 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.3561 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.3619 corrupt_frac=1.0000 acc_corrupt=0.3619 loss_corrupt=4.7529 wrong_frac=0.5048 init_acc_corrupt=0.4611 acc_corrupt_t_0p0_0p2=0.0554 corrupt_frac_t_0p0_0p2=0.2036 acc_corrupt_t_0p2_0p4=0.1879 corrupt_frac_t_0p2_0p4=0.2030 acc_corrupt_t_0p4_0p6=0.3636 corrupt_frac_t_0p4_0p6=0.2005 acc_corrupt_t_0p6_0p8=0.5241 corrupt_frac_t_0p6_0p8=0.1995 acc_corrupt_t_0p8_1p0=0.6983 corrupt_frac_t_0p8_1p0=0.1934 out_w_norm=5.5748 out_g_norm=0.5497 loss_all=4.3187 init_gold_top10=0.5207 init_gold_top100=0.6468 rollout_applied_pos_frac=0.3203 init_acc_rollout_applied=0.5246 init_acc_rollout_kept=0.4730 logit_acc_rollout_applied=0.4309 logit_acc_rollout_kept=0.3892
200
+ step=400 epoch=400/1000 epoch_step=1/1 micro_steps=400 elapsed=21.3s lr=2.000000e-03 loss=4.1311 loss_recon=4.1311 loss_meanflow=0.0000 mean_model_t=0.4980 mean_corrupt_t=0.4980 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.3477 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.4208 corrupt_frac=1.0000 acc_corrupt=0.4208 loss_corrupt=4.1311 wrong_frac=0.5019 init_acc_corrupt=0.4646 acc_corrupt_t_0p0_0p2=0.0580 corrupt_frac_t_0p0_0p2=0.2061 acc_corrupt_t_0p2_0p4=0.2099 corrupt_frac_t_0p2_0p4=0.1960 acc_corrupt_t_0p4_0p6=0.4188 corrupt_frac_t_0p4_0p6=0.1973 acc_corrupt_t_0p6_0p8=0.6131 corrupt_frac_t_0p6_0p8=0.2020 acc_corrupt_t_0p8_1p0=0.8120 corrupt_frac_t_0p8_1p0=0.1985 out_w_norm=7.1079 out_g_norm=0.2788 loss_all=3.8756 init_gold_top10=0.4993 init_gold_top100=0.6349 rollout_applied_pos_frac=0.3203 init_acc_rollout_applied=0.4744 init_acc_rollout_kept=0.4485 logit_acc_rollout_applied=0.4807 logit_acc_rollout_kept=0.4541
201
+ step=500 epoch=500/1000 epoch_step=1/1 micro_steps=500 elapsed=21.6s lr=2.000000e-03 loss=3.5338 loss_recon=3.5338 loss_meanflow=0.0000 mean_model_t=0.4998 mean_corrupt_t=0.4998 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.3556 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.4850 corrupt_frac=1.0000 acc_corrupt=0.4850 loss_corrupt=3.5338 wrong_frac=0.5002 init_acc_corrupt=0.4672 acc_corrupt_t_0p0_0p2=0.0595 corrupt_frac_t_0p0_0p2=0.1998 acc_corrupt_t_0p2_0p4=0.2389 corrupt_frac_t_0p2_0p4=0.1971 acc_corrupt_t_0p4_0p6=0.5047 corrupt_frac_t_0p4_0p6=0.2008 acc_corrupt_t_0p6_0p8=0.7118 corrupt_frac_t_0p6_0p8=0.2002 acc_corrupt_t_0p8_1p0=0.9014 corrupt_frac_t_0p8_1p0=0.2020 out_w_norm=8.4429 out_g_norm=0.2396 loss_all=3.1683 init_gold_top10=0.5190 init_gold_top100=0.6517 rollout_applied_pos_frac=0.3125 init_acc_rollout_applied=0.4787 init_acc_rollout_kept=0.4850 logit_acc_rollout_applied=0.4965 logit_acc_rollout_kept=0.5102
202
+ step=600 epoch=600/1000 epoch_step=1/1 micro_steps=600 elapsed=21.2s lr=2.000000e-03 loss=3.0899 loss_recon=3.0899 loss_meanflow=0.0000 mean_model_t=0.5009 mean_corrupt_t=0.5009 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.3408 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.4960 corrupt_frac=1.0000 acc_corrupt=0.4960 loss_corrupt=3.0899 wrong_frac=0.4992 init_acc_corrupt=0.4683 acc_corrupt_t_0p0_0p2=0.0615 corrupt_frac_t_0p0_0p2=0.1963 acc_corrupt_t_0p2_0p4=0.2689 corrupt_frac_t_0p2_0p4=0.2016 acc_corrupt_t_0p4_0p6=0.5237 corrupt_frac_t_0p4_0p6=0.2056 acc_corrupt_t_0p6_0p8=0.7171 corrupt_frac_t_0p6_0p8=0.1941 acc_corrupt_t_0p8_1p0=0.9034 corrupt_frac_t_0p8_1p0=0.2024 out_w_norm=9.7107 out_g_norm=0.2350 loss_all=2.8169 init_gold_top10=0.5280 init_gold_top100=0.6629 rollout_applied_pos_frac=0.3828 init_acc_rollout_applied=0.5225 init_acc_rollout_kept=0.4687 logit_acc_rollout_applied=0.5480 logit_acc_rollout_kept=0.5012
203
+ step=700 epoch=700/1000 epoch_step=1/1 micro_steps=700 elapsed=21.5s lr=2.000000e-03 loss=2.7701 loss_recon=2.7701 loss_meanflow=0.0000 mean_model_t=0.5003 mean_corrupt_t=0.5003 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.3456 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.5024 corrupt_frac=1.0000 acc_corrupt=0.5024 loss_corrupt=2.7701 wrong_frac=0.4998 init_acc_corrupt=0.4679 acc_corrupt_t_0p0_0p2=0.0632 corrupt_frac_t_0p0_0p2=0.2006 acc_corrupt_t_0p2_0p4=0.2825 corrupt_frac_t_0p2_0p4=0.1945 acc_corrupt_t_0p4_0p6=0.5328 corrupt_frac_t_0p4_0p6=0.2064 acc_corrupt_t_0p6_0p8=0.7215 corrupt_frac_t_0p6_0p8=0.1962 acc_corrupt_t_0p8_1p0=0.9055 corrupt_frac_t_0p8_1p0=0.2023 out_w_norm=10.7214 out_g_norm=0.2984 loss_all=2.4240 init_gold_top10=0.5449 init_gold_top100=0.6994 rollout_applied_pos_frac=0.3359 init_acc_rollout_applied=0.4338 init_acc_rollout_kept=0.5231 logit_acc_rollout_applied=0.4746 logit_acc_rollout_kept=0.5618
204
+ step=800 epoch=800/1000 epoch_step=1/1 micro_steps=800 elapsed=21.5s lr=2.000000e-03 loss=2.3140 loss_recon=2.3140 loss_meanflow=0.0000 mean_model_t=0.4990 mean_corrupt_t=0.4990 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.3480 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.5340 corrupt_frac=1.0000 acc_corrupt=0.5340 loss_corrupt=2.3140 wrong_frac=0.5010 init_acc_corrupt=0.4673 acc_corrupt_t_0p0_0p2=0.0627 corrupt_frac_t_0p0_0p2=0.2005 acc_corrupt_t_0p2_0p4=0.3125 corrupt_frac_t_0p2_0p4=0.2004 acc_corrupt_t_0p4_0p6=0.5945 corrupt_frac_t_0p4_0p6=0.1991 acc_corrupt_t_0p6_0p8=0.7750 corrupt_frac_t_0p6_0p8=0.2018 acc_corrupt_t_0p8_1p0=0.9285 corrupt_frac_t_0p8_1p0=0.1982 out_w_norm=11.2606 out_g_norm=0.3675 loss_all=1.9381 init_gold_top10=0.5523 init_gold_top100=0.6877 rollout_applied_pos_frac=0.3438 init_acc_rollout_applied=0.4343 init_acc_rollout_kept=0.5056 logit_acc_rollout_applied=0.5323 logit_acc_rollout_kept=0.6172
205
+ step=900 epoch=900/1000 epoch_step=1/1 micro_steps=900 elapsed=21.5s lr=2.000000e-03 loss=1.8331 loss_recon=1.8331 loss_meanflow=0.0000 mean_model_t=0.5027 mean_corrupt_t=0.5027 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.3570 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.6092 corrupt_frac=1.0000 acc_corrupt=0.6092 loss_corrupt=1.8331 wrong_frac=0.4975 init_acc_corrupt=0.4744 acc_corrupt_t_0p0_0p2=0.0647 corrupt_frac_t_0p0_0p2=0.1991 acc_corrupt_t_0p2_0p4=0.3814 corrupt_frac_t_0p2_0p4=0.1997 acc_corrupt_t_0p4_0p6=0.7313 corrupt_frac_t_0p4_0p6=0.1958 acc_corrupt_t_0p6_0p8=0.8878 corrupt_frac_t_0p6_0p8=0.1995 acc_corrupt_t_0p8_1p0=0.9707 corrupt_frac_t_0p8_1p0=0.2059 out_w_norm=11.7391 out_g_norm=0.4679 loss_all=1.6215 init_gold_top10=0.5723 init_gold_top100=0.7072 rollout_applied_pos_frac=0.3438 init_acc_rollout_applied=0.5013 init_acc_rollout_kept=0.4759 logit_acc_rollout_applied=0.6607 logit_acc_rollout_kept=0.6511
206
+ step=1000 epoch=1000/1000 epoch_step=1/1 micro_steps=1000 elapsed=21.4s lr=2.000000e-03 loss=1.5052 loss_recon=1.5052 loss_meanflow=0.0000 mean_model_t=0.5005 mean_corrupt_t=0.5005 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.3489 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.6756 corrupt_frac=1.0000 acc_corrupt=0.6756 loss_corrupt=1.5052 wrong_frac=0.4995 init_acc_corrupt=0.4803 acc_corrupt_t_0p0_0p2=0.0712 corrupt_frac_t_0p0_0p2=0.2029 acc_corrupt_t_0p2_0p4=0.4982 corrupt_frac_t_0p2_0p4=0.1965 acc_corrupt_t_0p4_0p6=0.8585 corrupt_frac_t_0p4_0p6=0.1988 acc_corrupt_t_0p6_0p8=0.9612 corrupt_frac_t_0p6_0p8=0.2028 acc_corrupt_t_0p8_1p0=0.9934 corrupt_frac_t_0p8_1p0=0.1990 out_w_norm=12.1101 out_g_norm=0.5194 loss_all=1.0420 init_gold_top10=0.6055 init_gold_top100=0.7165 rollout_applied_pos_frac=0.3125 init_acc_rollout_applied=0.5002 init_acc_rollout_kept=0.5045 logit_acc_rollout_applied=0.7523 logit_acc_rollout_kept=0.7499
207
+ NCCL version 2.25.1+cuda12.8
208
+ resumed_from=runs/train8_ctx1024_t5tok_p35_rand0_4_unif0_0p25_outwdm1_t5tok_ctx1024_randk_20260518_014800/latest.pt start_step=1001
209
+ {
210
+ "device": "cuda:0",
211
+ "rank": 0,
212
+ "world_size": 4,
213
+ "samples": "owt_cached_chunks:8",
214
+ "vocab_size": 2423,
215
+ "tokenizer_vocab_size": 32100,
216
+ "save_dir": "runs/train8_ctx1024_t5tok_p35_rand0_4_unif0_0p25_outwdm1_t5tok_ctx1024_randk_20260518_014800",
217
+ "batch_size": 128,
218
+ "grad_accum": 1,
219
+ "effective_batch_size": 512,
220
+ "global_batch_size": 512,
221
+ "lr_schedule": "constant_warmup",
222
+ "optimizer": "muon",
223
+ "epochs": 0.0,
224
+ "steps_per_epoch": 1,
225
+ "total_steps": 2000,
226
+ "warmup_steps": 10,
227
+ "warmup_epochs": -1.0,
228
+ "min_lr": 0.0,
229
+ "weight_decay": 0.1,
230
+ "output_weight_decay": -1.0,
231
+ "adamw_param_groups": "nanogpt",
232
+ "adam_beta1": 0.9,
233
+ "adam_beta2": 0.95,
234
+ "adam_eps": 1e-08,
235
+ "muon_impl": "legacy",
236
+ "muon_momentum": 0.95,
237
+ "muon_ns_steps": 5,
238
+ "muon_update_scale": 1.0,
239
+ "muon_nesterov": false,
240
+ "muon_width_scale": false,
241
+ "muon_grouping": "legacy_dim_ge_2",
242
+ "muon_param_count": 2523776,
243
+ "muon_adam_param_count": 8192,
244
+ "muon_param_names": [
245
+ "vocab_embed.embedding",
246
+ "sigma_map.net.0.weight",
247
+ "sigma_map.net.2.weight",
248
+ "blocks.0.attn_qkv.weight",
249
+ "blocks.0.attn_out.weight",
250
+ "blocks.0.mlp.0.weight",
251
+ "blocks.0.mlp.2.weight",
252
+ "blocks.0.adaLN_modulation.weight",
253
+ "blocks.1.attn_qkv.weight",
254
+ "blocks.1.attn_out.weight",
255
+ "blocks.1.mlp.0.weight",
256
+ "blocks.1.mlp.2.weight",
257
+ "blocks.1.adaLN_modulation.weight",
258
+ "blocks.2.attn_qkv.weight",
259
+ "blocks.2.attn_out.weight",
260
+ "blocks.2.mlp.0.weight",
261
+ "blocks.2.mlp.2.weight",
262
+ "blocks.2.adaLN_modulation.weight",
263
+ "output_layer.linear.weight",
264
+ "output_layer.adaLN_modulation.weight"
265
+ ],
266
+ "muon_adam_param_names": [
267
+ "sigma_map.net.0.bias",
268
+ "sigma_map.net.2.bias",
269
+ "blocks.0.norm1.weight",
270
+ "blocks.0.norm2.weight",
271
+ "blocks.0.mlp.0.bias",
272
+ "blocks.0.mlp.2.bias",
273
+ "blocks.0.adaLN_modulation.bias",
274
+ "blocks.1.norm1.weight",
275
+ "blocks.1.norm2.weight",
276
+ "blocks.1.mlp.0.bias",
277
+ "blocks.1.mlp.2.bias",
278
+ "blocks.1.adaLN_modulation.bias",
279
+ "blocks.2.norm1.weight",
280
+ "blocks.2.norm2.weight",
281
+ "blocks.2.mlp.0.bias",
282
+ "blocks.2.mlp.2.bias",
283
+ "blocks.2.adaLN_modulation.bias",
284
+ "output_layer.norm_final.weight",
285
+ "output_layer.adaLN_modulation.bias"
286
+ ],
287
+ "muon_effective_nesterov": false,
288
+ "muon_effective_width_scale": false,
289
+ "muon_effective_weight_decay": 0.1,
290
+ "muon_adam_fallback_nesterov": false,
291
+ "muon_adam_fallback_weight_decay": 0.1,
292
+ "ema_decay": 0.9999,
293
+ "ema_start_step": 0,
294
+ "model_type": "ddit",
295
+ "ddit_mlp_type": "gelu",
296
+ "elf_num_time_tokens": 4,
297
+ "elf_num_model_mode_tokens": 0,
298
+ "qk_norm": true,
299
+ "output_bias": false,
300
+ "output_init_std": -1.0,
301
+ "norm_type": "rmsnorm",
302
+ "target_loss": "hard_ce",
303
+ "linear_soft_target_power": 1.0,
304
+ "linear_soft_target_min_conf": 0.0,
305
+ "linear_soft_target_max_conf": 1.0,
306
+ "t_sampling_mode": "uniform",
307
+ "t_sampling_power": 1.0,
308
+ "t_sampling_eps": 0.0001,
309
+ "t_sampling_logit_mean": -1.5,
310
+ "t_sampling_logit_std": 0.8,
311
+ "dual_t": true,
312
+ "corrupt_t_mode": "same",
313
+ "corrupt_min_t": 0.0,
314
+ "corrupt_max_t": 1.0,
315
+ "prefix_block_prob": 0.0,
316
+ "prefix_block_len": 128,
317
+ "mask_ratio_floor_schedule": "none",
318
+ "dirichlet_endpoint_mode": "categorical_dual_t",
319
+ "dirichlet_semantic_t_mode": "same",
320
+ "dirichlet_semantic_t_value": 0.0,
321
+ "dirichlet_semantic_t_curve": "linear",
322
+ "dirichlet_semantic_t_power": 1.0,
323
+ "endpoint_sequence_random_prob_alpha": 0.0,
324
+ "categorical_wrong_from_full_vocab": true,
325
+ "categorical_wrong_from_batch_valid_tokens": false,
326
+ "categorical_wrong_basin_token_ids": "",
327
+ "categorical_wrong_basin_prob": 0.0,
328
+ "categorical_wrong_unigram_prob": 0.0,
329
+ "categorical_wrong_uniform_prob": 0.0,
330
+ "categorical_wrong_prob_floor": 0.0,
331
+ "categorical_wrong_corpus_unigram_path": "",
332
+ "categorical_wrong_corpus_unigram_alpha": 1.0,
333
+ "categorical_wrong_basin_shared_prob": 0.0,
334
+ "categorical_wrong_unigram_shared_prob": 0.0,
335
+ "mask_mixture_original_prob": 0.0,
336
+ "mask_mixture_lowk_prob": 0.0,
337
+ "mask_mixture_lowcorrupt_prob": 0.0,
338
+ "mask_mixture_block_prob": 0.0,
339
+ "mask_mixture_all_prob": 1.0,
340
+ "mask_mixture_lowk_clean_tokens": "0",
341
+ "mask_mixture_lowcorrupt_tokens": "1,2,4,8,16,32,64",
342
+ "mask_mixture_block_tokens": "64,128",
343
+ "simplex_bridge_sampler": "dirichlet",
344
+ "logistic_normal_sigma_min": 0.1,
345
+ "logistic_normal_sigma_max": 1.0,
346
+ "logistic_normal_tau_min": 1.0,
347
+ "logistic_normal_tau_max": 1.0,
348
+ "torch_compile": false,
349
+ "compile_mode": "max-autotune",
350
+ "state_format": "prob",
351
+ "meanflow_weight": 0.0,
352
+ "rollout_train_prob": 0.35,
353
+ "rollout_train_steps": 4,
354
+ "rollout_train_steps_min": 0,
355
+ "rollout_train_infer_steps": 1,
356
+ "rollout_train_time_mode": "sampled_path",
357
+ "rollout_train_s_dist": "uniform",
358
+ "rollout_train_s_min_frac": 0.0,
359
+ "rollout_train_s_max_frac": 0.25,
360
+ "rollout_train_s_beta_alpha": 2.0,
361
+ "rollout_train_s_beta_beta": 6.0,
362
+ "rollout_train_temp": 1.0,
363
+ "rollout_train_max_gamma": 1.0,
364
+ "rollout_train_corrupt_only": true,
365
+ "rollout_train_samplewise": true,
366
+ "rollout_train_compute_always": false,
367
+ "rollout_train_sync_t": true,
368
+ "bridge_noise_init": "logistic_normal",
369
+ "noise_sigma": -1.0,
370
+ "allow_tf32": true,
371
+ "activation_checkpointing": false,
372
+ "activation_checkpoint_interval": 1,
373
+ "activation_checkpoint_scope": "block",
374
+ "ddp_static_graph": false,
375
+ "ddp_gradient_as_bucket_view": true,
376
+ "blocking_data_transfer": false,
377
+ "dataloader_prefetch_factor": 4,
378
+ "full_train_stats": false,
379
+ "tokenized_hf": false,
380
+ "tokenized_pad_token": "pad",
381
+ "elf_conditional_hf": false,
382
+ "record_pad_truncate": false,
383
+ "record_add_eos": false,
384
+ "record_add_special_tokens": false,
385
+ "record_pad_token": "pad",
386
+ "record_shuffle_buffer": 10000,
387
+ "wrap": true,
388
+ "wrap_mode": "stream",
389
+ "wrap_record_buffer_size": 200,
390
+ "owt_cached_chunks": true,
391
+ "owt_chunk_cache_dir": "/e2e-data/evad-tech-vla/wanghan58/data/small_benchmarks/langflow_2604_11748/openwebtext_lta_cached_chunks/t5_len1024_train8_compact_overfit",
392
+ "owt_chunk_cache_rebuild": false,
393
+ "owt_chunk_cache_write_batch": 4096,
394
+ "owt_exact_repeat_per_chunk": 64,
395
+ "online_chunk_shuffle": false,
396
+ "online_chunk_shuffle_buffer": 10000,
397
+ "openwebtext_split": "train_minus_100k",
398
+ "detokenizer": "auto",
399
+ "resolved_detokenizer": null,
400
+ "num_workers": 0,
401
+ "latest_every": 1000,
402
+ "resume_path": "runs/train8_ctx1024_t5tok_p35_rand0_4_unif0_0p25_outwdm1_t5tok_ctx1024_randk_20260518_014800/latest.pt"
403
+ }
404
+ step=1100 epoch=1100/2000 epoch_step=1/1 micro_steps=1100 elapsed=22.4s lr=2.000000e-03 loss=1.2767 loss_recon=1.2767 loss_meanflow=0.0000 mean_model_t=0.5013 mean_corrupt_t=0.5013 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.3505 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.7230 corrupt_frac=1.0000 acc_corrupt=0.7230 loss_corrupt=1.2767 wrong_frac=0.4986 init_acc_corrupt=0.4886 acc_corrupt_t_0p0_0p2=0.0808 corrupt_frac_t_0p0_0p2=0.1952 acc_corrupt_t_0p2_0p4=0.6060 corrupt_frac_t_0p2_0p4=0.2063 acc_corrupt_t_0p4_0p6=0.9317 corrupt_frac_t_0p4_0p6=0.1973 acc_corrupt_t_0p6_0p8=0.9879 corrupt_frac_t_0p6_0p8=0.1976 acc_corrupt_t_0p8_1p0=0.9984 corrupt_frac_t_0p8_1p0=0.2036 out_w_norm=12.3788 out_g_norm=0.6042 loss_all=1.0626 init_gold_top10=0.5895 init_gold_top100=0.6964 rollout_applied_pos_frac=0.3672 init_acc_rollout_applied=0.5416 init_acc_rollout_kept=0.4438 logit_acc_rollout_applied=0.7898 logit_acc_rollout_kept=0.7326
405
+ step=1200 epoch=1200/2000 epoch_step=1/1 micro_steps=1200 elapsed=21.5s lr=2.000000e-03 loss=1.1235 loss_recon=1.1235 loss_meanflow=0.0000 mean_model_t=0.4985 mean_corrupt_t=0.4985 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.3512 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.7536 corrupt_frac=1.0000 acc_corrupt=0.7536 loss_corrupt=1.1235 wrong_frac=0.5014 init_acc_corrupt=0.4914 acc_corrupt_t_0p0_0p2=0.1035 corrupt_frac_t_0p0_0p2=0.2037 acc_corrupt_t_0p2_0p4=0.7157 corrupt_frac_t_0p2_0p4=0.1982 acc_corrupt_t_0p4_0p6=0.9669 corrupt_frac_t_0p4_0p6=0.1956 acc_corrupt_t_0p6_0p8=0.9956 corrupt_frac_t_0p6_0p8=0.2056 acc_corrupt_t_0p8_1p0=0.9995 corrupt_frac_t_0p8_1p0=0.1969 out_w_norm=12.5808 out_g_norm=0.6078 loss_all=0.9406 init_gold_top10=0.5981 init_gold_top100=0.7016 rollout_applied_pos_frac=0.3438 init_acc_rollout_applied=0.6352 init_acc_rollout_kept=0.4511 logit_acc_rollout_applied=0.8745 logit_acc_rollout_kept=0.7457
406
+ step=1300 epoch=1300/2000 epoch_step=1/1 micro_steps=1300 elapsed=21.6s lr=2.000000e-03 loss=0.9872 loss_recon=0.9872 loss_meanflow=0.0000 mean_model_t=0.4953 mean_corrupt_t=0.4953 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.3561 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.7801 corrupt_frac=1.0000 acc_corrupt=0.7801 loss_corrupt=0.9872 wrong_frac=0.5048 init_acc_corrupt=0.4923 acc_corrupt_t_0p0_0p2=0.1360 corrupt_frac_t_0p0_0p2=0.2036 acc_corrupt_t_0p2_0p4=0.8013 corrupt_frac_t_0p2_0p4=0.2030 acc_corrupt_t_0p4_0p6=0.9834 corrupt_frac_t_0p4_0p6=0.2005 acc_corrupt_t_0p6_0p8=0.9982 corrupt_frac_t_0p6_0p8=0.1995 acc_corrupt_t_0p8_1p0=0.9998 corrupt_frac_t_0p8_1p0=0.1934 out_w_norm=12.7435 out_g_norm=0.6096 loss_all=0.9355 init_gold_top10=0.5925 init_gold_top100=0.7038 rollout_applied_pos_frac=0.3203 init_acc_rollout_applied=0.6028 init_acc_rollout_kept=0.4730 logit_acc_rollout_applied=0.8285 logit_acc_rollout_kept=0.7765
407
+ step=1400 epoch=1400/2000 epoch_step=1/1 micro_steps=1400 elapsed=21.4s lr=2.000000e-03 loss=0.8762 loss_recon=0.8762 loss_meanflow=0.0000 mean_model_t=0.4980 mean_corrupt_t=0.4980 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.3477 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.8016 corrupt_frac=1.0000 acc_corrupt=0.8016 loss_corrupt=0.8762 wrong_frac=0.5019 init_acc_corrupt=0.4999 acc_corrupt_t_0p0_0p2=0.1713 corrupt_frac_t_0p0_0p2=0.2061 acc_corrupt_t_0p2_0p4=0.8680 corrupt_frac_t_0p2_0p4=0.1960 acc_corrupt_t_0p4_0p6=0.9922 corrupt_frac_t_0p4_0p6=0.1973 acc_corrupt_t_0p6_0p8=0.9990 corrupt_frac_t_0p6_0p8=0.2020 acc_corrupt_t_0p8_1p0=0.9998 corrupt_frac_t_0p8_1p0=0.1985 out_w_norm=12.8779 out_g_norm=0.6266 loss_all=1.0431 init_gold_top10=0.5569 init_gold_top100=0.6933 rollout_applied_pos_frac=0.3203 init_acc_rollout_applied=0.5252 init_acc_rollout_kept=0.4485 logit_acc_rollout_applied=0.7440 logit_acc_rollout_kept=0.7830
408
+ step=1500 epoch=1500/2000 epoch_step=1/1 micro_steps=1500 elapsed=21.7s lr=2.000000e-03 loss=0.7811 loss_recon=0.7811 loss_meanflow=0.0000 mean_model_t=0.4998 mean_corrupt_t=0.4998 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.3556 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.8210 corrupt_frac=1.0000 acc_corrupt=0.8210 loss_corrupt=0.7811 wrong_frac=0.5002 init_acc_corrupt=0.5047 acc_corrupt_t_0p0_0p2=0.1993 corrupt_frac_t_0p0_0p2=0.1998 acc_corrupt_t_0p2_0p4=0.9085 corrupt_frac_t_0p2_0p4=0.1971 acc_corrupt_t_0p4_0p6=0.9959 corrupt_frac_t_0p4_0p6=0.2008 acc_corrupt_t_0p6_0p8=0.9994 corrupt_frac_t_0p6_0p8=0.2002 acc_corrupt_t_0p8_1p0=0.9999 corrupt_frac_t_0p8_1p0=0.2020 out_w_norm=12.9795 out_g_norm=0.6605 loss_all=0.7407 init_gold_top10=0.6019 init_gold_top100=0.7049 rollout_applied_pos_frac=0.3125 init_acc_rollout_applied=0.5921 init_acc_rollout_kept=0.4850 logit_acc_rollout_applied=0.8213 logit_acc_rollout_kept=0.8415
409
+ step=1600 epoch=1600/2000 epoch_step=1/1 micro_steps=1600 elapsed=21.3s lr=2.000000e-03 loss=0.6912 loss_recon=0.6912 loss_meanflow=0.0000 mean_model_t=0.5009 mean_corrupt_t=0.5009 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.3408 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.8386 corrupt_frac=1.0000 acc_corrupt=0.8386 loss_corrupt=0.6912 wrong_frac=0.4992 init_acc_corrupt=0.5091 acc_corrupt_t_0p0_0p2=0.2368 corrupt_frac_t_0p0_0p2=0.1963 acc_corrupt_t_0p2_0p4=0.9448 corrupt_frac_t_0p2_0p4=0.2016 acc_corrupt_t_0p4_0p6=0.9979 corrupt_frac_t_0p4_0p6=0.2056 acc_corrupt_t_0p6_0p8=0.9996 corrupt_frac_t_0p6_0p8=0.1941 acc_corrupt_t_0p8_1p0=0.9999 corrupt_frac_t_0p8_1p0=0.2024 out_w_norm=13.0625 out_g_norm=0.6266 loss_all=0.6583 init_gold_top10=0.6102 init_gold_top100=0.7189 rollout_applied_pos_frac=0.3828 init_acc_rollout_applied=0.5999 init_acc_rollout_kept=0.4687 logit_acc_rollout_applied=0.8509 logit_acc_rollout_kept=0.8435
410
+ step=1700 epoch=1700/2000 epoch_step=1/1 micro_steps=1700 elapsed=21.6s lr=2.000000e-03 loss=0.6448 loss_recon=0.6448 loss_meanflow=0.0000 mean_model_t=0.5003 mean_corrupt_t=0.5003 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.3456 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.8462 corrupt_frac=1.0000 acc_corrupt=0.8462 loss_corrupt=0.6448 wrong_frac=0.4998 init_acc_corrupt=0.5116 acc_corrupt_t_0p0_0p2=0.2707 corrupt_frac_t_0p0_0p2=0.2006 acc_corrupt_t_0p2_0p4=0.9633 corrupt_frac_t_0p2_0p4=0.1945 acc_corrupt_t_0p4_0p6=0.9987 corrupt_frac_t_0p4_0p6=0.2064 acc_corrupt_t_0p6_0p8=0.9998 corrupt_frac_t_0p6_0p8=0.1962 acc_corrupt_t_0p8_1p0=0.9999 corrupt_frac_t_0p8_1p0=0.2023 out_w_norm=13.1160 out_g_norm=0.5624 loss_all=0.5461 init_gold_top10=0.6344 init_gold_top100=0.7312 rollout_applied_pos_frac=0.3359 init_acc_rollout_applied=0.5986 init_acc_rollout_kept=0.5231 logit_acc_rollout_applied=0.8390 logit_acc_rollout_kept=0.8845
411
+ step=1800 epoch=1800/2000 epoch_step=1/1 micro_steps=1800 elapsed=21.6s lr=2.000000e-03 loss=0.6028 loss_recon=0.6028 loss_meanflow=0.0000 mean_model_t=0.4990 mean_corrupt_t=0.4990 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.3480 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.8527 corrupt_frac=1.0000 acc_corrupt=0.8527 loss_corrupt=0.6028 wrong_frac=0.5010 init_acc_corrupt=0.5109 acc_corrupt_t_0p0_0p2=0.2903 corrupt_frac_t_0p0_0p2=0.2005 acc_corrupt_t_0p2_0p4=0.9758 corrupt_frac_t_0p2_0p4=0.2004 acc_corrupt_t_0p4_0p6=0.9992 corrupt_frac_t_0p4_0p6=0.1991 acc_corrupt_t_0p6_0p8=0.9998 corrupt_frac_t_0p6_0p8=0.2018 acc_corrupt_t_0p8_1p0=0.9999 corrupt_frac_t_0p8_1p0=0.1982 out_w_norm=13.1521 out_g_norm=0.5877 loss_all=0.5062 init_gold_top10=0.6120 init_gold_top100=0.7186 rollout_applied_pos_frac=0.3438 init_acc_rollout_applied=0.5046 init_acc_rollout_kept=0.5056 logit_acc_rollout_applied=0.8045 logit_acc_rollout_kept=0.9070
412
+ step=1900 epoch=1900/2000 epoch_step=1/1 micro_steps=1900 elapsed=21.7s lr=2.000000e-03 loss=0.5329 loss_recon=0.5329 loss_meanflow=0.0000 mean_model_t=0.5027 mean_corrupt_t=0.5027 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.3570 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.8641 corrupt_frac=1.0000 acc_corrupt=0.8641 loss_corrupt=0.5329 wrong_frac=0.4975 init_acc_corrupt=0.5146 acc_corrupt_t_0p0_0p2=0.3359 corrupt_frac_t_0p0_0p2=0.1991 acc_corrupt_t_0p2_0p4=0.9823 corrupt_frac_t_0p2_0p4=0.1997 acc_corrupt_t_0p4_0p6=0.9994 corrupt_frac_t_0p4_0p6=0.1958 acc_corrupt_t_0p6_0p8=0.9999 corrupt_frac_t_0p6_0p8=0.1995 acc_corrupt_t_0p8_1p0=0.9999 corrupt_frac_t_0p8_1p0=0.2059 out_w_norm=13.1712 out_g_norm=0.5743 loss_all=0.5300 init_gold_top10=0.6270 init_gold_top100=0.7250 rollout_applied_pos_frac=0.3438 init_acc_rollout_applied=0.6050 init_acc_rollout_kept=0.4759 logit_acc_rollout_applied=0.8912 logit_acc_rollout_kept=0.8507
413
+ step=2000 epoch=2000/2000 epoch_step=1/1 micro_steps=2000 elapsed=21.6s lr=2.000000e-03 loss=0.4949 loss_recon=0.4949 loss_meanflow=0.0000 mean_model_t=0.5005 mean_corrupt_t=0.5005 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.3489 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.8686 corrupt_frac=1.0000 acc_corrupt=0.8686 loss_corrupt=0.4949 wrong_frac=0.4995 init_acc_corrupt=0.5134 acc_corrupt_t_0p0_0p2=0.3647 corrupt_frac_t_0p0_0p2=0.2029 acc_corrupt_t_0p2_0p4=0.9877 corrupt_frac_t_0p2_0p4=0.1965 acc_corrupt_t_0p4_0p6=0.9996 corrupt_frac_t_0p4_0p6=0.1988 acc_corrupt_t_0p6_0p8=0.9999 corrupt_frac_t_0p6_0p8=0.2028 acc_corrupt_t_0p8_1p0=1.0000 corrupt_frac_t_0p8_1p0=0.1990 out_w_norm=13.1851 out_g_norm=0.5259 loss_all=0.2158 init_gold_top10=0.6500 init_gold_top100=0.7168 rollout_applied_pos_frac=0.3125 init_acc_rollout_applied=0.6294 init_acc_rollout_kept=0.5045 logit_acc_rollout_applied=0.9802 logit_acc_rollout_kept=0.9204
414
+ NCCL version 2.25.1+cuda12.8
415
+ resumed_from=runs/train8_ctx1024_t5tok_p35_rand0_4_unif0_0p25_outwdm1_t5tok_ctx1024_randk_20260518_014800/latest.pt start_step=2001
416
+ {
417
+ "device": "cuda:0",
418
+ "rank": 0,
419
+ "world_size": 4,
420
+ "samples": "owt_cached_chunks:8",
421
+ "vocab_size": 2423,
422
+ "tokenizer_vocab_size": 32100,
423
+ "save_dir": "runs/train8_ctx1024_t5tok_p35_rand0_4_unif0_0p25_outwdm1_t5tok_ctx1024_randk_20260518_014800",
424
+ "batch_size": 128,
425
+ "grad_accum": 1,
426
+ "effective_batch_size": 512,
427
+ "global_batch_size": 512,
428
+ "lr_schedule": "constant_warmup",
429
+ "optimizer": "muon",
430
+ "epochs": 0.0,
431
+ "steps_per_epoch": 1,
432
+ "total_steps": 3000,
433
+ "warmup_steps": 10,
434
+ "warmup_epochs": -1.0,
435
+ "min_lr": 0.0,
436
+ "weight_decay": 0.1,
437
+ "output_weight_decay": -1.0,
438
+ "adamw_param_groups": "nanogpt",
439
+ "adam_beta1": 0.9,
440
+ "adam_beta2": 0.95,
441
+ "adam_eps": 1e-08,
442
+ "muon_impl": "legacy",
443
+ "muon_momentum": 0.95,
444
+ "muon_ns_steps": 5,
445
+ "muon_update_scale": 1.0,
446
+ "muon_nesterov": false,
447
+ "muon_width_scale": false,
448
+ "muon_grouping": "legacy_dim_ge_2",
449
+ "muon_param_count": 2523776,
450
+ "muon_adam_param_count": 8192,
451
+ "muon_param_names": [
452
+ "vocab_embed.embedding",
453
+ "sigma_map.net.0.weight",
454
+ "sigma_map.net.2.weight",
455
+ "blocks.0.attn_qkv.weight",
456
+ "blocks.0.attn_out.weight",
457
+ "blocks.0.mlp.0.weight",
458
+ "blocks.0.mlp.2.weight",
459
+ "blocks.0.adaLN_modulation.weight",
460
+ "blocks.1.attn_qkv.weight",
461
+ "blocks.1.attn_out.weight",
462
+ "blocks.1.mlp.0.weight",
463
+ "blocks.1.mlp.2.weight",
464
+ "blocks.1.adaLN_modulation.weight",
465
+ "blocks.2.attn_qkv.weight",
466
+ "blocks.2.attn_out.weight",
467
+ "blocks.2.mlp.0.weight",
468
+ "blocks.2.mlp.2.weight",
469
+ "blocks.2.adaLN_modulation.weight",
470
+ "output_layer.linear.weight",
471
+ "output_layer.adaLN_modulation.weight"
472
+ ],
473
+ "muon_adam_param_names": [
474
+ "sigma_map.net.0.bias",
475
+ "sigma_map.net.2.bias",
476
+ "blocks.0.norm1.weight",
477
+ "blocks.0.norm2.weight",
478
+ "blocks.0.mlp.0.bias",
479
+ "blocks.0.mlp.2.bias",
480
+ "blocks.0.adaLN_modulation.bias",
481
+ "blocks.1.norm1.weight",
482
+ "blocks.1.norm2.weight",
483
+ "blocks.1.mlp.0.bias",
484
+ "blocks.1.mlp.2.bias",
485
+ "blocks.1.adaLN_modulation.bias",
486
+ "blocks.2.norm1.weight",
487
+ "blocks.2.norm2.weight",
488
+ "blocks.2.mlp.0.bias",
489
+ "blocks.2.mlp.2.bias",
490
+ "blocks.2.adaLN_modulation.bias",
491
+ "output_layer.norm_final.weight",
492
+ "output_layer.adaLN_modulation.bias"
493
+ ],
494
+ "muon_effective_nesterov": false,
495
+ "muon_effective_width_scale": false,
496
+ "muon_effective_weight_decay": 0.1,
497
+ "muon_adam_fallback_nesterov": false,
498
+ "muon_adam_fallback_weight_decay": 0.1,
499
+ "ema_decay": 0.9999,
500
+ "ema_start_step": 0,
501
+ "model_type": "ddit",
502
+ "ddit_mlp_type": "gelu",
503
+ "elf_num_time_tokens": 4,
504
+ "elf_num_model_mode_tokens": 0,
505
+ "qk_norm": true,
506
+ "output_bias": false,
507
+ "output_init_std": -1.0,
508
+ "norm_type": "rmsnorm",
509
+ "target_loss": "hard_ce",
510
+ "linear_soft_target_power": 1.0,
511
+ "linear_soft_target_min_conf": 0.0,
512
+ "linear_soft_target_max_conf": 1.0,
513
+ "t_sampling_mode": "uniform",
514
+ "t_sampling_power": 1.0,
515
+ "t_sampling_eps": 0.0001,
516
+ "t_sampling_logit_mean": -1.5,
517
+ "t_sampling_logit_std": 0.8,
518
+ "dual_t": true,
519
+ "corrupt_t_mode": "same",
520
+ "corrupt_min_t": 0.0,
521
+ "corrupt_max_t": 1.0,
522
+ "prefix_block_prob": 0.0,
523
+ "prefix_block_len": 128,
524
+ "mask_ratio_floor_schedule": "none",
525
+ "dirichlet_endpoint_mode": "categorical_dual_t",
526
+ "dirichlet_semantic_t_mode": "same",
527
+ "dirichlet_semantic_t_value": 0.0,
528
+ "dirichlet_semantic_t_curve": "linear",
529
+ "dirichlet_semantic_t_power": 1.0,
530
+ "endpoint_sequence_random_prob_alpha": 0.0,
531
+ "categorical_wrong_from_full_vocab": true,
532
+ "categorical_wrong_from_batch_valid_tokens": false,
533
+ "categorical_wrong_basin_token_ids": "",
534
+ "categorical_wrong_basin_prob": 0.0,
535
+ "categorical_wrong_unigram_prob": 0.0,
536
+ "categorical_wrong_uniform_prob": 0.0,
537
+ "categorical_wrong_prob_floor": 0.0,
538
+ "categorical_wrong_corpus_unigram_path": "",
539
+ "categorical_wrong_corpus_unigram_alpha": 1.0,
540
+ "categorical_wrong_basin_shared_prob": 0.0,
541
+ "categorical_wrong_unigram_shared_prob": 0.0,
542
+ "mask_mixture_original_prob": 0.0,
543
+ "mask_mixture_lowk_prob": 0.0,
544
+ "mask_mixture_lowcorrupt_prob": 0.0,
545
+ "mask_mixture_block_prob": 0.0,
546
+ "mask_mixture_all_prob": 1.0,
547
+ "mask_mixture_lowk_clean_tokens": "0",
548
+ "mask_mixture_lowcorrupt_tokens": "1,2,4,8,16,32,64",
549
+ "mask_mixture_block_tokens": "64,128",
550
+ "simplex_bridge_sampler": "dirichlet",
551
+ "logistic_normal_sigma_min": 0.1,
552
+ "logistic_normal_sigma_max": 1.0,
553
+ "logistic_normal_tau_min": 1.0,
554
+ "logistic_normal_tau_max": 1.0,
555
+ "torch_compile": false,
556
+ "compile_mode": "max-autotune",
557
+ "state_format": "prob",
558
+ "meanflow_weight": 0.0,
559
+ "rollout_train_prob": 0.35,
560
+ "rollout_train_steps": 4,
561
+ "rollout_train_steps_min": 0,
562
+ "rollout_train_infer_steps": 1,
563
+ "rollout_train_time_mode": "sampled_path",
564
+ "rollout_train_s_dist": "uniform",
565
+ "rollout_train_s_min_frac": 0.0,
566
+ "rollout_train_s_max_frac": 0.25,
567
+ "rollout_train_s_beta_alpha": 2.0,
568
+ "rollout_train_s_beta_beta": 6.0,
569
+ "rollout_train_temp": 1.0,
570
+ "rollout_train_max_gamma": 1.0,
571
+ "rollout_train_corrupt_only": true,
572
+ "rollout_train_samplewise": true,
573
+ "rollout_train_compute_always": false,
574
+ "rollout_train_sync_t": true,
575
+ "bridge_noise_init": "logistic_normal",
576
+ "noise_sigma": -1.0,
577
+ "allow_tf32": true,
578
+ "activation_checkpointing": false,
579
+ "activation_checkpoint_interval": 1,
580
+ "activation_checkpoint_scope": "block",
581
+ "ddp_static_graph": false,
582
+ "ddp_gradient_as_bucket_view": true,
583
+ "blocking_data_transfer": false,
584
+ "dataloader_prefetch_factor": 4,
585
+ "full_train_stats": false,
586
+ "tokenized_hf": false,
587
+ "tokenized_pad_token": "pad",
588
+ "elf_conditional_hf": false,
589
+ "record_pad_truncate": false,
590
+ "record_add_eos": false,
591
+ "record_add_special_tokens": false,
592
+ "record_pad_token": "pad",
593
+ "record_shuffle_buffer": 10000,
594
+ "wrap": true,
595
+ "wrap_mode": "stream",
596
+ "wrap_record_buffer_size": 200,
597
+ "owt_cached_chunks": true,
598
+ "owt_chunk_cache_dir": "/e2e-data/evad-tech-vla/wanghan58/data/small_benchmarks/langflow_2604_11748/openwebtext_lta_cached_chunks/t5_len1024_train8_compact_overfit",
599
+ "owt_chunk_cache_rebuild": false,
600
+ "owt_chunk_cache_write_batch": 4096,
601
+ "owt_exact_repeat_per_chunk": 64,
602
+ "online_chunk_shuffle": false,
603
+ "online_chunk_shuffle_buffer": 10000,
604
+ "openwebtext_split": "train_minus_100k",
605
+ "detokenizer": "auto",
606
+ "resolved_detokenizer": null,
607
+ "num_workers": 0,
608
+ "latest_every": 1000,
609
+ "resume_path": "runs/train8_ctx1024_t5tok_p35_rand0_4_unif0_0p25_outwdm1_t5tok_ctx1024_randk_20260518_014800/latest.pt"
610
+ }
611
+ step=2100 epoch=2100/3000 epoch_step=1/1 micro_steps=2100 elapsed=22.3s lr=2.000000e-03 loss=0.4462 loss_recon=0.4462 loss_meanflow=0.0000 mean_model_t=0.5013 mean_corrupt_t=0.5013 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.3505 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.8772 corrupt_frac=1.0000 acc_corrupt=0.8772 loss_corrupt=0.4462 wrong_frac=0.4986 init_acc_corrupt=0.5139 acc_corrupt_t_0p0_0p2=0.3826 corrupt_frac_t_0p0_0p2=0.1952 acc_corrupt_t_0p2_0p4=0.9897 corrupt_frac_t_0p2_0p4=0.2063 acc_corrupt_t_0p4_0p6=0.9996 corrupt_frac_t_0p4_0p6=0.1973 acc_corrupt_t_0p6_0p8=0.9999 corrupt_frac_t_0p6_0p8=0.1976 acc_corrupt_t_0p8_1p0=0.9999 corrupt_frac_t_0p8_1p0=0.2036 out_w_norm=13.1855 out_g_norm=0.5866 loss_all=0.3201 init_gold_top10=0.6185 init_gold_top100=0.7053 rollout_applied_pos_frac=0.3672 init_acc_rollout_applied=0.5589 init_acc_rollout_kept=0.4438 logit_acc_rollout_applied=0.9093 logit_acc_rollout_kept=0.9095
612
+ step=2200 epoch=2200/3000 epoch_step=1/1 micro_steps=2200 elapsed=21.4s lr=2.000000e-03 loss=0.4130 loss_recon=0.4130 loss_meanflow=0.0000 mean_model_t=0.4985 mean_corrupt_t=0.4985 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.3512 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.8820 corrupt_frac=1.0000 acc_corrupt=0.8820 loss_corrupt=0.4130 wrong_frac=0.5014 init_acc_corrupt=0.5131 acc_corrupt_t_0p0_0p2=0.4279 corrupt_frac_t_0p0_0p2=0.2037 acc_corrupt_t_0p2_0p4=0.9927 corrupt_frac_t_0p2_0p4=0.1982 acc_corrupt_t_0p4_0p6=0.9997 corrupt_frac_t_0p4_0p6=0.1956 acc_corrupt_t_0p6_0p8=0.9999 corrupt_frac_t_0p6_0p8=0.2056 acc_corrupt_t_0p8_1p0=1.0000 corrupt_frac_t_0p8_1p0=0.1969 out_w_norm=13.1924 out_g_norm=0.5322 loss_all=0.3586 init_gold_top10=0.6108 init_gold_top100=0.7017 rollout_applied_pos_frac=0.3438 init_acc_rollout_applied=0.6741 init_acc_rollout_kept=0.4511 logit_acc_rollout_applied=0.9561 logit_acc_rollout_kept=0.8641
613
+ step=2300 epoch=2300/3000 epoch_step=1/1 micro_steps=2300 elapsed=21.4s lr=2.000000e-03 loss=0.3803 loss_recon=0.3803 loss_meanflow=0.0000 mean_model_t=0.4953 mean_corrupt_t=0.4953 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.3561 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.8884 corrupt_frac=1.0000 acc_corrupt=0.8884 loss_corrupt=0.3803 wrong_frac=0.5048 init_acc_corrupt=0.5092 acc_corrupt_t_0p0_0p2=0.4584 corrupt_frac_t_0p0_0p2=0.2036 acc_corrupt_t_0p2_0p4=0.9940 corrupt_frac_t_0p2_0p4=0.2030 acc_corrupt_t_0p4_0p6=0.9998 corrupt_frac_t_0p4_0p6=0.2005 acc_corrupt_t_0p6_0p8=0.9999 corrupt_frac_t_0p6_0p8=0.1995 acc_corrupt_t_0p8_1p0=1.0000 corrupt_frac_t_0p8_1p0=0.1934 out_w_norm=13.1910 out_g_norm=0.5184 loss_all=0.4000 init_gold_top10=0.6118 init_gold_top100=0.7075 rollout_applied_pos_frac=0.3203 init_acc_rollout_applied=0.6476 init_acc_rollout_kept=0.4730 logit_acc_rollout_applied=0.9479 logit_acc_rollout_kept=0.8544
614
+ step=2400 epoch=2400/3000 epoch_step=1/1 micro_steps=2400 elapsed=21.3s lr=2.000000e-03 loss=0.3508 loss_recon=0.3508 loss_meanflow=0.0000 mean_model_t=0.4980 mean_corrupt_t=0.4980 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.3477 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.8934 corrupt_frac=1.0000 acc_corrupt=0.8934 loss_corrupt=0.3508 wrong_frac=0.5019 init_acc_corrupt=0.5133 acc_corrupt_t_0p0_0p2=0.4868 corrupt_frac_t_0p0_0p2=0.2061 acc_corrupt_t_0p2_0p4=0.9958 corrupt_frac_t_0p2_0p4=0.1960 acc_corrupt_t_0p4_0p6=0.9998 corrupt_frac_t_0p4_0p6=0.1973 acc_corrupt_t_0p6_0p8=0.9999 corrupt_frac_t_0p6_0p8=0.2020 acc_corrupt_t_0p8_1p0=1.0000 corrupt_frac_t_0p8_1p0=0.1985 out_w_norm=13.1887 out_g_norm=0.5057 loss_all=0.4408 init_gold_top10=0.5895 init_gold_top100=0.7003 rollout_applied_pos_frac=0.3203 init_acc_rollout_applied=0.5455 init_acc_rollout_kept=0.4485 logit_acc_rollout_applied=0.8244 logit_acc_rollout_kept=0.8776
615
+ step=2500 epoch=2500/3000 epoch_step=1/1 micro_steps=2500 elapsed=21.6s lr=2.000000e-03 loss=0.3328 loss_recon=0.3328 loss_meanflow=0.0000 mean_model_t=0.4998 mean_corrupt_t=0.4998 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.3556 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.8969 corrupt_frac=1.0000 acc_corrupt=0.8969 loss_corrupt=0.3328 wrong_frac=0.5002 init_acc_corrupt=0.5156 acc_corrupt_t_0p0_0p2=0.4878 corrupt_frac_t_0p0_0p2=0.1998 acc_corrupt_t_0p2_0p4=0.9964 corrupt_frac_t_0p2_0p4=0.1971 acc_corrupt_t_0p4_0p6=0.9999 corrupt_frac_t_0p4_0p6=0.2008 acc_corrupt_t_0p6_0p8=1.0000 corrupt_frac_t_0p6_0p8=0.2002 acc_corrupt_t_0p8_1p0=1.0000 corrupt_frac_t_0p8_1p0=0.2020 out_w_norm=13.1803 out_g_norm=0.4942 loss_all=0.3619 init_gold_top10=0.6216 init_gold_top100=0.7085 rollout_applied_pos_frac=0.3125 init_acc_rollout_applied=0.6158 init_acc_rollout_kept=0.4850 logit_acc_rollout_applied=0.8643 logit_acc_rollout_kept=0.8886
616
+ step=2600 epoch=2600/3000 epoch_step=1/1 micro_steps=2600 elapsed=21.2s lr=2.000000e-03 loss=0.3044 loss_recon=0.3044 loss_meanflow=0.0000 mean_model_t=0.5009 mean_corrupt_t=0.5009 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.3408 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.9041 corrupt_frac=1.0000 acc_corrupt=0.9041 loss_corrupt=0.3044 wrong_frac=0.4992 init_acc_corrupt=0.5175 acc_corrupt_t_0p0_0p2=0.5144 corrupt_frac_t_0p0_0p2=0.1963 acc_corrupt_t_0p2_0p4=0.9973 corrupt_frac_t_0p2_0p4=0.2016 acc_corrupt_t_0p4_0p6=0.9999 corrupt_frac_t_0p4_0p6=0.2056 acc_corrupt_t_0p6_0p8=1.0000 corrupt_frac_t_0p6_0p8=0.1941 acc_corrupt_t_0p8_1p0=1.0000 corrupt_frac_t_0p8_1p0=0.2024 out_w_norm=13.1441 out_g_norm=0.4960 loss_all=0.3029 init_gold_top10=0.6351 init_gold_top100=0.7199 rollout_applied_pos_frac=0.3828 init_acc_rollout_applied=0.6223 init_acc_rollout_kept=0.4687 logit_acc_rollout_applied=0.8984 logit_acc_rollout_kept=0.9060
617
+ step=2700 epoch=2700/3000 epoch_step=1/1 micro_steps=2700 elapsed=21.5s lr=2.000000e-03 loss=0.2965 loss_recon=0.2965 loss_meanflow=0.0000 mean_model_t=0.5003 mean_corrupt_t=0.5003 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.3456 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.9053 corrupt_frac=1.0000 acc_corrupt=0.9053 loss_corrupt=0.2965 wrong_frac=0.4998 init_acc_corrupt=0.5185 acc_corrupt_t_0p0_0p2=0.5301 corrupt_frac_t_0p0_0p2=0.2006 acc_corrupt_t_0p2_0p4=0.9979 corrupt_frac_t_0p2_0p4=0.1945 acc_corrupt_t_0p4_0p6=0.9999 corrupt_frac_t_0p4_0p6=0.2064 acc_corrupt_t_0p6_0p8=1.0000 corrupt_frac_t_0p6_0p8=0.1962 acc_corrupt_t_0p8_1p0=1.0000 corrupt_frac_t_0p8_1p0=0.2023 out_w_norm=13.0921 out_g_norm=0.4470 loss_all=0.2555 init_gold_top10=0.6564 init_gold_top100=0.7317 rollout_applied_pos_frac=0.3359 init_acc_rollout_applied=0.6218 init_acc_rollout_kept=0.5231 logit_acc_rollout_applied=0.8883 logit_acc_rollout_kept=0.9308
618
+ step=2800 epoch=2800/3000 epoch_step=1/1 micro_steps=2800 elapsed=21.5s lr=2.000000e-03 loss=0.2918 loss_recon=0.2918 loss_meanflow=0.0000 mean_model_t=0.4990 mean_corrupt_t=0.4990 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.3480 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.9048 corrupt_frac=1.0000 acc_corrupt=0.9048 loss_corrupt=0.2918 wrong_frac=0.5010 init_acc_corrupt=0.5162 acc_corrupt_t_0p0_0p2=0.5270 corrupt_frac_t_0p0_0p2=0.2005 acc_corrupt_t_0p2_0p4=0.9982 corrupt_frac_t_0p2_0p4=0.2004 acc_corrupt_t_0p4_0p6=0.9999 corrupt_frac_t_0p4_0p6=0.1991 acc_corrupt_t_0p6_0p8=1.0000 corrupt_frac_t_0p6_0p8=0.2018 acc_corrupt_t_0p8_1p0=1.0000 corrupt_frac_t_0p8_1p0=0.1982 out_w_norm=13.0408 out_g_norm=0.4282 loss_all=0.2638 init_gold_top10=0.6356 init_gold_top100=0.7187 rollout_applied_pos_frac=0.3438 init_acc_rollout_applied=0.5152 init_acc_rollout_kept=0.5056 logit_acc_rollout_applied=0.8472 logit_acc_rollout_kept=0.9478
619
+ step=2900 epoch=2900/3000 epoch_step=1/1 micro_steps=2900 elapsed=21.5s lr=2.000000e-03 loss=0.2759 loss_recon=0.2759 loss_meanflow=0.0000 mean_model_t=0.5027 mean_corrupt_t=0.5027 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.3570 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.9100 corrupt_frac=1.0000 acc_corrupt=0.9100 loss_corrupt=0.2759 wrong_frac=0.4975 init_acc_corrupt=0.5199 acc_corrupt_t_0p0_0p2=0.5494 corrupt_frac_t_0p0_0p2=0.1991 acc_corrupt_t_0p2_0p4=0.9986 corrupt_frac_t_0p2_0p4=0.1997 acc_corrupt_t_0p4_0p6=0.9999 corrupt_frac_t_0p4_0p6=0.1958 acc_corrupt_t_0p6_0p8=1.0000 corrupt_frac_t_0p6_0p8=0.1995 acc_corrupt_t_0p8_1p0=1.0000 corrupt_frac_t_0p8_1p0=0.2059 out_w_norm=12.9782 out_g_norm=0.4290 loss_all=0.3209 init_gold_top10=0.6401 init_gold_top100=0.7250 rollout_applied_pos_frac=0.3438 init_acc_rollout_applied=0.6212 init_acc_rollout_kept=0.4759 logit_acc_rollout_applied=0.9051 logit_acc_rollout_kept=0.8887
620
+ step=3000 epoch=3000/3000 epoch_step=1/1 micro_steps=3000 elapsed=21.5s lr=2.000000e-03 loss=0.2653 loss_recon=0.2653 loss_meanflow=0.0000 mean_model_t=0.5005 mean_corrupt_t=0.5005 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.3489 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.9119 corrupt_frac=1.0000 acc_corrupt=0.9119 loss_corrupt=0.2653 wrong_frac=0.4995 init_acc_corrupt=0.5176 acc_corrupt_t_0p0_0p2=0.5668 corrupt_frac_t_0p0_0p2=0.2029 acc_corrupt_t_0p2_0p4=0.9990 corrupt_frac_t_0p2_0p4=0.1965 acc_corrupt_t_0p4_0p6=0.9999 corrupt_frac_t_0p4_0p6=0.1988 acc_corrupt_t_0p6_0p8=1.0000 corrupt_frac_t_0p6_0p8=0.2028 acc_corrupt_t_0p8_1p0=1.0000 corrupt_frac_t_0p8_1p0=0.1990 out_w_norm=12.9209 out_g_norm=0.3901 loss_all=0.1247 init_gold_top10=0.6494 init_gold_top100=0.7168 rollout_applied_pos_frac=0.3125 init_acc_rollout_applied=0.6353 init_acc_rollout_kept=0.5045 logit_acc_rollout_applied=0.9789 logit_acc_rollout_kept=0.9466
621
+ NCCL version 2.25.1+cuda12.8
622
+ resumed_from=runs/train8_ctx1024_t5tok_p35_rand0_4_unif0_0p25_outwdm1_t5tok_ctx1024_randk_20260518_014800/latest.pt start_step=3001
623
+ {
624
+ "device": "cuda:0",
625
+ "rank": 0,
626
+ "world_size": 4,
627
+ "samples": "owt_cached_chunks:8",
628
+ "vocab_size": 2423,
629
+ "tokenizer_vocab_size": 32100,
630
+ "save_dir": "runs/train8_ctx1024_t5tok_p35_rand0_4_unif0_0p25_outwdm1_t5tok_ctx1024_randk_20260518_014800",
631
+ "batch_size": 128,
632
+ "grad_accum": 1,
633
+ "effective_batch_size": 512,
634
+ "global_batch_size": 512,
635
+ "lr_schedule": "constant_warmup",
636
+ "optimizer": "muon",
637
+ "epochs": 0.0,
638
+ "steps_per_epoch": 1,
639
+ "total_steps": 4000,
640
+ "warmup_steps": 10,
641
+ "warmup_epochs": -1.0,
642
+ "min_lr": 0.0,
643
+ "weight_decay": 0.1,
644
+ "output_weight_decay": -1.0,
645
+ "adamw_param_groups": "nanogpt",
646
+ "adam_beta1": 0.9,
647
+ "adam_beta2": 0.95,
648
+ "adam_eps": 1e-08,
649
+ "muon_impl": "legacy",
650
+ "muon_momentum": 0.95,
651
+ "muon_ns_steps": 5,
652
+ "muon_update_scale": 1.0,
653
+ "muon_nesterov": false,
654
+ "muon_width_scale": false,
655
+ "muon_grouping": "legacy_dim_ge_2",
656
+ "muon_param_count": 2523776,
657
+ "muon_adam_param_count": 8192,
658
+ "muon_param_names": [
659
+ "vocab_embed.embedding",
660
+ "sigma_map.net.0.weight",
661
+ "sigma_map.net.2.weight",
662
+ "blocks.0.attn_qkv.weight",
663
+ "blocks.0.attn_out.weight",
664
+ "blocks.0.mlp.0.weight",
665
+ "blocks.0.mlp.2.weight",
666
+ "blocks.0.adaLN_modulation.weight",
667
+ "blocks.1.attn_qkv.weight",
668
+ "blocks.1.attn_out.weight",
669
+ "blocks.1.mlp.0.weight",
670
+ "blocks.1.mlp.2.weight",
671
+ "blocks.1.adaLN_modulation.weight",
672
+ "blocks.2.attn_qkv.weight",
673
+ "blocks.2.attn_out.weight",
674
+ "blocks.2.mlp.0.weight",
675
+ "blocks.2.mlp.2.weight",
676
+ "blocks.2.adaLN_modulation.weight",
677
+ "output_layer.linear.weight",
678
+ "output_layer.adaLN_modulation.weight"
679
+ ],
680
+ "muon_adam_param_names": [
681
+ "sigma_map.net.0.bias",
682
+ "sigma_map.net.2.bias",
683
+ "blocks.0.norm1.weight",
684
+ "blocks.0.norm2.weight",
685
+ "blocks.0.mlp.0.bias",
686
+ "blocks.0.mlp.2.bias",
687
+ "blocks.0.adaLN_modulation.bias",
688
+ "blocks.1.norm1.weight",
689
+ "blocks.1.norm2.weight",
690
+ "blocks.1.mlp.0.bias",
691
+ "blocks.1.mlp.2.bias",
692
+ "blocks.1.adaLN_modulation.bias",
693
+ "blocks.2.norm1.weight",
694
+ "blocks.2.norm2.weight",
695
+ "blocks.2.mlp.0.bias",
696
+ "blocks.2.mlp.2.bias",
697
+ "blocks.2.adaLN_modulation.bias",
698
+ "output_layer.norm_final.weight",
699
+ "output_layer.adaLN_modulation.bias"
700
+ ],
701
+ "muon_effective_nesterov": false,
702
+ "muon_effective_width_scale": false,
703
+ "muon_effective_weight_decay": 0.1,
704
+ "muon_adam_fallback_nesterov": false,
705
+ "muon_adam_fallback_weight_decay": 0.1,
706
+ "ema_decay": 0.9999,
707
+ "ema_start_step": 0,
708
+ "model_type": "ddit",
709
+ "ddit_mlp_type": "gelu",
710
+ "elf_num_time_tokens": 4,
711
+ "elf_num_model_mode_tokens": 0,
712
+ "qk_norm": true,
713
+ "output_bias": false,
714
+ "output_init_std": -1.0,
715
+ "norm_type": "rmsnorm",
716
+ "target_loss": "hard_ce",
717
+ "linear_soft_target_power": 1.0,
718
+ "linear_soft_target_min_conf": 0.0,
719
+ "linear_soft_target_max_conf": 1.0,
720
+ "t_sampling_mode": "uniform",
721
+ "t_sampling_power": 1.0,
722
+ "t_sampling_eps": 0.0001,
723
+ "t_sampling_logit_mean": -1.5,
724
+ "t_sampling_logit_std": 0.8,
725
+ "dual_t": true,
726
+ "corrupt_t_mode": "same",
727
+ "corrupt_min_t": 0.0,
728
+ "corrupt_max_t": 1.0,
729
+ "prefix_block_prob": 0.0,
730
+ "prefix_block_len": 128,
731
+ "mask_ratio_floor_schedule": "none",
732
+ "dirichlet_endpoint_mode": "categorical_dual_t",
733
+ "dirichlet_semantic_t_mode": "same",
734
+ "dirichlet_semantic_t_value": 0.0,
735
+ "dirichlet_semantic_t_curve": "linear",
736
+ "dirichlet_semantic_t_power": 1.0,
737
+ "endpoint_sequence_random_prob_alpha": 0.0,
738
+ "categorical_wrong_from_full_vocab": true,
739
+ "categorical_wrong_from_batch_valid_tokens": false,
740
+ "categorical_wrong_basin_token_ids": "",
741
+ "categorical_wrong_basin_prob": 0.0,
742
+ "categorical_wrong_unigram_prob": 0.0,
743
+ "categorical_wrong_uniform_prob": 0.0,
744
+ "categorical_wrong_prob_floor": 0.0,
745
+ "categorical_wrong_corpus_unigram_path": "",
746
+ "categorical_wrong_corpus_unigram_alpha": 1.0,
747
+ "categorical_wrong_basin_shared_prob": 0.0,
748
+ "categorical_wrong_unigram_shared_prob": 0.0,
749
+ "mask_mixture_original_prob": 0.0,
750
+ "mask_mixture_lowk_prob": 0.0,
751
+ "mask_mixture_lowcorrupt_prob": 0.0,
752
+ "mask_mixture_block_prob": 0.0,
753
+ "mask_mixture_all_prob": 1.0,
754
+ "mask_mixture_lowk_clean_tokens": "0",
755
+ "mask_mixture_lowcorrupt_tokens": "1,2,4,8,16,32,64",
756
+ "mask_mixture_block_tokens": "64,128",
757
+ "simplex_bridge_sampler": "dirichlet",
758
+ "logistic_normal_sigma_min": 0.1,
759
+ "logistic_normal_sigma_max": 1.0,
760
+ "logistic_normal_tau_min": 1.0,
761
+ "logistic_normal_tau_max": 1.0,
762
+ "torch_compile": false,
763
+ "compile_mode": "max-autotune",
764
+ "state_format": "prob",
765
+ "meanflow_weight": 0.0,
766
+ "rollout_train_prob": 0.35,
767
+ "rollout_train_steps": 4,
768
+ "rollout_train_steps_min": 0,
769
+ "rollout_train_infer_steps": 1,
770
+ "rollout_train_time_mode": "sampled_path",
771
+ "rollout_train_s_dist": "uniform",
772
+ "rollout_train_s_min_frac": 0.0,
773
+ "rollout_train_s_max_frac": 0.25,
774
+ "rollout_train_s_beta_alpha": 2.0,
775
+ "rollout_train_s_beta_beta": 6.0,
776
+ "rollout_train_temp": 1.0,
777
+ "rollout_train_max_gamma": 1.0,
778
+ "rollout_train_corrupt_only": true,
779
+ "rollout_train_samplewise": true,
780
+ "rollout_train_compute_always": false,
781
+ "rollout_train_sync_t": true,
782
+ "bridge_noise_init": "logistic_normal",
783
+ "noise_sigma": -1.0,
784
+ "allow_tf32": true,
785
+ "activation_checkpointing": false,
786
+ "activation_checkpoint_interval": 1,
787
+ "activation_checkpoint_scope": "block",
788
+ "ddp_static_graph": false,
789
+ "ddp_gradient_as_bucket_view": true,
790
+ "blocking_data_transfer": false,
791
+ "dataloader_prefetch_factor": 4,
792
+ "full_train_stats": false,
793
+ "tokenized_hf": false,
794
+ "tokenized_pad_token": "pad",
795
+ "elf_conditional_hf": false,
796
+ "record_pad_truncate": false,
797
+ "record_add_eos": false,
798
+ "record_add_special_tokens": false,
799
+ "record_pad_token": "pad",
800
+ "record_shuffle_buffer": 10000,
801
+ "wrap": true,
802
+ "wrap_mode": "stream",
803
+ "wrap_record_buffer_size": 200,
804
+ "owt_cached_chunks": true,
805
+ "owt_chunk_cache_dir": "/e2e-data/evad-tech-vla/wanghan58/data/small_benchmarks/langflow_2604_11748/openwebtext_lta_cached_chunks/t5_len1024_train8_compact_overfit",
806
+ "owt_chunk_cache_rebuild": false,
807
+ "owt_chunk_cache_write_batch": 4096,
808
+ "owt_exact_repeat_per_chunk": 64,
809
+ "online_chunk_shuffle": false,
810
+ "online_chunk_shuffle_buffer": 10000,
811
+ "openwebtext_split": "train_minus_100k",
812
+ "detokenizer": "auto",
813
+ "resolved_detokenizer": null,
814
+ "num_workers": 0,
815
+ "latest_every": 1000,
816
+ "resume_path": "runs/train8_ctx1024_t5tok_p35_rand0_4_unif0_0p25_outwdm1_t5tok_ctx1024_randk_20260518_014800/latest.pt"
817
+ }
818
+ step=3100 epoch=3100/4000 epoch_step=1/1 micro_steps=3100 elapsed=22.3s lr=2.000000e-03 loss=0.2609 loss_recon=0.2609 loss_meanflow=0.0000 mean_model_t=0.5013 mean_corrupt_t=0.5013 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.3505 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.9123 corrupt_frac=1.0000 acc_corrupt=0.9123 loss_corrupt=0.2609 wrong_frac=0.4986 init_acc_corrupt=0.5173 acc_corrupt_t_0p0_0p2=0.5519 corrupt_frac_t_0p0_0p2=0.1952 acc_corrupt_t_0p2_0p4=0.9991 corrupt_frac_t_0p2_0p4=0.2063 acc_corrupt_t_0p4_0p6=1.0000 corrupt_frac_t_0p4_0p6=0.1973 acc_corrupt_t_0p6_0p8=1.0000 corrupt_frac_t_0p6_0p8=0.1976 acc_corrupt_t_0p8_1p0=1.0000 corrupt_frac_t_0p8_1p0=0.2036 out_w_norm=12.8783 out_g_norm=0.4025 loss_all=0.1703 init_gold_top10=0.6252 init_gold_top100=0.7053 rollout_applied_pos_frac=0.3672 init_acc_rollout_applied=0.5608 init_acc_rollout_kept=0.4438 logit_acc_rollout_applied=0.9224 logit_acc_rollout_kept=0.9513
819
+ step=3200 epoch=3200/4000 epoch_step=1/1 micro_steps=3200 elapsed=21.4s lr=2.000000e-03 loss=0.2604 loss_recon=0.2604 loss_meanflow=0.0000 mean_model_t=0.4985 mean_corrupt_t=0.4985 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.3512 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.9118 corrupt_frac=1.0000 acc_corrupt=0.9118 loss_corrupt=0.2604 wrong_frac=0.5014 init_acc_corrupt=0.5157 acc_corrupt_t_0p0_0p2=0.5677 corrupt_frac_t_0p0_0p2=0.2037 acc_corrupt_t_0p2_0p4=0.9994 corrupt_frac_t_0p2_0p4=0.1982 acc_corrupt_t_0p4_0p6=1.0000 corrupt_frac_t_0p4_0p6=0.1956 acc_corrupt_t_0p6_0p8=1.0000 corrupt_frac_t_0p6_0p8=0.2056 acc_corrupt_t_0p8_1p0=1.0000 corrupt_frac_t_0p8_1p0=0.1969 out_w_norm=12.8221 out_g_norm=0.3907 loss_all=0.2038 init_gold_top10=0.6119 init_gold_top100=0.7017 rollout_applied_pos_frac=0.3438 init_acc_rollout_applied=0.6757 init_acc_rollout_kept=0.4511 logit_acc_rollout_applied=0.9773 logit_acc_rollout_kept=0.9039
820
+ step=3300 epoch=3300/4000 epoch_step=1/1 micro_steps=3300 elapsed=21.5s lr=2.000000e-03 loss=0.2517 loss_recon=0.2517 loss_meanflow=0.0000 mean_model_t=0.4953 mean_corrupt_t=0.4953 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.3561 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.9148 corrupt_frac=1.0000 acc_corrupt=0.9148 loss_corrupt=0.2517 wrong_frac=0.5048 init_acc_corrupt=0.5118 acc_corrupt_t_0p0_0p2=0.5823 corrupt_frac_t_0p0_0p2=0.2036 acc_corrupt_t_0p2_0p4=0.9993 corrupt_frac_t_0p2_0p4=0.2030 acc_corrupt_t_0p4_0p6=0.9999 corrupt_frac_t_0p4_0p6=0.2005 acc_corrupt_t_0p6_0p8=1.0000 corrupt_frac_t_0p6_0p8=0.1995 acc_corrupt_t_0p8_1p0=1.0000 corrupt_frac_t_0p8_1p0=0.1934 out_w_norm=12.7718 out_g_norm=0.3434 loss_all=0.2658 init_gold_top10=0.6161 init_gold_top100=0.7075 rollout_applied_pos_frac=0.3203 init_acc_rollout_applied=0.6542 init_acc_rollout_kept=0.4730 logit_acc_rollout_applied=0.9581 logit_acc_rollout_kept=0.8839
821
+ step=3400 epoch=3400/4000 epoch_step=1/1 micro_steps=3400 elapsed=21.3s lr=2.000000e-03 loss=0.2479 loss_recon=0.2479 loss_meanflow=0.0000 mean_model_t=0.4980 mean_corrupt_t=0.4980 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.3477 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.9155 corrupt_frac=1.0000 acc_corrupt=0.9155 loss_corrupt=0.2479 wrong_frac=0.5019 init_acc_corrupt=0.5150 acc_corrupt_t_0p0_0p2=0.5904 corrupt_frac_t_0p0_0p2=0.2061 acc_corrupt_t_0p2_0p4=0.9995 corrupt_frac_t_0p2_0p4=0.1960 acc_corrupt_t_0p4_0p6=1.0000 corrupt_frac_t_0p4_0p6=0.1973 acc_corrupt_t_0p6_0p8=1.0000 corrupt_frac_t_0p6_0p8=0.2020 acc_corrupt_t_0p8_1p0=1.0000 corrupt_frac_t_0p8_1p0=0.1985 out_w_norm=12.7430 out_g_norm=0.3736 loss_all=0.3522 init_gold_top10=0.5957 init_gold_top100=0.7003 rollout_applied_pos_frac=0.3203 init_acc_rollout_applied=0.5486 init_acc_rollout_kept=0.4485 logit_acc_rollout_applied=0.8328 logit_acc_rollout_kept=0.9001
822
+ step=3500 epoch=3500/4000 epoch_step=1/1 micro_steps=3500 elapsed=21.6s lr=2.000000e-03 loss=0.2511 loss_recon=0.2511 loss_meanflow=0.0000 mean_model_t=0.4998 mean_corrupt_t=0.4998 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.3556 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.9152 corrupt_frac=1.0000 acc_corrupt=0.9152 loss_corrupt=0.2511 wrong_frac=0.5002 init_acc_corrupt=0.5169 acc_corrupt_t_0p0_0p2=0.5763 corrupt_frac_t_0p0_0p2=0.1998 acc_corrupt_t_0p2_0p4=0.9994 corrupt_frac_t_0p2_0p4=0.1971 acc_corrupt_t_0p4_0p6=0.9999 corrupt_frac_t_0p4_0p6=0.2008 acc_corrupt_t_0p6_0p8=1.0000 corrupt_frac_t_0p6_0p8=0.2002 acc_corrupt_t_0p8_1p0=1.0000 corrupt_frac_t_0p8_1p0=0.2020 out_w_norm=12.7032 out_g_norm=0.3187 loss_all=0.3310 init_gold_top10=0.6211 init_gold_top100=0.7085 rollout_applied_pos_frac=0.3125 init_acc_rollout_applied=0.6220 init_acc_rollout_kept=0.4850 logit_acc_rollout_applied=0.8659 logit_acc_rollout_kept=0.9094
823
+ step=3600 epoch=3600/4000 epoch_step=1/1 micro_steps=3600 elapsed=21.2s lr=2.000000e-03 loss=0.2338 loss_recon=0.2338 loss_meanflow=0.0000 mean_model_t=0.5009 mean_corrupt_t=0.5009 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.3408 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.9201 corrupt_frac=1.0000 acc_corrupt=0.9201 loss_corrupt=0.2338 wrong_frac=0.4992 init_acc_corrupt=0.5187 acc_corrupt_t_0p0_0p2=0.5936 corrupt_frac_t_0p0_0p2=0.1963 acc_corrupt_t_0p2_0p4=0.9996 corrupt_frac_t_0p2_0p4=0.2016 acc_corrupt_t_0p4_0p6=1.0000 corrupt_frac_t_0p4_0p6=0.2056 acc_corrupt_t_0p6_0p8=1.0000 corrupt_frac_t_0p6_0p8=0.1941 acc_corrupt_t_0p8_1p0=1.0000 corrupt_frac_t_0p8_1p0=0.2024 out_w_norm=12.6731 out_g_norm=0.3229 loss_all=0.2204 init_gold_top10=0.6387 init_gold_top100=0.7199 rollout_applied_pos_frac=0.3828 init_acc_rollout_applied=0.6243 init_acc_rollout_kept=0.4687 logit_acc_rollout_applied=0.8953 logit_acc_rollout_kept=0.9395
824
+ step=3700 epoch=3700/4000 epoch_step=1/1 micro_steps=3700 elapsed=21.5s lr=2.000000e-03 loss=0.2356 loss_recon=0.2356 loss_meanflow=0.0000 mean_model_t=0.5003 mean_corrupt_t=0.5003 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.3456 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.9198 corrupt_frac=1.0000 acc_corrupt=0.9198 loss_corrupt=0.2356 wrong_frac=0.4998 init_acc_corrupt=0.5195 acc_corrupt_t_0p0_0p2=0.6006 corrupt_frac_t_0p0_0p2=0.2006 acc_corrupt_t_0p2_0p4=0.9996 corrupt_frac_t_0p2_0p4=0.1945 acc_corrupt_t_0p4_0p6=1.0000 corrupt_frac_t_0p4_0p6=0.2064 acc_corrupt_t_0p6_0p8=1.0000 corrupt_frac_t_0p6_0p8=0.1962 acc_corrupt_t_0p8_1p0=1.0000 corrupt_frac_t_0p8_1p0=0.2023 out_w_norm=12.6559 out_g_norm=0.3148 loss_all=0.2045 init_gold_top10=0.6618 init_gold_top100=0.7317 rollout_applied_pos_frac=0.3359 init_acc_rollout_applied=0.6307 init_acc_rollout_kept=0.5231 logit_acc_rollout_applied=0.9134 logit_acc_rollout_kept=0.9381
825
+ step=3800 epoch=3800/4000 epoch_step=1/1 micro_steps=3800 elapsed=21.6s lr=2.000000e-03 loss=0.2395 loss_recon=0.2395 loss_meanflow=0.0000 mean_model_t=0.4990 mean_corrupt_t=0.4990 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.3480 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.9184 corrupt_frac=1.0000 acc_corrupt=0.9184 loss_corrupt=0.2395 wrong_frac=0.5010 init_acc_corrupt=0.5169 acc_corrupt_t_0p0_0p2=0.5933 corrupt_frac_t_0p0_0p2=0.2005 acc_corrupt_t_0p2_0p4=0.9996 corrupt_frac_t_0p2_0p4=0.2004 acc_corrupt_t_0p4_0p6=1.0000 corrupt_frac_t_0p4_0p6=0.1991 acc_corrupt_t_0p6_0p8=1.0000 corrupt_frac_t_0p6_0p8=0.2018 acc_corrupt_t_0p8_1p0=1.0000 corrupt_frac_t_0p8_1p0=0.1982 out_w_norm=12.6400 out_g_norm=0.3101 loss_all=0.2123 init_gold_top10=0.6421 init_gold_top100=0.7187 rollout_applied_pos_frac=0.3438 init_acc_rollout_applied=0.5164 init_acc_rollout_kept=0.5056 logit_acc_rollout_applied=0.8528 logit_acc_rollout_kept=0.9597
826
+ step=3900 epoch=3900/4000 epoch_step=1/1 micro_steps=3900 elapsed=21.6s lr=2.000000e-03 loss=0.2277 loss_recon=0.2277 loss_meanflow=0.0000 mean_model_t=0.5027 mean_corrupt_t=0.5027 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.3570 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.9211 corrupt_frac=1.0000 acc_corrupt=0.9211 loss_corrupt=0.2277 wrong_frac=0.4975 init_acc_corrupt=0.5205 acc_corrupt_t_0p0_0p2=0.6044 corrupt_frac_t_0p0_0p2=0.1991 acc_corrupt_t_0p2_0p4=0.9996 corrupt_frac_t_0p2_0p4=0.1997 acc_corrupt_t_0p4_0p6=1.0000 corrupt_frac_t_0p4_0p6=0.1958 acc_corrupt_t_0p6_0p8=1.0000 corrupt_frac_t_0p6_0p8=0.1995 acc_corrupt_t_0p8_1p0=1.0000 corrupt_frac_t_0p8_1p0=0.2059 out_w_norm=12.6312 out_g_norm=0.2864 loss_all=0.2865 init_gold_top10=0.6438 init_gold_top100=0.7250 rollout_applied_pos_frac=0.3438 init_acc_rollout_applied=0.6188 init_acc_rollout_kept=0.4759 logit_acc_rollout_applied=0.9056 logit_acc_rollout_kept=0.8930
827
+ step=4000 epoch=4000/4000 epoch_step=1/1 micro_steps=4000 elapsed=21.5s lr=2.000000e-03 loss=0.2278 loss_recon=0.2278 loss_meanflow=0.0000 mean_model_t=0.5005 mean_corrupt_t=0.5005 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.3489 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.9225 corrupt_frac=1.0000 acc_corrupt=0.9225 loss_corrupt=0.2278 wrong_frac=0.4995 init_acc_corrupt=0.5186 acc_corrupt_t_0p0_0p2=0.6184 corrupt_frac_t_0p0_0p2=0.2029 acc_corrupt_t_0p2_0p4=0.9997 corrupt_frac_t_0p2_0p4=0.1965 acc_corrupt_t_0p4_0p6=1.0000 corrupt_frac_t_0p4_0p6=0.1988 acc_corrupt_t_0p6_0p8=1.0000 corrupt_frac_t_0p6_0p8=0.2028 acc_corrupt_t_0p8_1p0=1.0000 corrupt_frac_t_0p8_1p0=0.1990 out_w_norm=12.6244 out_g_norm=0.2732 loss_all=0.1031 init_gold_top10=0.6493 init_gold_top100=0.7168 rollout_applied_pos_frac=0.3125 init_acc_rollout_applied=0.6364 init_acc_rollout_kept=0.5045 logit_acc_rollout_applied=0.9772 logit_acc_rollout_kept=0.9571
828
+ NCCL version 2.25.1+cuda12.8
829
+ resumed_from=runs/train8_ctx1024_t5tok_p35_rand0_4_unif0_0p25_outwdm1_t5tok_ctx1024_randk_20260518_014800/latest.pt start_step=4001
830
+ {
831
+ "device": "cuda:0",
832
+ "rank": 0,
833
+ "world_size": 4,
834
+ "samples": "owt_cached_chunks:8",
835
+ "vocab_size": 2423,
836
+ "tokenizer_vocab_size": 32100,
837
+ "save_dir": "runs/train8_ctx1024_t5tok_p35_rand0_4_unif0_0p25_outwdm1_t5tok_ctx1024_randk_20260518_014800",
838
+ "batch_size": 128,
839
+ "grad_accum": 1,
840
+ "effective_batch_size": 512,
841
+ "global_batch_size": 512,
842
+ "lr_schedule": "constant_warmup",
843
+ "optimizer": "muon",
844
+ "epochs": 0.0,
845
+ "steps_per_epoch": 1,
846
+ "total_steps": 5000,
847
+ "warmup_steps": 10,
848
+ "warmup_epochs": -1.0,
849
+ "min_lr": 0.0,
850
+ "weight_decay": 0.1,
851
+ "output_weight_decay": -1.0,
852
+ "adamw_param_groups": "nanogpt",
853
+ "adam_beta1": 0.9,
854
+ "adam_beta2": 0.95,
855
+ "adam_eps": 1e-08,
856
+ "muon_impl": "legacy",
857
+ "muon_momentum": 0.95,
858
+ "muon_ns_steps": 5,
859
+ "muon_update_scale": 1.0,
860
+ "muon_nesterov": false,
861
+ "muon_width_scale": false,
862
+ "muon_grouping": "legacy_dim_ge_2",
863
+ "muon_param_count": 2523776,
864
+ "muon_adam_param_count": 8192,
865
+ "muon_param_names": [
866
+ "vocab_embed.embedding",
867
+ "sigma_map.net.0.weight",
868
+ "sigma_map.net.2.weight",
869
+ "blocks.0.attn_qkv.weight",
870
+ "blocks.0.attn_out.weight",
871
+ "blocks.0.mlp.0.weight",
872
+ "blocks.0.mlp.2.weight",
873
+ "blocks.0.adaLN_modulation.weight",
874
+ "blocks.1.attn_qkv.weight",
875
+ "blocks.1.attn_out.weight",
876
+ "blocks.1.mlp.0.weight",
877
+ "blocks.1.mlp.2.weight",
878
+ "blocks.1.adaLN_modulation.weight",
879
+ "blocks.2.attn_qkv.weight",
880
+ "blocks.2.attn_out.weight",
881
+ "blocks.2.mlp.0.weight",
882
+ "blocks.2.mlp.2.weight",
883
+ "blocks.2.adaLN_modulation.weight",
884
+ "output_layer.linear.weight",
885
+ "output_layer.adaLN_modulation.weight"
886
+ ],
887
+ "muon_adam_param_names": [
888
+ "sigma_map.net.0.bias",
889
+ "sigma_map.net.2.bias",
890
+ "blocks.0.norm1.weight",
891
+ "blocks.0.norm2.weight",
892
+ "blocks.0.mlp.0.bias",
893
+ "blocks.0.mlp.2.bias",
894
+ "blocks.0.adaLN_modulation.bias",
895
+ "blocks.1.norm1.weight",
896
+ "blocks.1.norm2.weight",
897
+ "blocks.1.mlp.0.bias",
898
+ "blocks.1.mlp.2.bias",
899
+ "blocks.1.adaLN_modulation.bias",
900
+ "blocks.2.norm1.weight",
901
+ "blocks.2.norm2.weight",
902
+ "blocks.2.mlp.0.bias",
903
+ "blocks.2.mlp.2.bias",
904
+ "blocks.2.adaLN_modulation.bias",
905
+ "output_layer.norm_final.weight",
906
+ "output_layer.adaLN_modulation.bias"
907
+ ],
908
+ "muon_effective_nesterov": false,
909
+ "muon_effective_width_scale": false,
910
+ "muon_effective_weight_decay": 0.1,
911
+ "muon_adam_fallback_nesterov": false,
912
+ "muon_adam_fallback_weight_decay": 0.1,
913
+ "ema_decay": 0.9999,
914
+ "ema_start_step": 0,
915
+ "model_type": "ddit",
916
+ "ddit_mlp_type": "gelu",
917
+ "elf_num_time_tokens": 4,
918
+ "elf_num_model_mode_tokens": 0,
919
+ "qk_norm": true,
920
+ "output_bias": false,
921
+ "output_init_std": -1.0,
922
+ "norm_type": "rmsnorm",
923
+ "target_loss": "hard_ce",
924
+ "linear_soft_target_power": 1.0,
925
+ "linear_soft_target_min_conf": 0.0,
926
+ "linear_soft_target_max_conf": 1.0,
927
+ "t_sampling_mode": "uniform",
928
+ "t_sampling_power": 1.0,
929
+ "t_sampling_eps": 0.0001,
930
+ "t_sampling_logit_mean": -1.5,
931
+ "t_sampling_logit_std": 0.8,
932
+ "dual_t": true,
933
+ "corrupt_t_mode": "same",
934
+ "corrupt_min_t": 0.0,
935
+ "corrupt_max_t": 1.0,
936
+ "prefix_block_prob": 0.0,
937
+ "prefix_block_len": 128,
938
+ "mask_ratio_floor_schedule": "none",
939
+ "dirichlet_endpoint_mode": "categorical_dual_t",
940
+ "dirichlet_semantic_t_mode": "same",
941
+ "dirichlet_semantic_t_value": 0.0,
942
+ "dirichlet_semantic_t_curve": "linear",
943
+ "dirichlet_semantic_t_power": 1.0,
944
+ "endpoint_sequence_random_prob_alpha": 0.0,
945
+ "categorical_wrong_from_full_vocab": true,
946
+ "categorical_wrong_from_batch_valid_tokens": false,
947
+ "categorical_wrong_basin_token_ids": "",
948
+ "categorical_wrong_basin_prob": 0.0,
949
+ "categorical_wrong_unigram_prob": 0.0,
950
+ "categorical_wrong_uniform_prob": 0.0,
951
+ "categorical_wrong_prob_floor": 0.0,
952
+ "categorical_wrong_corpus_unigram_path": "",
953
+ "categorical_wrong_corpus_unigram_alpha": 1.0,
954
+ "categorical_wrong_basin_shared_prob": 0.0,
955
+ "categorical_wrong_unigram_shared_prob": 0.0,
956
+ "mask_mixture_original_prob": 0.0,
957
+ "mask_mixture_lowk_prob": 0.0,
958
+ "mask_mixture_lowcorrupt_prob": 0.0,
959
+ "mask_mixture_block_prob": 0.0,
960
+ "mask_mixture_all_prob": 1.0,
961
+ "mask_mixture_lowk_clean_tokens": "0",
962
+ "mask_mixture_lowcorrupt_tokens": "1,2,4,8,16,32,64",
963
+ "mask_mixture_block_tokens": "64,128",
964
+ "simplex_bridge_sampler": "dirichlet",
965
+ "logistic_normal_sigma_min": 0.1,
966
+ "logistic_normal_sigma_max": 1.0,
967
+ "logistic_normal_tau_min": 1.0,
968
+ "logistic_normal_tau_max": 1.0,
969
+ "torch_compile": false,
970
+ "compile_mode": "max-autotune",
971
+ "state_format": "prob",
972
+ "meanflow_weight": 0.0,
973
+ "rollout_train_prob": 0.35,
974
+ "rollout_train_steps": 4,
975
+ "rollout_train_steps_min": 0,
976
+ "rollout_train_infer_steps": 1,
977
+ "rollout_train_time_mode": "sampled_path",
978
+ "rollout_train_s_dist": "uniform",
979
+ "rollout_train_s_min_frac": 0.0,
980
+ "rollout_train_s_max_frac": 0.25,
981
+ "rollout_train_s_beta_alpha": 2.0,
982
+ "rollout_train_s_beta_beta": 6.0,
983
+ "rollout_train_temp": 1.0,
984
+ "rollout_train_max_gamma": 1.0,
985
+ "rollout_train_corrupt_only": true,
986
+ "rollout_train_samplewise": true,
987
+ "rollout_train_compute_always": false,
988
+ "rollout_train_sync_t": true,
989
+ "bridge_noise_init": "logistic_normal",
990
+ "noise_sigma": -1.0,
991
+ "allow_tf32": true,
992
+ "activation_checkpointing": false,
993
+ "activation_checkpoint_interval": 1,
994
+ "activation_checkpoint_scope": "block",
995
+ "ddp_static_graph": false,
996
+ "ddp_gradient_as_bucket_view": true,
997
+ "blocking_data_transfer": false,
998
+ "dataloader_prefetch_factor": 4,
999
+ "full_train_stats": false,
1000
+ "tokenized_hf": false,
1001
+ "tokenized_pad_token": "pad",
1002
+ "elf_conditional_hf": false,
1003
+ "record_pad_truncate": false,
1004
+ "record_add_eos": false,
1005
+ "record_add_special_tokens": false,
1006
+ "record_pad_token": "pad",
1007
+ "record_shuffle_buffer": 10000,
1008
+ "wrap": true,
1009
+ "wrap_mode": "stream",
1010
+ "wrap_record_buffer_size": 200,
1011
+ "owt_cached_chunks": true,
1012
+ "owt_chunk_cache_dir": "/e2e-data/evad-tech-vla/wanghan58/data/small_benchmarks/langflow_2604_11748/openwebtext_lta_cached_chunks/t5_len1024_train8_compact_overfit",
1013
+ "owt_chunk_cache_rebuild": false,
1014
+ "owt_chunk_cache_write_batch": 4096,
1015
+ "owt_exact_repeat_per_chunk": 64,
1016
+ "online_chunk_shuffle": false,
1017
+ "online_chunk_shuffle_buffer": 10000,
1018
+ "openwebtext_split": "train_minus_100k",
1019
+ "detokenizer": "auto",
1020
+ "resolved_detokenizer": null,
1021
+ "num_workers": 0,
1022
+ "latest_every": 1000,
1023
+ "resume_path": "runs/train8_ctx1024_t5tok_p35_rand0_4_unif0_0p25_outwdm1_t5tok_ctx1024_randk_20260518_014800/latest.pt"
1024
+ }
1025
+ step=4100 epoch=4100/5000 epoch_step=1/1 micro_steps=4100 elapsed=22.5s lr=2.000000e-03 loss=0.2292 loss_recon=0.2292 loss_meanflow=0.0000 mean_model_t=0.5013 mean_corrupt_t=0.5013 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.3505 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.9208 corrupt_frac=1.0000 acc_corrupt=0.9208 loss_corrupt=0.2292 wrong_frac=0.4986 init_acc_corrupt=0.5174 acc_corrupt_t_0p0_0p2=0.5949 corrupt_frac_t_0p0_0p2=0.1952 acc_corrupt_t_0p2_0p4=0.9997 corrupt_frac_t_0p2_0p4=0.2063 acc_corrupt_t_0p4_0p6=1.0000 corrupt_frac_t_0p4_0p6=0.1973 acc_corrupt_t_0p6_0p8=1.0000 corrupt_frac_t_0p6_0p8=0.1976 acc_corrupt_t_0p8_1p0=1.0000 corrupt_frac_t_0p8_1p0=0.2036 out_w_norm=12.6267 out_g_norm=0.2705 loss_all=0.1427 init_gold_top10=0.6265 init_gold_top100=0.7053 rollout_applied_pos_frac=0.3672 init_acc_rollout_applied=0.5611 init_acc_rollout_kept=0.4438 logit_acc_rollout_applied=0.9235 logit_acc_rollout_kept=0.9667
1026
+ step=4200 epoch=4200/5000 epoch_step=1/1 micro_steps=4200 elapsed=21.6s lr=2.000000e-03 loss=0.2282 loss_recon=0.2282 loss_meanflow=0.0000 mean_model_t=0.4985 mean_corrupt_t=0.4985 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.3512 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.9205 corrupt_frac=1.0000 acc_corrupt=0.9205 loss_corrupt=0.2282 wrong_frac=0.5014 init_acc_corrupt=0.5162 acc_corrupt_t_0p0_0p2=0.6100 corrupt_frac_t_0p0_0p2=0.2037 acc_corrupt_t_0p2_0p4=0.9997 corrupt_frac_t_0p2_0p4=0.1982 acc_corrupt_t_0p4_0p6=0.9999 corrupt_frac_t_0p4_0p6=0.1956 acc_corrupt_t_0p6_0p8=1.0000 corrupt_frac_t_0p6_0p8=0.2056 acc_corrupt_t_0p8_1p0=1.0000 corrupt_frac_t_0p8_1p0=0.1969 out_w_norm=12.6309 out_g_norm=0.2424 loss_all=0.1838 init_gold_top10=0.6120 init_gold_top100=0.7017 rollout_applied_pos_frac=0.3438 init_acc_rollout_applied=0.6763 init_acc_rollout_kept=0.4511 logit_acc_rollout_applied=0.9769 logit_acc_rollout_kept=0.9125
LTA_openwebtext_dualt/logs/softendpoint_mn_pilot_4gpu/train8_ctx1024_t5tok_p50_rand0_3_unif0_0p25_outwdm1_t5tok_ctx1024_k03_20260518_022728.log ADDED
@@ -0,0 +1,620 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ NCCL version 2.25.1+cuda12.8
2
+ {
3
+ "device": "cuda:0",
4
+ "rank": 0,
5
+ "world_size": 4,
6
+ "samples": "owt_cached_chunks:8",
7
+ "vocab_size": 2423,
8
+ "tokenizer_vocab_size": 32100,
9
+ "save_dir": "runs/train8_ctx1024_t5tok_p50_rand0_3_unif0_0p25_outwdm1_t5tok_ctx1024_k03_20260518_022728",
10
+ "batch_size": 128,
11
+ "grad_accum": 1,
12
+ "effective_batch_size": 512,
13
+ "global_batch_size": 512,
14
+ "lr_schedule": "constant_warmup",
15
+ "optimizer": "muon",
16
+ "epochs": 0.0,
17
+ "steps_per_epoch": 1,
18
+ "total_steps": 1000,
19
+ "warmup_steps": 10,
20
+ "warmup_epochs": -1.0,
21
+ "min_lr": 0.0,
22
+ "weight_decay": 0.1,
23
+ "output_weight_decay": -1.0,
24
+ "adamw_param_groups": "nanogpt",
25
+ "adam_beta1": 0.9,
26
+ "adam_beta2": 0.95,
27
+ "adam_eps": 1e-08,
28
+ "muon_impl": "legacy",
29
+ "muon_momentum": 0.95,
30
+ "muon_ns_steps": 5,
31
+ "muon_update_scale": 1.0,
32
+ "muon_nesterov": false,
33
+ "muon_width_scale": false,
34
+ "muon_grouping": "legacy_dim_ge_2",
35
+ "muon_param_count": 2523776,
36
+ "muon_adam_param_count": 8192,
37
+ "muon_param_names": [
38
+ "vocab_embed.embedding",
39
+ "sigma_map.net.0.weight",
40
+ "sigma_map.net.2.weight",
41
+ "blocks.0.attn_qkv.weight",
42
+ "blocks.0.attn_out.weight",
43
+ "blocks.0.mlp.0.weight",
44
+ "blocks.0.mlp.2.weight",
45
+ "blocks.0.adaLN_modulation.weight",
46
+ "blocks.1.attn_qkv.weight",
47
+ "blocks.1.attn_out.weight",
48
+ "blocks.1.mlp.0.weight",
49
+ "blocks.1.mlp.2.weight",
50
+ "blocks.1.adaLN_modulation.weight",
51
+ "blocks.2.attn_qkv.weight",
52
+ "blocks.2.attn_out.weight",
53
+ "blocks.2.mlp.0.weight",
54
+ "blocks.2.mlp.2.weight",
55
+ "blocks.2.adaLN_modulation.weight",
56
+ "output_layer.linear.weight",
57
+ "output_layer.adaLN_modulation.weight"
58
+ ],
59
+ "muon_adam_param_names": [
60
+ "sigma_map.net.0.bias",
61
+ "sigma_map.net.2.bias",
62
+ "blocks.0.norm1.weight",
63
+ "blocks.0.norm2.weight",
64
+ "blocks.0.mlp.0.bias",
65
+ "blocks.0.mlp.2.bias",
66
+ "blocks.0.adaLN_modulation.bias",
67
+ "blocks.1.norm1.weight",
68
+ "blocks.1.norm2.weight",
69
+ "blocks.1.mlp.0.bias",
70
+ "blocks.1.mlp.2.bias",
71
+ "blocks.1.adaLN_modulation.bias",
72
+ "blocks.2.norm1.weight",
73
+ "blocks.2.norm2.weight",
74
+ "blocks.2.mlp.0.bias",
75
+ "blocks.2.mlp.2.bias",
76
+ "blocks.2.adaLN_modulation.bias",
77
+ "output_layer.norm_final.weight",
78
+ "output_layer.adaLN_modulation.bias"
79
+ ],
80
+ "muon_effective_nesterov": false,
81
+ "muon_effective_width_scale": false,
82
+ "muon_effective_weight_decay": 0.1,
83
+ "muon_adam_fallback_nesterov": false,
84
+ "muon_adam_fallback_weight_decay": 0.1,
85
+ "ema_decay": 0.9999,
86
+ "ema_start_step": 0,
87
+ "model_type": "ddit",
88
+ "ddit_mlp_type": "gelu",
89
+ "elf_num_time_tokens": 4,
90
+ "elf_num_model_mode_tokens": 0,
91
+ "qk_norm": true,
92
+ "output_bias": false,
93
+ "output_init_std": -1.0,
94
+ "norm_type": "rmsnorm",
95
+ "target_loss": "hard_ce",
96
+ "linear_soft_target_power": 1.0,
97
+ "linear_soft_target_min_conf": 0.0,
98
+ "linear_soft_target_max_conf": 1.0,
99
+ "t_sampling_mode": "uniform",
100
+ "t_sampling_power": 1.0,
101
+ "t_sampling_eps": 0.0001,
102
+ "t_sampling_logit_mean": -1.5,
103
+ "t_sampling_logit_std": 0.8,
104
+ "dual_t": true,
105
+ "corrupt_t_mode": "same",
106
+ "corrupt_min_t": 0.0,
107
+ "corrupt_max_t": 1.0,
108
+ "prefix_block_prob": 0.0,
109
+ "prefix_block_len": 128,
110
+ "mask_ratio_floor_schedule": "none",
111
+ "dirichlet_endpoint_mode": "categorical_dual_t",
112
+ "dirichlet_semantic_t_mode": "same",
113
+ "dirichlet_semantic_t_value": 0.0,
114
+ "dirichlet_semantic_t_curve": "linear",
115
+ "dirichlet_semantic_t_power": 1.0,
116
+ "endpoint_sequence_random_prob_alpha": 0.0,
117
+ "categorical_wrong_from_full_vocab": true,
118
+ "categorical_wrong_from_batch_valid_tokens": false,
119
+ "categorical_wrong_basin_token_ids": "",
120
+ "categorical_wrong_basin_prob": 0.0,
121
+ "categorical_wrong_unigram_prob": 0.0,
122
+ "categorical_wrong_uniform_prob": 0.0,
123
+ "categorical_wrong_prob_floor": 0.0,
124
+ "categorical_wrong_corpus_unigram_path": "",
125
+ "categorical_wrong_corpus_unigram_alpha": 1.0,
126
+ "categorical_wrong_basin_shared_prob": 0.0,
127
+ "categorical_wrong_unigram_shared_prob": 0.0,
128
+ "mask_mixture_original_prob": 0.0,
129
+ "mask_mixture_lowk_prob": 0.0,
130
+ "mask_mixture_lowcorrupt_prob": 0.0,
131
+ "mask_mixture_block_prob": 0.0,
132
+ "mask_mixture_all_prob": 1.0,
133
+ "mask_mixture_lowk_clean_tokens": "0",
134
+ "mask_mixture_lowcorrupt_tokens": "1,2,4,8,16,32,64",
135
+ "mask_mixture_block_tokens": "64,128",
136
+ "simplex_bridge_sampler": "dirichlet",
137
+ "logistic_normal_sigma_min": 0.1,
138
+ "logistic_normal_sigma_max": 1.0,
139
+ "logistic_normal_tau_min": 1.0,
140
+ "logistic_normal_tau_max": 1.0,
141
+ "torch_compile": false,
142
+ "compile_mode": "max-autotune",
143
+ "state_format": "prob",
144
+ "meanflow_weight": 0.0,
145
+ "rollout_train_prob": 0.5,
146
+ "rollout_train_steps": 3,
147
+ "rollout_train_steps_min": 0,
148
+ "rollout_train_infer_steps": 1,
149
+ "rollout_train_time_mode": "sampled_path",
150
+ "rollout_train_s_dist": "uniform",
151
+ "rollout_train_s_min_frac": 0.0,
152
+ "rollout_train_s_max_frac": 0.25,
153
+ "rollout_train_s_beta_alpha": 2.0,
154
+ "rollout_train_s_beta_beta": 6.0,
155
+ "rollout_train_temp": 1.0,
156
+ "rollout_train_max_gamma": 1.0,
157
+ "rollout_train_corrupt_only": true,
158
+ "rollout_train_samplewise": true,
159
+ "rollout_train_compute_always": false,
160
+ "rollout_train_sync_t": true,
161
+ "bridge_noise_init": "logistic_normal",
162
+ "noise_sigma": -1.0,
163
+ "allow_tf32": true,
164
+ "activation_checkpointing": false,
165
+ "activation_checkpoint_interval": 1,
166
+ "activation_checkpoint_scope": "block",
167
+ "ddp_static_graph": false,
168
+ "ddp_gradient_as_bucket_view": true,
169
+ "blocking_data_transfer": false,
170
+ "dataloader_prefetch_factor": 4,
171
+ "full_train_stats": false,
172
+ "tokenized_hf": false,
173
+ "tokenized_pad_token": "pad",
174
+ "elf_conditional_hf": false,
175
+ "record_pad_truncate": false,
176
+ "record_add_eos": false,
177
+ "record_add_special_tokens": false,
178
+ "record_pad_token": "pad",
179
+ "record_shuffle_buffer": 10000,
180
+ "wrap": true,
181
+ "wrap_mode": "stream",
182
+ "wrap_record_buffer_size": 200,
183
+ "owt_cached_chunks": true,
184
+ "owt_chunk_cache_dir": "/e2e-data/evad-tech-vla/wanghan58/data/small_benchmarks/langflow_2604_11748/openwebtext_lta_cached_chunks/t5_len1024_train8_compact_overfit",
185
+ "owt_chunk_cache_rebuild": false,
186
+ "owt_chunk_cache_write_batch": 4096,
187
+ "owt_exact_repeat_per_chunk": 64,
188
+ "online_chunk_shuffle": false,
189
+ "online_chunk_shuffle_buffer": 10000,
190
+ "openwebtext_split": "train_minus_100k",
191
+ "detokenizer": "auto",
192
+ "resolved_detokenizer": null,
193
+ "num_workers": 0,
194
+ "latest_every": 1000,
195
+ "resume_path": ""
196
+ }
197
+ step=100 epoch=100/1000 epoch_step=1/1 micro_steps=100 elapsed=23.1s lr=2.000000e-03 loss=7.3415 loss_recon=7.3415 loss_meanflow=0.0000 mean_model_t=0.4971 mean_corrupt_t=0.4971 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.5052 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.3284 corrupt_frac=1.0000 acc_corrupt=0.3284 loss_corrupt=7.3415 wrong_frac=0.5028 init_acc_corrupt=0.4627 acc_corrupt_t_0p0_0p2=0.0465 corrupt_frac_t_0p0_0p2=0.2071 acc_corrupt_t_0p2_0p4=0.1615 corrupt_frac_t_0p2_0p4=0.1988 acc_corrupt_t_0p4_0p6=0.3279 corrupt_frac_t_0p4_0p6=0.1976 acc_corrupt_t_0p6_0p8=0.4814 corrupt_frac_t_0p6_0p8=0.1951 acc_corrupt_t_0p8_1p0=0.6350 corrupt_frac_t_0p8_1p0=0.2015 out_w_norm=1.0924 out_g_norm=1.0103 loss_all=6.7761 init_gold_top10=0.4676 init_gold_top100=0.6068 rollout_applied_pos_frac=0.4688 init_acc_rollout_applied=0.4056 init_acc_rollout_kept=0.4382 logit_acc_rollout_applied=0.2790 logit_acc_rollout_kept=0.2989
198
+ step=200 epoch=200/1000 epoch_step=1/1 micro_steps=200 elapsed=21.9s lr=2.000000e-03 loss=5.8047 loss_recon=5.8047 loss_meanflow=0.0000 mean_model_t=0.5015 mean_corrupt_t=0.5015 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.5026 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.3298 corrupt_frac=1.0000 acc_corrupt=0.3298 loss_corrupt=5.8047 wrong_frac=0.4984 init_acc_corrupt=0.4677 acc_corrupt_t_0p0_0p2=0.0525 corrupt_frac_t_0p0_0p2=0.2005 acc_corrupt_t_0p2_0p4=0.1623 corrupt_frac_t_0p2_0p4=0.1979 acc_corrupt_t_0p4_0p6=0.3295 corrupt_frac_t_0p4_0p6=0.1994 acc_corrupt_t_0p6_0p8=0.4757 corrupt_frac_t_0p6_0p8=0.2007 acc_corrupt_t_0p8_1p0=0.6253 corrupt_frac_t_0p8_1p0=0.2015 out_w_norm=3.4960 out_g_norm=1.3301 loss_all=5.0150 init_gold_top10=0.5089 init_gold_top100=0.6439 rollout_applied_pos_frac=0.5391 init_acc_rollout_applied=0.4913 init_acc_rollout_kept=0.4670 logit_acc_rollout_applied=0.3618 logit_acc_rollout_kept=0.3521
199
+ step=300 epoch=300/1000 epoch_step=1/1 micro_steps=300 elapsed=22.0s lr=2.000000e-03 loss=4.7177 loss_recon=4.7177 loss_meanflow=0.0000 mean_model_t=0.5015 mean_corrupt_t=0.5015 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.5045 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.3676 corrupt_frac=1.0000 acc_corrupt=0.3676 loss_corrupt=4.7177 wrong_frac=0.4985 init_acc_corrupt=0.4688 acc_corrupt_t_0p0_0p2=0.0555 corrupt_frac_t_0p0_0p2=0.1947 acc_corrupt_t_0p2_0p4=0.1893 corrupt_frac_t_0p2_0p4=0.2030 acc_corrupt_t_0p4_0p6=0.3638 corrupt_frac_t_0p4_0p6=0.1991 acc_corrupt_t_0p6_0p8=0.5243 corrupt_frac_t_0p6_0p8=0.2045 acc_corrupt_t_0p8_1p0=0.6979 corrupt_frac_t_0p8_1p0=0.1988 out_w_norm=5.5798 out_g_norm=0.5487 loss_all=4.3897 init_gold_top10=0.5163 init_gold_top100=0.6526 rollout_applied_pos_frac=0.5391 init_acc_rollout_applied=0.4707 init_acc_rollout_kept=0.4746 logit_acc_rollout_applied=0.3883 logit_acc_rollout_kept=0.3952
200
+ step=400 epoch=400/1000 epoch_step=1/1 micro_steps=400 elapsed=21.9s lr=2.000000e-03 loss=4.1243 loss_recon=4.1243 loss_meanflow=0.0000 mean_model_t=0.4986 mean_corrupt_t=0.4986 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.4976 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.4221 corrupt_frac=1.0000 acc_corrupt=0.4221 loss_corrupt=4.1243 wrong_frac=0.5016 init_acc_corrupt=0.4654 acc_corrupt_t_0p0_0p2=0.0580 corrupt_frac_t_0p0_0p2=0.2005 acc_corrupt_t_0p2_0p4=0.2091 corrupt_frac_t_0p2_0p4=0.2001 acc_corrupt_t_0p4_0p6=0.4195 corrupt_frac_t_0p4_0p6=0.2023 acc_corrupt_t_0p6_0p8=0.6143 corrupt_frac_t_0p6_0p8=0.1992 acc_corrupt_t_0p8_1p0=0.8153 corrupt_frac_t_0p8_1p0=0.1979 out_w_norm=7.1150 out_g_norm=0.2786 loss_all=3.9526 init_gold_top10=0.4907 init_gold_top100=0.6628 rollout_applied_pos_frac=0.5469 init_acc_rollout_applied=0.4724 init_acc_rollout_kept=0.4094 logit_acc_rollout_applied=0.4663 logit_acc_rollout_kept=0.4213
201
+ step=500 epoch=500/1000 epoch_step=1/1 micro_steps=500 elapsed=21.9s lr=2.000000e-03 loss=3.5453 loss_recon=3.5453 loss_meanflow=0.0000 mean_model_t=0.4977 mean_corrupt_t=0.4977 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.4981 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.4820 corrupt_frac=1.0000 acc_corrupt=0.4820 loss_corrupt=3.5453 wrong_frac=0.5023 init_acc_corrupt=0.4647 acc_corrupt_t_0p0_0p2=0.0593 corrupt_frac_t_0p0_0p2=0.2059 acc_corrupt_t_0p2_0p4=0.2410 corrupt_frac_t_0p2_0p4=0.1985 acc_corrupt_t_0p4_0p6=0.5043 corrupt_frac_t_0p4_0p6=0.1969 acc_corrupt_t_0p6_0p8=0.7092 corrupt_frac_t_0p6_0p8=0.1927 acc_corrupt_t_0p8_1p0=0.9032 corrupt_frac_t_0p8_1p0=0.2059 out_w_norm=8.4492 out_g_norm=0.2354 loss_all=3.3655 init_gold_top10=0.4939 init_gold_top100=0.6374 rollout_applied_pos_frac=0.5234 init_acc_rollout_applied=0.4260 init_acc_rollout_kept=0.4782 logit_acc_rollout_applied=0.4481 logit_acc_rollout_kept=0.5033
202
+ step=600 epoch=600/1000 epoch_step=1/1 micro_steps=600 elapsed=21.9s lr=2.000000e-03 loss=3.0856 loss_recon=3.0856 loss_meanflow=0.0000 mean_model_t=0.5012 mean_corrupt_t=0.5012 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.5030 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.4961 corrupt_frac=1.0000 acc_corrupt=0.4961 loss_corrupt=3.0856 wrong_frac=0.4987 init_acc_corrupt=0.4687 acc_corrupt_t_0p0_0p2=0.0620 corrupt_frac_t_0p0_0p2=0.1972 acc_corrupt_t_0p2_0p4=0.2697 corrupt_frac_t_0p2_0p4=0.2024 acc_corrupt_t_0p4_0p6=0.5245 corrupt_frac_t_0p4_0p6=0.1990 acc_corrupt_t_0p6_0p8=0.7150 corrupt_frac_t_0p6_0p8=0.2017 acc_corrupt_t_0p8_1p0=0.9048 corrupt_frac_t_0p8_1p0=0.1997 out_w_norm=9.7161 out_g_norm=0.2573 loss_all=2.4550 init_gold_top10=0.5903 init_gold_top100=0.7292 rollout_applied_pos_frac=0.4922 init_acc_rollout_applied=0.5184 init_acc_rollout_kept=0.5818 logit_acc_rollout_applied=0.5434 logit_acc_rollout_kept=0.6081
203
+ step=700 epoch=700/1000 epoch_step=1/1 micro_steps=700 elapsed=21.8s lr=2.000000e-03 loss=2.7639 loss_recon=2.7639 loss_meanflow=0.0000 mean_model_t=0.5008 mean_corrupt_t=0.5008 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.4924 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.5033 corrupt_frac=1.0000 acc_corrupt=0.5033 loss_corrupt=2.7639 wrong_frac=0.4994 init_acc_corrupt=0.4685 acc_corrupt_t_0p0_0p2=0.0636 corrupt_frac_t_0p0_0p2=0.2030 acc_corrupt_t_0p2_0p4=0.2806 corrupt_frac_t_0p2_0p4=0.1915 acc_corrupt_t_0p4_0p6=0.5329 corrupt_frac_t_0p4_0p6=0.2023 acc_corrupt_t_0p6_0p8=0.7229 corrupt_frac_t_0p6_0p8=0.2001 acc_corrupt_t_0p8_1p0=0.9063 corrupt_frac_t_0p8_1p0=0.2032 out_w_norm=10.7285 out_g_norm=0.2921 loss_all=2.5847 init_gold_top10=0.5278 init_gold_top100=0.7140 rollout_applied_pos_frac=0.5000 init_acc_rollout_applied=0.4351 init_acc_rollout_kept=0.5192 logit_acc_rollout_applied=0.4776 logit_acc_rollout_kept=0.5548
204
+ step=800 epoch=800/1000 epoch_step=1/1 micro_steps=800 elapsed=21.9s lr=2.000000e-03 loss=2.3239 loss_recon=2.3239 loss_meanflow=0.0000 mean_model_t=0.4962 mean_corrupt_t=0.4962 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.4930 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.5318 corrupt_frac=1.0000 acc_corrupt=0.5318 loss_corrupt=2.3239 wrong_frac=0.5037 init_acc_corrupt=0.4641 acc_corrupt_t_0p0_0p2=0.0632 corrupt_frac_t_0p0_0p2=0.2026 acc_corrupt_t_0p2_0p4=0.3081 corrupt_frac_t_0p2_0p4=0.2020 acc_corrupt_t_0p4_0p6=0.5910 corrupt_frac_t_0p4_0p6=0.2005 acc_corrupt_t_0p6_0p8=0.7810 corrupt_frac_t_0p6_0p8=0.1965 acc_corrupt_t_0p8_1p0=0.9311 corrupt_frac_t_0p8_1p0=0.1984 out_w_norm=11.2679 out_g_norm=0.3653 loss_all=2.2086 init_gold_top10=0.5381 init_gold_top100=0.7009 rollout_applied_pos_frac=0.4609 init_acc_rollout_applied=0.4962 init_acc_rollout_kept=0.4085 logit_acc_rollout_applied=0.5986 logit_acc_rollout_kept=0.5025
205
+ step=900 epoch=900/1000 epoch_step=1/1 micro_steps=900 elapsed=22.0s lr=2.000000e-03 loss=1.7969 loss_recon=1.7969 loss_meanflow=0.0000 mean_model_t=0.5040 mean_corrupt_t=0.5040 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.5050 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.6158 corrupt_frac=1.0000 acc_corrupt=0.6158 loss_corrupt=1.7969 wrong_frac=0.4960 init_acc_corrupt=0.4751 acc_corrupt_t_0p0_0p2=0.0647 corrupt_frac_t_0p0_0p2=0.1931 acc_corrupt_t_0p2_0p4=0.3813 corrupt_frac_t_0p2_0p4=0.2013 acc_corrupt_t_0p4_0p6=0.7385 corrupt_frac_t_0p4_0p6=0.2019 acc_corrupt_t_0p6_0p8=0.8946 corrupt_frac_t_0p6_0p8=0.1981 acc_corrupt_t_0p8_1p0=0.9738 corrupt_frac_t_0p8_1p0=0.2056 out_w_norm=11.7474 out_g_norm=0.4692 loss_all=1.5321 init_gold_top10=0.6081 init_gold_top100=0.7522 rollout_applied_pos_frac=0.4297 init_acc_rollout_applied=0.4678 init_acc_rollout_kept=0.5233 logit_acc_rollout_applied=0.6263 logit_acc_rollout_kept=0.7023
206
+ step=1000 epoch=1000/1000 epoch_step=1/1 micro_steps=1000 elapsed=21.9s lr=2.000000e-03 loss=1.5082 loss_recon=1.5082 loss_meanflow=0.0000 mean_model_t=0.5008 mean_corrupt_t=0.5008 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.4999 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.6774 corrupt_frac=1.0000 acc_corrupt=0.6774 loss_corrupt=1.5082 wrong_frac=0.4993 init_acc_corrupt=0.4771 acc_corrupt_t_0p0_0p2=0.0701 corrupt_frac_t_0p0_0p2=0.2034 acc_corrupt_t_0p2_0p4=0.4956 corrupt_frac_t_0p2_0p4=0.1967 acc_corrupt_t_0p4_0p6=0.8670 corrupt_frac_t_0p4_0p6=0.2010 acc_corrupt_t_0p6_0p8=0.9669 corrupt_frac_t_0p6_0p8=0.1933 acc_corrupt_t_0p8_1p0=0.9947 corrupt_frac_t_0p8_1p0=0.2055 out_w_norm=12.1092 out_g_norm=0.5374 loss_all=1.5845 init_gold_top10=0.5851 init_gold_top100=0.7357 rollout_applied_pos_frac=0.5547 init_acc_rollout_applied=0.5140 init_acc_rollout_kept=0.4298 logit_acc_rollout_applied=0.7118 logit_acc_rollout_kept=0.6295
207
+ NCCL version 2.25.1+cuda12.8
208
+ resumed_from=runs/train8_ctx1024_t5tok_p50_rand0_3_unif0_0p25_outwdm1_t5tok_ctx1024_k03_20260518_022728/latest.pt start_step=1001
209
+ {
210
+ "device": "cuda:0",
211
+ "rank": 0,
212
+ "world_size": 4,
213
+ "samples": "owt_cached_chunks:8",
214
+ "vocab_size": 2423,
215
+ "tokenizer_vocab_size": 32100,
216
+ "save_dir": "runs/train8_ctx1024_t5tok_p50_rand0_3_unif0_0p25_outwdm1_t5tok_ctx1024_k03_20260518_022728",
217
+ "batch_size": 128,
218
+ "grad_accum": 1,
219
+ "effective_batch_size": 512,
220
+ "global_batch_size": 512,
221
+ "lr_schedule": "constant_warmup",
222
+ "optimizer": "muon",
223
+ "epochs": 0.0,
224
+ "steps_per_epoch": 1,
225
+ "total_steps": 2000,
226
+ "warmup_steps": 10,
227
+ "warmup_epochs": -1.0,
228
+ "min_lr": 0.0,
229
+ "weight_decay": 0.1,
230
+ "output_weight_decay": -1.0,
231
+ "adamw_param_groups": "nanogpt",
232
+ "adam_beta1": 0.9,
233
+ "adam_beta2": 0.95,
234
+ "adam_eps": 1e-08,
235
+ "muon_impl": "legacy",
236
+ "muon_momentum": 0.95,
237
+ "muon_ns_steps": 5,
238
+ "muon_update_scale": 1.0,
239
+ "muon_nesterov": false,
240
+ "muon_width_scale": false,
241
+ "muon_grouping": "legacy_dim_ge_2",
242
+ "muon_param_count": 2523776,
243
+ "muon_adam_param_count": 8192,
244
+ "muon_param_names": [
245
+ "vocab_embed.embedding",
246
+ "sigma_map.net.0.weight",
247
+ "sigma_map.net.2.weight",
248
+ "blocks.0.attn_qkv.weight",
249
+ "blocks.0.attn_out.weight",
250
+ "blocks.0.mlp.0.weight",
251
+ "blocks.0.mlp.2.weight",
252
+ "blocks.0.adaLN_modulation.weight",
253
+ "blocks.1.attn_qkv.weight",
254
+ "blocks.1.attn_out.weight",
255
+ "blocks.1.mlp.0.weight",
256
+ "blocks.1.mlp.2.weight",
257
+ "blocks.1.adaLN_modulation.weight",
258
+ "blocks.2.attn_qkv.weight",
259
+ "blocks.2.attn_out.weight",
260
+ "blocks.2.mlp.0.weight",
261
+ "blocks.2.mlp.2.weight",
262
+ "blocks.2.adaLN_modulation.weight",
263
+ "output_layer.linear.weight",
264
+ "output_layer.adaLN_modulation.weight"
265
+ ],
266
+ "muon_adam_param_names": [
267
+ "sigma_map.net.0.bias",
268
+ "sigma_map.net.2.bias",
269
+ "blocks.0.norm1.weight",
270
+ "blocks.0.norm2.weight",
271
+ "blocks.0.mlp.0.bias",
272
+ "blocks.0.mlp.2.bias",
273
+ "blocks.0.adaLN_modulation.bias",
274
+ "blocks.1.norm1.weight",
275
+ "blocks.1.norm2.weight",
276
+ "blocks.1.mlp.0.bias",
277
+ "blocks.1.mlp.2.bias",
278
+ "blocks.1.adaLN_modulation.bias",
279
+ "blocks.2.norm1.weight",
280
+ "blocks.2.norm2.weight",
281
+ "blocks.2.mlp.0.bias",
282
+ "blocks.2.mlp.2.bias",
283
+ "blocks.2.adaLN_modulation.bias",
284
+ "output_layer.norm_final.weight",
285
+ "output_layer.adaLN_modulation.bias"
286
+ ],
287
+ "muon_effective_nesterov": false,
288
+ "muon_effective_width_scale": false,
289
+ "muon_effective_weight_decay": 0.1,
290
+ "muon_adam_fallback_nesterov": false,
291
+ "muon_adam_fallback_weight_decay": 0.1,
292
+ "ema_decay": 0.9999,
293
+ "ema_start_step": 0,
294
+ "model_type": "ddit",
295
+ "ddit_mlp_type": "gelu",
296
+ "elf_num_time_tokens": 4,
297
+ "elf_num_model_mode_tokens": 0,
298
+ "qk_norm": true,
299
+ "output_bias": false,
300
+ "output_init_std": -1.0,
301
+ "norm_type": "rmsnorm",
302
+ "target_loss": "hard_ce",
303
+ "linear_soft_target_power": 1.0,
304
+ "linear_soft_target_min_conf": 0.0,
305
+ "linear_soft_target_max_conf": 1.0,
306
+ "t_sampling_mode": "uniform",
307
+ "t_sampling_power": 1.0,
308
+ "t_sampling_eps": 0.0001,
309
+ "t_sampling_logit_mean": -1.5,
310
+ "t_sampling_logit_std": 0.8,
311
+ "dual_t": true,
312
+ "corrupt_t_mode": "same",
313
+ "corrupt_min_t": 0.0,
314
+ "corrupt_max_t": 1.0,
315
+ "prefix_block_prob": 0.0,
316
+ "prefix_block_len": 128,
317
+ "mask_ratio_floor_schedule": "none",
318
+ "dirichlet_endpoint_mode": "categorical_dual_t",
319
+ "dirichlet_semantic_t_mode": "same",
320
+ "dirichlet_semantic_t_value": 0.0,
321
+ "dirichlet_semantic_t_curve": "linear",
322
+ "dirichlet_semantic_t_power": 1.0,
323
+ "endpoint_sequence_random_prob_alpha": 0.0,
324
+ "categorical_wrong_from_full_vocab": true,
325
+ "categorical_wrong_from_batch_valid_tokens": false,
326
+ "categorical_wrong_basin_token_ids": "",
327
+ "categorical_wrong_basin_prob": 0.0,
328
+ "categorical_wrong_unigram_prob": 0.0,
329
+ "categorical_wrong_uniform_prob": 0.0,
330
+ "categorical_wrong_prob_floor": 0.0,
331
+ "categorical_wrong_corpus_unigram_path": "",
332
+ "categorical_wrong_corpus_unigram_alpha": 1.0,
333
+ "categorical_wrong_basin_shared_prob": 0.0,
334
+ "categorical_wrong_unigram_shared_prob": 0.0,
335
+ "mask_mixture_original_prob": 0.0,
336
+ "mask_mixture_lowk_prob": 0.0,
337
+ "mask_mixture_lowcorrupt_prob": 0.0,
338
+ "mask_mixture_block_prob": 0.0,
339
+ "mask_mixture_all_prob": 1.0,
340
+ "mask_mixture_lowk_clean_tokens": "0",
341
+ "mask_mixture_lowcorrupt_tokens": "1,2,4,8,16,32,64",
342
+ "mask_mixture_block_tokens": "64,128",
343
+ "simplex_bridge_sampler": "dirichlet",
344
+ "logistic_normal_sigma_min": 0.1,
345
+ "logistic_normal_sigma_max": 1.0,
346
+ "logistic_normal_tau_min": 1.0,
347
+ "logistic_normal_tau_max": 1.0,
348
+ "torch_compile": false,
349
+ "compile_mode": "max-autotune",
350
+ "state_format": "prob",
351
+ "meanflow_weight": 0.0,
352
+ "rollout_train_prob": 0.5,
353
+ "rollout_train_steps": 3,
354
+ "rollout_train_steps_min": 0,
355
+ "rollout_train_infer_steps": 1,
356
+ "rollout_train_time_mode": "sampled_path",
357
+ "rollout_train_s_dist": "uniform",
358
+ "rollout_train_s_min_frac": 0.0,
359
+ "rollout_train_s_max_frac": 0.25,
360
+ "rollout_train_s_beta_alpha": 2.0,
361
+ "rollout_train_s_beta_beta": 6.0,
362
+ "rollout_train_temp": 1.0,
363
+ "rollout_train_max_gamma": 1.0,
364
+ "rollout_train_corrupt_only": true,
365
+ "rollout_train_samplewise": true,
366
+ "rollout_train_compute_always": false,
367
+ "rollout_train_sync_t": true,
368
+ "bridge_noise_init": "logistic_normal",
369
+ "noise_sigma": -1.0,
370
+ "allow_tf32": true,
371
+ "activation_checkpointing": false,
372
+ "activation_checkpoint_interval": 1,
373
+ "activation_checkpoint_scope": "block",
374
+ "ddp_static_graph": false,
375
+ "ddp_gradient_as_bucket_view": true,
376
+ "blocking_data_transfer": false,
377
+ "dataloader_prefetch_factor": 4,
378
+ "full_train_stats": false,
379
+ "tokenized_hf": false,
380
+ "tokenized_pad_token": "pad",
381
+ "elf_conditional_hf": false,
382
+ "record_pad_truncate": false,
383
+ "record_add_eos": false,
384
+ "record_add_special_tokens": false,
385
+ "record_pad_token": "pad",
386
+ "record_shuffle_buffer": 10000,
387
+ "wrap": true,
388
+ "wrap_mode": "stream",
389
+ "wrap_record_buffer_size": 200,
390
+ "owt_cached_chunks": true,
391
+ "owt_chunk_cache_dir": "/e2e-data/evad-tech-vla/wanghan58/data/small_benchmarks/langflow_2604_11748/openwebtext_lta_cached_chunks/t5_len1024_train8_compact_overfit",
392
+ "owt_chunk_cache_rebuild": false,
393
+ "owt_chunk_cache_write_batch": 4096,
394
+ "owt_exact_repeat_per_chunk": 64,
395
+ "online_chunk_shuffle": false,
396
+ "online_chunk_shuffle_buffer": 10000,
397
+ "openwebtext_split": "train_minus_100k",
398
+ "detokenizer": "auto",
399
+ "resolved_detokenizer": null,
400
+ "num_workers": 0,
401
+ "latest_every": 1000,
402
+ "resume_path": "runs/train8_ctx1024_t5tok_p50_rand0_3_unif0_0p25_outwdm1_t5tok_ctx1024_k03_20260518_022728/latest.pt"
403
+ }
404
+ step=1100 epoch=1100/2000 epoch_step=1/1 micro_steps=1100 elapsed=22.9s lr=2.000000e-03 loss=1.3136 loss_recon=1.3136 loss_meanflow=0.0000 mean_model_t=0.4971 mean_corrupt_t=0.4971 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.5052 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.7171 corrupt_frac=1.0000 acc_corrupt=0.7171 loss_corrupt=1.3136 wrong_frac=0.5028 init_acc_corrupt=0.4805 acc_corrupt_t_0p0_0p2=0.0811 corrupt_frac_t_0p0_0p2=0.2071 acc_corrupt_t_0p2_0p4=0.6097 corrupt_frac_t_0p2_0p4=0.1988 acc_corrupt_t_0p4_0p6=0.9357 corrupt_frac_t_0p4_0p6=0.1976 acc_corrupt_t_0p6_0p8=0.9896 corrupt_frac_t_0p6_0p8=0.1951 acc_corrupt_t_0p8_1p0=0.9987 corrupt_frac_t_0p8_1p0=0.2015 out_w_norm=12.3565 out_g_norm=0.5727 loss_all=1.1645 init_gold_top10=0.5814 init_gold_top100=0.7221 rollout_applied_pos_frac=0.4688 init_acc_rollout_applied=0.4710 init_acc_rollout_kept=0.4382 logit_acc_rollout_applied=0.7261 logit_acc_rollout_kept=0.7346
405
+ step=1200 epoch=1200/2000 epoch_step=1/1 micro_steps=1200 elapsed=21.9s lr=2.000000e-03 loss=1.1183 loss_recon=1.1183 loss_meanflow=0.0000 mean_model_t=0.5015 mean_corrupt_t=0.5015 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.5026 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.7564 corrupt_frac=1.0000 acc_corrupt=0.7564 loss_corrupt=1.1183 wrong_frac=0.4984 init_acc_corrupt=0.4896 acc_corrupt_t_0p0_0p2=0.1022 corrupt_frac_t_0p0_0p2=0.2005 acc_corrupt_t_0p2_0p4=0.7152 corrupt_frac_t_0p2_0p4=0.1979 acc_corrupt_t_0p4_0p6=0.9682 corrupt_frac_t_0p4_0p6=0.1994 acc_corrupt_t_0p6_0p8=0.9960 corrupt_frac_t_0p6_0p8=0.2007 acc_corrupt_t_0p8_1p0=0.9995 corrupt_frac_t_0p8_1p0=0.2015 out_w_norm=12.5352 out_g_norm=0.6232 loss_all=1.0609 init_gold_top10=0.6302 init_gold_top100=0.7548 rollout_applied_pos_frac=0.5391 init_acc_rollout_applied=0.5547 init_acc_rollout_kept=0.4670 logit_acc_rollout_applied=0.8231 logit_acc_rollout_kept=0.7185
406
+ step=1300 epoch=1300/2000 epoch_step=1/1 micro_steps=1300 elapsed=22.0s lr=2.000000e-03 loss=0.9490 loss_recon=0.9490 loss_meanflow=0.0000 mean_model_t=0.5015 mean_corrupt_t=0.5015 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.5045 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.7902 corrupt_frac=1.0000 acc_corrupt=0.7902 loss_corrupt=0.9490 wrong_frac=0.4985 init_acc_corrupt=0.4958 acc_corrupt_t_0p0_0p2=0.1353 corrupt_frac_t_0p0_0p2=0.1947 acc_corrupt_t_0p2_0p4=0.8127 corrupt_frac_t_0p2_0p4=0.2030 acc_corrupt_t_0p4_0p6=0.9848 corrupt_frac_t_0p4_0p6=0.1991 acc_corrupt_t_0p6_0p8=0.9984 corrupt_frac_t_0p6_0p8=0.2045 acc_corrupt_t_0p8_1p0=0.9998 corrupt_frac_t_0p8_1p0=0.1988 out_w_norm=12.6919 out_g_norm=0.6284 loss_all=0.8864 init_gold_top10=0.6368 init_gold_top100=0.7525 rollout_applied_pos_frac=0.5391 init_acc_rollout_applied=0.5113 init_acc_rollout_kept=0.4746 logit_acc_rollout_applied=0.7752 logit_acc_rollout_kept=0.8346
407
+ step=1400 epoch=1400/2000 epoch_step=1/1 micro_steps=1400 elapsed=21.9s lr=2.000000e-03 loss=0.8473 loss_recon=0.8473 loss_meanflow=0.0000 mean_model_t=0.4986 mean_corrupt_t=0.4986 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.4976 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.8092 corrupt_frac=1.0000 acc_corrupt=0.8092 loss_corrupt=0.8473 wrong_frac=0.5016 init_acc_corrupt=0.4976 acc_corrupt_t_0p0_0p2=0.1782 corrupt_frac_t_0p0_0p2=0.2005 acc_corrupt_t_0p2_0p4=0.8774 corrupt_frac_t_0p2_0p4=0.2001 acc_corrupt_t_0p4_0p6=0.9931 corrupt_frac_t_0p4_0p6=0.2023 acc_corrupt_t_0p6_0p8=0.9992 corrupt_frac_t_0p6_0p8=0.1992 acc_corrupt_t_0p8_1p0=0.9999 corrupt_frac_t_0p8_1p0=0.1979 out_w_norm=12.8295 out_g_norm=0.5916 loss_all=0.7454 init_gold_top10=0.6684 init_gold_top100=0.7834 rollout_applied_pos_frac=0.5469 init_acc_rollout_applied=0.5835 init_acc_rollout_kept=0.4094 logit_acc_rollout_applied=0.8850 logit_acc_rollout_kept=0.7593
408
+ step=1500 epoch=1500/2000 epoch_step=1/1 micro_steps=1500 elapsed=21.9s lr=2.000000e-03 loss=0.7699 loss_recon=0.7699 loss_meanflow=0.0000 mean_model_t=0.4977 mean_corrupt_t=0.4977 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.4981 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.8231 corrupt_frac=1.0000 acc_corrupt=0.8231 loss_corrupt=0.7699 wrong_frac=0.5023 init_acc_corrupt=0.4995 acc_corrupt_t_0p0_0p2=0.2149 corrupt_frac_t_0p0_0p2=0.2059 acc_corrupt_t_0p2_0p4=0.9273 corrupt_frac_t_0p2_0p4=0.1985 acc_corrupt_t_0p4_0p6=0.9967 corrupt_frac_t_0p4_0p6=0.1969 acc_corrupt_t_0p6_0p8=0.9995 corrupt_frac_t_0p6_0p8=0.1927 acc_corrupt_t_0p8_1p0=0.9999 corrupt_frac_t_0p8_1p0=0.2059 out_w_norm=12.9363 out_g_norm=0.6203 loss_all=0.6508 init_gold_top10=0.6072 init_gold_top100=0.7108 rollout_applied_pos_frac=0.5234 init_acc_rollout_applied=0.5076 init_acc_rollout_kept=0.4782 logit_acc_rollout_applied=0.8608 logit_acc_rollout_kept=0.8318
409
+ step=1600 epoch=1600/2000 epoch_step=1/1 micro_steps=1600 elapsed=22.0s lr=2.000000e-03 loss=0.6566 loss_recon=0.6566 loss_meanflow=0.0000 mean_model_t=0.5012 mean_corrupt_t=0.5012 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.5030 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.8444 corrupt_frac=1.0000 acc_corrupt=0.8444 loss_corrupt=0.6566 wrong_frac=0.4987 init_acc_corrupt=0.5059 acc_corrupt_t_0p0_0p2=0.2596 corrupt_frac_t_0p0_0p2=0.1972 acc_corrupt_t_0p2_0p4=0.9547 corrupt_frac_t_0p2_0p4=0.2024 acc_corrupt_t_0p4_0p6=0.9982 corrupt_frac_t_0p4_0p6=0.1990 acc_corrupt_t_0p6_0p8=0.9996 corrupt_frac_t_0p6_0p8=0.2017 acc_corrupt_t_0p8_1p0=0.9999 corrupt_frac_t_0p8_1p0=0.1997 out_w_norm=13.0208 out_g_norm=0.5928 loss_all=0.4948 init_gold_top10=0.7147 init_gold_top100=0.7938 rollout_applied_pos_frac=0.4922 init_acc_rollout_applied=0.6193 init_acc_rollout_kept=0.5818 logit_acc_rollout_applied=0.8863 logit_acc_rollout_kept=0.8906
410
+ step=1700 epoch=1700/2000 epoch_step=1/1 micro_steps=1700 elapsed=21.9s lr=2.000000e-03 loss=0.6109 loss_recon=0.6109 loss_meanflow=0.0000 mean_model_t=0.5008 mean_corrupt_t=0.5008 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.4924 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.8498 corrupt_frac=1.0000 acc_corrupt=0.8498 loss_corrupt=0.6109 wrong_frac=0.4994 init_acc_corrupt=0.5062 acc_corrupt_t_0p0_0p2=0.2896 corrupt_frac_t_0p0_0p2=0.2030 acc_corrupt_t_0p2_0p4=0.9701 corrupt_frac_t_0p2_0p4=0.1915 acc_corrupt_t_0p4_0p6=0.9990 corrupt_frac_t_0p4_0p6=0.2023 acc_corrupt_t_0p6_0p8=0.9998 corrupt_frac_t_0p6_0p8=0.2001 acc_corrupt_t_0p8_1p0=0.9999 corrupt_frac_t_0p8_1p0=0.2032 out_w_norm=13.0742 out_g_norm=0.5800 loss_all=0.7399 init_gold_top10=0.6286 init_gold_top100=0.7624 rollout_applied_pos_frac=0.5000 init_acc_rollout_applied=0.4682 init_acc_rollout_kept=0.5192 logit_acc_rollout_applied=0.7932 logit_acc_rollout_kept=0.8269
411
+ step=1800 epoch=1800/2000 epoch_step=1/1 micro_steps=1800 elapsed=21.9s lr=2.000000e-03 loss=0.5545 loss_recon=0.5545 loss_meanflow=0.0000 mean_model_t=0.4962 mean_corrupt_t=0.4962 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.4930 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.8579 corrupt_frac=1.0000 acc_corrupt=0.8579 loss_corrupt=0.5545 wrong_frac=0.5037 init_acc_corrupt=0.5034 acc_corrupt_t_0p0_0p2=0.3201 corrupt_frac_t_0p0_0p2=0.2026 acc_corrupt_t_0p2_0p4=0.9793 corrupt_frac_t_0p2_0p4=0.2020 acc_corrupt_t_0p4_0p6=0.9993 corrupt_frac_t_0p4_0p6=0.2005 acc_corrupt_t_0p6_0p8=0.9998 corrupt_frac_t_0p6_0p8=0.1965 acc_corrupt_t_0p8_1p0=0.9999 corrupt_frac_t_0p8_1p0=0.1984 out_w_norm=13.1038 out_g_norm=0.6309 loss_all=0.5981 init_gold_top10=0.5947 init_gold_top100=0.7261 rollout_applied_pos_frac=0.4609 init_acc_rollout_applied=0.5526 init_acc_rollout_kept=0.4085 logit_acc_rollout_applied=0.8759 logit_acc_rollout_kept=0.8048
412
+ step=1900 epoch=1900/2000 epoch_step=1/1 micro_steps=1900 elapsed=22.0s lr=2.000000e-03 loss=0.4655 loss_recon=0.4655 loss_meanflow=0.0000 mean_model_t=0.5040 mean_corrupt_t=0.5040 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.5050 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.8745 corrupt_frac=1.0000 acc_corrupt=0.8745 loss_corrupt=0.4655 wrong_frac=0.4960 init_acc_corrupt=0.5140 acc_corrupt_t_0p0_0p2=0.3661 corrupt_frac_t_0p0_0p2=0.1931 acc_corrupt_t_0p2_0p4=0.9856 corrupt_frac_t_0p2_0p4=0.2013 acc_corrupt_t_0p4_0p6=0.9995 corrupt_frac_t_0p4_0p6=0.2019 acc_corrupt_t_0p6_0p8=0.9999 corrupt_frac_t_0p6_0p8=0.1981 acc_corrupt_t_0p8_1p0=0.9999 corrupt_frac_t_0p8_1p0=0.2056 out_w_norm=13.1189 out_g_norm=0.5747 loss_all=0.3300 init_gold_top10=0.6904 init_gold_top100=0.7832 rollout_applied_pos_frac=0.4297 init_acc_rollout_applied=0.5847 init_acc_rollout_kept=0.5233 logit_acc_rollout_applied=0.9022 logit_acc_rollout_kept=0.9125
413
+ step=2000 epoch=2000/2000 epoch_step=1/1 micro_steps=2000 elapsed=22.0s lr=2.000000e-03 loss=0.4380 loss_recon=0.4380 loss_meanflow=0.0000 mean_model_t=0.5008 mean_corrupt_t=0.5008 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.4999 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.8777 corrupt_frac=1.0000 acc_corrupt=0.8777 loss_corrupt=0.4380 wrong_frac=0.4993 init_acc_corrupt=0.5112 acc_corrupt_t_0p0_0p2=0.4089 corrupt_frac_t_0p0_0p2=0.2034 acc_corrupt_t_0p2_0p4=0.9900 corrupt_frac_t_0p2_0p4=0.1967 acc_corrupt_t_0p4_0p6=0.9997 corrupt_frac_t_0p4_0p6=0.2010 acc_corrupt_t_0p6_0p8=0.9999 corrupt_frac_t_0p6_0p8=0.1933 acc_corrupt_t_0p8_1p0=1.0000 corrupt_frac_t_0p8_1p0=0.2055 out_w_norm=13.1427 out_g_norm=0.5409 loss_all=0.6228 init_gold_top10=0.6355 init_gold_top100=0.7547 rollout_applied_pos_frac=0.5547 init_acc_rollout_applied=0.5375 init_acc_rollout_kept=0.4298 logit_acc_rollout_applied=0.8516 logit_acc_rollout_kept=0.8000
414
+ NCCL version 2.25.1+cuda12.8
415
+ resumed_from=runs/train8_ctx1024_t5tok_p50_rand0_3_unif0_0p25_outwdm1_t5tok_ctx1024_k03_20260518_022728/latest.pt start_step=2001
416
+ {
417
+ "device": "cuda:0",
418
+ "rank": 0,
419
+ "world_size": 4,
420
+ "samples": "owt_cached_chunks:8",
421
+ "vocab_size": 2423,
422
+ "tokenizer_vocab_size": 32100,
423
+ "save_dir": "runs/train8_ctx1024_t5tok_p50_rand0_3_unif0_0p25_outwdm1_t5tok_ctx1024_k03_20260518_022728",
424
+ "batch_size": 128,
425
+ "grad_accum": 1,
426
+ "effective_batch_size": 512,
427
+ "global_batch_size": 512,
428
+ "lr_schedule": "constant_warmup",
429
+ "optimizer": "muon",
430
+ "epochs": 0.0,
431
+ "steps_per_epoch": 1,
432
+ "total_steps": 3000,
433
+ "warmup_steps": 10,
434
+ "warmup_epochs": -1.0,
435
+ "min_lr": 0.0,
436
+ "weight_decay": 0.1,
437
+ "output_weight_decay": -1.0,
438
+ "adamw_param_groups": "nanogpt",
439
+ "adam_beta1": 0.9,
440
+ "adam_beta2": 0.95,
441
+ "adam_eps": 1e-08,
442
+ "muon_impl": "legacy",
443
+ "muon_momentum": 0.95,
444
+ "muon_ns_steps": 5,
445
+ "muon_update_scale": 1.0,
446
+ "muon_nesterov": false,
447
+ "muon_width_scale": false,
448
+ "muon_grouping": "legacy_dim_ge_2",
449
+ "muon_param_count": 2523776,
450
+ "muon_adam_param_count": 8192,
451
+ "muon_param_names": [
452
+ "vocab_embed.embedding",
453
+ "sigma_map.net.0.weight",
454
+ "sigma_map.net.2.weight",
455
+ "blocks.0.attn_qkv.weight",
456
+ "blocks.0.attn_out.weight",
457
+ "blocks.0.mlp.0.weight",
458
+ "blocks.0.mlp.2.weight",
459
+ "blocks.0.adaLN_modulation.weight",
460
+ "blocks.1.attn_qkv.weight",
461
+ "blocks.1.attn_out.weight",
462
+ "blocks.1.mlp.0.weight",
463
+ "blocks.1.mlp.2.weight",
464
+ "blocks.1.adaLN_modulation.weight",
465
+ "blocks.2.attn_qkv.weight",
466
+ "blocks.2.attn_out.weight",
467
+ "blocks.2.mlp.0.weight",
468
+ "blocks.2.mlp.2.weight",
469
+ "blocks.2.adaLN_modulation.weight",
470
+ "output_layer.linear.weight",
471
+ "output_layer.adaLN_modulation.weight"
472
+ ],
473
+ "muon_adam_param_names": [
474
+ "sigma_map.net.0.bias",
475
+ "sigma_map.net.2.bias",
476
+ "blocks.0.norm1.weight",
477
+ "blocks.0.norm2.weight",
478
+ "blocks.0.mlp.0.bias",
479
+ "blocks.0.mlp.2.bias",
480
+ "blocks.0.adaLN_modulation.bias",
481
+ "blocks.1.norm1.weight",
482
+ "blocks.1.norm2.weight",
483
+ "blocks.1.mlp.0.bias",
484
+ "blocks.1.mlp.2.bias",
485
+ "blocks.1.adaLN_modulation.bias",
486
+ "blocks.2.norm1.weight",
487
+ "blocks.2.norm2.weight",
488
+ "blocks.2.mlp.0.bias",
489
+ "blocks.2.mlp.2.bias",
490
+ "blocks.2.adaLN_modulation.bias",
491
+ "output_layer.norm_final.weight",
492
+ "output_layer.adaLN_modulation.bias"
493
+ ],
494
+ "muon_effective_nesterov": false,
495
+ "muon_effective_width_scale": false,
496
+ "muon_effective_weight_decay": 0.1,
497
+ "muon_adam_fallback_nesterov": false,
498
+ "muon_adam_fallback_weight_decay": 0.1,
499
+ "ema_decay": 0.9999,
500
+ "ema_start_step": 0,
501
+ "model_type": "ddit",
502
+ "ddit_mlp_type": "gelu",
503
+ "elf_num_time_tokens": 4,
504
+ "elf_num_model_mode_tokens": 0,
505
+ "qk_norm": true,
506
+ "output_bias": false,
507
+ "output_init_std": -1.0,
508
+ "norm_type": "rmsnorm",
509
+ "target_loss": "hard_ce",
510
+ "linear_soft_target_power": 1.0,
511
+ "linear_soft_target_min_conf": 0.0,
512
+ "linear_soft_target_max_conf": 1.0,
513
+ "t_sampling_mode": "uniform",
514
+ "t_sampling_power": 1.0,
515
+ "t_sampling_eps": 0.0001,
516
+ "t_sampling_logit_mean": -1.5,
517
+ "t_sampling_logit_std": 0.8,
518
+ "dual_t": true,
519
+ "corrupt_t_mode": "same",
520
+ "corrupt_min_t": 0.0,
521
+ "corrupt_max_t": 1.0,
522
+ "prefix_block_prob": 0.0,
523
+ "prefix_block_len": 128,
524
+ "mask_ratio_floor_schedule": "none",
525
+ "dirichlet_endpoint_mode": "categorical_dual_t",
526
+ "dirichlet_semantic_t_mode": "same",
527
+ "dirichlet_semantic_t_value": 0.0,
528
+ "dirichlet_semantic_t_curve": "linear",
529
+ "dirichlet_semantic_t_power": 1.0,
530
+ "endpoint_sequence_random_prob_alpha": 0.0,
531
+ "categorical_wrong_from_full_vocab": true,
532
+ "categorical_wrong_from_batch_valid_tokens": false,
533
+ "categorical_wrong_basin_token_ids": "",
534
+ "categorical_wrong_basin_prob": 0.0,
535
+ "categorical_wrong_unigram_prob": 0.0,
536
+ "categorical_wrong_uniform_prob": 0.0,
537
+ "categorical_wrong_prob_floor": 0.0,
538
+ "categorical_wrong_corpus_unigram_path": "",
539
+ "categorical_wrong_corpus_unigram_alpha": 1.0,
540
+ "categorical_wrong_basin_shared_prob": 0.0,
541
+ "categorical_wrong_unigram_shared_prob": 0.0,
542
+ "mask_mixture_original_prob": 0.0,
543
+ "mask_mixture_lowk_prob": 0.0,
544
+ "mask_mixture_lowcorrupt_prob": 0.0,
545
+ "mask_mixture_block_prob": 0.0,
546
+ "mask_mixture_all_prob": 1.0,
547
+ "mask_mixture_lowk_clean_tokens": "0",
548
+ "mask_mixture_lowcorrupt_tokens": "1,2,4,8,16,32,64",
549
+ "mask_mixture_block_tokens": "64,128",
550
+ "simplex_bridge_sampler": "dirichlet",
551
+ "logistic_normal_sigma_min": 0.1,
552
+ "logistic_normal_sigma_max": 1.0,
553
+ "logistic_normal_tau_min": 1.0,
554
+ "logistic_normal_tau_max": 1.0,
555
+ "torch_compile": false,
556
+ "compile_mode": "max-autotune",
557
+ "state_format": "prob",
558
+ "meanflow_weight": 0.0,
559
+ "rollout_train_prob": 0.5,
560
+ "rollout_train_steps": 3,
561
+ "rollout_train_steps_min": 0,
562
+ "rollout_train_infer_steps": 1,
563
+ "rollout_train_time_mode": "sampled_path",
564
+ "rollout_train_s_dist": "uniform",
565
+ "rollout_train_s_min_frac": 0.0,
566
+ "rollout_train_s_max_frac": 0.25,
567
+ "rollout_train_s_beta_alpha": 2.0,
568
+ "rollout_train_s_beta_beta": 6.0,
569
+ "rollout_train_temp": 1.0,
570
+ "rollout_train_max_gamma": 1.0,
571
+ "rollout_train_corrupt_only": true,
572
+ "rollout_train_samplewise": true,
573
+ "rollout_train_compute_always": false,
574
+ "rollout_train_sync_t": true,
575
+ "bridge_noise_init": "logistic_normal",
576
+ "noise_sigma": -1.0,
577
+ "allow_tf32": true,
578
+ "activation_checkpointing": false,
579
+ "activation_checkpoint_interval": 1,
580
+ "activation_checkpoint_scope": "block",
581
+ "ddp_static_graph": false,
582
+ "ddp_gradient_as_bucket_view": true,
583
+ "blocking_data_transfer": false,
584
+ "dataloader_prefetch_factor": 4,
585
+ "full_train_stats": false,
586
+ "tokenized_hf": false,
587
+ "tokenized_pad_token": "pad",
588
+ "elf_conditional_hf": false,
589
+ "record_pad_truncate": false,
590
+ "record_add_eos": false,
591
+ "record_add_special_tokens": false,
592
+ "record_pad_token": "pad",
593
+ "record_shuffle_buffer": 10000,
594
+ "wrap": true,
595
+ "wrap_mode": "stream",
596
+ "wrap_record_buffer_size": 200,
597
+ "owt_cached_chunks": true,
598
+ "owt_chunk_cache_dir": "/e2e-data/evad-tech-vla/wanghan58/data/small_benchmarks/langflow_2604_11748/openwebtext_lta_cached_chunks/t5_len1024_train8_compact_overfit",
599
+ "owt_chunk_cache_rebuild": false,
600
+ "owt_chunk_cache_write_batch": 4096,
601
+ "owt_exact_repeat_per_chunk": 64,
602
+ "online_chunk_shuffle": false,
603
+ "online_chunk_shuffle_buffer": 10000,
604
+ "openwebtext_split": "train_minus_100k",
605
+ "detokenizer": "auto",
606
+ "resolved_detokenizer": null,
607
+ "num_workers": 0,
608
+ "latest_every": 1000,
609
+ "resume_path": "runs/train8_ctx1024_t5tok_p50_rand0_3_unif0_0p25_outwdm1_t5tok_ctx1024_k03_20260518_022728/latest.pt"
610
+ }
611
+ step=2100 epoch=2100/3000 epoch_step=1/1 micro_steps=2100 elapsed=22.9s lr=2.000000e-03 loss=0.4227 loss_recon=0.4227 loss_meanflow=0.0000 mean_model_t=0.4971 mean_corrupt_t=0.4971 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.5052 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.8784 corrupt_frac=1.0000 acc_corrupt=0.8784 loss_corrupt=0.4227 wrong_frac=0.5028 init_acc_corrupt=0.5099 acc_corrupt_t_0p0_0p2=0.4207 corrupt_frac_t_0p0_0p2=0.2071 acc_corrupt_t_0p2_0p4=0.9922 corrupt_frac_t_0p2_0p4=0.1988 acc_corrupt_t_0p4_0p6=0.9997 corrupt_frac_t_0p4_0p6=0.1976 acc_corrupt_t_0p6_0p8=0.9999 corrupt_frac_t_0p6_0p8=0.1951 acc_corrupt_t_0p8_1p0=1.0000 corrupt_frac_t_0p8_1p0=0.2015 out_w_norm=13.1370 out_g_norm=0.5656 loss_all=0.3018 init_gold_top10=0.6387 init_gold_top100=0.7422 rollout_applied_pos_frac=0.4688 init_acc_rollout_applied=0.5405 init_acc_rollout_kept=0.4382 logit_acc_rollout_applied=0.8902 logit_acc_rollout_kept=0.9257
612
+ step=2200 epoch=2200/3000 epoch_step=1/1 micro_steps=2200 elapsed=21.9s lr=2.000000e-03 loss=0.3744 loss_recon=0.3744 loss_meanflow=0.0000 mean_model_t=0.5015 mean_corrupt_t=0.5015 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.5026 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.8896 corrupt_frac=1.0000 acc_corrupt=0.8896 loss_corrupt=0.3744 wrong_frac=0.4984 init_acc_corrupt=0.5127 acc_corrupt_t_0p0_0p2=0.4549 corrupt_frac_t_0p0_0p2=0.2005 acc_corrupt_t_0p2_0p4=0.9946 corrupt_frac_t_0p2_0p4=0.1979 acc_corrupt_t_0p4_0p6=0.9998 corrupt_frac_t_0p4_0p6=0.1994 acc_corrupt_t_0p6_0p8=0.9999 corrupt_frac_t_0p6_0p8=0.2007 acc_corrupt_t_0p8_1p0=1.0000 corrupt_frac_t_0p8_1p0=0.2015 out_w_norm=13.1374 out_g_norm=0.5428 loss_all=0.4181 init_gold_top10=0.6663 init_gold_top100=0.7683 rollout_applied_pos_frac=0.5391 init_acc_rollout_applied=0.5834 init_acc_rollout_kept=0.4670 logit_acc_rollout_applied=0.9081 logit_acc_rollout_kept=0.8371
613
+ step=2300 epoch=2300/3000 epoch_step=1/1 micro_steps=2300 elapsed=22.0s lr=2.000000e-03 loss=0.3433 loss_recon=0.3433 loss_meanflow=0.0000 mean_model_t=0.5015 mean_corrupt_t=0.5015 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.5045 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.8958 corrupt_frac=1.0000 acc_corrupt=0.8958 loss_corrupt=0.3433 wrong_frac=0.4985 init_acc_corrupt=0.5141 acc_corrupt_t_0p0_0p2=0.4692 corrupt_frac_t_0p0_0p2=0.1947 acc_corrupt_t_0p2_0p4=0.9959 corrupt_frac_t_0p2_0p4=0.2030 acc_corrupt_t_0p4_0p6=0.9998 corrupt_frac_t_0p4_0p6=0.1991 acc_corrupt_t_0p6_0p8=0.9999 corrupt_frac_t_0p6_0p8=0.2045 acc_corrupt_t_0p8_1p0=1.0000 corrupt_frac_t_0p8_1p0=0.1988 out_w_norm=13.1241 out_g_norm=0.5142 loss_all=0.3396 init_gold_top10=0.6717 init_gold_top100=0.7648 rollout_applied_pos_frac=0.5391 init_acc_rollout_applied=0.5447 init_acc_rollout_kept=0.4746 logit_acc_rollout_applied=0.8425 logit_acc_rollout_kept=0.9545
614
+ step=2400 epoch=2400/3000 epoch_step=1/1 micro_steps=2400 elapsed=21.9s lr=2.000000e-03 loss=0.3246 loss_recon=0.3246 loss_meanflow=0.0000 mean_model_t=0.4986 mean_corrupt_t=0.4986 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.4976 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.8982 corrupt_frac=1.0000 acc_corrupt=0.8982 loss_corrupt=0.3246 wrong_frac=0.5016 init_acc_corrupt=0.5120 acc_corrupt_t_0p0_0p2=0.4957 corrupt_frac_t_0p0_0p2=0.2005 acc_corrupt_t_0p2_0p4=0.9966 corrupt_frac_t_0p2_0p4=0.2001 acc_corrupt_t_0p4_0p6=0.9998 corrupt_frac_t_0p4_0p6=0.2023 acc_corrupt_t_0p6_0p8=0.9999 corrupt_frac_t_0p6_0p8=0.1992 acc_corrupt_t_0p8_1p0=1.0000 corrupt_frac_t_0p8_1p0=0.1979 out_w_norm=13.1036 out_g_norm=0.4845 loss_all=0.3084 init_gold_top10=0.6982 init_gold_top100=0.7897 rollout_applied_pos_frac=0.5469 init_acc_rollout_applied=0.6229 init_acc_rollout_kept=0.4094 logit_acc_rollout_applied=0.9352 logit_acc_rollout_kept=0.8657
615
+ step=2500 epoch=2500/3000 epoch_step=1/1 micro_steps=2500 elapsed=22.0s lr=2.000000e-03 loss=0.3218 loss_recon=0.3218 loss_meanflow=0.0000 mean_model_t=0.4977 mean_corrupt_t=0.4977 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.4981 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.8974 corrupt_frac=1.0000 acc_corrupt=0.8974 loss_corrupt=0.3218 wrong_frac=0.5023 init_acc_corrupt=0.5107 acc_corrupt_t_0p0_0p2=0.5043 corrupt_frac_t_0p0_0p2=0.2059 acc_corrupt_t_0p2_0p4=0.9977 corrupt_frac_t_0p2_0p4=0.1985 acc_corrupt_t_0p4_0p6=0.9998 corrupt_frac_t_0p4_0p6=0.1969 acc_corrupt_t_0p6_0p8=0.9999 corrupt_frac_t_0p6_0p8=0.1927 acc_corrupt_t_0p8_1p0=1.0000 corrupt_frac_t_0p8_1p0=0.2059 out_w_norm=13.0637 out_g_norm=0.4908 loss_all=0.2705 init_gold_top10=0.6342 init_gold_top100=0.7127 rollout_applied_pos_frac=0.5234 init_acc_rollout_applied=0.5217 init_acc_rollout_kept=0.4782 logit_acc_rollout_applied=0.8998 logit_acc_rollout_kept=0.9173
616
+ step=2600 epoch=2600/3000 epoch_step=1/1 micro_steps=2600 elapsed=22.0s lr=2.000000e-03 loss=0.2821 loss_recon=0.2821 loss_meanflow=0.0000 mean_model_t=0.5012 mean_corrupt_t=0.5012 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.5030 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.9090 corrupt_frac=1.0000 acc_corrupt=0.9090 loss_corrupt=0.2821 wrong_frac=0.4987 init_acc_corrupt=0.5152 acc_corrupt_t_0p0_0p2=0.5405 corrupt_frac_t_0p0_0p2=0.1972 acc_corrupt_t_0p2_0p4=0.9984 corrupt_frac_t_0p2_0p4=0.2024 acc_corrupt_t_0p4_0p6=0.9999 corrupt_frac_t_0p4_0p6=0.1990 acc_corrupt_t_0p6_0p8=1.0000 corrupt_frac_t_0p6_0p8=0.2017 acc_corrupt_t_0p8_1p0=1.0000 corrupt_frac_t_0p8_1p0=0.1997 out_w_norm=13.0192 out_g_norm=0.4401 loss_all=0.2470 init_gold_top10=0.7280 init_gold_top100=0.7950 rollout_applied_pos_frac=0.4922 init_acc_rollout_applied=0.6351 init_acc_rollout_kept=0.5818 logit_acc_rollout_applied=0.9189 logit_acc_rollout_kept=0.9226
617
+ step=2700 epoch=2700/3000 epoch_step=1/1 micro_steps=2700 elapsed=21.8s lr=2.000000e-03 loss=0.2873 loss_recon=0.2873 loss_meanflow=0.0000 mean_model_t=0.5008 mean_corrupt_t=0.5008 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.4924 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.9066 corrupt_frac=1.0000 acc_corrupt=0.9066 loss_corrupt=0.2873 wrong_frac=0.4994 init_acc_corrupt=0.5126 acc_corrupt_t_0p0_0p2=0.5414 corrupt_frac_t_0p0_0p2=0.2030 acc_corrupt_t_0p2_0p4=0.9986 corrupt_frac_t_0p2_0p4=0.1915 acc_corrupt_t_0p4_0p6=0.9999 corrupt_frac_t_0p4_0p6=0.2023 acc_corrupt_t_0p6_0p8=1.0000 corrupt_frac_t_0p6_0p8=0.2001 acc_corrupt_t_0p8_1p0=1.0000 corrupt_frac_t_0p8_1p0=0.2032 out_w_norm=12.9538 out_g_norm=0.4343 loss_all=0.3652 init_gold_top10=0.6653 init_gold_top100=0.7625 rollout_applied_pos_frac=0.5000 init_acc_rollout_applied=0.4843 init_acc_rollout_kept=0.5192 logit_acc_rollout_applied=0.8712 logit_acc_rollout_kept=0.8880
618
+ step=2800 epoch=2800/3000 epoch_step=1/1 micro_steps=2800 elapsed=21.9s lr=2.000000e-03 loss=0.2828 loss_recon=0.2828 loss_meanflow=0.0000 mean_model_t=0.4962 mean_corrupt_t=0.4962 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.4930 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.9060 corrupt_frac=1.0000 acc_corrupt=0.9060 loss_corrupt=0.2828 wrong_frac=0.5037 init_acc_corrupt=0.5085 acc_corrupt_t_0p0_0p2=0.5374 corrupt_frac_t_0p0_0p2=0.2026 acc_corrupt_t_0p2_0p4=0.9989 corrupt_frac_t_0p2_0p4=0.2020 acc_corrupt_t_0p4_0p6=0.9999 corrupt_frac_t_0p4_0p6=0.2005 acc_corrupt_t_0p6_0p8=1.0000 corrupt_frac_t_0p6_0p8=0.1965 acc_corrupt_t_0p8_1p0=1.0000 corrupt_frac_t_0p8_1p0=0.1984 out_w_norm=12.8964 out_g_norm=0.4150 loss_all=0.3107 init_gold_top10=0.6161 init_gold_top100=0.7261 rollout_applied_pos_frac=0.4609 init_acc_rollout_applied=0.5516 init_acc_rollout_kept=0.4085 logit_acc_rollout_applied=0.9118 logit_acc_rollout_kept=0.8791
619
+ step=2900 epoch=2900/3000 epoch_step=1/1 micro_steps=2900 elapsed=22.0s lr=2.000000e-03 loss=0.2599 loss_recon=0.2599 loss_meanflow=0.0000 mean_model_t=0.5040 mean_corrupt_t=0.5040 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.5050 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.9131 corrupt_frac=1.0000 acc_corrupt=0.9131 loss_corrupt=0.2599 wrong_frac=0.4960 init_acc_corrupt=0.5190 acc_corrupt_t_0p0_0p2=0.5511 corrupt_frac_t_0p0_0p2=0.1931 acc_corrupt_t_0p2_0p4=0.9991 corrupt_frac_t_0p2_0p4=0.2013 acc_corrupt_t_0p4_0p6=0.9999 corrupt_frac_t_0p4_0p6=0.2019 acc_corrupt_t_0p6_0p8=1.0000 corrupt_frac_t_0p6_0p8=0.1981 acc_corrupt_t_0p8_1p0=1.0000 corrupt_frac_t_0p8_1p0=0.2056 out_w_norm=12.8359 out_g_norm=0.3958 loss_all=0.1289 init_gold_top10=0.7116 init_gold_top100=0.7832 rollout_applied_pos_frac=0.4297 init_acc_rollout_applied=0.6001 init_acc_rollout_kept=0.5233 logit_acc_rollout_applied=0.9320 logit_acc_rollout_kept=0.9771
620
+ step=3000 epoch=3000/3000 epoch_step=1/1 micro_steps=3000 elapsed=22.0s lr=2.000000e-03 loss=0.2674 loss_recon=0.2674 loss_meanflow=0.0000 mean_model_t=0.5008 mean_corrupt_t=0.5008 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.4999 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.9105 corrupt_frac=1.0000 acc_corrupt=0.9105 loss_corrupt=0.2674 wrong_frac=0.4993 init_acc_corrupt=0.5155 acc_corrupt_t_0p0_0p2=0.5608 corrupt_frac_t_0p0_0p2=0.2034 acc_corrupt_t_0p2_0p4=0.9992 corrupt_frac_t_0p2_0p4=0.1967 acc_corrupt_t_0p4_0p6=0.9999 corrupt_frac_t_0p4_0p6=0.2010 acc_corrupt_t_0p6_0p8=1.0000 corrupt_frac_t_0p6_0p8=0.1933 acc_corrupt_t_0p8_1p0=1.0000 corrupt_frac_t_0p8_1p0=0.2055 out_w_norm=12.7742 out_g_norm=0.3695 loss_all=0.4272 init_gold_top10=0.6409 init_gold_top100=0.7547 rollout_applied_pos_frac=0.5547 init_acc_rollout_applied=0.5394 init_acc_rollout_kept=0.4298 logit_acc_rollout_applied=0.8686 logit_acc_rollout_kept=0.8456
LTA_openwebtext_dualt/logs/softendpoint_mn_pilot_4gpu/train8_ctx1024_t5tok_p50_rand0_4_unif0_0p25_outwdm1_t5tok_ctx1024_randk_20260518_014436.log ADDED
@@ -0,0 +1,201 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ NCCL version 2.25.1+cuda12.8
2
+ {
3
+ "device": "cuda:0",
4
+ "rank": 0,
5
+ "world_size": 4,
6
+ "samples": "owt_cached_chunks:8",
7
+ "vocab_size": 2423,
8
+ "tokenizer_vocab_size": 32100,
9
+ "save_dir": "runs/train8_ctx1024_t5tok_p50_rand0_4_unif0_0p25_outwdm1_t5tok_ctx1024_randk_20260518_014436",
10
+ "batch_size": 128,
11
+ "grad_accum": 1,
12
+ "effective_batch_size": 512,
13
+ "global_batch_size": 512,
14
+ "lr_schedule": "constant_warmup",
15
+ "optimizer": "muon",
16
+ "epochs": 0.0,
17
+ "steps_per_epoch": 1,
18
+ "total_steps": 1000,
19
+ "warmup_steps": 10,
20
+ "warmup_epochs": -1.0,
21
+ "min_lr": 0.0,
22
+ "weight_decay": 0.1,
23
+ "output_weight_decay": -1.0,
24
+ "adamw_param_groups": "nanogpt",
25
+ "adam_beta1": 0.9,
26
+ "adam_beta2": 0.95,
27
+ "adam_eps": 1e-08,
28
+ "muon_impl": "legacy",
29
+ "muon_momentum": 0.95,
30
+ "muon_ns_steps": 5,
31
+ "muon_update_scale": 1.0,
32
+ "muon_nesterov": false,
33
+ "muon_width_scale": false,
34
+ "muon_grouping": "legacy_dim_ge_2",
35
+ "muon_param_count": 2523776,
36
+ "muon_adam_param_count": 8192,
37
+ "muon_param_names": [
38
+ "vocab_embed.embedding",
39
+ "sigma_map.net.0.weight",
40
+ "sigma_map.net.2.weight",
41
+ "blocks.0.attn_qkv.weight",
42
+ "blocks.0.attn_out.weight",
43
+ "blocks.0.mlp.0.weight",
44
+ "blocks.0.mlp.2.weight",
45
+ "blocks.0.adaLN_modulation.weight",
46
+ "blocks.1.attn_qkv.weight",
47
+ "blocks.1.attn_out.weight",
48
+ "blocks.1.mlp.0.weight",
49
+ "blocks.1.mlp.2.weight",
50
+ "blocks.1.adaLN_modulation.weight",
51
+ "blocks.2.attn_qkv.weight",
52
+ "blocks.2.attn_out.weight",
53
+ "blocks.2.mlp.0.weight",
54
+ "blocks.2.mlp.2.weight",
55
+ "blocks.2.adaLN_modulation.weight",
56
+ "output_layer.linear.weight",
57
+ "output_layer.adaLN_modulation.weight"
58
+ ],
59
+ "muon_adam_param_names": [
60
+ "sigma_map.net.0.bias",
61
+ "sigma_map.net.2.bias",
62
+ "blocks.0.norm1.weight",
63
+ "blocks.0.norm2.weight",
64
+ "blocks.0.mlp.0.bias",
65
+ "blocks.0.mlp.2.bias",
66
+ "blocks.0.adaLN_modulation.bias",
67
+ "blocks.1.norm1.weight",
68
+ "blocks.1.norm2.weight",
69
+ "blocks.1.mlp.0.bias",
70
+ "blocks.1.mlp.2.bias",
71
+ "blocks.1.adaLN_modulation.bias",
72
+ "blocks.2.norm1.weight",
73
+ "blocks.2.norm2.weight",
74
+ "blocks.2.mlp.0.bias",
75
+ "blocks.2.mlp.2.bias",
76
+ "blocks.2.adaLN_modulation.bias",
77
+ "output_layer.norm_final.weight",
78
+ "output_layer.adaLN_modulation.bias"
79
+ ],
80
+ "muon_effective_nesterov": false,
81
+ "muon_effective_width_scale": false,
82
+ "muon_effective_weight_decay": 0.1,
83
+ "muon_adam_fallback_nesterov": false,
84
+ "muon_adam_fallback_weight_decay": 0.1,
85
+ "ema_decay": 0.9999,
86
+ "ema_start_step": 0,
87
+ "model_type": "ddit",
88
+ "ddit_mlp_type": "gelu",
89
+ "elf_num_time_tokens": 4,
90
+ "elf_num_model_mode_tokens": 0,
91
+ "qk_norm": true,
92
+ "output_bias": false,
93
+ "output_init_std": -1.0,
94
+ "norm_type": "rmsnorm",
95
+ "target_loss": "hard_ce",
96
+ "linear_soft_target_power": 1.0,
97
+ "linear_soft_target_min_conf": 0.0,
98
+ "linear_soft_target_max_conf": 1.0,
99
+ "t_sampling_mode": "uniform",
100
+ "t_sampling_power": 1.0,
101
+ "t_sampling_eps": 0.0001,
102
+ "t_sampling_logit_mean": -1.5,
103
+ "t_sampling_logit_std": 0.8,
104
+ "dual_t": true,
105
+ "corrupt_t_mode": "same",
106
+ "corrupt_min_t": 0.0,
107
+ "corrupt_max_t": 1.0,
108
+ "prefix_block_prob": 0.0,
109
+ "prefix_block_len": 128,
110
+ "mask_ratio_floor_schedule": "none",
111
+ "dirichlet_endpoint_mode": "categorical_dual_t",
112
+ "dirichlet_semantic_t_mode": "same",
113
+ "dirichlet_semantic_t_value": 0.0,
114
+ "dirichlet_semantic_t_curve": "linear",
115
+ "dirichlet_semantic_t_power": 1.0,
116
+ "endpoint_sequence_random_prob_alpha": 0.0,
117
+ "categorical_wrong_from_full_vocab": true,
118
+ "categorical_wrong_from_batch_valid_tokens": false,
119
+ "categorical_wrong_basin_token_ids": "",
120
+ "categorical_wrong_basin_prob": 0.0,
121
+ "categorical_wrong_unigram_prob": 0.0,
122
+ "categorical_wrong_uniform_prob": 0.0,
123
+ "categorical_wrong_prob_floor": 0.0,
124
+ "categorical_wrong_corpus_unigram_path": "",
125
+ "categorical_wrong_corpus_unigram_alpha": 1.0,
126
+ "categorical_wrong_basin_shared_prob": 0.0,
127
+ "categorical_wrong_unigram_shared_prob": 0.0,
128
+ "mask_mixture_original_prob": 0.0,
129
+ "mask_mixture_lowk_prob": 0.0,
130
+ "mask_mixture_lowcorrupt_prob": 0.0,
131
+ "mask_mixture_block_prob": 0.0,
132
+ "mask_mixture_all_prob": 1.0,
133
+ "mask_mixture_lowk_clean_tokens": "0",
134
+ "mask_mixture_lowcorrupt_tokens": "1,2,4,8,16,32,64",
135
+ "mask_mixture_block_tokens": "64,128",
136
+ "simplex_bridge_sampler": "dirichlet",
137
+ "logistic_normal_sigma_min": 0.1,
138
+ "logistic_normal_sigma_max": 1.0,
139
+ "logistic_normal_tau_min": 1.0,
140
+ "logistic_normal_tau_max": 1.0,
141
+ "torch_compile": false,
142
+ "compile_mode": "max-autotune",
143
+ "state_format": "prob",
144
+ "meanflow_weight": 0.0,
145
+ "rollout_train_prob": 0.5,
146
+ "rollout_train_steps": 4,
147
+ "rollout_train_steps_min": 0,
148
+ "rollout_train_infer_steps": 1,
149
+ "rollout_train_time_mode": "sampled_path",
150
+ "rollout_train_s_dist": "uniform",
151
+ "rollout_train_s_min_frac": 0.0,
152
+ "rollout_train_s_max_frac": 0.25,
153
+ "rollout_train_s_beta_alpha": 2.0,
154
+ "rollout_train_s_beta_beta": 6.0,
155
+ "rollout_train_temp": 1.0,
156
+ "rollout_train_max_gamma": 1.0,
157
+ "rollout_train_corrupt_only": true,
158
+ "rollout_train_samplewise": true,
159
+ "rollout_train_compute_always": false,
160
+ "rollout_train_sync_t": true,
161
+ "bridge_noise_init": "logistic_normal",
162
+ "noise_sigma": -1.0,
163
+ "allow_tf32": true,
164
+ "activation_checkpointing": false,
165
+ "activation_checkpoint_interval": 1,
166
+ "activation_checkpoint_scope": "block",
167
+ "ddp_static_graph": false,
168
+ "ddp_gradient_as_bucket_view": true,
169
+ "blocking_data_transfer": false,
170
+ "dataloader_prefetch_factor": 4,
171
+ "full_train_stats": false,
172
+ "tokenized_hf": false,
173
+ "tokenized_pad_token": "pad",
174
+ "elf_conditional_hf": false,
175
+ "record_pad_truncate": false,
176
+ "record_add_eos": false,
177
+ "record_add_special_tokens": false,
178
+ "record_pad_token": "pad",
179
+ "record_shuffle_buffer": 10000,
180
+ "wrap": true,
181
+ "wrap_mode": "stream",
182
+ "wrap_record_buffer_size": 200,
183
+ "owt_cached_chunks": true,
184
+ "owt_chunk_cache_dir": "/e2e-data/evad-tech-vla/wanghan58/data/small_benchmarks/langflow_2604_11748/openwebtext_lta_cached_chunks/t5_len1024_train8_compact_overfit",
185
+ "owt_chunk_cache_rebuild": false,
186
+ "owt_chunk_cache_write_batch": 4096,
187
+ "owt_exact_repeat_per_chunk": 64,
188
+ "online_chunk_shuffle": false,
189
+ "online_chunk_shuffle_buffer": 10000,
190
+ "openwebtext_split": "train_minus_100k",
191
+ "detokenizer": "auto",
192
+ "resolved_detokenizer": null,
193
+ "num_workers": 0,
194
+ "latest_every": 1000,
195
+ "resume_path": ""
196
+ }
197
+ step=100 epoch=100/1000 epoch_step=1/1 micro_steps=100 elapsed=26.4s lr=2.000000e-03 loss=7.3399 loss_recon=7.3399 loss_meanflow=0.0000 mean_model_t=0.5013 mean_corrupt_t=0.5013 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.5002 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.3325 corrupt_frac=1.0000 acc_corrupt=0.3325 loss_corrupt=7.3399 wrong_frac=0.4986 init_acc_corrupt=0.4674 acc_corrupt_t_0p0_0p2=0.0457 corrupt_frac_t_0p0_0p2=0.1952 acc_corrupt_t_0p2_0p4=0.1645 corrupt_frac_t_0p2_0p4=0.2063 acc_corrupt_t_0p4_0p6=0.3267 corrupt_frac_t_0p4_0p6=0.1973 acc_corrupt_t_0p6_0p8=0.4815 corrupt_frac_t_0p6_0p8=0.1976 acc_corrupt_t_0p8_1p0=0.6387 corrupt_frac_t_0p8_1p0=0.2036 out_w_norm=1.0906 out_g_norm=1.0044 loss_all=6.6981 init_gold_top10=0.5044 init_gold_top100=0.6199 rollout_applied_pos_frac=0.4922 init_acc_rollout_applied=0.4979 init_acc_rollout_kept=0.4387 logit_acc_rollout_applied=0.3335 logit_acc_rollout_kept=0.3000
198
+ step=200 epoch=200/1000 epoch_step=1/1 micro_steps=200 elapsed=25.5s lr=2.000000e-03 loss=5.8172 loss_recon=5.8172 loss_meanflow=0.0000 mean_model_t=0.4985 mean_corrupt_t=0.4985 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.5030 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.3271 corrupt_frac=1.0000 acc_corrupt=0.3271 loss_corrupt=5.8172 wrong_frac=0.5014 init_acc_corrupt=0.4646 acc_corrupt_t_0p0_0p2=0.0526 corrupt_frac_t_0p0_0p2=0.2037 acc_corrupt_t_0p2_0p4=0.1624 corrupt_frac_t_0p2_0p4=0.1982 acc_corrupt_t_0p4_0p6=0.3284 corrupt_frac_t_0p4_0p6=0.1956 acc_corrupt_t_0p6_0p8=0.4722 corrupt_frac_t_0p6_0p8=0.2056 acc_corrupt_t_0p8_1p0=0.6242 corrupt_frac_t_0p8_1p0=0.1969 out_w_norm=3.4895 out_g_norm=1.3273 loss_all=5.0587 init_gold_top10=0.5049 init_gold_top100=0.6439 rollout_applied_pos_frac=0.5234 init_acc_rollout_applied=0.5128 init_acc_rollout_kept=0.4286 logit_acc_rollout_applied=0.3788 logit_acc_rollout_kept=0.3220
199
+ step=300 epoch=300/1000 epoch_step=1/1 micro_steps=300 elapsed=25.7s lr=2.000000e-03 loss=4.7562 loss_recon=4.7562 loss_meanflow=0.0000 mean_model_t=0.4953 mean_corrupt_t=0.4953 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.5102 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.3619 corrupt_frac=1.0000 acc_corrupt=0.3619 loss_corrupt=4.7562 wrong_frac=0.5048 init_acc_corrupt=0.4615 acc_corrupt_t_0p0_0p2=0.0552 corrupt_frac_t_0p0_0p2=0.2036 acc_corrupt_t_0p2_0p4=0.1876 corrupt_frac_t_0p2_0p4=0.2030 acc_corrupt_t_0p4_0p6=0.3634 corrupt_frac_t_0p4_0p6=0.2005 acc_corrupt_t_0p6_0p8=0.5245 corrupt_frac_t_0p6_0p8=0.1995 acc_corrupt_t_0p8_1p0=0.6985 corrupt_frac_t_0p8_1p0=0.1934 out_w_norm=5.5710 out_g_norm=0.5522 loss_all=4.3223 init_gold_top10=0.5282 init_gold_top100=0.6622 rollout_applied_pos_frac=0.4922 init_acc_rollout_applied=0.5230 init_acc_rollout_kept=0.4573 logit_acc_rollout_applied=0.4249 logit_acc_rollout_kept=0.3736
200
+ step=400 epoch=400/1000 epoch_step=1/1 micro_steps=400 elapsed=25.6s lr=2.000000e-03 loss=4.1328 loss_recon=4.1328 loss_meanflow=0.0000 mean_model_t=0.4980 mean_corrupt_t=0.4980 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.5031 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.4209 corrupt_frac=1.0000 acc_corrupt=0.4209 loss_corrupt=4.1328 wrong_frac=0.5019 init_acc_corrupt=0.4652 acc_corrupt_t_0p0_0p2=0.0579 corrupt_frac_t_0p0_0p2=0.2061 acc_corrupt_t_0p2_0p4=0.2100 corrupt_frac_t_0p2_0p4=0.1960 acc_corrupt_t_0p4_0p6=0.4187 corrupt_frac_t_0p4_0p6=0.1973 acc_corrupt_t_0p6_0p8=0.6135 corrupt_frac_t_0p6_0p8=0.2020 acc_corrupt_t_0p8_1p0=0.8120 corrupt_frac_t_0p8_1p0=0.1985 out_w_norm=7.1017 out_g_norm=0.2796 loss_all=3.8815 init_gold_top10=0.5108 init_gold_top100=0.6647 rollout_applied_pos_frac=0.5391 init_acc_rollout_applied=0.4336 init_acc_rollout_kept=0.4870 logit_acc_rollout_applied=0.4331 logit_acc_rollout_kept=0.4897
201
+ step=500 epoch=500/1000 epoch_step=1/1 micro_steps=500 elapsed=54.9s lr=2.000000e-03 loss=3.5369 loss_recon=3.5369 loss_meanflow=0.0000 mean_model_t=0.4998 mean_corrupt_t=0.4998 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.5031 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.4845 corrupt_frac=1.0000 acc_corrupt=0.4845 loss_corrupt=3.5369 wrong_frac=0.5002 init_acc_corrupt=0.4679 acc_corrupt_t_0p0_0p2=0.0593 corrupt_frac_t_0p0_0p2=0.1998 acc_corrupt_t_0p2_0p4=0.2383 corrupt_frac_t_0p2_0p4=0.1971 acc_corrupt_t_0p4_0p6=0.5039 corrupt_frac_t_0p4_0p6=0.2008 acc_corrupt_t_0p6_0p8=0.7114 corrupt_frac_t_0p6_0p8=0.2002 acc_corrupt_t_0p8_1p0=0.9013 corrupt_frac_t_0p8_1p0=0.2020 out_w_norm=8.4318 out_g_norm=0.2337 loss_all=3.1695 init_gold_top10=0.5257 init_gold_top100=0.6722 rollout_applied_pos_frac=0.4531 init_acc_rollout_applied=0.4713 init_acc_rollout_kept=0.4937 logit_acc_rollout_applied=0.4919 logit_acc_rollout_kept=0.5170
LTA_openwebtext_dualt/logs/softendpoint_mn_pilot_4gpu/train8_n1024_linear_soft_kl_onehot_20260517_train8_overfit.log ADDED
@@ -0,0 +1,316 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ NCCL version 2.25.1+cuda12.8
2
+ {
3
+ "device": "cuda:0",
4
+ "rank": 0,
5
+ "world_size": 4,
6
+ "samples": "owt_cached_chunks:8",
7
+ "vocab_size": 50257,
8
+ "tokenizer_vocab_size": 50257,
9
+ "save_dir": "runs/train8_n1024_linear_soft_kl_onehot_20260517_train8_overfit",
10
+ "batch_size": 1,
11
+ "grad_accum": 1,
12
+ "effective_batch_size": 4,
13
+ "global_batch_size": 4,
14
+ "lr_schedule": "constant_warmup",
15
+ "optimizer": "muon",
16
+ "epochs": 0.0,
17
+ "steps_per_epoch": 2,
18
+ "total_steps": 1000,
19
+ "warmup_steps": 20,
20
+ "warmup_epochs": -1.0,
21
+ "min_lr": 0.0,
22
+ "weight_decay": 0.1,
23
+ "output_weight_decay": -1.0,
24
+ "adamw_param_groups": "nanogpt",
25
+ "adam_beta1": 0.9,
26
+ "adam_beta2": 0.95,
27
+ "adam_eps": 1e-08,
28
+ "muon_impl": "legacy",
29
+ "muon_momentum": 0.95,
30
+ "muon_ns_steps": 5,
31
+ "muon_update_scale": 1.0,
32
+ "muon_nesterov": false,
33
+ "muon_width_scale": false,
34
+ "muon_grouping": "legacy_dim_ge_2",
35
+ "muon_param_count": 169453056,
36
+ "muon_adam_param_count": 122368,
37
+ "muon_param_names": [
38
+ "vocab_embed.embedding",
39
+ "sigma_map.net.0.weight",
40
+ "sigma_map.net.2.weight",
41
+ "blocks.0.attn_qkv.weight",
42
+ "blocks.0.attn_out.weight",
43
+ "blocks.0.mlp.0.weight",
44
+ "blocks.0.mlp.2.weight",
45
+ "blocks.0.adaLN_modulation.weight",
46
+ "blocks.1.attn_qkv.weight",
47
+ "blocks.1.attn_out.weight",
48
+ "blocks.1.mlp.0.weight",
49
+ "blocks.1.mlp.2.weight",
50
+ "blocks.1.adaLN_modulation.weight",
51
+ "blocks.2.attn_qkv.weight",
52
+ "blocks.2.attn_out.weight",
53
+ "blocks.2.mlp.0.weight",
54
+ "blocks.2.mlp.2.weight",
55
+ "blocks.2.adaLN_modulation.weight",
56
+ "blocks.3.attn_qkv.weight",
57
+ "blocks.3.attn_out.weight",
58
+ "blocks.3.mlp.0.weight",
59
+ "blocks.3.mlp.2.weight",
60
+ "blocks.3.adaLN_modulation.weight",
61
+ "blocks.4.attn_qkv.weight",
62
+ "blocks.4.attn_out.weight",
63
+ "blocks.4.mlp.0.weight",
64
+ "blocks.4.mlp.2.weight",
65
+ "blocks.4.adaLN_modulation.weight",
66
+ "blocks.5.attn_qkv.weight",
67
+ "blocks.5.attn_out.weight",
68
+ "blocks.5.mlp.0.weight",
69
+ "blocks.5.mlp.2.weight",
70
+ "blocks.5.adaLN_modulation.weight",
71
+ "blocks.6.attn_qkv.weight",
72
+ "blocks.6.attn_out.weight",
73
+ "blocks.6.mlp.0.weight",
74
+ "blocks.6.mlp.2.weight",
75
+ "blocks.6.adaLN_modulation.weight",
76
+ "blocks.7.attn_qkv.weight",
77
+ "blocks.7.attn_out.weight",
78
+ "blocks.7.mlp.0.weight",
79
+ "blocks.7.mlp.2.weight",
80
+ "blocks.7.adaLN_modulation.weight",
81
+ "blocks.8.attn_qkv.weight",
82
+ "blocks.8.attn_out.weight",
83
+ "blocks.8.mlp.0.weight",
84
+ "blocks.8.mlp.2.weight",
85
+ "blocks.8.adaLN_modulation.weight",
86
+ "blocks.9.attn_qkv.weight",
87
+ "blocks.9.attn_out.weight",
88
+ "blocks.9.mlp.0.weight",
89
+ "blocks.9.mlp.2.weight",
90
+ "blocks.9.adaLN_modulation.weight",
91
+ "blocks.10.attn_qkv.weight",
92
+ "blocks.10.attn_out.weight",
93
+ "blocks.10.mlp.0.weight",
94
+ "blocks.10.mlp.2.weight",
95
+ "blocks.10.adaLN_modulation.weight",
96
+ "blocks.11.attn_qkv.weight",
97
+ "blocks.11.attn_out.weight",
98
+ "blocks.11.mlp.0.weight",
99
+ "blocks.11.mlp.2.weight",
100
+ "blocks.11.adaLN_modulation.weight",
101
+ "output_layer.linear.weight",
102
+ "output_layer.adaLN_modulation.weight"
103
+ ],
104
+ "muon_adam_param_names": [
105
+ "sigma_map.net.0.bias",
106
+ "sigma_map.net.2.bias",
107
+ "blocks.0.norm1.weight",
108
+ "blocks.0.norm2.weight",
109
+ "blocks.0.mlp.0.bias",
110
+ "blocks.0.mlp.2.bias",
111
+ "blocks.0.adaLN_modulation.bias",
112
+ "blocks.1.norm1.weight",
113
+ "blocks.1.norm2.weight",
114
+ "blocks.1.mlp.0.bias",
115
+ "blocks.1.mlp.2.bias",
116
+ "blocks.1.adaLN_modulation.bias",
117
+ "blocks.2.norm1.weight",
118
+ "blocks.2.norm2.weight",
119
+ "blocks.2.mlp.0.bias",
120
+ "blocks.2.mlp.2.bias",
121
+ "blocks.2.adaLN_modulation.bias",
122
+ "blocks.3.norm1.weight",
123
+ "blocks.3.norm2.weight",
124
+ "blocks.3.mlp.0.bias",
125
+ "blocks.3.mlp.2.bias",
126
+ "blocks.3.adaLN_modulation.bias",
127
+ "blocks.4.norm1.weight",
128
+ "blocks.4.norm2.weight",
129
+ "blocks.4.mlp.0.bias",
130
+ "blocks.4.mlp.2.bias",
131
+ "blocks.4.adaLN_modulation.bias",
132
+ "blocks.5.norm1.weight",
133
+ "blocks.5.norm2.weight",
134
+ "blocks.5.mlp.0.bias",
135
+ "blocks.5.mlp.2.bias",
136
+ "blocks.5.adaLN_modulation.bias",
137
+ "blocks.6.norm1.weight",
138
+ "blocks.6.norm2.weight",
139
+ "blocks.6.mlp.0.bias",
140
+ "blocks.6.mlp.2.bias",
141
+ "blocks.6.adaLN_modulation.bias",
142
+ "blocks.7.norm1.weight",
143
+ "blocks.7.norm2.weight",
144
+ "blocks.7.mlp.0.bias",
145
+ "blocks.7.mlp.2.bias",
146
+ "blocks.7.adaLN_modulation.bias",
147
+ "blocks.8.norm1.weight",
148
+ "blocks.8.norm2.weight",
149
+ "blocks.8.mlp.0.bias",
150
+ "blocks.8.mlp.2.bias",
151
+ "blocks.8.adaLN_modulation.bias",
152
+ "blocks.9.norm1.weight",
153
+ "blocks.9.norm2.weight",
154
+ "blocks.9.mlp.0.bias",
155
+ "blocks.9.mlp.2.bias",
156
+ "blocks.9.adaLN_modulation.bias",
157
+ "blocks.10.norm1.weight",
158
+ "blocks.10.norm2.weight",
159
+ "blocks.10.mlp.0.bias",
160
+ "blocks.10.mlp.2.bias",
161
+ "blocks.10.adaLN_modulation.bias",
162
+ "blocks.11.norm1.weight",
163
+ "blocks.11.norm2.weight",
164
+ "blocks.11.mlp.0.bias",
165
+ "blocks.11.mlp.2.bias",
166
+ "blocks.11.adaLN_modulation.bias",
167
+ "output_layer.norm_final.weight",
168
+ "output_layer.adaLN_modulation.bias"
169
+ ],
170
+ "muon_effective_nesterov": false,
171
+ "muon_effective_width_scale": false,
172
+ "muon_effective_weight_decay": 0.1,
173
+ "muon_adam_fallback_nesterov": false,
174
+ "muon_adam_fallback_weight_decay": 0.1,
175
+ "ema_decay": 0.9999,
176
+ "ema_start_step": 0,
177
+ "model_type": "ddit",
178
+ "elf_num_time_tokens": 4,
179
+ "elf_num_model_mode_tokens": 0,
180
+ "qk_norm": true,
181
+ "output_bias": false,
182
+ "output_init_std": -1.0,
183
+ "norm_type": "rmsnorm",
184
+ "target_loss": "linear_soft_kl",
185
+ "linear_soft_target_power": 1.0,
186
+ "linear_soft_target_min_conf": 0.0,
187
+ "linear_soft_target_max_conf": 1.0,
188
+ "t_sampling_mode": "logit_normal",
189
+ "t_sampling_power": 1.0,
190
+ "t_sampling_eps": 0.0001,
191
+ "t_sampling_logit_mean": -1.5,
192
+ "t_sampling_logit_std": 0.8,
193
+ "dual_t": true,
194
+ "corrupt_t_mode": "same",
195
+ "corrupt_min_t": 0.0,
196
+ "corrupt_max_t": 1.0,
197
+ "prefix_block_prob": 0.0,
198
+ "prefix_block_len": 128,
199
+ "mask_ratio_floor_schedule": "none",
200
+ "dirichlet_endpoint_mode": "categorical_dual_t",
201
+ "dirichlet_semantic_t_mode": "same",
202
+ "dirichlet_semantic_t_value": 0.0,
203
+ "dirichlet_semantic_t_curve": "linear",
204
+ "dirichlet_semantic_t_power": 1.0,
205
+ "endpoint_sequence_random_prob_alpha": 0.0,
206
+ "categorical_wrong_from_full_vocab": true,
207
+ "categorical_wrong_from_batch_valid_tokens": false,
208
+ "categorical_wrong_basin_token_ids": "",
209
+ "categorical_wrong_basin_prob": 0.0,
210
+ "categorical_wrong_unigram_prob": 0.0,
211
+ "categorical_wrong_uniform_prob": 0.0,
212
+ "categorical_wrong_corpus_unigram_path": "",
213
+ "categorical_wrong_corpus_unigram_alpha": 1.0,
214
+ "categorical_wrong_basin_shared_prob": 0.0,
215
+ "categorical_wrong_unigram_shared_prob": 0.0,
216
+ "mask_mixture_original_prob": 0.0,
217
+ "mask_mixture_lowk_prob": 1.0,
218
+ "mask_mixture_lowcorrupt_prob": 0.0,
219
+ "mask_mixture_block_prob": 0.0,
220
+ "mask_mixture_all_prob": 0.0,
221
+ "mask_mixture_lowk_clean_tokens": "64,128,256",
222
+ "mask_mixture_lowcorrupt_tokens": "1,2,4,8,16,32,64",
223
+ "mask_mixture_block_tokens": "64,128",
224
+ "simplex_bridge_sampler": "dirichlet",
225
+ "logistic_normal_sigma_min": 0.18,
226
+ "logistic_normal_sigma_max": 2.2,
227
+ "logistic_normal_tau_min": 0.65,
228
+ "logistic_normal_tau_max": 1.15,
229
+ "torch_compile": false,
230
+ "compile_mode": "max-autotune",
231
+ "state_format": "prob",
232
+ "meanflow_weight": 0.0,
233
+ "rollout_train_prob": 0.0,
234
+ "rollout_train_steps": 1,
235
+ "rollout_train_infer_steps": 64,
236
+ "rollout_train_temp": 1.45,
237
+ "rollout_train_max_gamma": 1.0,
238
+ "rollout_train_corrupt_only": true,
239
+ "rollout_train_samplewise": false,
240
+ "rollout_train_compute_always": false,
241
+ "bridge_noise_init": "logistic_normal",
242
+ "noise_sigma": -1.0,
243
+ "allow_tf32": true,
244
+ "activation_checkpointing": false,
245
+ "activation_checkpoint_interval": 1,
246
+ "activation_checkpoint_scope": "block",
247
+ "ddp_static_graph": false,
248
+ "ddp_gradient_as_bucket_view": true,
249
+ "blocking_data_transfer": false,
250
+ "dataloader_prefetch_factor": 4,
251
+ "full_train_stats": false,
252
+ "tokenized_hf": false,
253
+ "tokenized_pad_token": "pad",
254
+ "elf_conditional_hf": false,
255
+ "record_pad_truncate": false,
256
+ "record_add_eos": false,
257
+ "record_add_special_tokens": false,
258
+ "record_pad_token": "pad",
259
+ "record_shuffle_buffer": 10000,
260
+ "wrap": true,
261
+ "wrap_mode": "stream",
262
+ "wrap_record_buffer_size": 200,
263
+ "owt_cached_chunks": true,
264
+ "owt_chunk_cache_dir": "/e2e-data/evad-tech-vla/wanghan58/data/small_benchmarks/langflow_2604_11748/openwebtext_lta_cached_chunks/gpt2_len1024_train8_overfit",
265
+ "owt_chunk_cache_rebuild": false,
266
+ "owt_chunk_cache_write_batch": 4096,
267
+ "owt_exact_repeat_per_chunk": 0,
268
+ "online_chunk_shuffle": false,
269
+ "online_chunk_shuffle_buffer": 10000,
270
+ "openwebtext_split": "train_minus_100k",
271
+ "detokenizer": "auto",
272
+ "resolved_detokenizer": null,
273
+ "num_workers": 0,
274
+ "latest_every": 50,
275
+ "resume_path": ""
276
+ }
277
+ step=25 epoch=13/500 epoch_step=1/2 micro_steps=25 elapsed=3.8s lr=2.000000e-03 loss=1.7993 loss_recon=1.7993 loss_meanflow=0.0000 mean_model_t=0.2121 mean_corrupt_t=0.2121 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.2121 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.2175 corrupt_frac=0.8375 acc_corrupt=0.1083 loss_corrupt=2.1199 wrong_frac=0.7882 init_acc_corrupt=0.1188 acc_corrupt_t_0p0_0p2=0.0577 corrupt_frac_t_0p0_0p2=1.0000 out_w_norm=0.2466 out_g_norm=0.2995 acc_corrupt_t_0p2_0p4=0.1084 corrupt_frac_t_0p2_0p4=1.0000 acc_corrupt_t_0p4_0p6=0.3238 corrupt_frac_t_0p4_0p6=1.0000 loss_all=10.6312 init_gold_top10=0.2214 init_gold_top100=0.2969
278
+ step=50 epoch=25/500 epoch_step=2/2 micro_steps=50 elapsed=3.0s lr=2.000000e-03 loss=1.7308 loss_recon=1.7308 loss_meanflow=0.0000 mean_model_t=0.2092 mean_corrupt_t=0.2092 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.2092 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.2213 corrupt_frac=0.8350 acc_corrupt=0.1138 loss_corrupt=1.9693 wrong_frac=0.7934 init_acc_corrupt=0.1070 acc_corrupt_t_0p0_0p2=0.0648 corrupt_frac_t_0p0_0p2=1.0000 out_w_norm=1.1391 out_g_norm=0.3935 acc_corrupt_t_0p2_0p4=0.1387 corrupt_frac_t_0p2_0p4=1.0000 acc_corrupt_t_0p4_0p6=0.3155 corrupt_frac_t_0p4_0p6=1.0000 loss_all=10.0953 init_gold_top10=0.4206 init_gold_top100=0.4206
279
+ step=75 epoch=38/500 epoch_step=1/2 micro_steps=75 elapsed=7.0s lr=2.000000e-03 loss=1.6763 loss_recon=1.6763 loss_meanflow=0.0000 mean_model_t=0.2101 mean_corrupt_t=0.2101 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.2101 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.2155 corrupt_frac=0.8500 acc_corrupt=0.1221 loss_corrupt=2.0509 wrong_frac=0.7888 init_acc_corrupt=0.1127 acc_corrupt_t_0p0_0p2=0.0664 corrupt_frac_t_0p0_0p2=1.0000 out_w_norm=1.8803 out_g_norm=0.4689 acc_corrupt_t_0p2_0p4=0.1689 corrupt_frac_t_0p2_0p4=1.0000 acc_corrupt_t_0p4_0p6=0.2627 corrupt_frac_t_0p4_0p6=1.0000 loss_all=10.1304 init_gold_top10=0.2302 init_gold_top100=0.3198
280
+ step=100 epoch=50/500 epoch_step=2/2 micro_steps=100 elapsed=3.1s lr=2.000000e-03 loss=1.5429 loss_recon=1.5429 loss_meanflow=0.0000 mean_model_t=0.1943 mean_corrupt_t=0.1943 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.1943 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.1895 corrupt_frac=0.8500 acc_corrupt=0.1051 loss_corrupt=1.8908 wrong_frac=0.8033 init_acc_corrupt=0.1015 acc_corrupt_t_0p2_0p4=0.1633 corrupt_frac_t_0p2_0p4=1.0000 out_w_norm=2.5289 out_g_norm=0.5494 acc_corrupt_t_0p0_0p2=0.0633 corrupt_frac_t_0p0_0p2=1.0000 loss_all=9.8296 init_gold_top10=0.0612 init_gold_top100=0.1654
281
+ step=125 epoch=63/500 epoch_step=1/2 micro_steps=125 elapsed=7.3s lr=2.000000e-03 loss=1.5721 loss_recon=1.5721 loss_meanflow=0.0000 mean_model_t=0.2141 mean_corrupt_t=0.2141 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.2141 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.1815 corrupt_frac=0.8700 acc_corrupt=0.1145 loss_corrupt=1.8442 wrong_frac=0.7877 init_acc_corrupt=0.1171 acc_corrupt_t_0p0_0p2=0.0596 corrupt_frac_t_0p0_0p2=1.0000 out_w_norm=3.1369 out_g_norm=0.6446 acc_corrupt_t_0p4_0p6=0.2577 corrupt_frac_t_0p4_0p6=1.0000 acc_corrupt_t_0p2_0p4=0.1357 corrupt_frac_t_0p2_0p4=1.0000 loss_all=8.5182 init_gold_top10=0.2305 init_gold_top100=0.3047
282
+ step=150 epoch=75/500 epoch_step=2/2 micro_steps=150 elapsed=3.2s lr=2.000000e-03 loss=1.3855 loss_recon=1.3855 loss_meanflow=0.0000 mean_model_t=0.2050 mean_corrupt_t=0.2050 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.2050 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.2116 corrupt_frac=0.8400 acc_corrupt=0.1265 loss_corrupt=1.6168 wrong_frac=0.7929 init_acc_corrupt=0.1216 acc_corrupt_t_0p0_0p2=0.0653 corrupt_frac_t_0p0_0p2=1.0000 out_w_norm=3.7735 out_g_norm=0.6714 acc_corrupt_t_0p4_0p6=0.3057 corrupt_frac_t_0p4_0p6=1.0000 acc_corrupt_t_0p2_0p4=0.1802 corrupt_frac_t_0p2_0p4=1.0000 loss_all=7.4310 init_gold_top10=0.3073 init_gold_top100=0.3164
283
+ step=175 epoch=88/500 epoch_step=1/2 micro_steps=175 elapsed=7.1s lr=2.000000e-03 loss=1.4102 loss_recon=1.4102 loss_meanflow=0.0000 mean_model_t=0.2183 mean_corrupt_t=0.2183 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.2183 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.2136 corrupt_frac=0.8625 acc_corrupt=0.1376 loss_corrupt=1.7792 wrong_frac=0.7800 init_acc_corrupt=0.1370 acc_corrupt_t_0p2_0p4=0.1907 corrupt_frac_t_0p2_0p4=1.0000 out_w_norm=4.3787 out_g_norm=0.5865 acc_corrupt_t_0p0_0p2=0.0610 corrupt_frac_t_0p0_0p2=1.0000 acc_corrupt_t_0p4_0p6=0.3194 corrupt_frac_t_0p4_0p6=1.0000 loss_all=8.7739 init_gold_top10=0.0424 init_gold_top100=0.1719
284
+ step=200 epoch=100/500 epoch_step=2/2 micro_steps=200 elapsed=3.2s lr=2.000000e-03 loss=1.1934 loss_recon=1.1934 loss_meanflow=0.0000 mean_model_t=0.1829 mean_corrupt_t=0.1829 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.1829 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.2005 corrupt_frac=0.8450 acc_corrupt=0.1132 loss_corrupt=1.5587 wrong_frac=0.8122 init_acc_corrupt=0.0871 acc_corrupt_t_0p2_0p4=0.1561 corrupt_frac_t_0p2_0p4=1.0000 out_w_norm=4.7917 out_g_norm=0.5517 acc_corrupt_t_0p0_0p2=0.0642 corrupt_frac_t_0p0_0p2=1.0000 acc_corrupt_t_0p4_0p6=0.2998 corrupt_frac_t_0p4_0p6=1.0000 loss_all=8.7217 init_gold_top10=0.1198 init_gold_top100=0.2510
285
+ step=225 epoch=113/500 epoch_step=1/2 micro_steps=225 elapsed=6.4s lr=2.000000e-03 loss=1.3249 loss_recon=1.3249 loss_meanflow=0.0000 mean_model_t=0.2100 mean_corrupt_t=0.2100 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.2100 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.2210 corrupt_frac=0.8525 acc_corrupt=0.1355 loss_corrupt=1.6210 wrong_frac=0.7966 init_acc_corrupt=0.1112 acc_corrupt_t_0p2_0p4=0.1706 corrupt_frac_t_0p2_0p4=1.0000 out_w_norm=5.1207 out_g_norm=0.5293 acc_corrupt_t_0p0_0p2=0.0646 corrupt_frac_t_0p0_0p2=1.0000 acc_corrupt_t_0p4_0p6=0.3460 corrupt_frac_t_0p4_0p6=1.0000 loss_all=7.9293 init_gold_top10=0.2292 init_gold_top100=0.3083
286
+ step=250 epoch=125/500 epoch_step=2/2 micro_steps=250 elapsed=3.2s lr=2.000000e-03 loss=0.9987 loss_recon=0.9987 loss_meanflow=0.0000 mean_model_t=0.1634 mean_corrupt_t=0.1634 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.1634 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.1912 corrupt_frac=0.8650 acc_corrupt=0.1063 loss_corrupt=1.3599 wrong_frac=0.8360 init_acc_corrupt=0.0634 acc_corrupt_t_0p2_0p4=0.1724 corrupt_frac_t_0p2_0p4=1.0000 out_w_norm=5.4342 out_g_norm=0.4328 acc_corrupt_t_0p0_0p2=0.0714 corrupt_frac_t_0p0_0p2=1.0000 acc_corrupt_t_0p4_0p6=0.3451 corrupt_frac_t_0p4_0p6=1.0000 loss_all=8.4824 init_gold_top10=0.0542 init_gold_top100=0.1740
287
+ step=275 epoch=138/500 epoch_step=1/2 micro_steps=275 elapsed=7.4s lr=2.000000e-03 loss=1.2546 loss_recon=1.2546 loss_meanflow=0.0000 mean_model_t=0.2121 mean_corrupt_t=0.2121 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.2121 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.2141 corrupt_frac=0.8775 acc_corrupt=0.1391 loss_corrupt=1.5386 wrong_frac=0.7930 init_acc_corrupt=0.1115 acc_corrupt_t_0p4_0p6=0.3962 corrupt_frac_t_0p4_0p6=1.0000 out_w_norm=5.7373 out_g_norm=0.4318 acc_corrupt_t_0p0_0p2=0.0664 corrupt_frac_t_0p0_0p2=1.0000 acc_corrupt_t_0p2_0p4=0.1931 corrupt_frac_t_0p2_0p4=1.0000 loss_all=7.5609 init_gold_top10=0.1663 init_gold_top100=0.2779
288
+ step=300 epoch=150/500 epoch_step=2/2 micro_steps=300 elapsed=3.2s lr=2.000000e-03 loss=1.1000 loss_recon=1.1000 loss_meanflow=0.0000 mean_model_t=0.1955 mean_corrupt_t=0.1955 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.1955 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.2408 corrupt_frac=0.8325 acc_corrupt=0.1357 loss_corrupt=1.3590 wrong_frac=0.8023 init_acc_corrupt=0.1065 acc_corrupt_t_0p2_0p4=0.2241 corrupt_frac_t_0p2_0p4=1.0000 out_w_norm=6.0310 out_g_norm=0.4127 acc_corrupt_t_0p0_0p2=0.0673 corrupt_frac_t_0p0_0p2=1.0000 acc_corrupt_t_0p4_0p6=0.5078 corrupt_frac_t_0p4_0p6=1.0000 loss_all=6.7362 init_gold_top10=0.1211 init_gold_top100=0.2526
289
+ step=325 epoch=163/500 epoch_step=1/2 micro_steps=325 elapsed=7.2s lr=2.000000e-03 loss=0.9793 loss_recon=0.9793 loss_meanflow=0.0000 mean_model_t=0.1678 mean_corrupt_t=0.1678 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.1678 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.2080 corrupt_frac=0.8550 acc_corrupt=0.1132 loss_corrupt=1.2116 wrong_frac=0.8324 init_acc_corrupt=0.0730 acc_corrupt_t_0p0_0p2=0.0721 corrupt_frac_t_0p0_0p2=1.0000 out_w_norm=6.3253 out_g_norm=0.4276 acc_corrupt_t_0p2_0p4=0.1988 corrupt_frac_t_0p2_0p4=1.0000 loss_all=6.4874 init_gold_top10=0.1589 init_gold_top100=0.2708
290
+ step=350 epoch=175/500 epoch_step=2/2 micro_steps=350 elapsed=3.2s lr=2.000000e-03 loss=1.0407 loss_recon=1.0407 loss_meanflow=0.0000 mean_model_t=0.1906 mean_corrupt_t=0.1906 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.1906 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.2343 corrupt_frac=0.8425 acc_corrupt=0.1336 loss_corrupt=1.2750 wrong_frac=0.8084 init_acc_corrupt=0.1011 acc_corrupt_t_0p0_0p2=0.0646 corrupt_frac_t_0p0_0p2=1.0000 out_w_norm=6.6144 out_g_norm=0.3903 acc_corrupt_t_0p2_0p4=0.1983 corrupt_frac_t_0p2_0p4=1.0000 acc_corrupt_t_0p4_0p6=0.3315 corrupt_frac_t_0p4_0p6=1.0000 loss_all=6.5952 init_gold_top10=0.2396 init_gold_top100=0.3047
291
+ step=375 epoch=188/500 epoch_step=1/2 micro_steps=375 elapsed=6.0s lr=2.000000e-03 loss=1.1322 loss_recon=1.1322 loss_meanflow=0.0000 mean_model_t=0.2456 mean_corrupt_t=0.2456 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.2456 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.2746 corrupt_frac=0.8675 acc_corrupt=0.1932 loss_corrupt=1.1824 wrong_frac=0.7527 init_acc_corrupt=0.1760 acc_corrupt_t_0p2_0p4=0.2197 corrupt_frac_t_0p2_0p4=1.0000 out_w_norm=6.8983 out_g_norm=0.3729 acc_corrupt_t_0p0_0p2=0.0641 corrupt_frac_t_0p0_0p2=1.0000 acc_corrupt_t_0p6_0p8=0.5789 corrupt_frac_t_0p6_0p8=1.0000 acc_corrupt_t_0p4_0p6=0.4135 corrupt_frac_t_0p4_0p6=1.0000 loss_all=3.3921 init_gold_top10=0.7824 init_gold_top100=0.7846
292
+ step=400 epoch=200/500 epoch_step=2/2 micro_steps=400 elapsed=3.2s lr=2.000000e-03 loss=0.9991 loss_recon=0.9991 loss_meanflow=0.0000 mean_model_t=0.1998 mean_corrupt_t=0.1998 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.1998 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.2586 corrupt_frac=0.8425 acc_corrupt=0.1536 loss_corrupt=1.2177 wrong_frac=0.8029 init_acc_corrupt=0.1143 acc_corrupt_t_0p2_0p4=0.2608 corrupt_frac_t_0p2_0p4=1.0000 out_w_norm=7.1964 out_g_norm=0.3839 acc_corrupt_t_0p0_0p2=0.0717 corrupt_frac_t_0p0_0p2=1.0000 loss_all=6.4061 init_gold_top10=0.3240 init_gold_top100=0.3250
293
+ step=425 epoch=213/500 epoch_step=1/2 micro_steps=425 elapsed=6.6s lr=2.000000e-03 loss=1.0999 loss_recon=1.0999 loss_meanflow=0.0000 mean_model_t=0.2141 mean_corrupt_t=0.2141 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.2141 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.2632 corrupt_frac=0.8525 acc_corrupt=0.1637 loss_corrupt=1.4368 wrong_frac=0.7856 init_acc_corrupt=0.1222 acc_corrupt_t_0p0_0p2=0.0835 corrupt_frac_t_0p0_0p2=1.0000 out_w_norm=7.5028 out_g_norm=0.3774 acc_corrupt_t_0p2_0p4=0.2364 corrupt_frac_t_0p2_0p4=1.0000 acc_corrupt_t_0p4_0p6=0.3469 corrupt_frac_t_0p4_0p6=1.0000 loss_all=7.9881 init_gold_top10=0.1000 init_gold_top100=0.2000
294
+ step=450 epoch=225/500 epoch_step=2/2 micro_steps=450 elapsed=3.2s lr=2.000000e-03 loss=1.0844 loss_recon=1.0844 loss_meanflow=0.0000 mean_model_t=0.2233 mean_corrupt_t=0.2233 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.2233 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.2737 corrupt_frac=0.8650 acc_corrupt=0.1816 loss_corrupt=1.3110 wrong_frac=0.7728 init_acc_corrupt=0.1468 acc_corrupt_t_0p2_0p4=0.2386 corrupt_frac_t_0p2_0p4=1.0000 out_w_norm=7.7913 out_g_norm=0.3997 acc_corrupt_t_0p0_0p2=0.0642 corrupt_frac_t_0p0_0p2=1.0000 acc_corrupt_t_0p4_0p6=0.4325 corrupt_frac_t_0p4_0p6=1.0000 loss_all=6.3435 init_gold_top10=0.2645 init_gold_top100=0.3292
295
+ step=475 epoch=238/500 epoch_step=1/2 micro_steps=475 elapsed=7.1s lr=2.000000e-03 loss=0.9691 loss_recon=0.9691 loss_meanflow=0.0000 mean_model_t=0.1984 mean_corrupt_t=0.1984 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.1984 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.2405 corrupt_frac=0.8700 acc_corrupt=0.1471 loss_corrupt=1.2060 wrong_frac=0.8033 init_acc_corrupt=0.0984 acc_corrupt_t_0p2_0p4=0.2344 corrupt_frac_t_0p2_0p4=1.0000 out_w_norm=8.0605 out_g_norm=0.3808 acc_corrupt_t_0p0_0p2=0.0813 corrupt_frac_t_0p0_0p2=1.0000 acc_corrupt_t_0p4_0p6=0.4854 corrupt_frac_t_0p4_0p6=1.0000 loss_all=6.1833 init_gold_top10=0.1551 init_gold_top100=0.2533
296
+ step=500 epoch=250/500 epoch_step=2/2 micro_steps=500 elapsed=3.2s lr=2.000000e-03 loss=0.9673 loss_recon=0.9673 loss_meanflow=0.0000 mean_model_t=0.2001 mean_corrupt_t=0.2001 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.2001 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.2664 corrupt_frac=0.8500 acc_corrupt=0.1540 loss_corrupt=1.2035 wrong_frac=0.8004 init_acc_corrupt=0.1043 acc_corrupt_t_0p0_0p2=0.0815 corrupt_frac_t_0p0_0p2=1.0000 out_w_norm=8.3192 out_g_norm=0.3685 acc_corrupt_t_0p2_0p4=0.2433 corrupt_frac_t_0p2_0p4=1.0000 acc_corrupt_t_0p4_0p6=0.4016 corrupt_frac_t_0p4_0p6=1.0000 loss_all=6.1066 init_gold_top10=0.0885 init_gold_top100=0.2018
297
+ step=525 epoch=263/500 epoch_step=1/2 micro_steps=525 elapsed=6.0s lr=2.000000e-03 loss=0.9982 loss_recon=0.9982 loss_meanflow=0.0000 mean_model_t=0.2159 mean_corrupt_t=0.2159 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.2159 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.2740 corrupt_frac=0.8600 acc_corrupt=0.1689 loss_corrupt=1.2558 wrong_frac=0.7809 init_acc_corrupt=0.1125 acc_corrupt_t_0p2_0p4=0.1805 corrupt_frac_t_0p2_0p4=1.0000 out_w_norm=8.5659 out_g_norm=0.3468 acc_corrupt_t_0p0_0p2=0.0852 corrupt_frac_t_0p0_0p2=1.0000 acc_corrupt_t_0p4_0p6=0.4840 corrupt_frac_t_0p4_0p6=1.0000 loss_all=6.4480 init_gold_top10=0.1574 init_gold_top100=0.2444
298
+ step=550 epoch=275/500 epoch_step=2/2 micro_steps=550 elapsed=3.2s lr=2.000000e-03 loss=0.9355 loss_recon=0.9355 loss_meanflow=0.0000 mean_model_t=0.2041 mean_corrupt_t=0.2041 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.2041 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.2661 corrupt_frac=0.8725 acc_corrupt=0.1664 loss_corrupt=1.2309 wrong_frac=0.7949 init_acc_corrupt=0.1005 acc_corrupt_t_0p2_0p4=0.2618 corrupt_frac_t_0p2_0p4=1.0000 out_w_norm=8.8255 out_g_norm=0.3415 acc_corrupt_t_0p0_0p2=0.0799 corrupt_frac_t_0p0_0p2=1.0000 loss_all=7.2001 init_gold_top10=0.0592 init_gold_top100=0.1685
299
+ step=575 epoch=288/500 epoch_step=1/2 micro_steps=575 elapsed=6.0s lr=2.000000e-03 loss=1.0003 loss_recon=1.0003 loss_meanflow=0.0000 mean_model_t=0.2442 mean_corrupt_t=0.2442 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.2442 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.3211 corrupt_frac=0.8500 acc_corrupt=0.2051 loss_corrupt=1.3013 wrong_frac=0.7576 init_acc_corrupt=0.1518 acc_corrupt_t_0p0_0p2=0.0923 corrupt_frac_t_0p0_0p2=1.0000 out_w_norm=9.0870 out_g_norm=0.3515 acc_corrupt_t_0p2_0p4=0.2705 corrupt_frac_t_0p2_0p4=1.0000 acc_corrupt_t_0p4_0p6=0.5232 corrupt_frac_t_0p4_0p6=1.0000 loss_all=6.9261 init_gold_top10=0.0167 init_gold_top100=0.1306
300
+ step=600 epoch=300/500 epoch_step=2/2 micro_steps=600 elapsed=3.2s lr=2.000000e-03 loss=0.8200 loss_recon=0.8200 loss_meanflow=0.0000 mean_model_t=0.1951 mean_corrupt_t=0.1951 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.1951 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.2871 corrupt_frac=0.8375 acc_corrupt=0.1528 loss_corrupt=0.9724 wrong_frac=0.8035 init_acc_corrupt=0.0935 acc_corrupt_t_0p0_0p2=0.0905 corrupt_frac_t_0p0_0p2=1.0000 out_w_norm=9.3239 out_g_norm=0.4139 acc_corrupt_t_0p2_0p4=0.1989 corrupt_frac_t_0p2_0p4=1.0000 acc_corrupt_t_0p4_0p6=0.5833 corrupt_frac_t_0p4_0p6=1.0000 acc_corrupt_t_0p6_0p8=0.6484 corrupt_frac_t_0p6_0p8=1.0000 loss_all=4.5192 init_gold_top10=0.3650 init_gold_top100=0.3672
301
+ step=625 epoch=313/500 epoch_step=1/2 micro_steps=625 elapsed=7.0s lr=2.000000e-03 loss=0.9825 loss_recon=0.9825 loss_meanflow=0.0000 mean_model_t=0.2590 mean_corrupt_t=0.2590 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.2590 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.3329 corrupt_frac=0.8825 acc_corrupt=0.2458 loss_corrupt=1.0945 wrong_frac=0.7406 init_acc_corrupt=0.1867 acc_corrupt_t_0p2_0p4=0.2677 corrupt_frac_t_0p2_0p4=1.0000 out_w_norm=9.5319 out_g_norm=0.3527 acc_corrupt_t_0p0_0p2=0.0845 corrupt_frac_t_0p0_0p2=1.0000 acc_corrupt_t_0p4_0p6=0.5176 corrupt_frac_t_0p4_0p6=1.0000 loss_all=3.7167 init_gold_top10=0.4185 init_gold_top100=0.4208
302
+ step=650 epoch=325/500 epoch_step=2/2 micro_steps=650 elapsed=3.2s lr=2.000000e-03 loss=0.9874 loss_recon=0.9874 loss_meanflow=0.0000 mean_model_t=0.2612 mean_corrupt_t=0.2612 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.2612 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.3400 corrupt_frac=0.8725 acc_corrupt=0.2448 loss_corrupt=1.0896 wrong_frac=0.7398 init_acc_corrupt=0.1886 acc_corrupt_t_0p2_0p4=0.3127 corrupt_frac_t_0p2_0p4=1.0000 out_w_norm=9.7304 out_g_norm=0.3655 acc_corrupt_t_0p0_0p2=0.0755 corrupt_frac_t_0p0_0p2=1.0000 acc_corrupt_t_0p4_0p6=0.4844 corrupt_frac_t_0p4_0p6=1.0000 loss_all=3.5117 init_gold_top10=0.4688 init_gold_top100=0.4688
303
+ step=675 epoch=338/500 epoch_step=1/2 micro_steps=675 elapsed=6.0s lr=2.000000e-03 loss=0.8145 loss_recon=0.8145 loss_meanflow=0.0000 mean_model_t=0.1940 mean_corrupt_t=0.1940 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.1940 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.2916 corrupt_frac=0.8450 acc_corrupt=0.1620 loss_corrupt=0.9186 wrong_frac=0.8114 init_acc_corrupt=0.0945 acc_corrupt_t_0p0_0p2=0.0862 corrupt_frac_t_0p0_0p2=1.0000 out_w_norm=9.9168 out_g_norm=0.3564 acc_corrupt_t_0p2_0p4=0.2357 corrupt_frac_t_0p2_0p4=1.0000 acc_corrupt_t_0p4_0p6=0.4627 corrupt_frac_t_0p4_0p6=1.0000 loss_all=3.4227 init_gold_top10=0.4431 init_gold_top100=0.4442
304
+ step=700 epoch=350/500 epoch_step=2/2 micro_steps=700 elapsed=3.2s lr=2.000000e-03 loss=0.7076 loss_recon=0.7076 loss_meanflow=0.0000 mean_model_t=0.1733 mean_corrupt_t=0.1733 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.1733 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.2837 corrupt_frac=0.8350 acc_corrupt=0.1428 loss_corrupt=0.8889 wrong_frac=0.8329 init_acc_corrupt=0.0765 acc_corrupt_t_0p0_0p2=0.0757 corrupt_frac_t_0p0_0p2=1.0000 out_w_norm=10.0685 out_g_norm=0.3569 acc_corrupt_t_0p2_0p4=0.2488 corrupt_frac_t_0p2_0p4=1.0000 acc_corrupt_t_0p4_0p6=0.6315 corrupt_frac_t_0p4_0p6=1.0000 loss_all=4.7608 init_gold_top10=0.1953 init_gold_top100=0.2904
305
+ step=725 epoch=363/500 epoch_step=1/2 micro_steps=725 elapsed=6.0s lr=2.000000e-03 loss=0.8131 loss_recon=0.8131 loss_meanflow=0.0000 mean_model_t=0.2029 mean_corrupt_t=0.2029 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.2029 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.3042 corrupt_frac=0.8425 acc_corrupt=0.1742 loss_corrupt=1.1156 wrong_frac=0.8007 init_acc_corrupt=0.1016 acc_corrupt_t_0p2_0p4=0.2369 corrupt_frac_t_0p2_0p4=1.0000 out_w_norm=10.1892 out_g_norm=0.3261 acc_corrupt_t_0p4_0p6=0.5145 corrupt_frac_t_0p4_0p6=1.0000 acc_corrupt_t_0p0_0p2=0.0871 corrupt_frac_t_0p0_0p2=1.0000 loss_all=7.1434 init_gold_top10=0.0658 init_gold_top100=0.1920
306
+ step=750 epoch=375/500 epoch_step=2/2 micro_steps=750 elapsed=3.2s lr=2.000000e-03 loss=0.7671 loss_recon=0.7671 loss_meanflow=0.0000 mean_model_t=0.1835 mean_corrupt_t=0.1835 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.1835 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.2925 corrupt_frac=0.8425 acc_corrupt=0.1609 loss_corrupt=0.9390 wrong_frac=0.8120 init_acc_corrupt=0.0854 acc_corrupt_t_0p2_0p4=0.2501 corrupt_frac_t_0p2_0p4=1.0000 out_w_norm=10.3122 out_g_norm=0.3505 acc_corrupt_t_0p0_0p2=0.0844 corrupt_frac_t_0p0_0p2=1.0000 acc_corrupt_t_0p4_0p6=0.4344 corrupt_frac_t_0p4_0p6=1.0000 loss_all=4.6960 init_gold_top10=0.2768 init_gold_top100=0.3047
307
+ step=775 epoch=388/500 epoch_step=1/2 micro_steps=775 elapsed=6.8s lr=2.000000e-03 loss=0.7031 loss_recon=0.7031 loss_meanflow=0.0000 mean_model_t=0.1742 mean_corrupt_t=0.1742 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.1742 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.2743 corrupt_frac=0.8550 acc_corrupt=0.1519 loss_corrupt=0.9045 wrong_frac=0.8231 init_acc_corrupt=0.0854 acc_corrupt_t_0p0_0p2=0.0810 corrupt_frac_t_0p0_0p2=1.0000 out_w_norm=10.4271 out_g_norm=0.3425 acc_corrupt_t_0p4_0p6=0.4886 corrupt_frac_t_0p4_0p6=1.0000 acc_corrupt_t_0p2_0p4=0.2815 corrupt_frac_t_0p2_0p4=1.0000 loss_all=5.2330 init_gold_top10=0.1354 init_gold_top100=0.2617
308
+ step=800 epoch=400/500 epoch_step=2/2 micro_steps=800 elapsed=3.2s lr=2.000000e-03 loss=0.6103 loss_recon=0.6103 loss_meanflow=0.0000 mean_model_t=0.1450 mean_corrupt_t=0.1450 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.1450 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.2500 corrupt_frac=0.8475 acc_corrupt=0.1150 loss_corrupt=0.7678 wrong_frac=0.8575 init_acc_corrupt=0.0461 acc_corrupt_t_0p0_0p2=0.0809 corrupt_frac_t_0p0_0p2=1.0000 out_w_norm=10.5259 out_g_norm=0.3246 acc_corrupt_t_0p2_0p4=0.2161 corrupt_frac_t_0p2_0p4=1.0000 acc_corrupt_t_0p4_0p6=0.4479 corrupt_frac_t_0p4_0p6=1.0000 loss_all=4.2664 init_gold_top10=0.2057 init_gold_top100=0.2747
309
+ step=825 epoch=413/500 epoch_step=1/2 micro_steps=825 elapsed=6.0s lr=2.000000e-03 loss=0.7702 loss_recon=0.7702 loss_meanflow=0.0000 mean_model_t=0.2042 mean_corrupt_t=0.2042 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.2042 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.2791 corrupt_frac=0.8825 acc_corrupt=0.1831 loss_corrupt=1.0393 wrong_frac=0.7993 init_acc_corrupt=0.1002 acc_corrupt_t_0p0_0p2=0.0953 corrupt_frac_t_0p0_0p2=1.0000 out_w_norm=10.6312 out_g_norm=0.3265 acc_corrupt_t_0p2_0p4=0.2385 corrupt_frac_t_0p2_0p4=1.0000 acc_corrupt_t_0p4_0p6=0.5915 corrupt_frac_t_0p4_0p6=1.0000 loss_all=6.5603 init_gold_top10=0.0844 init_gold_top100=0.1875
310
+ step=850 epoch=425/500 epoch_step=2/2 micro_steps=850 elapsed=3.2s lr=2.000000e-03 loss=0.8016 loss_recon=0.8016 loss_meanflow=0.0000 mean_model_t=0.2162 mean_corrupt_t=0.2162 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.2162 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.3146 corrupt_frac=0.8500 acc_corrupt=0.1936 loss_corrupt=1.0525 wrong_frac=0.7869 init_acc_corrupt=0.1148 acc_corrupt_t_0p2_0p4=0.2723 corrupt_frac_t_0p2_0p4=1.0000 out_w_norm=10.7207 out_g_norm=0.3004 acc_corrupt_t_0p0_0p2=0.0904 corrupt_frac_t_0p0_0p2=1.0000 acc_corrupt_t_0p4_0p6=0.5135 corrupt_frac_t_0p4_0p6=1.0000 loss_all=6.0824 init_gold_top10=0.1281 init_gold_top100=0.2208
311
+ step=875 epoch=438/500 epoch_step=1/2 micro_steps=875 elapsed=6.0s lr=2.000000e-03 loss=0.5991 loss_recon=0.5991 loss_meanflow=0.0000 mean_model_t=0.1540 mean_corrupt_t=0.1540 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.1540 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.2811 corrupt_frac=0.8325 acc_corrupt=0.1364 loss_corrupt=0.9252 wrong_frac=0.8482 init_acc_corrupt=0.0609 acc_corrupt_t_0p2_0p4=0.2563 corrupt_frac_t_0p2_0p4=1.0000 out_w_norm=10.8209 out_g_norm=0.3307 acc_corrupt_t_0p0_0p2=0.0813 corrupt_frac_t_0p0_0p2=1.0000 acc_corrupt_t_0p4_0p6=0.4596 corrupt_frac_t_0p4_0p6=1.0000 loss_all=7.4078 init_gold_top10=0.0145 init_gold_top100=0.1306
312
+ step=900 epoch=450/500 epoch_step=2/2 micro_steps=900 elapsed=3.2s lr=2.000000e-03 loss=0.7143 loss_recon=0.7143 loss_meanflow=0.0000 mean_model_t=0.1903 mean_corrupt_t=0.1903 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.1903 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.2958 corrupt_frac=0.8500 acc_corrupt=0.1716 loss_corrupt=0.9272 wrong_frac=0.8049 init_acc_corrupt=0.0853 acc_corrupt_t_0p0_0p2=0.0970 corrupt_frac_t_0p0_0p2=1.0000 out_w_norm=10.9276 out_g_norm=0.2847 acc_corrupt_t_0p2_0p4=0.2633 corrupt_frac_t_0p2_0p4=1.0000 acc_corrupt_t_0p4_0p6=0.4594 corrupt_frac_t_0p4_0p6=1.0000 loss_all=5.3005 init_gold_top10=0.1261 init_gold_top100=0.2188
313
+ step=925 epoch=463/500 epoch_step=1/2 micro_steps=925 elapsed=6.8s lr=2.000000e-03 loss=0.7613 loss_recon=0.7613 loss_meanflow=0.0000 mean_model_t=0.2349 mean_corrupt_t=0.2349 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.2349 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.3118 corrupt_frac=0.8800 acc_corrupt=0.2180 loss_corrupt=0.9760 wrong_frac=0.7702 init_acc_corrupt=0.1401 acc_corrupt_t_0p0_0p2=0.1123 corrupt_frac_t_0p0_0p2=1.0000 out_w_norm=11.0396 out_g_norm=0.3330 acc_corrupt_t_0p4_0p6=0.5308 corrupt_frac_t_0p4_0p6=1.0000 acc_corrupt_t_0p2_0p4=0.3340 corrupt_frac_t_0p2_0p4=1.0000 loss_all=5.3976 init_gold_top10=0.1281 init_gold_top100=0.2458
314
+ step=950 epoch=475/500 epoch_step=2/2 micro_steps=950 elapsed=3.2s lr=2.000000e-03 loss=0.7476 loss_recon=0.7476 loss_meanflow=0.0000 mean_model_t=0.2230 mean_corrupt_t=0.2230 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.2230 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.3692 corrupt_frac=0.8100 acc_corrupt=0.2212 loss_corrupt=0.9145 wrong_frac=0.7714 init_acc_corrupt=0.1330 acc_corrupt_t_0p0_0p2=0.0942 corrupt_frac_t_0p0_0p2=1.0000 out_w_norm=11.1446 out_g_norm=0.3047 acc_corrupt_t_0p2_0p4=0.2574 corrupt_frac_t_0p2_0p4=1.0000 acc_corrupt_t_0p4_0p6=0.5126 corrupt_frac_t_0p4_0p6=1.0000 loss_all=4.2467 init_gold_top10=0.1966 init_gold_top100=0.2839
315
+ step=975 epoch=488/500 epoch_step=1/2 micro_steps=975 elapsed=6.0s lr=2.000000e-03 loss=0.7301 loss_recon=0.7301 loss_meanflow=0.0000 mean_model_t=0.2120 mean_corrupt_t=0.2120 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.2120 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.3079 corrupt_frac=0.8625 acc_corrupt=0.1976 loss_corrupt=0.8495 wrong_frac=0.7889 init_acc_corrupt=0.1092 acc_corrupt_t_0p0_0p2=0.1043 corrupt_frac_t_0p0_0p2=1.0000 out_w_norm=11.2519 out_g_norm=0.3183 acc_corrupt_t_0p2_0p4=0.2925 corrupt_frac_t_0p2_0p4=1.0000 acc_corrupt_t_0p4_0p6=0.4447 corrupt_frac_t_0p4_0p6=1.0000 loss_all=3.5521 init_gold_top10=0.3099 init_gold_top100=0.3177
316
+ step=1000 epoch=500/500 epoch_step=2/2 micro_steps=1000 elapsed=3.2s lr=2.000000e-03 loss=0.6367 loss_recon=0.6367 loss_meanflow=0.0000 mean_model_t=0.1827 mean_corrupt_t=0.1827 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.1827 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.2979 corrupt_frac=0.8475 acc_corrupt=0.1717 loss_corrupt=0.7287 wrong_frac=0.8224 init_acc_corrupt=0.0841 acc_corrupt_t_0p0_0p2=0.0930 corrupt_frac_t_0p0_0p2=1.0000 out_w_norm=11.3703 out_g_norm=0.2946 acc_corrupt_t_0p2_0p4=0.2936 corrupt_frac_t_0p2_0p4=1.0000 loss_all=2.8340 init_gold_top10=0.3828 init_gold_top100=0.3828
LTA_openwebtext_dualt/logs/softendpoint_mn_pilot_4gpu/train8_n128_compactv584_3l_bs512_hard_ce_onehot.log ADDED
The diff for this file is too large to render. See raw diff
 
LTA_openwebtext_dualt/logs/softendpoint_mn_pilot_4gpu/train8_n256_compactv969_3l_bs512_hard_ce_allcorrupt.log ADDED
The diff for this file is too large to render. See raw diff
 
LTA_openwebtext_dualt/logs/softendpoint_mn_pilot_4gpu/train8_n512_compactv1635_3l_bs512_hard_ce_onehot.log ADDED
The diff for this file is too large to render. See raw diff
 
LTA_openwebtext_dualt/logs/softendpoint_mn_pilot_4gpu/train8_n64_compactv335_3l_hard_ce_allcorrupt.log ADDED
The diff for this file is too large to render. See raw diff
 
LTA_openwebtext_dualt/logs/softendpoint_mn_pilot_4gpu/train8_n8_allcorrupt_linear_soft_kl_20260517_train8ctx8_allcorrupt.log ADDED
@@ -0,0 +1,326 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ NCCL version 2.25.1+cuda12.8
2
+ {
3
+ "device": "cuda:0",
4
+ "rank": 0,
5
+ "world_size": 4,
6
+ "samples": "owt_cached_chunks:8",
7
+ "vocab_size": 50257,
8
+ "tokenizer_vocab_size": 50257,
9
+ "save_dir": "runs/train8_n8_allcorrupt_linear_soft_kl_20260517_train8ctx8_allcorrupt",
10
+ "batch_size": 1,
11
+ "grad_accum": 1,
12
+ "effective_batch_size": 4,
13
+ "global_batch_size": 4,
14
+ "lr_schedule": "constant_warmup",
15
+ "optimizer": "muon",
16
+ "epochs": 0.0,
17
+ "steps_per_epoch": 2,
18
+ "total_steps": 500,
19
+ "warmup_steps": 10,
20
+ "warmup_epochs": -1.0,
21
+ "min_lr": 0.0,
22
+ "weight_decay": 0.1,
23
+ "output_weight_decay": -1.0,
24
+ "adamw_param_groups": "nanogpt",
25
+ "adam_beta1": 0.9,
26
+ "adam_beta2": 0.95,
27
+ "adam_eps": 1e-08,
28
+ "muon_impl": "legacy",
29
+ "muon_momentum": 0.95,
30
+ "muon_ns_steps": 5,
31
+ "muon_update_scale": 1.0,
32
+ "muon_nesterov": false,
33
+ "muon_width_scale": false,
34
+ "muon_grouping": "legacy_dim_ge_2",
35
+ "muon_param_count": 169453056,
36
+ "muon_adam_param_count": 122368,
37
+ "muon_param_names": [
38
+ "vocab_embed.embedding",
39
+ "sigma_map.net.0.weight",
40
+ "sigma_map.net.2.weight",
41
+ "blocks.0.attn_qkv.weight",
42
+ "blocks.0.attn_out.weight",
43
+ "blocks.0.mlp.0.weight",
44
+ "blocks.0.mlp.2.weight",
45
+ "blocks.0.adaLN_modulation.weight",
46
+ "blocks.1.attn_qkv.weight",
47
+ "blocks.1.attn_out.weight",
48
+ "blocks.1.mlp.0.weight",
49
+ "blocks.1.mlp.2.weight",
50
+ "blocks.1.adaLN_modulation.weight",
51
+ "blocks.2.attn_qkv.weight",
52
+ "blocks.2.attn_out.weight",
53
+ "blocks.2.mlp.0.weight",
54
+ "blocks.2.mlp.2.weight",
55
+ "blocks.2.adaLN_modulation.weight",
56
+ "blocks.3.attn_qkv.weight",
57
+ "blocks.3.attn_out.weight",
58
+ "blocks.3.mlp.0.weight",
59
+ "blocks.3.mlp.2.weight",
60
+ "blocks.3.adaLN_modulation.weight",
61
+ "blocks.4.attn_qkv.weight",
62
+ "blocks.4.attn_out.weight",
63
+ "blocks.4.mlp.0.weight",
64
+ "blocks.4.mlp.2.weight",
65
+ "blocks.4.adaLN_modulation.weight",
66
+ "blocks.5.attn_qkv.weight",
67
+ "blocks.5.attn_out.weight",
68
+ "blocks.5.mlp.0.weight",
69
+ "blocks.5.mlp.2.weight",
70
+ "blocks.5.adaLN_modulation.weight",
71
+ "blocks.6.attn_qkv.weight",
72
+ "blocks.6.attn_out.weight",
73
+ "blocks.6.mlp.0.weight",
74
+ "blocks.6.mlp.2.weight",
75
+ "blocks.6.adaLN_modulation.weight",
76
+ "blocks.7.attn_qkv.weight",
77
+ "blocks.7.attn_out.weight",
78
+ "blocks.7.mlp.0.weight",
79
+ "blocks.7.mlp.2.weight",
80
+ "blocks.7.adaLN_modulation.weight",
81
+ "blocks.8.attn_qkv.weight",
82
+ "blocks.8.attn_out.weight",
83
+ "blocks.8.mlp.0.weight",
84
+ "blocks.8.mlp.2.weight",
85
+ "blocks.8.adaLN_modulation.weight",
86
+ "blocks.9.attn_qkv.weight",
87
+ "blocks.9.attn_out.weight",
88
+ "blocks.9.mlp.0.weight",
89
+ "blocks.9.mlp.2.weight",
90
+ "blocks.9.adaLN_modulation.weight",
91
+ "blocks.10.attn_qkv.weight",
92
+ "blocks.10.attn_out.weight",
93
+ "blocks.10.mlp.0.weight",
94
+ "blocks.10.mlp.2.weight",
95
+ "blocks.10.adaLN_modulation.weight",
96
+ "blocks.11.attn_qkv.weight",
97
+ "blocks.11.attn_out.weight",
98
+ "blocks.11.mlp.0.weight",
99
+ "blocks.11.mlp.2.weight",
100
+ "blocks.11.adaLN_modulation.weight",
101
+ "output_layer.linear.weight",
102
+ "output_layer.adaLN_modulation.weight"
103
+ ],
104
+ "muon_adam_param_names": [
105
+ "sigma_map.net.0.bias",
106
+ "sigma_map.net.2.bias",
107
+ "blocks.0.norm1.weight",
108
+ "blocks.0.norm2.weight",
109
+ "blocks.0.mlp.0.bias",
110
+ "blocks.0.mlp.2.bias",
111
+ "blocks.0.adaLN_modulation.bias",
112
+ "blocks.1.norm1.weight",
113
+ "blocks.1.norm2.weight",
114
+ "blocks.1.mlp.0.bias",
115
+ "blocks.1.mlp.2.bias",
116
+ "blocks.1.adaLN_modulation.bias",
117
+ "blocks.2.norm1.weight",
118
+ "blocks.2.norm2.weight",
119
+ "blocks.2.mlp.0.bias",
120
+ "blocks.2.mlp.2.bias",
121
+ "blocks.2.adaLN_modulation.bias",
122
+ "blocks.3.norm1.weight",
123
+ "blocks.3.norm2.weight",
124
+ "blocks.3.mlp.0.bias",
125
+ "blocks.3.mlp.2.bias",
126
+ "blocks.3.adaLN_modulation.bias",
127
+ "blocks.4.norm1.weight",
128
+ "blocks.4.norm2.weight",
129
+ "blocks.4.mlp.0.bias",
130
+ "blocks.4.mlp.2.bias",
131
+ "blocks.4.adaLN_modulation.bias",
132
+ "blocks.5.norm1.weight",
133
+ "blocks.5.norm2.weight",
134
+ "blocks.5.mlp.0.bias",
135
+ "blocks.5.mlp.2.bias",
136
+ "blocks.5.adaLN_modulation.bias",
137
+ "blocks.6.norm1.weight",
138
+ "blocks.6.norm2.weight",
139
+ "blocks.6.mlp.0.bias",
140
+ "blocks.6.mlp.2.bias",
141
+ "blocks.6.adaLN_modulation.bias",
142
+ "blocks.7.norm1.weight",
143
+ "blocks.7.norm2.weight",
144
+ "blocks.7.mlp.0.bias",
145
+ "blocks.7.mlp.2.bias",
146
+ "blocks.7.adaLN_modulation.bias",
147
+ "blocks.8.norm1.weight",
148
+ "blocks.8.norm2.weight",
149
+ "blocks.8.mlp.0.bias",
150
+ "blocks.8.mlp.2.bias",
151
+ "blocks.8.adaLN_modulation.bias",
152
+ "blocks.9.norm1.weight",
153
+ "blocks.9.norm2.weight",
154
+ "blocks.9.mlp.0.bias",
155
+ "blocks.9.mlp.2.bias",
156
+ "blocks.9.adaLN_modulation.bias",
157
+ "blocks.10.norm1.weight",
158
+ "blocks.10.norm2.weight",
159
+ "blocks.10.mlp.0.bias",
160
+ "blocks.10.mlp.2.bias",
161
+ "blocks.10.adaLN_modulation.bias",
162
+ "blocks.11.norm1.weight",
163
+ "blocks.11.norm2.weight",
164
+ "blocks.11.mlp.0.bias",
165
+ "blocks.11.mlp.2.bias",
166
+ "blocks.11.adaLN_modulation.bias",
167
+ "output_layer.norm_final.weight",
168
+ "output_layer.adaLN_modulation.bias"
169
+ ],
170
+ "muon_effective_nesterov": false,
171
+ "muon_effective_width_scale": false,
172
+ "muon_effective_weight_decay": 0.1,
173
+ "muon_adam_fallback_nesterov": false,
174
+ "muon_adam_fallback_weight_decay": 0.1,
175
+ "ema_decay": 0.9999,
176
+ "ema_start_step": 0,
177
+ "model_type": "ddit",
178
+ "elf_num_time_tokens": 4,
179
+ "elf_num_model_mode_tokens": 0,
180
+ "qk_norm": true,
181
+ "output_bias": false,
182
+ "output_init_std": -1.0,
183
+ "norm_type": "rmsnorm",
184
+ "target_loss": "linear_soft_kl",
185
+ "linear_soft_target_power": 1.0,
186
+ "linear_soft_target_min_conf": 0.0,
187
+ "linear_soft_target_max_conf": 1.0,
188
+ "t_sampling_mode": "logit_normal",
189
+ "t_sampling_power": 1.0,
190
+ "t_sampling_eps": 0.0001,
191
+ "t_sampling_logit_mean": -1.5,
192
+ "t_sampling_logit_std": 0.8,
193
+ "dual_t": true,
194
+ "corrupt_t_mode": "same",
195
+ "corrupt_min_t": 0.0,
196
+ "corrupt_max_t": 1.0,
197
+ "prefix_block_prob": 0.0,
198
+ "prefix_block_len": 128,
199
+ "mask_ratio_floor_schedule": "none",
200
+ "dirichlet_endpoint_mode": "categorical_dual_t",
201
+ "dirichlet_semantic_t_mode": "same",
202
+ "dirichlet_semantic_t_value": 0.0,
203
+ "dirichlet_semantic_t_curve": "linear",
204
+ "dirichlet_semantic_t_power": 1.0,
205
+ "endpoint_sequence_random_prob_alpha": 0.0,
206
+ "categorical_wrong_from_full_vocab": true,
207
+ "categorical_wrong_from_batch_valid_tokens": false,
208
+ "categorical_wrong_basin_token_ids": "",
209
+ "categorical_wrong_basin_prob": 0.0,
210
+ "categorical_wrong_unigram_prob": 0.0,
211
+ "categorical_wrong_uniform_prob": 0.0,
212
+ "categorical_wrong_corpus_unigram_path": "",
213
+ "categorical_wrong_corpus_unigram_alpha": 1.0,
214
+ "categorical_wrong_basin_shared_prob": 0.0,
215
+ "categorical_wrong_unigram_shared_prob": 0.0,
216
+ "mask_mixture_original_prob": 0.0,
217
+ "mask_mixture_lowk_prob": 0.0,
218
+ "mask_mixture_lowcorrupt_prob": 0.0,
219
+ "mask_mixture_block_prob": 0.0,
220
+ "mask_mixture_all_prob": 1.0,
221
+ "mask_mixture_lowk_clean_tokens": "0",
222
+ "mask_mixture_lowcorrupt_tokens": "1,2,4,8,16,32,64",
223
+ "mask_mixture_block_tokens": "64,128",
224
+ "simplex_bridge_sampler": "dirichlet",
225
+ "logistic_normal_sigma_min": 0.18,
226
+ "logistic_normal_sigma_max": 2.2,
227
+ "logistic_normal_tau_min": 0.65,
228
+ "logistic_normal_tau_max": 1.15,
229
+ "torch_compile": false,
230
+ "compile_mode": "max-autotune",
231
+ "state_format": "prob",
232
+ "meanflow_weight": 0.0,
233
+ "rollout_train_prob": 0.0,
234
+ "rollout_train_steps": 1,
235
+ "rollout_train_infer_steps": 64,
236
+ "rollout_train_temp": 1.45,
237
+ "rollout_train_max_gamma": 1.0,
238
+ "rollout_train_corrupt_only": true,
239
+ "rollout_train_samplewise": false,
240
+ "rollout_train_compute_always": false,
241
+ "bridge_noise_init": "logistic_normal",
242
+ "noise_sigma": -1.0,
243
+ "allow_tf32": true,
244
+ "activation_checkpointing": false,
245
+ "activation_checkpoint_interval": 1,
246
+ "activation_checkpoint_scope": "block",
247
+ "ddp_static_graph": false,
248
+ "ddp_gradient_as_bucket_view": true,
249
+ "blocking_data_transfer": false,
250
+ "dataloader_prefetch_factor": 4,
251
+ "full_train_stats": false,
252
+ "tokenized_hf": false,
253
+ "tokenized_pad_token": "pad",
254
+ "elf_conditional_hf": false,
255
+ "record_pad_truncate": false,
256
+ "record_add_eos": false,
257
+ "record_add_special_tokens": false,
258
+ "record_pad_token": "pad",
259
+ "record_shuffle_buffer": 10000,
260
+ "wrap": true,
261
+ "wrap_mode": "stream",
262
+ "wrap_record_buffer_size": 200,
263
+ "owt_cached_chunks": true,
264
+ "owt_chunk_cache_dir": "/e2e-data/evad-tech-vla/wanghan58/data/small_benchmarks/langflow_2604_11748/openwebtext_lta_cached_chunks/gpt2_len8_train8_overfit",
265
+ "owt_chunk_cache_rebuild": false,
266
+ "owt_chunk_cache_write_batch": 4096,
267
+ "owt_exact_repeat_per_chunk": 0,
268
+ "online_chunk_shuffle": false,
269
+ "online_chunk_shuffle_buffer": 10000,
270
+ "openwebtext_split": "train_minus_100k",
271
+ "detokenizer": "auto",
272
+ "resolved_detokenizer": null,
273
+ "num_workers": 0,
274
+ "latest_every": 10,
275
+ "resume_path": ""
276
+ }
277
+ step=10 epoch=5/250 epoch_step=2/2 micro_steps=10 elapsed=1.9s lr=2.000000e-03 loss=2.5338 loss_recon=2.5338 loss_meanflow=0.0000 mean_model_t=0.2827 mean_corrupt_t=0.2827 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.2827 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.1750 corrupt_frac=1.0000 acc_corrupt=0.1750 loss_corrupt=3.1158 wrong_frac=0.7500 init_acc_corrupt=0.1625 acc_corrupt_t_0p0_0p2=0.0938 corrupt_frac_t_0p0_0p2=1.0000 out_w_norm=0.0172 out_g_norm=1.3628 acc_corrupt_t_0p6_0p8=0.1250 corrupt_frac_t_0p6_0p8=1.0000 acc_corrupt_t_0p2_0p4=0.1875 corrupt_frac_t_0p2_0p4=1.0000 acc_corrupt_t_0p4_0p6=0.5000 corrupt_frac_t_0p4_0p6=1.0000 loss_all=10.7812 init_gold_top10=0.3750 init_gold_top100=0.3750
278
+ step=20 epoch=10/250 epoch_step=2/2 micro_steps=20 elapsed=7.0s lr=2.000000e-03 loss=2.0841 loss_recon=2.0841 loss_meanflow=0.0000 mean_model_t=0.2379 mean_corrupt_t=0.2379 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.2379 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.1750 corrupt_frac=1.0000 acc_corrupt=0.1750 loss_corrupt=3.0768 wrong_frac=0.7250 init_acc_corrupt=0.1625 acc_corrupt_t_0p0_0p2=0.1250 corrupt_frac_t_0p0_0p2=1.0000 out_w_norm=0.1058 out_g_norm=1.5830 acc_corrupt_t_0p2_0p4=0.1875 corrupt_frac_t_0p2_0p4=1.0000 acc_corrupt_t_0p6_0p8=0.5000 corrupt_frac_t_0p6_0p8=1.0000 acc_corrupt_t_0p4_0p6=0.1250 corrupt_frac_t_0p4_0p6=1.0000 loss_all=10.7266 init_gold_top10=0.1250 init_gold_top100=0.1250
279
+ step=30 epoch=15/250 epoch_step=2/2 micro_steps=30 elapsed=6.7s lr=2.000000e-03 loss=2.1873 loss_recon=2.1873 loss_meanflow=0.0000 mean_model_t=0.2564 mean_corrupt_t=0.2564 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.2564 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.1625 corrupt_frac=1.0000 acc_corrupt=0.1625 loss_corrupt=3.0765 wrong_frac=0.7750 init_acc_corrupt=0.1750 acc_corrupt_t_0p2_0p4=0.1750 corrupt_frac_t_0p2_0p4=1.0000 out_w_norm=0.2196 out_g_norm=1.5462 acc_corrupt_t_0p4_0p6=0.1250 corrupt_frac_t_0p4_0p6=1.0000 acc_corrupt_t_0p0_0p2=0.1667 corrupt_frac_t_0p0_0p2=1.0000 loss_all=10.6484 init_gold_top10=0.0000 init_gold_top100=0.1250
280
+ step=40 epoch=20/250 epoch_step=2/2 micro_steps=40 elapsed=6.1s lr=2.000000e-03 loss=1.5748 loss_recon=1.5748 loss_meanflow=0.0000 mean_model_t=0.1906 mean_corrupt_t=0.1906 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.1906 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.1500 corrupt_frac=1.0000 acc_corrupt=0.1500 loss_corrupt=2.5082 wrong_frac=0.8625 init_acc_corrupt=0.0875 acc_corrupt_t_0p2_0p4=0.1250 corrupt_frac_t_0p2_0p4=1.0000 out_w_norm=0.3274 out_g_norm=1.4375 acc_corrupt_t_0p4_0p6=0.2500 corrupt_frac_t_0p4_0p6=1.0000 acc_corrupt_t_0p0_0p2=0.1458 corrupt_frac_t_0p0_0p2=1.0000 loss_all=10.5859 init_gold_top10=0.0000 init_gold_top100=0.1250
281
+ step=50 epoch=25/250 epoch_step=2/2 micro_steps=50 elapsed=5.6s lr=2.000000e-03 loss=1.2940 loss_recon=1.2940 loss_meanflow=0.0000 mean_model_t=0.1646 mean_corrupt_t=0.1646 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.1646 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.1625 corrupt_frac=1.0000 acc_corrupt=0.1625 loss_corrupt=2.2712 wrong_frac=0.8875 init_acc_corrupt=0.0250 acc_corrupt_t_0p0_0p2=0.1458 corrupt_frac_t_0p0_0p2=1.0000 out_w_norm=0.4224 out_g_norm=1.5049 acc_corrupt_t_0p2_0p4=0.1875 corrupt_frac_t_0p2_0p4=1.0000 loss_all=10.6875 init_gold_top10=0.0000 init_gold_top100=0.2500
282
+ step=60 epoch=30/250 epoch_step=2/2 micro_steps=60 elapsed=4.7s lr=2.000000e-03 loss=1.8988 loss_recon=1.8988 loss_meanflow=0.0000 mean_model_t=0.2362 mean_corrupt_t=0.2362 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.2362 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.1625 corrupt_frac=1.0000 acc_corrupt=0.1625 loss_corrupt=2.7098 wrong_frac=0.7250 init_acc_corrupt=0.0875 acc_corrupt_t_0p0_0p2=0.1250 corrupt_frac_t_0p0_0p2=1.0000 out_w_norm=0.5076 out_g_norm=1.8652 acc_corrupt_t_0p4_0p6=0.2500 corrupt_frac_t_0p4_0p6=1.0000 acc_corrupt_t_0p2_0p4=0.1750 corrupt_frac_t_0p2_0p4=1.0000 loss_all=10.1016 init_gold_top10=0.1250 init_gold_top100=0.2500
283
+ step=70 epoch=35/250 epoch_step=2/2 micro_steps=70 elapsed=5.2s lr=2.000000e-03 loss=1.4906 loss_recon=1.4906 loss_meanflow=0.0000 mean_model_t=0.1956 mean_corrupt_t=0.1956 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.1956 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.1500 corrupt_frac=1.0000 acc_corrupt=0.1500 loss_corrupt=2.3653 wrong_frac=0.8375 init_acc_corrupt=0.0375 acc_corrupt_t_0p2_0p4=0.1667 corrupt_frac_t_0p2_0p4=1.0000 out_w_norm=0.5888 out_g_norm=1.8518 acc_corrupt_t_0p0_0p2=0.1250 corrupt_frac_t_0p0_0p2=1.0000 loss_all=10.3281 init_gold_top10=0.1250 init_gold_top100=0.1250
284
+ step=80 epoch=40/250 epoch_step=2/2 micro_steps=80 elapsed=4.5s lr=2.000000e-03 loss=1.8378 loss_recon=1.8378 loss_meanflow=0.0000 mean_model_t=0.2455 mean_corrupt_t=0.2455 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.2455 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.2875 corrupt_frac=1.0000 acc_corrupt=0.2875 loss_corrupt=2.5135 wrong_frac=0.7250 init_acc_corrupt=0.2000 acc_corrupt_t_0p4_0p6=0.5417 corrupt_frac_t_0p4_0p6=1.0000 out_w_norm=0.6751 out_g_norm=1.7835 acc_corrupt_t_0p0_0p2=0.1250 corrupt_frac_t_0p0_0p2=1.0000 acc_corrupt_t_0p2_0p4=0.2500 corrupt_frac_t_0p2_0p4=1.0000 loss_all=9.0312 init_gold_top10=0.5000 init_gold_top100=0.5000
285
+ step=90 epoch=45/250 epoch_step=2/2 micro_steps=90 elapsed=4.1s lr=2.000000e-03 loss=1.6532 loss_recon=1.6532 loss_meanflow=0.0000 mean_model_t=0.2205 mean_corrupt_t=0.2205 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.2205 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.1500 corrupt_frac=1.0000 acc_corrupt=0.1500 loss_corrupt=2.2576 wrong_frac=0.7750 init_acc_corrupt=0.1375 acc_corrupt_t_0p0_0p2=0.1250 corrupt_frac_t_0p0_0p2=1.0000 out_w_norm=0.7642 out_g_norm=2.1013 acc_corrupt_t_0p2_0p4=0.1667 corrupt_frac_t_0p2_0p4=1.0000 acc_corrupt_t_0p4_0p6=0.1875 corrupt_frac_t_0p4_0p6=1.0000 loss_all=9.3867 init_gold_top10=0.3750 init_gold_top100=0.3750
286
+ step=100 epoch=50/250 epoch_step=2/2 micro_steps=100 elapsed=5.3s lr=2.000000e-03 loss=1.2407 loss_recon=1.2407 loss_meanflow=0.0000 mean_model_t=0.1800 mean_corrupt_t=0.1800 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.1800 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.2125 corrupt_frac=1.0000 acc_corrupt=0.2125 loss_corrupt=2.1568 wrong_frac=0.8375 init_acc_corrupt=0.0875 acc_corrupt_t_0p0_0p2=0.1458 corrupt_frac_t_0p0_0p2=1.0000 out_w_norm=0.8532 out_g_norm=1.9523 acc_corrupt_t_0p2_0p4=0.3125 corrupt_frac_t_0p2_0p4=1.0000 loss_all=10.0547 init_gold_top10=0.1250 init_gold_top100=0.2500
287
+ step=110 epoch=55/250 epoch_step=2/2 micro_steps=110 elapsed=4.4s lr=2.000000e-03 loss=1.5690 loss_recon=1.5690 loss_meanflow=0.0000 mean_model_t=0.2276 mean_corrupt_t=0.2276 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.2276 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.2375 corrupt_frac=1.0000 acc_corrupt=0.2375 loss_corrupt=2.4548 wrong_frac=0.8375 init_acc_corrupt=0.1375 acc_corrupt_t_0p0_0p2=0.1458 corrupt_frac_t_0p0_0p2=1.0000 out_w_norm=0.9430 out_g_norm=2.0350 acc_corrupt_t_0p4_0p6=0.3750 corrupt_frac_t_0p4_0p6=1.0000 acc_corrupt_t_0p2_0p4=0.3750 corrupt_frac_t_0p2_0p4=1.0000 loss_all=9.4180 init_gold_top10=0.0000 init_gold_top100=0.1250
288
+ step=120 epoch=60/250 epoch_step=2/2 micro_steps=120 elapsed=4.1s lr=2.000000e-03 loss=1.6581 loss_recon=1.6581 loss_meanflow=0.0000 mean_model_t=0.2553 mean_corrupt_t=0.2553 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.2553 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.3250 corrupt_frac=1.0000 acc_corrupt=0.3250 loss_corrupt=2.4954 wrong_frac=0.6750 init_acc_corrupt=0.2250 acc_corrupt_t_0p2_0p4=0.3929 corrupt_frac_t_0p2_0p4=1.0000 out_w_norm=1.0350 out_g_norm=2.3059 acc_corrupt_t_0p0_0p2=0.1667 corrupt_frac_t_0p0_0p2=1.0000 loss_all=9.1484 init_gold_top10=0.5000 init_gold_top100=0.5000
289
+ step=130 epoch=65/250 epoch_step=2/2 micro_steps=130 elapsed=4.6s lr=2.000000e-03 loss=1.4130 loss_recon=1.4130 loss_meanflow=0.0000 mean_model_t=0.2286 mean_corrupt_t=0.2286 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.2286 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.2375 corrupt_frac=1.0000 acc_corrupt=0.2375 loss_corrupt=2.1573 wrong_frac=0.7875 init_acc_corrupt=0.1375 acc_corrupt_t_0p0_0p2=0.1667 corrupt_frac_t_0p0_0p2=1.0000 out_w_norm=1.1300 out_g_norm=2.4490 acc_corrupt_t_0p2_0p4=0.2679 corrupt_frac_t_0p2_0p4=1.0000 loss_all=8.8047 init_gold_top10=0.1250 init_gold_top100=0.1250
290
+ step=140 epoch=70/250 epoch_step=2/2 micro_steps=140 elapsed=4.4s lr=2.000000e-03 loss=1.0509 loss_recon=1.0509 loss_meanflow=0.0000 mean_model_t=0.1682 mean_corrupt_t=0.1682 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.1682 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.1875 corrupt_frac=1.0000 acc_corrupt=0.1875 loss_corrupt=1.9215 wrong_frac=0.8375 init_acc_corrupt=0.0500 acc_corrupt_t_0p0_0p2=0.1250 corrupt_frac_t_0p0_0p2=1.0000 out_w_norm=1.2307 out_g_norm=2.0366 acc_corrupt_t_0p2_0p4=0.3333 corrupt_frac_t_0p2_0p4=1.0000 loss_all=9.1719 init_gold_top10=0.1250 init_gold_top100=0.1250
291
+ step=150 epoch=75/250 epoch_step=2/2 micro_steps=150 elapsed=4.1s lr=2.000000e-03 loss=1.3402 loss_recon=1.3402 loss_meanflow=0.0000 mean_model_t=0.2326 mean_corrupt_t=0.2326 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.2326 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.2625 corrupt_frac=1.0000 acc_corrupt=0.2625 loss_corrupt=1.9238 wrong_frac=0.7625 init_acc_corrupt=0.1375 acc_corrupt_t_0p0_0p2=0.1750 corrupt_frac_t_0p0_0p2=1.0000 out_w_norm=1.3304 out_g_norm=1.9295 acc_corrupt_t_0p2_0p4=0.3125 corrupt_frac_t_0p2_0p4=1.0000 acc_corrupt_t_0p4_0p6=0.5000 corrupt_frac_t_0p4_0p6=1.0000 loss_all=8.0742 init_gold_top10=0.2500 init_gold_top100=0.2500
292
+ step=160 epoch=80/250 epoch_step=2/2 micro_steps=160 elapsed=4.5s lr=2.000000e-03 loss=0.8337 loss_recon=0.8337 loss_meanflow=0.0000 mean_model_t=0.1882 mean_corrupt_t=0.1882 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.1882 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.3375 corrupt_frac=1.0000 acc_corrupt=0.3375 loss_corrupt=1.6679 wrong_frac=0.7500 init_acc_corrupt=0.1750 acc_corrupt_t_0p2_0p4=0.5000 corrupt_frac_t_0p2_0p4=1.0000 out_w_norm=1.4319 out_g_norm=1.8421 acc_corrupt_t_0p0_0p2=0.1750 corrupt_frac_t_0p0_0p2=1.0000 loss_all=8.7930 init_gold_top10=0.0000 init_gold_top100=0.0000
293
+ step=170 epoch=85/250 epoch_step=2/2 micro_steps=170 elapsed=5.3s lr=2.000000e-03 loss=0.9756 loss_recon=0.9756 loss_meanflow=0.0000 mean_model_t=0.2017 mean_corrupt_t=0.2017 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.2017 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.2250 corrupt_frac=1.0000 acc_corrupt=0.2250 loss_corrupt=1.3851 wrong_frac=0.7250 init_acc_corrupt=0.1000 acc_corrupt_t_0p2_0p4=0.3750 corrupt_frac_t_0p2_0p4=1.0000 out_w_norm=1.5338 out_g_norm=1.6552 acc_corrupt_t_0p0_0p2=0.0750 corrupt_frac_t_0p0_0p2=1.0000 loss_all=4.8604 init_gold_top10=0.5000 init_gold_top100=0.6250
294
+ step=180 epoch=90/250 epoch_step=2/2 micro_steps=180 elapsed=4.1s lr=2.000000e-03 loss=1.0499 loss_recon=1.0499 loss_meanflow=0.0000 mean_model_t=0.2081 mean_corrupt_t=0.2081 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.2081 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.2375 corrupt_frac=1.0000 acc_corrupt=0.2375 loss_corrupt=1.7474 wrong_frac=0.7875 init_acc_corrupt=0.1250 acc_corrupt_t_0p0_0p2=0.1667 corrupt_frac_t_0p0_0p2=1.0000 out_w_norm=1.6309 out_g_norm=1.7936 acc_corrupt_t_0p2_0p4=0.3438 corrupt_frac_t_0p2_0p4=1.0000 loss_all=8.0332 init_gold_top10=0.3750 init_gold_top100=0.5000
295
+ step=190 epoch=95/250 epoch_step=2/2 micro_steps=190 elapsed=4.5s lr=2.000000e-03 loss=0.9665 loss_recon=0.9665 loss_meanflow=0.0000 mean_model_t=0.1942 mean_corrupt_t=0.1942 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.1942 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.2250 corrupt_frac=1.0000 acc_corrupt=0.2250 loss_corrupt=1.7928 wrong_frac=0.8625 init_acc_corrupt=0.0625 acc_corrupt_t_0p0_0p2=0.1458 corrupt_frac_t_0p0_0p2=1.0000 out_w_norm=1.7201 out_g_norm=2.0560 acc_corrupt_t_0p2_0p4=0.3438 corrupt_frac_t_0p2_0p4=1.0000 loss_all=8.8516 init_gold_top10=0.1250 init_gold_top100=0.1250
296
+ step=200 epoch=100/250 epoch_step=2/2 micro_steps=200 elapsed=4.4s lr=2.000000e-03 loss=1.0310 loss_recon=1.0310 loss_meanflow=0.0000 mean_model_t=0.1970 mean_corrupt_t=0.1970 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.1970 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.1625 corrupt_frac=1.0000 acc_corrupt=0.1625 loss_corrupt=1.8039 wrong_frac=0.8250 init_acc_corrupt=0.0625 acc_corrupt_t_0p0_0p2=0.1250 corrupt_frac_t_0p0_0p2=1.0000 out_w_norm=1.8020 out_g_norm=2.0184 acc_corrupt_t_0p2_0p4=0.2500 corrupt_frac_t_0p2_0p4=1.0000 loss_all=7.9961 init_gold_top10=0.0000 init_gold_top100=0.3750
297
+ step=210 epoch=105/250 epoch_step=2/2 micro_steps=210 elapsed=4.1s lr=2.000000e-03 loss=0.8071 loss_recon=0.8071 loss_meanflow=0.0000 mean_model_t=0.1880 mean_corrupt_t=0.1880 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.1880 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.2500 corrupt_frac=1.0000 acc_corrupt=0.2500 loss_corrupt=1.2439 wrong_frac=0.8500 init_acc_corrupt=0.0875 acc_corrupt_t_0p2_0p4=0.3125 corrupt_frac_t_0p2_0p4=1.0000 out_w_norm=1.8702 out_g_norm=2.0898 acc_corrupt_t_0p0_0p2=0.2083 corrupt_frac_t_0p0_0p2=1.0000 loss_all=4.9170 init_gold_top10=0.2500 init_gold_top100=0.3750
298
+ step=220 epoch=110/250 epoch_step=2/2 micro_steps=220 elapsed=4.5s lr=2.000000e-03 loss=0.8528 loss_recon=0.8528 loss_meanflow=0.0000 mean_model_t=0.1718 mean_corrupt_t=0.1718 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.1718 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.2000 corrupt_frac=1.0000 acc_corrupt=0.2000 loss_corrupt=1.6071 wrong_frac=0.8500 init_acc_corrupt=0.0375 acc_corrupt_t_0p0_0p2=0.2083 corrupt_frac_t_0p0_0p2=1.0000 out_w_norm=1.9181 out_g_norm=1.7059 acc_corrupt_t_0p2_0p4=0.1875 corrupt_frac_t_0p2_0p4=1.0000 loss_all=8.7812 init_gold_top10=0.1250 init_gold_top100=0.1250
299
+ step=230 epoch=115/250 epoch_step=2/2 micro_steps=230 elapsed=4.4s lr=2.000000e-03 loss=0.8167 loss_recon=0.8167 loss_meanflow=0.0000 mean_model_t=0.3049 mean_corrupt_t=0.3049 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.3049 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.4375 corrupt_frac=1.0000 acc_corrupt=0.4375 loss_corrupt=1.4772 wrong_frac=0.6500 init_acc_corrupt=0.2875 acc_corrupt_t_0p4_0p6=0.7917 corrupt_frac_t_0p4_0p6=1.0000 out_w_norm=1.9604 out_g_norm=2.2768 acc_corrupt_t_0p2_0p4=0.4583 corrupt_frac_t_0p2_0p4=1.0000 acc_corrupt_t_0p0_0p2=0.1562 corrupt_frac_t_0p0_0p2=1.0000 loss_all=7.4824 init_gold_top10=0.2500 init_gold_top100=0.2500
300
+ step=240 epoch=120/250 epoch_step=2/2 micro_steps=240 elapsed=4.1s lr=2.000000e-03 loss=0.8203 loss_recon=0.8203 loss_meanflow=0.0000 mean_model_t=0.2693 mean_corrupt_t=0.2693 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.2693 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.3375 corrupt_frac=1.0000 acc_corrupt=0.3375 loss_corrupt=1.1419 wrong_frac=0.7125 init_acc_corrupt=0.2000 acc_corrupt_t_0p0_0p2=0.1875 corrupt_frac_t_0p0_0p2=1.0000 out_w_norm=2.0042 out_g_norm=2.2456 acc_corrupt_t_0p4_0p6=0.6250 corrupt_frac_t_0p4_0p6=1.0000 acc_corrupt_t_0p2_0p4=0.3438 corrupt_frac_t_0p2_0p4=1.0000 loss_all=4.4014 init_gold_top10=0.2500 init_gold_top100=0.2500
301
+ step=250 epoch=125/250 epoch_step=2/2 micro_steps=250 elapsed=4.5s lr=2.000000e-03 loss=0.5640 loss_recon=0.5640 loss_meanflow=0.0000 mean_model_t=0.1964 mean_corrupt_t=0.1964 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.1964 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.2875 corrupt_frac=1.0000 acc_corrupt=0.2875 loss_corrupt=0.8368 wrong_frac=0.7500 init_acc_corrupt=0.1250 acc_corrupt_t_0p0_0p2=0.1719 corrupt_frac_t_0p0_0p2=1.0000 out_w_norm=2.0422 out_g_norm=1.8608 acc_corrupt_t_0p6_0p8=0.8750 corrupt_frac_t_0p6_0p8=1.0000 acc_corrupt_t_0p2_0p4=0.6250 corrupt_frac_t_0p2_0p4=1.0000 loss_all=3.4316 init_gold_top10=0.5000 init_gold_top100=0.5000
302
+ step=260 epoch=130/250 epoch_step=2/2 micro_steps=260 elapsed=4.4s lr=2.000000e-03 loss=0.6435 loss_recon=0.6435 loss_meanflow=0.0000 mean_model_t=0.1517 mean_corrupt_t=0.1517 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.1517 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.1500 corrupt_frac=1.0000 acc_corrupt=0.1500 loss_corrupt=1.1853 wrong_frac=0.8750 init_acc_corrupt=0.0250 acc_corrupt_t_0p0_0p2=0.1429 corrupt_frac_t_0p0_0p2=1.0000 out_w_norm=2.0589 out_g_norm=1.9505 acc_corrupt_t_0p2_0p4=0.1667 corrupt_frac_t_0p2_0p4=1.0000 loss_all=5.7266 init_gold_top10=0.1250 init_gold_top100=0.5000
303
+ step=270 epoch=135/250 epoch_step=2/2 micro_steps=270 elapsed=4.0s lr=2.000000e-03 loss=0.6368 loss_recon=0.6368 loss_meanflow=0.0000 mean_model_t=0.1781 mean_corrupt_t=0.1781 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.1781 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.2500 corrupt_frac=1.0000 acc_corrupt=0.2500 loss_corrupt=1.0227 wrong_frac=0.7750 init_acc_corrupt=0.0875 acc_corrupt_t_0p0_0p2=0.1786 corrupt_frac_t_0p0_0p2=1.0000 out_w_norm=2.0719 out_g_norm=2.0863 acc_corrupt_t_0p2_0p4=0.4167 corrupt_frac_t_0p2_0p4=1.0000 loss_all=4.4688 init_gold_top10=0.3750 init_gold_top100=0.3750
304
+ step=280 epoch=140/250 epoch_step=2/2 micro_steps=280 elapsed=4.5s lr=2.000000e-03 loss=0.8962 loss_recon=0.8962 loss_meanflow=0.0000 mean_model_t=0.2199 mean_corrupt_t=0.2199 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.2199 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.1750 corrupt_frac=1.0000 acc_corrupt=0.1750 loss_corrupt=1.4513 wrong_frac=0.8000 init_acc_corrupt=0.0625 acc_corrupt_t_0p4_0p6=0.0000 corrupt_frac_t_0p4_0p6=1.0000 out_w_norm=2.0855 out_g_norm=2.2083 acc_corrupt_t_0p0_0p2=0.1000 corrupt_frac_t_0p0_0p2=1.0000 acc_corrupt_t_0p2_0p4=0.3125 corrupt_frac_t_0p2_0p4=1.0000 loss_all=6.1973 init_gold_top10=0.1250 init_gold_top100=0.2500
305
+ step=290 epoch=145/250 epoch_step=2/2 micro_steps=290 elapsed=4.4s lr=2.000000e-03 loss=0.6349 loss_recon=0.6349 loss_meanflow=0.0000 mean_model_t=0.2100 mean_corrupt_t=0.2100 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.2100 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.3000 corrupt_frac=1.0000 acc_corrupt=0.3000 loss_corrupt=1.0752 wrong_frac=0.7625 init_acc_corrupt=0.1625 acc_corrupt_t_0p2_0p4=0.3750 corrupt_frac_t_0p2_0p4=1.0000 out_w_norm=2.1006 out_g_norm=2.0542 acc_corrupt_t_0p0_0p2=0.2250 corrupt_frac_t_0p0_0p2=1.0000 loss_all=5.5759 init_gold_top10=0.2500 init_gold_top100=0.2500
306
+ step=300 epoch=150/250 epoch_step=2/2 micro_steps=300 elapsed=4.1s lr=2.000000e-03 loss=0.6497 loss_recon=0.6497 loss_meanflow=0.0000 mean_model_t=0.2037 mean_corrupt_t=0.2037 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.2037 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.2625 corrupt_frac=1.0000 acc_corrupt=0.2625 loss_corrupt=1.3369 wrong_frac=0.7250 init_acc_corrupt=0.0875 acc_corrupt_t_0p0_0p2=0.1750 corrupt_frac_t_0p0_0p2=1.0000 out_w_norm=2.1204 out_g_norm=2.0059 acc_corrupt_t_0p2_0p4=0.3500 corrupt_frac_t_0p2_0p4=1.0000 loss_all=7.2656 init_gold_top10=0.1250 init_gold_top100=0.5000
307
+ step=310 epoch=155/250 epoch_step=2/2 micro_steps=310 elapsed=4.5s lr=2.000000e-03 loss=0.5915 loss_recon=0.5915 loss_meanflow=0.0000 mean_model_t=0.1660 mean_corrupt_t=0.1660 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.1660 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.2125 corrupt_frac=1.0000 acc_corrupt=0.2125 loss_corrupt=1.2310 wrong_frac=0.8500 init_acc_corrupt=0.0875 acc_corrupt_t_0p0_0p2=0.1607 corrupt_frac_t_0p0_0p2=1.0000 out_w_norm=2.1404 out_g_norm=2.0428 acc_corrupt_t_0p4_0p6=0.8750 corrupt_frac_t_0p4_0p6=1.0000 acc_corrupt_t_0p2_0p4=0.0625 corrupt_frac_t_0p2_0p4=1.0000 loss_all=8.0625 init_gold_top10=0.0000 init_gold_top100=0.0000
308
+ step=320 epoch=160/250 epoch_step=2/2 micro_steps=320 elapsed=4.4s lr=2.000000e-03 loss=0.7207 loss_recon=0.7207 loss_meanflow=0.0000 mean_model_t=0.2203 mean_corrupt_t=0.2203 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.2203 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.2250 corrupt_frac=1.0000 acc_corrupt=0.2250 loss_corrupt=1.3024 wrong_frac=0.8250 init_acc_corrupt=0.1125 acc_corrupt_t_0p0_0p2=0.1458 corrupt_frac_t_0p0_0p2=1.0000 out_w_norm=2.1608 out_g_norm=2.2383 acc_corrupt_t_0p4_0p6=0.2500 corrupt_frac_t_0p4_0p6=1.0000 acc_corrupt_t_0p2_0p4=0.3750 corrupt_frac_t_0p2_0p4=1.0000 loss_all=6.6562 init_gold_top10=0.0000 init_gold_top100=0.3750
309
+ step=330 epoch=165/250 epoch_step=2/2 micro_steps=330 elapsed=4.0s lr=2.000000e-03 loss=0.6010 loss_recon=0.6010 loss_meanflow=0.0000 mean_model_t=0.2293 mean_corrupt_t=0.2293 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.2293 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.3000 corrupt_frac=1.0000 acc_corrupt=0.3000 loss_corrupt=0.7656 wrong_frac=0.6750 init_acc_corrupt=0.2125 acc_corrupt_t_0p0_0p2=0.0938 corrupt_frac_t_0p0_0p2=1.0000 out_w_norm=2.1767 out_g_norm=2.3064 acc_corrupt_t_0p2_0p4=0.4375 corrupt_frac_t_0p2_0p4=1.0000 loss_all=2.0283 init_gold_top10=0.6250 init_gold_top100=0.6250
310
+ step=340 epoch=170/250 epoch_step=2/2 micro_steps=340 elapsed=4.4s lr=2.000000e-03 loss=0.5942 loss_recon=0.5942 loss_meanflow=0.0000 mean_model_t=0.2682 mean_corrupt_t=0.2682 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.2682 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.3375 corrupt_frac=1.0000 acc_corrupt=0.3375 loss_corrupt=1.0234 wrong_frac=0.6875 init_acc_corrupt=0.2375 acc_corrupt_t_0p0_0p2=0.1250 corrupt_frac_t_0p0_0p2=1.0000 out_w_norm=2.1904 out_g_norm=2.4344 acc_corrupt_t_0p2_0p4=0.4167 corrupt_frac_t_0p2_0p4=1.0000 acc_corrupt_t_0p4_0p6=0.5000 corrupt_frac_t_0p4_0p6=1.0000 acc_corrupt_t_0p6_0p8=1.0000 corrupt_frac_t_0p6_0p8=1.0000 loss_all=4.6797 init_gold_top10=0.1250 init_gold_top100=0.1250
311
+ step=350 epoch=175/250 epoch_step=2/2 micro_steps=350 elapsed=4.4s lr=2.000000e-03 loss=0.6995 loss_recon=0.6995 loss_meanflow=0.0000 mean_model_t=0.1929 mean_corrupt_t=0.1929 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.1929 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.1750 corrupt_frac=1.0000 acc_corrupt=0.1750 loss_corrupt=1.1853 wrong_frac=0.8375 init_acc_corrupt=0.0750 acc_corrupt_t_0p2_0p4=0.2500 corrupt_frac_t_0p2_0p4=1.0000 out_w_norm=2.1949 out_g_norm=2.2119 acc_corrupt_t_0p0_0p2=0.1000 corrupt_frac_t_0p0_0p2=1.0000 loss_all=5.4766 init_gold_top10=0.1250 init_gold_top100=0.2500
312
+ step=360 epoch=180/250 epoch_step=2/2 micro_steps=360 elapsed=4.1s lr=2.000000e-03 loss=0.6252 loss_recon=0.6252 loss_meanflow=0.0000 mean_model_t=0.1923 mean_corrupt_t=0.1923 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.1923 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.2000 corrupt_frac=1.0000 acc_corrupt=0.2000 loss_corrupt=0.9015 wrong_frac=0.7875 init_acc_corrupt=0.1000 acc_corrupt_t_0p0_0p2=0.1250 corrupt_frac_t_0p0_0p2=1.0000 out_w_norm=2.1944 out_g_norm=1.9623 acc_corrupt_t_0p4_0p6=0.5000 corrupt_frac_t_0p4_0p6=1.0000 loss_all=4.1462 init_gold_top10=0.2500 init_gold_top100=0.2500
313
+ step=370 epoch=185/250 epoch_step=2/2 micro_steps=370 elapsed=4.5s lr=2.000000e-03 loss=0.4709 loss_recon=0.4709 loss_meanflow=0.0000 mean_model_t=0.1757 mean_corrupt_t=0.1757 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.1757 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.2750 corrupt_frac=1.0000 acc_corrupt=0.2750 loss_corrupt=0.8083 wrong_frac=0.8250 init_acc_corrupt=0.0750 acc_corrupt_t_0p2_0p4=0.3750 corrupt_frac_t_0p2_0p4=1.0000 out_w_norm=2.1991 out_g_norm=2.0457 acc_corrupt_t_0p0_0p2=0.2500 corrupt_frac_t_0p0_0p2=1.0000 loss_all=3.7109 init_gold_top10=0.2500 init_gold_top100=0.5000
314
+ step=380 epoch=190/250 epoch_step=2/2 micro_steps=380 elapsed=4.4s lr=2.000000e-03 loss=0.4684 loss_recon=0.4684 loss_meanflow=0.0000 mean_model_t=0.1794 mean_corrupt_t=0.1794 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.1794 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.2375 corrupt_frac=1.0000 acc_corrupt=0.2375 loss_corrupt=0.7979 wrong_frac=0.8375 init_acc_corrupt=0.1000 acc_corrupt_t_0p0_0p2=0.1250 corrupt_frac_t_0p0_0p2=1.0000 out_w_norm=2.2059 out_g_norm=2.0713 acc_corrupt_t_0p4_0p6=0.5000 corrupt_frac_t_0p4_0p6=1.0000 acc_corrupt_t_0p2_0p4=0.5000 corrupt_frac_t_0p2_0p4=1.0000 loss_all=4.0498 init_gold_top10=0.2500 init_gold_top100=0.2500
315
+ step=390 epoch=195/250 epoch_step=2/2 micro_steps=390 elapsed=4.1s lr=2.000000e-03 loss=0.6511 loss_recon=0.6511 loss_meanflow=0.0000 mean_model_t=0.1873 mean_corrupt_t=0.1873 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.1873 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.2500 corrupt_frac=1.0000 acc_corrupt=0.2500 loss_corrupt=1.1307 wrong_frac=0.8125 init_acc_corrupt=0.0750 acc_corrupt_t_0p2_0p4=0.3125 corrupt_frac_t_0p2_0p4=1.0000 out_w_norm=2.2128 out_g_norm=2.4076 acc_corrupt_t_0p0_0p2=0.2083 corrupt_frac_t_0p0_0p2=1.0000 loss_all=5.4785 init_gold_top10=0.1250 init_gold_top100=0.5000
316
+ step=400 epoch=200/250 epoch_step=2/2 micro_steps=400 elapsed=4.5s lr=2.000000e-03 loss=0.6558 loss_recon=0.6558 loss_meanflow=0.0000 mean_model_t=0.3288 mean_corrupt_t=0.3288 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.3288 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.3125 corrupt_frac=1.0000 acc_corrupt=0.3125 loss_corrupt=0.7121 wrong_frac=0.7250 init_acc_corrupt=0.2250 acc_corrupt_t_0p0_0p2=0.0833 corrupt_frac_t_0p0_0p2=1.0000 out_w_norm=2.2082 out_g_norm=2.2056 acc_corrupt_t_0p2_0p4=0.2188 corrupt_frac_t_0p2_0p4=1.0000 acc_corrupt_t_0p6_0p8=0.6250 corrupt_frac_t_0p6_0p8=1.0000 acc_corrupt_t_0p4_0p6=0.6875 corrupt_frac_t_0p4_0p6=1.0000 loss_all=1.0971 init_gold_top10=0.7500 init_gold_top100=0.7500
317
+ step=410 epoch=205/250 epoch_step=2/2 micro_steps=410 elapsed=5.9s lr=2.000000e-03 loss=0.6098 loss_recon=0.6098 loss_meanflow=0.0000 mean_model_t=0.2749 mean_corrupt_t=0.2749 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.2749 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.4000 corrupt_frac=1.0000 acc_corrupt=0.4000 loss_corrupt=0.8961 wrong_frac=0.6750 init_acc_corrupt=0.2375 acc_corrupt_t_0p0_0p2=0.1667 corrupt_frac_t_0p0_0p2=1.0000 out_w_norm=2.2129 out_g_norm=2.1538 acc_corrupt_t_0p2_0p4=0.4500 corrupt_frac_t_0p2_0p4=1.0000 acc_corrupt_t_0p4_0p6=0.6250 corrupt_frac_t_0p4_0p6=1.0000 loss_all=3.2842 init_gold_top10=0.3750 init_gold_top100=0.5000
318
+ step=420 epoch=210/250 epoch_step=2/2 micro_steps=420 elapsed=5.5s lr=2.000000e-03 loss=0.6001 loss_recon=0.6001 loss_meanflow=0.0000 mean_model_t=0.2123 mean_corrupt_t=0.2123 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.2123 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.2625 corrupt_frac=1.0000 acc_corrupt=0.2625 loss_corrupt=1.0973 wrong_frac=0.7750 init_acc_corrupt=0.1000 acc_corrupt_t_0p2_0p4=0.3125 corrupt_frac_t_0p2_0p4=1.0000 out_w_norm=2.2078 out_g_norm=2.0003 acc_corrupt_t_0p0_0p2=0.1875 corrupt_frac_t_0p0_0p2=1.0000 loss_all=5.4766 init_gold_top10=0.1250 init_gold_top100=0.3750
319
+ step=430 epoch=215/250 epoch_step=2/2 micro_steps=430 elapsed=6.0s lr=2.000000e-03 loss=0.5659 loss_recon=0.5659 loss_meanflow=0.0000 mean_model_t=0.1772 mean_corrupt_t=0.1772 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.1772 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.2750 corrupt_frac=1.0000 acc_corrupt=0.2750 loss_corrupt=0.9336 wrong_frac=0.8250 init_acc_corrupt=0.0875 acc_corrupt_t_0p0_0p2=0.2250 corrupt_frac_t_0p0_0p2=1.0000 out_w_norm=2.1960 out_g_norm=1.8503 acc_corrupt_t_0p2_0p4=0.3250 corrupt_frac_t_0p2_0p4=1.0000 loss_all=4.2871 init_gold_top10=0.1250 init_gold_top100=0.3750
320
+ step=440 epoch=220/250 epoch_step=2/2 micro_steps=440 elapsed=5.4s lr=2.000000e-03 loss=0.5390 loss_recon=0.5390 loss_meanflow=0.0000 mean_model_t=0.2333 mean_corrupt_t=0.2333 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.2333 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.3625 corrupt_frac=1.0000 acc_corrupt=0.3625 loss_corrupt=0.9075 wrong_frac=0.6875 init_acc_corrupt=0.1875 acc_corrupt_t_0p0_0p2=0.1562 corrupt_frac_t_0p0_0p2=1.0000 out_w_norm=2.1950 out_g_norm=1.9730 acc_corrupt_t_0p2_0p4=0.5000 corrupt_frac_t_0p2_0p4=1.0000 loss_all=4.2441 init_gold_top10=0.3750 init_gold_top100=0.6250
321
+ step=450 epoch=225/250 epoch_step=2/2 micro_steps=450 elapsed=4.3s lr=2.000000e-03 loss=0.5868 loss_recon=0.5868 loss_meanflow=0.0000 mean_model_t=0.1578 mean_corrupt_t=0.1578 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.1578 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.1500 corrupt_frac=1.0000 acc_corrupt=0.1500 loss_corrupt=1.2287 wrong_frac=0.8375 init_acc_corrupt=0.0500 acc_corrupt_t_0p0_0p2=0.1250 corrupt_frac_t_0p0_0p2=1.0000 out_w_norm=2.1938 out_g_norm=2.1070 acc_corrupt_t_0p2_0p4=0.2083 corrupt_frac_t_0p2_0p4=1.0000 loss_all=6.6562 init_gold_top10=0.0000 init_gold_top100=0.0000
322
+ step=460 epoch=230/250 epoch_step=2/2 micro_steps=460 elapsed=5.0s lr=2.000000e-03 loss=0.4614 loss_recon=0.4614 loss_meanflow=0.0000 mean_model_t=0.1884 mean_corrupt_t=0.1884 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.1884 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.1875 corrupt_frac=1.0000 acc_corrupt=0.1875 loss_corrupt=0.9782 wrong_frac=0.8250 init_acc_corrupt=0.1000 acc_corrupt_t_0p0_0p2=0.0179 corrupt_frac_t_0p0_0p2=1.0000 out_w_norm=2.1963 out_g_norm=2.1456 acc_corrupt_t_0p4_0p6=0.5625 corrupt_frac_t_0p4_0p6=1.0000 acc_corrupt_t_0p2_0p4=0.6250 corrupt_frac_t_0p2_0p4=1.0000 loss_all=5.5859 init_gold_top10=0.0000 init_gold_top100=0.1250
323
+ step=470 epoch=235/250 epoch_step=2/2 micro_steps=470 elapsed=5.8s lr=2.000000e-03 loss=0.5090 loss_recon=0.5090 loss_meanflow=0.0000 mean_model_t=0.1817 mean_corrupt_t=0.1817 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.1817 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.2375 corrupt_frac=1.0000 acc_corrupt=0.2375 loss_corrupt=1.1135 wrong_frac=0.8625 init_acc_corrupt=0.0750 acc_corrupt_t_0p0_0p2=0.1786 corrupt_frac_t_0p0_0p2=1.0000 out_w_norm=2.2051 out_g_norm=2.3825 acc_corrupt_t_0p2_0p4=0.3750 corrupt_frac_t_0p2_0p4=1.0000 acc_corrupt_t_0p4_0p6=0.3750 corrupt_frac_t_0p4_0p6=1.0000 loss_all=6.2793 init_gold_top10=0.0000 init_gold_top100=0.0000
324
+ step=480 epoch=240/250 epoch_step=2/2 micro_steps=480 elapsed=5.6s lr=2.000000e-03 loss=0.5577 loss_recon=0.5577 loss_meanflow=0.0000 mean_model_t=0.2314 mean_corrupt_t=0.2314 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.2314 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.3125 corrupt_frac=1.0000 acc_corrupt=0.3125 loss_corrupt=0.8093 wrong_frac=0.6750 init_acc_corrupt=0.1625 acc_corrupt_t_0p0_0p2=0.1000 corrupt_frac_t_0p0_0p2=1.0000 out_w_norm=2.2094 out_g_norm=2.1309 acc_corrupt_t_0p2_0p4=0.4688 corrupt_frac_t_0p2_0p4=1.0000 acc_corrupt_t_0p4_0p6=0.7500 corrupt_frac_t_0p4_0p6=1.0000 loss_all=2.8984 init_gold_top10=0.3750 init_gold_top100=0.5000
325
+ step=490 epoch=245/250 epoch_step=2/2 micro_steps=490 elapsed=6.4s lr=2.000000e-03 loss=0.5394 loss_recon=0.5394 loss_meanflow=0.0000 mean_model_t=0.2122 mean_corrupt_t=0.2122 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.2122 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.2750 corrupt_frac=1.0000 acc_corrupt=0.2750 loss_corrupt=0.8618 wrong_frac=0.7250 init_acc_corrupt=0.1375 acc_corrupt_t_0p0_0p2=0.1875 corrupt_frac_t_0p0_0p2=1.0000 out_w_norm=2.2091 out_g_norm=2.0059 acc_corrupt_t_0p2_0p4=0.2250 corrupt_frac_t_0p2_0p4=1.0000 acc_corrupt_t_0p4_0p6=0.8750 corrupt_frac_t_0p4_0p6=1.0000 loss_all=3.6338 init_gold_top10=0.2500 init_gold_top100=0.2500
326
+ step=500 epoch=250/250 epoch_step=2/2 micro_steps=500 elapsed=4.4s lr=2.000000e-03 loss=0.4201 loss_recon=0.4201 loss_meanflow=0.0000 mean_model_t=0.1335 mean_corrupt_t=0.1335 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.1335 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.1875 corrupt_frac=1.0000 acc_corrupt=0.1875 loss_corrupt=0.7494 wrong_frac=0.8875 init_acc_corrupt=0.0500 acc_corrupt_t_0p2_0p4=0.1875 corrupt_frac_t_0p2_0p4=1.0000 out_w_norm=2.2072 out_g_norm=1.6554 acc_corrupt_t_0p0_0p2=0.1875 corrupt_frac_t_0p0_0p2=1.0000 loss_all=3.9180 init_gold_top10=0.2500 init_gold_top100=0.2500
LTA_openwebtext_dualt/logs/softendpoint_mn_pilot_4gpu/train8_n8_compactv47_3l_linear_soft_kl_allcorrupt.log ADDED
The diff for this file is too large to render. See raw diff
 
LTA_openwebtext_dualt/logs/softendpoint_mn_pilot_4gpu/train8_n8_hard_ce_bridge_20260517_train8ctx8_overfit.log ADDED
@@ -0,0 +1,326 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ NCCL version 2.25.1+cuda12.8
2
+ {
3
+ "device": "cuda:0",
4
+ "rank": 0,
5
+ "world_size": 4,
6
+ "samples": "owt_cached_chunks:8",
7
+ "vocab_size": 50257,
8
+ "tokenizer_vocab_size": 50257,
9
+ "save_dir": "runs/train8_n8_hard_ce_bridge_20260517_train8ctx8_overfit",
10
+ "batch_size": 1,
11
+ "grad_accum": 1,
12
+ "effective_batch_size": 4,
13
+ "global_batch_size": 4,
14
+ "lr_schedule": "constant_warmup",
15
+ "optimizer": "muon",
16
+ "epochs": 0.0,
17
+ "steps_per_epoch": 2,
18
+ "total_steps": 500,
19
+ "warmup_steps": 10,
20
+ "warmup_epochs": -1.0,
21
+ "min_lr": 0.0,
22
+ "weight_decay": 0.1,
23
+ "output_weight_decay": -1.0,
24
+ "adamw_param_groups": "nanogpt",
25
+ "adam_beta1": 0.9,
26
+ "adam_beta2": 0.95,
27
+ "adam_eps": 1e-08,
28
+ "muon_impl": "legacy",
29
+ "muon_momentum": 0.95,
30
+ "muon_ns_steps": 5,
31
+ "muon_update_scale": 1.0,
32
+ "muon_nesterov": false,
33
+ "muon_width_scale": false,
34
+ "muon_grouping": "legacy_dim_ge_2",
35
+ "muon_param_count": 169453056,
36
+ "muon_adam_param_count": 122368,
37
+ "muon_param_names": [
38
+ "vocab_embed.embedding",
39
+ "sigma_map.net.0.weight",
40
+ "sigma_map.net.2.weight",
41
+ "blocks.0.attn_qkv.weight",
42
+ "blocks.0.attn_out.weight",
43
+ "blocks.0.mlp.0.weight",
44
+ "blocks.0.mlp.2.weight",
45
+ "blocks.0.adaLN_modulation.weight",
46
+ "blocks.1.attn_qkv.weight",
47
+ "blocks.1.attn_out.weight",
48
+ "blocks.1.mlp.0.weight",
49
+ "blocks.1.mlp.2.weight",
50
+ "blocks.1.adaLN_modulation.weight",
51
+ "blocks.2.attn_qkv.weight",
52
+ "blocks.2.attn_out.weight",
53
+ "blocks.2.mlp.0.weight",
54
+ "blocks.2.mlp.2.weight",
55
+ "blocks.2.adaLN_modulation.weight",
56
+ "blocks.3.attn_qkv.weight",
57
+ "blocks.3.attn_out.weight",
58
+ "blocks.3.mlp.0.weight",
59
+ "blocks.3.mlp.2.weight",
60
+ "blocks.3.adaLN_modulation.weight",
61
+ "blocks.4.attn_qkv.weight",
62
+ "blocks.4.attn_out.weight",
63
+ "blocks.4.mlp.0.weight",
64
+ "blocks.4.mlp.2.weight",
65
+ "blocks.4.adaLN_modulation.weight",
66
+ "blocks.5.attn_qkv.weight",
67
+ "blocks.5.attn_out.weight",
68
+ "blocks.5.mlp.0.weight",
69
+ "blocks.5.mlp.2.weight",
70
+ "blocks.5.adaLN_modulation.weight",
71
+ "blocks.6.attn_qkv.weight",
72
+ "blocks.6.attn_out.weight",
73
+ "blocks.6.mlp.0.weight",
74
+ "blocks.6.mlp.2.weight",
75
+ "blocks.6.adaLN_modulation.weight",
76
+ "blocks.7.attn_qkv.weight",
77
+ "blocks.7.attn_out.weight",
78
+ "blocks.7.mlp.0.weight",
79
+ "blocks.7.mlp.2.weight",
80
+ "blocks.7.adaLN_modulation.weight",
81
+ "blocks.8.attn_qkv.weight",
82
+ "blocks.8.attn_out.weight",
83
+ "blocks.8.mlp.0.weight",
84
+ "blocks.8.mlp.2.weight",
85
+ "blocks.8.adaLN_modulation.weight",
86
+ "blocks.9.attn_qkv.weight",
87
+ "blocks.9.attn_out.weight",
88
+ "blocks.9.mlp.0.weight",
89
+ "blocks.9.mlp.2.weight",
90
+ "blocks.9.adaLN_modulation.weight",
91
+ "blocks.10.attn_qkv.weight",
92
+ "blocks.10.attn_out.weight",
93
+ "blocks.10.mlp.0.weight",
94
+ "blocks.10.mlp.2.weight",
95
+ "blocks.10.adaLN_modulation.weight",
96
+ "blocks.11.attn_qkv.weight",
97
+ "blocks.11.attn_out.weight",
98
+ "blocks.11.mlp.0.weight",
99
+ "blocks.11.mlp.2.weight",
100
+ "blocks.11.adaLN_modulation.weight",
101
+ "output_layer.linear.weight",
102
+ "output_layer.adaLN_modulation.weight"
103
+ ],
104
+ "muon_adam_param_names": [
105
+ "sigma_map.net.0.bias",
106
+ "sigma_map.net.2.bias",
107
+ "blocks.0.norm1.weight",
108
+ "blocks.0.norm2.weight",
109
+ "blocks.0.mlp.0.bias",
110
+ "blocks.0.mlp.2.bias",
111
+ "blocks.0.adaLN_modulation.bias",
112
+ "blocks.1.norm1.weight",
113
+ "blocks.1.norm2.weight",
114
+ "blocks.1.mlp.0.bias",
115
+ "blocks.1.mlp.2.bias",
116
+ "blocks.1.adaLN_modulation.bias",
117
+ "blocks.2.norm1.weight",
118
+ "blocks.2.norm2.weight",
119
+ "blocks.2.mlp.0.bias",
120
+ "blocks.2.mlp.2.bias",
121
+ "blocks.2.adaLN_modulation.bias",
122
+ "blocks.3.norm1.weight",
123
+ "blocks.3.norm2.weight",
124
+ "blocks.3.mlp.0.bias",
125
+ "blocks.3.mlp.2.bias",
126
+ "blocks.3.adaLN_modulation.bias",
127
+ "blocks.4.norm1.weight",
128
+ "blocks.4.norm2.weight",
129
+ "blocks.4.mlp.0.bias",
130
+ "blocks.4.mlp.2.bias",
131
+ "blocks.4.adaLN_modulation.bias",
132
+ "blocks.5.norm1.weight",
133
+ "blocks.5.norm2.weight",
134
+ "blocks.5.mlp.0.bias",
135
+ "blocks.5.mlp.2.bias",
136
+ "blocks.5.adaLN_modulation.bias",
137
+ "blocks.6.norm1.weight",
138
+ "blocks.6.norm2.weight",
139
+ "blocks.6.mlp.0.bias",
140
+ "blocks.6.mlp.2.bias",
141
+ "blocks.6.adaLN_modulation.bias",
142
+ "blocks.7.norm1.weight",
143
+ "blocks.7.norm2.weight",
144
+ "blocks.7.mlp.0.bias",
145
+ "blocks.7.mlp.2.bias",
146
+ "blocks.7.adaLN_modulation.bias",
147
+ "blocks.8.norm1.weight",
148
+ "blocks.8.norm2.weight",
149
+ "blocks.8.mlp.0.bias",
150
+ "blocks.8.mlp.2.bias",
151
+ "blocks.8.adaLN_modulation.bias",
152
+ "blocks.9.norm1.weight",
153
+ "blocks.9.norm2.weight",
154
+ "blocks.9.mlp.0.bias",
155
+ "blocks.9.mlp.2.bias",
156
+ "blocks.9.adaLN_modulation.bias",
157
+ "blocks.10.norm1.weight",
158
+ "blocks.10.norm2.weight",
159
+ "blocks.10.mlp.0.bias",
160
+ "blocks.10.mlp.2.bias",
161
+ "blocks.10.adaLN_modulation.bias",
162
+ "blocks.11.norm1.weight",
163
+ "blocks.11.norm2.weight",
164
+ "blocks.11.mlp.0.bias",
165
+ "blocks.11.mlp.2.bias",
166
+ "blocks.11.adaLN_modulation.bias",
167
+ "output_layer.norm_final.weight",
168
+ "output_layer.adaLN_modulation.bias"
169
+ ],
170
+ "muon_effective_nesterov": false,
171
+ "muon_effective_width_scale": false,
172
+ "muon_effective_weight_decay": 0.1,
173
+ "muon_adam_fallback_nesterov": false,
174
+ "muon_adam_fallback_weight_decay": 0.1,
175
+ "ema_decay": 0.9999,
176
+ "ema_start_step": 0,
177
+ "model_type": "ddit",
178
+ "elf_num_time_tokens": 4,
179
+ "elf_num_model_mode_tokens": 0,
180
+ "qk_norm": true,
181
+ "output_bias": false,
182
+ "output_init_std": -1.0,
183
+ "norm_type": "rmsnorm",
184
+ "target_loss": "hard_ce",
185
+ "linear_soft_target_power": 1.0,
186
+ "linear_soft_target_min_conf": 0.0,
187
+ "linear_soft_target_max_conf": 1.0,
188
+ "t_sampling_mode": "logit_normal",
189
+ "t_sampling_power": 1.0,
190
+ "t_sampling_eps": 0.0001,
191
+ "t_sampling_logit_mean": -1.5,
192
+ "t_sampling_logit_std": 0.8,
193
+ "dual_t": true,
194
+ "corrupt_t_mode": "same",
195
+ "corrupt_min_t": 0.0,
196
+ "corrupt_max_t": 1.0,
197
+ "prefix_block_prob": 0.0,
198
+ "prefix_block_len": 128,
199
+ "mask_ratio_floor_schedule": "none",
200
+ "dirichlet_endpoint_mode": "categorical_dual_t",
201
+ "dirichlet_semantic_t_mode": "same",
202
+ "dirichlet_semantic_t_value": 0.0,
203
+ "dirichlet_semantic_t_curve": "linear",
204
+ "dirichlet_semantic_t_power": 1.0,
205
+ "endpoint_sequence_random_prob_alpha": 0.0,
206
+ "categorical_wrong_from_full_vocab": true,
207
+ "categorical_wrong_from_batch_valid_tokens": false,
208
+ "categorical_wrong_basin_token_ids": "",
209
+ "categorical_wrong_basin_prob": 0.0,
210
+ "categorical_wrong_unigram_prob": 0.0,
211
+ "categorical_wrong_uniform_prob": 0.0,
212
+ "categorical_wrong_corpus_unigram_path": "",
213
+ "categorical_wrong_corpus_unigram_alpha": 1.0,
214
+ "categorical_wrong_basin_shared_prob": 0.0,
215
+ "categorical_wrong_unigram_shared_prob": 0.0,
216
+ "mask_mixture_original_prob": 0.0,
217
+ "mask_mixture_lowk_prob": 1.0,
218
+ "mask_mixture_lowcorrupt_prob": 0.0,
219
+ "mask_mixture_block_prob": 0.0,
220
+ "mask_mixture_all_prob": 0.0,
221
+ "mask_mixture_lowk_clean_tokens": "1,2,4",
222
+ "mask_mixture_lowcorrupt_tokens": "1,2,4,8,16,32,64",
223
+ "mask_mixture_block_tokens": "64,128",
224
+ "simplex_bridge_sampler": "dirichlet",
225
+ "logistic_normal_sigma_min": 0.18,
226
+ "logistic_normal_sigma_max": 2.2,
227
+ "logistic_normal_tau_min": 0.65,
228
+ "logistic_normal_tau_max": 1.15,
229
+ "torch_compile": false,
230
+ "compile_mode": "max-autotune",
231
+ "state_format": "prob",
232
+ "meanflow_weight": 0.0,
233
+ "rollout_train_prob": 0.0,
234
+ "rollout_train_steps": 1,
235
+ "rollout_train_infer_steps": 64,
236
+ "rollout_train_temp": 1.45,
237
+ "rollout_train_max_gamma": 1.0,
238
+ "rollout_train_corrupt_only": true,
239
+ "rollout_train_samplewise": false,
240
+ "rollout_train_compute_always": false,
241
+ "bridge_noise_init": "logistic_normal",
242
+ "noise_sigma": -1.0,
243
+ "allow_tf32": true,
244
+ "activation_checkpointing": false,
245
+ "activation_checkpoint_interval": 1,
246
+ "activation_checkpoint_scope": "block",
247
+ "ddp_static_graph": false,
248
+ "ddp_gradient_as_bucket_view": true,
249
+ "blocking_data_transfer": false,
250
+ "dataloader_prefetch_factor": 4,
251
+ "full_train_stats": false,
252
+ "tokenized_hf": false,
253
+ "tokenized_pad_token": "pad",
254
+ "elf_conditional_hf": false,
255
+ "record_pad_truncate": false,
256
+ "record_add_eos": false,
257
+ "record_add_special_tokens": false,
258
+ "record_pad_token": "pad",
259
+ "record_shuffle_buffer": 10000,
260
+ "wrap": true,
261
+ "wrap_mode": "stream",
262
+ "wrap_record_buffer_size": 200,
263
+ "owt_cached_chunks": true,
264
+ "owt_chunk_cache_dir": "/e2e-data/evad-tech-vla/wanghan58/data/small_benchmarks/langflow_2604_11748/openwebtext_lta_cached_chunks/gpt2_len8_train8_overfit",
265
+ "owt_chunk_cache_rebuild": false,
266
+ "owt_chunk_cache_write_batch": 4096,
267
+ "owt_exact_repeat_per_chunk": 0,
268
+ "online_chunk_shuffle": false,
269
+ "online_chunk_shuffle_buffer": 10000,
270
+ "openwebtext_split": "train_minus_100k",
271
+ "detokenizer": "auto",
272
+ "resolved_detokenizer": null,
273
+ "num_workers": 0,
274
+ "latest_every": 10,
275
+ "resume_path": ""
276
+ }
277
+ step=10 epoch=5/250 epoch_step=2/2 micro_steps=10 elapsed=1.8s lr=2.000000e-03 loss=10.8125 loss_recon=10.8125 loss_meanflow=0.0000 mean_model_t=0.1856 mean_corrupt_t=0.1856 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.1125 corrupt_frac=0.7875 acc_corrupt=0.0952 loss_corrupt=10.8125 wrong_frac=0.8095 init_acc_corrupt=0.0476 acc_corrupt_t_0p0_0p2=0.0962 corrupt_frac_t_0p0_0p2=1.0000 out_w_norm=0.0163 out_g_norm=5.9134 acc_corrupt_t_0p2_0p4=0.0000 corrupt_frac_t_0p2_0p4=1.0000 acc_corrupt_t_0p4_0p6=0.2500 corrupt_frac_t_0p4_0p6=1.0000 loss_all=10.8125 init_gold_top10=0.0000 init_gold_top100=0.0000
278
+ step=20 epoch=10/250 epoch_step=2/2 micro_steps=20 elapsed=5.4s lr=2.000000e-03 loss=10.7460 loss_recon=10.7460 loss_meanflow=0.0000 mean_model_t=0.2173 mean_corrupt_t=0.2173 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.1375 corrupt_frac=0.7750 acc_corrupt=0.1290 loss_corrupt=10.7460 wrong_frac=0.8226 init_acc_corrupt=0.0968 acc_corrupt_t_0p2_0p4=0.0000 corrupt_frac_t_0p2_0p4=1.0000 out_w_norm=0.1009 out_g_norm=6.5812 acc_corrupt_t_0p0_0p2=0.1667 corrupt_frac_t_0p0_0p2=1.0000 acc_corrupt_t_0p4_0p6=0.1538 corrupt_frac_t_0p4_0p6=1.0000 loss_all=10.7109 init_gold_top10=0.4286 init_gold_top100=0.4286
279
+ step=30 epoch=15/250 epoch_step=2/2 micro_steps=30 elapsed=5.2s lr=2.000000e-03 loss=10.6771 loss_recon=10.6771 loss_meanflow=0.0000 mean_model_t=0.1648 mean_corrupt_t=0.1648 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.1500 corrupt_frac=0.6750 acc_corrupt=0.0926 loss_corrupt=10.6771 wrong_frac=0.8148 init_acc_corrupt=0.0185 acc_corrupt_t_0p0_0p2=0.0909 corrupt_frac_t_0p0_0p2=1.0000 out_w_norm=0.2149 out_g_norm=7.4784 acc_corrupt_t_0p2_0p4=0.1000 corrupt_frac_t_0p2_0p4=1.0000 loss_all=10.6094 init_gold_top10=0.7500 init_gold_top100=0.7500
280
+ step=40 epoch=20/250 epoch_step=2/2 micro_steps=40 elapsed=4.6s lr=2.000000e-03 loss=10.5590 loss_recon=10.5590 loss_meanflow=0.0000 mean_model_t=0.1595 mean_corrupt_t=0.1595 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.1375 corrupt_frac=0.6625 acc_corrupt=0.0943 loss_corrupt=10.5590 wrong_frac=0.8679 init_acc_corrupt=0.0377 acc_corrupt_t_0p0_0p2=0.1111 corrupt_frac_t_0p0_0p2=1.0000 out_w_norm=0.3232 out_g_norm=7.8194 acc_corrupt_t_0p2_0p4=0.0588 corrupt_frac_t_0p2_0p4=1.0000 loss_all=10.5078 init_gold_top10=0.0000 init_gold_top100=0.2857
281
+ step=50 epoch=25/250 epoch_step=2/2 micro_steps=50 elapsed=5.6s lr=2.000000e-03 loss=10.4746 loss_recon=10.4746 loss_meanflow=0.0000 mean_model_t=0.1662 mean_corrupt_t=0.1662 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.1375 corrupt_frac=0.7375 acc_corrupt=0.1017 loss_corrupt=10.4746 wrong_frac=0.7458 init_acc_corrupt=0.0678 acc_corrupt_t_0p0_0p2=0.1111 corrupt_frac_t_0p0_0p2=1.0000 out_w_norm=0.4216 out_g_norm=8.2502 acc_corrupt_t_0p2_0p4=0.0870 corrupt_frac_t_0p2_0p4=1.0000 loss_all=10.4141 init_gold_top10=0.4286 init_gold_top100=0.5714
282
+ step=60 epoch=30/250 epoch_step=2/2 micro_steps=60 elapsed=4.1s lr=2.000000e-03 loss=10.2844 loss_recon=10.2844 loss_meanflow=0.0000 mean_model_t=0.1705 mean_corrupt_t=0.1705 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.1375 corrupt_frac=0.7500 acc_corrupt=0.1167 loss_corrupt=10.2844 wrong_frac=0.8167 init_acc_corrupt=0.1000 acc_corrupt_t_0p0_0p2=0.1429 corrupt_frac_t_0p0_0p2=1.0000 out_w_norm=0.5234 out_g_norm=8.4039 acc_corrupt_t_0p2_0p4=0.0800 corrupt_frac_t_0p2_0p4=1.0000 loss_all=10.0469 init_gold_top10=0.0000 init_gold_top100=0.1429
283
+ step=70 epoch=35/250 epoch_step=2/2 micro_steps=70 elapsed=5.5s lr=2.000000e-03 loss=10.1806 loss_recon=10.1806 loss_meanflow=0.0000 mean_model_t=0.2329 mean_corrupt_t=0.2329 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.1500 corrupt_frac=0.6750 acc_corrupt=0.1296 loss_corrupt=10.1806 wrong_frac=0.7407 init_acc_corrupt=0.2037 acc_corrupt_t_0p0_0p2=0.1923 corrupt_frac_t_0p0_0p2=1.0000 out_w_norm=0.6238 out_g_norm=8.6946 acc_corrupt_t_0p4_0p6=0.0000 corrupt_frac_t_0p4_0p6=1.0000 acc_corrupt_t_0p2_0p4=0.1111 corrupt_frac_t_0p2_0p4=1.0000 loss_all=10.0781 init_gold_top10=0.5714 init_gold_top100=0.5714
284
+ step=80 epoch=40/250 epoch_step=2/2 micro_steps=80 elapsed=4.5s lr=2.000000e-03 loss=9.7620 loss_recon=9.7620 loss_meanflow=0.0000 mean_model_t=0.2648 mean_corrupt_t=0.2648 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.1750 corrupt_frac=0.6500 acc_corrupt=0.1731 loss_corrupt=9.7620 wrong_frac=0.7692 init_acc_corrupt=0.1538 acc_corrupt_t_0p2_0p4=0.1379 corrupt_frac_t_0p2_0p4=1.0000 out_w_norm=0.7095 out_g_norm=9.4154 acc_corrupt_t_0p0_0p2=0.2500 corrupt_frac_t_0p0_0p2=1.0000 acc_corrupt_t_0p4_0p6=0.1818 corrupt_frac_t_0p4_0p6=1.0000 loss_all=9.8906 init_gold_top10=0.0000 init_gold_top100=0.0000
285
+ step=90 epoch=45/250 epoch_step=2/2 micro_steps=90 elapsed=4.4s lr=2.000000e-03 loss=9.8625 loss_recon=9.8625 loss_meanflow=0.0000 mean_model_t=0.1982 mean_corrupt_t=0.1982 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.1750 corrupt_frac=0.7500 acc_corrupt=0.1333 loss_corrupt=9.8625 wrong_frac=0.7667 init_acc_corrupt=0.1000 acc_corrupt_t_0p0_0p2=0.0750 corrupt_frac_t_0p0_0p2=1.0000 out_w_norm=0.7873 out_g_norm=9.6620 acc_corrupt_t_0p2_0p4=0.2143 corrupt_frac_t_0p2_0p4=1.0000 acc_corrupt_t_0p4_0p6=0.3333 corrupt_frac_t_0p4_0p6=1.0000 loss_all=9.4219 init_gold_top10=0.2857 init_gold_top100=0.4286
286
+ step=100 epoch=50/250 epoch_step=2/2 micro_steps=100 elapsed=5.5s lr=2.000000e-03 loss=9.5792 loss_recon=9.5792 loss_meanflow=0.0000 mean_model_t=0.1833 mean_corrupt_t=0.1833 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.2125 corrupt_frac=0.7250 acc_corrupt=0.1552 loss_corrupt=9.5792 wrong_frac=0.8103 init_acc_corrupt=0.0690 acc_corrupt_t_0p2_0p4=0.1290 corrupt_frac_t_0p2_0p4=1.0000 out_w_norm=0.8603 out_g_norm=9.7993 acc_corrupt_t_0p0_0p2=0.1852 corrupt_frac_t_0p0_0p2=1.0000 loss_all=9.4141 init_gold_top10=0.2500 init_gold_top100=0.2500
287
+ step=110 epoch=55/250 epoch_step=2/2 micro_steps=110 elapsed=4.5s lr=2.000000e-03 loss=9.5256 loss_recon=9.5256 loss_meanflow=0.0000 mean_model_t=0.1922 mean_corrupt_t=0.1922 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.1625 corrupt_frac=0.6875 acc_corrupt=0.1455 loss_corrupt=9.5256 wrong_frac=0.8364 init_acc_corrupt=0.1091 acc_corrupt_t_0p4_0p6=0.0000 corrupt_frac_t_0p4_0p6=1.0000 out_w_norm=0.9352 out_g_norm=10.1661 acc_corrupt_t_0p2_0p4=0.2222 corrupt_frac_t_0p2_0p4=1.0000 acc_corrupt_t_0p0_0p2=0.1290 corrupt_frac_t_0p0_0p2=1.0000 loss_all=9.0781 init_gold_top10=0.0000 init_gold_top100=0.0000
288
+ step=120 epoch=60/250 epoch_step=2/2 micro_steps=120 elapsed=4.1s lr=2.000000e-03 loss=9.2604 loss_recon=9.2604 loss_meanflow=0.0000 mean_model_t=0.1497 mean_corrupt_t=0.1497 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.1625 corrupt_frac=0.7500 acc_corrupt=0.1833 loss_corrupt=9.2604 wrong_frac=0.7500 init_acc_corrupt=0.1167 acc_corrupt_t_0p0_0p2=0.1628 corrupt_frac_t_0p0_0p2=1.0000 out_w_norm=1.0120 out_g_norm=10.6578 acc_corrupt_t_0p2_0p4=0.2353 corrupt_frac_t_0p2_0p4=1.0000 loss_all=8.6797 init_gold_top10=0.1667 init_gold_top100=0.3333
289
+ step=130 epoch=65/250 epoch_step=2/2 micro_steps=130 elapsed=5.2s lr=2.000000e-03 loss=9.1183 loss_recon=9.1183 loss_meanflow=0.0000 mean_model_t=0.2275 mean_corrupt_t=0.2275 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.1625 corrupt_frac=0.6375 acc_corrupt=0.1176 loss_corrupt=9.1183 wrong_frac=0.7451 init_acc_corrupt=0.1176 acc_corrupt_t_0p0_0p2=0.1111 corrupt_frac_t_0p0_0p2=1.0000 out_w_norm=1.0883 out_g_norm=10.5466 acc_corrupt_t_0p2_0p4=0.1304 corrupt_frac_t_0p2_0p4=1.0000 acc_corrupt_t_0p4_0p6=0.1000 corrupt_frac_t_0p4_0p6=1.0000 loss_all=9.5859 init_gold_top10=0.2500 init_gold_top100=0.2500
290
+ step=140 epoch=70/250 epoch_step=2/2 micro_steps=140 elapsed=4.5s lr=2.000000e-03 loss=8.4357 loss_recon=8.4357 loss_meanflow=0.0000 mean_model_t=0.2169 mean_corrupt_t=0.2169 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.2000 corrupt_frac=0.6500 acc_corrupt=0.2115 loss_corrupt=8.4357 wrong_frac=0.8077 init_acc_corrupt=0.0769 acc_corrupt_t_0p2_0p4=0.2857 corrupt_frac_t_0p2_0p4=1.0000 out_w_norm=1.1685 out_g_norm=11.3089 acc_corrupt_t_0p0_0p2=0.1875 corrupt_frac_t_0p0_0p2=1.0000 acc_corrupt_t_0p4_0p6=0.1667 corrupt_frac_t_0p4_0p6=1.0000 loss_all=8.0703 init_gold_top10=0.3333 init_gold_top100=0.3333
291
+ step=150 epoch=75/250 epoch_step=2/2 micro_steps=150 elapsed=4.1s lr=2.000000e-03 loss=7.9952 loss_recon=7.9952 loss_meanflow=0.0000 mean_model_t=0.2117 mean_corrupt_t=0.2117 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.3625 corrupt_frac=0.7250 acc_corrupt=0.3276 loss_corrupt=7.9952 wrong_frac=0.6724 init_acc_corrupt=0.1552 acc_corrupt_t_0p0_0p2=0.1905 corrupt_frac_t_0p0_0p2=1.0000 out_w_norm=1.2502 out_g_norm=11.1589 acc_corrupt_t_0p2_0p4=0.4054 corrupt_frac_t_0p2_0p4=1.0000 loss_all=8.3125 init_gold_top10=0.0000 init_gold_top100=0.2500
292
+ step=160 epoch=80/250 epoch_step=2/2 micro_steps=160 elapsed=4.6s lr=2.000000e-03 loss=7.8244 loss_recon=7.8244 loss_meanflow=0.0000 mean_model_t=0.1973 mean_corrupt_t=0.1973 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.2250 corrupt_frac=0.6875 acc_corrupt=0.1818 loss_corrupt=7.8244 wrong_frac=0.8727 init_acc_corrupt=0.0545 acc_corrupt_t_0p0_0p2=0.1538 corrupt_frac_t_0p0_0p2=1.0000 out_w_norm=1.3346 out_g_norm=11.5378 acc_corrupt_t_0p2_0p4=0.2069 corrupt_frac_t_0p2_0p4=1.0000 loss_all=6.5117 init_gold_top10=0.0000 init_gold_top100=0.0000
293
+ step=170 epoch=85/250 epoch_step=2/2 micro_steps=170 elapsed=4.4s lr=2.000000e-03 loss=8.0737 loss_recon=8.0737 loss_meanflow=0.0000 mean_model_t=0.1858 mean_corrupt_t=0.1858 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.2375 corrupt_frac=0.7000 acc_corrupt=0.1429 loss_corrupt=8.0737 wrong_frac=0.8929 init_acc_corrupt=0.0179 acc_corrupt_t_0p0_0p2=0.1429 corrupt_frac_t_0p0_0p2=1.0000 out_w_norm=1.4206 out_g_norm=11.3048 acc_corrupt_t_0p2_0p4=0.1429 corrupt_frac_t_0p2_0p4=1.0000 loss_all=7.0273 init_gold_top10=0.1667 init_gold_top100=0.5000
294
+ step=180 epoch=90/250 epoch_step=2/2 micro_steps=180 elapsed=4.0s lr=2.000000e-03 loss=7.6565 loss_recon=7.6565 loss_meanflow=0.0000 mean_model_t=0.1801 mean_corrupt_t=0.1801 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.3000 corrupt_frac=0.7000 acc_corrupt=0.2321 loss_corrupt=7.6565 wrong_frac=0.8214 init_acc_corrupt=0.0893 acc_corrupt_t_0p0_0p2=0.1579 corrupt_frac_t_0p0_0p2=1.0000 out_w_norm=1.5044 out_g_norm=11.5652 acc_corrupt_t_0p2_0p4=0.2857 corrupt_frac_t_0p2_0p4=1.0000 acc_corrupt_t_0p4_0p6=0.7500 corrupt_frac_t_0p4_0p6=1.0000 loss_all=4.9751 init_gold_top10=0.5000 init_gold_top100=0.5000
295
+ step=190 epoch=95/250 epoch_step=2/2 micro_steps=190 elapsed=4.5s lr=2.000000e-03 loss=7.0855 loss_recon=7.0855 loss_meanflow=0.0000 mean_model_t=0.2116 mean_corrupt_t=0.2116 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.2875 corrupt_frac=0.7000 acc_corrupt=0.2321 loss_corrupt=7.0855 wrong_frac=0.7679 init_acc_corrupt=0.1250 acc_corrupt_t_0p0_0p2=0.0870 corrupt_frac_t_0p0_0p2=1.0000 out_w_norm=1.5845 out_g_norm=11.7599 acc_corrupt_t_0p2_0p4=0.3103 corrupt_frac_t_0p2_0p4=1.0000 acc_corrupt_t_0p4_0p6=0.5000 corrupt_frac_t_0p4_0p6=1.0000 loss_all=8.2539 init_gold_top10=0.0000 init_gold_top100=0.3333
296
+ step=200 epoch=100/250 epoch_step=2/2 micro_steps=200 elapsed=4.4s lr=2.000000e-03 loss=7.0000 loss_recon=7.0000 loss_meanflow=0.0000 mean_model_t=0.1888 mean_corrupt_t=0.1888 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.2500 corrupt_frac=0.6500 acc_corrupt=0.2115 loss_corrupt=7.0000 wrong_frac=0.7692 init_acc_corrupt=0.0769 acc_corrupt_t_0p2_0p4=0.2353 corrupt_frac_t_0p2_0p4=1.0000 out_w_norm=1.6635 out_g_norm=11.4887 acc_corrupt_t_0p0_0p2=0.1667 corrupt_frac_t_0p0_0p2=1.0000 loss_all=4.9365 init_gold_top10=0.0000 init_gold_top100=0.2500
297
+ step=210 epoch=105/250 epoch_step=2/2 micro_steps=210 elapsed=4.1s lr=2.000000e-03 loss=6.3846 loss_recon=6.3846 loss_meanflow=0.0000 mean_model_t=0.2222 mean_corrupt_t=0.2222 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.4125 corrupt_frac=0.7250 acc_corrupt=0.3276 loss_corrupt=6.3846 wrong_frac=0.7759 init_acc_corrupt=0.0862 acc_corrupt_t_0p2_0p4=0.4500 corrupt_frac_t_0p2_0p4=1.0000 out_w_norm=1.7426 out_g_norm=11.8660 acc_corrupt_t_0p0_0p2=0.0556 corrupt_frac_t_0p0_0p2=1.0000 loss_all=2.2339 init_gold_top10=0.7500 init_gold_top100=0.7500
298
+ step=220 epoch=110/250 epoch_step=2/2 micro_steps=220 elapsed=4.5s lr=2.000000e-03 loss=6.1355 loss_recon=6.1355 loss_meanflow=0.0000 mean_model_t=0.1720 mean_corrupt_t=0.1720 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.3125 corrupt_frac=0.7000 acc_corrupt=0.2500 loss_corrupt=6.1355 wrong_frac=0.8214 init_acc_corrupt=0.0893 acc_corrupt_t_0p0_0p2=0.1714 corrupt_frac_t_0p0_0p2=1.0000 out_w_norm=1.8209 out_g_norm=11.3624 acc_corrupt_t_0p2_0p4=0.3810 corrupt_frac_t_0p2_0p4=1.0000 loss_all=7.0828 init_gold_top10=0.1667 init_gold_top100=0.1667
299
+ step=230 epoch=115/250 epoch_step=2/2 micro_steps=230 elapsed=4.4s lr=2.000000e-03 loss=4.8994 loss_recon=4.8994 loss_meanflow=0.0000 mean_model_t=0.3280 mean_corrupt_t=0.3280 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.5250 corrupt_frac=0.6000 acc_corrupt=0.3542 loss_corrupt=4.8994 wrong_frac=0.6250 init_acc_corrupt=0.2500 acc_corrupt_t_0p0_0p2=0.2000 corrupt_frac_t_0p0_0p2=1.0000 out_w_norm=1.8942 out_g_norm=11.3461 acc_corrupt_t_0p6_0p8=0.7500 corrupt_frac_t_0p6_0p8=1.0000 acc_corrupt_t_0p4_0p6=0.5000 corrupt_frac_t_0p4_0p6=1.0000 acc_corrupt_t_0p2_0p4=0.2667 corrupt_frac_t_0p2_0p4=1.0000 loss_all=1.5292 init_gold_top10=0.5000 init_gold_top100=0.5000
300
+ step=240 epoch=120/250 epoch_step=2/2 micro_steps=240 elapsed=4.1s lr=2.000000e-03 loss=5.5022 loss_recon=5.5022 loss_meanflow=0.0000 mean_model_t=0.2149 mean_corrupt_t=0.2149 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.3625 corrupt_frac=0.7000 acc_corrupt=0.2857 loss_corrupt=5.5022 wrong_frac=0.8393 init_acc_corrupt=0.0893 acc_corrupt_t_0p0_0p2=0.2051 corrupt_frac_t_0p0_0p2=1.0000 out_w_norm=1.9660 out_g_norm=11.3102 acc_corrupt_t_0p6_0p8=0.3333 corrupt_frac_t_0p6_0p8=1.0000 acc_corrupt_t_0p2_0p4=0.5455 corrupt_frac_t_0p2_0p4=1.0000 loss_all=3.3633 init_gold_top10=0.2857 init_gold_top100=0.2857
301
+ step=250 epoch=125/250 epoch_step=2/2 micro_steps=250 elapsed=4.5s lr=2.000000e-03 loss=5.3018 loss_recon=5.3018 loss_meanflow=0.0000 mean_model_t=0.1982 mean_corrupt_t=0.1982 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.2625 corrupt_frac=0.7625 acc_corrupt=0.2459 loss_corrupt=5.3018 wrong_frac=0.8197 init_acc_corrupt=0.0984 acc_corrupt_t_0p2_0p4=0.4286 corrupt_frac_t_0p2_0p4=1.0000 out_w_norm=2.0377 out_g_norm=11.2366 acc_corrupt_t_0p0_0p2=0.1707 corrupt_frac_t_0p0_0p2=1.0000 acc_corrupt_t_0p4_0p6=0.3333 corrupt_frac_t_0p4_0p6=1.0000 loss_all=6.6230 init_gold_top10=0.0000 init_gold_top100=0.0000
302
+ step=260 epoch=130/250 epoch_step=2/2 micro_steps=260 elapsed=4.4s lr=2.000000e-03 loss=6.4312 loss_recon=6.4312 loss_meanflow=0.0000 mean_model_t=0.1619 mean_corrupt_t=0.1619 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.1750 corrupt_frac=0.7250 acc_corrupt=0.1552 loss_corrupt=6.4312 wrong_frac=0.8621 init_acc_corrupt=0.0345 acc_corrupt_t_0p0_0p2=0.1463 corrupt_frac_t_0p0_0p2=1.0000 out_w_norm=2.1092 out_g_norm=11.2571 acc_corrupt_t_0p2_0p4=0.1765 corrupt_frac_t_0p2_0p4=1.0000 loss_all=2.7271 init_gold_top10=0.0000 init_gold_top100=0.5000
303
+ step=270 epoch=135/250 epoch_step=2/2 micro_steps=270 elapsed=4.1s lr=2.000000e-03 loss=5.7238 loss_recon=5.7238 loss_meanflow=0.0000 mean_model_t=0.1655 mean_corrupt_t=0.1655 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.3125 corrupt_frac=0.7875 acc_corrupt=0.2381 loss_corrupt=5.7238 wrong_frac=0.8889 init_acc_corrupt=0.0794 acc_corrupt_t_0p0_0p2=0.1957 corrupt_frac_t_0p0_0p2=1.0000 out_w_norm=2.1768 out_g_norm=11.7862 acc_corrupt_t_0p2_0p4=0.3529 corrupt_frac_t_0p2_0p4=1.0000 loss_all=1.9777 init_gold_top10=0.0000 init_gold_top100=0.0000
304
+ step=280 epoch=140/250 epoch_step=2/2 micro_steps=280 elapsed=4.5s lr=2.000000e-03 loss=4.8150 loss_recon=4.8150 loss_meanflow=0.0000 mean_model_t=0.1937 mean_corrupt_t=0.1937 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.3250 corrupt_frac=0.6750 acc_corrupt=0.1852 loss_corrupt=4.8150 wrong_frac=0.8333 init_acc_corrupt=0.0556 acc_corrupt_t_0p0_0p2=0.1429 corrupt_frac_t_0p0_0p2=1.0000 out_w_norm=2.2385 out_g_norm=12.7686 acc_corrupt_t_0p4_0p6=0.2500 corrupt_frac_t_0p4_0p6=1.0000 acc_corrupt_t_0p2_0p4=0.3750 corrupt_frac_t_0p2_0p4=1.0000 loss_all=5.2206 init_gold_top10=0.0000 init_gold_top100=0.1667
305
+ step=290 epoch=145/250 epoch_step=2/2 micro_steps=290 elapsed=4.4s lr=2.000000e-03 loss=5.0242 loss_recon=5.0242 loss_meanflow=0.0000 mean_model_t=0.1749 mean_corrupt_t=0.1749 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.2750 corrupt_frac=0.7500 acc_corrupt=0.2333 loss_corrupt=5.0242 wrong_frac=0.9167 init_acc_corrupt=0.0667 acc_corrupt_t_0p0_0p2=0.1622 corrupt_frac_t_0p0_0p2=1.0000 out_w_norm=2.2959 out_g_norm=12.2660 acc_corrupt_t_0p2_0p4=0.3478 corrupt_frac_t_0p2_0p4=1.0000 loss_all=2.4589 init_gold_top10=0.3333 init_gold_top100=0.3333
306
+ step=300 epoch=150/250 epoch_step=2/2 micro_steps=300 elapsed=4.0s lr=2.000000e-03 loss=5.8570 loss_recon=5.8570 loss_meanflow=0.0000 mean_model_t=0.1787 mean_corrupt_t=0.1787 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.2750 corrupt_frac=0.6375 acc_corrupt=0.1176 loss_corrupt=5.8570 wrong_frac=0.8627 init_acc_corrupt=0.0392 acc_corrupt_t_0p2_0p4=0.2000 corrupt_frac_t_0p2_0p4=1.0000 out_w_norm=2.3494 out_g_norm=12.8436 acc_corrupt_t_0p0_0p2=0.0385 corrupt_frac_t_0p0_0p2=1.0000 loss_all=2.8794 init_gold_top10=0.0000 init_gold_top100=0.5000
307
+ step=310 epoch=155/250 epoch_step=2/2 micro_steps=310 elapsed=4.5s lr=2.000000e-03 loss=5.1161 loss_recon=5.1161 loss_meanflow=0.0000 mean_model_t=0.2153 mean_corrupt_t=0.2153 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.2625 corrupt_frac=0.7500 acc_corrupt=0.1833 loss_corrupt=5.1161 wrong_frac=0.8333 init_acc_corrupt=0.1333 acc_corrupt_t_0p0_0p2=0.0690 corrupt_frac_t_0p0_0p2=1.0000 out_w_norm=2.4004 out_g_norm=12.3832 acc_corrupt_t_0p4_0p6=0.6667 corrupt_frac_t_0p4_0p6=1.0000 acc_corrupt_t_0p2_0p4=0.2000 corrupt_frac_t_0p2_0p4=1.0000 loss_all=2.6932 init_gold_top10=0.2857 init_gold_top100=0.2857
308
+ step=320 epoch=160/250 epoch_step=2/2 micro_steps=320 elapsed=4.4s lr=2.000000e-03 loss=4.1836 loss_recon=4.1836 loss_meanflow=0.0000 mean_model_t=0.1894 mean_corrupt_t=0.1894 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.4000 corrupt_frac=0.6875 acc_corrupt=0.2909 loss_corrupt=4.1836 wrong_frac=0.8545 init_acc_corrupt=0.0364 acc_corrupt_t_0p2_0p4=0.3500 corrupt_frac_t_0p2_0p4=1.0000 out_w_norm=2.4487 out_g_norm=12.3994 acc_corrupt_t_0p0_0p2=0.2571 corrupt_frac_t_0p0_0p2=1.0000 loss_all=2.0757 init_gold_top10=0.2500 init_gold_top100=0.5000
309
+ step=330 epoch=165/250 epoch_step=2/2 micro_steps=330 elapsed=4.1s lr=2.000000e-03 loss=5.3740 loss_recon=5.3740 loss_meanflow=0.0000 mean_model_t=0.1705 mean_corrupt_t=0.1705 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.3250 corrupt_frac=0.7625 acc_corrupt=0.2131 loss_corrupt=5.3740 wrong_frac=0.8852 init_acc_corrupt=0.0492 acc_corrupt_t_0p2_0p4=0.2759 corrupt_frac_t_0p2_0p4=1.0000 out_w_norm=2.4920 out_g_norm=12.7280 acc_corrupt_t_0p0_0p2=0.1563 corrupt_frac_t_0p0_0p2=1.0000 loss_all=1.2651 init_gold_top10=0.2500 init_gold_top100=0.2500
310
+ step=340 epoch=170/250 epoch_step=2/2 micro_steps=340 elapsed=4.4s lr=2.000000e-03 loss=4.2422 loss_recon=4.2422 loss_meanflow=0.0000 mean_model_t=0.2121 mean_corrupt_t=0.2121 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.3750 corrupt_frac=0.6250 acc_corrupt=0.2800 loss_corrupt=4.2422 wrong_frac=0.7600 init_acc_corrupt=0.1400 acc_corrupt_t_0p0_0p2=0.1765 corrupt_frac_t_0p0_0p2=1.0000 out_w_norm=2.5284 out_g_norm=12.9125 acc_corrupt_t_0p2_0p4=0.5000 corrupt_frac_t_0p2_0p4=1.0000 loss_all=4.8362 init_gold_top10=0.0000 init_gold_top100=0.0000
311
+ step=350 epoch=175/250 epoch_step=2/2 micro_steps=350 elapsed=4.3s lr=2.000000e-03 loss=4.3841 loss_recon=4.3841 loss_meanflow=0.0000 mean_model_t=0.1489 mean_corrupt_t=0.1489 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.3250 corrupt_frac=0.6750 acc_corrupt=0.2407 loss_corrupt=4.3841 wrong_frac=0.7963 init_acc_corrupt=0.0926 acc_corrupt_t_0p4_0p6=0.6667 corrupt_frac_t_0p4_0p6=1.0000 out_w_norm=2.5599 out_g_norm=12.9114 acc_corrupt_t_0p2_0p4=1.0000 corrupt_frac_t_0p2_0p4=1.0000 acc_corrupt_t_0p0_0p2=0.1136 corrupt_frac_t_0p0_0p2=1.0000 loss_all=5.6359 init_gold_top10=0.0000 init_gold_top100=0.0000
312
+ step=360 epoch=180/250 epoch_step=2/2 micro_steps=360 elapsed=3.9s lr=2.000000e-03 loss=3.7976 loss_recon=3.7976 loss_meanflow=0.0000 mean_model_t=0.1694 mean_corrupt_t=0.1694 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.2750 corrupt_frac=0.7125 acc_corrupt=0.2281 loss_corrupt=3.7976 wrong_frac=0.8421 init_acc_corrupt=0.0877 acc_corrupt_t_0p2_0p4=0.3333 corrupt_frac_t_0p2_0p4=1.0000 out_w_norm=2.5899 out_g_norm=13.4754 acc_corrupt_t_0p0_0p2=0.1905 corrupt_frac_t_0p0_0p2=1.0000 loss_all=5.9990 init_gold_top10=0.0000 init_gold_top100=0.2500
313
+ step=370 epoch=185/250 epoch_step=2/2 micro_steps=370 elapsed=4.8s lr=2.000000e-03 loss=4.1272 loss_recon=4.1272 loss_meanflow=0.0000 mean_model_t=0.1564 mean_corrupt_t=0.1564 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.2625 corrupt_frac=0.7000 acc_corrupt=0.0714 loss_corrupt=4.1272 wrong_frac=0.9286 init_acc_corrupt=0.0000 acc_corrupt_t_0p2_0p4=0.1250 corrupt_frac_t_0p2_0p4=1.0000 out_w_norm=2.6152 out_g_norm=13.5264 acc_corrupt_t_0p0_0p2=0.0625 corrupt_frac_t_0p0_0p2=1.0000 loss_all=4.7281 init_gold_top10=0.0000 init_gold_top100=0.3333
314
+ step=380 epoch=190/250 epoch_step=2/2 micro_steps=380 elapsed=4.3s lr=2.000000e-03 loss=2.7788 loss_recon=2.7788 loss_meanflow=0.0000 mean_model_t=0.3014 mean_corrupt_t=0.3014 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.4875 corrupt_frac=0.7000 acc_corrupt=0.3929 loss_corrupt=2.7788 wrong_frac=0.6786 init_acc_corrupt=0.2500 acc_corrupt_t_0p2_0p4=0.3721 corrupt_frac_t_0p2_0p4=1.0000 out_w_norm=2.6379 out_g_norm=13.2091 acc_corrupt_t_0p6_0p8=1.0000 corrupt_frac_t_0p6_0p8=1.0000 acc_corrupt_t_0p0_0p2=0.0000 corrupt_frac_t_0p0_0p2=1.0000 loss_all=1.6825 init_gold_top10=0.3333 init_gold_top100=0.3333
315
+ step=390 epoch=195/250 epoch_step=2/2 micro_steps=390 elapsed=4.0s lr=2.000000e-03 loss=3.3236 loss_recon=3.3236 loss_meanflow=0.0000 mean_model_t=0.2232 mean_corrupt_t=0.2232 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.3250 corrupt_frac=0.7625 acc_corrupt=0.3115 loss_corrupt=3.3236 wrong_frac=0.6885 init_acc_corrupt=0.1475 acc_corrupt_t_0p4_0p6=0.5833 corrupt_frac_t_0p4_0p6=1.0000 out_w_norm=2.6586 out_g_norm=12.9198 acc_corrupt_t_0p2_0p4=0.3077 corrupt_frac_t_0p2_0p4=1.0000 acc_corrupt_t_0p0_0p2=0.1739 corrupt_frac_t_0p0_0p2=1.0000 loss_all=3.6035 init_gold_top10=0.1429 init_gold_top100=0.5714
316
+ step=400 epoch=200/250 epoch_step=2/2 micro_steps=400 elapsed=4.4s lr=2.000000e-03 loss=2.8541 loss_recon=2.8541 loss_meanflow=0.0000 mean_model_t=0.2852 mean_corrupt_t=0.2852 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.4375 corrupt_frac=0.7125 acc_corrupt=0.3509 loss_corrupt=2.8541 wrong_frac=0.7193 init_acc_corrupt=0.2105 acc_corrupt_t_0p0_0p2=0.1429 corrupt_frac_t_0p0_0p2=1.0000 out_w_norm=2.6788 out_g_norm=13.6963 acc_corrupt_t_0p2_0p4=0.3333 corrupt_frac_t_0p2_0p4=1.0000 acc_corrupt_t_0p4_0p6=0.8000 corrupt_frac_t_0p4_0p6=1.0000 acc_corrupt_t_0p6_0p8=0.5714 corrupt_frac_t_0p6_0p8=1.0000 loss_all=1.8984 init_gold_top10=0.0000 init_gold_top100=0.2500
317
+ step=410 epoch=205/250 epoch_step=2/2 micro_steps=410 elapsed=4.3s lr=2.000000e-03 loss=2.8174 loss_recon=2.8174 loss_meanflow=0.0000 mean_model_t=0.2321 mean_corrupt_t=0.2321 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.5000 corrupt_frac=0.7125 acc_corrupt=0.3509 loss_corrupt=2.8174 wrong_frac=0.6842 init_acc_corrupt=0.1579 acc_corrupt_t_0p2_0p4=0.3864 corrupt_frac_t_0p2_0p4=1.0000 out_w_norm=2.6980 out_g_norm=14.6062 acc_corrupt_t_0p0_0p2=0.2308 corrupt_frac_t_0p0_0p2=1.0000 loss_all=6.5630 init_gold_top10=0.1429 init_gold_top100=0.4286
318
+ step=420 epoch=210/250 epoch_step=2/2 micro_steps=420 elapsed=4.0s lr=2.000000e-03 loss=3.4428 loss_recon=3.4428 loss_meanflow=0.0000 mean_model_t=0.2283 mean_corrupt_t=0.2283 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.3625 corrupt_frac=0.7375 acc_corrupt=0.2712 loss_corrupt=3.4428 wrong_frac=0.7627 init_acc_corrupt=0.1017 acc_corrupt_t_0p0_0p2=0.1613 corrupt_frac_t_0p0_0p2=1.0000 out_w_norm=2.7121 out_g_norm=14.0254 acc_corrupt_t_0p2_0p4=0.2857 corrupt_frac_t_0p2_0p4=1.0000 acc_corrupt_t_0p4_0p6=0.7143 corrupt_frac_t_0p4_0p6=1.0000 loss_all=3.6406 init_gold_top10=0.0000 init_gold_top100=0.3333
319
+ step=430 epoch=215/250 epoch_step=2/2 micro_steps=430 elapsed=4.4s lr=2.000000e-03 loss=3.1890 loss_recon=3.1890 loss_meanflow=0.0000 mean_model_t=0.1566 mean_corrupt_t=0.1566 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.2500 corrupt_frac=0.7375 acc_corrupt=0.1695 loss_corrupt=3.1890 wrong_frac=0.8305 init_acc_corrupt=0.0169 acc_corrupt_t_0p0_0p2=0.1277 corrupt_frac_t_0p0_0p2=1.0000 out_w_norm=2.7216 out_g_norm=14.2216 acc_corrupt_t_0p2_0p4=0.3333 corrupt_frac_t_0p2_0p4=1.0000 loss_all=1.7258 init_gold_top10=0.0000 init_gold_top100=0.2500
320
+ step=440 epoch=220/250 epoch_step=2/2 micro_steps=440 elapsed=4.3s lr=2.000000e-03 loss=2.5924 loss_recon=2.5924 loss_meanflow=0.0000 mean_model_t=0.2137 mean_corrupt_t=0.2137 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.4625 corrupt_frac=0.6500 acc_corrupt=0.3654 loss_corrupt=2.5924 wrong_frac=0.8077 init_acc_corrupt=0.1346 acc_corrupt_t_0p0_0p2=0.2308 corrupt_frac_t_0p0_0p2=1.0000 out_w_norm=2.7297 out_g_norm=14.3430 acc_corrupt_t_0p2_0p4=0.4500 corrupt_frac_t_0p2_0p4=1.0000 acc_corrupt_t_0p4_0p6=0.6667 corrupt_frac_t_0p4_0p6=1.0000 loss_all=1.3561 init_gold_top10=0.2500 init_gold_top100=0.5000
321
+ step=450 epoch=225/250 epoch_step=2/2 micro_steps=450 elapsed=4.0s lr=2.000000e-03 loss=3.3114 loss_recon=3.3114 loss_meanflow=0.0000 mean_model_t=0.1809 mean_corrupt_t=0.1809 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.3375 corrupt_frac=0.8125 acc_corrupt=0.2923 loss_corrupt=3.3114 wrong_frac=0.7692 init_acc_corrupt=0.1385 acc_corrupt_t_0p0_0p2=0.1316 corrupt_frac_t_0p0_0p2=1.0000 out_w_norm=2.7403 out_g_norm=14.0755 acc_corrupt_t_0p2_0p4=0.4000 corrupt_frac_t_0p2_0p4=1.0000 acc_corrupt_t_0p4_0p6=0.8571 corrupt_frac_t_0p4_0p6=1.0000 loss_all=6.0078 init_gold_top10=0.0000 init_gold_top100=0.0000
322
+ step=460 epoch=230/250 epoch_step=2/2 micro_steps=460 elapsed=4.4s lr=2.000000e-03 loss=2.9129 loss_recon=2.9129 loss_meanflow=0.0000 mean_model_t=0.2244 mean_corrupt_t=0.2244 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.3375 corrupt_frac=0.7125 acc_corrupt=0.2281 loss_corrupt=2.9129 wrong_frac=0.8070 init_acc_corrupt=0.1404 acc_corrupt_t_0p2_0p4=0.3571 corrupt_frac_t_0p2_0p4=1.0000 out_w_norm=2.7499 out_g_norm=13.6439 acc_corrupt_t_0p0_0p2=0.1034 corrupt_frac_t_0p0_0p2=1.0000 loss_all=3.9619 init_gold_top10=0.1429 init_gold_top100=0.1429
323
+ step=470 epoch=235/250 epoch_step=2/2 micro_steps=470 elapsed=4.5s lr=2.000000e-03 loss=2.8460 loss_recon=2.8460 loss_meanflow=0.0000 mean_model_t=0.1958 mean_corrupt_t=0.1958 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.3375 corrupt_frac=0.7875 acc_corrupt=0.2857 loss_corrupt=2.8460 wrong_frac=0.7619 init_acc_corrupt=0.1429 acc_corrupt_t_0p0_0p2=0.2093 corrupt_frac_t_0p0_0p2=1.0000 out_w_norm=2.7602 out_g_norm=12.3965 acc_corrupt_t_0p2_0p4=0.3571 corrupt_frac_t_0p2_0p4=1.0000 acc_corrupt_t_0p4_0p6=0.6667 corrupt_frac_t_0p4_0p6=1.0000 loss_all=0.9632 init_gold_top10=0.5000 init_gold_top100=0.5000
324
+ step=480 epoch=240/250 epoch_step=2/2 micro_steps=480 elapsed=4.1s lr=2.000000e-03 loss=2.1795 loss_recon=2.1795 loss_meanflow=0.0000 mean_model_t=0.2166 mean_corrupt_t=0.2166 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.4375 corrupt_frac=0.7500 acc_corrupt=0.3833 loss_corrupt=2.1795 wrong_frac=0.7833 init_acc_corrupt=0.1833 acc_corrupt_t_0p2_0p4=0.5278 corrupt_frac_t_0p2_0p4=1.0000 out_w_norm=2.7764 out_g_norm=12.1876 acc_corrupt_t_0p0_0p2=0.1667 corrupt_frac_t_0p0_0p2=1.0000 loss_all=1.1442 init_gold_top10=0.2500 init_gold_top100=0.5000
325
+ step=490 epoch=245/250 epoch_step=2/2 micro_steps=490 elapsed=5.0s lr=2.000000e-03 loss=2.5202 loss_recon=2.5202 loss_meanflow=0.0000 mean_model_t=0.1816 mean_corrupt_t=0.1816 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.3875 corrupt_frac=0.7250 acc_corrupt=0.3103 loss_corrupt=2.5202 wrong_frac=0.8621 init_acc_corrupt=0.0862 acc_corrupt_t_0p0_0p2=0.2308 corrupt_frac_t_0p0_0p2=1.0000 out_w_norm=2.7920 out_g_norm=13.9710 acc_corrupt_t_0p2_0p4=0.3750 corrupt_frac_t_0p2_0p4=1.0000 loss_all=1.8340 init_gold_top10=0.1429 init_gold_top100=0.1429
326
+ step=500 epoch=250/250 epoch_step=2/2 micro_steps=500 elapsed=4.4s lr=2.000000e-03 loss=2.2207 loss_recon=2.2207 loss_meanflow=0.0000 mean_model_t=0.2459 mean_corrupt_t=0.2459 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.4125 corrupt_frac=0.7375 acc_corrupt=0.3390 loss_corrupt=2.2207 wrong_frac=0.7797 init_acc_corrupt=0.1695 acc_corrupt_t_0p2_0p4=0.2727 corrupt_frac_t_0p2_0p4=1.0000 out_w_norm=2.7991 out_g_norm=13.3679 acc_corrupt_t_0p0_0p2=0.3000 corrupt_frac_t_0p0_0p2=1.0000 acc_corrupt_t_0p6_0p8=0.7143 corrupt_frac_t_0p6_0p8=1.0000 loss_all=1.2793 init_gold_top10=0.2857 init_gold_top100=0.2857
LTA_openwebtext_dualt/logs/softendpoint_mn_pilot_4gpu/train8_noisegeo_len256_allcorrupt_fullvocab_dirC1_1024_20260517_163805.log ADDED
The diff for this file is too large to render. See raw diff
 
LTA_openwebtext_dualt/logs/softendpoint_mn_pilot_4gpu/train8_noisegeo_len256_allcorrupt_seqrand1_dirC1_1024_20260517_163805.log ADDED
@@ -0,0 +1,989 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ NCCL version 2.25.1+cuda12.8
2
+ {
3
+ "device": "cuda:0",
4
+ "rank": 0,
5
+ "world_size": 4,
6
+ "samples": "owt_cached_chunks:8",
7
+ "vocab_size": 969,
8
+ "tokenizer_vocab_size": 50257,
9
+ "save_dir": "runs/train8_noisegeo_len256_allcorrupt_seqrand1_dirC1_1024_20260517_163805",
10
+ "batch_size": 128,
11
+ "grad_accum": 1,
12
+ "effective_batch_size": 512,
13
+ "global_batch_size": 512,
14
+ "lr_schedule": "constant_warmup",
15
+ "optimizer": "muon",
16
+ "epochs": 0.0,
17
+ "steps_per_epoch": 1,
18
+ "total_steps": 1000,
19
+ "warmup_steps": 10,
20
+ "warmup_epochs": -1.0,
21
+ "min_lr": 0.0,
22
+ "weight_decay": 0.1,
23
+ "output_weight_decay": -1.0,
24
+ "adamw_param_groups": "nanogpt",
25
+ "adam_beta1": 0.9,
26
+ "adam_beta2": 0.95,
27
+ "adam_eps": 1e-08,
28
+ "muon_impl": "legacy",
29
+ "muon_momentum": 0.95,
30
+ "muon_ns_steps": 5,
31
+ "muon_update_scale": 1.0,
32
+ "muon_nesterov": false,
33
+ "muon_width_scale": false,
34
+ "muon_grouping": "legacy_dim_ge_2",
35
+ "muon_param_count": 1965440,
36
+ "muon_adam_param_count": 8192,
37
+ "muon_param_names": [
38
+ "vocab_embed.embedding",
39
+ "sigma_map.net.0.weight",
40
+ "sigma_map.net.2.weight",
41
+ "blocks.0.attn_qkv.weight",
42
+ "blocks.0.attn_out.weight",
43
+ "blocks.0.mlp.0.weight",
44
+ "blocks.0.mlp.2.weight",
45
+ "blocks.0.adaLN_modulation.weight",
46
+ "blocks.1.attn_qkv.weight",
47
+ "blocks.1.attn_out.weight",
48
+ "blocks.1.mlp.0.weight",
49
+ "blocks.1.mlp.2.weight",
50
+ "blocks.1.adaLN_modulation.weight",
51
+ "blocks.2.attn_qkv.weight",
52
+ "blocks.2.attn_out.weight",
53
+ "blocks.2.mlp.0.weight",
54
+ "blocks.2.mlp.2.weight",
55
+ "blocks.2.adaLN_modulation.weight",
56
+ "output_layer.linear.weight",
57
+ "output_layer.adaLN_modulation.weight"
58
+ ],
59
+ "muon_adam_param_names": [
60
+ "sigma_map.net.0.bias",
61
+ "sigma_map.net.2.bias",
62
+ "blocks.0.norm1.weight",
63
+ "blocks.0.norm2.weight",
64
+ "blocks.0.mlp.0.bias",
65
+ "blocks.0.mlp.2.bias",
66
+ "blocks.0.adaLN_modulation.bias",
67
+ "blocks.1.norm1.weight",
68
+ "blocks.1.norm2.weight",
69
+ "blocks.1.mlp.0.bias",
70
+ "blocks.1.mlp.2.bias",
71
+ "blocks.1.adaLN_modulation.bias",
72
+ "blocks.2.norm1.weight",
73
+ "blocks.2.norm2.weight",
74
+ "blocks.2.mlp.0.bias",
75
+ "blocks.2.mlp.2.bias",
76
+ "blocks.2.adaLN_modulation.bias",
77
+ "output_layer.norm_final.weight",
78
+ "output_layer.adaLN_modulation.bias"
79
+ ],
80
+ "muon_effective_nesterov": false,
81
+ "muon_effective_width_scale": false,
82
+ "muon_effective_weight_decay": 0.1,
83
+ "muon_adam_fallback_nesterov": false,
84
+ "muon_adam_fallback_weight_decay": 0.1,
85
+ "ema_decay": 0.9999,
86
+ "ema_start_step": 0,
87
+ "model_type": "ddit",
88
+ "ddit_mlp_type": "gelu",
89
+ "elf_num_time_tokens": 4,
90
+ "elf_num_model_mode_tokens": 0,
91
+ "qk_norm": true,
92
+ "output_bias": false,
93
+ "output_init_std": -1.0,
94
+ "norm_type": "rmsnorm",
95
+ "target_loss": "hard_ce",
96
+ "linear_soft_target_power": 1.0,
97
+ "linear_soft_target_min_conf": 0.0,
98
+ "linear_soft_target_max_conf": 1.0,
99
+ "t_sampling_mode": "logit_normal",
100
+ "t_sampling_power": 1.0,
101
+ "t_sampling_eps": 0.0001,
102
+ "t_sampling_logit_mean": -1.5,
103
+ "t_sampling_logit_std": 0.8,
104
+ "dual_t": true,
105
+ "corrupt_t_mode": "same",
106
+ "corrupt_min_t": 0.0,
107
+ "corrupt_max_t": 1.0,
108
+ "prefix_block_prob": 0.0,
109
+ "prefix_block_len": 128,
110
+ "mask_ratio_floor_schedule": "none",
111
+ "dirichlet_endpoint_mode": "categorical_dual_t",
112
+ "dirichlet_semantic_t_mode": "same",
113
+ "dirichlet_semantic_t_value": 0.0,
114
+ "dirichlet_semantic_t_curve": "linear",
115
+ "dirichlet_semantic_t_power": 1.0,
116
+ "endpoint_sequence_random_prob_alpha": 1.0,
117
+ "categorical_wrong_from_full_vocab": true,
118
+ "categorical_wrong_from_batch_valid_tokens": false,
119
+ "categorical_wrong_basin_token_ids": "",
120
+ "categorical_wrong_basin_prob": 0.0,
121
+ "categorical_wrong_unigram_prob": 0.0,
122
+ "categorical_wrong_uniform_prob": 0.0,
123
+ "categorical_wrong_corpus_unigram_path": "",
124
+ "categorical_wrong_corpus_unigram_alpha": 1.0,
125
+ "categorical_wrong_basin_shared_prob": 0.0,
126
+ "categorical_wrong_unigram_shared_prob": 0.0,
127
+ "mask_mixture_original_prob": 0.0,
128
+ "mask_mixture_lowk_prob": 0.0,
129
+ "mask_mixture_lowcorrupt_prob": 0.0,
130
+ "mask_mixture_block_prob": 0.0,
131
+ "mask_mixture_all_prob": 1.0,
132
+ "mask_mixture_lowk_clean_tokens": "0",
133
+ "mask_mixture_lowcorrupt_tokens": "1,2,4,8,16,32,64",
134
+ "mask_mixture_block_tokens": "64,128",
135
+ "simplex_bridge_sampler": "dirichlet",
136
+ "logistic_normal_sigma_min": 0.1,
137
+ "logistic_normal_sigma_max": 1.0,
138
+ "logistic_normal_tau_min": 1.0,
139
+ "logistic_normal_tau_max": 1.0,
140
+ "torch_compile": false,
141
+ "compile_mode": "max-autotune",
142
+ "state_format": "prob",
143
+ "meanflow_weight": 0.0,
144
+ "rollout_train_prob": 0.0,
145
+ "rollout_train_steps": 1,
146
+ "rollout_train_infer_steps": 64,
147
+ "rollout_train_temp": 1.45,
148
+ "rollout_train_max_gamma": 1.0,
149
+ "rollout_train_corrupt_only": true,
150
+ "rollout_train_samplewise": false,
151
+ "rollout_train_compute_always": false,
152
+ "bridge_noise_init": "logistic_normal",
153
+ "noise_sigma": -1.0,
154
+ "allow_tf32": true,
155
+ "activation_checkpointing": false,
156
+ "activation_checkpoint_interval": 1,
157
+ "activation_checkpoint_scope": "block",
158
+ "ddp_static_graph": false,
159
+ "ddp_gradient_as_bucket_view": true,
160
+ "blocking_data_transfer": false,
161
+ "dataloader_prefetch_factor": 4,
162
+ "full_train_stats": false,
163
+ "tokenized_hf": false,
164
+ "tokenized_pad_token": "pad",
165
+ "elf_conditional_hf": false,
166
+ "record_pad_truncate": false,
167
+ "record_add_eos": false,
168
+ "record_add_special_tokens": false,
169
+ "record_pad_token": "pad",
170
+ "record_shuffle_buffer": 10000,
171
+ "wrap": true,
172
+ "wrap_mode": "stream",
173
+ "wrap_record_buffer_size": 200,
174
+ "owt_cached_chunks": true,
175
+ "owt_chunk_cache_dir": "/e2e-data/evad-tech-vla/wanghan58/data/small_benchmarks/langflow_2604_11748/openwebtext_lta_cached_chunks/gpt2_len256_train8_compact_overfit",
176
+ "owt_chunk_cache_rebuild": false,
177
+ "owt_chunk_cache_write_batch": 4096,
178
+ "owt_exact_repeat_per_chunk": 64,
179
+ "online_chunk_shuffle": false,
180
+ "online_chunk_shuffle_buffer": 10000,
181
+ "openwebtext_split": "train_minus_100k",
182
+ "detokenizer": "auto",
183
+ "resolved_detokenizer": null,
184
+ "num_workers": 0,
185
+ "latest_every": 1000,
186
+ "resume_path": ""
187
+ }
188
+ step=100 epoch=100/1000 epoch_step=1/1 micro_steps=100 elapsed=4.7s lr=2.000000e-03 loss=6.7667 loss_recon=6.7667 loss_meanflow=0.0000 mean_model_t=0.2083 mean_corrupt_t=0.2083 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.0492 corrupt_frac=1.0000 acc_corrupt=0.0492 loss_corrupt=6.7667 wrong_frac=0.9425 init_acc_corrupt=0.0414 acc_corrupt_t_0p0_0p2=0.0352 corrupt_frac_t_0p0_0p2=0.5559 acc_corrupt_t_0p2_0p4=0.0536 corrupt_frac_t_0p2_0p4=0.3589 acc_corrupt_t_0p4_0p6=0.1117 corrupt_frac_t_0p4_0p6=0.0773 acc_corrupt_t_0p6_0p8=0.2184 corrupt_frac_t_0p6_0p8=0.0121 out_w_norm=0.7053 out_g_norm=1.2287 loss_all=6.6102 init_gold_top10=0.0613 init_gold_top100=0.2903
189
+ step=200 epoch=200/1000 epoch_step=1/1 micro_steps=200 elapsed=3.9s lr=2.000000e-03 loss=6.3651 loss_recon=6.3651 loss_meanflow=0.0000 mean_model_t=0.2108 mean_corrupt_t=0.2108 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.0567 corrupt_frac=1.0000 acc_corrupt=0.0567 loss_corrupt=6.3651 wrong_frac=0.9405 init_acc_corrupt=0.0435 acc_corrupt_t_0p0_0p2=0.0415 corrupt_frac_t_0p0_0p2=0.5516 acc_corrupt_t_0p2_0p4=0.0619 corrupt_frac_t_0p2_0p4=0.3621 acc_corrupt_t_0p4_0p6=0.1215 corrupt_frac_t_0p4_0p6=0.0781 acc_corrupt_t_0p6_0p8=0.2278 corrupt_frac_t_0p6_0p8=0.0123 out_w_norm=1.6246 out_g_norm=1.7241 acc_corrupt_t_0p8_1p0=0.3841 corrupt_frac_t_0p8_1p0=0.0078 loss_all=6.1362 init_gold_top10=0.0640 init_gold_top100=0.3024
190
+ step=300 epoch=300/1000 epoch_step=1/1 micro_steps=300 elapsed=3.9s lr=2.000000e-03 loss=5.9542 loss_recon=5.9542 loss_meanflow=0.0000 mean_model_t=0.2067 mean_corrupt_t=0.2067 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.0624 corrupt_frac=1.0000 acc_corrupt=0.0624 loss_corrupt=5.9542 wrong_frac=0.9431 init_acc_corrupt=0.0408 acc_corrupt_t_0p0_0p2=0.0447 corrupt_frac_t_0p0_0p2=0.5641 acc_corrupt_t_0p2_0p4=0.0701 corrupt_frac_t_0p2_0p4=0.3542 acc_corrupt_t_0p4_0p6=0.1385 corrupt_frac_t_0p4_0p6=0.0734 acc_corrupt_t_0p6_0p8=0.2581 corrupt_frac_t_0p6_0p8=0.0121 out_w_norm=2.6115 out_g_norm=0.7201 acc_corrupt_t_0p8_1p0=0.4961 corrupt_frac_t_0p8_1p0=0.0078 loss_all=5.7587 init_gold_top10=0.0671 init_gold_top100=0.3087
191
+ step=400 epoch=400/1000 epoch_step=1/1 micro_steps=400 elapsed=3.9s lr=2.000000e-03 loss=5.4663 loss_recon=5.4663 loss_meanflow=0.0000 mean_model_t=0.2085 mean_corrupt_t=0.2085 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.0756 corrupt_frac=1.0000 acc_corrupt=0.0756 loss_corrupt=5.4663 wrong_frac=0.9416 init_acc_corrupt=0.0425 acc_corrupt_t_0p0_0p2=0.0525 corrupt_frac_t_0p0_0p2=0.5564 acc_corrupt_t_0p2_0p4=0.0842 corrupt_frac_t_0p2_0p4=0.3620 acc_corrupt_t_0p4_0p6=0.1813 corrupt_frac_t_0p4_0p6=0.0719 out_w_norm=3.8283 out_g_norm=0.5630 acc_corrupt_t_0p6_0p8=0.2909 corrupt_frac_t_0p6_0p8=0.0131 acc_corrupt_t_0p8_1p0=0.2832 corrupt_frac_t_0p8_1p0=0.0078 loss_all=5.1062 init_gold_top10=0.0580 init_gold_top100=0.3017
192
+ step=500 epoch=500/1000 epoch_step=1/1 micro_steps=500 elapsed=3.9s lr=2.000000e-03 loss=4.5989 loss_recon=4.5989 loss_meanflow=0.0000 mean_model_t=0.2071 mean_corrupt_t=0.2071 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.0918 corrupt_frac=1.0000 acc_corrupt=0.0918 loss_corrupt=4.5989 wrong_frac=0.9428 init_acc_corrupt=0.0415 acc_corrupt_t_0p0_0p2=0.0605 corrupt_frac_t_0p0_0p2=0.5632 acc_corrupt_t_0p2_0p4=0.1060 corrupt_frac_t_0p2_0p4=0.3546 acc_corrupt_t_0p4_0p6=0.2227 corrupt_frac_t_0p4_0p6=0.0745 acc_corrupt_t_0p6_0p8=0.4495 corrupt_frac_t_0p6_0p8=0.0118 acc_corrupt_t_0p8_1p0=0.6602 corrupt_frac_t_0p8_1p0=0.0078 out_w_norm=5.0397 out_g_norm=0.6465 loss_all=4.0205 init_gold_top10=0.0772 init_gold_top100=0.3108
193
+ step=600 epoch=600/1000 epoch_step=1/1 micro_steps=600 elapsed=3.8s lr=2.000000e-03 loss=3.5699 loss_recon=3.5699 loss_meanflow=0.0000 mean_model_t=0.2074 mean_corrupt_t=0.2074 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.1335 corrupt_frac=1.0000 acc_corrupt=0.1335 loss_corrupt=3.5699 wrong_frac=0.9417 init_acc_corrupt=0.0417 acc_corrupt_t_0p0_0p2=0.0824 corrupt_frac_t_0p0_0p2=0.5617 acc_corrupt_t_0p2_0p4=0.1662 corrupt_frac_t_0p2_0p4=0.3570 acc_corrupt_t_0p4_0p6=0.3225 corrupt_frac_t_0p4_0p6=0.0729 acc_corrupt_t_0p6_0p8=0.5217 corrupt_frac_t_0p6_0p8=0.0124 out_w_norm=6.1912 out_g_norm=0.7056 acc_corrupt_t_0p8_1p0=0.8555 corrupt_frac_t_0p8_1p0=0.0078 loss_all=3.0804 init_gold_top10=0.0662 init_gold_top100=0.3149
194
+ step=700 epoch=700/1000 epoch_step=1/1 micro_steps=700 elapsed=3.8s lr=2.000000e-03 loss=2.6595 loss_recon=2.6595 loss_meanflow=0.0000 mean_model_t=0.2094 mean_corrupt_t=0.2094 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.1997 corrupt_frac=1.0000 acc_corrupt=0.1997 loss_corrupt=2.6595 wrong_frac=0.9418 init_acc_corrupt=0.0418 acc_corrupt_t_0p0_0p2=0.1268 corrupt_frac_t_0p0_0p2=0.5550 acc_corrupt_t_0p2_0p4=0.2588 corrupt_frac_t_0p2_0p4=0.3587 acc_corrupt_t_0p4_0p6=0.4077 corrupt_frac_t_0p4_0p6=0.0781 acc_corrupt_t_0p6_0p8=0.5679 corrupt_frac_t_0p6_0p8=0.0129 out_w_norm=7.1650 out_g_norm=1.0429 acc_corrupt_t_0p8_1p0=0.9141 corrupt_frac_t_0p8_1p0=0.0078 loss_all=2.3141 init_gold_top10=0.0560 init_gold_top100=0.2892
195
+ step=800 epoch=800/1000 epoch_step=1/1 micro_steps=800 elapsed=3.9s lr=2.000000e-03 loss=2.0909 loss_recon=2.0909 loss_meanflow=0.0000 mean_model_t=0.2099 mean_corrupt_t=0.2099 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.2497 corrupt_frac=1.0000 acc_corrupt=0.2497 loss_corrupt=2.0909 wrong_frac=0.9409 init_acc_corrupt=0.0433 acc_corrupt_t_0p0_0p2=0.1565 corrupt_frac_t_0p0_0p2=0.5548 acc_corrupt_t_0p2_0p4=0.3269 corrupt_frac_t_0p2_0p4=0.3585 acc_corrupt_t_0p4_0p6=0.5176 corrupt_frac_t_0p4_0p6=0.0790 out_w_norm=7.8313 out_g_norm=1.3524 acc_corrupt_t_0p6_0p8=0.6160 corrupt_frac_t_0p6_0p8=0.0130 acc_corrupt_t_0p8_1p0=0.9609 corrupt_frac_t_0p8_1p0=0.0078 loss_all=1.9063 init_gold_top10=0.0728 init_gold_top100=0.2933
196
+ step=900 epoch=900/1000 epoch_step=1/1 micro_steps=900 elapsed=3.9s lr=2.000000e-03 loss=1.9275 loss_recon=1.9275 loss_meanflow=0.0000 mean_model_t=0.2107 mean_corrupt_t=0.2107 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.2675 corrupt_frac=1.0000 acc_corrupt=0.2675 loss_corrupt=1.9275 wrong_frac=0.9393 init_acc_corrupt=0.0447 acc_corrupt_t_0p0_0p2=0.1625 corrupt_frac_t_0p0_0p2=0.5521 acc_corrupt_t_0p2_0p4=0.3606 corrupt_frac_t_0p2_0p4=0.3599 acc_corrupt_t_0p4_0p6=0.5215 corrupt_frac_t_0p4_0p6=0.0788 acc_corrupt_t_0p6_0p8=0.7558 corrupt_frac_t_0p6_0p8=0.0133 out_w_norm=8.1274 out_g_norm=1.3974 acc_corrupt_t_0p8_1p0=0.6549 corrupt_frac_t_0p8_1p0=0.0078 loss_all=2.0105 init_gold_top10=0.0489 init_gold_top100=0.2866
197
+ step=1000 epoch=1000/1000 epoch_step=1/1 micro_steps=1000 elapsed=3.9s lr=2.000000e-03 loss=1.8480 loss_recon=1.8480 loss_meanflow=0.0000 mean_model_t=0.2112 mean_corrupt_t=0.2112 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.2761 corrupt_frac=1.0000 acc_corrupt=0.2761 loss_corrupt=1.8480 wrong_frac=0.9392 init_acc_corrupt=0.0447 acc_corrupt_t_0p0_0p2=0.1696 corrupt_frac_t_0p0_0p2=0.5516 acc_corrupt_t_0p2_0p4=0.3669 corrupt_frac_t_0p2_0p4=0.3570 acc_corrupt_t_0p4_0p6=0.5556 corrupt_frac_t_0p4_0p6=0.0810 out_w_norm=8.2662 out_g_norm=1.1505 acc_corrupt_t_0p6_0p8=0.6375 corrupt_frac_t_0p6_0p8=0.0137 acc_corrupt_t_0p8_1p0=0.4414 corrupt_frac_t_0p8_1p0=0.0078 loss_all=1.7700 init_gold_top10=0.0757 init_gold_top100=0.3064
198
+ NCCL version 2.25.1+cuda12.8
199
+ resumed_from=runs/train8_noisegeo_len256_allcorrupt_seqrand1_dirC1_1024_20260517_163805/latest.pt start_step=1001
200
+ {
201
+ "device": "cuda:0",
202
+ "rank": 0,
203
+ "world_size": 4,
204
+ "samples": "owt_cached_chunks:8",
205
+ "vocab_size": 969,
206
+ "tokenizer_vocab_size": 50257,
207
+ "save_dir": "runs/train8_noisegeo_len256_allcorrupt_seqrand1_dirC1_1024_20260517_163805",
208
+ "batch_size": 128,
209
+ "grad_accum": 1,
210
+ "effective_batch_size": 512,
211
+ "global_batch_size": 512,
212
+ "lr_schedule": "constant_warmup",
213
+ "optimizer": "muon",
214
+ "epochs": 0.0,
215
+ "steps_per_epoch": 1,
216
+ "total_steps": 2000,
217
+ "warmup_steps": 10,
218
+ "warmup_epochs": -1.0,
219
+ "min_lr": 0.0,
220
+ "weight_decay": 0.1,
221
+ "output_weight_decay": -1.0,
222
+ "adamw_param_groups": "nanogpt",
223
+ "adam_beta1": 0.9,
224
+ "adam_beta2": 0.95,
225
+ "adam_eps": 1e-08,
226
+ "muon_impl": "legacy",
227
+ "muon_momentum": 0.95,
228
+ "muon_ns_steps": 5,
229
+ "muon_update_scale": 1.0,
230
+ "muon_nesterov": false,
231
+ "muon_width_scale": false,
232
+ "muon_grouping": "legacy_dim_ge_2",
233
+ "muon_param_count": 1965440,
234
+ "muon_adam_param_count": 8192,
235
+ "muon_param_names": [
236
+ "vocab_embed.embedding",
237
+ "sigma_map.net.0.weight",
238
+ "sigma_map.net.2.weight",
239
+ "blocks.0.attn_qkv.weight",
240
+ "blocks.0.attn_out.weight",
241
+ "blocks.0.mlp.0.weight",
242
+ "blocks.0.mlp.2.weight",
243
+ "blocks.0.adaLN_modulation.weight",
244
+ "blocks.1.attn_qkv.weight",
245
+ "blocks.1.attn_out.weight",
246
+ "blocks.1.mlp.0.weight",
247
+ "blocks.1.mlp.2.weight",
248
+ "blocks.1.adaLN_modulation.weight",
249
+ "blocks.2.attn_qkv.weight",
250
+ "blocks.2.attn_out.weight",
251
+ "blocks.2.mlp.0.weight",
252
+ "blocks.2.mlp.2.weight",
253
+ "blocks.2.adaLN_modulation.weight",
254
+ "output_layer.linear.weight",
255
+ "output_layer.adaLN_modulation.weight"
256
+ ],
257
+ "muon_adam_param_names": [
258
+ "sigma_map.net.0.bias",
259
+ "sigma_map.net.2.bias",
260
+ "blocks.0.norm1.weight",
261
+ "blocks.0.norm2.weight",
262
+ "blocks.0.mlp.0.bias",
263
+ "blocks.0.mlp.2.bias",
264
+ "blocks.0.adaLN_modulation.bias",
265
+ "blocks.1.norm1.weight",
266
+ "blocks.1.norm2.weight",
267
+ "blocks.1.mlp.0.bias",
268
+ "blocks.1.mlp.2.bias",
269
+ "blocks.1.adaLN_modulation.bias",
270
+ "blocks.2.norm1.weight",
271
+ "blocks.2.norm2.weight",
272
+ "blocks.2.mlp.0.bias",
273
+ "blocks.2.mlp.2.bias",
274
+ "blocks.2.adaLN_modulation.bias",
275
+ "output_layer.norm_final.weight",
276
+ "output_layer.adaLN_modulation.bias"
277
+ ],
278
+ "muon_effective_nesterov": false,
279
+ "muon_effective_width_scale": false,
280
+ "muon_effective_weight_decay": 0.1,
281
+ "muon_adam_fallback_nesterov": false,
282
+ "muon_adam_fallback_weight_decay": 0.1,
283
+ "ema_decay": 0.9999,
284
+ "ema_start_step": 0,
285
+ "model_type": "ddit",
286
+ "ddit_mlp_type": "gelu",
287
+ "elf_num_time_tokens": 4,
288
+ "elf_num_model_mode_tokens": 0,
289
+ "qk_norm": true,
290
+ "output_bias": false,
291
+ "output_init_std": -1.0,
292
+ "norm_type": "rmsnorm",
293
+ "target_loss": "hard_ce",
294
+ "linear_soft_target_power": 1.0,
295
+ "linear_soft_target_min_conf": 0.0,
296
+ "linear_soft_target_max_conf": 1.0,
297
+ "t_sampling_mode": "logit_normal",
298
+ "t_sampling_power": 1.0,
299
+ "t_sampling_eps": 0.0001,
300
+ "t_sampling_logit_mean": -1.5,
301
+ "t_sampling_logit_std": 0.8,
302
+ "dual_t": true,
303
+ "corrupt_t_mode": "same",
304
+ "corrupt_min_t": 0.0,
305
+ "corrupt_max_t": 1.0,
306
+ "prefix_block_prob": 0.0,
307
+ "prefix_block_len": 128,
308
+ "mask_ratio_floor_schedule": "none",
309
+ "dirichlet_endpoint_mode": "categorical_dual_t",
310
+ "dirichlet_semantic_t_mode": "same",
311
+ "dirichlet_semantic_t_value": 0.0,
312
+ "dirichlet_semantic_t_curve": "linear",
313
+ "dirichlet_semantic_t_power": 1.0,
314
+ "endpoint_sequence_random_prob_alpha": 1.0,
315
+ "categorical_wrong_from_full_vocab": true,
316
+ "categorical_wrong_from_batch_valid_tokens": false,
317
+ "categorical_wrong_basin_token_ids": "",
318
+ "categorical_wrong_basin_prob": 0.0,
319
+ "categorical_wrong_unigram_prob": 0.0,
320
+ "categorical_wrong_uniform_prob": 0.0,
321
+ "categorical_wrong_corpus_unigram_path": "",
322
+ "categorical_wrong_corpus_unigram_alpha": 1.0,
323
+ "categorical_wrong_basin_shared_prob": 0.0,
324
+ "categorical_wrong_unigram_shared_prob": 0.0,
325
+ "mask_mixture_original_prob": 0.0,
326
+ "mask_mixture_lowk_prob": 0.0,
327
+ "mask_mixture_lowcorrupt_prob": 0.0,
328
+ "mask_mixture_block_prob": 0.0,
329
+ "mask_mixture_all_prob": 1.0,
330
+ "mask_mixture_lowk_clean_tokens": "0",
331
+ "mask_mixture_lowcorrupt_tokens": "1,2,4,8,16,32,64",
332
+ "mask_mixture_block_tokens": "64,128",
333
+ "simplex_bridge_sampler": "dirichlet",
334
+ "logistic_normal_sigma_min": 0.1,
335
+ "logistic_normal_sigma_max": 1.0,
336
+ "logistic_normal_tau_min": 1.0,
337
+ "logistic_normal_tau_max": 1.0,
338
+ "torch_compile": false,
339
+ "compile_mode": "max-autotune",
340
+ "state_format": "prob",
341
+ "meanflow_weight": 0.0,
342
+ "rollout_train_prob": 0.0,
343
+ "rollout_train_steps": 1,
344
+ "rollout_train_infer_steps": 64,
345
+ "rollout_train_temp": 1.45,
346
+ "rollout_train_max_gamma": 1.0,
347
+ "rollout_train_corrupt_only": true,
348
+ "rollout_train_samplewise": false,
349
+ "rollout_train_compute_always": false,
350
+ "bridge_noise_init": "logistic_normal",
351
+ "noise_sigma": -1.0,
352
+ "allow_tf32": true,
353
+ "activation_checkpointing": false,
354
+ "activation_checkpoint_interval": 1,
355
+ "activation_checkpoint_scope": "block",
356
+ "ddp_static_graph": false,
357
+ "ddp_gradient_as_bucket_view": true,
358
+ "blocking_data_transfer": false,
359
+ "dataloader_prefetch_factor": 4,
360
+ "full_train_stats": false,
361
+ "tokenized_hf": false,
362
+ "tokenized_pad_token": "pad",
363
+ "elf_conditional_hf": false,
364
+ "record_pad_truncate": false,
365
+ "record_add_eos": false,
366
+ "record_add_special_tokens": false,
367
+ "record_pad_token": "pad",
368
+ "record_shuffle_buffer": 10000,
369
+ "wrap": true,
370
+ "wrap_mode": "stream",
371
+ "wrap_record_buffer_size": 200,
372
+ "owt_cached_chunks": true,
373
+ "owt_chunk_cache_dir": "/e2e-data/evad-tech-vla/wanghan58/data/small_benchmarks/langflow_2604_11748/openwebtext_lta_cached_chunks/gpt2_len256_train8_compact_overfit",
374
+ "owt_chunk_cache_rebuild": false,
375
+ "owt_chunk_cache_write_batch": 4096,
376
+ "owt_exact_repeat_per_chunk": 64,
377
+ "online_chunk_shuffle": false,
378
+ "online_chunk_shuffle_buffer": 10000,
379
+ "openwebtext_split": "train_minus_100k",
380
+ "detokenizer": "auto",
381
+ "resolved_detokenizer": null,
382
+ "num_workers": 0,
383
+ "latest_every": 1000,
384
+ "resume_path": "runs/train8_noisegeo_len256_allcorrupt_seqrand1_dirC1_1024_20260517_163805/latest.pt"
385
+ }
386
+ step=1100 epoch=1100/2000 epoch_step=1/1 micro_steps=1100 elapsed=4.4s lr=2.000000e-03 loss=1.8279 loss_recon=1.8279 loss_meanflow=0.0000 mean_model_t=0.2083 mean_corrupt_t=0.2083 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.2734 corrupt_frac=1.0000 acc_corrupt=0.2734 loss_corrupt=1.8279 wrong_frac=0.9425 init_acc_corrupt=0.0414 acc_corrupt_t_0p0_0p2=0.1732 corrupt_frac_t_0p0_0p2=0.5559 acc_corrupt_t_0p2_0p4=0.3623 corrupt_frac_t_0p2_0p4=0.3589 acc_corrupt_t_0p4_0p6=0.5357 corrupt_frac_t_0p4_0p6=0.0773 acc_corrupt_t_0p6_0p8=0.7117 corrupt_frac_t_0p6_0p8=0.0121 out_w_norm=8.3859 out_g_norm=1.0143 loss_all=1.8020 init_gold_top10=0.0613 init_gold_top100=0.2903
387
+ step=1200 epoch=1200/2000 epoch_step=1/1 micro_steps=1200 elapsed=3.7s lr=2.000000e-03 loss=1.7941 loss_recon=1.7941 loss_meanflow=0.0000 mean_model_t=0.2108 mean_corrupt_t=0.2108 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.2825 corrupt_frac=1.0000 acc_corrupt=0.2825 loss_corrupt=1.7941 wrong_frac=0.9405 init_acc_corrupt=0.0435 acc_corrupt_t_0p0_0p2=0.1786 corrupt_frac_t_0p0_0p2=0.5516 acc_corrupt_t_0p2_0p4=0.3733 corrupt_frac_t_0p2_0p4=0.3621 acc_corrupt_t_0p4_0p6=0.5432 corrupt_frac_t_0p4_0p6=0.0781 acc_corrupt_t_0p6_0p8=0.7633 corrupt_frac_t_0p6_0p8=0.0123 out_w_norm=8.5003 out_g_norm=0.8753 acc_corrupt_t_0p8_1p0=0.9935 corrupt_frac_t_0p8_1p0=0.0078 loss_all=1.7603 init_gold_top10=0.0640 init_gold_top100=0.3024
388
+ step=1300 epoch=1300/2000 epoch_step=1/1 micro_steps=1300 elapsed=3.7s lr=2.000000e-03 loss=1.7990 loss_recon=1.7990 loss_meanflow=0.0000 mean_model_t=0.2067 mean_corrupt_t=0.2067 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.2789 corrupt_frac=1.0000 acc_corrupt=0.2789 loss_corrupt=1.7990 wrong_frac=0.9431 init_acc_corrupt=0.0408 acc_corrupt_t_0p0_0p2=0.1806 corrupt_frac_t_0p0_0p2=0.5641 acc_corrupt_t_0p2_0p4=0.3699 corrupt_frac_t_0p2_0p4=0.3542 acc_corrupt_t_0p4_0p6=0.5414 corrupt_frac_t_0p4_0p6=0.0734 acc_corrupt_t_0p6_0p8=0.7539 corrupt_frac_t_0p6_0p8=0.0121 out_w_norm=8.5977 out_g_norm=0.8285 acc_corrupt_t_0p8_1p0=1.0000 corrupt_frac_t_0p8_1p0=0.0078 loss_all=1.8094 init_gold_top10=0.0671 init_gold_top100=0.3087
389
+ step=1400 epoch=1400/2000 epoch_step=1/1 micro_steps=1400 elapsed=3.7s lr=2.000000e-03 loss=1.7856 loss_recon=1.7856 loss_meanflow=0.0000 mean_model_t=0.2085 mean_corrupt_t=0.2085 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.2820 corrupt_frac=1.0000 acc_corrupt=0.2820 loss_corrupt=1.7856 wrong_frac=0.9416 init_acc_corrupt=0.0425 acc_corrupt_t_0p0_0p2=0.1793 corrupt_frac_t_0p0_0p2=0.5564 acc_corrupt_t_0p2_0p4=0.3725 corrupt_frac_t_0p2_0p4=0.3620 acc_corrupt_t_0p4_0p6=0.5668 corrupt_frac_t_0p4_0p6=0.0719 out_w_norm=8.6908 out_g_norm=0.7441 acc_corrupt_t_0p6_0p8=0.6845 corrupt_frac_t_0p6_0p8=0.0131 acc_corrupt_t_0p8_1p0=0.6055 corrupt_frac_t_0p8_1p0=0.0078 loss_all=1.7903 init_gold_top10=0.0580 init_gold_top100=0.3017
390
+ step=1500 epoch=1500/2000 epoch_step=1/1 micro_steps=1500 elapsed=3.7s lr=2.000000e-03 loss=1.7828 loss_recon=1.7828 loss_meanflow=0.0000 mean_model_t=0.2071 mean_corrupt_t=0.2071 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.2815 corrupt_frac=1.0000 acc_corrupt=0.2815 loss_corrupt=1.7828 wrong_frac=0.9428 init_acc_corrupt=0.0415 acc_corrupt_t_0p0_0p2=0.1831 corrupt_frac_t_0p0_0p2=0.5632 acc_corrupt_t_0p2_0p4=0.3734 corrupt_frac_t_0p2_0p4=0.3546 acc_corrupt_t_0p4_0p6=0.5355 corrupt_frac_t_0p4_0p6=0.0745 acc_corrupt_t_0p6_0p8=0.7821 corrupt_frac_t_0p6_0p8=0.0118 acc_corrupt_t_0p8_1p0=0.9961 corrupt_frac_t_0p8_1p0=0.0078 out_w_norm=8.7883 out_g_norm=0.6961 loss_all=1.7305 init_gold_top10=0.0772 init_gold_top100=0.3108
391
+ step=1600 epoch=1600/2000 epoch_step=1/1 micro_steps=1600 elapsed=3.7s lr=2.000000e-03 loss=1.7678 loss_recon=1.7678 loss_meanflow=0.0000 mean_model_t=0.2074 mean_corrupt_t=0.2074 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.2869 corrupt_frac=1.0000 acc_corrupt=0.2869 loss_corrupt=1.7678 wrong_frac=0.9417 init_acc_corrupt=0.0417 acc_corrupt_t_0p0_0p2=0.1868 corrupt_frac_t_0p0_0p2=0.5617 acc_corrupt_t_0p2_0p4=0.3837 corrupt_frac_t_0p2_0p4=0.3570 acc_corrupt_t_0p4_0p6=0.5346 corrupt_frac_t_0p4_0p6=0.0729 acc_corrupt_t_0p6_0p8=0.7138 corrupt_frac_t_0p6_0p8=0.0124 out_w_norm=8.8863 out_g_norm=0.6478 acc_corrupt_t_0p8_1p0=0.8867 corrupt_frac_t_0p8_1p0=0.0078 loss_all=1.7834 init_gold_top10=0.0662 init_gold_top100=0.3149
392
+ step=1700 epoch=1700/2000 epoch_step=1/1 micro_steps=1700 elapsed=3.7s lr=2.000000e-03 loss=1.7618 loss_recon=1.7618 loss_meanflow=0.0000 mean_model_t=0.2094 mean_corrupt_t=0.2094 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.2888 corrupt_frac=1.0000 acc_corrupt=0.2888 loss_corrupt=1.7618 wrong_frac=0.9418 init_acc_corrupt=0.0418 acc_corrupt_t_0p0_0p2=0.1874 corrupt_frac_t_0p0_0p2=0.5550 acc_corrupt_t_0p2_0p4=0.3848 corrupt_frac_t_0p2_0p4=0.3587 acc_corrupt_t_0p4_0p6=0.5286 corrupt_frac_t_0p4_0p6=0.0781 acc_corrupt_t_0p6_0p8=0.6649 corrupt_frac_t_0p6_0p8=0.0129 out_w_norm=8.9699 out_g_norm=0.6192 acc_corrupt_t_0p8_1p0=0.9961 corrupt_frac_t_0p8_1p0=0.0078 loss_all=1.8032 init_gold_top10=0.0560 init_gold_top100=0.2892
393
+ step=1800 epoch=1800/2000 epoch_step=1/1 micro_steps=1800 elapsed=3.7s lr=2.000000e-03 loss=1.7584 loss_recon=1.7584 loss_meanflow=0.0000 mean_model_t=0.2099 mean_corrupt_t=0.2099 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.2899 corrupt_frac=1.0000 acc_corrupt=0.2899 loss_corrupt=1.7584 wrong_frac=0.9409 init_acc_corrupt=0.0433 acc_corrupt_t_0p0_0p2=0.1882 corrupt_frac_t_0p0_0p2=0.5548 acc_corrupt_t_0p2_0p4=0.3789 corrupt_frac_t_0p2_0p4=0.3585 acc_corrupt_t_0p4_0p6=0.5646 corrupt_frac_t_0p4_0p6=0.0790 out_w_norm=9.0504 out_g_norm=0.6281 acc_corrupt_t_0p6_0p8=0.6550 corrupt_frac_t_0p6_0p8=0.0130 acc_corrupt_t_0p8_1p0=0.9414 corrupt_frac_t_0p8_1p0=0.0078 loss_all=1.6919 init_gold_top10=0.0728 init_gold_top100=0.2933
394
+ step=1900 epoch=1900/2000 epoch_step=1/1 micro_steps=1900 elapsed=3.7s lr=2.000000e-03 loss=1.7424 loss_recon=1.7424 loss_meanflow=0.0000 mean_model_t=0.2107 mean_corrupt_t=0.2107 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.2959 corrupt_frac=1.0000 acc_corrupt=0.2959 loss_corrupt=1.7424 wrong_frac=0.9393 init_acc_corrupt=0.0447 acc_corrupt_t_0p0_0p2=0.1908 corrupt_frac_t_0p0_0p2=0.5521 acc_corrupt_t_0p2_0p4=0.3919 corrupt_frac_t_0p2_0p4=0.3599 acc_corrupt_t_0p4_0p6=0.5379 corrupt_frac_t_0p4_0p6=0.0788 acc_corrupt_t_0p6_0p8=0.7762 corrupt_frac_t_0p6_0p8=0.0133 out_w_norm=9.1147 out_g_norm=0.6102 acc_corrupt_t_0p8_1p0=0.6771 corrupt_frac_t_0p8_1p0=0.0078 loss_all=1.8825 init_gold_top10=0.0489 init_gold_top100=0.2866
395
+ step=2000 epoch=2000/2000 epoch_step=1/1 micro_steps=2000 elapsed=3.7s lr=2.000000e-03 loss=1.7359 loss_recon=1.7359 loss_meanflow=0.0000 mean_model_t=0.2112 mean_corrupt_t=0.2112 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.2968 corrupt_frac=1.0000 acc_corrupt=0.2968 loss_corrupt=1.7359 wrong_frac=0.9392 init_acc_corrupt=0.0447 acc_corrupt_t_0p0_0p2=0.1921 corrupt_frac_t_0p0_0p2=0.5516 acc_corrupt_t_0p2_0p4=0.3879 corrupt_frac_t_0p2_0p4=0.3570 acc_corrupt_t_0p4_0p6=0.5651 corrupt_frac_t_0p4_0p6=0.0810 out_w_norm=9.2008 out_g_norm=0.5321 acc_corrupt_t_0p6_0p8=0.6451 corrupt_frac_t_0p6_0p8=0.0137 acc_corrupt_t_0p8_1p0=0.4305 corrupt_frac_t_0p8_1p0=0.0078 loss_all=1.7197 init_gold_top10=0.0757 init_gold_top100=0.3064
396
+ NCCL version 2.25.1+cuda12.8
397
+ resumed_from=runs/train8_noisegeo_len256_allcorrupt_seqrand1_dirC1_1024_20260517_163805/latest.pt start_step=2001
398
+ {
399
+ "device": "cuda:0",
400
+ "rank": 0,
401
+ "world_size": 4,
402
+ "samples": "owt_cached_chunks:8",
403
+ "vocab_size": 969,
404
+ "tokenizer_vocab_size": 50257,
405
+ "save_dir": "runs/train8_noisegeo_len256_allcorrupt_seqrand1_dirC1_1024_20260517_163805",
406
+ "batch_size": 128,
407
+ "grad_accum": 1,
408
+ "effective_batch_size": 512,
409
+ "global_batch_size": 512,
410
+ "lr_schedule": "constant_warmup",
411
+ "optimizer": "muon",
412
+ "epochs": 0.0,
413
+ "steps_per_epoch": 1,
414
+ "total_steps": 3000,
415
+ "warmup_steps": 10,
416
+ "warmup_epochs": -1.0,
417
+ "min_lr": 0.0,
418
+ "weight_decay": 0.1,
419
+ "output_weight_decay": -1.0,
420
+ "adamw_param_groups": "nanogpt",
421
+ "adam_beta1": 0.9,
422
+ "adam_beta2": 0.95,
423
+ "adam_eps": 1e-08,
424
+ "muon_impl": "legacy",
425
+ "muon_momentum": 0.95,
426
+ "muon_ns_steps": 5,
427
+ "muon_update_scale": 1.0,
428
+ "muon_nesterov": false,
429
+ "muon_width_scale": false,
430
+ "muon_grouping": "legacy_dim_ge_2",
431
+ "muon_param_count": 1965440,
432
+ "muon_adam_param_count": 8192,
433
+ "muon_param_names": [
434
+ "vocab_embed.embedding",
435
+ "sigma_map.net.0.weight",
436
+ "sigma_map.net.2.weight",
437
+ "blocks.0.attn_qkv.weight",
438
+ "blocks.0.attn_out.weight",
439
+ "blocks.0.mlp.0.weight",
440
+ "blocks.0.mlp.2.weight",
441
+ "blocks.0.adaLN_modulation.weight",
442
+ "blocks.1.attn_qkv.weight",
443
+ "blocks.1.attn_out.weight",
444
+ "blocks.1.mlp.0.weight",
445
+ "blocks.1.mlp.2.weight",
446
+ "blocks.1.adaLN_modulation.weight",
447
+ "blocks.2.attn_qkv.weight",
448
+ "blocks.2.attn_out.weight",
449
+ "blocks.2.mlp.0.weight",
450
+ "blocks.2.mlp.2.weight",
451
+ "blocks.2.adaLN_modulation.weight",
452
+ "output_layer.linear.weight",
453
+ "output_layer.adaLN_modulation.weight"
454
+ ],
455
+ "muon_adam_param_names": [
456
+ "sigma_map.net.0.bias",
457
+ "sigma_map.net.2.bias",
458
+ "blocks.0.norm1.weight",
459
+ "blocks.0.norm2.weight",
460
+ "blocks.0.mlp.0.bias",
461
+ "blocks.0.mlp.2.bias",
462
+ "blocks.0.adaLN_modulation.bias",
463
+ "blocks.1.norm1.weight",
464
+ "blocks.1.norm2.weight",
465
+ "blocks.1.mlp.0.bias",
466
+ "blocks.1.mlp.2.bias",
467
+ "blocks.1.adaLN_modulation.bias",
468
+ "blocks.2.norm1.weight",
469
+ "blocks.2.norm2.weight",
470
+ "blocks.2.mlp.0.bias",
471
+ "blocks.2.mlp.2.bias",
472
+ "blocks.2.adaLN_modulation.bias",
473
+ "output_layer.norm_final.weight",
474
+ "output_layer.adaLN_modulation.bias"
475
+ ],
476
+ "muon_effective_nesterov": false,
477
+ "muon_effective_width_scale": false,
478
+ "muon_effective_weight_decay": 0.1,
479
+ "muon_adam_fallback_nesterov": false,
480
+ "muon_adam_fallback_weight_decay": 0.1,
481
+ "ema_decay": 0.9999,
482
+ "ema_start_step": 0,
483
+ "model_type": "ddit",
484
+ "ddit_mlp_type": "gelu",
485
+ "elf_num_time_tokens": 4,
486
+ "elf_num_model_mode_tokens": 0,
487
+ "qk_norm": true,
488
+ "output_bias": false,
489
+ "output_init_std": -1.0,
490
+ "norm_type": "rmsnorm",
491
+ "target_loss": "hard_ce",
492
+ "linear_soft_target_power": 1.0,
493
+ "linear_soft_target_min_conf": 0.0,
494
+ "linear_soft_target_max_conf": 1.0,
495
+ "t_sampling_mode": "logit_normal",
496
+ "t_sampling_power": 1.0,
497
+ "t_sampling_eps": 0.0001,
498
+ "t_sampling_logit_mean": -1.5,
499
+ "t_sampling_logit_std": 0.8,
500
+ "dual_t": true,
501
+ "corrupt_t_mode": "same",
502
+ "corrupt_min_t": 0.0,
503
+ "corrupt_max_t": 1.0,
504
+ "prefix_block_prob": 0.0,
505
+ "prefix_block_len": 128,
506
+ "mask_ratio_floor_schedule": "none",
507
+ "dirichlet_endpoint_mode": "categorical_dual_t",
508
+ "dirichlet_semantic_t_mode": "same",
509
+ "dirichlet_semantic_t_value": 0.0,
510
+ "dirichlet_semantic_t_curve": "linear",
511
+ "dirichlet_semantic_t_power": 1.0,
512
+ "endpoint_sequence_random_prob_alpha": 1.0,
513
+ "categorical_wrong_from_full_vocab": true,
514
+ "categorical_wrong_from_batch_valid_tokens": false,
515
+ "categorical_wrong_basin_token_ids": "",
516
+ "categorical_wrong_basin_prob": 0.0,
517
+ "categorical_wrong_unigram_prob": 0.0,
518
+ "categorical_wrong_uniform_prob": 0.0,
519
+ "categorical_wrong_corpus_unigram_path": "",
520
+ "categorical_wrong_corpus_unigram_alpha": 1.0,
521
+ "categorical_wrong_basin_shared_prob": 0.0,
522
+ "categorical_wrong_unigram_shared_prob": 0.0,
523
+ "mask_mixture_original_prob": 0.0,
524
+ "mask_mixture_lowk_prob": 0.0,
525
+ "mask_mixture_lowcorrupt_prob": 0.0,
526
+ "mask_mixture_block_prob": 0.0,
527
+ "mask_mixture_all_prob": 1.0,
528
+ "mask_mixture_lowk_clean_tokens": "0",
529
+ "mask_mixture_lowcorrupt_tokens": "1,2,4,8,16,32,64",
530
+ "mask_mixture_block_tokens": "64,128",
531
+ "simplex_bridge_sampler": "dirichlet",
532
+ "logistic_normal_sigma_min": 0.1,
533
+ "logistic_normal_sigma_max": 1.0,
534
+ "logistic_normal_tau_min": 1.0,
535
+ "logistic_normal_tau_max": 1.0,
536
+ "torch_compile": false,
537
+ "compile_mode": "max-autotune",
538
+ "state_format": "prob",
539
+ "meanflow_weight": 0.0,
540
+ "rollout_train_prob": 0.0,
541
+ "rollout_train_steps": 1,
542
+ "rollout_train_infer_steps": 64,
543
+ "rollout_train_temp": 1.45,
544
+ "rollout_train_max_gamma": 1.0,
545
+ "rollout_train_corrupt_only": true,
546
+ "rollout_train_samplewise": false,
547
+ "rollout_train_compute_always": false,
548
+ "bridge_noise_init": "logistic_normal",
549
+ "noise_sigma": -1.0,
550
+ "allow_tf32": true,
551
+ "activation_checkpointing": false,
552
+ "activation_checkpoint_interval": 1,
553
+ "activation_checkpoint_scope": "block",
554
+ "ddp_static_graph": false,
555
+ "ddp_gradient_as_bucket_view": true,
556
+ "blocking_data_transfer": false,
557
+ "dataloader_prefetch_factor": 4,
558
+ "full_train_stats": false,
559
+ "tokenized_hf": false,
560
+ "tokenized_pad_token": "pad",
561
+ "elf_conditional_hf": false,
562
+ "record_pad_truncate": false,
563
+ "record_add_eos": false,
564
+ "record_add_special_tokens": false,
565
+ "record_pad_token": "pad",
566
+ "record_shuffle_buffer": 10000,
567
+ "wrap": true,
568
+ "wrap_mode": "stream",
569
+ "wrap_record_buffer_size": 200,
570
+ "owt_cached_chunks": true,
571
+ "owt_chunk_cache_dir": "/e2e-data/evad-tech-vla/wanghan58/data/small_benchmarks/langflow_2604_11748/openwebtext_lta_cached_chunks/gpt2_len256_train8_compact_overfit",
572
+ "owt_chunk_cache_rebuild": false,
573
+ "owt_chunk_cache_write_batch": 4096,
574
+ "owt_exact_repeat_per_chunk": 64,
575
+ "online_chunk_shuffle": false,
576
+ "online_chunk_shuffle_buffer": 10000,
577
+ "openwebtext_split": "train_minus_100k",
578
+ "detokenizer": "auto",
579
+ "resolved_detokenizer": null,
580
+ "num_workers": 0,
581
+ "latest_every": 1000,
582
+ "resume_path": "runs/train8_noisegeo_len256_allcorrupt_seqrand1_dirC1_1024_20260517_163805/latest.pt"
583
+ }
584
+ step=2100 epoch=2100/3000 epoch_step=1/1 micro_steps=2100 elapsed=4.6s lr=2.000000e-03 loss=1.7538 loss_recon=1.7538 loss_meanflow=0.0000 mean_model_t=0.2083 mean_corrupt_t=0.2083 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.2902 corrupt_frac=1.0000 acc_corrupt=0.2902 loss_corrupt=1.7538 wrong_frac=0.9425 init_acc_corrupt=0.0414 acc_corrupt_t_0p0_0p2=0.1937 corrupt_frac_t_0p0_0p2=0.5559 acc_corrupt_t_0p2_0p4=0.3767 corrupt_frac_t_0p2_0p4=0.3589 acc_corrupt_t_0p4_0p6=0.5383 corrupt_frac_t_0p4_0p6=0.0773 acc_corrupt_t_0p6_0p8=0.7216 corrupt_frac_t_0p6_0p8=0.0121 out_w_norm=9.2653 out_g_norm=0.5394 loss_all=1.7307 init_gold_top10=0.0613 init_gold_top100=0.2903
585
+ step=2200 epoch=2200/3000 epoch_step=1/1 micro_steps=2200 elapsed=3.8s lr=2.000000e-03 loss=1.7349 loss_recon=1.7349 loss_meanflow=0.0000 mean_model_t=0.2108 mean_corrupt_t=0.2108 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.2970 corrupt_frac=1.0000 acc_corrupt=0.2970 loss_corrupt=1.7349 wrong_frac=0.9405 init_acc_corrupt=0.0435 acc_corrupt_t_0p0_0p2=0.1987 corrupt_frac_t_0p0_0p2=0.5516 acc_corrupt_t_0p2_0p4=0.3825 corrupt_frac_t_0p2_0p4=0.3621 acc_corrupt_t_0p4_0p6=0.5441 corrupt_frac_t_0p4_0p6=0.0781 acc_corrupt_t_0p6_0p8=0.7718 corrupt_frac_t_0p6_0p8=0.0123 out_w_norm=9.3442 out_g_norm=0.5542 acc_corrupt_t_0p8_1p0=0.9948 corrupt_frac_t_0p8_1p0=0.0078 loss_all=1.7286 init_gold_top10=0.0640 init_gold_top100=0.3024
586
+ step=2300 epoch=2300/3000 epoch_step=1/1 micro_steps=2300 elapsed=3.8s lr=2.000000e-03 loss=1.7487 loss_recon=1.7487 loss_meanflow=0.0000 mean_model_t=0.2067 mean_corrupt_t=0.2067 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.2926 corrupt_frac=1.0000 acc_corrupt=0.2926 loss_corrupt=1.7487 wrong_frac=0.9431 init_acc_corrupt=0.0408 acc_corrupt_t_0p0_0p2=0.1970 corrupt_frac_t_0p0_0p2=0.5641 acc_corrupt_t_0p2_0p4=0.3814 corrupt_frac_t_0p2_0p4=0.3542 acc_corrupt_t_0p4_0p6=0.5462 corrupt_frac_t_0p4_0p6=0.0734 acc_corrupt_t_0p6_0p8=0.7510 corrupt_frac_t_0p6_0p8=0.0121 out_w_norm=9.4060 out_g_norm=0.5413 acc_corrupt_t_0p8_1p0=1.0000 corrupt_frac_t_0p8_1p0=0.0078 loss_all=1.7581 init_gold_top10=0.0671 init_gold_top100=0.3087
587
+ step=2400 epoch=2400/3000 epoch_step=1/1 micro_steps=2400 elapsed=3.8s lr=2.000000e-03 loss=1.7391 loss_recon=1.7391 loss_meanflow=0.0000 mean_model_t=0.2085 mean_corrupt_t=0.2085 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.2961 corrupt_frac=1.0000 acc_corrupt=0.2961 loss_corrupt=1.7391 wrong_frac=0.9416 init_acc_corrupt=0.0425 acc_corrupt_t_0p0_0p2=0.1990 corrupt_frac_t_0p0_0p2=0.5564 acc_corrupt_t_0p2_0p4=0.3808 corrupt_frac_t_0p2_0p4=0.3620 acc_corrupt_t_0p4_0p6=0.5690 corrupt_frac_t_0p4_0p6=0.0719 out_w_norm=9.4742 out_g_norm=0.5023 acc_corrupt_t_0p6_0p8=0.6785 corrupt_frac_t_0p6_0p8=0.0131 acc_corrupt_t_0p8_1p0=0.5977 corrupt_frac_t_0p8_1p0=0.0078 loss_all=1.7613 init_gold_top10=0.0580 init_gold_top100=0.3017
588
+ step=2500 epoch=2500/3000 epoch_step=1/1 micro_steps=2500 elapsed=3.8s lr=2.000000e-03 loss=1.7416 loss_recon=1.7416 loss_meanflow=0.0000 mean_model_t=0.2071 mean_corrupt_t=0.2071 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.2950 corrupt_frac=1.0000 acc_corrupt=0.2950 loss_corrupt=1.7416 wrong_frac=0.9428 init_acc_corrupt=0.0415 acc_corrupt_t_0p0_0p2=0.1996 corrupt_frac_t_0p0_0p2=0.5632 acc_corrupt_t_0p2_0p4=0.3850 corrupt_frac_t_0p2_0p4=0.3546 acc_corrupt_t_0p4_0p6=0.5367 corrupt_frac_t_0p4_0p6=0.0745 acc_corrupt_t_0p6_0p8=0.7857 corrupt_frac_t_0p6_0p8=0.0118 acc_corrupt_t_0p8_1p0=1.0000 corrupt_frac_t_0p8_1p0=0.0078 out_w_norm=9.5301 out_g_norm=0.4867 loss_all=1.7209 init_gold_top10=0.0772 init_gold_top100=0.3108
589
+ step=2600 epoch=2600/3000 epoch_step=1/1 micro_steps=2600 elapsed=3.8s lr=2.000000e-03 loss=1.7363 loss_recon=1.7363 loss_meanflow=0.0000 mean_model_t=0.2074 mean_corrupt_t=0.2074 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.2991 corrupt_frac=1.0000 acc_corrupt=0.2991 loss_corrupt=1.7363 wrong_frac=0.9417 init_acc_corrupt=0.0417 acc_corrupt_t_0p0_0p2=0.2031 corrupt_frac_t_0p0_0p2=0.5617 acc_corrupt_t_0p2_0p4=0.3909 corrupt_frac_t_0p2_0p4=0.3570 acc_corrupt_t_0p4_0p6=0.5407 corrupt_frac_t_0p4_0p6=0.0729 acc_corrupt_t_0p6_0p8=0.7247 corrupt_frac_t_0p6_0p8=0.0124 out_w_norm=9.5793 out_g_norm=0.4670 acc_corrupt_t_0p8_1p0=0.9922 corrupt_frac_t_0p8_1p0=0.0078 loss_all=1.7050 init_gold_top10=0.0662 init_gold_top100=0.3149
590
+ step=2700 epoch=2700/3000 epoch_step=1/1 micro_steps=2700 elapsed=3.9s lr=2.000000e-03 loss=1.7290 loss_recon=1.7290 loss_meanflow=0.0000 mean_model_t=0.2094 mean_corrupt_t=0.2094 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.3004 corrupt_frac=1.0000 acc_corrupt=0.3004 loss_corrupt=1.7290 wrong_frac=0.9418 init_acc_corrupt=0.0418 acc_corrupt_t_0p0_0p2=0.2042 corrupt_frac_t_0p0_0p2=0.5550 acc_corrupt_t_0p2_0p4=0.3911 corrupt_frac_t_0p2_0p4=0.3587 acc_corrupt_t_0p4_0p6=0.5295 corrupt_frac_t_0p4_0p6=0.0781 acc_corrupt_t_0p6_0p8=0.6610 corrupt_frac_t_0p6_0p8=0.0129 out_w_norm=9.6285 out_g_norm=0.5110 acc_corrupt_t_0p8_1p0=0.9805 corrupt_frac_t_0p8_1p0=0.0078 loss_all=1.8043 init_gold_top10=0.0560 init_gold_top100=0.2892
591
+ step=2800 epoch=2800/3000 epoch_step=1/1 micro_steps=2800 elapsed=3.9s lr=2.000000e-03 loss=1.7279 loss_recon=1.7279 loss_meanflow=0.0000 mean_model_t=0.2099 mean_corrupt_t=0.2099 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.3008 corrupt_frac=1.0000 acc_corrupt=0.3008 loss_corrupt=1.7279 wrong_frac=0.9409 init_acc_corrupt=0.0433 acc_corrupt_t_0p0_0p2=0.2024 corrupt_frac_t_0p0_0p2=0.5548 acc_corrupt_t_0p2_0p4=0.3876 corrupt_frac_t_0p2_0p4=0.3585 acc_corrupt_t_0p4_0p6=0.5629 corrupt_frac_t_0p4_0p6=0.0790 out_w_norm=9.6815 out_g_norm=0.4944 acc_corrupt_t_0p6_0p8=0.6531 corrupt_frac_t_0p6_0p8=0.0130 acc_corrupt_t_0p8_1p0=1.0000 corrupt_frac_t_0p8_1p0=0.0078 loss_all=1.6817 init_gold_top10=0.0728 init_gold_top100=0.2933
592
+ step=2900 epoch=2900/3000 epoch_step=1/1 micro_steps=2900 elapsed=3.8s lr=2.000000e-03 loss=1.7175 loss_recon=1.7175 loss_meanflow=0.0000 mean_model_t=0.2107 mean_corrupt_t=0.2107 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.3055 corrupt_frac=1.0000 acc_corrupt=0.3055 loss_corrupt=1.7175 wrong_frac=0.9393 init_acc_corrupt=0.0447 acc_corrupt_t_0p0_0p2=0.2046 corrupt_frac_t_0p0_0p2=0.5521 acc_corrupt_t_0p2_0p4=0.3960 corrupt_frac_t_0p2_0p4=0.3599 acc_corrupt_t_0p4_0p6=0.5443 corrupt_frac_t_0p4_0p6=0.0788 acc_corrupt_t_0p6_0p8=0.7752 corrupt_frac_t_0p6_0p8=0.0133 out_w_norm=9.7218 out_g_norm=0.4560 acc_corrupt_t_0p8_1p0=0.6745 corrupt_frac_t_0p8_1p0=0.0078 loss_all=1.8487 init_gold_top10=0.0489 init_gold_top100=0.2866
593
+ step=3000 epoch=3000/3000 epoch_step=1/1 micro_steps=3000 elapsed=3.8s lr=2.000000e-03 loss=1.7120 loss_recon=1.7120 loss_meanflow=0.0000 mean_model_t=0.2112 mean_corrupt_t=0.2112 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.3080 corrupt_frac=1.0000 acc_corrupt=0.3080 loss_corrupt=1.7120 wrong_frac=0.9392 init_acc_corrupt=0.0447 acc_corrupt_t_0p0_0p2=0.2085 corrupt_frac_t_0p0_0p2=0.5516 acc_corrupt_t_0p2_0p4=0.3944 corrupt_frac_t_0p2_0p4=0.3570 acc_corrupt_t_0p4_0p6=0.5629 corrupt_frac_t_0p4_0p6=0.0810 out_w_norm=9.7560 out_g_norm=0.4364 acc_corrupt_t_0p6_0p8=0.6472 corrupt_frac_t_0p6_0p8=0.0137 acc_corrupt_t_0p8_1p0=0.3563 corrupt_frac_t_0p8_1p0=0.0078 loss_all=1.6956 init_gold_top10=0.0757 init_gold_top100=0.3064
594
+ NCCL version 2.25.1+cuda12.8
595
+ resumed_from=runs/train8_noisegeo_len256_allcorrupt_seqrand1_dirC1_1024_20260517_163805/latest.pt start_step=3001
596
+ {
597
+ "device": "cuda:0",
598
+ "rank": 0,
599
+ "world_size": 4,
600
+ "samples": "owt_cached_chunks:8",
601
+ "vocab_size": 969,
602
+ "tokenizer_vocab_size": 50257,
603
+ "save_dir": "runs/train8_noisegeo_len256_allcorrupt_seqrand1_dirC1_1024_20260517_163805",
604
+ "batch_size": 128,
605
+ "grad_accum": 1,
606
+ "effective_batch_size": 512,
607
+ "global_batch_size": 512,
608
+ "lr_schedule": "constant_warmup",
609
+ "optimizer": "muon",
610
+ "epochs": 0.0,
611
+ "steps_per_epoch": 1,
612
+ "total_steps": 4000,
613
+ "warmup_steps": 10,
614
+ "warmup_epochs": -1.0,
615
+ "min_lr": 0.0,
616
+ "weight_decay": 0.1,
617
+ "output_weight_decay": -1.0,
618
+ "adamw_param_groups": "nanogpt",
619
+ "adam_beta1": 0.9,
620
+ "adam_beta2": 0.95,
621
+ "adam_eps": 1e-08,
622
+ "muon_impl": "legacy",
623
+ "muon_momentum": 0.95,
624
+ "muon_ns_steps": 5,
625
+ "muon_update_scale": 1.0,
626
+ "muon_nesterov": false,
627
+ "muon_width_scale": false,
628
+ "muon_grouping": "legacy_dim_ge_2",
629
+ "muon_param_count": 1965440,
630
+ "muon_adam_param_count": 8192,
631
+ "muon_param_names": [
632
+ "vocab_embed.embedding",
633
+ "sigma_map.net.0.weight",
634
+ "sigma_map.net.2.weight",
635
+ "blocks.0.attn_qkv.weight",
636
+ "blocks.0.attn_out.weight",
637
+ "blocks.0.mlp.0.weight",
638
+ "blocks.0.mlp.2.weight",
639
+ "blocks.0.adaLN_modulation.weight",
640
+ "blocks.1.attn_qkv.weight",
641
+ "blocks.1.attn_out.weight",
642
+ "blocks.1.mlp.0.weight",
643
+ "blocks.1.mlp.2.weight",
644
+ "blocks.1.adaLN_modulation.weight",
645
+ "blocks.2.attn_qkv.weight",
646
+ "blocks.2.attn_out.weight",
647
+ "blocks.2.mlp.0.weight",
648
+ "blocks.2.mlp.2.weight",
649
+ "blocks.2.adaLN_modulation.weight",
650
+ "output_layer.linear.weight",
651
+ "output_layer.adaLN_modulation.weight"
652
+ ],
653
+ "muon_adam_param_names": [
654
+ "sigma_map.net.0.bias",
655
+ "sigma_map.net.2.bias",
656
+ "blocks.0.norm1.weight",
657
+ "blocks.0.norm2.weight",
658
+ "blocks.0.mlp.0.bias",
659
+ "blocks.0.mlp.2.bias",
660
+ "blocks.0.adaLN_modulation.bias",
661
+ "blocks.1.norm1.weight",
662
+ "blocks.1.norm2.weight",
663
+ "blocks.1.mlp.0.bias",
664
+ "blocks.1.mlp.2.bias",
665
+ "blocks.1.adaLN_modulation.bias",
666
+ "blocks.2.norm1.weight",
667
+ "blocks.2.norm2.weight",
668
+ "blocks.2.mlp.0.bias",
669
+ "blocks.2.mlp.2.bias",
670
+ "blocks.2.adaLN_modulation.bias",
671
+ "output_layer.norm_final.weight",
672
+ "output_layer.adaLN_modulation.bias"
673
+ ],
674
+ "muon_effective_nesterov": false,
675
+ "muon_effective_width_scale": false,
676
+ "muon_effective_weight_decay": 0.1,
677
+ "muon_adam_fallback_nesterov": false,
678
+ "muon_adam_fallback_weight_decay": 0.1,
679
+ "ema_decay": 0.9999,
680
+ "ema_start_step": 0,
681
+ "model_type": "ddit",
682
+ "ddit_mlp_type": "gelu",
683
+ "elf_num_time_tokens": 4,
684
+ "elf_num_model_mode_tokens": 0,
685
+ "qk_norm": true,
686
+ "output_bias": false,
687
+ "output_init_std": -1.0,
688
+ "norm_type": "rmsnorm",
689
+ "target_loss": "hard_ce",
690
+ "linear_soft_target_power": 1.0,
691
+ "linear_soft_target_min_conf": 0.0,
692
+ "linear_soft_target_max_conf": 1.0,
693
+ "t_sampling_mode": "logit_normal",
694
+ "t_sampling_power": 1.0,
695
+ "t_sampling_eps": 0.0001,
696
+ "t_sampling_logit_mean": -1.5,
697
+ "t_sampling_logit_std": 0.8,
698
+ "dual_t": true,
699
+ "corrupt_t_mode": "same",
700
+ "corrupt_min_t": 0.0,
701
+ "corrupt_max_t": 1.0,
702
+ "prefix_block_prob": 0.0,
703
+ "prefix_block_len": 128,
704
+ "mask_ratio_floor_schedule": "none",
705
+ "dirichlet_endpoint_mode": "categorical_dual_t",
706
+ "dirichlet_semantic_t_mode": "same",
707
+ "dirichlet_semantic_t_value": 0.0,
708
+ "dirichlet_semantic_t_curve": "linear",
709
+ "dirichlet_semantic_t_power": 1.0,
710
+ "endpoint_sequence_random_prob_alpha": 1.0,
711
+ "categorical_wrong_from_full_vocab": true,
712
+ "categorical_wrong_from_batch_valid_tokens": false,
713
+ "categorical_wrong_basin_token_ids": "",
714
+ "categorical_wrong_basin_prob": 0.0,
715
+ "categorical_wrong_unigram_prob": 0.0,
716
+ "categorical_wrong_uniform_prob": 0.0,
717
+ "categorical_wrong_corpus_unigram_path": "",
718
+ "categorical_wrong_corpus_unigram_alpha": 1.0,
719
+ "categorical_wrong_basin_shared_prob": 0.0,
720
+ "categorical_wrong_unigram_shared_prob": 0.0,
721
+ "mask_mixture_original_prob": 0.0,
722
+ "mask_mixture_lowk_prob": 0.0,
723
+ "mask_mixture_lowcorrupt_prob": 0.0,
724
+ "mask_mixture_block_prob": 0.0,
725
+ "mask_mixture_all_prob": 1.0,
726
+ "mask_mixture_lowk_clean_tokens": "0",
727
+ "mask_mixture_lowcorrupt_tokens": "1,2,4,8,16,32,64",
728
+ "mask_mixture_block_tokens": "64,128",
729
+ "simplex_bridge_sampler": "dirichlet",
730
+ "logistic_normal_sigma_min": 0.1,
731
+ "logistic_normal_sigma_max": 1.0,
732
+ "logistic_normal_tau_min": 1.0,
733
+ "logistic_normal_tau_max": 1.0,
734
+ "torch_compile": false,
735
+ "compile_mode": "max-autotune",
736
+ "state_format": "prob",
737
+ "meanflow_weight": 0.0,
738
+ "rollout_train_prob": 0.0,
739
+ "rollout_train_steps": 1,
740
+ "rollout_train_infer_steps": 64,
741
+ "rollout_train_temp": 1.45,
742
+ "rollout_train_max_gamma": 1.0,
743
+ "rollout_train_corrupt_only": true,
744
+ "rollout_train_samplewise": false,
745
+ "rollout_train_compute_always": false,
746
+ "bridge_noise_init": "logistic_normal",
747
+ "noise_sigma": -1.0,
748
+ "allow_tf32": true,
749
+ "activation_checkpointing": false,
750
+ "activation_checkpoint_interval": 1,
751
+ "activation_checkpoint_scope": "block",
752
+ "ddp_static_graph": false,
753
+ "ddp_gradient_as_bucket_view": true,
754
+ "blocking_data_transfer": false,
755
+ "dataloader_prefetch_factor": 4,
756
+ "full_train_stats": false,
757
+ "tokenized_hf": false,
758
+ "tokenized_pad_token": "pad",
759
+ "elf_conditional_hf": false,
760
+ "record_pad_truncate": false,
761
+ "record_add_eos": false,
762
+ "record_add_special_tokens": false,
763
+ "record_pad_token": "pad",
764
+ "record_shuffle_buffer": 10000,
765
+ "wrap": true,
766
+ "wrap_mode": "stream",
767
+ "wrap_record_buffer_size": 200,
768
+ "owt_cached_chunks": true,
769
+ "owt_chunk_cache_dir": "/e2e-data/evad-tech-vla/wanghan58/data/small_benchmarks/langflow_2604_11748/openwebtext_lta_cached_chunks/gpt2_len256_train8_compact_overfit",
770
+ "owt_chunk_cache_rebuild": false,
771
+ "owt_chunk_cache_write_batch": 4096,
772
+ "owt_exact_repeat_per_chunk": 64,
773
+ "online_chunk_shuffle": false,
774
+ "online_chunk_shuffle_buffer": 10000,
775
+ "openwebtext_split": "train_minus_100k",
776
+ "detokenizer": "auto",
777
+ "resolved_detokenizer": null,
778
+ "num_workers": 0,
779
+ "latest_every": 1000,
780
+ "resume_path": "runs/train8_noisegeo_len256_allcorrupt_seqrand1_dirC1_1024_20260517_163805/latest.pt"
781
+ }
782
+ step=3100 epoch=3100/4000 epoch_step=1/1 micro_steps=3100 elapsed=4.4s lr=2.000000e-03 loss=1.7269 loss_recon=1.7269 loss_meanflow=0.0000 mean_model_t=0.2083 mean_corrupt_t=0.2083 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.2998 corrupt_frac=1.0000 acc_corrupt=0.2998 loss_corrupt=1.7269 wrong_frac=0.9425 init_acc_corrupt=0.0414 acc_corrupt_t_0p0_0p2=0.2078 corrupt_frac_t_0p0_0p2=0.5559 acc_corrupt_t_0p2_0p4=0.3817 corrupt_frac_t_0p2_0p4=0.3589 acc_corrupt_t_0p4_0p6=0.5389 corrupt_frac_t_0p4_0p6=0.0773 acc_corrupt_t_0p6_0p8=0.7192 corrupt_frac_t_0p6_0p8=0.0121 out_w_norm=9.8082 out_g_norm=0.4470 loss_all=1.7473 init_gold_top10=0.0613 init_gold_top100=0.2903
783
+ step=3200 epoch=3200/4000 epoch_step=1/1 micro_steps=3200 elapsed=3.7s lr=2.000000e-03 loss=1.7153 loss_recon=1.7153 loss_meanflow=0.0000 mean_model_t=0.2108 mean_corrupt_t=0.2108 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.3054 corrupt_frac=1.0000 acc_corrupt=0.3054 loss_corrupt=1.7153 wrong_frac=0.9405 init_acc_corrupt=0.0435 acc_corrupt_t_0p0_0p2=0.2104 corrupt_frac_t_0p0_0p2=0.5516 acc_corrupt_t_0p2_0p4=0.3872 corrupt_frac_t_0p2_0p4=0.3621 acc_corrupt_t_0p4_0p6=0.5475 corrupt_frac_t_0p4_0p6=0.0781 acc_corrupt_t_0p6_0p8=0.7701 corrupt_frac_t_0p6_0p8=0.0123 out_w_norm=9.8564 out_g_norm=0.4356 acc_corrupt_t_0p8_1p0=1.0000 corrupt_frac_t_0p8_1p0=0.0078 loss_all=1.7345 init_gold_top10=0.0640 init_gold_top100=0.3024
784
+ step=3300 epoch=3300/4000 epoch_step=1/1 micro_steps=3300 elapsed=3.7s lr=2.000000e-03 loss=1.7272 loss_recon=1.7272 loss_meanflow=0.0000 mean_model_t=0.2067 mean_corrupt_t=0.2067 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.2991 corrupt_frac=1.0000 acc_corrupt=0.2991 loss_corrupt=1.7272 wrong_frac=0.9431 init_acc_corrupt=0.0408 acc_corrupt_t_0p0_0p2=0.2081 corrupt_frac_t_0p0_0p2=0.5641 acc_corrupt_t_0p2_0p4=0.3824 corrupt_frac_t_0p2_0p4=0.3542 acc_corrupt_t_0p4_0p6=0.5457 corrupt_frac_t_0p4_0p6=0.0734 acc_corrupt_t_0p6_0p8=0.7533 corrupt_frac_t_0p6_0p8=0.0121 out_w_norm=9.8844 out_g_norm=0.3994 acc_corrupt_t_0p8_1p0=1.0000 corrupt_frac_t_0p8_1p0=0.0078 loss_all=1.7267 init_gold_top10=0.0671 init_gold_top100=0.3087
785
+ step=3400 epoch=3400/4000 epoch_step=1/1 micro_steps=3400 elapsed=3.7s lr=2.000000e-03 loss=1.7213 loss_recon=1.7213 loss_meanflow=0.0000 mean_model_t=0.2085 mean_corrupt_t=0.2085 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.3043 corrupt_frac=1.0000 acc_corrupt=0.3043 loss_corrupt=1.7213 wrong_frac=0.9416 init_acc_corrupt=0.0425 acc_corrupt_t_0p0_0p2=0.2107 corrupt_frac_t_0p0_0p2=0.5564 acc_corrupt_t_0p2_0p4=0.3854 corrupt_frac_t_0p2_0p4=0.3620 acc_corrupt_t_0p4_0p6=0.5701 corrupt_frac_t_0p4_0p6=0.0719 out_w_norm=9.9123 out_g_norm=0.4174 acc_corrupt_t_0p6_0p8=0.6820 corrupt_frac_t_0p6_0p8=0.0131 acc_corrupt_t_0p8_1p0=0.6074 corrupt_frac_t_0p8_1p0=0.0078 loss_all=1.7223 init_gold_top10=0.0580 init_gold_top100=0.3017
786
+ step=3500 epoch=3500/4000 epoch_step=1/1 micro_steps=3500 elapsed=3.7s lr=2.000000e-03 loss=1.7239 loss_recon=1.7239 loss_meanflow=0.0000 mean_model_t=0.2071 mean_corrupt_t=0.2071 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.3012 corrupt_frac=1.0000 acc_corrupt=0.3012 loss_corrupt=1.7239 wrong_frac=0.9428 init_acc_corrupt=0.0415 acc_corrupt_t_0p0_0p2=0.2099 corrupt_frac_t_0p0_0p2=0.5632 acc_corrupt_t_0p2_0p4=0.3867 corrupt_frac_t_0p2_0p4=0.3546 acc_corrupt_t_0p4_0p6=0.5344 corrupt_frac_t_0p4_0p6=0.0745 acc_corrupt_t_0p6_0p8=0.7840 corrupt_frac_t_0p6_0p8=0.0118 acc_corrupt_t_0p8_1p0=1.0000 corrupt_frac_t_0p8_1p0=0.0078 out_w_norm=9.9555 out_g_norm=0.4271 loss_all=1.6940 init_gold_top10=0.0772 init_gold_top100=0.3108
787
+ step=3600 epoch=3600/4000 epoch_step=1/1 micro_steps=3600 elapsed=3.7s lr=2.000000e-03 loss=1.7216 loss_recon=1.7216 loss_meanflow=0.0000 mean_model_t=0.2074 mean_corrupt_t=0.2074 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.3033 corrupt_frac=1.0000 acc_corrupt=0.3033 loss_corrupt=1.7216 wrong_frac=0.9417 init_acc_corrupt=0.0417 acc_corrupt_t_0p0_0p2=0.2094 corrupt_frac_t_0p0_0p2=0.5617 acc_corrupt_t_0p2_0p4=0.3930 corrupt_frac_t_0p2_0p4=0.3570 acc_corrupt_t_0p4_0p6=0.5391 corrupt_frac_t_0p4_0p6=0.0729 acc_corrupt_t_0p6_0p8=0.7227 corrupt_frac_t_0p6_0p8=0.0124 out_w_norm=9.9711 out_g_norm=0.3950 acc_corrupt_t_0p8_1p0=1.0000 corrupt_frac_t_0p8_1p0=0.0078 loss_all=1.6872 init_gold_top10=0.0662 init_gold_top100=0.3149
788
+ step=3700 epoch=3700/4000 epoch_step=1/1 micro_steps=3700 elapsed=3.7s lr=2.000000e-03 loss=1.7123 loss_recon=1.7123 loss_meanflow=0.0000 mean_model_t=0.2094 mean_corrupt_t=0.2094 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.3073 corrupt_frac=1.0000 acc_corrupt=0.3073 loss_corrupt=1.7123 wrong_frac=0.9418 init_acc_corrupt=0.0418 acc_corrupt_t_0p0_0p2=0.2152 corrupt_frac_t_0p0_0p2=0.5550 acc_corrupt_t_0p2_0p4=0.3936 corrupt_frac_t_0p2_0p4=0.3587 acc_corrupt_t_0p4_0p6=0.5266 corrupt_frac_t_0p4_0p6=0.0781 acc_corrupt_t_0p6_0p8=0.6730 corrupt_frac_t_0p6_0p8=0.0129 out_w_norm=9.9983 out_g_norm=0.3913 acc_corrupt_t_0p8_1p0=1.0000 corrupt_frac_t_0p8_1p0=0.0078 loss_all=1.8060 init_gold_top10=0.0560 init_gold_top100=0.2892
789
+ step=3800 epoch=3800/4000 epoch_step=1/1 micro_steps=3800 elapsed=3.7s lr=2.000000e-03 loss=1.7113 loss_recon=1.7113 loss_meanflow=0.0000 mean_model_t=0.2099 mean_corrupt_t=0.2099 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.3067 corrupt_frac=1.0000 acc_corrupt=0.3067 loss_corrupt=1.7113 wrong_frac=0.9409 init_acc_corrupt=0.0433 acc_corrupt_t_0p0_0p2=0.2125 corrupt_frac_t_0p0_0p2=0.5548 acc_corrupt_t_0p2_0p4=0.3890 corrupt_frac_t_0p2_0p4=0.3585 acc_corrupt_t_0p4_0p6=0.5615 corrupt_frac_t_0p4_0p6=0.0790 out_w_norm=10.0339 out_g_norm=0.3812 acc_corrupt_t_0p6_0p8=0.6482 corrupt_frac_t_0p6_0p8=0.0130 acc_corrupt_t_0p8_1p0=1.0000 corrupt_frac_t_0p8_1p0=0.0078 loss_all=1.6719 init_gold_top10=0.0728 init_gold_top100=0.2933
790
+ step=3900 epoch=3900/4000 epoch_step=1/1 micro_steps=3900 elapsed=3.7s lr=2.000000e-03 loss=1.7036 loss_recon=1.7036 loss_meanflow=0.0000 mean_model_t=0.2107 mean_corrupt_t=0.2107 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.3104 corrupt_frac=1.0000 acc_corrupt=0.3104 loss_corrupt=1.7036 wrong_frac=0.9393 init_acc_corrupt=0.0447 acc_corrupt_t_0p0_0p2=0.2134 corrupt_frac_t_0p0_0p2=0.5521 acc_corrupt_t_0p2_0p4=0.3975 corrupt_frac_t_0p2_0p4=0.3599 acc_corrupt_t_0p4_0p6=0.5390 corrupt_frac_t_0p4_0p6=0.0788 acc_corrupt_t_0p6_0p8=0.7733 corrupt_frac_t_0p6_0p8=0.0133 out_w_norm=10.0619 out_g_norm=0.3564 acc_corrupt_t_0p8_1p0=0.6719 corrupt_frac_t_0p8_1p0=0.0078 loss_all=1.8375 init_gold_top10=0.0489 init_gold_top100=0.2866
791
+ step=4000 epoch=4000/4000 epoch_step=1/1 micro_steps=4000 elapsed=3.7s lr=2.000000e-03 loss=1.6998 loss_recon=1.6998 loss_meanflow=0.0000 mean_model_t=0.2112 mean_corrupt_t=0.2112 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.3113 corrupt_frac=1.0000 acc_corrupt=0.3113 loss_corrupt=1.6998 wrong_frac=0.9392 init_acc_corrupt=0.0447 acc_corrupt_t_0p0_0p2=0.2145 corrupt_frac_t_0p0_0p2=0.5516 acc_corrupt_t_0p2_0p4=0.3947 corrupt_frac_t_0p2_0p4=0.3570 acc_corrupt_t_0p4_0p6=0.5619 corrupt_frac_t_0p4_0p6=0.0810 out_w_norm=10.0805 out_g_norm=0.3895 acc_corrupt_t_0p6_0p8=0.6389 corrupt_frac_t_0p6_0p8=0.0137 acc_corrupt_t_0p8_1p0=0.4688 corrupt_frac_t_0p8_1p0=0.0078 loss_all=1.6850 init_gold_top10=0.0757 init_gold_top100=0.3064
792
+ NCCL version 2.25.1+cuda12.8
793
+ resumed_from=runs/train8_noisegeo_len256_allcorrupt_seqrand1_dirC1_1024_20260517_163805/latest.pt start_step=4001
794
+ {
795
+ "device": "cuda:0",
796
+ "rank": 0,
797
+ "world_size": 4,
798
+ "samples": "owt_cached_chunks:8",
799
+ "vocab_size": 969,
800
+ "tokenizer_vocab_size": 50257,
801
+ "save_dir": "runs/train8_noisegeo_len256_allcorrupt_seqrand1_dirC1_1024_20260517_163805",
802
+ "batch_size": 128,
803
+ "grad_accum": 1,
804
+ "effective_batch_size": 512,
805
+ "global_batch_size": 512,
806
+ "lr_schedule": "constant_warmup",
807
+ "optimizer": "muon",
808
+ "epochs": 0.0,
809
+ "steps_per_epoch": 1,
810
+ "total_steps": 5000,
811
+ "warmup_steps": 10,
812
+ "warmup_epochs": -1.0,
813
+ "min_lr": 0.0,
814
+ "weight_decay": 0.1,
815
+ "output_weight_decay": -1.0,
816
+ "adamw_param_groups": "nanogpt",
817
+ "adam_beta1": 0.9,
818
+ "adam_beta2": 0.95,
819
+ "adam_eps": 1e-08,
820
+ "muon_impl": "legacy",
821
+ "muon_momentum": 0.95,
822
+ "muon_ns_steps": 5,
823
+ "muon_update_scale": 1.0,
824
+ "muon_nesterov": false,
825
+ "muon_width_scale": false,
826
+ "muon_grouping": "legacy_dim_ge_2",
827
+ "muon_param_count": 1965440,
828
+ "muon_adam_param_count": 8192,
829
+ "muon_param_names": [
830
+ "vocab_embed.embedding",
831
+ "sigma_map.net.0.weight",
832
+ "sigma_map.net.2.weight",
833
+ "blocks.0.attn_qkv.weight",
834
+ "blocks.0.attn_out.weight",
835
+ "blocks.0.mlp.0.weight",
836
+ "blocks.0.mlp.2.weight",
837
+ "blocks.0.adaLN_modulation.weight",
838
+ "blocks.1.attn_qkv.weight",
839
+ "blocks.1.attn_out.weight",
840
+ "blocks.1.mlp.0.weight",
841
+ "blocks.1.mlp.2.weight",
842
+ "blocks.1.adaLN_modulation.weight",
843
+ "blocks.2.attn_qkv.weight",
844
+ "blocks.2.attn_out.weight",
845
+ "blocks.2.mlp.0.weight",
846
+ "blocks.2.mlp.2.weight",
847
+ "blocks.2.adaLN_modulation.weight",
848
+ "output_layer.linear.weight",
849
+ "output_layer.adaLN_modulation.weight"
850
+ ],
851
+ "muon_adam_param_names": [
852
+ "sigma_map.net.0.bias",
853
+ "sigma_map.net.2.bias",
854
+ "blocks.0.norm1.weight",
855
+ "blocks.0.norm2.weight",
856
+ "blocks.0.mlp.0.bias",
857
+ "blocks.0.mlp.2.bias",
858
+ "blocks.0.adaLN_modulation.bias",
859
+ "blocks.1.norm1.weight",
860
+ "blocks.1.norm2.weight",
861
+ "blocks.1.mlp.0.bias",
862
+ "blocks.1.mlp.2.bias",
863
+ "blocks.1.adaLN_modulation.bias",
864
+ "blocks.2.norm1.weight",
865
+ "blocks.2.norm2.weight",
866
+ "blocks.2.mlp.0.bias",
867
+ "blocks.2.mlp.2.bias",
868
+ "blocks.2.adaLN_modulation.bias",
869
+ "output_layer.norm_final.weight",
870
+ "output_layer.adaLN_modulation.bias"
871
+ ],
872
+ "muon_effective_nesterov": false,
873
+ "muon_effective_width_scale": false,
874
+ "muon_effective_weight_decay": 0.1,
875
+ "muon_adam_fallback_nesterov": false,
876
+ "muon_adam_fallback_weight_decay": 0.1,
877
+ "ema_decay": 0.9999,
878
+ "ema_start_step": 0,
879
+ "model_type": "ddit",
880
+ "ddit_mlp_type": "gelu",
881
+ "elf_num_time_tokens": 4,
882
+ "elf_num_model_mode_tokens": 0,
883
+ "qk_norm": true,
884
+ "output_bias": false,
885
+ "output_init_std": -1.0,
886
+ "norm_type": "rmsnorm",
887
+ "target_loss": "hard_ce",
888
+ "linear_soft_target_power": 1.0,
889
+ "linear_soft_target_min_conf": 0.0,
890
+ "linear_soft_target_max_conf": 1.0,
891
+ "t_sampling_mode": "logit_normal",
892
+ "t_sampling_power": 1.0,
893
+ "t_sampling_eps": 0.0001,
894
+ "t_sampling_logit_mean": -1.5,
895
+ "t_sampling_logit_std": 0.8,
896
+ "dual_t": true,
897
+ "corrupt_t_mode": "same",
898
+ "corrupt_min_t": 0.0,
899
+ "corrupt_max_t": 1.0,
900
+ "prefix_block_prob": 0.0,
901
+ "prefix_block_len": 128,
902
+ "mask_ratio_floor_schedule": "none",
903
+ "dirichlet_endpoint_mode": "categorical_dual_t",
904
+ "dirichlet_semantic_t_mode": "same",
905
+ "dirichlet_semantic_t_value": 0.0,
906
+ "dirichlet_semantic_t_curve": "linear",
907
+ "dirichlet_semantic_t_power": 1.0,
908
+ "endpoint_sequence_random_prob_alpha": 1.0,
909
+ "categorical_wrong_from_full_vocab": true,
910
+ "categorical_wrong_from_batch_valid_tokens": false,
911
+ "categorical_wrong_basin_token_ids": "",
912
+ "categorical_wrong_basin_prob": 0.0,
913
+ "categorical_wrong_unigram_prob": 0.0,
914
+ "categorical_wrong_uniform_prob": 0.0,
915
+ "categorical_wrong_corpus_unigram_path": "",
916
+ "categorical_wrong_corpus_unigram_alpha": 1.0,
917
+ "categorical_wrong_basin_shared_prob": 0.0,
918
+ "categorical_wrong_unigram_shared_prob": 0.0,
919
+ "mask_mixture_original_prob": 0.0,
920
+ "mask_mixture_lowk_prob": 0.0,
921
+ "mask_mixture_lowcorrupt_prob": 0.0,
922
+ "mask_mixture_block_prob": 0.0,
923
+ "mask_mixture_all_prob": 1.0,
924
+ "mask_mixture_lowk_clean_tokens": "0",
925
+ "mask_mixture_lowcorrupt_tokens": "1,2,4,8,16,32,64",
926
+ "mask_mixture_block_tokens": "64,128",
927
+ "simplex_bridge_sampler": "dirichlet",
928
+ "logistic_normal_sigma_min": 0.1,
929
+ "logistic_normal_sigma_max": 1.0,
930
+ "logistic_normal_tau_min": 1.0,
931
+ "logistic_normal_tau_max": 1.0,
932
+ "torch_compile": false,
933
+ "compile_mode": "max-autotune",
934
+ "state_format": "prob",
935
+ "meanflow_weight": 0.0,
936
+ "rollout_train_prob": 0.0,
937
+ "rollout_train_steps": 1,
938
+ "rollout_train_infer_steps": 64,
939
+ "rollout_train_temp": 1.45,
940
+ "rollout_train_max_gamma": 1.0,
941
+ "rollout_train_corrupt_only": true,
942
+ "rollout_train_samplewise": false,
943
+ "rollout_train_compute_always": false,
944
+ "bridge_noise_init": "logistic_normal",
945
+ "noise_sigma": -1.0,
946
+ "allow_tf32": true,
947
+ "activation_checkpointing": false,
948
+ "activation_checkpoint_interval": 1,
949
+ "activation_checkpoint_scope": "block",
950
+ "ddp_static_graph": false,
951
+ "ddp_gradient_as_bucket_view": true,
952
+ "blocking_data_transfer": false,
953
+ "dataloader_prefetch_factor": 4,
954
+ "full_train_stats": false,
955
+ "tokenized_hf": false,
956
+ "tokenized_pad_token": "pad",
957
+ "elf_conditional_hf": false,
958
+ "record_pad_truncate": false,
959
+ "record_add_eos": false,
960
+ "record_add_special_tokens": false,
961
+ "record_pad_token": "pad",
962
+ "record_shuffle_buffer": 10000,
963
+ "wrap": true,
964
+ "wrap_mode": "stream",
965
+ "wrap_record_buffer_size": 200,
966
+ "owt_cached_chunks": true,
967
+ "owt_chunk_cache_dir": "/e2e-data/evad-tech-vla/wanghan58/data/small_benchmarks/langflow_2604_11748/openwebtext_lta_cached_chunks/gpt2_len256_train8_compact_overfit",
968
+ "owt_chunk_cache_rebuild": false,
969
+ "owt_chunk_cache_write_batch": 4096,
970
+ "owt_exact_repeat_per_chunk": 64,
971
+ "online_chunk_shuffle": false,
972
+ "online_chunk_shuffle_buffer": 10000,
973
+ "openwebtext_split": "train_minus_100k",
974
+ "detokenizer": "auto",
975
+ "resolved_detokenizer": null,
976
+ "num_workers": 0,
977
+ "latest_every": 1000,
978
+ "resume_path": "runs/train8_noisegeo_len256_allcorrupt_seqrand1_dirC1_1024_20260517_163805/latest.pt"
979
+ }
980
+ step=4100 epoch=4100/5000 epoch_step=1/1 micro_steps=4100 elapsed=4.4s lr=2.000000e-03 loss=1.7169 loss_recon=1.7169 loss_meanflow=0.0000 mean_model_t=0.2083 mean_corrupt_t=0.2083 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.3049 corrupt_frac=1.0000 acc_corrupt=0.3049 loss_corrupt=1.7169 wrong_frac=0.9425 init_acc_corrupt=0.0414 acc_corrupt_t_0p0_0p2=0.2156 corrupt_frac_t_0p0_0p2=0.5559 acc_corrupt_t_0p2_0p4=0.3834 corrupt_frac_t_0p2_0p4=0.3589 acc_corrupt_t_0p4_0p6=0.5398 corrupt_frac_t_0p4_0p6=0.0773 acc_corrupt_t_0p6_0p8=0.7183 corrupt_frac_t_0p6_0p8=0.0121 out_w_norm=10.0966 out_g_norm=0.3552 loss_all=1.7122 init_gold_top10=0.0613 init_gold_top100=0.2903
981
+ step=4200 epoch=4200/5000 epoch_step=1/1 micro_steps=4200 elapsed=3.7s lr=2.000000e-03 loss=1.7008 loss_recon=1.7008 loss_meanflow=0.0000 mean_model_t=0.2108 mean_corrupt_t=0.2108 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.3102 corrupt_frac=1.0000 acc_corrupt=0.3102 loss_corrupt=1.7008 wrong_frac=0.9405 init_acc_corrupt=0.0435 acc_corrupt_t_0p0_0p2=0.2187 corrupt_frac_t_0p0_0p2=0.5516 acc_corrupt_t_0p2_0p4=0.3880 corrupt_frac_t_0p2_0p4=0.3621 acc_corrupt_t_0p4_0p6=0.5461 corrupt_frac_t_0p4_0p6=0.0781 acc_corrupt_t_0p6_0p8=0.7739 corrupt_frac_t_0p6_0p8=0.0123 out_w_norm=10.1254 out_g_norm=0.3475 acc_corrupt_t_0p8_1p0=1.0000 corrupt_frac_t_0p8_1p0=0.0078 loss_all=1.7443 init_gold_top10=0.0640 init_gold_top100=0.3024
982
+ step=4300 epoch=4300/5000 epoch_step=1/1 micro_steps=4300 elapsed=3.7s lr=2.000000e-03 loss=1.7153 loss_recon=1.7153 loss_meanflow=0.0000 mean_model_t=0.2067 mean_corrupt_t=0.2067 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.3064 corrupt_frac=1.0000 acc_corrupt=0.3064 loss_corrupt=1.7153 wrong_frac=0.9431 init_acc_corrupt=0.0408 acc_corrupt_t_0p0_0p2=0.2192 corrupt_frac_t_0p0_0p2=0.5641 acc_corrupt_t_0p2_0p4=0.3848 corrupt_frac_t_0p2_0p4=0.3542 acc_corrupt_t_0p4_0p6=0.5471 corrupt_frac_t_0p4_0p6=0.0734 acc_corrupt_t_0p6_0p8=0.7563 corrupt_frac_t_0p6_0p8=0.0121 out_w_norm=10.1437 out_g_norm=0.3620 acc_corrupt_t_0p8_1p0=0.9987 corrupt_frac_t_0p8_1p0=0.0078 loss_all=1.7420 init_gold_top10=0.0671 init_gold_top100=0.3087
983
+ step=4400 epoch=4400/5000 epoch_step=1/1 micro_steps=4400 elapsed=3.7s lr=2.000000e-03 loss=1.7084 loss_recon=1.7084 loss_meanflow=0.0000 mean_model_t=0.2085 mean_corrupt_t=0.2085 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.3078 corrupt_frac=1.0000 acc_corrupt=0.3078 loss_corrupt=1.7084 wrong_frac=0.9416 init_acc_corrupt=0.0425 acc_corrupt_t_0p0_0p2=0.2170 corrupt_frac_t_0p0_0p2=0.5564 acc_corrupt_t_0p2_0p4=0.3852 corrupt_frac_t_0p2_0p4=0.3620 acc_corrupt_t_0p4_0p6=0.5719 corrupt_frac_t_0p4_0p6=0.0719 out_w_norm=10.1604 out_g_norm=0.3407 acc_corrupt_t_0p6_0p8=0.6741 corrupt_frac_t_0p6_0p8=0.0131 acc_corrupt_t_0p8_1p0=0.6113 corrupt_frac_t_0p8_1p0=0.0078 loss_all=1.7124 init_gold_top10=0.0580 init_gold_top100=0.3017
984
+ step=4500 epoch=4500/5000 epoch_step=1/1 micro_steps=4500 elapsed=3.7s lr=2.000000e-03 loss=1.7111 loss_recon=1.7111 loss_meanflow=0.0000 mean_model_t=0.2071 mean_corrupt_t=0.2071 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.3062 corrupt_frac=1.0000 acc_corrupt=0.3062 loss_corrupt=1.7111 wrong_frac=0.9428 init_acc_corrupt=0.0415 acc_corrupt_t_0p0_0p2=0.2177 corrupt_frac_t_0p0_0p2=0.5632 acc_corrupt_t_0p2_0p4=0.3882 corrupt_frac_t_0p2_0p4=0.3546 acc_corrupt_t_0p4_0p6=0.5355 corrupt_frac_t_0p4_0p6=0.0745 acc_corrupt_t_0p6_0p8=0.7873 corrupt_frac_t_0p6_0p8=0.0118 acc_corrupt_t_0p8_1p0=1.0000 corrupt_frac_t_0p8_1p0=0.0078 out_w_norm=10.1916 out_g_norm=0.3348 loss_all=1.6872 init_gold_top10=0.0772 init_gold_top100=0.3108
985
+ step=4600 epoch=4600/5000 epoch_step=1/1 micro_steps=4600 elapsed=3.8s lr=2.000000e-03 loss=1.7063 loss_recon=1.7063 loss_meanflow=0.0000 mean_model_t=0.2074 mean_corrupt_t=0.2074 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.3092 corrupt_frac=1.0000 acc_corrupt=0.3092 loss_corrupt=1.7063 wrong_frac=0.9417 init_acc_corrupt=0.0417 acc_corrupt_t_0p0_0p2=0.2186 corrupt_frac_t_0p0_0p2=0.5617 acc_corrupt_t_0p2_0p4=0.3950 corrupt_frac_t_0p2_0p4=0.3570 acc_corrupt_t_0p4_0p6=0.5404 corrupt_frac_t_0p4_0p6=0.0729 acc_corrupt_t_0p6_0p8=0.7182 corrupt_frac_t_0p6_0p8=0.0124 out_w_norm=10.2035 out_g_norm=0.3463 acc_corrupt_t_0p8_1p0=1.0000 corrupt_frac_t_0p8_1p0=0.0078 loss_all=1.7043 init_gold_top10=0.0662 init_gold_top100=0.3149
986
+ step=4700 epoch=4700/5000 epoch_step=1/1 micro_steps=4700 elapsed=3.7s lr=2.000000e-03 loss=1.7007 loss_recon=1.7007 loss_meanflow=0.0000 mean_model_t=0.2094 mean_corrupt_t=0.2094 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.3121 corrupt_frac=1.0000 acc_corrupt=0.3121 loss_corrupt=1.7007 wrong_frac=0.9418 init_acc_corrupt=0.0418 acc_corrupt_t_0p0_0p2=0.2243 corrupt_frac_t_0p0_0p2=0.5550 acc_corrupt_t_0p2_0p4=0.3936 corrupt_frac_t_0p2_0p4=0.3587 acc_corrupt_t_0p4_0p6=0.5240 corrupt_frac_t_0p4_0p6=0.0781 acc_corrupt_t_0p6_0p8=0.6673 corrupt_frac_t_0p6_0p8=0.0129 out_w_norm=10.2152 out_g_norm=0.3226 acc_corrupt_t_0p8_1p0=1.0000 corrupt_frac_t_0p8_1p0=0.0078 loss_all=1.7804 init_gold_top10=0.0560 init_gold_top100=0.2892
987
+ step=4800 epoch=4800/5000 epoch_step=1/1 micro_steps=4800 elapsed=3.7s lr=2.000000e-03 loss=1.7002 loss_recon=1.7002 loss_meanflow=0.0000 mean_model_t=0.2099 mean_corrupt_t=0.2099 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.3114 corrupt_frac=1.0000 acc_corrupt=0.3114 loss_corrupt=1.7002 wrong_frac=0.9409 init_acc_corrupt=0.0433 acc_corrupt_t_0p0_0p2=0.2185 corrupt_frac_t_0p0_0p2=0.5548 acc_corrupt_t_0p2_0p4=0.3921 corrupt_frac_t_0p2_0p4=0.3585 acc_corrupt_t_0p4_0p6=0.5639 corrupt_frac_t_0p4_0p6=0.0790 out_w_norm=10.2286 out_g_norm=0.3147 acc_corrupt_t_0p6_0p8=0.6488 corrupt_frac_t_0p6_0p8=0.0130 acc_corrupt_t_0p8_1p0=1.0000 corrupt_frac_t_0p8_1p0=0.0078 loss_all=1.6743 init_gold_top10=0.0728 init_gold_top100=0.2933
988
+ step=4900 epoch=4900/5000 epoch_step=1/1 micro_steps=4900 elapsed=3.7s lr=2.000000e-03 loss=1.6960 loss_recon=1.6960 loss_meanflow=0.0000 mean_model_t=0.2107 mean_corrupt_t=0.2107 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.3150 corrupt_frac=1.0000 acc_corrupt=0.3150 loss_corrupt=1.6960 wrong_frac=0.9393 init_acc_corrupt=0.0447 acc_corrupt_t_0p0_0p2=0.2212 corrupt_frac_t_0p0_0p2=0.5521 acc_corrupt_t_0p2_0p4=0.3976 corrupt_frac_t_0p2_0p4=0.3599 acc_corrupt_t_0p4_0p6=0.5409 corrupt_frac_t_0p4_0p6=0.0788 acc_corrupt_t_0p6_0p8=0.7767 corrupt_frac_t_0p6_0p8=0.0133 out_w_norm=10.2407 out_g_norm=0.3305 acc_corrupt_t_0p8_1p0=0.6732 corrupt_frac_t_0p8_1p0=0.0078 loss_all=1.8041 init_gold_top10=0.0489 init_gold_top100=0.2866
989
+ step=5000 epoch=5000/5000 epoch_step=1/1 micro_steps=5000 elapsed=3.7s lr=2.000000e-03 loss=1.6921 loss_recon=1.6921 loss_meanflow=0.0000 mean_model_t=0.2112 mean_corrupt_t=0.2112 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.3156 corrupt_frac=1.0000 acc_corrupt=0.3156 loss_corrupt=1.6921 wrong_frac=0.9392 init_acc_corrupt=0.0447 acc_corrupt_t_0p0_0p2=0.2214 corrupt_frac_t_0p0_0p2=0.5516 acc_corrupt_t_0p2_0p4=0.3960 corrupt_frac_t_0p2_0p4=0.3570 acc_corrupt_t_0p4_0p6=0.5632 corrupt_frac_t_0p4_0p6=0.0810 out_w_norm=10.2337 out_g_norm=0.3309 acc_corrupt_t_0p6_0p8=0.6320 corrupt_frac_t_0p6_0p8=0.0137 acc_corrupt_t_0p8_1p0=0.4813 corrupt_frac_t_0p8_1p0=0.0078 loss_all=1.6939 init_gold_top10=0.0757 init_gold_top100=0.3064
LTA_openwebtext_dualt/logs/softendpoint_mn_pilot_4gpu/train8_noisegeo_len256_allcorrupt_seqrand1_highC64_4096_20260517_163805.log ADDED
@@ -0,0 +1,791 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ NCCL version 2.25.1+cuda12.8
2
+ {
3
+ "device": "cuda:0",
4
+ "rank": 0,
5
+ "world_size": 4,
6
+ "samples": "owt_cached_chunks:8",
7
+ "vocab_size": 969,
8
+ "tokenizer_vocab_size": 50257,
9
+ "save_dir": "runs/train8_noisegeo_len256_allcorrupt_seqrand1_highC64_4096_20260517_163805",
10
+ "batch_size": 128,
11
+ "grad_accum": 1,
12
+ "effective_batch_size": 512,
13
+ "global_batch_size": 512,
14
+ "lr_schedule": "constant_warmup",
15
+ "optimizer": "muon",
16
+ "epochs": 0.0,
17
+ "steps_per_epoch": 1,
18
+ "total_steps": 1000,
19
+ "warmup_steps": 10,
20
+ "warmup_epochs": -1.0,
21
+ "min_lr": 0.0,
22
+ "weight_decay": 0.1,
23
+ "output_weight_decay": -1.0,
24
+ "adamw_param_groups": "nanogpt",
25
+ "adam_beta1": 0.9,
26
+ "adam_beta2": 0.95,
27
+ "adam_eps": 1e-08,
28
+ "muon_impl": "legacy",
29
+ "muon_momentum": 0.95,
30
+ "muon_ns_steps": 5,
31
+ "muon_update_scale": 1.0,
32
+ "muon_nesterov": false,
33
+ "muon_width_scale": false,
34
+ "muon_grouping": "legacy_dim_ge_2",
35
+ "muon_param_count": 1965440,
36
+ "muon_adam_param_count": 8192,
37
+ "muon_param_names": [
38
+ "vocab_embed.embedding",
39
+ "sigma_map.net.0.weight",
40
+ "sigma_map.net.2.weight",
41
+ "blocks.0.attn_qkv.weight",
42
+ "blocks.0.attn_out.weight",
43
+ "blocks.0.mlp.0.weight",
44
+ "blocks.0.mlp.2.weight",
45
+ "blocks.0.adaLN_modulation.weight",
46
+ "blocks.1.attn_qkv.weight",
47
+ "blocks.1.attn_out.weight",
48
+ "blocks.1.mlp.0.weight",
49
+ "blocks.1.mlp.2.weight",
50
+ "blocks.1.adaLN_modulation.weight",
51
+ "blocks.2.attn_qkv.weight",
52
+ "blocks.2.attn_out.weight",
53
+ "blocks.2.mlp.0.weight",
54
+ "blocks.2.mlp.2.weight",
55
+ "blocks.2.adaLN_modulation.weight",
56
+ "output_layer.linear.weight",
57
+ "output_layer.adaLN_modulation.weight"
58
+ ],
59
+ "muon_adam_param_names": [
60
+ "sigma_map.net.0.bias",
61
+ "sigma_map.net.2.bias",
62
+ "blocks.0.norm1.weight",
63
+ "blocks.0.norm2.weight",
64
+ "blocks.0.mlp.0.bias",
65
+ "blocks.0.mlp.2.bias",
66
+ "blocks.0.adaLN_modulation.bias",
67
+ "blocks.1.norm1.weight",
68
+ "blocks.1.norm2.weight",
69
+ "blocks.1.mlp.0.bias",
70
+ "blocks.1.mlp.2.bias",
71
+ "blocks.1.adaLN_modulation.bias",
72
+ "blocks.2.norm1.weight",
73
+ "blocks.2.norm2.weight",
74
+ "blocks.2.mlp.0.bias",
75
+ "blocks.2.mlp.2.bias",
76
+ "blocks.2.adaLN_modulation.bias",
77
+ "output_layer.norm_final.weight",
78
+ "output_layer.adaLN_modulation.bias"
79
+ ],
80
+ "muon_effective_nesterov": false,
81
+ "muon_effective_width_scale": false,
82
+ "muon_effective_weight_decay": 0.1,
83
+ "muon_adam_fallback_nesterov": false,
84
+ "muon_adam_fallback_weight_decay": 0.1,
85
+ "ema_decay": 0.9999,
86
+ "ema_start_step": 0,
87
+ "model_type": "ddit",
88
+ "ddit_mlp_type": "gelu",
89
+ "elf_num_time_tokens": 4,
90
+ "elf_num_model_mode_tokens": 0,
91
+ "qk_norm": true,
92
+ "output_bias": false,
93
+ "output_init_std": -1.0,
94
+ "norm_type": "rmsnorm",
95
+ "target_loss": "hard_ce",
96
+ "linear_soft_target_power": 1.0,
97
+ "linear_soft_target_min_conf": 0.0,
98
+ "linear_soft_target_max_conf": 1.0,
99
+ "t_sampling_mode": "logit_normal",
100
+ "t_sampling_power": 1.0,
101
+ "t_sampling_eps": 0.0001,
102
+ "t_sampling_logit_mean": -1.5,
103
+ "t_sampling_logit_std": 0.8,
104
+ "dual_t": true,
105
+ "corrupt_t_mode": "same",
106
+ "corrupt_min_t": 0.0,
107
+ "corrupt_max_t": 1.0,
108
+ "prefix_block_prob": 0.0,
109
+ "prefix_block_len": 128,
110
+ "mask_ratio_floor_schedule": "none",
111
+ "dirichlet_endpoint_mode": "categorical_dual_t",
112
+ "dirichlet_semantic_t_mode": "same",
113
+ "dirichlet_semantic_t_value": 0.0,
114
+ "dirichlet_semantic_t_curve": "linear",
115
+ "dirichlet_semantic_t_power": 1.0,
116
+ "endpoint_sequence_random_prob_alpha": 1.0,
117
+ "categorical_wrong_from_full_vocab": true,
118
+ "categorical_wrong_from_batch_valid_tokens": false,
119
+ "categorical_wrong_basin_token_ids": "",
120
+ "categorical_wrong_basin_prob": 0.0,
121
+ "categorical_wrong_unigram_prob": 0.0,
122
+ "categorical_wrong_uniform_prob": 0.0,
123
+ "categorical_wrong_corpus_unigram_path": "",
124
+ "categorical_wrong_corpus_unigram_alpha": 1.0,
125
+ "categorical_wrong_basin_shared_prob": 0.0,
126
+ "categorical_wrong_unigram_shared_prob": 0.0,
127
+ "mask_mixture_original_prob": 0.0,
128
+ "mask_mixture_lowk_prob": 0.0,
129
+ "mask_mixture_lowcorrupt_prob": 0.0,
130
+ "mask_mixture_block_prob": 0.0,
131
+ "mask_mixture_all_prob": 1.0,
132
+ "mask_mixture_lowk_clean_tokens": "0",
133
+ "mask_mixture_lowcorrupt_tokens": "1,2,4,8,16,32,64",
134
+ "mask_mixture_block_tokens": "64,128",
135
+ "simplex_bridge_sampler": "dirichlet",
136
+ "logistic_normal_sigma_min": 0.1,
137
+ "logistic_normal_sigma_max": 1.0,
138
+ "logistic_normal_tau_min": 1.0,
139
+ "logistic_normal_tau_max": 1.0,
140
+ "torch_compile": false,
141
+ "compile_mode": "max-autotune",
142
+ "state_format": "prob",
143
+ "meanflow_weight": 0.0,
144
+ "rollout_train_prob": 0.0,
145
+ "rollout_train_steps": 1,
146
+ "rollout_train_infer_steps": 64,
147
+ "rollout_train_temp": 1.45,
148
+ "rollout_train_max_gamma": 1.0,
149
+ "rollout_train_corrupt_only": true,
150
+ "rollout_train_samplewise": false,
151
+ "rollout_train_compute_always": false,
152
+ "bridge_noise_init": "logistic_normal",
153
+ "noise_sigma": -1.0,
154
+ "allow_tf32": true,
155
+ "activation_checkpointing": false,
156
+ "activation_checkpoint_interval": 1,
157
+ "activation_checkpoint_scope": "block",
158
+ "ddp_static_graph": false,
159
+ "ddp_gradient_as_bucket_view": true,
160
+ "blocking_data_transfer": false,
161
+ "dataloader_prefetch_factor": 4,
162
+ "full_train_stats": false,
163
+ "tokenized_hf": false,
164
+ "tokenized_pad_token": "pad",
165
+ "elf_conditional_hf": false,
166
+ "record_pad_truncate": false,
167
+ "record_add_eos": false,
168
+ "record_add_special_tokens": false,
169
+ "record_pad_token": "pad",
170
+ "record_shuffle_buffer": 10000,
171
+ "wrap": true,
172
+ "wrap_mode": "stream",
173
+ "wrap_record_buffer_size": 200,
174
+ "owt_cached_chunks": true,
175
+ "owt_chunk_cache_dir": "/e2e-data/evad-tech-vla/wanghan58/data/small_benchmarks/langflow_2604_11748/openwebtext_lta_cached_chunks/gpt2_len256_train8_compact_overfit",
176
+ "owt_chunk_cache_rebuild": false,
177
+ "owt_chunk_cache_write_batch": 4096,
178
+ "owt_exact_repeat_per_chunk": 64,
179
+ "online_chunk_shuffle": false,
180
+ "online_chunk_shuffle_buffer": 10000,
181
+ "openwebtext_split": "train_minus_100k",
182
+ "detokenizer": "auto",
183
+ "resolved_detokenizer": null,
184
+ "num_workers": 0,
185
+ "latest_every": 1000,
186
+ "resume_path": ""
187
+ }
188
+ step=100 epoch=100/1000 epoch_step=1/1 micro_steps=100 elapsed=4.6s lr=2.000000e-03 loss=6.7544 loss_recon=6.7544 loss_meanflow=0.0000 mean_model_t=0.2083 mean_corrupt_t=0.2083 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.0620 corrupt_frac=1.0000 acc_corrupt=0.0620 loss_corrupt=6.7544 wrong_frac=0.9425 init_acc_corrupt=0.0583 acc_corrupt_t_0p0_0p2=0.0416 corrupt_frac_t_0p0_0p2=0.5559 acc_corrupt_t_0p2_0p4=0.0683 corrupt_frac_t_0p2_0p4=0.3589 acc_corrupt_t_0p4_0p6=0.1540 corrupt_frac_t_0p4_0p6=0.0773 acc_corrupt_t_0p6_0p8=0.3057 corrupt_frac_t_0p6_0p8=0.0121 out_w_norm=0.8069 out_g_norm=1.2569 loss_all=6.5858 init_gold_top10=0.0631 init_gold_top100=0.1527
189
+ step=200 epoch=200/1000 epoch_step=1/1 micro_steps=200 elapsed=3.9s lr=2.000000e-03 loss=6.3225 loss_recon=6.3225 loss_meanflow=0.0000 mean_model_t=0.2108 mean_corrupt_t=0.2108 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.0659 corrupt_frac=1.0000 acc_corrupt=0.0659 loss_corrupt=6.3225 wrong_frac=0.9405 init_acc_corrupt=0.0603 acc_corrupt_t_0p0_0p2=0.0457 corrupt_frac_t_0p0_0p2=0.5516 acc_corrupt_t_0p2_0p4=0.0743 corrupt_frac_t_0p2_0p4=0.3621 acc_corrupt_t_0p4_0p6=0.1454 corrupt_frac_t_0p4_0p6=0.0781 acc_corrupt_t_0p6_0p8=0.2908 corrupt_frac_t_0p6_0p8=0.0123 out_w_norm=2.1823 out_g_norm=1.7025 acc_corrupt_t_0p8_1p0=0.4883 corrupt_frac_t_0p8_1p0=0.0078 loss_all=6.0814 init_gold_top10=0.0650 init_gold_top100=0.1524
190
+ step=300 epoch=300/1000 epoch_step=1/1 micro_steps=300 elapsed=3.9s lr=2.000000e-03 loss=5.8654 loss_recon=5.8654 loss_meanflow=0.0000 mean_model_t=0.2067 mean_corrupt_t=0.2067 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.0727 corrupt_frac=1.0000 acc_corrupt=0.0727 loss_corrupt=5.8654 wrong_frac=0.9431 init_acc_corrupt=0.0578 acc_corrupt_t_0p0_0p2=0.0506 corrupt_frac_t_0p0_0p2=0.5641 acc_corrupt_t_0p2_0p4=0.0832 corrupt_frac_t_0p2_0p4=0.3542 acc_corrupt_t_0p4_0p6=0.1635 corrupt_frac_t_0p4_0p6=0.0734 acc_corrupt_t_0p6_0p8=0.3176 corrupt_frac_t_0p6_0p8=0.0121 out_w_norm=3.4522 out_g_norm=0.7577 acc_corrupt_t_0p8_1p0=0.5703 corrupt_frac_t_0p8_1p0=0.0078 loss_all=5.6379 init_gold_top10=0.0683 init_gold_top100=0.1548
191
+ step=400 epoch=400/1000 epoch_step=1/1 micro_steps=400 elapsed=3.9s lr=2.000000e-03 loss=5.2947 loss_recon=5.2947 loss_meanflow=0.0000 mean_model_t=0.2085 mean_corrupt_t=0.2085 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.0885 corrupt_frac=1.0000 acc_corrupt=0.0885 loss_corrupt=5.2947 wrong_frac=0.9416 init_acc_corrupt=0.0592 acc_corrupt_t_0p0_0p2=0.0600 corrupt_frac_t_0p0_0p2=0.5564 acc_corrupt_t_0p2_0p4=0.1021 corrupt_frac_t_0p2_0p4=0.3620 acc_corrupt_t_0p4_0p6=0.2053 corrupt_frac_t_0p4_0p6=0.0719 out_w_norm=4.8795 out_g_norm=0.5442 acc_corrupt_t_0p6_0p8=0.3520 corrupt_frac_t_0p6_0p8=0.0131 acc_corrupt_t_0p8_1p0=0.2930 corrupt_frac_t_0p8_1p0=0.0078 loss_all=4.9185 init_gold_top10=0.0601 init_gold_top100=0.1488
192
+ step=500 epoch=500/1000 epoch_step=1/1 micro_steps=500 elapsed=3.9s lr=2.000000e-03 loss=4.3642 loss_recon=4.3642 loss_meanflow=0.0000 mean_model_t=0.2071 mean_corrupt_t=0.2071 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.1157 corrupt_frac=1.0000 acc_corrupt=0.1157 loss_corrupt=4.3642 wrong_frac=0.9428 init_acc_corrupt=0.0580 acc_corrupt_t_0p0_0p2=0.0745 corrupt_frac_t_0p0_0p2=0.5632 acc_corrupt_t_0p2_0p4=0.1395 corrupt_frac_t_0p2_0p4=0.3546 acc_corrupt_t_0p4_0p6=0.2707 corrupt_frac_t_0p4_0p6=0.0745 acc_corrupt_t_0p6_0p8=0.5257 corrupt_frac_t_0p6_0p8=0.0118 acc_corrupt_t_0p8_1p0=0.8398 corrupt_frac_t_0p8_1p0=0.0078 out_w_norm=6.1600 out_g_norm=0.6298 loss_all=3.8021 init_gold_top10=0.0782 init_gold_top100=0.1620
193
+ step=600 epoch=600/1000 epoch_step=1/1 micro_steps=600 elapsed=3.9s lr=2.000000e-03 loss=3.2898 loss_recon=3.2898 loss_meanflow=0.0000 mean_model_t=0.2074 mean_corrupt_t=0.2074 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.1683 corrupt_frac=1.0000 acc_corrupt=0.1683 loss_corrupt=3.2898 wrong_frac=0.9417 init_acc_corrupt=0.0591 acc_corrupt_t_0p0_0p2=0.1149 corrupt_frac_t_0p0_0p2=0.5617 acc_corrupt_t_0p2_0p4=0.2089 corrupt_frac_t_0p2_0p4=0.3570 acc_corrupt_t_0p4_0p6=0.3396 corrupt_frac_t_0p4_0p6=0.0729 acc_corrupt_t_0p6_0p8=0.5331 corrupt_frac_t_0p6_0p8=0.0124 out_w_norm=7.2965 out_g_norm=0.7567 acc_corrupt_t_0p8_1p0=0.9102 corrupt_frac_t_0p8_1p0=0.0078 loss_all=2.6961 init_gold_top10=0.0689 init_gold_top100=0.1557
194
+ step=700 epoch=700/1000 epoch_step=1/1 micro_steps=700 elapsed=3.9s lr=2.000000e-03 loss=2.3635 loss_recon=2.3635 loss_meanflow=0.0000 mean_model_t=0.2094 mean_corrupt_t=0.2094 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.2489 corrupt_frac=1.0000 acc_corrupt=0.2489 loss_corrupt=2.3635 wrong_frac=0.9418 init_acc_corrupt=0.0590 acc_corrupt_t_0p0_0p2=0.1774 corrupt_frac_t_0p0_0p2=0.5550 acc_corrupt_t_0p2_0p4=0.3100 corrupt_frac_t_0p2_0p4=0.3587 acc_corrupt_t_0p4_0p6=0.4399 corrupt_frac_t_0p4_0p6=0.0781 acc_corrupt_t_0p6_0p8=0.5913 corrupt_frac_t_0p6_0p8=0.0129 out_w_norm=8.1634 out_g_norm=1.1224 acc_corrupt_t_0p8_1p0=0.9453 corrupt_frac_t_0p8_1p0=0.0078 loss_all=2.1892 init_gold_top10=0.0582 init_gold_top100=0.1457
195
+ step=800 epoch=800/1000 epoch_step=1/1 micro_steps=800 elapsed=3.9s lr=2.000000e-03 loss=1.9215 loss_recon=1.9215 loss_meanflow=0.0000 mean_model_t=0.2099 mean_corrupt_t=0.2099 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.2909 corrupt_frac=1.0000 acc_corrupt=0.2909 loss_corrupt=1.9215 wrong_frac=0.9409 init_acc_corrupt=0.0600 acc_corrupt_t_0p0_0p2=0.2065 corrupt_frac_t_0p0_0p2=0.5548 acc_corrupt_t_0p2_0p4=0.3602 corrupt_frac_t_0p2_0p4=0.3585 acc_corrupt_t_0p4_0p6=0.5357 corrupt_frac_t_0p4_0p6=0.0790 out_w_norm=8.6846 out_g_norm=1.2636 acc_corrupt_t_0p6_0p8=0.6311 corrupt_frac_t_0p6_0p8=0.0130 acc_corrupt_t_0p8_1p0=0.9766 corrupt_frac_t_0p8_1p0=0.0078 loss_all=1.8396 init_gold_top10=0.0737 init_gold_top100=0.1602
196
+ step=900 epoch=900/1000 epoch_step=1/1 micro_steps=900 elapsed=3.9s lr=2.000000e-03 loss=1.7962 loss_recon=1.7962 loss_meanflow=0.0000 mean_model_t=0.2107 mean_corrupt_t=0.2107 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.3050 corrupt_frac=1.0000 acc_corrupt=0.3050 loss_corrupt=1.7962 wrong_frac=0.9393 init_acc_corrupt=0.0616 acc_corrupt_t_0p0_0p2=0.2131 corrupt_frac_t_0p0_0p2=0.5521 acc_corrupt_t_0p2_0p4=0.3849 corrupt_frac_t_0p2_0p4=0.3599 acc_corrupt_t_0p4_0p6=0.5296 corrupt_frac_t_0p4_0p6=0.0788 acc_corrupt_t_0p6_0p8=0.7714 corrupt_frac_t_0p6_0p8=0.0133 out_w_norm=8.8871 out_g_norm=1.2738 acc_corrupt_t_0p8_1p0=0.6940 corrupt_frac_t_0p8_1p0=0.0078 loss_all=1.9315 init_gold_top10=0.0495 init_gold_top100=0.1383
197
+ step=1000 epoch=1000/1000 epoch_step=1/1 micro_steps=1000 elapsed=3.9s lr=2.000000e-03 loss=1.7356 loss_recon=1.7356 loss_meanflow=0.0000 mean_model_t=0.2112 mean_corrupt_t=0.2112 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.3118 corrupt_frac=1.0000 acc_corrupt=0.3118 loss_corrupt=1.7356 wrong_frac=0.9392 init_acc_corrupt=0.0616 acc_corrupt_t_0p0_0p2=0.2200 corrupt_frac_t_0p0_0p2=0.5516 acc_corrupt_t_0p2_0p4=0.3879 corrupt_frac_t_0p2_0p4=0.3570 acc_corrupt_t_0p4_0p6=0.5600 corrupt_frac_t_0p4_0p6=0.0810 out_w_norm=8.9701 out_g_norm=1.0438 acc_corrupt_t_0p6_0p8=0.6375 corrupt_frac_t_0p6_0p8=0.0137 acc_corrupt_t_0p8_1p0=0.5023 corrupt_frac_t_0p8_1p0=0.0078 loss_all=1.7485 init_gold_top10=0.0775 init_gold_top100=0.1654
198
+ NCCL version 2.25.1+cuda12.8
199
+ resumed_from=runs/train8_noisegeo_len256_allcorrupt_seqrand1_highC64_4096_20260517_163805/latest.pt start_step=1001
200
+ {
201
+ "device": "cuda:0",
202
+ "rank": 0,
203
+ "world_size": 4,
204
+ "samples": "owt_cached_chunks:8",
205
+ "vocab_size": 969,
206
+ "tokenizer_vocab_size": 50257,
207
+ "save_dir": "runs/train8_noisegeo_len256_allcorrupt_seqrand1_highC64_4096_20260517_163805",
208
+ "batch_size": 128,
209
+ "grad_accum": 1,
210
+ "effective_batch_size": 512,
211
+ "global_batch_size": 512,
212
+ "lr_schedule": "constant_warmup",
213
+ "optimizer": "muon",
214
+ "epochs": 0.0,
215
+ "steps_per_epoch": 1,
216
+ "total_steps": 2000,
217
+ "warmup_steps": 10,
218
+ "warmup_epochs": -1.0,
219
+ "min_lr": 0.0,
220
+ "weight_decay": 0.1,
221
+ "output_weight_decay": -1.0,
222
+ "adamw_param_groups": "nanogpt",
223
+ "adam_beta1": 0.9,
224
+ "adam_beta2": 0.95,
225
+ "adam_eps": 1e-08,
226
+ "muon_impl": "legacy",
227
+ "muon_momentum": 0.95,
228
+ "muon_ns_steps": 5,
229
+ "muon_update_scale": 1.0,
230
+ "muon_nesterov": false,
231
+ "muon_width_scale": false,
232
+ "muon_grouping": "legacy_dim_ge_2",
233
+ "muon_param_count": 1965440,
234
+ "muon_adam_param_count": 8192,
235
+ "muon_param_names": [
236
+ "vocab_embed.embedding",
237
+ "sigma_map.net.0.weight",
238
+ "sigma_map.net.2.weight",
239
+ "blocks.0.attn_qkv.weight",
240
+ "blocks.0.attn_out.weight",
241
+ "blocks.0.mlp.0.weight",
242
+ "blocks.0.mlp.2.weight",
243
+ "blocks.0.adaLN_modulation.weight",
244
+ "blocks.1.attn_qkv.weight",
245
+ "blocks.1.attn_out.weight",
246
+ "blocks.1.mlp.0.weight",
247
+ "blocks.1.mlp.2.weight",
248
+ "blocks.1.adaLN_modulation.weight",
249
+ "blocks.2.attn_qkv.weight",
250
+ "blocks.2.attn_out.weight",
251
+ "blocks.2.mlp.0.weight",
252
+ "blocks.2.mlp.2.weight",
253
+ "blocks.2.adaLN_modulation.weight",
254
+ "output_layer.linear.weight",
255
+ "output_layer.adaLN_modulation.weight"
256
+ ],
257
+ "muon_adam_param_names": [
258
+ "sigma_map.net.0.bias",
259
+ "sigma_map.net.2.bias",
260
+ "blocks.0.norm1.weight",
261
+ "blocks.0.norm2.weight",
262
+ "blocks.0.mlp.0.bias",
263
+ "blocks.0.mlp.2.bias",
264
+ "blocks.0.adaLN_modulation.bias",
265
+ "blocks.1.norm1.weight",
266
+ "blocks.1.norm2.weight",
267
+ "blocks.1.mlp.0.bias",
268
+ "blocks.1.mlp.2.bias",
269
+ "blocks.1.adaLN_modulation.bias",
270
+ "blocks.2.norm1.weight",
271
+ "blocks.2.norm2.weight",
272
+ "blocks.2.mlp.0.bias",
273
+ "blocks.2.mlp.2.bias",
274
+ "blocks.2.adaLN_modulation.bias",
275
+ "output_layer.norm_final.weight",
276
+ "output_layer.adaLN_modulation.bias"
277
+ ],
278
+ "muon_effective_nesterov": false,
279
+ "muon_effective_width_scale": false,
280
+ "muon_effective_weight_decay": 0.1,
281
+ "muon_adam_fallback_nesterov": false,
282
+ "muon_adam_fallback_weight_decay": 0.1,
283
+ "ema_decay": 0.9999,
284
+ "ema_start_step": 0,
285
+ "model_type": "ddit",
286
+ "ddit_mlp_type": "gelu",
287
+ "elf_num_time_tokens": 4,
288
+ "elf_num_model_mode_tokens": 0,
289
+ "qk_norm": true,
290
+ "output_bias": false,
291
+ "output_init_std": -1.0,
292
+ "norm_type": "rmsnorm",
293
+ "target_loss": "hard_ce",
294
+ "linear_soft_target_power": 1.0,
295
+ "linear_soft_target_min_conf": 0.0,
296
+ "linear_soft_target_max_conf": 1.0,
297
+ "t_sampling_mode": "logit_normal",
298
+ "t_sampling_power": 1.0,
299
+ "t_sampling_eps": 0.0001,
300
+ "t_sampling_logit_mean": -1.5,
301
+ "t_sampling_logit_std": 0.8,
302
+ "dual_t": true,
303
+ "corrupt_t_mode": "same",
304
+ "corrupt_min_t": 0.0,
305
+ "corrupt_max_t": 1.0,
306
+ "prefix_block_prob": 0.0,
307
+ "prefix_block_len": 128,
308
+ "mask_ratio_floor_schedule": "none",
309
+ "dirichlet_endpoint_mode": "categorical_dual_t",
310
+ "dirichlet_semantic_t_mode": "same",
311
+ "dirichlet_semantic_t_value": 0.0,
312
+ "dirichlet_semantic_t_curve": "linear",
313
+ "dirichlet_semantic_t_power": 1.0,
314
+ "endpoint_sequence_random_prob_alpha": 1.0,
315
+ "categorical_wrong_from_full_vocab": true,
316
+ "categorical_wrong_from_batch_valid_tokens": false,
317
+ "categorical_wrong_basin_token_ids": "",
318
+ "categorical_wrong_basin_prob": 0.0,
319
+ "categorical_wrong_unigram_prob": 0.0,
320
+ "categorical_wrong_uniform_prob": 0.0,
321
+ "categorical_wrong_corpus_unigram_path": "",
322
+ "categorical_wrong_corpus_unigram_alpha": 1.0,
323
+ "categorical_wrong_basin_shared_prob": 0.0,
324
+ "categorical_wrong_unigram_shared_prob": 0.0,
325
+ "mask_mixture_original_prob": 0.0,
326
+ "mask_mixture_lowk_prob": 0.0,
327
+ "mask_mixture_lowcorrupt_prob": 0.0,
328
+ "mask_mixture_block_prob": 0.0,
329
+ "mask_mixture_all_prob": 1.0,
330
+ "mask_mixture_lowk_clean_tokens": "0",
331
+ "mask_mixture_lowcorrupt_tokens": "1,2,4,8,16,32,64",
332
+ "mask_mixture_block_tokens": "64,128",
333
+ "simplex_bridge_sampler": "dirichlet",
334
+ "logistic_normal_sigma_min": 0.1,
335
+ "logistic_normal_sigma_max": 1.0,
336
+ "logistic_normal_tau_min": 1.0,
337
+ "logistic_normal_tau_max": 1.0,
338
+ "torch_compile": false,
339
+ "compile_mode": "max-autotune",
340
+ "state_format": "prob",
341
+ "meanflow_weight": 0.0,
342
+ "rollout_train_prob": 0.0,
343
+ "rollout_train_steps": 1,
344
+ "rollout_train_infer_steps": 64,
345
+ "rollout_train_temp": 1.45,
346
+ "rollout_train_max_gamma": 1.0,
347
+ "rollout_train_corrupt_only": true,
348
+ "rollout_train_samplewise": false,
349
+ "rollout_train_compute_always": false,
350
+ "bridge_noise_init": "logistic_normal",
351
+ "noise_sigma": -1.0,
352
+ "allow_tf32": true,
353
+ "activation_checkpointing": false,
354
+ "activation_checkpoint_interval": 1,
355
+ "activation_checkpoint_scope": "block",
356
+ "ddp_static_graph": false,
357
+ "ddp_gradient_as_bucket_view": true,
358
+ "blocking_data_transfer": false,
359
+ "dataloader_prefetch_factor": 4,
360
+ "full_train_stats": false,
361
+ "tokenized_hf": false,
362
+ "tokenized_pad_token": "pad",
363
+ "elf_conditional_hf": false,
364
+ "record_pad_truncate": false,
365
+ "record_add_eos": false,
366
+ "record_add_special_tokens": false,
367
+ "record_pad_token": "pad",
368
+ "record_shuffle_buffer": 10000,
369
+ "wrap": true,
370
+ "wrap_mode": "stream",
371
+ "wrap_record_buffer_size": 200,
372
+ "owt_cached_chunks": true,
373
+ "owt_chunk_cache_dir": "/e2e-data/evad-tech-vla/wanghan58/data/small_benchmarks/langflow_2604_11748/openwebtext_lta_cached_chunks/gpt2_len256_train8_compact_overfit",
374
+ "owt_chunk_cache_rebuild": false,
375
+ "owt_chunk_cache_write_batch": 4096,
376
+ "owt_exact_repeat_per_chunk": 64,
377
+ "online_chunk_shuffle": false,
378
+ "online_chunk_shuffle_buffer": 10000,
379
+ "openwebtext_split": "train_minus_100k",
380
+ "detokenizer": "auto",
381
+ "resolved_detokenizer": null,
382
+ "num_workers": 0,
383
+ "latest_every": 1000,
384
+ "resume_path": "runs/train8_noisegeo_len256_allcorrupt_seqrand1_highC64_4096_20260517_163805/latest.pt"
385
+ }
386
+ step=1100 epoch=1100/2000 epoch_step=1/1 micro_steps=1100 elapsed=4.5s lr=2.000000e-03 loss=1.7211 loss_recon=1.7211 loss_meanflow=0.0000 mean_model_t=0.2083 mean_corrupt_t=0.2083 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.3085 corrupt_frac=1.0000 acc_corrupt=0.3085 loss_corrupt=1.7211 wrong_frac=0.9425 init_acc_corrupt=0.0583 acc_corrupt_t_0p0_0p2=0.2262 corrupt_frac_t_0p0_0p2=0.5559 acc_corrupt_t_0p2_0p4=0.3782 corrupt_frac_t_0p2_0p4=0.3589 acc_corrupt_t_0p4_0p6=0.5347 corrupt_frac_t_0p4_0p6=0.0773 acc_corrupt_t_0p6_0p8=0.7156 corrupt_frac_t_0p6_0p8=0.0121 out_w_norm=9.0381 out_g_norm=0.9623 loss_all=1.7155 init_gold_top10=0.0631 init_gold_top100=0.1527
387
+ step=1200 epoch=1200/2000 epoch_step=1/1 micro_steps=1200 elapsed=3.7s lr=2.000000e-03 loss=1.7046 loss_recon=1.7046 loss_meanflow=0.0000 mean_model_t=0.2108 mean_corrupt_t=0.2108 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.3124 corrupt_frac=1.0000 acc_corrupt=0.3124 loss_corrupt=1.7046 wrong_frac=0.9405 init_acc_corrupt=0.0603 acc_corrupt_t_0p0_0p2=0.2274 corrupt_frac_t_0p0_0p2=0.5516 acc_corrupt_t_0p2_0p4=0.3814 corrupt_frac_t_0p2_0p4=0.3621 acc_corrupt_t_0p4_0p6=0.5445 corrupt_frac_t_0p4_0p6=0.0781 acc_corrupt_t_0p6_0p8=0.7642 corrupt_frac_t_0p6_0p8=0.0123 out_w_norm=9.1050 out_g_norm=0.9067 acc_corrupt_t_0p8_1p0=1.0000 corrupt_frac_t_0p8_1p0=0.0078 loss_all=1.7225 init_gold_top10=0.0650 init_gold_top100=0.1524
388
+ step=1300 epoch=1300/2000 epoch_step=1/1 micro_steps=1300 elapsed=3.7s lr=2.000000e-03 loss=1.7004 loss_recon=1.7004 loss_meanflow=0.0000 mean_model_t=0.2067 mean_corrupt_t=0.2067 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.3108 corrupt_frac=1.0000 acc_corrupt=0.3108 loss_corrupt=1.7004 wrong_frac=0.9431 init_acc_corrupt=0.0578 acc_corrupt_t_0p0_0p2=0.2303 corrupt_frac_t_0p0_0p2=0.5641 acc_corrupt_t_0p2_0p4=0.3801 corrupt_frac_t_0p2_0p4=0.3542 acc_corrupt_t_0p4_0p6=0.5449 corrupt_frac_t_0p4_0p6=0.0734 acc_corrupt_t_0p6_0p8=0.7552 corrupt_frac_t_0p6_0p8=0.0121 out_w_norm=9.1632 out_g_norm=0.7826 acc_corrupt_t_0p8_1p0=1.0000 corrupt_frac_t_0p8_1p0=0.0078 loss_all=1.7046 init_gold_top10=0.0683 init_gold_top100=0.1548
389
+ step=1400 epoch=1400/2000 epoch_step=1/1 micro_steps=1400 elapsed=3.7s lr=2.000000e-03 loss=1.6861 loss_recon=1.6861 loss_meanflow=0.0000 mean_model_t=0.2085 mean_corrupt_t=0.2085 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.3134 corrupt_frac=1.0000 acc_corrupt=0.3134 loss_corrupt=1.6861 wrong_frac=0.9416 init_acc_corrupt=0.0592 acc_corrupt_t_0p0_0p2=0.2303 corrupt_frac_t_0p0_0p2=0.5564 acc_corrupt_t_0p2_0p4=0.3808 corrupt_frac_t_0p2_0p4=0.3620 acc_corrupt_t_0p4_0p6=0.5690 corrupt_frac_t_0p4_0p6=0.0719 out_w_norm=9.2246 out_g_norm=0.7817 acc_corrupt_t_0p6_0p8=0.6754 corrupt_frac_t_0p6_0p8=0.0131 acc_corrupt_t_0p8_1p0=0.6562 corrupt_frac_t_0p8_1p0=0.0078 loss_all=1.7625 init_gold_top10=0.0601 init_gold_top100=0.1488
390
+ step=1500 epoch=1500/2000 epoch_step=1/1 micro_steps=1500 elapsed=3.7s lr=2.000000e-03 loss=1.6856 loss_recon=1.6856 loss_meanflow=0.0000 mean_model_t=0.2071 mean_corrupt_t=0.2071 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.3134 corrupt_frac=1.0000 acc_corrupt=0.3134 loss_corrupt=1.6856 wrong_frac=0.9428 init_acc_corrupt=0.0580 acc_corrupt_t_0p0_0p2=0.2339 corrupt_frac_t_0p0_0p2=0.5632 acc_corrupt_t_0p2_0p4=0.3829 corrupt_frac_t_0p2_0p4=0.3546 acc_corrupt_t_0p4_0p6=0.5341 corrupt_frac_t_0p4_0p6=0.0745 acc_corrupt_t_0p6_0p8=0.7848 corrupt_frac_t_0p6_0p8=0.0118 acc_corrupt_t_0p8_1p0=1.0000 corrupt_frac_t_0p8_1p0=0.0078 out_w_norm=9.2788 out_g_norm=0.7222 loss_all=1.6723 init_gold_top10=0.0782 init_gold_top100=0.1620
391
+ step=1600 epoch=1600/2000 epoch_step=1/1 micro_steps=1600 elapsed=3.7s lr=2.000000e-03 loss=1.6759 loss_recon=1.6759 loss_meanflow=0.0000 mean_model_t=0.2074 mean_corrupt_t=0.2074 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.3176 corrupt_frac=1.0000 acc_corrupt=0.3176 loss_corrupt=1.6759 wrong_frac=0.9417 init_acc_corrupt=0.0591 acc_corrupt_t_0p0_0p2=0.2371 corrupt_frac_t_0p0_0p2=0.5617 acc_corrupt_t_0p2_0p4=0.3900 corrupt_frac_t_0p2_0p4=0.3570 acc_corrupt_t_0p4_0p6=0.5381 corrupt_frac_t_0p4_0p6=0.0729 acc_corrupt_t_0p6_0p8=0.7196 corrupt_frac_t_0p6_0p8=0.0124 out_w_norm=9.3244 out_g_norm=0.6543 acc_corrupt_t_0p8_1p0=1.0000 corrupt_frac_t_0p8_1p0=0.0078 loss_all=1.6061 init_gold_top10=0.0689 init_gold_top100=0.1557
392
+ step=1700 epoch=1700/2000 epoch_step=1/1 micro_steps=1700 elapsed=3.7s lr=2.000000e-03 loss=1.6719 loss_recon=1.6719 loss_meanflow=0.0000 mean_model_t=0.2094 mean_corrupt_t=0.2094 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.3168 corrupt_frac=1.0000 acc_corrupt=0.3168 loss_corrupt=1.6719 wrong_frac=0.9418 init_acc_corrupt=0.0590 acc_corrupt_t_0p0_0p2=0.2351 corrupt_frac_t_0p0_0p2=0.5550 acc_corrupt_t_0p2_0p4=0.3896 corrupt_frac_t_0p2_0p4=0.3587 acc_corrupt_t_0p4_0p6=0.5263 corrupt_frac_t_0p4_0p6=0.0781 acc_corrupt_t_0p6_0p8=0.6647 corrupt_frac_t_0p6_0p8=0.0129 out_w_norm=9.3665 out_g_norm=0.6528 acc_corrupt_t_0p8_1p0=1.0000 corrupt_frac_t_0p8_1p0=0.0078 loss_all=1.7500 init_gold_top10=0.0582 init_gold_top100=0.1457
393
+ step=1800 epoch=1800/2000 epoch_step=1/1 micro_steps=1800 elapsed=3.7s lr=2.000000e-03 loss=1.6724 loss_recon=1.6724 loss_meanflow=0.0000 mean_model_t=0.2099 mean_corrupt_t=0.2099 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.3168 corrupt_frac=1.0000 acc_corrupt=0.3168 loss_corrupt=1.6724 wrong_frac=0.9409 init_acc_corrupt=0.0600 acc_corrupt_t_0p0_0p2=0.2340 corrupt_frac_t_0p0_0p2=0.5548 acc_corrupt_t_0p2_0p4=0.3837 corrupt_frac_t_0p2_0p4=0.3585 acc_corrupt_t_0p4_0p6=0.5623 corrupt_frac_t_0p4_0p6=0.0790 out_w_norm=9.4222 out_g_norm=0.5894 acc_corrupt_t_0p6_0p8=0.6481 corrupt_frac_t_0p6_0p8=0.0130 acc_corrupt_t_0p8_1p0=1.0000 corrupt_frac_t_0p8_1p0=0.0078 loss_all=1.6828 init_gold_top10=0.0737 init_gold_top100=0.1602
394
+ step=1900 epoch=1900/2000 epoch_step=1/1 micro_steps=1900 elapsed=3.7s lr=2.000000e-03 loss=1.6636 loss_recon=1.6636 loss_meanflow=0.0000 mean_model_t=0.2107 mean_corrupt_t=0.2107 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.3220 corrupt_frac=1.0000 acc_corrupt=0.3220 loss_corrupt=1.6636 wrong_frac=0.9393 init_acc_corrupt=0.0616 acc_corrupt_t_0p0_0p2=0.2349 corrupt_frac_t_0p0_0p2=0.5521 acc_corrupt_t_0p2_0p4=0.3964 corrupt_frac_t_0p2_0p4=0.3599 acc_corrupt_t_0p4_0p6=0.5390 corrupt_frac_t_0p4_0p6=0.0788 acc_corrupt_t_0p6_0p8=0.7773 corrupt_frac_t_0p6_0p8=0.0133 out_w_norm=9.4541 out_g_norm=0.5804 acc_corrupt_t_0p8_1p0=0.6927 corrupt_frac_t_0p8_1p0=0.0078 loss_all=1.8339 init_gold_top10=0.0495 init_gold_top100=0.1383
395
+ step=2000 epoch=2000/2000 epoch_step=1/1 micro_steps=2000 elapsed=3.7s lr=2.000000e-03 loss=1.6564 loss_recon=1.6564 loss_meanflow=0.0000 mean_model_t=0.2112 mean_corrupt_t=0.2112 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.3238 corrupt_frac=1.0000 acc_corrupt=0.3238 loss_corrupt=1.6564 wrong_frac=0.9392 init_acc_corrupt=0.0616 acc_corrupt_t_0p0_0p2=0.2372 corrupt_frac_t_0p0_0p2=0.5516 acc_corrupt_t_0p2_0p4=0.3949 corrupt_frac_t_0p2_0p4=0.3570 acc_corrupt_t_0p4_0p6=0.5611 corrupt_frac_t_0p4_0p6=0.0810 out_w_norm=9.4934 out_g_norm=0.5113 acc_corrupt_t_0p6_0p8=0.6405 corrupt_frac_t_0p6_0p8=0.0137 acc_corrupt_t_0p8_1p0=0.4539 corrupt_frac_t_0p8_1p0=0.0078 loss_all=1.6741 init_gold_top10=0.0775 init_gold_top100=0.1654
396
+ NCCL version 2.25.1+cuda12.8
397
+ resumed_from=runs/train8_noisegeo_len256_allcorrupt_seqrand1_highC64_4096_20260517_163805/latest.pt start_step=2001
398
+ {
399
+ "device": "cuda:0",
400
+ "rank": 0,
401
+ "world_size": 4,
402
+ "samples": "owt_cached_chunks:8",
403
+ "vocab_size": 969,
404
+ "tokenizer_vocab_size": 50257,
405
+ "save_dir": "runs/train8_noisegeo_len256_allcorrupt_seqrand1_highC64_4096_20260517_163805",
406
+ "batch_size": 128,
407
+ "grad_accum": 1,
408
+ "effective_batch_size": 512,
409
+ "global_batch_size": 512,
410
+ "lr_schedule": "constant_warmup",
411
+ "optimizer": "muon",
412
+ "epochs": 0.0,
413
+ "steps_per_epoch": 1,
414
+ "total_steps": 3000,
415
+ "warmup_steps": 10,
416
+ "warmup_epochs": -1.0,
417
+ "min_lr": 0.0,
418
+ "weight_decay": 0.1,
419
+ "output_weight_decay": -1.0,
420
+ "adamw_param_groups": "nanogpt",
421
+ "adam_beta1": 0.9,
422
+ "adam_beta2": 0.95,
423
+ "adam_eps": 1e-08,
424
+ "muon_impl": "legacy",
425
+ "muon_momentum": 0.95,
426
+ "muon_ns_steps": 5,
427
+ "muon_update_scale": 1.0,
428
+ "muon_nesterov": false,
429
+ "muon_width_scale": false,
430
+ "muon_grouping": "legacy_dim_ge_2",
431
+ "muon_param_count": 1965440,
432
+ "muon_adam_param_count": 8192,
433
+ "muon_param_names": [
434
+ "vocab_embed.embedding",
435
+ "sigma_map.net.0.weight",
436
+ "sigma_map.net.2.weight",
437
+ "blocks.0.attn_qkv.weight",
438
+ "blocks.0.attn_out.weight",
439
+ "blocks.0.mlp.0.weight",
440
+ "blocks.0.mlp.2.weight",
441
+ "blocks.0.adaLN_modulation.weight",
442
+ "blocks.1.attn_qkv.weight",
443
+ "blocks.1.attn_out.weight",
444
+ "blocks.1.mlp.0.weight",
445
+ "blocks.1.mlp.2.weight",
446
+ "blocks.1.adaLN_modulation.weight",
447
+ "blocks.2.attn_qkv.weight",
448
+ "blocks.2.attn_out.weight",
449
+ "blocks.2.mlp.0.weight",
450
+ "blocks.2.mlp.2.weight",
451
+ "blocks.2.adaLN_modulation.weight",
452
+ "output_layer.linear.weight",
453
+ "output_layer.adaLN_modulation.weight"
454
+ ],
455
+ "muon_adam_param_names": [
456
+ "sigma_map.net.0.bias",
457
+ "sigma_map.net.2.bias",
458
+ "blocks.0.norm1.weight",
459
+ "blocks.0.norm2.weight",
460
+ "blocks.0.mlp.0.bias",
461
+ "blocks.0.mlp.2.bias",
462
+ "blocks.0.adaLN_modulation.bias",
463
+ "blocks.1.norm1.weight",
464
+ "blocks.1.norm2.weight",
465
+ "blocks.1.mlp.0.bias",
466
+ "blocks.1.mlp.2.bias",
467
+ "blocks.1.adaLN_modulation.bias",
468
+ "blocks.2.norm1.weight",
469
+ "blocks.2.norm2.weight",
470
+ "blocks.2.mlp.0.bias",
471
+ "blocks.2.mlp.2.bias",
472
+ "blocks.2.adaLN_modulation.bias",
473
+ "output_layer.norm_final.weight",
474
+ "output_layer.adaLN_modulation.bias"
475
+ ],
476
+ "muon_effective_nesterov": false,
477
+ "muon_effective_width_scale": false,
478
+ "muon_effective_weight_decay": 0.1,
479
+ "muon_adam_fallback_nesterov": false,
480
+ "muon_adam_fallback_weight_decay": 0.1,
481
+ "ema_decay": 0.9999,
482
+ "ema_start_step": 0,
483
+ "model_type": "ddit",
484
+ "ddit_mlp_type": "gelu",
485
+ "elf_num_time_tokens": 4,
486
+ "elf_num_model_mode_tokens": 0,
487
+ "qk_norm": true,
488
+ "output_bias": false,
489
+ "output_init_std": -1.0,
490
+ "norm_type": "rmsnorm",
491
+ "target_loss": "hard_ce",
492
+ "linear_soft_target_power": 1.0,
493
+ "linear_soft_target_min_conf": 0.0,
494
+ "linear_soft_target_max_conf": 1.0,
495
+ "t_sampling_mode": "logit_normal",
496
+ "t_sampling_power": 1.0,
497
+ "t_sampling_eps": 0.0001,
498
+ "t_sampling_logit_mean": -1.5,
499
+ "t_sampling_logit_std": 0.8,
500
+ "dual_t": true,
501
+ "corrupt_t_mode": "same",
502
+ "corrupt_min_t": 0.0,
503
+ "corrupt_max_t": 1.0,
504
+ "prefix_block_prob": 0.0,
505
+ "prefix_block_len": 128,
506
+ "mask_ratio_floor_schedule": "none",
507
+ "dirichlet_endpoint_mode": "categorical_dual_t",
508
+ "dirichlet_semantic_t_mode": "same",
509
+ "dirichlet_semantic_t_value": 0.0,
510
+ "dirichlet_semantic_t_curve": "linear",
511
+ "dirichlet_semantic_t_power": 1.0,
512
+ "endpoint_sequence_random_prob_alpha": 1.0,
513
+ "categorical_wrong_from_full_vocab": true,
514
+ "categorical_wrong_from_batch_valid_tokens": false,
515
+ "categorical_wrong_basin_token_ids": "",
516
+ "categorical_wrong_basin_prob": 0.0,
517
+ "categorical_wrong_unigram_prob": 0.0,
518
+ "categorical_wrong_uniform_prob": 0.0,
519
+ "categorical_wrong_corpus_unigram_path": "",
520
+ "categorical_wrong_corpus_unigram_alpha": 1.0,
521
+ "categorical_wrong_basin_shared_prob": 0.0,
522
+ "categorical_wrong_unigram_shared_prob": 0.0,
523
+ "mask_mixture_original_prob": 0.0,
524
+ "mask_mixture_lowk_prob": 0.0,
525
+ "mask_mixture_lowcorrupt_prob": 0.0,
526
+ "mask_mixture_block_prob": 0.0,
527
+ "mask_mixture_all_prob": 1.0,
528
+ "mask_mixture_lowk_clean_tokens": "0",
529
+ "mask_mixture_lowcorrupt_tokens": "1,2,4,8,16,32,64",
530
+ "mask_mixture_block_tokens": "64,128",
531
+ "simplex_bridge_sampler": "dirichlet",
532
+ "logistic_normal_sigma_min": 0.1,
533
+ "logistic_normal_sigma_max": 1.0,
534
+ "logistic_normal_tau_min": 1.0,
535
+ "logistic_normal_tau_max": 1.0,
536
+ "torch_compile": false,
537
+ "compile_mode": "max-autotune",
538
+ "state_format": "prob",
539
+ "meanflow_weight": 0.0,
540
+ "rollout_train_prob": 0.0,
541
+ "rollout_train_steps": 1,
542
+ "rollout_train_infer_steps": 64,
543
+ "rollout_train_temp": 1.45,
544
+ "rollout_train_max_gamma": 1.0,
545
+ "rollout_train_corrupt_only": true,
546
+ "rollout_train_samplewise": false,
547
+ "rollout_train_compute_always": false,
548
+ "bridge_noise_init": "logistic_normal",
549
+ "noise_sigma": -1.0,
550
+ "allow_tf32": true,
551
+ "activation_checkpointing": false,
552
+ "activation_checkpoint_interval": 1,
553
+ "activation_checkpoint_scope": "block",
554
+ "ddp_static_graph": false,
555
+ "ddp_gradient_as_bucket_view": true,
556
+ "blocking_data_transfer": false,
557
+ "dataloader_prefetch_factor": 4,
558
+ "full_train_stats": false,
559
+ "tokenized_hf": false,
560
+ "tokenized_pad_token": "pad",
561
+ "elf_conditional_hf": false,
562
+ "record_pad_truncate": false,
563
+ "record_add_eos": false,
564
+ "record_add_special_tokens": false,
565
+ "record_pad_token": "pad",
566
+ "record_shuffle_buffer": 10000,
567
+ "wrap": true,
568
+ "wrap_mode": "stream",
569
+ "wrap_record_buffer_size": 200,
570
+ "owt_cached_chunks": true,
571
+ "owt_chunk_cache_dir": "/e2e-data/evad-tech-vla/wanghan58/data/small_benchmarks/langflow_2604_11748/openwebtext_lta_cached_chunks/gpt2_len256_train8_compact_overfit",
572
+ "owt_chunk_cache_rebuild": false,
573
+ "owt_chunk_cache_write_batch": 4096,
574
+ "owt_exact_repeat_per_chunk": 64,
575
+ "online_chunk_shuffle": false,
576
+ "online_chunk_shuffle_buffer": 10000,
577
+ "openwebtext_split": "train_minus_100k",
578
+ "detokenizer": "auto",
579
+ "resolved_detokenizer": null,
580
+ "num_workers": 0,
581
+ "latest_every": 1000,
582
+ "resume_path": "runs/train8_noisegeo_len256_allcorrupt_seqrand1_highC64_4096_20260517_163805/latest.pt"
583
+ }
584
+ step=2100 epoch=2100/3000 epoch_step=1/1 micro_steps=2100 elapsed=4.4s lr=2.000000e-03 loss=1.6726 loss_recon=1.6726 loss_meanflow=0.0000 mean_model_t=0.2083 mean_corrupt_t=0.2083 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.3178 corrupt_frac=1.0000 acc_corrupt=0.3178 loss_corrupt=1.6726 wrong_frac=0.9425 init_acc_corrupt=0.0583 acc_corrupt_t_0p0_0p2=0.2400 corrupt_frac_t_0p0_0p2=0.5559 acc_corrupt_t_0p2_0p4=0.3821 corrupt_frac_t_0p2_0p4=0.3589 acc_corrupt_t_0p4_0p6=0.5375 corrupt_frac_t_0p4_0p6=0.0773 acc_corrupt_t_0p6_0p8=0.7159 corrupt_frac_t_0p6_0p8=0.0121 out_w_norm=9.5134 out_g_norm=0.4885 loss_all=1.6749 init_gold_top10=0.0631 init_gold_top100=0.1527
585
+ step=2200 epoch=2200/3000 epoch_step=1/1 micro_steps=2200 elapsed=3.7s lr=2.000000e-03 loss=1.6619 loss_recon=1.6619 loss_meanflow=0.0000 mean_model_t=0.2108 mean_corrupt_t=0.2108 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.3208 corrupt_frac=1.0000 acc_corrupt=0.3208 loss_corrupt=1.6619 wrong_frac=0.9405 init_acc_corrupt=0.0603 acc_corrupt_t_0p0_0p2=0.2395 corrupt_frac_t_0p0_0p2=0.5516 acc_corrupt_t_0p2_0p4=0.3854 corrupt_frac_t_0p2_0p4=0.3621 acc_corrupt_t_0p4_0p6=0.5474 corrupt_frac_t_0p4_0p6=0.0781 acc_corrupt_t_0p6_0p8=0.7683 corrupt_frac_t_0p6_0p8=0.0123 out_w_norm=9.5314 out_g_norm=0.4568 acc_corrupt_t_0p8_1p0=1.0000 corrupt_frac_t_0p8_1p0=0.0078 loss_all=1.6885 init_gold_top10=0.0650 init_gold_top100=0.1524
586
+ step=2300 epoch=2300/3000 epoch_step=1/1 micro_steps=2300 elapsed=3.7s lr=2.000000e-03 loss=1.6714 loss_recon=1.6714 loss_meanflow=0.0000 mean_model_t=0.2067 mean_corrupt_t=0.2067 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.3161 corrupt_frac=1.0000 acc_corrupt=0.3161 loss_corrupt=1.6714 wrong_frac=0.9431 init_acc_corrupt=0.0578 acc_corrupt_t_0p0_0p2=0.2387 corrupt_frac_t_0p0_0p2=0.5641 acc_corrupt_t_0p2_0p4=0.3815 corrupt_frac_t_0p2_0p4=0.3542 acc_corrupt_t_0p4_0p6=0.5451 corrupt_frac_t_0p4_0p6=0.0734 acc_corrupt_t_0p6_0p8=0.7514 corrupt_frac_t_0p6_0p8=0.0121 out_w_norm=9.5574 out_g_norm=0.4506 acc_corrupt_t_0p8_1p0=1.0000 corrupt_frac_t_0p8_1p0=0.0078 loss_all=1.6910 init_gold_top10=0.0683 init_gold_top100=0.1548
587
+ step=2400 epoch=2400/3000 epoch_step=1/1 micro_steps=2400 elapsed=3.7s lr=2.000000e-03 loss=1.6590 loss_recon=1.6590 loss_meanflow=0.0000 mean_model_t=0.2085 mean_corrupt_t=0.2085 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.3220 corrupt_frac=1.0000 acc_corrupt=0.3220 loss_corrupt=1.6590 wrong_frac=0.9416 init_acc_corrupt=0.0592 acc_corrupt_t_0p0_0p2=0.2425 corrupt_frac_t_0p0_0p2=0.5564 acc_corrupt_t_0p2_0p4=0.3856 corrupt_frac_t_0p2_0p4=0.3620 acc_corrupt_t_0p4_0p6=0.5687 corrupt_frac_t_0p4_0p6=0.0719 out_w_norm=9.5817 out_g_norm=0.4210 acc_corrupt_t_0p6_0p8=0.6772 corrupt_frac_t_0p6_0p8=0.0131 acc_corrupt_t_0p8_1p0=0.5488 corrupt_frac_t_0p8_1p0=0.0078 loss_all=1.7235 init_gold_top10=0.0601 init_gold_top100=0.1488
588
+ step=2500 epoch=2500/3000 epoch_step=1/1 micro_steps=2500 elapsed=3.7s lr=2.000000e-03 loss=1.6649 loss_recon=1.6649 loss_meanflow=0.0000 mean_model_t=0.2071 mean_corrupt_t=0.2071 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.3167 corrupt_frac=1.0000 acc_corrupt=0.3167 loss_corrupt=1.6649 wrong_frac=0.9428 init_acc_corrupt=0.0580 acc_corrupt_t_0p0_0p2=0.2389 corrupt_frac_t_0p0_0p2=0.5632 acc_corrupt_t_0p2_0p4=0.3837 corrupt_frac_t_0p2_0p4=0.3546 acc_corrupt_t_0p4_0p6=0.5370 corrupt_frac_t_0p4_0p6=0.0745 acc_corrupt_t_0p6_0p8=0.7832 corrupt_frac_t_0p6_0p8=0.0118 acc_corrupt_t_0p8_1p0=1.0000 corrupt_frac_t_0p8_1p0=0.0078 out_w_norm=9.6088 out_g_norm=0.4081 loss_all=1.6595 init_gold_top10=0.0782 init_gold_top100=0.1620
589
+ step=2600 epoch=2600/3000 epoch_step=1/1 micro_steps=2600 elapsed=3.7s lr=2.000000e-03 loss=1.6595 loss_recon=1.6595 loss_meanflow=0.0000 mean_model_t=0.2074 mean_corrupt_t=0.2074 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.3224 corrupt_frac=1.0000 acc_corrupt=0.3224 loss_corrupt=1.6595 wrong_frac=0.9417 init_acc_corrupt=0.0591 acc_corrupt_t_0p0_0p2=0.2448 corrupt_frac_t_0p0_0p2=0.5617 acc_corrupt_t_0p2_0p4=0.3911 corrupt_frac_t_0p2_0p4=0.3570 acc_corrupt_t_0p4_0p6=0.5374 corrupt_frac_t_0p4_0p6=0.0729 acc_corrupt_t_0p6_0p8=0.7199 corrupt_frac_t_0p6_0p8=0.0124 out_w_norm=9.6205 out_g_norm=0.4072 acc_corrupt_t_0p8_1p0=1.0000 corrupt_frac_t_0p8_1p0=0.0078 loss_all=1.6203 init_gold_top10=0.0689 init_gold_top100=0.1557
590
+ step=2700 epoch=2700/3000 epoch_step=1/1 micro_steps=2700 elapsed=3.7s lr=2.000000e-03 loss=1.6593 loss_recon=1.6593 loss_meanflow=0.0000 mean_model_t=0.2094 mean_corrupt_t=0.2094 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.3204 corrupt_frac=1.0000 acc_corrupt=0.3204 loss_corrupt=1.6593 wrong_frac=0.9418 init_acc_corrupt=0.0590 acc_corrupt_t_0p0_0p2=0.2406 corrupt_frac_t_0p0_0p2=0.5550 acc_corrupt_t_0p2_0p4=0.3903 corrupt_frac_t_0p2_0p4=0.3587 acc_corrupt_t_0p4_0p6=0.5292 corrupt_frac_t_0p4_0p6=0.0781 acc_corrupt_t_0p6_0p8=0.6661 corrupt_frac_t_0p6_0p8=0.0129 out_w_norm=9.6248 out_g_norm=0.3656 acc_corrupt_t_0p8_1p0=1.0000 corrupt_frac_t_0p8_1p0=0.0078 loss_all=1.7378 init_gold_top10=0.0582 init_gold_top100=0.1457
591
+ step=2800 epoch=2800/3000 epoch_step=1/1 micro_steps=2800 elapsed=3.7s lr=2.000000e-03 loss=1.6591 loss_recon=1.6591 loss_meanflow=0.0000 mean_model_t=0.2099 mean_corrupt_t=0.2099 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.3207 corrupt_frac=1.0000 acc_corrupt=0.3207 loss_corrupt=1.6591 wrong_frac=0.9409 init_acc_corrupt=0.0600 acc_corrupt_t_0p0_0p2=0.2394 corrupt_frac_t_0p0_0p2=0.5548 acc_corrupt_t_0p2_0p4=0.3857 corrupt_frac_t_0p2_0p4=0.3585 acc_corrupt_t_0p4_0p6=0.5637 corrupt_frac_t_0p4_0p6=0.0790 out_w_norm=9.6487 out_g_norm=0.3641 acc_corrupt_t_0p6_0p8=0.6496 corrupt_frac_t_0p6_0p8=0.0130 acc_corrupt_t_0p8_1p0=1.0000 corrupt_frac_t_0p8_1p0=0.0078 loss_all=1.6713 init_gold_top10=0.0737 init_gold_top100=0.1602
592
+ step=2900 epoch=2900/3000 epoch_step=1/1 micro_steps=2900 elapsed=3.7s lr=2.000000e-03 loss=1.6486 loss_recon=1.6486 loss_meanflow=0.0000 mean_model_t=0.2107 mean_corrupt_t=0.2107 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.3252 corrupt_frac=1.0000 acc_corrupt=0.3252 loss_corrupt=1.6486 wrong_frac=0.9393 init_acc_corrupt=0.0616 acc_corrupt_t_0p0_0p2=0.2400 corrupt_frac_t_0p0_0p2=0.5521 acc_corrupt_t_0p2_0p4=0.3971 corrupt_frac_t_0p2_0p4=0.3599 acc_corrupt_t_0p4_0p6=0.5415 corrupt_frac_t_0p4_0p6=0.0788 acc_corrupt_t_0p6_0p8=0.7717 corrupt_frac_t_0p6_0p8=0.0133 out_w_norm=9.6603 out_g_norm=0.3605 acc_corrupt_t_0p8_1p0=0.6758 corrupt_frac_t_0p8_1p0=0.0078 loss_all=1.8130 init_gold_top10=0.0495 init_gold_top100=0.1383
593
+ step=3000 epoch=3000/3000 epoch_step=1/1 micro_steps=3000 elapsed=3.7s lr=2.000000e-03 loss=1.6439 loss_recon=1.6439 loss_meanflow=0.0000 mean_model_t=0.2112 mean_corrupt_t=0.2112 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.3272 corrupt_frac=1.0000 acc_corrupt=0.3272 loss_corrupt=1.6439 wrong_frac=0.9392 init_acc_corrupt=0.0616 acc_corrupt_t_0p0_0p2=0.2428 corrupt_frac_t_0p0_0p2=0.5516 acc_corrupt_t_0p2_0p4=0.3948 corrupt_frac_t_0p2_0p4=0.3570 acc_corrupt_t_0p4_0p6=0.5639 corrupt_frac_t_0p4_0p6=0.0810 out_w_norm=9.6701 out_g_norm=0.3616 acc_corrupt_t_0p6_0p8=0.6396 corrupt_frac_t_0p6_0p8=0.0137 acc_corrupt_t_0p8_1p0=0.4609 corrupt_frac_t_0p8_1p0=0.0078 loss_all=1.6859 init_gold_top10=0.0775 init_gold_top100=0.1654
594
+ NCCL version 2.25.1+cuda12.8
595
+ resumed_from=runs/train8_noisegeo_len256_allcorrupt_seqrand1_highC64_4096_20260517_163805/latest.pt start_step=3001
596
+ {
597
+ "device": "cuda:0",
598
+ "rank": 0,
599
+ "world_size": 4,
600
+ "samples": "owt_cached_chunks:8",
601
+ "vocab_size": 969,
602
+ "tokenizer_vocab_size": 50257,
603
+ "save_dir": "runs/train8_noisegeo_len256_allcorrupt_seqrand1_highC64_4096_20260517_163805",
604
+ "batch_size": 128,
605
+ "grad_accum": 1,
606
+ "effective_batch_size": 512,
607
+ "global_batch_size": 512,
608
+ "lr_schedule": "constant_warmup",
609
+ "optimizer": "muon",
610
+ "epochs": 0.0,
611
+ "steps_per_epoch": 1,
612
+ "total_steps": 4000,
613
+ "warmup_steps": 10,
614
+ "warmup_epochs": -1.0,
615
+ "min_lr": 0.0,
616
+ "weight_decay": 0.1,
617
+ "output_weight_decay": -1.0,
618
+ "adamw_param_groups": "nanogpt",
619
+ "adam_beta1": 0.9,
620
+ "adam_beta2": 0.95,
621
+ "adam_eps": 1e-08,
622
+ "muon_impl": "legacy",
623
+ "muon_momentum": 0.95,
624
+ "muon_ns_steps": 5,
625
+ "muon_update_scale": 1.0,
626
+ "muon_nesterov": false,
627
+ "muon_width_scale": false,
628
+ "muon_grouping": "legacy_dim_ge_2",
629
+ "muon_param_count": 1965440,
630
+ "muon_adam_param_count": 8192,
631
+ "muon_param_names": [
632
+ "vocab_embed.embedding",
633
+ "sigma_map.net.0.weight",
634
+ "sigma_map.net.2.weight",
635
+ "blocks.0.attn_qkv.weight",
636
+ "blocks.0.attn_out.weight",
637
+ "blocks.0.mlp.0.weight",
638
+ "blocks.0.mlp.2.weight",
639
+ "blocks.0.adaLN_modulation.weight",
640
+ "blocks.1.attn_qkv.weight",
641
+ "blocks.1.attn_out.weight",
642
+ "blocks.1.mlp.0.weight",
643
+ "blocks.1.mlp.2.weight",
644
+ "blocks.1.adaLN_modulation.weight",
645
+ "blocks.2.attn_qkv.weight",
646
+ "blocks.2.attn_out.weight",
647
+ "blocks.2.mlp.0.weight",
648
+ "blocks.2.mlp.2.weight",
649
+ "blocks.2.adaLN_modulation.weight",
650
+ "output_layer.linear.weight",
651
+ "output_layer.adaLN_modulation.weight"
652
+ ],
653
+ "muon_adam_param_names": [
654
+ "sigma_map.net.0.bias",
655
+ "sigma_map.net.2.bias",
656
+ "blocks.0.norm1.weight",
657
+ "blocks.0.norm2.weight",
658
+ "blocks.0.mlp.0.bias",
659
+ "blocks.0.mlp.2.bias",
660
+ "blocks.0.adaLN_modulation.bias",
661
+ "blocks.1.norm1.weight",
662
+ "blocks.1.norm2.weight",
663
+ "blocks.1.mlp.0.bias",
664
+ "blocks.1.mlp.2.bias",
665
+ "blocks.1.adaLN_modulation.bias",
666
+ "blocks.2.norm1.weight",
667
+ "blocks.2.norm2.weight",
668
+ "blocks.2.mlp.0.bias",
669
+ "blocks.2.mlp.2.bias",
670
+ "blocks.2.adaLN_modulation.bias",
671
+ "output_layer.norm_final.weight",
672
+ "output_layer.adaLN_modulation.bias"
673
+ ],
674
+ "muon_effective_nesterov": false,
675
+ "muon_effective_width_scale": false,
676
+ "muon_effective_weight_decay": 0.1,
677
+ "muon_adam_fallback_nesterov": false,
678
+ "muon_adam_fallback_weight_decay": 0.1,
679
+ "ema_decay": 0.9999,
680
+ "ema_start_step": 0,
681
+ "model_type": "ddit",
682
+ "ddit_mlp_type": "gelu",
683
+ "elf_num_time_tokens": 4,
684
+ "elf_num_model_mode_tokens": 0,
685
+ "qk_norm": true,
686
+ "output_bias": false,
687
+ "output_init_std": -1.0,
688
+ "norm_type": "rmsnorm",
689
+ "target_loss": "hard_ce",
690
+ "linear_soft_target_power": 1.0,
691
+ "linear_soft_target_min_conf": 0.0,
692
+ "linear_soft_target_max_conf": 1.0,
693
+ "t_sampling_mode": "logit_normal",
694
+ "t_sampling_power": 1.0,
695
+ "t_sampling_eps": 0.0001,
696
+ "t_sampling_logit_mean": -1.5,
697
+ "t_sampling_logit_std": 0.8,
698
+ "dual_t": true,
699
+ "corrupt_t_mode": "same",
700
+ "corrupt_min_t": 0.0,
701
+ "corrupt_max_t": 1.0,
702
+ "prefix_block_prob": 0.0,
703
+ "prefix_block_len": 128,
704
+ "mask_ratio_floor_schedule": "none",
705
+ "dirichlet_endpoint_mode": "categorical_dual_t",
706
+ "dirichlet_semantic_t_mode": "same",
707
+ "dirichlet_semantic_t_value": 0.0,
708
+ "dirichlet_semantic_t_curve": "linear",
709
+ "dirichlet_semantic_t_power": 1.0,
710
+ "endpoint_sequence_random_prob_alpha": 1.0,
711
+ "categorical_wrong_from_full_vocab": true,
712
+ "categorical_wrong_from_batch_valid_tokens": false,
713
+ "categorical_wrong_basin_token_ids": "",
714
+ "categorical_wrong_basin_prob": 0.0,
715
+ "categorical_wrong_unigram_prob": 0.0,
716
+ "categorical_wrong_uniform_prob": 0.0,
717
+ "categorical_wrong_corpus_unigram_path": "",
718
+ "categorical_wrong_corpus_unigram_alpha": 1.0,
719
+ "categorical_wrong_basin_shared_prob": 0.0,
720
+ "categorical_wrong_unigram_shared_prob": 0.0,
721
+ "mask_mixture_original_prob": 0.0,
722
+ "mask_mixture_lowk_prob": 0.0,
723
+ "mask_mixture_lowcorrupt_prob": 0.0,
724
+ "mask_mixture_block_prob": 0.0,
725
+ "mask_mixture_all_prob": 1.0,
726
+ "mask_mixture_lowk_clean_tokens": "0",
727
+ "mask_mixture_lowcorrupt_tokens": "1,2,4,8,16,32,64",
728
+ "mask_mixture_block_tokens": "64,128",
729
+ "simplex_bridge_sampler": "dirichlet",
730
+ "logistic_normal_sigma_min": 0.1,
731
+ "logistic_normal_sigma_max": 1.0,
732
+ "logistic_normal_tau_min": 1.0,
733
+ "logistic_normal_tau_max": 1.0,
734
+ "torch_compile": false,
735
+ "compile_mode": "max-autotune",
736
+ "state_format": "prob",
737
+ "meanflow_weight": 0.0,
738
+ "rollout_train_prob": 0.0,
739
+ "rollout_train_steps": 1,
740
+ "rollout_train_infer_steps": 64,
741
+ "rollout_train_temp": 1.45,
742
+ "rollout_train_max_gamma": 1.0,
743
+ "rollout_train_corrupt_only": true,
744
+ "rollout_train_samplewise": false,
745
+ "rollout_train_compute_always": false,
746
+ "bridge_noise_init": "logistic_normal",
747
+ "noise_sigma": -1.0,
748
+ "allow_tf32": true,
749
+ "activation_checkpointing": false,
750
+ "activation_checkpoint_interval": 1,
751
+ "activation_checkpoint_scope": "block",
752
+ "ddp_static_graph": false,
753
+ "ddp_gradient_as_bucket_view": true,
754
+ "blocking_data_transfer": false,
755
+ "dataloader_prefetch_factor": 4,
756
+ "full_train_stats": false,
757
+ "tokenized_hf": false,
758
+ "tokenized_pad_token": "pad",
759
+ "elf_conditional_hf": false,
760
+ "record_pad_truncate": false,
761
+ "record_add_eos": false,
762
+ "record_add_special_tokens": false,
763
+ "record_pad_token": "pad",
764
+ "record_shuffle_buffer": 10000,
765
+ "wrap": true,
766
+ "wrap_mode": "stream",
767
+ "wrap_record_buffer_size": 200,
768
+ "owt_cached_chunks": true,
769
+ "owt_chunk_cache_dir": "/e2e-data/evad-tech-vla/wanghan58/data/small_benchmarks/langflow_2604_11748/openwebtext_lta_cached_chunks/gpt2_len256_train8_compact_overfit",
770
+ "owt_chunk_cache_rebuild": false,
771
+ "owt_chunk_cache_write_batch": 4096,
772
+ "owt_exact_repeat_per_chunk": 64,
773
+ "online_chunk_shuffle": false,
774
+ "online_chunk_shuffle_buffer": 10000,
775
+ "openwebtext_split": "train_minus_100k",
776
+ "detokenizer": "auto",
777
+ "resolved_detokenizer": null,
778
+ "num_workers": 0,
779
+ "latest_every": 1000,
780
+ "resume_path": "runs/train8_noisegeo_len256_allcorrupt_seqrand1_highC64_4096_20260517_163805/latest.pt"
781
+ }
782
+ step=3100 epoch=3100/4000 epoch_step=1/1 micro_steps=3100 elapsed=4.4s lr=2.000000e-03 loss=1.6606 loss_recon=1.6606 loss_meanflow=0.0000 mean_model_t=0.2083 mean_corrupt_t=0.2083 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.3197 corrupt_frac=1.0000 acc_corrupt=0.3197 loss_corrupt=1.6606 wrong_frac=0.9425 init_acc_corrupt=0.0583 acc_corrupt_t_0p0_0p2=0.2435 corrupt_frac_t_0p0_0p2=0.5559 acc_corrupt_t_0p2_0p4=0.3817 corrupt_frac_t_0p2_0p4=0.3589 acc_corrupt_t_0p4_0p6=0.5393 corrupt_frac_t_0p4_0p6=0.0773 acc_corrupt_t_0p6_0p8=0.7156 corrupt_frac_t_0p6_0p8=0.0121 out_w_norm=9.6867 out_g_norm=0.3523 loss_all=1.6715 init_gold_top10=0.0631 init_gold_top100=0.1527
783
+ step=3200 epoch=3200/4000 epoch_step=1/1 micro_steps=3200 elapsed=3.8s lr=2.000000e-03 loss=1.6522 loss_recon=1.6522 loss_meanflow=0.0000 mean_model_t=0.2108 mean_corrupt_t=0.2108 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.3230 corrupt_frac=1.0000 acc_corrupt=0.3230 loss_corrupt=1.6522 wrong_frac=0.9405 init_acc_corrupt=0.0603 acc_corrupt_t_0p0_0p2=0.2425 corrupt_frac_t_0p0_0p2=0.5516 acc_corrupt_t_0p2_0p4=0.3865 corrupt_frac_t_0p2_0p4=0.3621 acc_corrupt_t_0p4_0p6=0.5494 corrupt_frac_t_0p4_0p6=0.0781 acc_corrupt_t_0p6_0p8=0.7744 corrupt_frac_t_0p6_0p8=0.0123 out_w_norm=9.6869 out_g_norm=0.3688 acc_corrupt_t_0p8_1p0=1.0000 corrupt_frac_t_0p8_1p0=0.0078 loss_all=1.7025 init_gold_top10=0.0650 init_gold_top100=0.1524
784
+ step=3300 epoch=3300/4000 epoch_step=1/1 micro_steps=3300 elapsed=3.7s lr=2.000000e-03 loss=1.6672 loss_recon=1.6672 loss_meanflow=0.0000 mean_model_t=0.2067 mean_corrupt_t=0.2067 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.3197 corrupt_frac=1.0000 acc_corrupt=0.3197 loss_corrupt=1.6672 wrong_frac=0.9431 init_acc_corrupt=0.0578 acc_corrupt_t_0p0_0p2=0.2449 corrupt_frac_t_0p0_0p2=0.5641 acc_corrupt_t_0p2_0p4=0.3818 corrupt_frac_t_0p2_0p4=0.3542 acc_corrupt_t_0p4_0p6=0.5467 corrupt_frac_t_0p4_0p6=0.0734 acc_corrupt_t_0p6_0p8=0.7447 corrupt_frac_t_0p6_0p8=0.0121 out_w_norm=9.6901 out_g_norm=0.3679 acc_corrupt_t_0p8_1p0=1.0000 corrupt_frac_t_0p8_1p0=0.0078 loss_all=1.6815 init_gold_top10=0.0683 init_gold_top100=0.1548
785
+ step=3400 epoch=3400/4000 epoch_step=1/1 micro_steps=3400 elapsed=3.7s lr=2.000000e-03 loss=1.6520 loss_recon=1.6520 loss_meanflow=0.0000 mean_model_t=0.2085 mean_corrupt_t=0.2085 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.3248 corrupt_frac=1.0000 acc_corrupt=0.3248 loss_corrupt=1.6520 wrong_frac=0.9416 init_acc_corrupt=0.0592 acc_corrupt_t_0p0_0p2=0.2473 corrupt_frac_t_0p0_0p2=0.5564 acc_corrupt_t_0p2_0p4=0.3858 corrupt_frac_t_0p2_0p4=0.3620 acc_corrupt_t_0p4_0p6=0.5700 corrupt_frac_t_0p4_0p6=0.0719 out_w_norm=9.7106 out_g_norm=0.3385 acc_corrupt_t_0p6_0p8=0.6763 corrupt_frac_t_0p6_0p8=0.0131 acc_corrupt_t_0p8_1p0=0.5566 corrupt_frac_t_0p8_1p0=0.0078 loss_all=1.7008 init_gold_top10=0.0601 init_gold_top100=0.1488
786
+ step=3500 epoch=3500/4000 epoch_step=1/1 micro_steps=3500 elapsed=3.8s lr=2.000000e-03 loss=1.6581 loss_recon=1.6581 loss_meanflow=0.0000 mean_model_t=0.2071 mean_corrupt_t=0.2071 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.3199 corrupt_frac=1.0000 acc_corrupt=0.3199 loss_corrupt=1.6581 wrong_frac=0.9428 init_acc_corrupt=0.0580 acc_corrupt_t_0p0_0p2=0.2430 corrupt_frac_t_0p0_0p2=0.5632 acc_corrupt_t_0p2_0p4=0.3864 corrupt_frac_t_0p2_0p4=0.3546 acc_corrupt_t_0p4_0p6=0.5350 corrupt_frac_t_0p4_0p6=0.0745 acc_corrupt_t_0p6_0p8=0.7869 corrupt_frac_t_0p6_0p8=0.0118 acc_corrupt_t_0p8_1p0=1.0000 corrupt_frac_t_0p8_1p0=0.0078 out_w_norm=9.7001 out_g_norm=0.3208 loss_all=1.6670 init_gold_top10=0.0782 init_gold_top100=0.1620
787
+ step=3600 epoch=3600/4000 epoch_step=1/1 micro_steps=3600 elapsed=3.7s lr=2.000000e-03 loss=1.6474 loss_recon=1.6474 loss_meanflow=0.0000 mean_model_t=0.2074 mean_corrupt_t=0.2074 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.3261 corrupt_frac=1.0000 acc_corrupt=0.3261 loss_corrupt=1.6474 wrong_frac=0.9417 init_acc_corrupt=0.0591 acc_corrupt_t_0p0_0p2=0.2498 corrupt_frac_t_0p0_0p2=0.5617 acc_corrupt_t_0p2_0p4=0.3934 corrupt_frac_t_0p2_0p4=0.3570 acc_corrupt_t_0p4_0p6=0.5381 corrupt_frac_t_0p4_0p6=0.0729 acc_corrupt_t_0p6_0p8=0.7225 corrupt_frac_t_0p6_0p8=0.0124 out_w_norm=9.7103 out_g_norm=0.3223 acc_corrupt_t_0p8_1p0=1.0000 corrupt_frac_t_0p8_1p0=0.0078 loss_all=1.5876 init_gold_top10=0.0689 init_gold_top100=0.1557
788
+ step=3700 epoch=3700/4000 epoch_step=1/1 micro_steps=3700 elapsed=3.7s lr=2.000000e-03 loss=1.6534 loss_recon=1.6534 loss_meanflow=0.0000 mean_model_t=0.2094 mean_corrupt_t=0.2094 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.3231 corrupt_frac=1.0000 acc_corrupt=0.3231 loss_corrupt=1.6534 wrong_frac=0.9418 init_acc_corrupt=0.0590 acc_corrupt_t_0p0_0p2=0.2448 corrupt_frac_t_0p0_0p2=0.5550 acc_corrupt_t_0p2_0p4=0.3913 corrupt_frac_t_0p2_0p4=0.3587 acc_corrupt_t_0p4_0p6=0.5295 corrupt_frac_t_0p4_0p6=0.0781 acc_corrupt_t_0p6_0p8=0.6687 corrupt_frac_t_0p6_0p8=0.0129 out_w_norm=9.7209 out_g_norm=0.3219 acc_corrupt_t_0p8_1p0=1.0000 corrupt_frac_t_0p8_1p0=0.0078 loss_all=1.7383 init_gold_top10=0.0582 init_gold_top100=0.1457
789
+ step=3800 epoch=3800/4000 epoch_step=1/1 micro_steps=3800 elapsed=3.7s lr=2.000000e-03 loss=1.6621 loss_recon=1.6621 loss_meanflow=0.0000 mean_model_t=0.2099 mean_corrupt_t=0.2099 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.3219 corrupt_frac=1.0000 acc_corrupt=0.3219 loss_corrupt=1.6621 wrong_frac=0.9409 init_acc_corrupt=0.0600 acc_corrupt_t_0p0_0p2=0.2412 corrupt_frac_t_0p0_0p2=0.5548 acc_corrupt_t_0p2_0p4=0.3868 corrupt_frac_t_0p2_0p4=0.3585 acc_corrupt_t_0p4_0p6=0.5621 corrupt_frac_t_0p4_0p6=0.0790 out_w_norm=9.7221 out_g_norm=0.3630 acc_corrupt_t_0p6_0p8=0.6486 corrupt_frac_t_0p6_0p8=0.0130 acc_corrupt_t_0p8_1p0=1.0000 corrupt_frac_t_0p8_1p0=0.0078 loss_all=1.6608 init_gold_top10=0.0737 init_gold_top100=0.1602
790
+ step=3900 epoch=3900/4000 epoch_step=1/1 micro_steps=3900 elapsed=3.7s lr=2.000000e-03 loss=1.6427 loss_recon=1.6427 loss_meanflow=0.0000 mean_model_t=0.2107 mean_corrupt_t=0.2107 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.3277 corrupt_frac=1.0000 acc_corrupt=0.3277 loss_corrupt=1.6427 wrong_frac=0.9393 init_acc_corrupt=0.0616 acc_corrupt_t_0p0_0p2=0.2444 corrupt_frac_t_0p0_0p2=0.5521 acc_corrupt_t_0p2_0p4=0.3978 corrupt_frac_t_0p2_0p4=0.3599 acc_corrupt_t_0p4_0p6=0.5402 corrupt_frac_t_0p4_0p6=0.0788 acc_corrupt_t_0p6_0p8=0.7732 corrupt_frac_t_0p6_0p8=0.0133 out_w_norm=9.7222 out_g_norm=0.3189 acc_corrupt_t_0p8_1p0=0.6745 corrupt_frac_t_0p8_1p0=0.0078 loss_all=1.8146 init_gold_top10=0.0495 init_gold_top100=0.1383
791
+ step=4000 epoch=4000/4000 epoch_step=1/1 micro_steps=4000 elapsed=3.7s lr=2.000000e-03 loss=1.6363 loss_recon=1.6363 loss_meanflow=0.0000 mean_model_t=0.2112 mean_corrupt_t=0.2112 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.3306 corrupt_frac=1.0000 acc_corrupt=0.3306 loss_corrupt=1.6363 wrong_frac=0.9392 init_acc_corrupt=0.0616 acc_corrupt_t_0p0_0p2=0.2488 corrupt_frac_t_0p0_0p2=0.5516 acc_corrupt_t_0p2_0p4=0.3953 corrupt_frac_t_0p2_0p4=0.3570 acc_corrupt_t_0p4_0p6=0.5641 corrupt_frac_t_0p4_0p6=0.0810 out_w_norm=9.7269 out_g_norm=0.3189 acc_corrupt_t_0p6_0p8=0.6353 corrupt_frac_t_0p6_0p8=0.0137 acc_corrupt_t_0p8_1p0=0.4656 corrupt_frac_t_0p8_1p0=0.0078 loss_all=1.6461 init_gold_top10=0.0775 init_gold_top100=0.1654
LTA_openwebtext_dualt/logs/softendpoint_mn_pilot_4gpu/train8_rollin_len256_rollin_p10_s4_i32_20260517_171654.log ADDED
@@ -0,0 +1,396 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ NCCL version 2.25.1+cuda12.8
2
+ {
3
+ "device": "cuda:0",
4
+ "rank": 0,
5
+ "world_size": 4,
6
+ "samples": "owt_cached_chunks:8",
7
+ "vocab_size": 969,
8
+ "tokenizer_vocab_size": 50257,
9
+ "save_dir": "runs/train8_rollin_len256_rollin_p10_s4_i32_20260517_171654",
10
+ "batch_size": 128,
11
+ "grad_accum": 1,
12
+ "effective_batch_size": 512,
13
+ "global_batch_size": 512,
14
+ "lr_schedule": "constant_warmup",
15
+ "optimizer": "muon",
16
+ "epochs": 0.0,
17
+ "steps_per_epoch": 1,
18
+ "total_steps": 1000,
19
+ "warmup_steps": 10,
20
+ "warmup_epochs": -1.0,
21
+ "min_lr": 0.0,
22
+ "weight_decay": 0.1,
23
+ "output_weight_decay": -1.0,
24
+ "adamw_param_groups": "nanogpt",
25
+ "adam_beta1": 0.9,
26
+ "adam_beta2": 0.95,
27
+ "adam_eps": 1e-08,
28
+ "muon_impl": "legacy",
29
+ "muon_momentum": 0.95,
30
+ "muon_ns_steps": 5,
31
+ "muon_update_scale": 1.0,
32
+ "muon_nesterov": false,
33
+ "muon_width_scale": false,
34
+ "muon_grouping": "legacy_dim_ge_2",
35
+ "muon_param_count": 1965440,
36
+ "muon_adam_param_count": 8192,
37
+ "muon_param_names": [
38
+ "vocab_embed.embedding",
39
+ "sigma_map.net.0.weight",
40
+ "sigma_map.net.2.weight",
41
+ "blocks.0.attn_qkv.weight",
42
+ "blocks.0.attn_out.weight",
43
+ "blocks.0.mlp.0.weight",
44
+ "blocks.0.mlp.2.weight",
45
+ "blocks.0.adaLN_modulation.weight",
46
+ "blocks.1.attn_qkv.weight",
47
+ "blocks.1.attn_out.weight",
48
+ "blocks.1.mlp.0.weight",
49
+ "blocks.1.mlp.2.weight",
50
+ "blocks.1.adaLN_modulation.weight",
51
+ "blocks.2.attn_qkv.weight",
52
+ "blocks.2.attn_out.weight",
53
+ "blocks.2.mlp.0.weight",
54
+ "blocks.2.mlp.2.weight",
55
+ "blocks.2.adaLN_modulation.weight",
56
+ "output_layer.linear.weight",
57
+ "output_layer.adaLN_modulation.weight"
58
+ ],
59
+ "muon_adam_param_names": [
60
+ "sigma_map.net.0.bias",
61
+ "sigma_map.net.2.bias",
62
+ "blocks.0.norm1.weight",
63
+ "blocks.0.norm2.weight",
64
+ "blocks.0.mlp.0.bias",
65
+ "blocks.0.mlp.2.bias",
66
+ "blocks.0.adaLN_modulation.bias",
67
+ "blocks.1.norm1.weight",
68
+ "blocks.1.norm2.weight",
69
+ "blocks.1.mlp.0.bias",
70
+ "blocks.1.mlp.2.bias",
71
+ "blocks.1.adaLN_modulation.bias",
72
+ "blocks.2.norm1.weight",
73
+ "blocks.2.norm2.weight",
74
+ "blocks.2.mlp.0.bias",
75
+ "blocks.2.mlp.2.bias",
76
+ "blocks.2.adaLN_modulation.bias",
77
+ "output_layer.norm_final.weight",
78
+ "output_layer.adaLN_modulation.bias"
79
+ ],
80
+ "muon_effective_nesterov": false,
81
+ "muon_effective_width_scale": false,
82
+ "muon_effective_weight_decay": 0.1,
83
+ "muon_adam_fallback_nesterov": false,
84
+ "muon_adam_fallback_weight_decay": 0.1,
85
+ "ema_decay": 0.9999,
86
+ "ema_start_step": 0,
87
+ "model_type": "ddit",
88
+ "ddit_mlp_type": "gelu",
89
+ "elf_num_time_tokens": 4,
90
+ "elf_num_model_mode_tokens": 0,
91
+ "qk_norm": true,
92
+ "output_bias": false,
93
+ "output_init_std": -1.0,
94
+ "norm_type": "rmsnorm",
95
+ "target_loss": "hard_ce",
96
+ "linear_soft_target_power": 1.0,
97
+ "linear_soft_target_min_conf": 0.0,
98
+ "linear_soft_target_max_conf": 1.0,
99
+ "t_sampling_mode": "logit_normal",
100
+ "t_sampling_power": 1.0,
101
+ "t_sampling_eps": 0.0001,
102
+ "t_sampling_logit_mean": -1.5,
103
+ "t_sampling_logit_std": 0.8,
104
+ "dual_t": true,
105
+ "corrupt_t_mode": "same",
106
+ "corrupt_min_t": 0.0,
107
+ "corrupt_max_t": 1.0,
108
+ "prefix_block_prob": 0.0,
109
+ "prefix_block_len": 128,
110
+ "mask_ratio_floor_schedule": "none",
111
+ "dirichlet_endpoint_mode": "categorical_dual_t",
112
+ "dirichlet_semantic_t_mode": "same",
113
+ "dirichlet_semantic_t_value": 0.0,
114
+ "dirichlet_semantic_t_curve": "linear",
115
+ "dirichlet_semantic_t_power": 1.0,
116
+ "endpoint_sequence_random_prob_alpha": 0.0,
117
+ "categorical_wrong_from_full_vocab": true,
118
+ "categorical_wrong_from_batch_valid_tokens": false,
119
+ "categorical_wrong_basin_token_ids": "",
120
+ "categorical_wrong_basin_prob": 0.0,
121
+ "categorical_wrong_unigram_prob": 0.0,
122
+ "categorical_wrong_uniform_prob": 0.0,
123
+ "categorical_wrong_corpus_unigram_path": "",
124
+ "categorical_wrong_corpus_unigram_alpha": 1.0,
125
+ "categorical_wrong_basin_shared_prob": 0.0,
126
+ "categorical_wrong_unigram_shared_prob": 0.0,
127
+ "mask_mixture_original_prob": 0.0,
128
+ "mask_mixture_lowk_prob": 0.0,
129
+ "mask_mixture_lowcorrupt_prob": 0.0,
130
+ "mask_mixture_block_prob": 0.0,
131
+ "mask_mixture_all_prob": 1.0,
132
+ "mask_mixture_lowk_clean_tokens": "0",
133
+ "mask_mixture_lowcorrupt_tokens": "1,2,4,8,16,32,64",
134
+ "mask_mixture_block_tokens": "64,128",
135
+ "simplex_bridge_sampler": "dirichlet",
136
+ "logistic_normal_sigma_min": 0.1,
137
+ "logistic_normal_sigma_max": 1.0,
138
+ "logistic_normal_tau_min": 1.0,
139
+ "logistic_normal_tau_max": 1.0,
140
+ "torch_compile": false,
141
+ "compile_mode": "max-autotune",
142
+ "state_format": "prob",
143
+ "meanflow_weight": 0.0,
144
+ "rollout_train_prob": 0.1,
145
+ "rollout_train_steps": 4,
146
+ "rollout_train_infer_steps": 32,
147
+ "rollout_train_temp": 1.45,
148
+ "rollout_train_max_gamma": 1.0,
149
+ "rollout_train_corrupt_only": true,
150
+ "rollout_train_samplewise": true,
151
+ "rollout_train_compute_always": false,
152
+ "bridge_noise_init": "logistic_normal",
153
+ "noise_sigma": -1.0,
154
+ "allow_tf32": true,
155
+ "activation_checkpointing": false,
156
+ "activation_checkpoint_interval": 1,
157
+ "activation_checkpoint_scope": "block",
158
+ "ddp_static_graph": false,
159
+ "ddp_gradient_as_bucket_view": true,
160
+ "blocking_data_transfer": false,
161
+ "dataloader_prefetch_factor": 4,
162
+ "full_train_stats": false,
163
+ "tokenized_hf": false,
164
+ "tokenized_pad_token": "pad",
165
+ "elf_conditional_hf": false,
166
+ "record_pad_truncate": false,
167
+ "record_add_eos": false,
168
+ "record_add_special_tokens": false,
169
+ "record_pad_token": "pad",
170
+ "record_shuffle_buffer": 10000,
171
+ "wrap": true,
172
+ "wrap_mode": "stream",
173
+ "wrap_record_buffer_size": 200,
174
+ "owt_cached_chunks": true,
175
+ "owt_chunk_cache_dir": "/e2e-data/evad-tech-vla/wanghan58/data/small_benchmarks/langflow_2604_11748/openwebtext_lta_cached_chunks/gpt2_len256_train8_compact_overfit",
176
+ "owt_chunk_cache_rebuild": false,
177
+ "owt_chunk_cache_write_batch": 4096,
178
+ "owt_exact_repeat_per_chunk": 64,
179
+ "online_chunk_shuffle": false,
180
+ "online_chunk_shuffle_buffer": 10000,
181
+ "openwebtext_split": "train_minus_100k",
182
+ "detokenizer": "auto",
183
+ "resolved_detokenizer": null,
184
+ "num_workers": 0,
185
+ "latest_every": 1000,
186
+ "resume_path": ""
187
+ }
188
+ step=100 epoch=100/1000 epoch_step=1/1 micro_steps=100 elapsed=8.0s lr=2.000000e-03 loss=6.7067 loss_recon=6.7067 loss_meanflow=0.0000 mean_model_t=0.2083 mean_corrupt_t=0.2083 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.1023 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.0999 corrupt_frac=1.0000 acc_corrupt=0.0999 loss_corrupt=6.7067 wrong_frac=0.7915 init_acc_corrupt=0.1159 acc_corrupt_t_0p0_0p2=0.0484 corrupt_frac_t_0p0_0p2=0.5559 acc_corrupt_t_0p2_0p4=0.1329 corrupt_frac_t_0p2_0p4=0.3589 acc_corrupt_t_0p4_0p6=0.2848 corrupt_frac_t_0p4_0p6=0.0773 acc_corrupt_t_0p6_0p8=0.4184 corrupt_frac_t_0p6_0p8=0.0121 out_w_norm=1.1030 out_g_norm=1.0003 loss_all=6.4466 init_gold_top10=0.2086 init_gold_top100=0.4366 rollout_applied_pos_frac=0.0703 init_acc_rollout_applied=0.0890 init_acc_rollout_kept=0.1190 logit_acc_rollout_applied=0.0885 logit_acc_rollout_kept=0.1055
189
+ step=200 epoch=200/1000 epoch_step=1/1 micro_steps=200 elapsed=7.6s lr=2.000000e-03 loss=6.0936 loss_recon=6.0936 loss_meanflow=0.0000 mean_model_t=0.2108 mean_corrupt_t=0.2108 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0976 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.1109 corrupt_frac=1.0000 acc_corrupt=0.1109 loss_corrupt=6.0936 wrong_frac=0.7892 init_acc_corrupt=0.1183 acc_corrupt_t_0p0_0p2=0.0550 corrupt_frac_t_0p0_0p2=0.5516 acc_corrupt_t_0p2_0p4=0.1492 corrupt_frac_t_0p2_0p4=0.3621 acc_corrupt_t_0p4_0p6=0.2934 corrupt_frac_t_0p4_0p6=0.0781 acc_corrupt_t_0p6_0p8=0.4311 corrupt_frac_t_0p6_0p8=0.0123 out_w_norm=3.3240 out_g_norm=1.4046 acc_corrupt_t_0p8_1p0=0.5273 corrupt_frac_t_0p8_1p0=0.0078 loss_all=5.8256 init_gold_top10=0.1947 init_gold_top100=0.4323 rollout_applied_pos_frac=0.0547 init_acc_rollout_applied=0.1574 init_acc_rollout_kept=0.1063 logit_acc_rollout_applied=0.1367 logit_acc_rollout_kept=0.1072
190
+ step=300 epoch=300/1000 epoch_step=1/1 micro_steps=300 elapsed=7.4s lr=2.000000e-03 loss=5.5672 loss_recon=5.5672 loss_meanflow=0.0000 mean_model_t=0.2067 mean_corrupt_t=0.2067 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.1010 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.1203 corrupt_frac=1.0000 acc_corrupt=0.1203 loss_corrupt=5.5672 wrong_frac=0.7935 init_acc_corrupt=0.1137 acc_corrupt_t_0p0_0p2=0.0588 corrupt_frac_t_0p0_0p2=0.5641 acc_corrupt_t_0p2_0p4=0.1676 corrupt_frac_t_0p2_0p4=0.3542 acc_corrupt_t_0p4_0p6=0.3230 corrupt_frac_t_0p4_0p6=0.0734 acc_corrupt_t_0p6_0p8=0.4930 corrupt_frac_t_0p6_0p8=0.0121 out_w_norm=5.2156 out_g_norm=0.7145 acc_corrupt_t_0p8_1p0=0.7435 corrupt_frac_t_0p8_1p0=0.0078 loss_all=5.2934 init_gold_top10=0.1996 init_gold_top100=0.4534 rollout_applied_pos_frac=0.0938 init_acc_rollout_applied=0.1927 init_acc_rollout_kept=0.1139 logit_acc_rollout_applied=0.1868 logit_acc_rollout_kept=0.1274
191
+ step=400 epoch=400/1000 epoch_step=1/1 micro_steps=400 elapsed=7.3s lr=2.000000e-03 loss=5.0073 loss_recon=5.0073 loss_meanflow=0.0000 mean_model_t=0.2085 mean_corrupt_t=0.2085 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.1039 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.1462 corrupt_frac=1.0000 acc_corrupt=0.1462 loss_corrupt=5.0073 wrong_frac=0.7917 init_acc_corrupt=0.1159 acc_corrupt_t_0p0_0p2=0.0638 corrupt_frac_t_0p0_0p2=0.5564 acc_corrupt_t_0p2_0p4=0.2008 corrupt_frac_t_0p2_0p4=0.3620 acc_corrupt_t_0p4_0p6=0.4408 corrupt_frac_t_0p4_0p6=0.0719 out_w_norm=6.9005 out_g_norm=0.4039 acc_corrupt_t_0p6_0p8=0.6561 corrupt_frac_t_0p6_0p8=0.0131 acc_corrupt_t_0p8_1p0=0.7832 corrupt_frac_t_0p8_1p0=0.0078 loss_all=4.7856 init_gold_top10=0.1883 init_gold_top100=0.4416 rollout_applied_pos_frac=0.0625 init_acc_rollout_applied=0.1206 init_acc_rollout_kept=0.0976 logit_acc_rollout_applied=0.1631 logit_acc_rollout_kept=0.1482
192
+ step=500 epoch=500/1000 epoch_step=1/1 micro_steps=500 elapsed=7.3s lr=2.000000e-03 loss=4.2795 loss_recon=4.2795 loss_meanflow=0.0000 mean_model_t=0.2071 mean_corrupt_t=0.2071 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0961 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.1812 corrupt_frac=1.0000 acc_corrupt=0.1812 loss_corrupt=4.2795 wrong_frac=0.7928 init_acc_corrupt=0.1148 acc_corrupt_t_0p0_0p2=0.0737 corrupt_frac_t_0p0_0p2=0.5632 acc_corrupt_t_0p2_0p4=0.2690 corrupt_frac_t_0p2_0p4=0.3546 acc_corrupt_t_0p4_0p6=0.5237 corrupt_frac_t_0p4_0p6=0.0745 acc_corrupt_t_0p6_0p8=0.6891 corrupt_frac_t_0p6_0p8=0.0118 acc_corrupt_t_0p8_1p0=0.8477 corrupt_frac_t_0p8_1p0=0.0078 out_w_norm=8.4374 out_g_norm=0.4504 loss_all=3.8073 init_gold_top10=0.2090 init_gold_top100=0.4723 rollout_applied_pos_frac=0.0859 init_acc_rollout_applied=0.1662 init_acc_rollout_kept=0.1186 logit_acc_rollout_applied=0.2614 logit_acc_rollout_kept=0.2018
193
+ step=600 epoch=600/1000 epoch_step=1/1 micro_steps=600 elapsed=7.2s lr=2.000000e-03 loss=3.4476 loss_recon=3.4476 loss_meanflow=0.0000 mean_model_t=0.2074 mean_corrupt_t=0.2074 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0961 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.2349 corrupt_frac=1.0000 acc_corrupt=0.2349 loss_corrupt=3.4476 wrong_frac=0.7927 init_acc_corrupt=0.1151 acc_corrupt_t_0p0_0p2=0.0936 corrupt_frac_t_0p0_0p2=0.5617 acc_corrupt_t_0p2_0p4=0.3648 corrupt_frac_t_0p2_0p4=0.3570 acc_corrupt_t_0p4_0p6=0.6270 corrupt_frac_t_0p4_0p6=0.0729 acc_corrupt_t_0p6_0p8=0.7651 corrupt_frac_t_0p6_0p8=0.0124 out_w_norm=9.5907 out_g_norm=0.4997 acc_corrupt_t_0p8_1p0=0.8867 corrupt_frac_t_0p8_1p0=0.0078 loss_all=3.2236 init_gold_top10=0.2012 init_gold_top100=0.4802 rollout_applied_pos_frac=0.1094 init_acc_rollout_applied=0.1035 init_acc_rollout_kept=0.1020 logit_acc_rollout_applied=0.2469 logit_acc_rollout_kept=0.2494
194
+ step=700 epoch=700/1000 epoch_step=1/1 micro_steps=700 elapsed=7.4s lr=2.000000e-03 loss=2.6251 loss_recon=2.6251 loss_meanflow=0.0000 mean_model_t=0.2094 mean_corrupt_t=0.2094 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0994 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.3345 corrupt_frac=1.0000 acc_corrupt=0.3345 loss_corrupt=2.6251 wrong_frac=0.7908 init_acc_corrupt=0.1189 acc_corrupt_t_0p0_0p2=0.1392 corrupt_frac_t_0p0_0p2=0.5550 acc_corrupt_t_0p2_0p4=0.5257 corrupt_frac_t_0p2_0p4=0.3587 acc_corrupt_t_0p4_0p6=0.7872 corrupt_frac_t_0p4_0p6=0.0781 acc_corrupt_t_0p6_0p8=0.8684 corrupt_frac_t_0p6_0p8=0.0129 out_w_norm=10.2590 out_g_norm=0.6124 acc_corrupt_t_0p8_1p0=0.9297 corrupt_frac_t_0p8_1p0=0.0078 loss_all=2.1368 init_gold_top10=0.2584 init_gold_top100=0.4848 rollout_applied_pos_frac=0.1016 init_acc_rollout_applied=0.1878 init_acc_rollout_kept=0.1222 logit_acc_rollout_applied=0.5724 logit_acc_rollout_kept=0.4001
195
+ step=800 epoch=800/1000 epoch_step=1/1 micro_steps=800 elapsed=7.5s lr=2.000000e-03 loss=1.9378 loss_recon=1.9378 loss_meanflow=0.0000 mean_model_t=0.2099 mean_corrupt_t=0.2099 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0984 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.4643 corrupt_frac=1.0000 acc_corrupt=0.4643 loss_corrupt=1.9378 wrong_frac=0.7896 init_acc_corrupt=0.1202 acc_corrupt_t_0p0_0p2=0.2202 corrupt_frac_t_0p0_0p2=0.5548 acc_corrupt_t_0p2_0p4=0.7274 corrupt_frac_t_0p2_0p4=0.3585 acc_corrupt_t_0p4_0p6=0.9359 corrupt_frac_t_0p4_0p6=0.0790 out_w_norm=10.6517 out_g_norm=0.7751 acc_corrupt_t_0p6_0p8=0.9654 corrupt_frac_t_0p6_0p8=0.0130 acc_corrupt_t_0p8_1p0=0.9492 corrupt_frac_t_0p8_1p0=0.0078 loss_all=1.5568 init_gold_top10=0.2722 init_gold_top100=0.4842 rollout_applied_pos_frac=0.0859 init_acc_rollout_applied=0.2085 init_acc_rollout_kept=0.1354 logit_acc_rollout_applied=0.7095 logit_acc_rollout_kept=0.5537
196
+ step=900 epoch=900/1000 epoch_step=1/1 micro_steps=900 elapsed=7.5s lr=2.000000e-03 loss=1.3812 loss_recon=1.3812 loss_meanflow=0.0000 mean_model_t=0.2107 mean_corrupt_t=0.2107 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.1043 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.5957 corrupt_frac=1.0000 acc_corrupt=0.5957 loss_corrupt=1.3812 wrong_frac=0.7895 init_acc_corrupt=0.1228 acc_corrupt_t_0p0_0p2=0.3446 corrupt_frac_t_0p0_0p2=0.5521 acc_corrupt_t_0p2_0p4=0.8849 corrupt_frac_t_0p2_0p4=0.3599 acc_corrupt_t_0p4_0p6=0.9888 corrupt_frac_t_0p4_0p6=0.0788 acc_corrupt_t_0p6_0p8=0.9914 corrupt_frac_t_0p6_0p8=0.0133 out_w_norm=10.9706 out_g_norm=0.8759 acc_corrupt_t_0p8_1p0=0.9818 corrupt_frac_t_0p8_1p0=0.0078 loss_all=1.2201 init_gold_top10=0.2856 init_gold_top100=0.5112 rollout_applied_pos_frac=0.1484 init_acc_rollout_applied=0.1575 init_acc_rollout_kept=0.1108 logit_acc_rollout_applied=0.6813 logit_acc_rollout_kept=0.6290
197
+ step=1000 epoch=1000/1000 epoch_step=1/1 micro_steps=1000 elapsed=7.4s lr=2.000000e-03 loss=1.0309 loss_recon=1.0309 loss_meanflow=0.0000 mean_model_t=0.2112 mean_corrupt_t=0.2112 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.1017 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.6773 corrupt_frac=1.0000 acc_corrupt=0.6773 loss_corrupt=1.0309 wrong_frac=0.7887 init_acc_corrupt=0.1245 acc_corrupt_t_0p0_0p2=0.4410 corrupt_frac_t_0p0_0p2=0.5516 acc_corrupt_t_0p2_0p4=0.9604 corrupt_frac_t_0p2_0p4=0.3570 acc_corrupt_t_0p4_0p6=0.9979 corrupt_frac_t_0p4_0p6=0.0810 out_w_norm=11.2172 out_g_norm=0.9799 acc_corrupt_t_0p6_0p8=0.9955 corrupt_frac_t_0p6_0p8=0.0137 acc_corrupt_t_0p8_1p0=0.9688 corrupt_frac_t_0p8_1p0=0.0078 loss_all=0.8734 init_gold_top10=0.2964 init_gold_top100=0.5034 rollout_applied_pos_frac=0.1328 init_acc_rollout_applied=0.1790 init_acc_rollout_kept=0.1269 logit_acc_rollout_applied=0.7532 logit_acc_rollout_kept=0.7115
198
+ NCCL version 2.25.1+cuda12.8
199
+ resumed_from=runs/train8_rollin_len256_rollin_p10_s4_i32_20260517_171654/latest.pt start_step=1001
200
+ {
201
+ "device": "cuda:0",
202
+ "rank": 0,
203
+ "world_size": 4,
204
+ "samples": "owt_cached_chunks:8",
205
+ "vocab_size": 969,
206
+ "tokenizer_vocab_size": 50257,
207
+ "save_dir": "runs/train8_rollin_len256_rollin_p10_s4_i32_20260517_171654",
208
+ "batch_size": 128,
209
+ "grad_accum": 1,
210
+ "effective_batch_size": 512,
211
+ "global_batch_size": 512,
212
+ "lr_schedule": "constant_warmup",
213
+ "optimizer": "muon",
214
+ "epochs": 0.0,
215
+ "steps_per_epoch": 1,
216
+ "total_steps": 2000,
217
+ "warmup_steps": 10,
218
+ "warmup_epochs": -1.0,
219
+ "min_lr": 0.0,
220
+ "weight_decay": 0.1,
221
+ "output_weight_decay": -1.0,
222
+ "adamw_param_groups": "nanogpt",
223
+ "adam_beta1": 0.9,
224
+ "adam_beta2": 0.95,
225
+ "adam_eps": 1e-08,
226
+ "muon_impl": "legacy",
227
+ "muon_momentum": 0.95,
228
+ "muon_ns_steps": 5,
229
+ "muon_update_scale": 1.0,
230
+ "muon_nesterov": false,
231
+ "muon_width_scale": false,
232
+ "muon_grouping": "legacy_dim_ge_2",
233
+ "muon_param_count": 1965440,
234
+ "muon_adam_param_count": 8192,
235
+ "muon_param_names": [
236
+ "vocab_embed.embedding",
237
+ "sigma_map.net.0.weight",
238
+ "sigma_map.net.2.weight",
239
+ "blocks.0.attn_qkv.weight",
240
+ "blocks.0.attn_out.weight",
241
+ "blocks.0.mlp.0.weight",
242
+ "blocks.0.mlp.2.weight",
243
+ "blocks.0.adaLN_modulation.weight",
244
+ "blocks.1.attn_qkv.weight",
245
+ "blocks.1.attn_out.weight",
246
+ "blocks.1.mlp.0.weight",
247
+ "blocks.1.mlp.2.weight",
248
+ "blocks.1.adaLN_modulation.weight",
249
+ "blocks.2.attn_qkv.weight",
250
+ "blocks.2.attn_out.weight",
251
+ "blocks.2.mlp.0.weight",
252
+ "blocks.2.mlp.2.weight",
253
+ "blocks.2.adaLN_modulation.weight",
254
+ "output_layer.linear.weight",
255
+ "output_layer.adaLN_modulation.weight"
256
+ ],
257
+ "muon_adam_param_names": [
258
+ "sigma_map.net.0.bias",
259
+ "sigma_map.net.2.bias",
260
+ "blocks.0.norm1.weight",
261
+ "blocks.0.norm2.weight",
262
+ "blocks.0.mlp.0.bias",
263
+ "blocks.0.mlp.2.bias",
264
+ "blocks.0.adaLN_modulation.bias",
265
+ "blocks.1.norm1.weight",
266
+ "blocks.1.norm2.weight",
267
+ "blocks.1.mlp.0.bias",
268
+ "blocks.1.mlp.2.bias",
269
+ "blocks.1.adaLN_modulation.bias",
270
+ "blocks.2.norm1.weight",
271
+ "blocks.2.norm2.weight",
272
+ "blocks.2.mlp.0.bias",
273
+ "blocks.2.mlp.2.bias",
274
+ "blocks.2.adaLN_modulation.bias",
275
+ "output_layer.norm_final.weight",
276
+ "output_layer.adaLN_modulation.bias"
277
+ ],
278
+ "muon_effective_nesterov": false,
279
+ "muon_effective_width_scale": false,
280
+ "muon_effective_weight_decay": 0.1,
281
+ "muon_adam_fallback_nesterov": false,
282
+ "muon_adam_fallback_weight_decay": 0.1,
283
+ "ema_decay": 0.9999,
284
+ "ema_start_step": 0,
285
+ "model_type": "ddit",
286
+ "ddit_mlp_type": "gelu",
287
+ "elf_num_time_tokens": 4,
288
+ "elf_num_model_mode_tokens": 0,
289
+ "qk_norm": true,
290
+ "output_bias": false,
291
+ "output_init_std": -1.0,
292
+ "norm_type": "rmsnorm",
293
+ "target_loss": "hard_ce",
294
+ "linear_soft_target_power": 1.0,
295
+ "linear_soft_target_min_conf": 0.0,
296
+ "linear_soft_target_max_conf": 1.0,
297
+ "t_sampling_mode": "logit_normal",
298
+ "t_sampling_power": 1.0,
299
+ "t_sampling_eps": 0.0001,
300
+ "t_sampling_logit_mean": -1.5,
301
+ "t_sampling_logit_std": 0.8,
302
+ "dual_t": true,
303
+ "corrupt_t_mode": "same",
304
+ "corrupt_min_t": 0.0,
305
+ "corrupt_max_t": 1.0,
306
+ "prefix_block_prob": 0.0,
307
+ "prefix_block_len": 128,
308
+ "mask_ratio_floor_schedule": "none",
309
+ "dirichlet_endpoint_mode": "categorical_dual_t",
310
+ "dirichlet_semantic_t_mode": "same",
311
+ "dirichlet_semantic_t_value": 0.0,
312
+ "dirichlet_semantic_t_curve": "linear",
313
+ "dirichlet_semantic_t_power": 1.0,
314
+ "endpoint_sequence_random_prob_alpha": 0.0,
315
+ "categorical_wrong_from_full_vocab": true,
316
+ "categorical_wrong_from_batch_valid_tokens": false,
317
+ "categorical_wrong_basin_token_ids": "",
318
+ "categorical_wrong_basin_prob": 0.0,
319
+ "categorical_wrong_unigram_prob": 0.0,
320
+ "categorical_wrong_uniform_prob": 0.0,
321
+ "categorical_wrong_prob_floor": 0.0,
322
+ "categorical_wrong_corpus_unigram_path": "",
323
+ "categorical_wrong_corpus_unigram_alpha": 1.0,
324
+ "categorical_wrong_basin_shared_prob": 0.0,
325
+ "categorical_wrong_unigram_shared_prob": 0.0,
326
+ "mask_mixture_original_prob": 0.0,
327
+ "mask_mixture_lowk_prob": 0.0,
328
+ "mask_mixture_lowcorrupt_prob": 0.0,
329
+ "mask_mixture_block_prob": 0.0,
330
+ "mask_mixture_all_prob": 1.0,
331
+ "mask_mixture_lowk_clean_tokens": "0",
332
+ "mask_mixture_lowcorrupt_tokens": "1,2,4,8,16,32,64",
333
+ "mask_mixture_block_tokens": "64,128",
334
+ "simplex_bridge_sampler": "dirichlet",
335
+ "logistic_normal_sigma_min": 0.1,
336
+ "logistic_normal_sigma_max": 1.0,
337
+ "logistic_normal_tau_min": 1.0,
338
+ "logistic_normal_tau_max": 1.0,
339
+ "torch_compile": false,
340
+ "compile_mode": "max-autotune",
341
+ "state_format": "prob",
342
+ "meanflow_weight": 0.0,
343
+ "rollout_train_prob": 0.1,
344
+ "rollout_train_steps": 4,
345
+ "rollout_train_infer_steps": 32,
346
+ "rollout_train_temp": 1.45,
347
+ "rollout_train_max_gamma": 1.0,
348
+ "rollout_train_corrupt_only": true,
349
+ "rollout_train_samplewise": true,
350
+ "rollout_train_compute_always": false,
351
+ "bridge_noise_init": "logistic_normal",
352
+ "noise_sigma": -1.0,
353
+ "allow_tf32": true,
354
+ "activation_checkpointing": false,
355
+ "activation_checkpoint_interval": 1,
356
+ "activation_checkpoint_scope": "block",
357
+ "ddp_static_graph": false,
358
+ "ddp_gradient_as_bucket_view": true,
359
+ "blocking_data_transfer": false,
360
+ "dataloader_prefetch_factor": 4,
361
+ "full_train_stats": false,
362
+ "tokenized_hf": false,
363
+ "tokenized_pad_token": "pad",
364
+ "elf_conditional_hf": false,
365
+ "record_pad_truncate": false,
366
+ "record_add_eos": false,
367
+ "record_add_special_tokens": false,
368
+ "record_pad_token": "pad",
369
+ "record_shuffle_buffer": 10000,
370
+ "wrap": true,
371
+ "wrap_mode": "stream",
372
+ "wrap_record_buffer_size": 200,
373
+ "owt_cached_chunks": true,
374
+ "owt_chunk_cache_dir": "/e2e-data/evad-tech-vla/wanghan58/data/small_benchmarks/langflow_2604_11748/openwebtext_lta_cached_chunks/gpt2_len256_train8_compact_overfit",
375
+ "owt_chunk_cache_rebuild": false,
376
+ "owt_chunk_cache_write_batch": 4096,
377
+ "owt_exact_repeat_per_chunk": 64,
378
+ "online_chunk_shuffle": false,
379
+ "online_chunk_shuffle_buffer": 10000,
380
+ "openwebtext_split": "train_minus_100k",
381
+ "detokenizer": "auto",
382
+ "resolved_detokenizer": null,
383
+ "num_workers": 0,
384
+ "latest_every": 1000,
385
+ "resume_path": "runs/train8_rollin_len256_rollin_p10_s4_i32_20260517_171654/latest.pt"
386
+ }
387
+ step=1100 epoch=1100/2000 epoch_step=1/1 micro_steps=1100 elapsed=8.0s lr=2.000000e-03 loss=0.8593 loss_recon=0.8593 loss_meanflow=0.0000 mean_model_t=0.2083 mean_corrupt_t=0.2083 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.1023 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.7168 corrupt_frac=1.0000 acc_corrupt=0.7168 loss_corrupt=0.8593 wrong_frac=0.7915 init_acc_corrupt=0.1214 acc_corrupt_t_0p0_0p2=0.5003 corrupt_frac_t_0p0_0p2=0.5559 acc_corrupt_t_0p2_0p4=0.9853 corrupt_frac_t_0p2_0p4=0.3589 acc_corrupt_t_0p4_0p6=0.9991 corrupt_frac_t_0p4_0p6=0.0773 acc_corrupt_t_0p6_0p8=0.9966 corrupt_frac_t_0p6_0p8=0.0121 out_w_norm=11.3101 out_g_norm=0.9111 loss_all=0.7741 init_gold_top10=0.2546 init_gold_top100=0.4695 rollout_applied_pos_frac=0.0703 init_acc_rollout_applied=0.1280 init_acc_rollout_kept=0.1190 logit_acc_rollout_applied=0.6658 logit_acc_rollout_kept=0.7622
388
+ step=1200 epoch=1200/2000 epoch_step=1/1 micro_steps=1200 elapsed=7.4s lr=2.000000e-03 loss=0.7221 loss_recon=0.7221 loss_meanflow=0.0000 mean_model_t=0.2108 mean_corrupt_t=0.2108 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0976 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.7550 corrupt_frac=1.0000 acc_corrupt=0.7550 loss_corrupt=0.7221 wrong_frac=0.7892 init_acc_corrupt=0.1240 acc_corrupt_t_0p0_0p2=0.5595 corrupt_frac_t_0p0_0p2=0.5516 acc_corrupt_t_0p2_0p4=0.9945 corrupt_frac_t_0p2_0p4=0.3621 acc_corrupt_t_0p4_0p6=0.9995 corrupt_frac_t_0p4_0p6=0.0781 acc_corrupt_t_0p6_0p8=0.9982 corrupt_frac_t_0p6_0p8=0.0123 out_w_norm=11.2923 out_g_norm=0.8062 acc_corrupt_t_0p8_1p0=0.9831 corrupt_frac_t_0p8_1p0=0.0078 loss_all=0.7178 init_gold_top10=0.2330 init_gold_top100=0.4535 rollout_applied_pos_frac=0.0547 init_acc_rollout_applied=0.2093 init_acc_rollout_kept=0.1063 logit_acc_rollout_applied=0.7785 logit_acc_rollout_kept=0.7502
389
+ step=1300 epoch=1300/2000 epoch_step=1/1 micro_steps=1300 elapsed=7.3s lr=2.000000e-03 loss=0.6831 loss_recon=0.6831 loss_meanflow=0.0000 mean_model_t=0.2067 mean_corrupt_t=0.2067 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.1010 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.7644 corrupt_frac=1.0000 acc_corrupt=0.7644 loss_corrupt=0.6831 wrong_frac=0.7935 init_acc_corrupt=0.1195 acc_corrupt_t_0p0_0p2=0.5845 corrupt_frac_t_0p0_0p2=0.5641 acc_corrupt_t_0p2_0p4=0.9966 corrupt_frac_t_0p2_0p4=0.3542 acc_corrupt_t_0p4_0p6=0.9997 corrupt_frac_t_0p4_0p6=0.0734 acc_corrupt_t_0p6_0p8=0.9981 corrupt_frac_t_0p6_0p8=0.0121 out_w_norm=11.2507 out_g_norm=0.7002 acc_corrupt_t_0p8_1p0=0.9961 corrupt_frac_t_0p8_1p0=0.0078 loss_all=0.6935 init_gold_top10=0.2564 init_gold_top100=0.4802 rollout_applied_pos_frac=0.0938 init_acc_rollout_applied=0.2438 init_acc_rollout_kept=0.1139 logit_acc_rollout_applied=0.8184 logit_acc_rollout_kept=0.7558
390
+ step=1400 epoch=1400/2000 epoch_step=1/1 micro_steps=1400 elapsed=7.4s lr=2.000000e-03 loss=0.6292 loss_recon=0.6292 loss_meanflow=0.0000 mean_model_t=0.2085 mean_corrupt_t=0.2085 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.1039 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.7813 corrupt_frac=1.0000 acc_corrupt=0.7813 loss_corrupt=0.6292 wrong_frac=0.7917 init_acc_corrupt=0.1219 acc_corrupt_t_0p0_0p2=0.6085 corrupt_frac_t_0p0_0p2=0.5564 acc_corrupt_t_0p2_0p4=0.9977 corrupt_frac_t_0p2_0p4=0.3620 acc_corrupt_t_0p4_0p6=0.9998 corrupt_frac_t_0p4_0p6=0.0719 out_w_norm=11.2046 out_g_norm=0.6321 acc_corrupt_t_0p6_0p8=0.9985 corrupt_frac_t_0p6_0p8=0.0131 acc_corrupt_t_0p8_1p0=0.9941 corrupt_frac_t_0p8_1p0=0.0078 loss_all=0.6461 init_gold_top10=0.2258 init_gold_top100=0.4554 rollout_applied_pos_frac=0.0625 init_acc_rollout_applied=0.1826 init_acc_rollout_kept=0.0976 logit_acc_rollout_applied=0.6479 logit_acc_rollout_kept=0.7707
391
+ step=1500 epoch=1500/2000 epoch_step=1/1 micro_steps=1500 elapsed=7.3s lr=2.000000e-03 loss=0.5946 loss_recon=0.5946 loss_meanflow=0.0000 mean_model_t=0.2071 mean_corrupt_t=0.2071 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0961 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.7954 corrupt_frac=1.0000 acc_corrupt=0.7954 loss_corrupt=0.5946 wrong_frac=0.7928 init_acc_corrupt=0.1202 acc_corrupt_t_0p0_0p2=0.6374 corrupt_frac_t_0p0_0p2=0.5632 acc_corrupt_t_0p2_0p4=0.9990 corrupt_frac_t_0p2_0p4=0.3546 acc_corrupt_t_0p4_0p6=0.9999 corrupt_frac_t_0p4_0p6=0.0745 acc_corrupt_t_0p6_0p8=0.9994 corrupt_frac_t_0p6_0p8=0.0118 acc_corrupt_t_0p8_1p0=1.0000 corrupt_frac_t_0p8_1p0=0.0078 out_w_norm=11.1643 out_g_norm=0.6000 loss_all=0.6375 init_gold_top10=0.2672 init_gold_top100=0.4767 rollout_applied_pos_frac=0.0859 init_acc_rollout_applied=0.2234 init_acc_rollout_kept=0.1186 logit_acc_rollout_applied=0.8629 logit_acc_rollout_kept=0.7678
392
+ step=1600 epoch=1600/2000 epoch_step=1/1 micro_steps=1600 elapsed=7.3s lr=2.000000e-03 loss=0.5549 loss_recon=0.5549 loss_meanflow=0.0000 mean_model_t=0.2074 mean_corrupt_t=0.2074 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0961 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.8064 corrupt_frac=1.0000 acc_corrupt=0.8064 loss_corrupt=0.5549 wrong_frac=0.7927 init_acc_corrupt=0.1201 acc_corrupt_t_0p0_0p2=0.6558 corrupt_frac_t_0p0_0p2=0.5617 acc_corrupt_t_0p2_0p4=0.9994 corrupt_frac_t_0p2_0p4=0.3570 acc_corrupt_t_0p4_0p6=0.9999 corrupt_frac_t_0p4_0p6=0.0729 acc_corrupt_t_0p6_0p8=0.9990 corrupt_frac_t_0p6_0p8=0.0124 out_w_norm=11.1336 out_g_norm=0.5135 acc_corrupt_t_0p8_1p0=0.9609 corrupt_frac_t_0p8_1p0=0.0078 loss_all=0.7128 init_gold_top10=0.2632 init_gold_top100=0.4819 rollout_applied_pos_frac=0.1094 init_acc_rollout_applied=0.1278 init_acc_rollout_kept=0.1020 logit_acc_rollout_applied=0.7486 logit_acc_rollout_kept=0.7451
393
+ step=1700 epoch=1700/2000 epoch_step=1/1 micro_steps=1700 elapsed=7.3s lr=2.000000e-03 loss=0.5210 loss_recon=0.5210 loss_meanflow=0.0000 mean_model_t=0.2094 mean_corrupt_t=0.2094 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0994 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.8198 corrupt_frac=1.0000 acc_corrupt=0.8198 loss_corrupt=0.5210 wrong_frac=0.7908 init_acc_corrupt=0.1239 acc_corrupt_t_0p0_0p2=0.6754 corrupt_frac_t_0p0_0p2=0.5550 acc_corrupt_t_0p2_0p4=0.9997 corrupt_frac_t_0p2_0p4=0.3587 acc_corrupt_t_0p4_0p6=0.9999 corrupt_frac_t_0p4_0p6=0.0781 acc_corrupt_t_0p6_0p8=0.9994 corrupt_frac_t_0p6_0p8=0.0129 out_w_norm=11.1085 out_g_norm=0.4887 acc_corrupt_t_0p8_1p0=0.9883 corrupt_frac_t_0p8_1p0=0.0078 loss_all=0.5158 init_gold_top10=0.2778 init_gold_top100=0.4848 rollout_applied_pos_frac=0.1016 init_acc_rollout_applied=0.2485 init_acc_rollout_kept=0.1222 logit_acc_rollout_applied=0.9240 logit_acc_rollout_kept=0.8131
394
+ step=1800 epoch=1800/2000 epoch_step=1/1 micro_steps=1800 elapsed=7.3s lr=2.000000e-03 loss=0.5056 loss_recon=0.5056 loss_meanflow=0.0000 mean_model_t=0.2099 mean_corrupt_t=0.2099 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0984 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.8261 corrupt_frac=1.0000 acc_corrupt=0.8261 loss_corrupt=0.5056 wrong_frac=0.7896 init_acc_corrupt=0.1242 acc_corrupt_t_0p0_0p2=0.6867 corrupt_frac_t_0p0_0p2=0.5548 acc_corrupt_t_0p2_0p4=0.9998 corrupt_frac_t_0p2_0p4=0.3585 acc_corrupt_t_0p4_0p6=0.9999 corrupt_frac_t_0p4_0p6=0.0790 out_w_norm=11.0887 out_g_norm=0.4736 acc_corrupt_t_0p6_0p8=0.9996 corrupt_frac_t_0p6_0p8=0.0130 acc_corrupt_t_0p8_1p0=0.9922 corrupt_frac_t_0p8_1p0=0.0078 loss_all=0.4750 init_gold_top10=0.2862 init_gold_top100=0.4842 rollout_applied_pos_frac=0.0859 init_acc_rollout_applied=0.2493 init_acc_rollout_kept=0.1354 logit_acc_rollout_applied=0.9670 logit_acc_rollout_kept=0.8349
395
+ step=1900 epoch=1900/2000 epoch_step=1/1 micro_steps=1900 elapsed=7.3s lr=2.000000e-03 loss=0.4743 loss_recon=0.4743 loss_meanflow=0.0000 mean_model_t=0.2107 mean_corrupt_t=0.2107 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.1043 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.8369 corrupt_frac=1.0000 acc_corrupt=0.8369 loss_corrupt=0.4743 wrong_frac=0.7895 init_acc_corrupt=0.1257 acc_corrupt_t_0p0_0p2=0.7047 corrupt_frac_t_0p0_0p2=0.5521 acc_corrupt_t_0p2_0p4=0.9999 corrupt_frac_t_0p2_0p4=0.3599 acc_corrupt_t_0p4_0p6=0.9999 corrupt_frac_t_0p4_0p6=0.0788 acc_corrupt_t_0p6_0p8=0.9990 corrupt_frac_t_0p6_0p8=0.0133 out_w_norm=11.0800 out_g_norm=0.4315 acc_corrupt_t_0p8_1p0=0.9922 corrupt_frac_t_0p8_1p0=0.0078 loss_all=0.5701 init_gold_top10=0.3041 init_gold_top100=0.5112 rollout_applied_pos_frac=0.1484 init_acc_rollout_applied=0.1780 init_acc_rollout_kept=0.1108 logit_acc_rollout_applied=0.7950 logit_acc_rollout_kept=0.8050
396
+ step=2000 epoch=2000/2000 epoch_step=1/1 micro_steps=2000 elapsed=7.3s lr=2.000000e-03 loss=0.4508 loss_recon=0.4508 loss_meanflow=0.0000 mean_model_t=0.2112 mean_corrupt_t=0.2112 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.1017 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.8442 corrupt_frac=1.0000 acc_corrupt=0.8442 loss_corrupt=0.4508 wrong_frac=0.7887 init_acc_corrupt=0.1261 acc_corrupt_t_0p0_0p2=0.7176 corrupt_frac_t_0p0_0p2=0.5516 acc_corrupt_t_0p2_0p4=0.9999 corrupt_frac_t_0p2_0p4=0.3570 acc_corrupt_t_0p4_0p6=1.0000 corrupt_frac_t_0p4_0p6=0.0810 out_w_norm=11.0718 out_g_norm=0.3715 acc_corrupt_t_0p6_0p8=0.9996 corrupt_frac_t_0p6_0p8=0.0137 acc_corrupt_t_0p8_1p0=0.9969 corrupt_frac_t_0p8_1p0=0.0078 loss_all=0.5180 init_gold_top10=0.3100 init_gold_top100=0.5034 rollout_applied_pos_frac=0.1328 init_acc_rollout_applied=0.1921 init_acc_rollout_kept=0.1269 logit_acc_rollout_applied=0.9000 logit_acc_rollout_kept=0.8119
LTA_openwebtext_dualt/logs/softendpoint_mn_pilot_4gpu/train8_rollin_len256_rollin_p25_s8_i64_20260517_171654.log ADDED
@@ -0,0 +1,198 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ NCCL version 2.25.1+cuda12.8
2
+ {
3
+ "device": "cuda:0",
4
+ "rank": 0,
5
+ "world_size": 4,
6
+ "samples": "owt_cached_chunks:8",
7
+ "vocab_size": 969,
8
+ "tokenizer_vocab_size": 50257,
9
+ "save_dir": "runs/train8_rollin_len256_rollin_p25_s8_i64_20260517_171654",
10
+ "batch_size": 128,
11
+ "grad_accum": 1,
12
+ "effective_batch_size": 512,
13
+ "global_batch_size": 512,
14
+ "lr_schedule": "constant_warmup",
15
+ "optimizer": "muon",
16
+ "epochs": 0.0,
17
+ "steps_per_epoch": 1,
18
+ "total_steps": 1000,
19
+ "warmup_steps": 10,
20
+ "warmup_epochs": -1.0,
21
+ "min_lr": 0.0,
22
+ "weight_decay": 0.1,
23
+ "output_weight_decay": -1.0,
24
+ "adamw_param_groups": "nanogpt",
25
+ "adam_beta1": 0.9,
26
+ "adam_beta2": 0.95,
27
+ "adam_eps": 1e-08,
28
+ "muon_impl": "legacy",
29
+ "muon_momentum": 0.95,
30
+ "muon_ns_steps": 5,
31
+ "muon_update_scale": 1.0,
32
+ "muon_nesterov": false,
33
+ "muon_width_scale": false,
34
+ "muon_grouping": "legacy_dim_ge_2",
35
+ "muon_param_count": 1965440,
36
+ "muon_adam_param_count": 8192,
37
+ "muon_param_names": [
38
+ "vocab_embed.embedding",
39
+ "sigma_map.net.0.weight",
40
+ "sigma_map.net.2.weight",
41
+ "blocks.0.attn_qkv.weight",
42
+ "blocks.0.attn_out.weight",
43
+ "blocks.0.mlp.0.weight",
44
+ "blocks.0.mlp.2.weight",
45
+ "blocks.0.adaLN_modulation.weight",
46
+ "blocks.1.attn_qkv.weight",
47
+ "blocks.1.attn_out.weight",
48
+ "blocks.1.mlp.0.weight",
49
+ "blocks.1.mlp.2.weight",
50
+ "blocks.1.adaLN_modulation.weight",
51
+ "blocks.2.attn_qkv.weight",
52
+ "blocks.2.attn_out.weight",
53
+ "blocks.2.mlp.0.weight",
54
+ "blocks.2.mlp.2.weight",
55
+ "blocks.2.adaLN_modulation.weight",
56
+ "output_layer.linear.weight",
57
+ "output_layer.adaLN_modulation.weight"
58
+ ],
59
+ "muon_adam_param_names": [
60
+ "sigma_map.net.0.bias",
61
+ "sigma_map.net.2.bias",
62
+ "blocks.0.norm1.weight",
63
+ "blocks.0.norm2.weight",
64
+ "blocks.0.mlp.0.bias",
65
+ "blocks.0.mlp.2.bias",
66
+ "blocks.0.adaLN_modulation.bias",
67
+ "blocks.1.norm1.weight",
68
+ "blocks.1.norm2.weight",
69
+ "blocks.1.mlp.0.bias",
70
+ "blocks.1.mlp.2.bias",
71
+ "blocks.1.adaLN_modulation.bias",
72
+ "blocks.2.norm1.weight",
73
+ "blocks.2.norm2.weight",
74
+ "blocks.2.mlp.0.bias",
75
+ "blocks.2.mlp.2.bias",
76
+ "blocks.2.adaLN_modulation.bias",
77
+ "output_layer.norm_final.weight",
78
+ "output_layer.adaLN_modulation.bias"
79
+ ],
80
+ "muon_effective_nesterov": false,
81
+ "muon_effective_width_scale": false,
82
+ "muon_effective_weight_decay": 0.1,
83
+ "muon_adam_fallback_nesterov": false,
84
+ "muon_adam_fallback_weight_decay": 0.1,
85
+ "ema_decay": 0.9999,
86
+ "ema_start_step": 0,
87
+ "model_type": "ddit",
88
+ "ddit_mlp_type": "gelu",
89
+ "elf_num_time_tokens": 4,
90
+ "elf_num_model_mode_tokens": 0,
91
+ "qk_norm": true,
92
+ "output_bias": false,
93
+ "output_init_std": -1.0,
94
+ "norm_type": "rmsnorm",
95
+ "target_loss": "hard_ce",
96
+ "linear_soft_target_power": 1.0,
97
+ "linear_soft_target_min_conf": 0.0,
98
+ "linear_soft_target_max_conf": 1.0,
99
+ "t_sampling_mode": "logit_normal",
100
+ "t_sampling_power": 1.0,
101
+ "t_sampling_eps": 0.0001,
102
+ "t_sampling_logit_mean": -1.5,
103
+ "t_sampling_logit_std": 0.8,
104
+ "dual_t": true,
105
+ "corrupt_t_mode": "same",
106
+ "corrupt_min_t": 0.0,
107
+ "corrupt_max_t": 1.0,
108
+ "prefix_block_prob": 0.0,
109
+ "prefix_block_len": 128,
110
+ "mask_ratio_floor_schedule": "none",
111
+ "dirichlet_endpoint_mode": "categorical_dual_t",
112
+ "dirichlet_semantic_t_mode": "same",
113
+ "dirichlet_semantic_t_value": 0.0,
114
+ "dirichlet_semantic_t_curve": "linear",
115
+ "dirichlet_semantic_t_power": 1.0,
116
+ "endpoint_sequence_random_prob_alpha": 0.0,
117
+ "categorical_wrong_from_full_vocab": true,
118
+ "categorical_wrong_from_batch_valid_tokens": false,
119
+ "categorical_wrong_basin_token_ids": "",
120
+ "categorical_wrong_basin_prob": 0.0,
121
+ "categorical_wrong_unigram_prob": 0.0,
122
+ "categorical_wrong_uniform_prob": 0.0,
123
+ "categorical_wrong_prob_floor": 0.0,
124
+ "categorical_wrong_corpus_unigram_path": "",
125
+ "categorical_wrong_corpus_unigram_alpha": 1.0,
126
+ "categorical_wrong_basin_shared_prob": 0.0,
127
+ "categorical_wrong_unigram_shared_prob": 0.0,
128
+ "mask_mixture_original_prob": 0.0,
129
+ "mask_mixture_lowk_prob": 0.0,
130
+ "mask_mixture_lowcorrupt_prob": 0.0,
131
+ "mask_mixture_block_prob": 0.0,
132
+ "mask_mixture_all_prob": 1.0,
133
+ "mask_mixture_lowk_clean_tokens": "0",
134
+ "mask_mixture_lowcorrupt_tokens": "1,2,4,8,16,32,64",
135
+ "mask_mixture_block_tokens": "64,128",
136
+ "simplex_bridge_sampler": "dirichlet",
137
+ "logistic_normal_sigma_min": 0.1,
138
+ "logistic_normal_sigma_max": 1.0,
139
+ "logistic_normal_tau_min": 1.0,
140
+ "logistic_normal_tau_max": 1.0,
141
+ "torch_compile": false,
142
+ "compile_mode": "max-autotune",
143
+ "state_format": "prob",
144
+ "meanflow_weight": 0.0,
145
+ "rollout_train_prob": 0.25,
146
+ "rollout_train_steps": 8,
147
+ "rollout_train_infer_steps": 64,
148
+ "rollout_train_temp": 1.45,
149
+ "rollout_train_max_gamma": 1.0,
150
+ "rollout_train_corrupt_only": true,
151
+ "rollout_train_samplewise": true,
152
+ "rollout_train_compute_always": false,
153
+ "bridge_noise_init": "logistic_normal",
154
+ "noise_sigma": -1.0,
155
+ "allow_tf32": true,
156
+ "activation_checkpointing": false,
157
+ "activation_checkpoint_interval": 1,
158
+ "activation_checkpoint_scope": "block",
159
+ "ddp_static_graph": false,
160
+ "ddp_gradient_as_bucket_view": true,
161
+ "blocking_data_transfer": false,
162
+ "dataloader_prefetch_factor": 4,
163
+ "full_train_stats": false,
164
+ "tokenized_hf": false,
165
+ "tokenized_pad_token": "pad",
166
+ "elf_conditional_hf": false,
167
+ "record_pad_truncate": false,
168
+ "record_add_eos": false,
169
+ "record_add_special_tokens": false,
170
+ "record_pad_token": "pad",
171
+ "record_shuffle_buffer": 10000,
172
+ "wrap": true,
173
+ "wrap_mode": "stream",
174
+ "wrap_record_buffer_size": 200,
175
+ "owt_cached_chunks": true,
176
+ "owt_chunk_cache_dir": "/e2e-data/evad-tech-vla/wanghan58/data/small_benchmarks/langflow_2604_11748/openwebtext_lta_cached_chunks/gpt2_len256_train8_compact_overfit",
177
+ "owt_chunk_cache_rebuild": false,
178
+ "owt_chunk_cache_write_batch": 4096,
179
+ "owt_exact_repeat_per_chunk": 64,
180
+ "online_chunk_shuffle": false,
181
+ "online_chunk_shuffle_buffer": 10000,
182
+ "openwebtext_split": "train_minus_100k",
183
+ "detokenizer": "auto",
184
+ "resolved_detokenizer": null,
185
+ "num_workers": 0,
186
+ "latest_every": 1000,
187
+ "resume_path": ""
188
+ }
189
+ step=100 epoch=100/1000 epoch_step=1/1 micro_steps=100 elapsed=11.2s lr=2.000000e-03 loss=6.7067 loss_recon=6.7067 loss_meanflow=0.0000 mean_model_t=0.2083 mean_corrupt_t=0.2083 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.2530 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.0998 corrupt_frac=1.0000 acc_corrupt=0.0998 loss_corrupt=6.7067 wrong_frac=0.7915 init_acc_corrupt=0.1159 acc_corrupt_t_0p0_0p2=0.0484 corrupt_frac_t_0p0_0p2=0.5559 acc_corrupt_t_0p2_0p4=0.1328 corrupt_frac_t_0p2_0p4=0.3589 acc_corrupt_t_0p4_0p6=0.2837 corrupt_frac_t_0p4_0p6=0.0773 acc_corrupt_t_0p6_0p8=0.4131 corrupt_frac_t_0p6_0p8=0.0121 out_w_norm=1.1017 out_g_norm=1.0028 loss_all=6.4476 init_gold_top10=0.2087 init_gold_top100=0.4586 rollout_applied_pos_frac=0.2344 init_acc_rollout_applied=0.0948 init_acc_rollout_kept=0.1237 logit_acc_rollout_applied=0.1040 logit_acc_rollout_kept=0.1036
190
+ step=200 epoch=200/1000 epoch_step=1/1 micro_steps=200 elapsed=10.5s lr=2.000000e-03 loss=6.0948 loss_recon=6.0948 loss_meanflow=0.0000 mean_model_t=0.2108 mean_corrupt_t=0.2108 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.2494 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.1107 corrupt_frac=1.0000 acc_corrupt=0.1107 loss_corrupt=6.0948 wrong_frac=0.7892 init_acc_corrupt=0.1185 acc_corrupt_t_0p0_0p2=0.0551 corrupt_frac_t_0p0_0p2=0.5516 acc_corrupt_t_0p2_0p4=0.1486 corrupt_frac_t_0p2_0p4=0.3621 acc_corrupt_t_0p4_0p6=0.2933 corrupt_frac_t_0p4_0p6=0.0781 acc_corrupt_t_0p6_0p8=0.4290 corrupt_frac_t_0p6_0p8=0.0123 out_w_norm=3.3188 out_g_norm=1.4052 acc_corrupt_t_0p8_1p0=0.5286 corrupt_frac_t_0p8_1p0=0.0078 loss_all=5.8252 init_gold_top10=0.1984 init_gold_top100=0.4796 rollout_applied_pos_frac=0.3047 init_acc_rollout_applied=0.0981 init_acc_rollout_kept=0.1146 logit_acc_rollout_applied=0.1062 logit_acc_rollout_kept=0.1115
191
+ step=300 epoch=300/1000 epoch_step=1/1 micro_steps=300 elapsed=10.5s lr=2.000000e-03 loss=5.5702 loss_recon=5.5702 loss_meanflow=0.0000 mean_model_t=0.2067 mean_corrupt_t=0.2067 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.2520 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.1201 corrupt_frac=1.0000 acc_corrupt=0.1201 loss_corrupt=5.5702 wrong_frac=0.7935 init_acc_corrupt=0.1140 acc_corrupt_t_0p0_0p2=0.0586 corrupt_frac_t_0p0_0p2=0.5641 acc_corrupt_t_0p2_0p4=0.1674 corrupt_frac_t_0p2_0p4=0.3542 acc_corrupt_t_0p4_0p6=0.3229 corrupt_frac_t_0p4_0p6=0.0734 acc_corrupt_t_0p6_0p8=0.4805 corrupt_frac_t_0p6_0p8=0.0121 out_w_norm=5.2001 out_g_norm=0.7194 acc_corrupt_t_0p8_1p0=0.7109 corrupt_frac_t_0p8_1p0=0.0078 loss_all=5.3022 init_gold_top10=0.2009 init_gold_top100=0.4819 rollout_applied_pos_frac=0.2109 init_acc_rollout_applied=0.1409 init_acc_rollout_kept=0.1165 logit_acc_rollout_applied=0.1431 logit_acc_rollout_kept=0.1295
192
+ step=400 epoch=400/1000 epoch_step=1/1 micro_steps=400 elapsed=10.5s lr=2.000000e-03 loss=5.0199 loss_recon=5.0199 loss_meanflow=0.0000 mean_model_t=0.2085 mean_corrupt_t=0.2085 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.2573 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.1453 corrupt_frac=1.0000 acc_corrupt=0.1453 loss_corrupt=5.0199 wrong_frac=0.7917 init_acc_corrupt=0.1164 acc_corrupt_t_0p0_0p2=0.0634 corrupt_frac_t_0p0_0p2=0.5564 acc_corrupt_t_0p2_0p4=0.1995 corrupt_frac_t_0p2_0p4=0.3620 acc_corrupt_t_0p4_0p6=0.4374 corrupt_frac_t_0p4_0p6=0.0719 out_w_norm=6.8835 out_g_norm=0.4217 acc_corrupt_t_0p6_0p8=0.6500 corrupt_frac_t_0p6_0p8=0.0131 acc_corrupt_t_0p8_1p0=0.7773 corrupt_frac_t_0p8_1p0=0.0078 loss_all=4.7900 init_gold_top10=0.1949 init_gold_top100=0.5059 rollout_applied_pos_frac=0.2422 init_acc_rollout_applied=0.1085 init_acc_rollout_kept=0.0967 logit_acc_rollout_applied=0.1551 logit_acc_rollout_kept=0.1495
193
+ step=500 epoch=500/1000 epoch_step=1/1 micro_steps=500 elapsed=10.5s lr=2.000000e-03 loss=4.2894 loss_recon=4.2894 loss_meanflow=0.0000 mean_model_t=0.2071 mean_corrupt_t=0.2071 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.2505 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.1810 corrupt_frac=1.0000 acc_corrupt=0.1810 loss_corrupt=4.2894 wrong_frac=0.7928 init_acc_corrupt=0.1154 acc_corrupt_t_0p0_0p2=0.0737 corrupt_frac_t_0p0_0p2=0.5632 acc_corrupt_t_0p2_0p4=0.2685 corrupt_frac_t_0p2_0p4=0.3546 acc_corrupt_t_0p4_0p6=0.5234 corrupt_frac_t_0p4_0p6=0.0745 acc_corrupt_t_0p6_0p8=0.6916 corrupt_frac_t_0p6_0p8=0.0118 acc_corrupt_t_0p8_1p0=0.8477 corrupt_frac_t_0p8_1p0=0.0078 out_w_norm=8.4098 out_g_norm=0.4460 loss_all=3.8381 init_gold_top10=0.2167 init_gold_top100=0.5613 rollout_applied_pos_frac=0.2656 init_acc_rollout_applied=0.1174 init_acc_rollout_kept=0.1265 logit_acc_rollout_applied=0.2086 logit_acc_rollout_kept=0.2095
194
+ step=600 epoch=600/1000 epoch_step=1/1 micro_steps=600 elapsed=10.5s lr=2.000000e-03 loss=3.4621 loss_recon=3.4621 loss_meanflow=0.0000 mean_model_t=0.2074 mean_corrupt_t=0.2074 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.2479 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.2362 corrupt_frac=1.0000 acc_corrupt=0.2362 loss_corrupt=3.4621 wrong_frac=0.7927 init_acc_corrupt=0.1163 acc_corrupt_t_0p0_0p2=0.0926 corrupt_frac_t_0p0_0p2=0.5617 acc_corrupt_t_0p2_0p4=0.3686 corrupt_frac_t_0p2_0p4=0.3570 acc_corrupt_t_0p4_0p6=0.6327 corrupt_frac_t_0p4_0p6=0.0729 acc_corrupt_t_0p6_0p8=0.7703 corrupt_frac_t_0p6_0p8=0.0124 out_w_norm=9.5687 out_g_norm=0.4953 acc_corrupt_t_0p8_1p0=0.9062 corrupt_frac_t_0p8_1p0=0.0078 loss_all=3.2426 init_gold_top10=0.2374 init_gold_top100=0.5941 rollout_applied_pos_frac=0.3203 init_acc_rollout_applied=0.1114 init_acc_rollout_kept=0.1015 logit_acc_rollout_applied=0.2544 logit_acc_rollout_kept=0.2525
195
+ step=700 epoch=700/1000 epoch_step=1/1 micro_steps=700 elapsed=10.5s lr=2.000000e-03 loss=2.6374 loss_recon=2.6374 loss_meanflow=0.0000 mean_model_t=0.2094 mean_corrupt_t=0.2094 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.2563 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.3429 corrupt_frac=1.0000 acc_corrupt=0.3429 loss_corrupt=2.6374 wrong_frac=0.7908 init_acc_corrupt=0.1210 acc_corrupt_t_0p0_0p2=0.1373 corrupt_frac_t_0p0_0p2=0.5550 acc_corrupt_t_0p2_0p4=0.5469 corrupt_frac_t_0p2_0p4=0.3587 acc_corrupt_t_0p4_0p6=0.8090 corrupt_frac_t_0p4_0p6=0.0781 acc_corrupt_t_0p6_0p8=0.8866 corrupt_frac_t_0p6_0p8=0.0129 out_w_norm=10.2457 out_g_norm=0.6162 acc_corrupt_t_0p8_1p0=0.9414 corrupt_frac_t_0p8_1p0=0.0078 loss_all=2.1735 init_gold_top10=0.2986 init_gold_top100=0.5359 rollout_applied_pos_frac=0.1953 init_acc_rollout_applied=0.1772 init_acc_rollout_kept=0.1197 logit_acc_rollout_applied=0.5550 logit_acc_rollout_kept=0.3905
196
+ step=800 epoch=800/1000 epoch_step=1/1 micro_steps=800 elapsed=10.5s lr=2.000000e-03 loss=1.9491 loss_recon=1.9491 loss_meanflow=0.0000 mean_model_t=0.2099 mean_corrupt_t=0.2099 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.2495 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.4768 corrupt_frac=1.0000 acc_corrupt=0.4768 loss_corrupt=1.9491 wrong_frac=0.7896 init_acc_corrupt=0.1239 acc_corrupt_t_0p0_0p2=0.2186 corrupt_frac_t_0p0_0p2=0.5548 acc_corrupt_t_0p2_0p4=0.7614 corrupt_frac_t_0p2_0p4=0.3585 acc_corrupt_t_0p4_0p6=0.9506 corrupt_frac_t_0p4_0p6=0.0790 out_w_norm=10.6456 out_g_norm=0.7643 acc_corrupt_t_0p6_0p8=0.9743 corrupt_frac_t_0p6_0p8=0.0130 acc_corrupt_t_0p8_1p0=0.9648 corrupt_frac_t_0p8_1p0=0.0078 loss_all=1.5478 init_gold_top10=0.3396 init_gold_top100=0.5577 rollout_applied_pos_frac=0.2188 init_acc_rollout_applied=0.1821 init_acc_rollout_kept=0.1344 logit_acc_rollout_applied=0.6747 logit_acc_rollout_kept=0.5520
197
+ step=900 epoch=900/1000 epoch_step=1/1 micro_steps=900 elapsed=10.5s lr=2.000000e-03 loss=1.3891 loss_recon=1.3891 loss_meanflow=0.0000 mean_model_t=0.2107 mean_corrupt_t=0.2107 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.2528 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.6025 corrupt_frac=1.0000 acc_corrupt=0.6025 loss_corrupt=1.3891 wrong_frac=0.7895 init_acc_corrupt=0.1286 acc_corrupt_t_0p0_0p2=0.3456 corrupt_frac_t_0p0_0p2=0.5521 acc_corrupt_t_0p2_0p4=0.9013 corrupt_frac_t_0p2_0p4=0.3599 acc_corrupt_t_0p4_0p6=0.9920 corrupt_frac_t_0p4_0p6=0.0788 acc_corrupt_t_0p6_0p8=0.9927 corrupt_frac_t_0p6_0p8=0.0133 out_w_norm=10.9519 out_g_norm=0.8195 acc_corrupt_t_0p8_1p0=0.9948 corrupt_frac_t_0p8_1p0=0.0078 loss_all=1.1679 init_gold_top10=0.3795 init_gold_top100=0.5918 rollout_applied_pos_frac=0.2891 init_acc_rollout_applied=0.1549 init_acc_rollout_kept=0.1097 logit_acc_rollout_applied=0.7378 logit_acc_rollout_kept=0.6284
198
+ step=1000 epoch=1000/1000 epoch_step=1/1 micro_steps=1000 elapsed=10.5s lr=2.000000e-03 loss=1.0243 loss_recon=1.0243 loss_meanflow=0.0000 mean_model_t=0.2112 mean_corrupt_t=0.2112 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.2503 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.6821 corrupt_frac=1.0000 acc_corrupt=0.6821 loss_corrupt=1.0243 wrong_frac=0.7887 init_acc_corrupt=0.1319 acc_corrupt_t_0p0_0p2=0.4452 corrupt_frac_t_0p0_0p2=0.5516 acc_corrupt_t_0p2_0p4=0.9674 corrupt_frac_t_0p2_0p4=0.3570 acc_corrupt_t_0p4_0p6=0.9980 corrupt_frac_t_0p4_0p6=0.0810 out_w_norm=11.1775 out_g_norm=0.8899 acc_corrupt_t_0p6_0p8=0.9961 corrupt_frac_t_0p6_0p8=0.0137 acc_corrupt_t_0p8_1p0=0.9867 corrupt_frac_t_0p8_1p0=0.0078 loss_all=0.7826 init_gold_top10=0.3986 init_gold_top100=0.5936 rollout_applied_pos_frac=0.2891 init_acc_rollout_applied=0.2080 init_acc_rollout_kept=0.1181 logit_acc_rollout_applied=0.7562 logit_acc_rollout_kept=0.7304
LTA_openwebtext_dualt/logs/softendpoint_mn_pilot_4gpu/train8_rollin_synct_len256_synct_p50_s4_i32_20260517_1800synct.log ADDED
@@ -0,0 +1,194 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ NCCL version 2.25.1+cuda12.8
2
+ {
3
+ "device": "cuda:0",
4
+ "rank": 0,
5
+ "world_size": 4,
6
+ "samples": "owt_cached_chunks:8",
7
+ "vocab_size": 969,
8
+ "tokenizer_vocab_size": 50257,
9
+ "save_dir": "runs/train8_rollin_synct_len256_synct_p50_s4_i32_20260517_1800synct",
10
+ "batch_size": 128,
11
+ "grad_accum": 1,
12
+ "effective_batch_size": 512,
13
+ "global_batch_size": 512,
14
+ "lr_schedule": "constant_warmup",
15
+ "optimizer": "muon",
16
+ "epochs": 0.0,
17
+ "steps_per_epoch": 1,
18
+ "total_steps": 500,
19
+ "warmup_steps": 10,
20
+ "warmup_epochs": -1.0,
21
+ "min_lr": 0.0,
22
+ "weight_decay": 0.1,
23
+ "output_weight_decay": -1.0,
24
+ "adamw_param_groups": "nanogpt",
25
+ "adam_beta1": 0.9,
26
+ "adam_beta2": 0.95,
27
+ "adam_eps": 1e-08,
28
+ "muon_impl": "legacy",
29
+ "muon_momentum": 0.95,
30
+ "muon_ns_steps": 5,
31
+ "muon_update_scale": 1.0,
32
+ "muon_nesterov": false,
33
+ "muon_width_scale": false,
34
+ "muon_grouping": "legacy_dim_ge_2",
35
+ "muon_param_count": 1965440,
36
+ "muon_adam_param_count": 8192,
37
+ "muon_param_names": [
38
+ "vocab_embed.embedding",
39
+ "sigma_map.net.0.weight",
40
+ "sigma_map.net.2.weight",
41
+ "blocks.0.attn_qkv.weight",
42
+ "blocks.0.attn_out.weight",
43
+ "blocks.0.mlp.0.weight",
44
+ "blocks.0.mlp.2.weight",
45
+ "blocks.0.adaLN_modulation.weight",
46
+ "blocks.1.attn_qkv.weight",
47
+ "blocks.1.attn_out.weight",
48
+ "blocks.1.mlp.0.weight",
49
+ "blocks.1.mlp.2.weight",
50
+ "blocks.1.adaLN_modulation.weight",
51
+ "blocks.2.attn_qkv.weight",
52
+ "blocks.2.attn_out.weight",
53
+ "blocks.2.mlp.0.weight",
54
+ "blocks.2.mlp.2.weight",
55
+ "blocks.2.adaLN_modulation.weight",
56
+ "output_layer.linear.weight",
57
+ "output_layer.adaLN_modulation.weight"
58
+ ],
59
+ "muon_adam_param_names": [
60
+ "sigma_map.net.0.bias",
61
+ "sigma_map.net.2.bias",
62
+ "blocks.0.norm1.weight",
63
+ "blocks.0.norm2.weight",
64
+ "blocks.0.mlp.0.bias",
65
+ "blocks.0.mlp.2.bias",
66
+ "blocks.0.adaLN_modulation.bias",
67
+ "blocks.1.norm1.weight",
68
+ "blocks.1.norm2.weight",
69
+ "blocks.1.mlp.0.bias",
70
+ "blocks.1.mlp.2.bias",
71
+ "blocks.1.adaLN_modulation.bias",
72
+ "blocks.2.norm1.weight",
73
+ "blocks.2.norm2.weight",
74
+ "blocks.2.mlp.0.bias",
75
+ "blocks.2.mlp.2.bias",
76
+ "blocks.2.adaLN_modulation.bias",
77
+ "output_layer.norm_final.weight",
78
+ "output_layer.adaLN_modulation.bias"
79
+ ],
80
+ "muon_effective_nesterov": false,
81
+ "muon_effective_width_scale": false,
82
+ "muon_effective_weight_decay": 0.1,
83
+ "muon_adam_fallback_nesterov": false,
84
+ "muon_adam_fallback_weight_decay": 0.1,
85
+ "ema_decay": 0.9999,
86
+ "ema_start_step": 0,
87
+ "model_type": "ddit",
88
+ "ddit_mlp_type": "gelu",
89
+ "elf_num_time_tokens": 4,
90
+ "elf_num_model_mode_tokens": 0,
91
+ "qk_norm": true,
92
+ "output_bias": false,
93
+ "output_init_std": -1.0,
94
+ "norm_type": "rmsnorm",
95
+ "target_loss": "hard_ce",
96
+ "linear_soft_target_power": 1.0,
97
+ "linear_soft_target_min_conf": 0.0,
98
+ "linear_soft_target_max_conf": 1.0,
99
+ "t_sampling_mode": "logit_normal",
100
+ "t_sampling_power": 1.0,
101
+ "t_sampling_eps": 0.0001,
102
+ "t_sampling_logit_mean": -1.5,
103
+ "t_sampling_logit_std": 0.8,
104
+ "dual_t": true,
105
+ "corrupt_t_mode": "same",
106
+ "corrupt_min_t": 0.0,
107
+ "corrupt_max_t": 1.0,
108
+ "prefix_block_prob": 0.0,
109
+ "prefix_block_len": 128,
110
+ "mask_ratio_floor_schedule": "none",
111
+ "dirichlet_endpoint_mode": "categorical_dual_t",
112
+ "dirichlet_semantic_t_mode": "same",
113
+ "dirichlet_semantic_t_value": 0.0,
114
+ "dirichlet_semantic_t_curve": "linear",
115
+ "dirichlet_semantic_t_power": 1.0,
116
+ "endpoint_sequence_random_prob_alpha": 0.0,
117
+ "categorical_wrong_from_full_vocab": true,
118
+ "categorical_wrong_from_batch_valid_tokens": false,
119
+ "categorical_wrong_basin_token_ids": "",
120
+ "categorical_wrong_basin_prob": 0.0,
121
+ "categorical_wrong_unigram_prob": 0.0,
122
+ "categorical_wrong_uniform_prob": 0.0,
123
+ "categorical_wrong_prob_floor": 0.0,
124
+ "categorical_wrong_corpus_unigram_path": "",
125
+ "categorical_wrong_corpus_unigram_alpha": 1.0,
126
+ "categorical_wrong_basin_shared_prob": 0.0,
127
+ "categorical_wrong_unigram_shared_prob": 0.0,
128
+ "mask_mixture_original_prob": 0.0,
129
+ "mask_mixture_lowk_prob": 0.0,
130
+ "mask_mixture_lowcorrupt_prob": 0.0,
131
+ "mask_mixture_block_prob": 0.0,
132
+ "mask_mixture_all_prob": 1.0,
133
+ "mask_mixture_lowk_clean_tokens": "0",
134
+ "mask_mixture_lowcorrupt_tokens": "1,2,4,8,16,32,64",
135
+ "mask_mixture_block_tokens": "64,128",
136
+ "simplex_bridge_sampler": "dirichlet",
137
+ "logistic_normal_sigma_min": 0.1,
138
+ "logistic_normal_sigma_max": 1.0,
139
+ "logistic_normal_tau_min": 1.0,
140
+ "logistic_normal_tau_max": 1.0,
141
+ "torch_compile": false,
142
+ "compile_mode": "max-autotune",
143
+ "state_format": "prob",
144
+ "meanflow_weight": 0.0,
145
+ "rollout_train_prob": 0.5,
146
+ "rollout_train_steps": 4,
147
+ "rollout_train_infer_steps": 32,
148
+ "rollout_train_temp": 1.45,
149
+ "rollout_train_max_gamma": 1.0,
150
+ "rollout_train_corrupt_only": true,
151
+ "rollout_train_samplewise": true,
152
+ "rollout_train_compute_always": false,
153
+ "rollout_train_sync_t": true,
154
+ "bridge_noise_init": "logistic_normal",
155
+ "noise_sigma": -1.0,
156
+ "allow_tf32": true,
157
+ "activation_checkpointing": false,
158
+ "activation_checkpoint_interval": 1,
159
+ "activation_checkpoint_scope": "block",
160
+ "ddp_static_graph": false,
161
+ "ddp_gradient_as_bucket_view": true,
162
+ "blocking_data_transfer": false,
163
+ "dataloader_prefetch_factor": 4,
164
+ "full_train_stats": false,
165
+ "tokenized_hf": false,
166
+ "tokenized_pad_token": "pad",
167
+ "elf_conditional_hf": false,
168
+ "record_pad_truncate": false,
169
+ "record_add_eos": false,
170
+ "record_add_special_tokens": false,
171
+ "record_pad_token": "pad",
172
+ "record_shuffle_buffer": 10000,
173
+ "wrap": true,
174
+ "wrap_mode": "stream",
175
+ "wrap_record_buffer_size": 200,
176
+ "owt_cached_chunks": true,
177
+ "owt_chunk_cache_dir": "/e2e-data/evad-tech-vla/wanghan58/data/small_benchmarks/langflow_2604_11748/openwebtext_lta_cached_chunks/gpt2_len256_train8_compact_overfit",
178
+ "owt_chunk_cache_rebuild": false,
179
+ "owt_chunk_cache_write_batch": 4096,
180
+ "owt_exact_repeat_per_chunk": 64,
181
+ "online_chunk_shuffle": false,
182
+ "online_chunk_shuffle_buffer": 10000,
183
+ "openwebtext_split": "train_minus_100k",
184
+ "detokenizer": "auto",
185
+ "resolved_detokenizer": null,
186
+ "num_workers": 0,
187
+ "latest_every": 500,
188
+ "resume_path": ""
189
+ }
190
+ step=100 epoch=100/500 epoch_step=1/1 micro_steps=100 elapsed=8.0s lr=2.000000e-03 loss=6.7066 loss_recon=6.7066 loss_meanflow=0.0000 mean_model_t=0.2083 mean_corrupt_t=0.2083 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.5128 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.0995 corrupt_frac=1.0000 acc_corrupt=0.0995 loss_corrupt=6.7066 wrong_frac=0.7915 init_acc_corrupt=0.1159 acc_corrupt_t_0p0_0p2=0.0485 corrupt_frac_t_0p0_0p2=0.5559 acc_corrupt_t_0p2_0p4=0.1326 corrupt_frac_t_0p2_0p4=0.3589 acc_corrupt_t_0p4_0p6=0.2811 corrupt_frac_t_0p4_0p6=0.0773 acc_corrupt_t_0p6_0p8=0.4046 corrupt_frac_t_0p6_0p8=0.0121 out_w_norm=1.0998 out_g_norm=1.0064 loss_all=6.4488 init_gold_top10=0.2091 init_gold_top100=0.4888 rollout_applied_pos_frac=0.4844 init_acc_rollout_applied=0.1132 init_acc_rollout_kept=0.1206 logit_acc_rollout_applied=0.1075 logit_acc_rollout_kept=0.0998
191
+ step=200 epoch=200/500 epoch_step=1/1 micro_steps=200 elapsed=7.2s lr=2.000000e-03 loss=6.0975 loss_recon=6.0975 loss_meanflow=0.0000 mean_model_t=0.2108 mean_corrupt_t=0.2108 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.4954 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.1100 corrupt_frac=1.0000 acc_corrupt=0.1100 loss_corrupt=6.0975 wrong_frac=0.7892 init_acc_corrupt=0.1186 acc_corrupt_t_0p0_0p2=0.0549 corrupt_frac_t_0p0_0p2=0.5516 acc_corrupt_t_0p2_0p4=0.1475 corrupt_frac_t_0p2_0p4=0.3621 acc_corrupt_t_0p4_0p6=0.2919 corrupt_frac_t_0p4_0p6=0.0781 acc_corrupt_t_0p6_0p8=0.4261 corrupt_frac_t_0p6_0p8=0.0123 out_w_norm=3.3114 out_g_norm=1.4050 acc_corrupt_t_0p8_1p0=0.4753 corrupt_frac_t_0p8_1p0=0.0078 loss_all=5.8306 init_gold_top10=0.2014 init_gold_top100=0.5203 rollout_applied_pos_frac=0.5234 init_acc_rollout_applied=0.0961 init_acc_rollout_kept=0.1255 logit_acc_rollout_applied=0.1030 logit_acc_rollout_kept=0.1140
192
+ step=300 epoch=300/500 epoch_step=1/1 micro_steps=300 elapsed=7.2s lr=2.000000e-03 loss=5.5630 loss_recon=5.5630 loss_meanflow=0.0000 mean_model_t=0.2067 mean_corrupt_t=0.2067 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.5003 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.1197 corrupt_frac=1.0000 acc_corrupt=0.1197 loss_corrupt=5.5630 wrong_frac=0.7935 init_acc_corrupt=0.1144 acc_corrupt_t_0p0_0p2=0.0585 corrupt_frac_t_0p0_0p2=0.5641 acc_corrupt_t_0p2_0p4=0.1669 corrupt_frac_t_0p2_0p4=0.3542 acc_corrupt_t_0p4_0p6=0.3223 corrupt_frac_t_0p4_0p6=0.0734 acc_corrupt_t_0p6_0p8=0.4699 corrupt_frac_t_0p6_0p8=0.0121 out_w_norm=5.1555 out_g_norm=0.7117 acc_corrupt_t_0p8_1p0=0.6198 corrupt_frac_t_0p8_1p0=0.0078 loss_all=5.3002 init_gold_top10=0.2069 init_gold_top100=0.5427 rollout_applied_pos_frac=0.4688 init_acc_rollout_applied=0.1296 init_acc_rollout_kept=0.1153 logit_acc_rollout_applied=0.1342 logit_acc_rollout_kept=0.1291
193
+ step=400 epoch=400/500 epoch_step=1/1 micro_steps=400 elapsed=7.2s lr=2.000000e-03 loss=4.9812 loss_recon=4.9812 loss_meanflow=0.0000 mean_model_t=0.2085 mean_corrupt_t=0.2085 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.5077 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.1463 corrupt_frac=1.0000 acc_corrupt=0.1463 loss_corrupt=4.9812 wrong_frac=0.7917 init_acc_corrupt=0.1168 acc_corrupt_t_0p0_0p2=0.0641 corrupt_frac_t_0p0_0p2=0.5564 acc_corrupt_t_0p2_0p4=0.2014 corrupt_frac_t_0p2_0p4=0.3620 acc_corrupt_t_0p4_0p6=0.4385 corrupt_frac_t_0p4_0p6=0.0719 out_w_norm=6.8356 out_g_norm=0.4138 acc_corrupt_t_0p6_0p8=0.6431 corrupt_frac_t_0p6_0p8=0.0131 acc_corrupt_t_0p8_1p0=0.7734 corrupt_frac_t_0p8_1p0=0.0078 loss_all=4.7286 init_gold_top10=0.1997 init_gold_top100=0.5761 rollout_applied_pos_frac=0.4297 init_acc_rollout_applied=0.0887 init_acc_rollout_kept=0.1080 logit_acc_rollout_applied=0.1376 logit_acc_rollout_kept=0.1628
194
+ step=500 epoch=500/500 epoch_step=1/1 micro_steps=500 elapsed=7.2s lr=2.000000e-03 loss=4.1926 loss_recon=4.1926 loss_meanflow=0.0000 mean_model_t=0.2071 mean_corrupt_t=0.2071 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.5024 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.1859 corrupt_frac=1.0000 acc_corrupt=0.1859 loss_corrupt=4.1926 wrong_frac=0.7928 init_acc_corrupt=0.1163 acc_corrupt_t_0p0_0p2=0.0756 corrupt_frac_t_0p0_0p2=0.5632 acc_corrupt_t_0p2_0p4=0.2771 corrupt_frac_t_0p2_0p4=0.3546 acc_corrupt_t_0p4_0p6=0.5321 corrupt_frac_t_0p4_0p6=0.0745 acc_corrupt_t_0p6_0p8=0.6971 corrupt_frac_t_0p6_0p8=0.0118 acc_corrupt_t_0p8_1p0=0.8555 corrupt_frac_t_0p8_1p0=0.0078 out_w_norm=8.3476 out_g_norm=0.4581 loss_all=3.7301 init_gold_top10=0.2301 init_gold_top100=0.6757 rollout_applied_pos_frac=0.5156 init_acc_rollout_applied=0.1155 init_acc_rollout_kept=0.1360 logit_acc_rollout_applied=0.2035 logit_acc_rollout_kept=0.2262
LTA_openwebtext_dualt/logs/wmt14_deen_elfofficial_4gpu_debug/debug_wmt14_deen_elfofficial_t5_len128_in64_dditelf_muon_logitnormal_m1p5_s0p8_none_floor0p0_zeroout_4gpu_20260516_020150.log ADDED
@@ -0,0 +1,298 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ NCCL version 2.25.1+cuda12.8
2
+ {
3
+ "device": "cuda:0",
4
+ "rank": 0,
5
+ "world_size": 4,
6
+ "samples": "elf_conditional_hf:8192:max_input_len=64:pad=1:loss_on_pad=1",
7
+ "vocab_size": 32100,
8
+ "tokenizer_vocab_size": 32100,
9
+ "save_dir": "runs/debug_wmt14_deen_elfofficial_t5_len128_in64_dditelf_muon_logitnormal_m1p5_s0p8_none_floor0p0_zeroout_4gpu_20260516_020150",
10
+ "batch_size": 32,
11
+ "grad_accum": 4,
12
+ "effective_batch_size": 512,
13
+ "global_batch_size": 512,
14
+ "lr_schedule": "constant_warmup",
15
+ "optimizer": "muon",
16
+ "epochs": 0.0,
17
+ "steps_per_epoch": 16,
18
+ "total_steps": 10,
19
+ "warmup_steps": 8,
20
+ "warmup_epochs": 0.5,
21
+ "min_lr": 0.0,
22
+ "weight_decay": 0.0,
23
+ "output_weight_decay": -1.0,
24
+ "adamw_param_groups": "nanogpt",
25
+ "adam_beta1": 0.9,
26
+ "adam_beta2": 0.95,
27
+ "adam_eps": 1e-08,
28
+ "muon_impl": "optax",
29
+ "muon_momentum": 0.95,
30
+ "muon_ns_steps": 5,
31
+ "muon_update_scale": 1.0,
32
+ "muon_nesterov": true,
33
+ "muon_width_scale": true,
34
+ "muon_grouping": "hidden_2d",
35
+ "muon_param_count": 84934656,
36
+ "muon_adam_param_count": 50212608,
37
+ "muon_param_names": [
38
+ "blocks.0.attn_qkv.weight",
39
+ "blocks.0.attn_out.weight",
40
+ "blocks.0.mlp.w12.weight",
41
+ "blocks.0.mlp.w3.weight",
42
+ "blocks.1.attn_qkv.weight",
43
+ "blocks.1.attn_out.weight",
44
+ "blocks.1.mlp.w12.weight",
45
+ "blocks.1.mlp.w3.weight",
46
+ "blocks.2.attn_qkv.weight",
47
+ "blocks.2.attn_out.weight",
48
+ "blocks.2.mlp.w12.weight",
49
+ "blocks.2.mlp.w3.weight",
50
+ "blocks.3.attn_qkv.weight",
51
+ "blocks.3.attn_out.weight",
52
+ "blocks.3.mlp.w12.weight",
53
+ "blocks.3.mlp.w3.weight",
54
+ "blocks.4.attn_qkv.weight",
55
+ "blocks.4.attn_out.weight",
56
+ "blocks.4.mlp.w12.weight",
57
+ "blocks.4.mlp.w3.weight",
58
+ "blocks.5.attn_qkv.weight",
59
+ "blocks.5.attn_out.weight",
60
+ "blocks.5.mlp.w12.weight",
61
+ "blocks.5.mlp.w3.weight",
62
+ "blocks.6.attn_qkv.weight",
63
+ "blocks.6.attn_out.weight",
64
+ "blocks.6.mlp.w12.weight",
65
+ "blocks.6.mlp.w3.weight",
66
+ "blocks.7.attn_qkv.weight",
67
+ "blocks.7.attn_out.weight",
68
+ "blocks.7.mlp.w12.weight",
69
+ "blocks.7.mlp.w3.weight",
70
+ "blocks.8.attn_qkv.weight",
71
+ "blocks.8.attn_out.weight",
72
+ "blocks.8.mlp.w12.weight",
73
+ "blocks.8.mlp.w3.weight",
74
+ "blocks.9.attn_qkv.weight",
75
+ "blocks.9.attn_out.weight",
76
+ "blocks.9.mlp.w12.weight",
77
+ "blocks.9.mlp.w3.weight",
78
+ "blocks.10.attn_qkv.weight",
79
+ "blocks.10.attn_out.weight",
80
+ "blocks.10.mlp.w12.weight",
81
+ "blocks.10.mlp.w3.weight",
82
+ "blocks.11.attn_qkv.weight",
83
+ "blocks.11.attn_out.weight",
84
+ "blocks.11.mlp.w12.weight",
85
+ "blocks.11.mlp.w3.weight"
86
+ ],
87
+ "muon_adam_param_names": [
88
+ "time_tokens",
89
+ "vocab_embed.embedding",
90
+ "sigma_map.net.0.weight",
91
+ "sigma_map.net.0.bias",
92
+ "sigma_map.net.2.weight",
93
+ "sigma_map.net.2.bias",
94
+ "blocks.0.norm1.weight",
95
+ "blocks.0.attn_qkv.bias",
96
+ "blocks.0.attn_out.bias",
97
+ "blocks.0.q_norm.weight",
98
+ "blocks.0.k_norm.weight",
99
+ "blocks.0.norm2.weight",
100
+ "blocks.0.mlp.w12.bias",
101
+ "blocks.0.mlp.w3.bias",
102
+ "blocks.1.norm1.weight",
103
+ "blocks.1.attn_qkv.bias",
104
+ "blocks.1.attn_out.bias",
105
+ "blocks.1.q_norm.weight",
106
+ "blocks.1.k_norm.weight",
107
+ "blocks.1.norm2.weight",
108
+ "blocks.1.mlp.w12.bias",
109
+ "blocks.1.mlp.w3.bias",
110
+ "blocks.2.norm1.weight",
111
+ "blocks.2.attn_qkv.bias",
112
+ "blocks.2.attn_out.bias",
113
+ "blocks.2.q_norm.weight",
114
+ "blocks.2.k_norm.weight",
115
+ "blocks.2.norm2.weight",
116
+ "blocks.2.mlp.w12.bias",
117
+ "blocks.2.mlp.w3.bias",
118
+ "blocks.3.norm1.weight",
119
+ "blocks.3.attn_qkv.bias",
120
+ "blocks.3.attn_out.bias",
121
+ "blocks.3.q_norm.weight",
122
+ "blocks.3.k_norm.weight",
123
+ "blocks.3.norm2.weight",
124
+ "blocks.3.mlp.w12.bias",
125
+ "blocks.3.mlp.w3.bias",
126
+ "blocks.4.norm1.weight",
127
+ "blocks.4.attn_qkv.bias",
128
+ "blocks.4.attn_out.bias",
129
+ "blocks.4.q_norm.weight",
130
+ "blocks.4.k_norm.weight",
131
+ "blocks.4.norm2.weight",
132
+ "blocks.4.mlp.w12.bias",
133
+ "blocks.4.mlp.w3.bias",
134
+ "blocks.5.norm1.weight",
135
+ "blocks.5.attn_qkv.bias",
136
+ "blocks.5.attn_out.bias",
137
+ "blocks.5.q_norm.weight",
138
+ "blocks.5.k_norm.weight",
139
+ "blocks.5.norm2.weight",
140
+ "blocks.5.mlp.w12.bias",
141
+ "blocks.5.mlp.w3.bias",
142
+ "blocks.6.norm1.weight",
143
+ "blocks.6.attn_qkv.bias",
144
+ "blocks.6.attn_out.bias",
145
+ "blocks.6.q_norm.weight",
146
+ "blocks.6.k_norm.weight",
147
+ "blocks.6.norm2.weight",
148
+ "blocks.6.mlp.w12.bias",
149
+ "blocks.6.mlp.w3.bias",
150
+ "blocks.7.norm1.weight",
151
+ "blocks.7.attn_qkv.bias",
152
+ "blocks.7.attn_out.bias",
153
+ "blocks.7.q_norm.weight",
154
+ "blocks.7.k_norm.weight",
155
+ "blocks.7.norm2.weight",
156
+ "blocks.7.mlp.w12.bias",
157
+ "blocks.7.mlp.w3.bias",
158
+ "blocks.8.norm1.weight",
159
+ "blocks.8.attn_qkv.bias",
160
+ "blocks.8.attn_out.bias",
161
+ "blocks.8.q_norm.weight",
162
+ "blocks.8.k_norm.weight",
163
+ "blocks.8.norm2.weight",
164
+ "blocks.8.mlp.w12.bias",
165
+ "blocks.8.mlp.w3.bias",
166
+ "blocks.9.norm1.weight",
167
+ "blocks.9.attn_qkv.bias",
168
+ "blocks.9.attn_out.bias",
169
+ "blocks.9.q_norm.weight",
170
+ "blocks.9.k_norm.weight",
171
+ "blocks.9.norm2.weight",
172
+ "blocks.9.mlp.w12.bias",
173
+ "blocks.9.mlp.w3.bias",
174
+ "blocks.10.norm1.weight",
175
+ "blocks.10.attn_qkv.bias",
176
+ "blocks.10.attn_out.bias",
177
+ "blocks.10.q_norm.weight",
178
+ "blocks.10.k_norm.weight",
179
+ "blocks.10.norm2.weight",
180
+ "blocks.10.mlp.w12.bias",
181
+ "blocks.10.mlp.w3.bias",
182
+ "blocks.11.norm1.weight",
183
+ "blocks.11.attn_qkv.bias",
184
+ "blocks.11.attn_out.bias",
185
+ "blocks.11.q_norm.weight",
186
+ "blocks.11.k_norm.weight",
187
+ "blocks.11.norm2.weight",
188
+ "blocks.11.mlp.w12.bias",
189
+ "blocks.11.mlp.w3.bias",
190
+ "output_layer.norm_final.weight",
191
+ "output_layer.linear.weight"
192
+ ],
193
+ "muon_effective_nesterov": true,
194
+ "muon_effective_width_scale": true,
195
+ "muon_effective_weight_decay": 0.0,
196
+ "muon_adam_fallback_nesterov": true,
197
+ "muon_adam_fallback_weight_decay": 0.0,
198
+ "ema_decay": 0.9999,
199
+ "ema_start_step": 0,
200
+ "model_type": "ddit_elf",
201
+ "elf_num_time_tokens": 4,
202
+ "elf_num_model_mode_tokens": 0,
203
+ "qk_norm": true,
204
+ "output_bias": false,
205
+ "output_init_std": 0.0,
206
+ "norm_type": "rmsnorm",
207
+ "t_sampling_mode": "logit_normal",
208
+ "t_sampling_power": 1.0,
209
+ "t_sampling_eps": 0.0001,
210
+ "t_sampling_logit_mean": -1.5,
211
+ "t_sampling_logit_std": 0.8,
212
+ "dual_t": true,
213
+ "corrupt_t_mode": "same",
214
+ "corrupt_min_t": 0.0,
215
+ "corrupt_max_t": 1.0,
216
+ "prefix_block_prob": 0.0,
217
+ "prefix_block_len": 128,
218
+ "mask_ratio_floor_schedule": "none",
219
+ "dirichlet_endpoint_mode": "categorical_dual_t",
220
+ "dirichlet_semantic_t_mode": "same",
221
+ "dirichlet_semantic_t_value": 0.0,
222
+ "dirichlet_semantic_t_curve": "linear",
223
+ "dirichlet_semantic_t_power": 1.0,
224
+ "endpoint_sequence_random_prob_alpha": 0.0,
225
+ "categorical_wrong_from_full_vocab": true,
226
+ "categorical_wrong_from_batch_valid_tokens": false,
227
+ "categorical_wrong_basin_token_ids": "",
228
+ "categorical_wrong_basin_prob": 0.0,
229
+ "categorical_wrong_unigram_prob": 0.0,
230
+ "categorical_wrong_uniform_prob": 0.0,
231
+ "categorical_wrong_corpus_unigram_path": "",
232
+ "categorical_wrong_corpus_unigram_alpha": 1.0,
233
+ "categorical_wrong_basin_shared_prob": 0.0,
234
+ "categorical_wrong_unigram_shared_prob": 0.0,
235
+ "mask_mixture_original_prob": 0.0,
236
+ "mask_mixture_lowk_prob": 0.0,
237
+ "mask_mixture_lowcorrupt_prob": 0.0,
238
+ "mask_mixture_block_prob": 0.0,
239
+ "mask_mixture_all_prob": 0.0,
240
+ "mask_mixture_lowk_clean_tokens": "1,2,4,8,16,32,64",
241
+ "mask_mixture_lowcorrupt_tokens": "1,2,4,8,16,32,64",
242
+ "mask_mixture_block_tokens": "64,128",
243
+ "simplex_bridge_sampler": "dirichlet",
244
+ "logistic_normal_sigma_min": 0.18,
245
+ "logistic_normal_sigma_max": 2.2,
246
+ "logistic_normal_tau_min": 0.65,
247
+ "logistic_normal_tau_max": 1.15,
248
+ "torch_compile": false,
249
+ "compile_mode": "max-autotune",
250
+ "state_format": "prob",
251
+ "target_loss": "hard_ce",
252
+ "meanflow_weight": 0.0,
253
+ "rollout_train_prob": 0.0,
254
+ "rollout_train_steps": 1,
255
+ "rollout_train_infer_steps": 64,
256
+ "rollout_train_temp": 1.45,
257
+ "rollout_train_max_gamma": 1.0,
258
+ "rollout_train_corrupt_only": true,
259
+ "rollout_train_samplewise": false,
260
+ "rollout_train_compute_always": false,
261
+ "bridge_noise_init": "logistic_normal",
262
+ "noise_sigma": -1.0,
263
+ "allow_tf32": true,
264
+ "activation_checkpointing": true,
265
+ "activation_checkpoint_interval": 1,
266
+ "activation_checkpoint_scope": "mlp",
267
+ "ddp_static_graph": false,
268
+ "ddp_gradient_as_bucket_view": true,
269
+ "blocking_data_transfer": false,
270
+ "dataloader_prefetch_factor": 4,
271
+ "full_train_stats": false,
272
+ "tokenized_hf": false,
273
+ "tokenized_pad_token": "pad",
274
+ "elf_conditional_hf": true,
275
+ "record_pad_truncate": false,
276
+ "record_add_eos": false,
277
+ "record_add_special_tokens": false,
278
+ "record_pad_token": "pad",
279
+ "record_shuffle_buffer": 10000,
280
+ "wrap": false,
281
+ "wrap_mode": "stream",
282
+ "wrap_record_buffer_size": 200,
283
+ "owt_cached_chunks": false,
284
+ "owt_chunk_cache_dir": "",
285
+ "owt_chunk_cache_rebuild": false,
286
+ "owt_chunk_cache_write_batch": 4096,
287
+ "owt_exact_repeat_per_chunk": 0,
288
+ "online_chunk_shuffle": false,
289
+ "online_chunk_shuffle_buffer": 10000,
290
+ "openwebtext_split": "all",
291
+ "detokenizer": "auto",
292
+ "resolved_detokenizer": null,
293
+ "num_workers": 8,
294
+ "latest_every": 10,
295
+ "resume_path": ""
296
+ }
297
+ step=5 epoch=1/1 epoch_step=5/16 micro_steps=20 elapsed=3.3s lr=7.500000e-04 loss=9.8932 loss_recon=9.8932 loss_meanflow=0.0000 mean_model_t=0.2029 mean_corrupt_t=0.2029 mean_loss_t_weight=1.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.0184 corrupt_frac=0.7514 acc_corrupt=0.3652 loss_corrupt=9.8932 wrong_frac=0.8011 init_acc_corrupt=0.1049 acc_corrupt_t_0p0_0p2=0.3351 corrupt_frac_t_0p0_0p2=0.6084 acc_corrupt_t_0p2_0p4=0.4118 corrupt_frac_t_0p2_0p4=0.3064 out_w_norm=2.5940 out_g_norm=7.9256 acc_corrupt_t_0p4_0p6=0.4473 corrupt_frac_t_0p4_0p6=0.0879 acc_corrupt_t_0p6_0p8=0.1730 corrupt_frac_t_0p6_0p8=0.0257 loss_all=9.6665 init_gold_top10=0.2277 init_gold_top100=0.6675
298
+ step=10 epoch=1/1 epoch_step=10/16 micro_steps=40 elapsed=2.3s lr=1.000000e-03 loss=7.0102 loss_recon=7.0102 loss_meanflow=0.0000 mean_model_t=0.2051 mean_corrupt_t=0.2051 mean_loss_t_weight=1.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.0250 corrupt_frac=0.7331 acc_corrupt=0.6253 loss_corrupt=7.0102 wrong_frac=0.7990 init_acc_corrupt=0.1080 acc_corrupt_t_0p0_0p2=0.6326 corrupt_frac_t_0p0_0p2=0.5792 acc_corrupt_t_0p2_0p4=0.6154 corrupt_frac_t_0p2_0p4=0.3491 acc_corrupt_t_0p4_0p6=0.6087 corrupt_frac_t_0p4_0p6=0.0721 out_w_norm=15.6885 out_g_norm=10.2636 acc_corrupt_t_0p6_0p8=0.8182 corrupt_frac_t_0p6_0p8=0.0282 acc_corrupt_t_0p8_1p0=0.3125 corrupt_frac_t_0p8_1p0=0.0107 loss_all=8.2540 init_gold_top10=0.1694 init_gold_top100=0.7650
LTA_openwebtext_dualt/logs/wmt14_deen_elfofficial_4gpu_debug/debug_wmt14_deen_elfofficial_t5_len256_in128_dditelf_muon_logitnormal_m1p5_s0p8_none_floor0p0_zeroout_4gpu_20260516_014755.log ADDED
@@ -0,0 +1,326 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ NCCL version 2.25.1+cuda12.8
2
+ {
3
+ "device": "cuda:0",
4
+ "rank": 0,
5
+ "world_size": 4,
6
+ "samples": "elf_conditional_hf:8192:max_input_len=128:pad=1:loss_on_pad=1",
7
+ "vocab_size": 32100,
8
+ "tokenizer_vocab_size": 32100,
9
+ "save_dir": "runs/debug_wmt14_deen_elfofficial_t5_len256_in128_dditelf_muon_logitnormal_m1p5_s0p8_none_floor0p0_zeroout_4gpu_20260516_014755",
10
+ "batch_size": 32,
11
+ "grad_accum": 4,
12
+ "effective_batch_size": 512,
13
+ "global_batch_size": 512,
14
+ "lr_schedule": "constant_warmup",
15
+ "optimizer": "muon",
16
+ "epochs": 0.0,
17
+ "steps_per_epoch": 16,
18
+ "total_steps": 20,
19
+ "warmup_steps": 8,
20
+ "warmup_epochs": 0.5,
21
+ "min_lr": 0.0,
22
+ "weight_decay": 0.0,
23
+ "output_weight_decay": -1.0,
24
+ "adamw_param_groups": "nanogpt",
25
+ "adam_beta1": 0.9,
26
+ "adam_beta2": 0.95,
27
+ "adam_eps": 1e-08,
28
+ "muon_impl": "optax",
29
+ "muon_momentum": 0.95,
30
+ "muon_ns_steps": 5,
31
+ "muon_update_scale": 1.0,
32
+ "muon_nesterov": true,
33
+ "muon_width_scale": true,
34
+ "muon_grouping": "hidden_2d",
35
+ "muon_param_count": 84934656,
36
+ "muon_adam_param_count": 50212608,
37
+ "muon_param_names": [
38
+ "blocks.0.attn_qkv.weight",
39
+ "blocks.0.attn_out.weight",
40
+ "blocks.0.mlp.w12.weight",
41
+ "blocks.0.mlp.w3.weight",
42
+ "blocks.1.attn_qkv.weight",
43
+ "blocks.1.attn_out.weight",
44
+ "blocks.1.mlp.w12.weight",
45
+ "blocks.1.mlp.w3.weight",
46
+ "blocks.2.attn_qkv.weight",
47
+ "blocks.2.attn_out.weight",
48
+ "blocks.2.mlp.w12.weight",
49
+ "blocks.2.mlp.w3.weight",
50
+ "blocks.3.attn_qkv.weight",
51
+ "blocks.3.attn_out.weight",
52
+ "blocks.3.mlp.w12.weight",
53
+ "blocks.3.mlp.w3.weight",
54
+ "blocks.4.attn_qkv.weight",
55
+ "blocks.4.attn_out.weight",
56
+ "blocks.4.mlp.w12.weight",
57
+ "blocks.4.mlp.w3.weight",
58
+ "blocks.5.attn_qkv.weight",
59
+ "blocks.5.attn_out.weight",
60
+ "blocks.5.mlp.w12.weight",
61
+ "blocks.5.mlp.w3.weight",
62
+ "blocks.6.attn_qkv.weight",
63
+ "blocks.6.attn_out.weight",
64
+ "blocks.6.mlp.w12.weight",
65
+ "blocks.6.mlp.w3.weight",
66
+ "blocks.7.attn_qkv.weight",
67
+ "blocks.7.attn_out.weight",
68
+ "blocks.7.mlp.w12.weight",
69
+ "blocks.7.mlp.w3.weight",
70
+ "blocks.8.attn_qkv.weight",
71
+ "blocks.8.attn_out.weight",
72
+ "blocks.8.mlp.w12.weight",
73
+ "blocks.8.mlp.w3.weight",
74
+ "blocks.9.attn_qkv.weight",
75
+ "blocks.9.attn_out.weight",
76
+ "blocks.9.mlp.w12.weight",
77
+ "blocks.9.mlp.w3.weight",
78
+ "blocks.10.attn_qkv.weight",
79
+ "blocks.10.attn_out.weight",
80
+ "blocks.10.mlp.w12.weight",
81
+ "blocks.10.mlp.w3.weight",
82
+ "blocks.11.attn_qkv.weight",
83
+ "blocks.11.attn_out.weight",
84
+ "blocks.11.mlp.w12.weight",
85
+ "blocks.11.mlp.w3.weight"
86
+ ],
87
+ "muon_adam_param_names": [
88
+ "time_tokens",
89
+ "vocab_embed.embedding",
90
+ "sigma_map.net.0.weight",
91
+ "sigma_map.net.0.bias",
92
+ "sigma_map.net.2.weight",
93
+ "sigma_map.net.2.bias",
94
+ "blocks.0.norm1.weight",
95
+ "blocks.0.attn_qkv.bias",
96
+ "blocks.0.attn_out.bias",
97
+ "blocks.0.q_norm.weight",
98
+ "blocks.0.k_norm.weight",
99
+ "blocks.0.norm2.weight",
100
+ "blocks.0.mlp.w12.bias",
101
+ "blocks.0.mlp.w3.bias",
102
+ "blocks.1.norm1.weight",
103
+ "blocks.1.attn_qkv.bias",
104
+ "blocks.1.attn_out.bias",
105
+ "blocks.1.q_norm.weight",
106
+ "blocks.1.k_norm.weight",
107
+ "blocks.1.norm2.weight",
108
+ "blocks.1.mlp.w12.bias",
109
+ "blocks.1.mlp.w3.bias",
110
+ "blocks.2.norm1.weight",
111
+ "blocks.2.attn_qkv.bias",
112
+ "blocks.2.attn_out.bias",
113
+ "blocks.2.q_norm.weight",
114
+ "blocks.2.k_norm.weight",
115
+ "blocks.2.norm2.weight",
116
+ "blocks.2.mlp.w12.bias",
117
+ "blocks.2.mlp.w3.bias",
118
+ "blocks.3.norm1.weight",
119
+ "blocks.3.attn_qkv.bias",
120
+ "blocks.3.attn_out.bias",
121
+ "blocks.3.q_norm.weight",
122
+ "blocks.3.k_norm.weight",
123
+ "blocks.3.norm2.weight",
124
+ "blocks.3.mlp.w12.bias",
125
+ "blocks.3.mlp.w3.bias",
126
+ "blocks.4.norm1.weight",
127
+ "blocks.4.attn_qkv.bias",
128
+ "blocks.4.attn_out.bias",
129
+ "blocks.4.q_norm.weight",
130
+ "blocks.4.k_norm.weight",
131
+ "blocks.4.norm2.weight",
132
+ "blocks.4.mlp.w12.bias",
133
+ "blocks.4.mlp.w3.bias",
134
+ "blocks.5.norm1.weight",
135
+ "blocks.5.attn_qkv.bias",
136
+ "blocks.5.attn_out.bias",
137
+ "blocks.5.q_norm.weight",
138
+ "blocks.5.k_norm.weight",
139
+ "blocks.5.norm2.weight",
140
+ "blocks.5.mlp.w12.bias",
141
+ "blocks.5.mlp.w3.bias",
142
+ "blocks.6.norm1.weight",
143
+ "blocks.6.attn_qkv.bias",
144
+ "blocks.6.attn_out.bias",
145
+ "blocks.6.q_norm.weight",
146
+ "blocks.6.k_norm.weight",
147
+ "blocks.6.norm2.weight",
148
+ "blocks.6.mlp.w12.bias",
149
+ "blocks.6.mlp.w3.bias",
150
+ "blocks.7.norm1.weight",
151
+ "blocks.7.attn_qkv.bias",
152
+ "blocks.7.attn_out.bias",
153
+ "blocks.7.q_norm.weight",
154
+ "blocks.7.k_norm.weight",
155
+ "blocks.7.norm2.weight",
156
+ "blocks.7.mlp.w12.bias",
157
+ "blocks.7.mlp.w3.bias",
158
+ "blocks.8.norm1.weight",
159
+ "blocks.8.attn_qkv.bias",
160
+ "blocks.8.attn_out.bias",
161
+ "blocks.8.q_norm.weight",
162
+ "blocks.8.k_norm.weight",
163
+ "blocks.8.norm2.weight",
164
+ "blocks.8.mlp.w12.bias",
165
+ "blocks.8.mlp.w3.bias",
166
+ "blocks.9.norm1.weight",
167
+ "blocks.9.attn_qkv.bias",
168
+ "blocks.9.attn_out.bias",
169
+ "blocks.9.q_norm.weight",
170
+ "blocks.9.k_norm.weight",
171
+ "blocks.9.norm2.weight",
172
+ "blocks.9.mlp.w12.bias",
173
+ "blocks.9.mlp.w3.bias",
174
+ "blocks.10.norm1.weight",
175
+ "blocks.10.attn_qkv.bias",
176
+ "blocks.10.attn_out.bias",
177
+ "blocks.10.q_norm.weight",
178
+ "blocks.10.k_norm.weight",
179
+ "blocks.10.norm2.weight",
180
+ "blocks.10.mlp.w12.bias",
181
+ "blocks.10.mlp.w3.bias",
182
+ "blocks.11.norm1.weight",
183
+ "blocks.11.attn_qkv.bias",
184
+ "blocks.11.attn_out.bias",
185
+ "blocks.11.q_norm.weight",
186
+ "blocks.11.k_norm.weight",
187
+ "blocks.11.norm2.weight",
188
+ "blocks.11.mlp.w12.bias",
189
+ "blocks.11.mlp.w3.bias",
190
+ "output_layer.norm_final.weight",
191
+ "output_layer.linear.weight"
192
+ ],
193
+ "muon_effective_nesterov": true,
194
+ "muon_effective_width_scale": true,
195
+ "muon_effective_weight_decay": 0.0,
196
+ "muon_adam_fallback_nesterov": true,
197
+ "muon_adam_fallback_weight_decay": 0.0,
198
+ "ema_decay": 0.9999,
199
+ "ema_start_step": 0,
200
+ "model_type": "ddit_elf",
201
+ "elf_num_time_tokens": 4,
202
+ "elf_num_model_mode_tokens": 0,
203
+ "qk_norm": true,
204
+ "output_bias": false,
205
+ "output_init_std": 0.0,
206
+ "norm_type": "rmsnorm",
207
+ "t_sampling_mode": "logit_normal",
208
+ "t_sampling_power": 1.0,
209
+ "t_sampling_eps": 0.0001,
210
+ "t_sampling_logit_mean": -1.5,
211
+ "t_sampling_logit_std": 0.8,
212
+ "dual_t": true,
213
+ "corrupt_t_mode": "same",
214
+ "corrupt_min_t": 0.0,
215
+ "corrupt_max_t": 1.0,
216
+ "prefix_block_prob": 0.0,
217
+ "prefix_block_len": 128,
218
+ "mask_ratio_floor_schedule": "none",
219
+ "dirichlet_endpoint_mode": "categorical_dual_t",
220
+ "dirichlet_semantic_t_mode": "same",
221
+ "dirichlet_semantic_t_value": 0.0,
222
+ "dirichlet_semantic_t_curve": "linear",
223
+ "dirichlet_semantic_t_power": 1.0,
224
+ "endpoint_sequence_random_prob_alpha": 0.0,
225
+ "categorical_wrong_from_full_vocab": true,
226
+ "categorical_wrong_from_batch_valid_tokens": false,
227
+ "categorical_wrong_basin_token_ids": "",
228
+ "categorical_wrong_basin_prob": 0.0,
229
+ "categorical_wrong_unigram_prob": 0.0,
230
+ "categorical_wrong_uniform_prob": 0.0,
231
+ "categorical_wrong_corpus_unigram_path": "",
232
+ "categorical_wrong_corpus_unigram_alpha": 1.0,
233
+ "categorical_wrong_basin_shared_prob": 0.0,
234
+ "categorical_wrong_unigram_shared_prob": 0.0,
235
+ "mask_mixture_original_prob": 0.0,
236
+ "mask_mixture_lowk_prob": 0.0,
237
+ "mask_mixture_lowcorrupt_prob": 0.0,
238
+ "mask_mixture_block_prob": 0.0,
239
+ "mask_mixture_all_prob": 0.0,
240
+ "mask_mixture_lowk_clean_tokens": "1,2,4,8,16,32,64",
241
+ "mask_mixture_lowcorrupt_tokens": "1,2,4,8,16,32,64",
242
+ "mask_mixture_block_tokens": "64,128",
243
+ "simplex_bridge_sampler": "dirichlet",
244
+ "logistic_normal_sigma_min": 0.18,
245
+ "logistic_normal_sigma_max": 2.2,
246
+ "logistic_normal_tau_min": 0.65,
247
+ "logistic_normal_tau_max": 1.15,
248
+ "torch_compile": false,
249
+ "compile_mode": "max-autotune",
250
+ "state_format": "prob",
251
+ "target_loss": "hard_ce",
252
+ "meanflow_weight": 0.0,
253
+ "rollout_train_prob": 0.0,
254
+ "rollout_train_steps": 1,
255
+ "rollout_train_infer_steps": 64,
256
+ "rollout_train_temp": 1.45,
257
+ "rollout_train_max_gamma": 1.0,
258
+ "rollout_train_corrupt_only": true,
259
+ "rollout_train_samplewise": false,
260
+ "rollout_train_compute_always": false,
261
+ "bridge_noise_init": "logistic_normal",
262
+ "noise_sigma": -1.0,
263
+ "allow_tf32": true,
264
+ "activation_checkpointing": true,
265
+ "activation_checkpoint_interval": 1,
266
+ "activation_checkpoint_scope": "mlp",
267
+ "ddp_static_graph": false,
268
+ "ddp_gradient_as_bucket_view": true,
269
+ "blocking_data_transfer": false,
270
+ "dataloader_prefetch_factor": 4,
271
+ "full_train_stats": false,
272
+ "tokenized_hf": false,
273
+ "tokenized_pad_token": "pad",
274
+ "elf_conditional_hf": true,
275
+ "record_pad_truncate": false,
276
+ "record_add_eos": false,
277
+ "record_add_special_tokens": false,
278
+ "record_pad_token": "pad",
279
+ "record_shuffle_buffer": 10000,
280
+ "wrap": false,
281
+ "wrap_mode": "stream",
282
+ "wrap_record_buffer_size": 200,
283
+ "owt_cached_chunks": false,
284
+ "owt_chunk_cache_dir": "",
285
+ "owt_chunk_cache_rebuild": false,
286
+ "owt_chunk_cache_write_batch": 4096,
287
+ "owt_exact_repeat_per_chunk": 0,
288
+ "online_chunk_shuffle": false,
289
+ "online_chunk_shuffle_buffer": 10000,
290
+ "openwebtext_split": "all",
291
+ "detokenizer": "auto",
292
+ "resolved_detokenizer": null,
293
+ "num_workers": 8,
294
+ "latest_every": 10,
295
+ "resume_path": ""
296
+ }
297
+ step=5 epoch=1/2 epoch_step=5/16 micro_steps=20 elapsed=5.3s lr=7.500000e-04 loss=9.8666 loss_recon=9.8666 loss_meanflow=0.0000 mean_model_t=0.2051 mean_corrupt_t=0.2051 mean_loss_t_weight=1.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.0185 corrupt_frac=1.5743 acc_corrupt=0.4846 loss_corrupt=9.8666 wrong_frac=0.7960 init_acc_corrupt=0.1119 acc_corrupt_t_0p0_0p2=0.4741 corrupt_frac_t_0p0_0p2=0.5812 acc_corrupt_t_0p2_0p4=0.5148 corrupt_frac_t_0p2_0p4=0.3298 out_w_norm=2.6140 out_g_norm=10.0464 acc_corrupt_t_0p4_0p6=0.4628 corrupt_frac_t_0p4_0p6=0.0931 acc_corrupt_t_0p6_0p8=0.1000 corrupt_frac_t_0p6_0p8=0.0356 loss_all=9.7009 init_gold_top10=0.1821 init_gold_top100=0.8430
298
+ step=10 epoch=1/2 epoch_step=10/16 micro_steps=40 elapsed=4.8s lr=1.000000e-03 loss=6.5588 loss_recon=6.5588 loss_meanflow=0.0000 mean_model_t=0.2179 mean_corrupt_t=0.2179 mean_loss_t_weight=1.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.0257 corrupt_frac=1.5627 acc_corrupt=0.8456 loss_corrupt=6.5588 wrong_frac=0.7817 init_acc_corrupt=0.1262 acc_corrupt_t_0p0_0p2=0.8507 corrupt_frac_t_0p0_0p2=0.5440 acc_corrupt_t_0p2_0p4=0.8393 corrupt_frac_t_0p2_0p4=0.3469 acc_corrupt_t_0p4_0p6=0.8397 corrupt_frac_t_0p4_0p6=0.0950 out_w_norm=16.2181 out_g_norm=13.4040 acc_corrupt_t_0p6_0p8=0.8291 corrupt_frac_t_0p6_0p8=0.0479 acc_corrupt_t_0p8_1p0=0.9043 corrupt_frac_t_0p8_1p0=0.0520 loss_all=8.2193 init_gold_top10=0.2056 init_gold_top100=0.7543
299
+ [demo] target: Wir warten nun ab und sehen den nächsten Schritten entgegen.
300
+ [rank0]: Traceback (most recent call last):
301
+ [rank0]: File "/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/train.py", line 2490, in <module>
302
+ [rank0]: main()
303
+ [rank0]: File "/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/train.py", line 2476, in main
304
+ [rank0]: run_demo(args, unwrap_model(trainable_model), tokenizer, last_batch, device)
305
+ [rank0]: File "/usr/local/lib/python3.12/dist-packages/torch/utils/_contextlib.py", line 116, in decorate_context
306
+ [rank0]: return func(*args, **kwargs)
307
+ [rank0]: ^^^^^^^^^^^^^^^^^^^^^
308
+ [rank0]: File "/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/train.py", line 1608, in run_demo
309
+ [rank0]: init, mask = fill_blank_init(
310
+ [rank0]: ^^^^^^^^^^^^^^^^
311
+ [rank0]: File "/usr/local/lib/python3.12/dist-packages/torch/utils/_contextlib.py", line 116, in decorate_context
312
+ [rank0]: return func(*args, **kwargs)
313
+ [rank0]: ^^^^^^^^^^^^^^^^^^^^^
314
+ [rank0]: File "/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/flowtext_lab/decode.py", line 193, in fill_blank_init
315
+ [rank0]: noise = sample_noise_simplex(
316
+ [rank0]: ^^^^^^^^^^^^^^^^^^^^^
317
+ [rank0]: File "/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/flowtext_lab/decode.py", line 32, in sample_noise_simplex
318
+ [rank0]: sigma = resolve_noise_sigma(vocab_size, target_prob, noise_sigma)
319
+ [rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
320
+ [rank0]: File "/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/flowtext_lab/decode.py", line 16, in resolve_noise_sigma
321
+ [rank0]: return float(margin_for_target_prob(vocab_size, target_prob))
322
+ [rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
323
+ [rank0]: File "/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/flowtext_lab/bridges.py", line 111, in margin_for_target_prob
324
+ [rank0]: raise ValueError(f"target_prob must be in (0, 1), got {q}")
325
+ [rank0]: ValueError: target_prob must be in (0, 1), got 1.0
326
+ [rank0]:[W516 01:49:18.259818969 ProcessGroupNCCL.cpp:1487] Warning: WARNING: destroy_process_group() was not called before program exit, which can leak resources. For more info, please see https://pytorch.org/docs/stable/distributed.html#shutdown (function operator())
LTA_openwebtext_dualt/logs/wmt14_deen_elfofficial_4gpu_debug/debug_wmt14_deen_elfofficial_t5_len256_in128_dditelf_muon_logitnormal_m1p5_s0p8_none_floor0p0_zeroout_4gpu_20260516_015703.log ADDED
@@ -0,0 +1,326 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ NCCL version 2.25.1+cuda12.8
2
+ {
3
+ "device": "cuda:0",
4
+ "rank": 0,
5
+ "world_size": 4,
6
+ "samples": "elf_conditional_hf:8192:max_input_len=128:pad=0:loss_on_pad=0",
7
+ "vocab_size": 32100,
8
+ "tokenizer_vocab_size": 32100,
9
+ "save_dir": "runs/debug_wmt14_deen_elfofficial_t5_len256_in128_dditelf_muon_logitnormal_m1p5_s0p8_none_floor0p0_zeroout_4gpu_20260516_015703",
10
+ "batch_size": 32,
11
+ "grad_accum": 4,
12
+ "effective_batch_size": 512,
13
+ "global_batch_size": 512,
14
+ "lr_schedule": "constant_warmup",
15
+ "optimizer": "muon",
16
+ "epochs": 0.0,
17
+ "steps_per_epoch": 16,
18
+ "total_steps": 20,
19
+ "warmup_steps": 8,
20
+ "warmup_epochs": 0.5,
21
+ "min_lr": 0.0,
22
+ "weight_decay": 0.0,
23
+ "output_weight_decay": -1.0,
24
+ "adamw_param_groups": "nanogpt",
25
+ "adam_beta1": 0.9,
26
+ "adam_beta2": 0.95,
27
+ "adam_eps": 1e-08,
28
+ "muon_impl": "optax",
29
+ "muon_momentum": 0.95,
30
+ "muon_ns_steps": 5,
31
+ "muon_update_scale": 1.0,
32
+ "muon_nesterov": true,
33
+ "muon_width_scale": true,
34
+ "muon_grouping": "hidden_2d",
35
+ "muon_param_count": 84934656,
36
+ "muon_adam_param_count": 50212608,
37
+ "muon_param_names": [
38
+ "blocks.0.attn_qkv.weight",
39
+ "blocks.0.attn_out.weight",
40
+ "blocks.0.mlp.w12.weight",
41
+ "blocks.0.mlp.w3.weight",
42
+ "blocks.1.attn_qkv.weight",
43
+ "blocks.1.attn_out.weight",
44
+ "blocks.1.mlp.w12.weight",
45
+ "blocks.1.mlp.w3.weight",
46
+ "blocks.2.attn_qkv.weight",
47
+ "blocks.2.attn_out.weight",
48
+ "blocks.2.mlp.w12.weight",
49
+ "blocks.2.mlp.w3.weight",
50
+ "blocks.3.attn_qkv.weight",
51
+ "blocks.3.attn_out.weight",
52
+ "blocks.3.mlp.w12.weight",
53
+ "blocks.3.mlp.w3.weight",
54
+ "blocks.4.attn_qkv.weight",
55
+ "blocks.4.attn_out.weight",
56
+ "blocks.4.mlp.w12.weight",
57
+ "blocks.4.mlp.w3.weight",
58
+ "blocks.5.attn_qkv.weight",
59
+ "blocks.5.attn_out.weight",
60
+ "blocks.5.mlp.w12.weight",
61
+ "blocks.5.mlp.w3.weight",
62
+ "blocks.6.attn_qkv.weight",
63
+ "blocks.6.attn_out.weight",
64
+ "blocks.6.mlp.w12.weight",
65
+ "blocks.6.mlp.w3.weight",
66
+ "blocks.7.attn_qkv.weight",
67
+ "blocks.7.attn_out.weight",
68
+ "blocks.7.mlp.w12.weight",
69
+ "blocks.7.mlp.w3.weight",
70
+ "blocks.8.attn_qkv.weight",
71
+ "blocks.8.attn_out.weight",
72
+ "blocks.8.mlp.w12.weight",
73
+ "blocks.8.mlp.w3.weight",
74
+ "blocks.9.attn_qkv.weight",
75
+ "blocks.9.attn_out.weight",
76
+ "blocks.9.mlp.w12.weight",
77
+ "blocks.9.mlp.w3.weight",
78
+ "blocks.10.attn_qkv.weight",
79
+ "blocks.10.attn_out.weight",
80
+ "blocks.10.mlp.w12.weight",
81
+ "blocks.10.mlp.w3.weight",
82
+ "blocks.11.attn_qkv.weight",
83
+ "blocks.11.attn_out.weight",
84
+ "blocks.11.mlp.w12.weight",
85
+ "blocks.11.mlp.w3.weight"
86
+ ],
87
+ "muon_adam_param_names": [
88
+ "time_tokens",
89
+ "vocab_embed.embedding",
90
+ "sigma_map.net.0.weight",
91
+ "sigma_map.net.0.bias",
92
+ "sigma_map.net.2.weight",
93
+ "sigma_map.net.2.bias",
94
+ "blocks.0.norm1.weight",
95
+ "blocks.0.attn_qkv.bias",
96
+ "blocks.0.attn_out.bias",
97
+ "blocks.0.q_norm.weight",
98
+ "blocks.0.k_norm.weight",
99
+ "blocks.0.norm2.weight",
100
+ "blocks.0.mlp.w12.bias",
101
+ "blocks.0.mlp.w3.bias",
102
+ "blocks.1.norm1.weight",
103
+ "blocks.1.attn_qkv.bias",
104
+ "blocks.1.attn_out.bias",
105
+ "blocks.1.q_norm.weight",
106
+ "blocks.1.k_norm.weight",
107
+ "blocks.1.norm2.weight",
108
+ "blocks.1.mlp.w12.bias",
109
+ "blocks.1.mlp.w3.bias",
110
+ "blocks.2.norm1.weight",
111
+ "blocks.2.attn_qkv.bias",
112
+ "blocks.2.attn_out.bias",
113
+ "blocks.2.q_norm.weight",
114
+ "blocks.2.k_norm.weight",
115
+ "blocks.2.norm2.weight",
116
+ "blocks.2.mlp.w12.bias",
117
+ "blocks.2.mlp.w3.bias",
118
+ "blocks.3.norm1.weight",
119
+ "blocks.3.attn_qkv.bias",
120
+ "blocks.3.attn_out.bias",
121
+ "blocks.3.q_norm.weight",
122
+ "blocks.3.k_norm.weight",
123
+ "blocks.3.norm2.weight",
124
+ "blocks.3.mlp.w12.bias",
125
+ "blocks.3.mlp.w3.bias",
126
+ "blocks.4.norm1.weight",
127
+ "blocks.4.attn_qkv.bias",
128
+ "blocks.4.attn_out.bias",
129
+ "blocks.4.q_norm.weight",
130
+ "blocks.4.k_norm.weight",
131
+ "blocks.4.norm2.weight",
132
+ "blocks.4.mlp.w12.bias",
133
+ "blocks.4.mlp.w3.bias",
134
+ "blocks.5.norm1.weight",
135
+ "blocks.5.attn_qkv.bias",
136
+ "blocks.5.attn_out.bias",
137
+ "blocks.5.q_norm.weight",
138
+ "blocks.5.k_norm.weight",
139
+ "blocks.5.norm2.weight",
140
+ "blocks.5.mlp.w12.bias",
141
+ "blocks.5.mlp.w3.bias",
142
+ "blocks.6.norm1.weight",
143
+ "blocks.6.attn_qkv.bias",
144
+ "blocks.6.attn_out.bias",
145
+ "blocks.6.q_norm.weight",
146
+ "blocks.6.k_norm.weight",
147
+ "blocks.6.norm2.weight",
148
+ "blocks.6.mlp.w12.bias",
149
+ "blocks.6.mlp.w3.bias",
150
+ "blocks.7.norm1.weight",
151
+ "blocks.7.attn_qkv.bias",
152
+ "blocks.7.attn_out.bias",
153
+ "blocks.7.q_norm.weight",
154
+ "blocks.7.k_norm.weight",
155
+ "blocks.7.norm2.weight",
156
+ "blocks.7.mlp.w12.bias",
157
+ "blocks.7.mlp.w3.bias",
158
+ "blocks.8.norm1.weight",
159
+ "blocks.8.attn_qkv.bias",
160
+ "blocks.8.attn_out.bias",
161
+ "blocks.8.q_norm.weight",
162
+ "blocks.8.k_norm.weight",
163
+ "blocks.8.norm2.weight",
164
+ "blocks.8.mlp.w12.bias",
165
+ "blocks.8.mlp.w3.bias",
166
+ "blocks.9.norm1.weight",
167
+ "blocks.9.attn_qkv.bias",
168
+ "blocks.9.attn_out.bias",
169
+ "blocks.9.q_norm.weight",
170
+ "blocks.9.k_norm.weight",
171
+ "blocks.9.norm2.weight",
172
+ "blocks.9.mlp.w12.bias",
173
+ "blocks.9.mlp.w3.bias",
174
+ "blocks.10.norm1.weight",
175
+ "blocks.10.attn_qkv.bias",
176
+ "blocks.10.attn_out.bias",
177
+ "blocks.10.q_norm.weight",
178
+ "blocks.10.k_norm.weight",
179
+ "blocks.10.norm2.weight",
180
+ "blocks.10.mlp.w12.bias",
181
+ "blocks.10.mlp.w3.bias",
182
+ "blocks.11.norm1.weight",
183
+ "blocks.11.attn_qkv.bias",
184
+ "blocks.11.attn_out.bias",
185
+ "blocks.11.q_norm.weight",
186
+ "blocks.11.k_norm.weight",
187
+ "blocks.11.norm2.weight",
188
+ "blocks.11.mlp.w12.bias",
189
+ "blocks.11.mlp.w3.bias",
190
+ "output_layer.norm_final.weight",
191
+ "output_layer.linear.weight"
192
+ ],
193
+ "muon_effective_nesterov": true,
194
+ "muon_effective_width_scale": true,
195
+ "muon_effective_weight_decay": 0.0,
196
+ "muon_adam_fallback_nesterov": true,
197
+ "muon_adam_fallback_weight_decay": 0.0,
198
+ "ema_decay": 0.9999,
199
+ "ema_start_step": 0,
200
+ "model_type": "ddit_elf",
201
+ "elf_num_time_tokens": 4,
202
+ "elf_num_model_mode_tokens": 0,
203
+ "qk_norm": true,
204
+ "output_bias": false,
205
+ "output_init_std": 0.0,
206
+ "norm_type": "rmsnorm",
207
+ "t_sampling_mode": "logit_normal",
208
+ "t_sampling_power": 1.0,
209
+ "t_sampling_eps": 0.0001,
210
+ "t_sampling_logit_mean": -1.5,
211
+ "t_sampling_logit_std": 0.8,
212
+ "dual_t": true,
213
+ "corrupt_t_mode": "same",
214
+ "corrupt_min_t": 0.0,
215
+ "corrupt_max_t": 1.0,
216
+ "prefix_block_prob": 0.0,
217
+ "prefix_block_len": 128,
218
+ "mask_ratio_floor_schedule": "none",
219
+ "dirichlet_endpoint_mode": "categorical_dual_t",
220
+ "dirichlet_semantic_t_mode": "same",
221
+ "dirichlet_semantic_t_value": 0.0,
222
+ "dirichlet_semantic_t_curve": "linear",
223
+ "dirichlet_semantic_t_power": 1.0,
224
+ "endpoint_sequence_random_prob_alpha": 0.0,
225
+ "categorical_wrong_from_full_vocab": true,
226
+ "categorical_wrong_from_batch_valid_tokens": false,
227
+ "categorical_wrong_basin_token_ids": "",
228
+ "categorical_wrong_basin_prob": 0.0,
229
+ "categorical_wrong_unigram_prob": 0.0,
230
+ "categorical_wrong_uniform_prob": 0.0,
231
+ "categorical_wrong_corpus_unigram_path": "",
232
+ "categorical_wrong_corpus_unigram_alpha": 1.0,
233
+ "categorical_wrong_basin_shared_prob": 0.0,
234
+ "categorical_wrong_unigram_shared_prob": 0.0,
235
+ "mask_mixture_original_prob": 0.0,
236
+ "mask_mixture_lowk_prob": 0.0,
237
+ "mask_mixture_lowcorrupt_prob": 0.0,
238
+ "mask_mixture_block_prob": 0.0,
239
+ "mask_mixture_all_prob": 0.0,
240
+ "mask_mixture_lowk_clean_tokens": "1,2,4,8,16,32,64",
241
+ "mask_mixture_lowcorrupt_tokens": "1,2,4,8,16,32,64",
242
+ "mask_mixture_block_tokens": "64,128",
243
+ "simplex_bridge_sampler": "dirichlet",
244
+ "logistic_normal_sigma_min": 0.18,
245
+ "logistic_normal_sigma_max": 2.2,
246
+ "logistic_normal_tau_min": 0.65,
247
+ "logistic_normal_tau_max": 1.15,
248
+ "torch_compile": false,
249
+ "compile_mode": "max-autotune",
250
+ "state_format": "prob",
251
+ "target_loss": "hard_ce",
252
+ "meanflow_weight": 0.0,
253
+ "rollout_train_prob": 0.0,
254
+ "rollout_train_steps": 1,
255
+ "rollout_train_infer_steps": 64,
256
+ "rollout_train_temp": 1.45,
257
+ "rollout_train_max_gamma": 1.0,
258
+ "rollout_train_corrupt_only": true,
259
+ "rollout_train_samplewise": false,
260
+ "rollout_train_compute_always": false,
261
+ "bridge_noise_init": "logistic_normal",
262
+ "noise_sigma": -1.0,
263
+ "allow_tf32": true,
264
+ "activation_checkpointing": true,
265
+ "activation_checkpoint_interval": 1,
266
+ "activation_checkpoint_scope": "mlp",
267
+ "ddp_static_graph": false,
268
+ "ddp_gradient_as_bucket_view": true,
269
+ "blocking_data_transfer": false,
270
+ "dataloader_prefetch_factor": 4,
271
+ "full_train_stats": false,
272
+ "tokenized_hf": false,
273
+ "tokenized_pad_token": "pad",
274
+ "elf_conditional_hf": true,
275
+ "record_pad_truncate": false,
276
+ "record_add_eos": false,
277
+ "record_add_special_tokens": false,
278
+ "record_pad_token": "pad",
279
+ "record_shuffle_buffer": 10000,
280
+ "wrap": false,
281
+ "wrap_mode": "stream",
282
+ "wrap_record_buffer_size": 200,
283
+ "owt_cached_chunks": false,
284
+ "owt_chunk_cache_dir": "",
285
+ "owt_chunk_cache_rebuild": false,
286
+ "owt_chunk_cache_write_batch": 4096,
287
+ "owt_exact_repeat_per_chunk": 0,
288
+ "online_chunk_shuffle": false,
289
+ "online_chunk_shuffle_buffer": 10000,
290
+ "openwebtext_split": "all",
291
+ "detokenizer": "auto",
292
+ "resolved_detokenizer": null,
293
+ "num_workers": 8,
294
+ "latest_every": 10,
295
+ "resume_path": ""
296
+ }
297
+ step=5 epoch=1/2 epoch_step=5/16 micro_steps=20 elapsed=5.2s lr=7.500000e-04 loss=9.9922 loss_recon=9.9922 loss_meanflow=0.0000 mean_model_t=0.2073 mean_corrupt_t=0.2073 mean_loss_t_weight=1.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.0125 corrupt_frac=0.2568 acc_corrupt=0.0172 loss_corrupt=9.9922 wrong_frac=0.7924 init_acc_corrupt=0.1154 acc_corrupt_t_0p0_0p2=0.0162 corrupt_frac_t_0p0_0p2=0.5674 acc_corrupt_t_0p2_0p4=0.0191 corrupt_frac_t_0p2_0p4=0.3424 out_w_norm=2.5805 out_g_norm=1.3452 acc_corrupt_t_0p4_0p6=0.0137 corrupt_frac_t_0p4_0p6=0.0888 acc_corrupt_t_0p6_0p8=0.0313 corrupt_frac_t_0p6_0p8=0.0376 loss_all=9.7110 init_gold_top10=0.1817 init_gold_top100=0.5263
298
+ step=10 epoch=1/2 epoch_step=10/16 micro_steps=40 elapsed=4.3s lr=1.000000e-03 loss=7.8761 loss_recon=7.8761 loss_meanflow=0.0000 mean_model_t=0.2004 mean_corrupt_t=0.2004 mean_loss_t_weight=1.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.0298 corrupt_frac=0.2547 acc_corrupt=0.0496 loss_corrupt=7.8761 wrong_frac=0.8020 init_acc_corrupt=0.1007 acc_corrupt_t_0p0_0p2=0.0507 corrupt_frac_t_0p0_0p2=0.5641 acc_corrupt_t_0p2_0p4=0.0468 corrupt_frac_t_0p2_0p4=0.3677 acc_corrupt_t_0p4_0p6=0.0535 corrupt_frac_t_0p4_0p6=0.0669 out_w_norm=15.1139 out_g_norm=1.6920 acc_corrupt_t_0p6_0p8=0.0784 corrupt_frac_t_0p6_0p8=0.0454 loss_all=8.5945 init_gold_top10=0.1755 init_gold_top100=0.5223
299
+ [demo] target: Wir warten nun ab und sehen den nächsten Schritten entgegen.
300
+ [rank0]: Traceback (most recent call last):
301
+ [rank0]: File "/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/train.py", line 2490, in <module>
302
+ [rank0]: main()
303
+ [rank0]: File "/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/train.py", line 2476, in main
304
+ [rank0]: run_demo(args, unwrap_model(trainable_model), tokenizer, last_batch, device)
305
+ [rank0]: File "/usr/local/lib/python3.12/dist-packages/torch/utils/_contextlib.py", line 116, in decorate_context
306
+ [rank0]: return func(*args, **kwargs)
307
+ [rank0]: ^^^^^^^^^^^^^^^^^^^^^
308
+ [rank0]: File "/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/train.py", line 1608, in run_demo
309
+ [rank0]: init, mask = fill_blank_init(
310
+ [rank0]: ^^^^^^^^^^^^^^^^
311
+ [rank0]: File "/usr/local/lib/python3.12/dist-packages/torch/utils/_contextlib.py", line 116, in decorate_context
312
+ [rank0]: return func(*args, **kwargs)
313
+ [rank0]: ^^^^^^^^^^^^^^^^^^^^^
314
+ [rank0]: File "/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/flowtext_lab/decode.py", line 193, in fill_blank_init
315
+ [rank0]: noise = sample_noise_simplex(
316
+ [rank0]: ^^^^^^^^^^^^^^^^^^^^^
317
+ [rank0]: File "/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/flowtext_lab/decode.py", line 32, in sample_noise_simplex
318
+ [rank0]: sigma = resolve_noise_sigma(vocab_size, target_prob, noise_sigma)
319
+ [rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
320
+ [rank0]: File "/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/flowtext_lab/decode.py", line 16, in resolve_noise_sigma
321
+ [rank0]: return float(margin_for_target_prob(vocab_size, target_prob))
322
+ [rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
323
+ [rank0]: File "/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/flowtext_lab/bridges.py", line 111, in margin_for_target_prob
324
+ [rank0]: raise ValueError(f"target_prob must be in (0, 1), got {q}")
325
+ [rank0]: ValueError: target_prob must be in (0, 1), got 1.0
326
+ [rank0]:[W516 01:58:18.216258226 ProcessGroupNCCL.cpp:1487] Warning: WARNING: destroy_process_group() was not called before program exit, which can leak resources. For more info, please see https://pytorch.org/docs/stable/distributed.html#shutdown (function operator())
LTA_openwebtext_dualt/mini_owt_fit/model.py ADDED
@@ -0,0 +1,121 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import math
4
+
5
+ import torch
6
+ import torch.nn as nn
7
+ import torch.nn.functional as F
8
+
9
+
10
def apply_rope(x: torch.Tensor, prefix_len: int) -> torch.Tensor:
    """Apply rotary position embedding to the non-prefix part of a q/k tensor.

    Args:
        x: Query or key tensor shaped [B, heads, length, head_dim].
        prefix_len: Number of leading positions along the length axis (the
            time prefix tokens) that are passed through without rotation.

    Returns:
        A tensor with the same shape and dtype as ``x``. The first
        ``prefix_len`` positions are copied unchanged; the remaining positions
        are rotated, with the first non-prefix token treated as position 0.
    """

    untouched = x[..., :prefix_len, :]
    tokens = x[..., prefix_len:, :]
    batch, n_heads, seq_len, head_dim = tokens.shape
    n_pairs = head_dim // 2

    # Angle table: position p and channel pair i rotate by p * 10000^(-i / n_pairs).
    positions = torch.arange(seq_len, device=x.device, dtype=torch.float32)
    inv_freq = torch.exp(-math.log(10000.0) * torch.arange(n_pairs, device=x.device) / n_pairs)
    theta = positions[:, None] * inv_freq[None, :]
    cos_t = theta.cos()[None, None]
    sin_t = theta.sin()[None, None]

    # Adjacent channels (2i, 2i + 1) form one rotation pair; the rotation is
    # done in float32 and cast back to the input dtype afterwards.
    even, odd = tokens.float().reshape(batch, n_heads, seq_len, n_pairs, 2).unbind(dim=-1)
    rotated = torch.stack((even * cos_t - odd * sin_t, even * sin_t + odd * cos_t), dim=-1)
    rotated = rotated.flatten(-2).to(dtype=x.dtype)
    return torch.cat([untouched, rotated], dim=-2)
33
+
34
+
35
class Block(nn.Module):
    """Pre-norm transformer block: bidirectional self-attention plus a GELU MLP.

    Args:
        dim: Width of the residual stream.
        heads: Number of attention heads; must be positive and divide ``dim``.
        mlp_dim: Hidden width of the feed-forward network.

    Raises:
        ValueError: If ``heads`` is not positive or does not divide ``dim``.
    """

    def __init__(self, dim: int, heads: int, mlp_dim: int) -> None:
        super().__init__()
        # Fail at construction time: with a truncating ``dim // heads`` the
        # mismatch would otherwise only surface as an opaque ``view`` shape
        # error on the first forward pass.
        if heads <= 0 or dim % heads != 0:
            raise ValueError(
                f"dim must be divisible by a positive number of heads, got dim={dim}, heads={heads}"
            )
        self.heads = heads
        self.head_dim = dim // heads
        self.norm1 = nn.LayerNorm(dim)
        self.qkv = nn.Linear(dim, 3 * dim, bias=False)
        self.proj = nn.Linear(dim, dim, bias=False)
        self.norm2 = nn.LayerNorm(dim)
        self.mlp = nn.Sequential(
            nn.Linear(dim, mlp_dim),
            nn.GELU(approximate="tanh"),
            nn.Linear(mlp_dim, dim),
        )

    def forward(self, x: torch.Tensor, prefix_len: int, use_rope: bool) -> torch.Tensor:
        """Run attention and MLP sub-layers with residual connections.

        Args:
            x: Hidden states shaped [batch, length, dim].
            prefix_len: Number of leading positions forwarded to ``apply_rope``
                as the unrotated prefix. Only used when ``use_rope`` is true.
            use_rope: Whether to rotate queries and keys with ``apply_rope``
                (which pairs adjacent channels, so ``head_dim`` must be even).

        Returns:
            Updated hidden states with the same shape as ``x``.
        """
        b, l, d = x.shape
        # One fused projection, then split into q/k/v of shape [b, l, heads, head_dim].
        qkv = self.qkv(self.norm1(x)).view(b, l, 3, self.heads, self.head_dim)
        q, k, v = qkv.unbind(dim=2)
        # Attention expects [b, heads, l, head_dim].
        q = q.transpose(1, 2)
        k = k.transpose(1, 2)
        v = v.transpose(1, 2)
        if use_rope:
            q = apply_rope(q, prefix_len)
            k = apply_rope(k, prefix_len)
        # No causal mask: every position attends to every other position.
        y = F.scaled_dot_product_attention(q, k, v, is_causal=False)
        y = y.transpose(1, 2).reshape(b, l, d)
        x = x + self.proj(y)
        x = x + self.mlp(self.norm2(x))
        return x
65
+
66
+
67
class TinyFlowLM(nn.Module):
    """Endpoint predictor for probability-cloud inputs.

    Input:
        probs: [batch, length, vocab]
        t: [batch] (a scalar or [batch, 1] tensor is also accepted)

    Conditioning:
        ``time_tokens`` learnable time tokens (four by default) are prepended.
        Each receives the same continuous time embedding. No AdaLN, no gates,
        no hidden-stream FiLM.

    Args:
        vocab_size: Size of the vocabulary axis of ``probs`` and of the logits.
        max_len: Size of the absolute position table (only used when
            ``abs_pos`` is true).
        dim: Width of the residual stream.
        layers: Number of transformer blocks.
        heads: Attention heads per block.
        mlp_dim: Hidden width of each block's MLP.
        time_tokens: Number of time prefix tokens prepended to the sequence.
        abs_pos: Add a learned absolute position embedding to the text tokens.
        rope: Apply RoPE to queries/keys of the text tokens inside each block.
    """

    def __init__(
        self,
        *,
        vocab_size: int,
        max_len: int,
        dim: int = 256,
        layers: int = 3,
        heads: int = 4,
        mlp_dim: int = 1024,
        time_tokens: int = 4,
        abs_pos: bool = True,
        rope: bool = True,
    ) -> None:
        super().__init__()
        self.vocab_size = vocab_size
        self.max_len = max_len
        self.time_tokens = time_tokens
        self.rope = rope

        # A plain parameter (not nn.Embedding) because the input is a
        # distribution over the vocabulary, embedded by a matrix product.
        self.token_embed = nn.Parameter(torch.randn(vocab_size, dim) * 0.02)
        self.pos = nn.Embedding(max_len, dim) if abs_pos else None
        self.time_base = nn.Parameter(torch.randn(1, time_tokens, dim) * 0.02)
        self.time_mlp = nn.Sequential(nn.Linear(1, dim), nn.SiLU(), nn.Linear(dim, dim))
        self.blocks = nn.ModuleList(Block(dim, heads, mlp_dim) for _ in range(layers))
        self.norm = nn.LayerNorm(dim)
        self.head = nn.Linear(dim, vocab_size, bias=False)

    def forward(self, probs: torch.Tensor, t: torch.Tensor) -> torch.Tensor:
        """Predict endpoint logits for every text position.

        Args:
            probs: Per-position vocabulary weights shaped [batch, length, vocab].
            t: Time value(s); one per batch element, or a single value that is
                broadcast across the batch.

        Returns:
            Logits shaped [batch, length, vocab_size]; the time prefix
            positions are stripped before the output head.

        Raises:
            IndexError: If absolute positions are enabled and ``length``
                exceeds ``max_len``.
        """
        b, l, _ = probs.shape
        if self.pos is not None and l > self.max_len:
            # Checked up front so the failure is a clear Python error instead
            # of an out-of-range embedding lookup (an asynchronous device-side
            # assert on CUDA).
            raise IndexError(
                f"sequence length {l} exceeds max_len {self.max_len} of the absolute position table"
            )

        x = probs @ self.token_embed
        if self.pos is not None:
            pos = torch.arange(l, device=x.device)
            x = x + self.pos(pos)[None]

        # reshape + expand accepts [batch], [batch, 1] and scalar (shared) times,
        # and does not require ``t`` to be contiguous.
        t_col = t.float().reshape(-1, 1).expand(b, 1)
        t_embed = self.time_mlp(t_col)
        prefix = self.time_base.expand(b, -1, -1) + t_embed[:, None]
        x = torch.cat([prefix, x], dim=1)

        for block in self.blocks:
            x = block(x, prefix_len=self.time_tokens, use_rope=self.rope)

        x = x[:, self.time_tokens:]
        return self.head(self.norm(x))
LTA_openwebtext_dualt/mini_owt_fit/run_standard_owt_t5_8gpu.sh ADDED
@@ -0,0 +1,60 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/usr/bin/env bash
# Launch train.py under torchrun for the mini_owt_fit experiment.
# Values written as ${NAME:-default} can be overridden from the environment;
# combined stdout/stderr is appended to LOG_FILE.
set -euo pipefail

cd /e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/mini_owt_fit

# Process environment for the workers.
export CUDA_VISIBLE_DEVICES="${CUDA_VISIBLE_DEVICES:-0,1,2,3,4,5,6,7}"
export TOKENIZERS_PARALLELISM=false
export PYTHONUNBUFFERED=1

# Rendezvous settings: taken from the MLP_* variables when they are set,
# otherwise single-node defaults on localhost.
NNODES="${MLP_WORKER_NUM:-1}"
NODE_RANK="${MLP_ROLE_INDEX:-0}"
MASTER_ADDR="${MLP_WORKER_0_HOST:-127.0.0.1}"
MASTER_PORT="${MLP_WORKER_0_PORT:-29500}"

# Input data, tokenizer and token cache (CACHE_PATH is relative to the cd above).
DATA_PATH="${DATA_PATH:-/e2e-data/evad-tech-vla/wanghan58/data/small_benchmarks/langflow_2604_11748/openwebtext}"
TOKENIZER_PATH="${TOKENIZER_PATH:-/e2e-data/evad-tech-vla/wanghan58/models/hf/t5-small/tokenizer.json}"
CACHE_PATH="${CACHE_PATH:-cache/owt_t5_payload1022_appendeos1.pt}"

# Run naming: a timestamp tag keeps output and log locations unique per launch.
DATE_TAG="${DATE_TAG:-$(date +%Y%m%d_%H%M%S)}"
RUN_NAME="${RUN_NAME:-mini_owt_fit_t5_bernoulliwrong_len1024_bos_eos_C1_to_1024_absrope_time4_d768_l12_h12_native_nofloor_full_gbs512_8gpu_${DATE_TAG}}"
OUT_DIR="${OUT_DIR:-runs/${RUN_NAME}}"
LOG_FILE="${LOG_FILE:-logs/${RUN_NAME}.log}"

mkdir -p "$(dirname "${LOG_FILE}")" "${OUT_DIR}"

# Arguments consumed by torchrun itself.
launcher_args=(
    --nnodes="${NNODES}"
    --node_rank="${NODE_RANK}"
    --master_addr="${MASTER_ADDR}"
    --master_port="${MASTER_PORT}"
    --nproc_per_node="${NPROC_PER_NODE:-8}"
)

# Arguments forwarded to train.py.
train_args=(
    --data_path "${DATA_PATH}"
    --tokenizer_path "${TOKENIZER_PATH}"
    --out_dir "${OUT_DIR}"
    --subset_size "${SUBSET_SIZE:-0}"
    --payload_len "${PAYLOAD_LEN:-1022}"
    --append_eos "${APPEND_EOS:-1}"
    --log_skips "${LOG_SKIPS:-20}"
    --cache_path "${CACHE_PATH}"
    --rebuild_cache "${REBUILD_CACHE:-0}"
    --online_data "${ONLINE_DATA:-0}"
    --online_buffer_size "${ONLINE_BUFFER_SIZE:-8192}"
    --steps "${STEPS:-1000000}"
    --batch_size "${BATCH_SIZE:-32}"
    --grad_accum "${GRAD_ACCUM:-2}"
    --lr "${LR:-3e-4}"
    --log_every "${LOG_EVERY:-50}"
    --save_every "${SAVE_EVERY:-1000}"
    --dim "${DIM:-768}"
    --layers "${LAYERS:-12}"
    --heads "${HEADS:-12}"
    --mlp_dim "${MLP_DIM:-3072}"
    --time_tokens "${TIME_TOKENS:-4}"
    --abs_pos "${ABS_POS:-1}"
    --rope "${ROPE:-1}"
    --c_min "${CMIN:-1}"
    --c_max "${CMAX:-1024}"
    --seed "${SEED:-1234}"
)

torchrun "${launcher_args[@]}" train.py "${train_args[@]}" 2>&1 | tee -a "${LOG_FILE}"
LTA_openwebtext_dualt/mini_owt_logdirichlet/.venv_qwen35_uv/lib/python3.12/site-packages/hf_xet-1.5.0.dist-info/licenses/LICENSE ADDED
@@ -0,0 +1,201 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Apache License
2
+ Version 2.0, January 2004
3
+ http://www.apache.org/licenses/
4
+
5
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6
+
7
+ 1. Definitions.
8
+
9
+ "License" shall mean the terms and conditions for use, reproduction,
10
+ and distribution as defined by Sections 1 through 9 of this document.
11
+
12
+ "Licensor" shall mean the copyright owner or entity authorized by
13
+ the copyright owner that is granting the License.
14
+
15
+ "Legal Entity" shall mean the union of the acting entity and all
16
+ other entities that control, are controlled by, or are under common
17
+ control with that entity. For the purposes of this definition,
18
+ "control" means (i) the power, direct or indirect, to cause the
19
+ direction or management of such entity, whether by contract or
20
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
21
+ outstanding shares, or (iii) beneficial ownership of such entity.
22
+
23
+ "You" (or "Your") shall mean an individual or Legal Entity
24
+ exercising permissions granted by this License.
25
+
26
+ "Source" form shall mean the preferred form for making modifications,
27
+ including but not limited to software source code, documentation
28
+ source, and configuration files.
29
+
30
+ "Object" form shall mean any form resulting from mechanical
31
+ transformation or translation of a Source form, including but
32
+ not limited to compiled object code, generated documentation,
33
+ and conversions to other media types.
34
+
35
+ "Work" shall mean the work of authorship, whether in Source or
36
+ Object form, made available under the License, as indicated by a
37
+ copyright notice that is included in or attached to the work
38
+ (an example is provided in the Appendix below).
39
+
40
+ "Derivative Works" shall mean any work, whether in Source or Object
41
+ form, that is based on (or derived from) the Work and for which the
42
+ editorial revisions, annotations, elaborations, or other modifications
43
+ represent, as a whole, an original work of authorship. For the purposes
44
+ of this License, Derivative Works shall not include works that remain
45
+ separable from, or merely link (or bind by name) to the interfaces of,
46
+ the Work and Derivative Works thereof.
47
+
48
+ "Contribution" shall mean any work of authorship, including
49
+ the original version of the Work and any modifications or additions
50
+ to that Work or Derivative Works thereof, that is intentionally
51
+ submitted to Licensor for inclusion in the Work by the copyright owner
52
+ or by an individual or Legal Entity authorized to submit on behalf of
53
+ the copyright owner. For the purposes of this definition, "submitted"
54
+ means any form of electronic, verbal, or written communication sent
55
+ to the Licensor or its representatives, including but not limited to
56
+ communication on electronic mailing lists, source code control systems,
57
+ and issue tracking systems that are managed by, or on behalf of, the
58
+ Licensor for the purpose of discussing and improving the Work, but
59
+ excluding communication that is conspicuously marked or otherwise
60
+ designated in writing by the copyright owner as "Not a Contribution."
61
+
62
+ "Contributor" shall mean Licensor and any individual or Legal Entity
63
+ on behalf of whom a Contribution has been received by Licensor and
64
+ subsequently incorporated within the Work.
65
+
66
+ 2. Grant of Copyright License. Subject to the terms and conditions of
67
+ this License, each Contributor hereby grants to You a perpetual,
68
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69
+ copyright license to reproduce, prepare Derivative Works of,
70
+ publicly display, publicly perform, sublicense, and distribute the
71
+ Work and such Derivative Works in Source or Object form.
72
+
73
+ 3. Grant of Patent License. Subject to the terms and conditions of
74
+ this License, each Contributor hereby grants to You a perpetual,
75
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76
+ (except as stated in this section) patent license to make, have made,
77
+ use, offer to sell, sell, import, and otherwise transfer the Work,
78
+ where such license applies only to those patent claims licensable
79
+ by such Contributor that are necessarily infringed by their
80
+ Contribution(s) alone or by combination of their Contribution(s)
81
+ with the Work to which such Contribution(s) was submitted. If You
82
+ institute patent litigation against any entity (including a
83
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
84
+ or a Contribution incorporated within the Work constitutes direct
85
+ or contributory patent infringement, then any patent licenses
86
+ granted to You under this License for that Work shall terminate
87
+ as of the date such litigation is filed.
88
+
89
+ 4. Redistribution. You may reproduce and distribute copies of the
90
+ Work or Derivative Works thereof in any medium, with or without
91
+ modifications, and in Source or Object form, provided that You
92
+ meet the following conditions:
93
+
94
+ (a) You must give any other recipients of the Work or
95
+ Derivative Works a copy of this License; and
96
+
97
+ (b) You must cause any modified files to carry prominent notices
98
+ stating that You changed the files; and
99
+
100
+ (c) You must retain, in the Source form of any Derivative Works
101
+ that You distribute, all copyright, patent, trademark, and
102
+ attribution notices from the Source form of the Work,
103
+ excluding those notices that do not pertain to any part of
104
+ the Derivative Works; and
105
+
106
+ (d) If the Work includes a "NOTICE" text file as part of its
107
+ distribution, then any Derivative Works that You distribute must
108
+ include a readable copy of the attribution notices contained
109
+ within such NOTICE file, excluding those notices that do not
110
+ pertain to any part of the Derivative Works, in at least one
111
+ of the following places: within a NOTICE text file distributed
112
+ as part of the Derivative Works; within the Source form or
113
+ documentation, if provided along with the Derivative Works; or,
114
+ within a display generated by the Derivative Works, if and
115
+ wherever such third-party notices normally appear. The contents
116
+ of the NOTICE file are for informational purposes only and
117
+ do not modify the License. You may add Your own attribution
118
+ notices within Derivative Works that You distribute, alongside
119
+ or as an addendum to the NOTICE text from the Work, provided
120
+ that such additional attribution notices cannot be construed
121
+ as modifying the License.
122
+
123
+ You may add Your own copyright statement to Your modifications and
124
+ may provide additional or different license terms and conditions
125
+ for use, reproduction, or distribution of Your modifications, or
126
+ for any such Derivative Works as a whole, provided Your use,
127
+ reproduction, and distribution of the Work otherwise complies with
128
+ the conditions stated in this License.
129
+
130
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
131
+ any Contribution intentionally submitted for inclusion in the Work
132
+ by You to the Licensor shall be under the terms and conditions of
133
+ this License, without any additional terms or conditions.
134
+ Notwithstanding the above, nothing herein shall supersede or modify
135
+ the terms of any separate license agreement you may have executed
136
+ with Licensor regarding such Contributions.
137
+
138
+ 6. Trademarks. This License does not grant permission to use the trade
139
+ names, trademarks, service marks, or product names of the Licensor,
140
+ except as required for reasonable and customary use in describing the
141
+ origin of the Work and reproducing the content of the NOTICE file.
142
+
143
+ 7. Disclaimer of Warranty. Unless required by applicable law or
144
+ agreed to in writing, Licensor provides the Work (and each
145
+ Contributor provides its Contributions) on an "AS IS" BASIS,
146
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147
+ implied, including, without limitation, any warranties or conditions
148
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149
+ PARTICULAR PURPOSE. You are solely responsible for determining the
150
+ appropriateness of using or redistributing the Work and assume any
151
+ risks associated with Your exercise of permissions under this License.
152
+
153
+ 8. Limitation of Liability. In no event and under no legal theory,
154
+ whether in tort (including negligence), contract, or otherwise,
155
+ unless required by applicable law (such as deliberate and grossly
156
+ negligent acts) or agreed to in writing, shall any Contributor be
157
+ liable to You for damages, including any direct, indirect, special,
158
+ incidental, or consequential damages of any character arising as a
159
+ result of this License or out of the use or inability to use the
160
+ Work (including but not limited to damages for loss of goodwill,
161
+ work stoppage, computer failure or malfunction, or any and all
162
+ other commercial damages or losses), even if such Contributor
163
+ has been advised of the possibility of such damages.
164
+
165
+ 9. Accepting Warranty or Additional Liability. While redistributing
166
+ the Work or Derivative Works thereof, You may choose to offer,
167
+ and charge a fee for, acceptance of support, warranty, indemnity,
168
+ or other liability obligations and/or rights consistent with this
169
+ License. However, in accepting such obligations, You may act only
170
+ on Your own behalf and on Your sole responsibility, not on behalf
171
+ of any other Contributor, and only if You agree to indemnify,
172
+ defend, and hold each Contributor harmless for any liability
173
+ incurred by, or claims asserted against, such Contributor by reason
174
+ of your accepting any such warranty or additional liability.
175
+
176
+ END OF TERMS AND CONDITIONS
177
+
178
+ APPENDIX: How to apply the Apache License to your work.
179
+
180
+ To apply the Apache License to your work, attach the following
181
+ boilerplate notice, with the fields enclosed by brackets "[]"
182
+ replaced with your own identifying information. (Don't include
183
+ the brackets!) The text should be enclosed in the appropriate
184
+ comment syntax for the file format. We also recommend that a
185
+ file or class name and description of purpose be included on the
186
+ same "printed page" as the copyright notice for easier
187
+ identification within third-party archives.
188
+
189
+ Copyright [yyyy] [name of copyright owner]
190
+
191
+ Licensed under the Apache License, Version 2.0 (the "License");
192
+ you may not use this file except in compliance with the License.
193
+ You may obtain a copy of the License at
194
+
195
+ http://www.apache.org/licenses/LICENSE-2.0
196
+
197
+ Unless required by applicable law or agreed to in writing, software
198
+ distributed under the License is distributed on an "AS IS" BASIS,
199
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200
+ See the License for the specific language governing permissions and
201
+ limitations under the License.
LTA_openwebtext_dualt/mini_owt_logdirichlet/.venv_qwen35_uv/lib/python3.12/site-packages/hf_xet-1.5.0.dist-info/sboms/hf_xet.cyclonedx.json ADDED
The diff for this file is too large to render. See raw diff
 
LTA_openwebtext_dualt/mini_owt_logdirichlet/.venv_qwen35_uv/lib/python3.12/site-packages/pygments-2.20.0.dist-info/WHEEL ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ Wheel-Version: 1.0
2
+ Generator: hatchling 1.29.0
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any
LTA_openwebtext_dualt/mini_owt_logdirichlet/.venv_qwen35_uv/lib/python3.12/site-packages/typer/__init__.py ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Typer, build great CLIs. Easy to code. Based on Python type hints."""
2
+
3
+ __version__ = "0.25.1"
4
+
5
+ from shutil import get_terminal_size as get_terminal_size
6
+
7
+ from click.exceptions import Abort as Abort
8
+ from click.exceptions import BadParameter as BadParameter
9
+ from click.exceptions import Exit as Exit
10
+ from click.termui import clear as clear
11
+ from click.termui import confirm as confirm
12
+ from click.termui import echo_via_pager as echo_via_pager
13
+ from click.termui import edit as edit
14
+ from click.termui import getchar as getchar
15
+ from click.termui import pause as pause
16
+ from click.termui import progressbar as progressbar
17
+ from click.termui import prompt as prompt
18
+ from click.termui import secho as secho
19
+ from click.termui import style as style
20
+ from click.termui import unstyle as unstyle
21
+ from click.utils import echo as echo
22
+ from click.utils import format_filename as format_filename
23
+ from click.utils import get_app_dir as get_app_dir
24
+ from click.utils import get_binary_stream as get_binary_stream
25
+ from click.utils import get_text_stream as get_text_stream
26
+ from click.utils import open_file as open_file
27
+
28
+ from . import colors as colors
29
+ from .main import Typer as Typer
30
+ from .main import launch as launch
31
+ from .main import run as run
32
+ from .models import CallbackParam as CallbackParam
33
+ from .models import Context as Context
34
+ from .models import FileBinaryRead as FileBinaryRead
35
+ from .models import FileBinaryWrite as FileBinaryWrite
36
+ from .models import FileText as FileText
37
+ from .models import FileTextWrite as FileTextWrite
38
+ from .params import Argument as Argument
39
+ from .params import Option as Option
LTA_openwebtext_dualt/mini_owt_logdirichlet/.venv_qwen35_uv/lib/python3.12/site-packages/typer/__main__.py ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
# Package entry point: running the package as a module (``python -m typer``)
# executes this file, which simply delegates to the CLI's ``main``.
from .cli import main

main()
LTA_openwebtext_dualt/mini_owt_logdirichlet/.venv_qwen35_uv/lib/python3.12/site-packages/typer/_completion_classes.py ADDED
@@ -0,0 +1,199 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import importlib.util
2
+ import os
3
+ import re
4
+ import sys
5
+ from typing import Any
6
+
7
+ import click
8
+ import click.parser
9
+ import click.shell_completion
10
+ from click.shell_completion import split_arg_string as click_split_arg_string
11
+
12
+ from ._completion_shared import (
13
+ COMPLETION_SCRIPT_BASH,
14
+ COMPLETION_SCRIPT_FISH,
15
+ COMPLETION_SCRIPT_POWER_SHELL,
16
+ COMPLETION_SCRIPT_ZSH,
17
+ Shells,
18
+ )
19
+
20
+
21
def _sanitize_help_text(text: str) -> str:
    """Return *text* with Rich markup rendered away, if Rich is installed.

    When the ``rich`` package cannot be found the text is returned unchanged;
    otherwise it is passed through ``rich_utils.rich_render_text``.
    """
    if importlib.util.find_spec("rich"):
        # Imported lazily so that Rich stays an optional dependency.
        from . import rich_utils

        return rich_utils.rich_render_text(text)
    return text
28
+
29
+
30
class BashComplete(click.shell_completion.BashComplete):
    """Bash variant of Click's completion class using Typer's source template.

    Completion candidates are emitted as bare values, one per line.
    """

    name = Shells.bash.value
    source_template = COMPLETION_SCRIPT_BASH

    def source_vars(self) -> dict[str, Any]:
        """Return the placeholders substituted into ``source_template``."""
        return dict(
            complete_func=self.func_name,
            autocomplete_var=self.complete_var,
            prog_name=self.prog_name,
        )

    def get_completion_args(self) -> tuple[list[str], str]:
        """Split ``COMP_WORDS`` into completed args and the word being typed.

        Reads ``COMP_WORDS`` and ``COMP_CWORD`` from the environment (a
        ``KeyError`` is raised if either is missing). The program name at
        index 0 is dropped from the returned args.
        """
        words = click_split_arg_string(os.environ["COMP_WORDS"])
        cursor = int(os.environ["COMP_CWORD"])

        try:
            current = words[cursor]
        except IndexError:
            # Cursor sits past the last word: nothing typed yet.
            current = ""

        return words[1:cursor], current

    def format_completion(self, item: click.shell_completion.CompletionItem) -> str:
        """Render one completion item as its plain value."""
        # TODO: Explore replicating the new behavior from Click, with item
        # types and triggering completion for files and directories, i.e.
        # returning "<type>,<value>" instead of the bare value.
        return format(item.value)

    def complete(self) -> str:
        """Return all completions for the current command line, newline-joined."""
        args, incomplete = self.get_completion_args()
        return "\n".join(
            self.format_completion(item)
            for item in self.get_completions(args, incomplete)
        )
64
+
65
+
66
class ZshComplete(click.shell_completion.ZshComplete):
    """Zsh variant of Click's completion class using Typer's source template.

    ``complete`` produces either an ``_arguments`` spec listing the escaped
    candidates (with optional help text) or ``_files`` when there are none.
    """

    name = Shells.zsh.value
    source_template = COMPLETION_SCRIPT_ZSH

    def source_vars(self) -> dict[str, Any]:
        """Return the placeholders substituted into ``source_template``."""
        return dict(
            complete_func=self.func_name,
            autocomplete_var=self.complete_var,
            prog_name=self.prog_name,
        )

    def get_completion_args(self) -> tuple[list[str], str]:
        """Split ``_TYPER_COMPLETE_ARGS`` into completed args and the partial word.

        The first word (the program name) is dropped. If the raw command line
        ends with a space, the user has finished the last word and the
        incomplete word is empty.
        """
        raw_line = os.getenv("_TYPER_COMPLETE_ARGS", "")
        words = click_split_arg_string(raw_line)[1:]
        if not words or raw_line.endswith(" "):
            return words, ""
        return words[:-1], words[-1]

    def format_completion(self, item: click.shell_completion.CompletionItem) -> str:
        """Render one item as ``"value"`` or ``"value":"help"`` with escaping."""

        def escape(s: str) -> str:
            # Applied strictly in this order; each pair is (old, new).
            substitutions = (
                ('"', '""'),
                ("'", "''"),
                ("$", "\\$"),
                ("`", "\\`"),
                (":", r"\\:"),
            )
            for old, new in substitutions:
                s = s.replace(old, new)
            return s

        # TODO: Explore replicating the new behavior from Click, pay attention
        # to the difference with and without escape, i.e. returning
        # "<type>\n<value>\n<help or '_'>".
        value = escape(item.value)
        if not item.help:
            return f'"{value}"'
        help_text = _sanitize_help_text(escape(item.help))
        return f'"{value}":"{help_text}"'

    def complete(self) -> str:
        """Return the zsh command to evaluate for the current command line."""
        args, incomplete = self.get_completion_args()
        rendered = [
            self.format_completion(item)
            for item in self.get_completions(args, incomplete)
        ]
        if not rendered:
            # No candidates: fall back to file completion.
            return "_files"
        args_str = "\n".join(rendered)
        return f"_arguments '*: :(({args_str}))'"
115
+
116
+
117
class FishComplete(click.shell_completion.FishComplete):
    """Fish variant of Click's completion class using Typer's source template.

    The action to perform is selected by the ``_TYPER_COMPLETE_FISH_ACTION``
    environment variable (``get-args`` or ``is-args``).
    """

    name = Shells.fish.value
    source_template = COMPLETION_SCRIPT_FISH

    def source_vars(self) -> dict[str, Any]:
        """Return the placeholders substituted into ``source_template``."""
        return dict(
            complete_func=self.func_name,
            autocomplete_var=self.complete_var,
            prog_name=self.prog_name,
        )

    def get_completion_args(self) -> tuple[list[str], str]:
        """Split ``_TYPER_COMPLETE_ARGS`` into completed args and the partial word.

        The first word (the program name) is dropped. A trailing space on the
        raw command line means the incomplete word is empty.
        """
        raw_line = os.getenv("_TYPER_COMPLETE_ARGS", "")
        words = click_split_arg_string(raw_line)[1:]
        if not words or raw_line.endswith(" "):
            return words, ""
        return words[:-1], words[-1]

    def format_completion(self, item: click.shell_completion.CompletionItem) -> str:
        """Render one item as ``value`` or ``value<TAB>help``."""
        # TODO: Explore replicating the new behavior from Click, pay attention
        # to the difference with and without formatted help, i.e. returning
        # "<type>,<value>\t<help>" / "<type>,<value>".
        if not item.help:
            return f"{item.value}"
        # Replace every whitespace character with a plain space so the help
        # stays on a single line.
        single_line_help = re.sub(r"\s", " ", item.help)
        return f"{item.value}\t{_sanitize_help_text(single_line_help)}"

    def complete(self) -> str:
        """Run the requested fish completion action.

        ``get-args`` returns the newline-joined candidates. ``is-args`` never
        returns: it exits with status 0 when candidates exist and 1 otherwise.
        Any other action yields an empty string.
        """
        action = os.getenv("_TYPER_COMPLETE_FISH_ACTION", "")
        args, incomplete = self.get_completion_args()
        shown = [
            self.format_completion(item)
            for item in self.get_completions(args, incomplete)
        ]
        if action == "is-args":
            # 0 activates argument completion (no files);
            # 1 deactivates it (allow files).
            sys.exit(0 if shown else 1)
        if action == "get-args" and shown:
            return "\n".join(shown)
        return ""  # pragma: no cover
168
+
169
+
170
class PowerShellComplete(click.shell_completion.ShellComplete):
    """PowerShell completion class built on Click's generic ``ShellComplete``.

    Candidates are rendered as ``value:::help`` strings.
    """

    name = Shells.powershell.value
    source_template = COMPLETION_SCRIPT_POWER_SHELL

    def source_vars(self) -> dict[str, Any]:
        """Return the placeholders substituted into ``source_template``."""
        return dict(
            complete_func=self.func_name,
            autocomplete_var=self.complete_var,
            prog_name=self.prog_name,
        )

    def get_completion_args(self) -> tuple[list[str], str]:
        """Return the completed args and the word currently being completed.

        Both come from environment variables: ``_TYPER_COMPLETE_ARGS`` holds
        the command line and ``_TYPER_COMPLETE_WORD_TO_COMPLETE`` the partial
        word. The program name is always dropped; the last word is dropped
        too when a partial word was supplied.
        """
        raw_line = os.getenv("_TYPER_COMPLETE_ARGS", "")
        incomplete = os.getenv("_TYPER_COMPLETE_WORD_TO_COMPLETE", "")
        words = click_split_arg_string(raw_line)
        if incomplete:
            return words[1:-1], incomplete
        return words[1:], incomplete

    def format_completion(self, item: click.shell_completion.CompletionItem) -> str:
        """Render one item as ``value:::help`` (a single space when no help)."""
        help_text = _sanitize_help_text(item.help) if item.help else " "
        return f"{item.value}:::{help_text}"
190
+
191
+
192
def completion_init() -> None:
    """Register Typer's completion classes with Click, one per shell name.

    ``PowerShellComplete`` is registered twice, for both the ``powershell``
    and ``pwsh`` names.
    """
    registrations = (
        (BashComplete, Shells.bash),
        (ZshComplete, Shells.zsh),
        (FishComplete, Shells.fish),
        (PowerShellComplete, Shells.powershell),
        (PowerShellComplete, Shells.pwsh),
    )
    for completion_cls, shell in registrations:
        click.shell_completion.add_completion_class(completion_cls, shell.value)
LTA_openwebtext_dualt/mini_owt_logdirichlet/.venv_qwen35_uv/lib/python3.12/site-packages/typer/_completion_shared.py ADDED
@@ -0,0 +1,252 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import re
3
+ import subprocess
4
+ from enum import Enum
5
+ from pathlib import Path
6
+
7
+ import click
8
+ import shellingham
9
+
10
+
11
class Shells(str, Enum):
    """Shell names known to the completion code.

    Members are also ``str`` instances, so each one compares equal to its
    plain string value.
    """

    bash = "bash"
    zsh = "zsh"
    fish = "fish"
    # Two distinct names that both refer to PowerShell.
    powershell = "powershell"
    pwsh = "pwsh"
17
+
18
+
19
+ COMPLETION_SCRIPT_BASH = """
20
+ %(complete_func)s() {
21
+ local IFS=$'\n'
22
+ COMPREPLY=( $( env COMP_WORDS="${COMP_WORDS[*]}" \\
23
+ COMP_CWORD=$COMP_CWORD \\
24
+ %(autocomplete_var)s=complete_bash $1 ) )
25
+ return 0
26
+ }
27
+
28
+ complete -o default -F %(complete_func)s %(prog_name)s
29
+ """
30
+
31
+ COMPLETION_SCRIPT_ZSH = """
32
+ #compdef %(prog_name)s
33
+
34
+ %(complete_func)s() {
35
+ eval $(env _TYPER_COMPLETE_ARGS="${words[1,$CURRENT]}" %(autocomplete_var)s=complete_zsh %(prog_name)s)
36
+ }
37
+
38
+ compdef %(complete_func)s %(prog_name)s
39
+ """
40
+
41
+ COMPLETION_SCRIPT_FISH = 'complete --command %(prog_name)s --no-files --arguments "(env %(autocomplete_var)s=complete_fish _TYPER_COMPLETE_FISH_ACTION=get-args _TYPER_COMPLETE_ARGS=(commandline -cp) %(prog_name)s)" --condition "env %(autocomplete_var)s=complete_fish _TYPER_COMPLETE_FISH_ACTION=is-args _TYPER_COMPLETE_ARGS=(commandline -cp) %(prog_name)s"'
42
+
43
+ COMPLETION_SCRIPT_POWER_SHELL = """
44
+ Import-Module PSReadLine
45
+ Set-PSReadLineKeyHandler -Chord Tab -Function MenuComplete
46
+ $scriptblock = {
47
+ param($wordToComplete, $commandAst, $cursorPosition)
48
+ $Env:%(autocomplete_var)s = "complete_powershell"
49
+ $Env:_TYPER_COMPLETE_ARGS = $commandAst.ToString()
50
+ $Env:_TYPER_COMPLETE_WORD_TO_COMPLETE = $wordToComplete
51
+ %(prog_name)s | ForEach-Object {
52
+ $commandArray = $_ -Split ":::"
53
+ $command = $commandArray[0]
54
+ $helpString = $commandArray[1]
55
+ [System.Management.Automation.CompletionResult]::new(
56
+ $command, $command, 'ParameterValue', $helpString)
57
+ }
58
+ $Env:%(autocomplete_var)s = ""
59
+ $Env:_TYPER_COMPLETE_ARGS = ""
60
+ $Env:_TYPER_COMPLETE_WORD_TO_COMPLETE = ""
61
+ }
62
+ Register-ArgumentCompleter -Native -CommandName %(prog_name)s -ScriptBlock $scriptblock
63
+ """
64
+
65
+ _completion_scripts = {
66
+ "bash": COMPLETION_SCRIPT_BASH,
67
+ "zsh": COMPLETION_SCRIPT_ZSH,
68
+ "fish": COMPLETION_SCRIPT_FISH,
69
+ "powershell": COMPLETION_SCRIPT_POWER_SHELL,
70
+ "pwsh": COMPLETION_SCRIPT_POWER_SHELL,
71
+ }
72
+
73
# TODO: Probably refactor this, copied from Click 7.x
# Matches any character that is not a letter, digit or underscore.
_invalid_ident_char_re = re.compile(r"[^a-zA-Z0-9_]")


def get_completion_script(*, prog_name: str, complete_var: str, shell: str) -> str:
    """Return the completion script for *shell*, filled in for *prog_name*.

    Args:
        prog_name: Name of the program being completed.
        complete_var: Name of the environment variable that triggers completion.
        shell: Key into the table of script templates.

    Raises:
        click.exceptions.Exit: With code 1 (after printing a message to
            stderr) when no template exists for *shell*.
    """
    template = _completion_scripts.get(shell)
    if template is None:
        click.echo(f"Shell {shell} not supported.", err=True)
        raise click.exceptions.Exit(1)

    # Dashes become underscores, then anything still invalid is removed, so
    # the result can be used inside a shell function name.
    ident = _invalid_ident_char_re.sub("", prog_name.replace("-", "_"))
    substitutions = {
        "complete_func": f"_{ident}_completion",
        "prog_name": prog_name,
        "autocomplete_var": complete_var,
    }
    return (template % substitutions).strip()
91
+
92
+
93
def install_bash(*, prog_name: str, complete_var: str, shell: str) -> Path:
    """Install Bash completion for *prog_name* and return the script path.

    Writes the completion script to ``~/.bash_completions/<prog_name>.sh``
    and makes sure ``~/.bashrc`` contains a ``source`` line for it.
    """
    # Ref: https://github.com/scop/bash-completion#faq
    # It seems bash-completion is the official completion system for bash:
    # Ref: https://www.gnu.org/software/bash/manual/html_node/A-Programmable-Completion-Example.html
    # But installing in the locations from the docs doesn't seem to have effect
    home = Path.home()
    completion_path = home / ".bash_completions" / f"{prog_name}.sh"
    rc_path = home / ".bashrc"
    rc_path.parent.mkdir(parents=True, exist_ok=True)

    rc_content = rc_path.read_text() if rc_path.is_file() else ""
    source_line = f"source '{completion_path}'"
    if source_line not in rc_content:  # pragma: no cover
        rc_content += f"\n{source_line}"
    # A newline is appended on every call, whether or not the line was added.
    rc_content += "\n"
    rc_path.write_text(rc_content)

    # Install completion
    completion_path.parent.mkdir(parents=True, exist_ok=True)
    completion_path.write_text(
        get_completion_script(
            prog_name=prog_name, complete_var=complete_var, shell=shell
        )
    )
    return completion_path
117
+
118
+
119
def install_zsh(*, prog_name: str, complete_var: str, shell: str) -> Path:
    """Install Zsh completion for *prog_name* and return the script path.

    Ensures ``~/.zshrc`` loads ``~/.zfunc`` and has a ``zstyle`` setting,
    then writes the completion script to ``~/.zfunc/_<prog_name>``.
    """
    # Setup Zsh and load ~/.zfunc
    zshrc_path = Path.home() / ".zshrc"
    zshrc_path.parent.mkdir(parents=True, exist_ok=True)
    zshrc_content = zshrc_path.read_text() if zshrc_path.is_file() else ""

    completion_line = "fpath+=~/.zfunc; autoload -Uz compinit; compinit"
    style_line = "zstyle ':completion:*' menu select"
    # TODO: consider setting the style only for the current program
    # style_line = f"zstyle ':completion:*:*:{prog_name}:*' menu select"
    # Each entry is (marker to look for, line to append when it is absent).
    # The zstyle line is only added if the user has no zstyle customization.
    required = (
        (completion_line, completion_line),
        ("zstyle", style_line),
    )
    for marker, line in required:
        if marker not in zshrc_content:
            zshrc_content += f"\n{line}\n"
    zshrc_path.write_text(f"{zshrc_content.strip()}\n")

    # Install completion under ~/.zfunc/
    script_path = Path.home() / f".zfunc/_{prog_name}"
    script_path.parent.mkdir(parents=True, exist_ok=True)
    script_path.write_text(
        get_completion_script(
            prog_name=prog_name, complete_var=complete_var, shell=shell
        )
    )
    return script_path
145
+
146
+
147
def install_fish(*, prog_name: str, complete_var: str, shell: str) -> Path:
    """Write the Fish completion script for *prog_name* and return its path.

    The script goes to ``~/.config/fish/completions/<prog_name>.fish``;
    missing parent directories are created.
    """
    target = Path.home() / f".config/fish/completions/{prog_name}.fish"
    target.parent.mkdir(parents=True, exist_ok=True)
    script = get_completion_script(
        prog_name=prog_name, complete_var=complete_var, shell=shell
    )
    target.write_text(f"{script}\n")
    return target
156
+
157
+
158
def install_powershell(*, prog_name: str, complete_var: str, shell: str) -> Path:
    """Append the PowerShell completion script to the user's profile file.

    Runs *shell* twice: once to set the execution policy for the current
    user, and once to print ``$profile``. The completion script is then
    appended to that profile path, which is returned.

    Raises:
        subprocess.CalledProcessError: If querying ``$profile`` fails.
        click.exceptions.Exit: If the profile path cannot be decoded.
    """
    # The result of this call is not inspected.
    policy_command = [
        shell,
        "-Command",
        "Set-ExecutionPolicy",
        "Unrestricted",
        "-Scope",
        "CurrentUser",
    ]
    subprocess.run(policy_command)

    result = subprocess.run(
        [shell, "-NoProfile", "-Command", "echo", "$profile"],
        check=True,
        stdout=subprocess.PIPE,
    )
    if result.returncode != 0:  # pragma: no cover
        click.echo("Couldn't get PowerShell user profile", err=True)
        raise click.exceptions.Exit(result.returncode)

    raw_output = result.stdout
    path_str = ""
    if isinstance(raw_output, str):  # pragma: no cover
        path_str = raw_output
    if isinstance(raw_output, bytes):
        # Try the candidate encodings in order and keep the first that works.
        for encoding in ("windows-1252", "utf8", "cp850"):
            try:
                path_str = raw_output.decode(encoding)
            except UnicodeDecodeError:  # pragma: no cover
                continue
            break
    if not path_str:  # pragma: no cover
        click.echo("Couldn't decode the path automatically", err=True)
        raise click.exceptions.Exit(1)

    profile_path = Path(path_str.strip())
    profile_path.parent.mkdir(parents=True, exist_ok=True)
    script = get_completion_script(
        prog_name=prog_name, complete_var=complete_var, shell=shell
    )
    # Append so existing profile content is kept.
    with profile_path.open(mode="a") as profile_file:
        profile_file.write(f"{script}\n")
    return profile_path
199
+
200
+
201
def install(
    shell: str | None = None,
    prog_name: str | None = None,
    complete_var: str | None = None,
) -> tuple[str, Path]:
    """Install shell completion and return ``(shell, installed_path)``.

    Args:
        shell: Target shell. When ``None`` it is auto-detected, unless the
            ``_TYPER_COMPLETE_TEST_DISABLE_SHELL_DETECTION`` environment
            variable is set to a non-empty value.
        prog_name: Program name; defaults to the root Click context's
            ``info_name``.
        complete_var: Completion environment variable; defaults to
            ``_<PROG_NAME>_COMPLETE`` with dashes turned into underscores.

    Raises:
        click.exceptions.Exit: With code 1 when the shell is not supported.
    """
    prog_name = prog_name or click.get_current_context().find_root().info_name
    assert prog_name
    if complete_var is None:
        complete_var = f"_{prog_name.replace('-', '_').upper()}_COMPLETE"

    detection_disabled = os.getenv("_TYPER_COMPLETE_TEST_DISABLE_SHELL_DETECTION")
    if shell is None and not detection_disabled:
        shell = _get_shell_name()

    # Shell name -> installer; both PowerShell names share one installer.
    installers = {
        "bash": install_bash,
        "zsh": install_zsh,
        "fish": install_fish,
        "powershell": install_powershell,
        "pwsh": install_powershell,
    }
    installer = installers.get(shell)
    if installer is None:
        click.echo(f"Shell {shell} is not supported.")
        raise click.exceptions.Exit(1)

    installed_path = installer(
        prog_name=prog_name, complete_var=complete_var, shell=shell
    )
    return shell, installed_path
236
+
237
+
238
def _get_shell_name() -> str | None:
    """Detect the current shell and return its name.

    The name will always be lowercase. ``None`` is returned when shellingham
    reports a detection failure.
    """
    try:
        # N.B. shellingham is untyped; detect_shell returns a tuple of
        # (shell name, shell command).
        detected = shellingham.detect_shell()  # noqa: TID251
    except shellingham.ShellDetectionFailure:  # pragma: no cover
        return None

    # We only need the name.
    name: str | None
    name, _cmd = detected
    return name
LTA_openwebtext_dualt/mini_owt_logdirichlet/.venv_qwen35_uv/lib/python3.12/site-packages/typer/_types.py ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from enum import Enum
2
+ from typing import TypeVar
3
+
4
+ import click
5
+
6
+ ParamTypeValue = TypeVar("ParamTypeValue")
7
+
8
+
9
class TyperChoice(click.Choice[ParamTypeValue]):
    """``click.Choice`` variant that matches enum members by value, not name."""

    def normalize_choice(
        self, choice: ParamTypeValue, ctx: click.Context | None
    ) -> str:
        """Return the canonical string used to compare ``choice``.

        Click 8.2.0 introduced ``Choice.normalize_choice`` to support enums,
        but it normalizes on the enum *name*:

            normed_value = choice.name if isinstance(choice, Enum) else str(choice)

        Typer has always matched on the enum *value*, so this override keeps
        the previous behavior. The context's ``token_normalize_func`` (when
        set) is applied next, and the result is case-folded when the choice
        is not case sensitive.
        """
        raw = choice.value if isinstance(choice, Enum) else choice
        result = str(raw)

        normalizer = None if ctx is None else ctx.token_normalize_func
        if normalizer is not None:
            result = normalizer(result)

        return result if self.case_sensitive else result.casefold()
LTA_openwebtext_dualt/mini_owt_logdirichlet/.venv_qwen35_uv/lib/python3.12/site-packages/typer/_typing.py ADDED
@@ -0,0 +1,73 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copied from pydantic 1.9.2 (the latest version to support python 3.6.)
2
+ # https://github.com/pydantic/pydantic/blob/v1.9.2/pydantic/typing.py
3
+ # Reduced drastically to only include Typer-specific 3.9+ functionality
4
+ # mypy: ignore-errors
5
+
6
+ import types
7
+ from collections.abc import Callable
8
+ from typing import (
9
+ Annotated,
10
+ Any,
11
+ Literal,
12
+ Union,
13
+ get_args,
14
+ get_origin,
15
+ get_type_hints,
16
+ )
17
+
18
+
19
+ def is_union(tp: type[Any] | None) -> bool:
20
+ return tp is Union or tp is types.UnionType # noqa: E721
21
+
22
+
23
# Names exported by a star-import of this module. The helpers
# ``literal_values`` and ``NONE_TYPES`` defined below are intentionally not
# listed here.
__all__ = (
    "NoneType",
    "is_none_type",
    "is_callable_type",
    "is_literal_type",
    "all_literal_values",
    "is_union",
    "Annotated",
    "Literal",
    "get_args",
    "get_origin",
    "get_type_hints",
)
36
+
37
+
38
# The class of the ``None`` singleton.
NoneType = type(None)


# Every spelling of "no value" that an annotation may use.
NONE_TYPES: tuple[Any, Any, Any] = (None, NoneType, Literal[None])


def is_none_type(type_: Any) -> bool:
    """Return True when ``type_`` is one of the objects in ``NONE_TYPES``.

    Membership is decided by identity (``is``), never by equality, so values
    that merely compare equal to one of the entries are not matched.
    """
    return any(type_ is candidate for candidate in NONE_TYPES)
49
+
50
+
51
def is_callable_type(type_: type[Any]) -> bool:
    """Return True for bare ``Callable`` and for parametrized ``Callable[...]``."""
    if type_ is Callable:
        return True
    # Parametrized forms report Callable as their origin.
    return get_origin(type_) is Callable
53
+
54
+
55
def is_literal_type(type_: type[Any]) -> bool:
    """Return True when ``type_`` is a parametrized ``Literal[...]``."""
    origin = get_origin(type_)
    return origin is Literal
57
+
58
+
59
def literal_values(type_: type[Any]) -> tuple[Any, ...]:
    """Return the type arguments of ``type_`` exactly as ``get_args`` reports them."""
    arguments = get_args(type_)
    return arguments
61
+
62
+
63
def all_literal_values(type_: type[Any]) -> tuple[Any, ...]:
    """Flatten a possibly nested ``Literal`` into a tuple of its values.

    ``Literal`` may be nested (see https://www.python.org/dev/peps/pep-0586),
    e.g. ``Literal[Literal[Literal[1, 2, 3], "foo"], 5, None]``. Anything that
    is not a ``Literal`` is returned unchanged as a one-element tuple.
    """
    if is_literal_type(type_):
        flattened: list[Any] = []
        for member in literal_values(type_):
            # Each member may itself be a Literal, so recurse.
            flattened.extend(all_literal_values(member))
        return tuple(flattened)

    return (type_,)
LTA_openwebtext_dualt/mini_owt_logdirichlet/.venv_qwen35_uv/lib/python3.12/site-packages/typer/cli.py ADDED
@@ -0,0 +1,318 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import importlib.util
2
+ import re
3
+ import sys
4
+ from pathlib import Path
5
+ from typing import Any
6
+
7
+ import click
8
+ import typer
9
+ import typer.core
10
+ from click import Command, Group, Option
11
+
12
+ from . import __version__
13
+ from .core import HAS_RICH, MARKUP_MODE_KEY
14
+
15
# Attribute names probed, in this order, by get_typer_from_module() when
# looking for a Typer app object in a loaded module.
default_app_names = ("app", "cli", "main")
# Attribute names probed, in this order, by get_typer_from_module() when
# falling back to a plain callable that gets wrapped in a new Typer app.
default_func_names = ("main", "cli", "app")

# Root application returned by main(); ``utils_app`` is mounted on it as the
# "utils" sub-command group.
app = typer.Typer()
utils_app = typer.Typer(help="Extra utility commands for Typer apps.")
app.add_typer(utils_app, name="utils")
21
+
22
+
23
class State:
    """Mutable holder for what the command line asked to load.

    The fields are filled in by maybe_update_state() from the click context
    parameters and read by the get_typer_from_*() helpers.
    """

    def __init__(self) -> None:
        # Value of the ``--app`` option: attribute name of the Typer object.
        self.app: str | None = None
        # Value of the ``--func`` option: attribute name of the callable.
        self.func: str | None = None
        # Set when ``path_or_module`` points at an existing file.
        self.file: Path | None = None
        # Set when ``path_or_module`` is a dotted module name instead.
        self.module: str | None = None


# Single module-level instance shared by every function in this file.
state = State()
32
+
33
+
34
def maybe_update_state(ctx: click.Context) -> None:
    """Copy the relevant parameters of ``ctx`` into the module-level ``state``.

    ``path_or_module`` is stored as ``state.file`` when it names an existing
    file, otherwise as ``state.module`` provided it is a syntactically valid
    dotted module name; anything else prints an error and exits with status 1.
    ``app`` and ``func`` are stored only when they are set to a truthy value,
    so earlier values are never cleared.
    """
    params = ctx.params

    target = params.get("path_or_module")
    if target:
        candidate = Path(target)
        if candidate.exists() and candidate.is_file():
            state.file = candidate
        elif re.fullmatch(r"[a-zA-Z_]\w*(\.[a-zA-Z_]\w*)*", target):
            state.module = target
        else:
            typer.echo(f"Not a valid file or Python module: {target}", err=True)
            sys.exit(1)

    for key in ("app", "func"):
        value = params.get(key)
        if value:
            setattr(state, key, value)
53
+
54
+
55
class TyperCLIGroup(typer.core.TyperGroup):
    """Typer group that lazily registers a ``run`` command for the loaded app.

    Every entry point that may need the command list (listing, lookup and
    invocation) first refreshes the module-level state from the context and
    then tries to add the ``run`` command before delegating to the parent.
    """

    def list_commands(self, ctx: click.Context) -> list[str]:
        """List command names after making sure ``run`` had a chance to be added."""
        self.maybe_add_run(ctx)
        return super().list_commands(ctx)

    def get_command(self, ctx: click.Context, name: str) -> Command | None:  # ty: ignore[invalid-method-override]
        """Look up ``name`` after making sure ``run`` had a chance to be added."""
        self.maybe_add_run(ctx)
        return super().get_command(ctx, name)

    def invoke(self, ctx: click.Context) -> Any:
        """Invoke the group after making sure ``run`` had a chance to be added."""
        self.maybe_add_run(ctx)
        return super().invoke(ctx)

    def maybe_add_run(self, ctx: click.Context) -> None:
        """Sync state from ``ctx`` and register the ``run`` command if possible."""
        maybe_update_state(ctx)
        maybe_add_run_to_cli(self)
71
+
72
+
73
def _typer_from_callable(func_obj: Any) -> typer.Typer:
    """Wrap a plain callable in a fresh Typer app with a single command."""
    sub_app = typer.Typer()
    sub_app.command()(func_obj)
    return sub_app


def get_typer_from_module(module: Any) -> typer.Typer | None:
    """Find, or build, the Typer app that ``module`` should be run as.

    Resolution order:

    1. ``state.app`` (``--app``): that attribute must be a ``typer.Typer``,
       otherwise an error is printed and the process exits with status 1.
    2. ``state.func`` (``--func``): that attribute must be callable, otherwise
       an error is printed and ``typer.Exit(1)`` is raised; the callable is
       wrapped in a new Typer app.
    3. The first name in ``default_app_names`` bound to a ``typer.Typer``.
    4. Any other attribute bound to a ``typer.Typer``.
    5. The first name in ``default_func_names`` bound to a callable.
    6. Any other attribute bound to a callable.

    Steps 4 and 6 walk the module attributes in the alphabetical order
    returned by ``dir()``. Iterating a ``set`` here would make the pick depend
    on string hashing, which can differ from one interpreter run to the next
    when a module has several candidates.

    Returns:
        The Typer app to run, or None when the module has no usable object.
    """
    # Try to get defined app
    if state.app:
        obj = getattr(module, state.app, None)
        if not isinstance(obj, typer.Typer):
            typer.echo(f"Not a Typer object: --app {state.app}", err=True)
            sys.exit(1)
        return obj
    # Try to get defined function
    if state.func:
        func_obj = getattr(module, state.func, None)
        if not callable(func_obj):
            typer.echo(f"Not a function: --func {state.func}", err=True)
            raise typer.Exit(1)
        return _typer_from_callable(func_obj)
    # dir() returns an alphabetically sorted list, which gives the "any ..."
    # fallbacks below a stable, reproducible order.
    local_names = dir(module)
    local_names_set = set(local_names)
    # Try to get a default Typer app
    for name in default_app_names:
        if name in local_names_set:
            obj = getattr(module, name, None)
            if isinstance(obj, typer.Typer):
                return obj
    # Try to get any Typer app
    for name in local_names:
        if name in default_app_names:
            continue
        obj = getattr(module, name)
        if isinstance(obj, typer.Typer):
            return obj
    # Try to get a default function
    for func_name in default_func_names:
        func_obj = getattr(module, func_name, None)
        if callable(func_obj):
            return _typer_from_callable(func_obj)
    # Try to get any func app
    for func_name in local_names:
        if func_name in default_func_names:
            continue
        func_obj = getattr(module, func_name)
        if callable(func_obj):
            return _typer_from_callable(func_obj)
    return None
119
+
120
+
121
def get_typer_from_state() -> typer.Typer | None:
    """Import the file or module recorded in ``state`` and extract its Typer app.

    ``state.file`` takes precedence over ``state.module``. When no import spec
    (or no loader for it) can be obtained, an error is written to stderr and
    the process exits with status 1.

    Returns:
        Whatever get_typer_from_module() returns for the imported module: a
        Typer app, or None when the module has no usable object.
    """
    spec = None
    if state.file:
        module_name = state.file.name
        spec = importlib.util.spec_from_file_location(module_name, str(state.file))
    elif state.module:
        try:
            spec = importlib.util.find_spec(state.module)
        except ModuleNotFoundError as err:
            # For a dotted name find_spec() imports the parent package first
            # and raises instead of returning None when it is missing. Treat
            # that like "not found", but let errors about unrelated modules
            # (e.g. a dependency missing inside the package) propagate.
            missing = err.name or ""
            requested = state.module
            if requested != missing and not requested.startswith(f"{missing}."):
                raise
            spec = None
    # A spec without a loader cannot be executed, so report it the same way.
    if spec is None or spec.loader is None:
        if state.file:
            typer.echo(f"Could not import as Python file: {state.file}", err=True)
        else:
            typer.echo(f"Could not import as Python module: {state.module}", err=True)
        sys.exit(1)
    module = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(module)
    return get_typer_from_module(module)
139
+
140
+
141
def maybe_add_run_to_cli(cli: click.Group) -> None:
    """Register the loaded Typer app on ``cli`` as a command named ``run``.

    Nothing happens when ``cli`` already has a ``run`` command, when neither a
    file nor a module has been recorded in ``state``, or when no Typer app can
    be obtained from it.
    """
    if "run" in cli.commands:
        return
    if not (state.file or state.module):
        return

    typer_app = get_typer_from_state()
    if not typer_app:
        return

    # The outer CLI already provides completion; disable it on the wrapped app.
    typer_app._add_completion = False
    run_command = typer.main.get_command(typer_app)
    run_command.name = "run"
    if not run_command.help:
        run_command.help = "Run the provided Typer app."
    cli.add_command(run_command)
152
+
153
+
154
def print_version(ctx: click.Context, param: Option, value: bool) -> None:
    """Option callback: print the Typer version and stop the program.

    Does nothing when the flag is not set or while click is doing resilient
    parsing.
    """
    if value and not ctx.resilient_parsing:
        typer.echo(f"Typer version: {__version__}")
        raise typer.Exit()
159
+
160
+
161
# Root callback of ``app``. Its docstring is left untouched because the
# decorator may expose it as user-facing help text; ``version`` is handled
# entirely by the print_version option callback.
@app.callback(cls=TyperCLIGroup, no_args_is_help=True)
def callback(
    ctx: typer.Context,
    *,
    path_or_module: str = typer.Argument(None),
    app: str = typer.Option(None, help="The typer app object/variable to use."),
    func: str = typer.Option(None, help="The function to convert to Typer."),
    version: bool = typer.Option(
        False,
        "--version",
        help="Print version and exit.",
        callback=print_version,
    ),
) -> None:
    """
    Run Typer scripts with completion, without having to create a package.

    You probably want to install completion for the typer command:

    $ typer --install-completion

    https://typer.tiangolo.com/
    """
    # The parameters are read back from ``ctx.params`` rather than used
    # directly, so the same helper also works from TyperCLIGroup.
    maybe_update_state(ctx)
185
+
186
+
187
def get_docs_for_click(
    *,
    obj: Command,
    ctx: typer.Context,
    indent: int = 0,
    name: str = "",
    call_prefix: str = "",
    title: str | None = None,
) -> str:
    """Render Markdown documentation for a click command, recursing into groups.

    Args:
        obj: The command (or group) to document.
        ctx: Context used to collect usage pieces, parameters and sub-commands.
        indent: Nesting depth; the heading uses ``1 + indent`` ``#`` characters.
        name: Name to show instead of ``obj.name``.
        call_prefix: Parent invocation prepended to the command name.
        title: Heading text; defaults to the back-quoted command name, or
            ``CLI`` when there is no name.

    Returns:
        The Markdown text for ``obj`` followed by that of every sub-command.
    """
    full_name = name or obj.name
    if call_prefix:
        full_name = f"{call_prefix} {full_name}"
    if not title:
        title = f"`{full_name}`" if full_name else "CLI"

    # Help text is converted from Rich markup only when Rich is available and
    # the app asked for "rich" markup mode via the context object.
    markup_mode = None
    if hasattr(ctx, "obj") and isinstance(ctx.obj, dict):
        markup_mode = ctx.obj.get(MARKUP_MODE_KEY, None)
    to_parse: bool = bool(HAS_RICH and (markup_mode == "rich"))

    def bullet(label: Any, help_text: Any) -> str:
        # One "* `label`: help" list entry; the help part is optional.
        entry = f"* `{label}`"
        if help_text:
            entry += f": {_parse_html(to_parse, help_text)}"
        return entry + "\n"

    chunks: list[str] = ["#" * (1 + indent), f" {title}\n\n"]

    if obj.help:
        chunks.append(f"{_parse_html(to_parse, obj.help)}\n\n")

    usage_pieces = obj.collect_usage_pieces(ctx)
    if usage_pieces:
        invocation = f"{full_name} " if full_name else ""
        chunks.append("**Usage**:\n\n")
        chunks.append("```console\n")
        chunks.append(f"$ {invocation}{' '.join(usage_pieces)}\n")
        chunks.append("```\n\n")

    # Help records grouped by parameter kind; other kinds are ignored.
    records: dict[str, list[Any]] = {"argument": [], "option": []}
    for param in obj.get_params(ctx):
        record = param.get_help_record(ctx)
        if record is not None and param.param_type_name in records:
            records[param.param_type_name].append(record)

    sections = (
        ("**Arguments**:\n\n", records["argument"]),
        ("**Options**:\n\n", records["option"]),
    )
    for heading, entries in sections:
        if entries:
            chunks.append(heading)
            for label, help_text in entries:
                chunks.append(bullet(label, help_text))
            chunks.append("\n")

    if obj.epilog:
        chunks.append(f"{obj.epilog}\n\n")

    if isinstance(obj, Group):
        subcommand_names = obj.list_commands(ctx)
        if subcommand_names:
            chunks.append("**Commands**:\n\n")
            for subcommand_name in subcommand_names:
                subcommand = obj.get_command(ctx, subcommand_name)
                assert subcommand
                chunks.append(
                    bullet(subcommand.name, subcommand.get_short_help_str())
                )
            chunks.append("\n")
        # Full documentation of each sub-command, one heading level deeper.
        prefix = f"{full_name}" if full_name else ""
        for subcommand_name in subcommand_names:
            subcommand = obj.get_command(ctx, subcommand_name)
            assert subcommand
            chunks.append(
                get_docs_for_click(
                    obj=subcommand, ctx=ctx, indent=indent + 1, call_prefix=prefix
                )
            )

    return "".join(chunks)
269
+
270
+
271
+ def _parse_html(to_parse: bool, input_text: str) -> str:
272
+ if not to_parse:
273
+ return input_text
274
+ from . import rich_utils
275
+
276
+ return rich_utils.rich_to_html(input_text)
277
+
278
+
279
@utils_app.command()
def docs(
    ctx: typer.Context,
    name: str = typer.Option("", help="The name of the CLI program to use in docs."),
    output: Path | None = typer.Option(
        None,
        help="An output file to write docs to, like README.md.",
        file_okay=True,
        dir_okay=False,
    ),
    title: str | None = typer.Option(
        None,
        help="The title for the documentation page. If not provided, the name of "
        "the program is used.",
    ),
) -> None:
    """
    Generate Markdown docs for a Typer app.
    """
    # Load the app recorded in the module-level state; abort when none exists.
    target_app = get_typer_from_state()
    if not target_app:
        typer.echo("No Typer app found", err=True)
        raise typer.Abort()

    # Hand the app's markup mode to get_docs_for_click() through ctx.obj.
    if hasattr(target_app, "rich_markup_mode"):
        if getattr(ctx, "obj", None) is None:
            ctx.ensure_object(dict)
        if isinstance(ctx.obj, dict):
            ctx.obj[MARKUP_MODE_KEY] = target_app.rich_markup_mode

    command = typer.main.get_command(target_app)
    markdown = get_docs_for_click(obj=command, ctx=ctx, name=name, title=title)
    # Surrounding whitespace is trimmed and exactly one trailing newline kept.
    rendered = f"{markdown.strip()}\n"

    if not output:
        typer.echo(rendered)
        return
    output.write_text(rendered)
    typer.echo(f"Docs saved to: {output}")
315
+
316
+
317
def main() -> Any:
    """Invoke the module-level ``app`` and return whatever it returns."""
    return app()