Add files using upload-large-folder tool
Browse filesThis view is limited to 50 files because it contains too many changes. See raw diff
- LTA_openwebtext_dualt/logs/_smoke4_lm1b_dualtline_cmax16_nwguard_20260503_232708.log +43 -0
- LTA_openwebtext_dualt/logs/ar_lm1b_flmpack_bert_small_len128_gbs512_4gpu_1m_rowshard_b128_resume3000_20260504_202541.log +84 -0
- LTA_openwebtext_dualt/logs/ar_owt_gpt2_len1024_from100k_modelonly_lr1e4_wd0p1_b2p95_cosine_8gpu.log +0 -0
- LTA_openwebtext_dualt/logs/bench_lta_dualt_1gpu_b32_len1024_20260428_223957.log +101 -0
- LTA_openwebtext_dualt/logs/compact_gpt2bpe_v2048_stream1024_fullycoupled_mask1_wd0p1_fp32_8gpu/lta_owt_compact_gpt2bpe_v2048_stream1024_fullycoupled_rmsnorm_nobias_adamw_wd0p1_logitnormal_m1p5_s0p8_hardce_mask1p0-1p0_fp32_ddit768x12_gbs512_8gpu_1m_20260517_141027.log +0 -0
- LTA_openwebtext_dualt/logs/ctx1024_rollin_sweep_4gpu/20260517_queued_ctx1024_sweep.log +1 -0
- LTA_openwebtext_dualt/logs/ctx1024_rollin_sweep_4gpu/ctx1024_sweep_selected_20260517_210705.log +1306 -0
- LTA_openwebtext_dualt/logs/ctx1024_rollin_sweep_4gpu/ctx1024_sweep_selected_20260517_210705.nohup +0 -0
- LTA_openwebtext_dualt/logs/ctx1024_rollin_sweep_4gpu/queued_ctx1024_sweep.nohup +1 -0
- LTA_openwebtext_dualt/logs/ctx1024_sampledpath_sweep_4gpu/ctx1024_sampledpath_20260517_223933.nohup +246 -0
- LTA_openwebtext_dualt/logs/ctx1024_sampledpath_sweep_4gpu/ctx1024_sampledpath_true_20260517_224139.nohup +985 -0
- LTA_openwebtext_dualt/logs/decode_timegrid_trace_len256_copied_20260517_155402.log +0 -0
- LTA_openwebtext_dualt/logs/elf_lm1b_t5small_elfb_aligned_datasetfix_len128_4gpu_tinysmoke_20260513.log +18 -0
- LTA_openwebtext_dualt/logs/elfaligned_t5record_4gpu/lta_owt_t5record_len1024_elfaligned_dditelf_muon_logitnormal_m1p5_s0p8_none_floor0p0_zeroout_tf32_gbs512_4gpu_20260516_011722.log +296 -0
- LTA_openwebtext_dualt/logs/eval_20260506/ar_8gpu_latest_temp_sweep_20260506_110706.log +47 -0
- LTA_openwebtext_dualt/logs/eval_20260506/categorical_c1024_diffusion_eta1_stateweight_latest_20260506_113031.log +13 -0
- LTA_openwebtext_dualt/logs/eval_20260506/categorical_c1024_diffusion_finalsample_hightemp_quick_20260506_114232.log +14 -0
- LTA_openwebtext_dualt/logs/eval_20260506/categorical_c1024_diffusion_finalsample_latest_20260506_113603.log +41 -0
- LTA_openwebtext_dualt/logs/eval_20260506/categorical_c1024_diffusion_rolling_quick512_20260506_112740.log +9 -0
- LTA_openwebtext_dualt/logs/eval_20260506/categorical_c1024_diffusion_rolling_sweep_latest_20260506_112546.log +11 -0
- LTA_openwebtext_dualt/logs/eval_20260506/categorical_c1024_rolling_noise_focus_latest_20260506_112101.log +5 -0
- LTA_openwebtext_dualt/logs/eval_20260506/categorical_c1024_rolling_noise_sweep_latest_20260506_110706.log +41 -0
- LTA_openwebtext_dualt/logs/eval_20260508/dirichlet_step122k_key3_state_n256.log +1 -0
- LTA_openwebtext_dualt/logs/eval_20260508/dirichlet_step122k_quick2_128steps_n64.log +4 -0
- LTA_openwebtext_dualt/logs/eval_20260508/dirichlet_step122k_quick2_n64.log +4 -0
- LTA_openwebtext_dualt/logs/eval_20260508/dirichlet_step124k_256steps_n64.log +4 -0
- LTA_openwebtext_dualt/logs/eval_20260508/dirichlet_step124k_4096steps_n64.log +4 -0
- LTA_openwebtext_dualt/logs/eval_20260508/dirichlet_step124k_steps16_n64.log +4 -0
- LTA_openwebtext_dualt/logs/eval_20260508/dirichlet_step124k_steps32_n64.log +4 -0
- LTA_openwebtext_dualt/logs/eval_20260508/dirichlet_step124k_steps64_n64.log +4 -0
- LTA_openwebtext_dualt/logs/eval_20260508/dirichlet_step124k_steps8_n64.log +4 -0
- LTA_openwebtext_dualt/logs/eval_20260508/lm1b_8gpu_latest_1024steps_n64.log +4 -0
- LTA_openwebtext_dualt/logs/eval_20260508/lm1b_8gpu_latest_diffusion_noise_steps_128steps_n64.log +88 -0
- LTA_openwebtext_dualt/logs/eval_20260508/lm1b_8gpu_latest_step146k_128steps_n64.log +4 -0
- LTA_openwebtext_dualt/logs/eval_20260508/lm1b_8gpu_latest_temp_push43_128steps_n64.log +56 -0
- LTA_openwebtext_dualt/logs/eval_20260508/lm1b_8gpu_latest_temp_schedule_128steps_n64.log +32 -0
- LTA_openwebtext_dualt/logs/eval_20260508/mauve_step124k_n64_features.log +21 -0
- LTA_openwebtext_dualt/logs/eval_selfcond/selfcond_step1000_dirres_n16_s256_20260514_023314.log +5 -0
- LTA_openwebtext_dualt/logs/eval_selfcond/selfcond_step1000_online_dirres_n16_s256.log +4 -0
- LTA_openwebtext_dualt/logs/eval_selfcond/selfcond_step1000_online_dirres_n8_s128_smoke.log +6 -0
- LTA_openwebtext_dualt/logs/fullycoupled_tpow2_wd0p1_fp32_8gpu/lta_owt_gpt2cached_len1024_fullycoupled_rmsnorm_nobias_adamw_wd0p1_tpow2_nanogpt_tf32_ddit768x12_gbs512_8gpu_1m_20260515_003246.log +0 -0
- LTA_openwebtext_dualt/logs/fullycoupled_uniform_mask1_swiglu_wd0p1_fp32_4gpu/lta_owt_gpt2cached_len1024_fullycoupled_rmsnorm_nobias_swiglu_adamw_wd0p1_uniformt_hardce_mask1p0-1p0_nanogpt_fp32_ddit768x12_gbs512_4gpu_1m_20260517_133638.log +163 -0
- LTA_openwebtext_dualt/logs/fullycoupled_uniform_mask1_swiglu_wd0p1_fp32_4gpu/lta_owt_gpt2cached_len1024_fullycoupled_rmsnorm_nobias_swiglu_adamw_wd0p1_uniformt_hardce_mask1p0-1p0_nanogpt_fp32_ddit768x12_gbs512_4gpu_1m_20260517_133638.outer.log +174 -0
- LTA_openwebtext_dualt/logs/genppl_lm1b_step_latest_k1024_s128_flm.log +0 -0
- LTA_openwebtext_dualt/logs/infer_owt_compact_v2048_ckpt_sweep_steps128_c256_temps_n8_large_20260520_205159.log +0 -0
- LTA_openwebtext_dualt/logs/infer_owt_compact_v8192_probe_flow_onehot_steps128_c1024_t1p45_n8_large_20260520_201801.log +42 -0
- LTA_openwebtext_dualt/logs/infer_owt_compact_v8192_step142k_flow_onehot_argmax_sweep_n8_large_20260520_202516.log +184 -0
- LTA_openwebtext_dualt/logs/infer_owt_t5_2node_latest_trainmatched_dirres_c128_lowtemp_n8.log +58 -0
- LTA_openwebtext_dualt/logs/infer_owt_t5_2node_latest_trainmatched_dirres_grid_n8.log +32 -0
- LTA_openwebtext_dualt/logs/infer_owt_t5_2node_step290000_compare_n8_20260520_200659.log +44 -0
LTA_openwebtext_dualt/logs/_smoke4_lm1b_dualtline_cmax16_nwguard_20260503_232708.log
ADDED
|
@@ -0,0 +1,43 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
*****************************************
|
| 3 |
+
Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed.
|
| 4 |
+
*****************************************
|
| 5 |
+
NCCL version 2.25.1+cuda12.8
|
| 6 |
+
{
|
| 7 |
+
"device": "cuda:0",
|
| 8 |
+
"rank": 0,
|
| 9 |
+
"world_size": 4,
|
| 10 |
+
"samples": "wrapped_streaming",
|
| 11 |
+
"vocab_size": 30522,
|
| 12 |
+
"save_dir": "runs/_smoke4_lm1b_dualtline_cmax16_nwguard_20260503_232708",
|
| 13 |
+
"batch_size": 1,
|
| 14 |
+
"grad_accum": 1,
|
| 15 |
+
"effective_batch_size": 4,
|
| 16 |
+
"global_batch_size": 4,
|
| 17 |
+
"lr_schedule": "constant_warmup",
|
| 18 |
+
"warmup_steps": 1,
|
| 19 |
+
"adam_beta1": 0.9,
|
| 20 |
+
"adam_beta2": 0.999,
|
| 21 |
+
"adam_eps": 1e-08,
|
| 22 |
+
"model_type": "ddit",
|
| 23 |
+
"dual_t": true,
|
| 24 |
+
"corrupt_t_mode": "independent",
|
| 25 |
+
"corrupt_min_t": 0.0,
|
| 26 |
+
"corrupt_max_t": 1.0,
|
| 27 |
+
"dirichlet_endpoint_mode": "dual_t_line",
|
| 28 |
+
"dirichlet_semantic_t_mode": "independent",
|
| 29 |
+
"dirichlet_semantic_t_value": 0.0,
|
| 30 |
+
"torch_compile": false,
|
| 31 |
+
"compile_mode": "max-autotune",
|
| 32 |
+
"state_format": "prob",
|
| 33 |
+
"target_loss": "hard_ce",
|
| 34 |
+
"meanflow_weight": 0.0,
|
| 35 |
+
"bridge_noise_init": "logistic_normal",
|
| 36 |
+
"noise_sigma": -1.0,
|
| 37 |
+
"wrap": true,
|
| 38 |
+
"openwebtext_split": "all",
|
| 39 |
+
"num_workers": 0,
|
| 40 |
+
"latest_every": 0,
|
| 41 |
+
"resume_path": ""
|
| 42 |
+
}
|
| 43 |
+
step=1 micro_steps=1 elapsed=0.8s lr=3.000000e-04 loss_all=10.3125 acc_all=0.0000 loss_corrupt=10.3125 acc_corrupt=0.0000 corrupt_frac=0.2812 loss=10.3125 loss_recon=10.3125 loss_meanflow=0.0000 mean_model_t=0.0669 mean_corrupt_t=0.6396 wrong_frac=1.0000 init_acc_corrupt=0.4444 init_gold_top10=1.0000 init_gold_top100=1.0000
|
LTA_openwebtext_dualt/logs/ar_lm1b_flmpack_bert_small_len128_gbs512_4gpu_1m_rowshard_b128_resume3000_20260504_202541.log
ADDED
|
@@ -0,0 +1,84 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
*****************************************
|
| 3 |
+
Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed.
|
| 4 |
+
*****************************************
|
| 5 |
+
NCCL version 2.25.1+cuda12.8
|
| 6 |
+
resumed_from=runs/ar_lm1b_flmpack_bert_small_len128_gbs512_4gpu_1m_rowshard_20260504_201527/latest.pt start_step=3001
|
| 7 |
+
{
|
| 8 |
+
"task": "ar_lm",
|
| 9 |
+
"device": "cuda:0",
|
| 10 |
+
"rank": 0,
|
| 11 |
+
"world_size": 4,
|
| 12 |
+
"samples": "wrapped_streaming",
|
| 13 |
+
"vocab_size": 30522,
|
| 14 |
+
"bos_id": 101,
|
| 15 |
+
"eos_id": 102,
|
| 16 |
+
"save_dir": "runs/ar_lm1b_flmpack_bert_small_len128_gbs512_4gpu_1m_rowshard_b128_resume3000_20260504_202541",
|
| 17 |
+
"params": 108440832,
|
| 18 |
+
"batch_size": 128,
|
| 19 |
+
"grad_accum": 1,
|
| 20 |
+
"effective_batch_size": 512,
|
| 21 |
+
"global_batch_size": 512,
|
| 22 |
+
"max_len": 128,
|
| 23 |
+
"wrap": true,
|
| 24 |
+
"text_detokenizer": "lm1b",
|
| 25 |
+
"openwebtext_split": "all",
|
| 26 |
+
"torch_compile": false
|
| 27 |
+
}
|
| 28 |
+
step=3020 micro_steps=3020 elapsed=6.0s lr=3.000000e-04 loss=4.0947 ppl=60.0657 acc=0.3150 tokens=16256.0000
|
| 29 |
+
step=3040 micro_steps=3040 elapsed=4.4s lr=3.000000e-04 loss=4.0777 ppl=59.0408 acc=0.3169 tokens=16256.0000
|
| 30 |
+
step=3060 micro_steps=3060 elapsed=4.4s lr=3.000000e-04 loss=4.0853 ppl=59.4897 acc=0.3171 tokens=16256.0000
|
| 31 |
+
step=3080 micro_steps=3080 elapsed=4.4s lr=3.000000e-04 loss=4.0716 ppl=58.6893 acc=0.3176 tokens=16256.0000
|
| 32 |
+
step=3100 micro_steps=3100 elapsed=4.4s lr=3.000000e-04 loss=4.0606 ppl=58.0390 acc=0.3181 tokens=16256.0000
|
| 33 |
+
step=3120 micro_steps=3120 elapsed=4.4s lr=3.000000e-04 loss=4.0630 ppl=58.1939 acc=0.3180 tokens=16256.0000
|
| 34 |
+
step=3140 micro_steps=3140 elapsed=4.4s lr=3.000000e-04 loss=4.0699 ppl=58.5920 acc=0.3186 tokens=16256.0000
|
| 35 |
+
step=3160 micro_steps=3160 elapsed=4.4s lr=3.000000e-04 loss=4.0768 ppl=58.9850 acc=0.3161 tokens=16256.0000
|
| 36 |
+
step=3180 micro_steps=3180 elapsed=4.4s lr=3.000000e-04 loss=4.0478 ppl=57.2913 acc=0.3189 tokens=16256.0000
|
| 37 |
+
step=3200 micro_steps=3200 elapsed=4.4s lr=3.000000e-04 loss=4.0432 ppl=57.0342 acc=0.3195 tokens=16256.0000
|
| 38 |
+
step=3220 micro_steps=3220 elapsed=4.4s lr=3.000000e-04 loss=4.0407 ppl=56.9072 acc=0.3205 tokens=16256.0000
|
| 39 |
+
step=3240 micro_steps=3240 elapsed=4.4s lr=3.000000e-04 loss=4.0478 ppl=57.2947 acc=0.3191 tokens=16256.0000
|
| 40 |
+
step=3260 micro_steps=3260 elapsed=4.4s lr=3.000000e-04 loss=4.0492 ppl=57.3790 acc=0.3194 tokens=16256.0000
|
| 41 |
+
step=3280 micro_steps=3280 elapsed=4.4s lr=3.000000e-04 loss=4.0349 ppl=56.5688 acc=0.3210 tokens=16256.0000
|
| 42 |
+
step=3300 micro_steps=3300 elapsed=4.4s lr=3.000000e-04 loss=4.0456 ppl=57.1688 acc=0.3195 tokens=16256.0000
|
| 43 |
+
step=3320 micro_steps=3320 elapsed=4.4s lr=3.000000e-04 loss=4.0447 ppl=57.1197 acc=0.3188 tokens=16256.0000
|
| 44 |
+
step=3340 micro_steps=3340 elapsed=4.4s lr=3.000000e-04 loss=4.0445 ppl=57.1161 acc=0.3209 tokens=16256.0000
|
| 45 |
+
step=3360 micro_steps=3360 elapsed=4.4s lr=3.000000e-04 loss=4.0363 ppl=56.6692 acc=0.3201 tokens=16256.0000
|
| 46 |
+
step=3380 micro_steps=3380 elapsed=4.4s lr=3.000000e-04 loss=4.0198 ppl=55.7166 acc=0.3226 tokens=16256.0000
|
| 47 |
+
step=3400 micro_steps=3400 elapsed=4.4s lr=3.000000e-04 loss=4.0166 ppl=55.5595 acc=0.3225 tokens=16256.0000
|
| 48 |
+
step=3420 micro_steps=3420 elapsed=4.4s lr=3.000000e-04 loss=4.0025 ppl=54.7572 acc=0.3250 tokens=16256.0000
|
| 49 |
+
step=3440 micro_steps=3440 elapsed=4.4s lr=3.000000e-04 loss=4.0096 ppl=55.1576 acc=0.3227 tokens=16256.0000
|
| 50 |
+
step=3460 micro_steps=3460 elapsed=4.4s lr=3.000000e-04 loss=4.0131 ppl=55.3352 acc=0.3235 tokens=16256.0000
|
| 51 |
+
step=3480 micro_steps=3480 elapsed=4.4s lr=3.000000e-04 loss=4.0070 ppl=55.0338 acc=0.3237 tokens=16256.0000
|
| 52 |
+
step=3500 micro_steps=3500 elapsed=4.4s lr=3.000000e-04 loss=4.0074 ppl=55.0277 acc=0.3236 tokens=16256.0000
|
| 53 |
+
step=3520 micro_steps=3520 elapsed=4.4s lr=3.000000e-04 loss=4.0005 ppl=54.6736 acc=0.3242 tokens=16256.0000
|
| 54 |
+
step=3540 micro_steps=3540 elapsed=4.4s lr=3.000000e-04 loss=4.0080 ppl=55.0657 acc=0.3232 tokens=16256.0000
|
| 55 |
+
step=3560 micro_steps=3560 elapsed=4.4s lr=3.000000e-04 loss=4.0036 ppl=54.8304 acc=0.3232 tokens=16256.0000
|
| 56 |
+
step=3580 micro_steps=3580 elapsed=4.4s lr=3.000000e-04 loss=3.9893 ppl=54.0694 acc=0.3265 tokens=16256.0000
|
| 57 |
+
step=3600 micro_steps=3600 elapsed=4.4s lr=3.000000e-04 loss=4.0006 ppl=54.6650 acc=0.3238 tokens=16256.0000
|
| 58 |
+
step=3620 micro_steps=3620 elapsed=4.4s lr=3.000000e-04 loss=3.9700 ppl=53.0214 acc=0.3266 tokens=16256.0000
|
| 59 |
+
step=3640 micro_steps=3640 elapsed=4.4s lr=3.000000e-04 loss=3.9747 ppl=53.2832 acc=0.3263 tokens=16256.0000
|
| 60 |
+
step=3660 micro_steps=3660 elapsed=4.4s lr=3.000000e-04 loss=3.9813 ppl=53.6207 acc=0.3253 tokens=16256.0000
|
| 61 |
+
step=3680 micro_steps=3680 elapsed=4.4s lr=3.000000e-04 loss=3.9763 ppl=53.3634 acc=0.3270 tokens=16256.0000
|
| 62 |
+
step=3700 micro_steps=3700 elapsed=4.4s lr=3.000000e-04 loss=3.9751 ppl=53.2994 acc=0.3264 tokens=16256.0000
|
| 63 |
+
step=3720 micro_steps=3720 elapsed=4.4s lr=3.000000e-04 loss=3.9777 ppl=53.4121 acc=0.3253 tokens=16256.0000
|
| 64 |
+
step=3740 micro_steps=3740 elapsed=4.4s lr=3.000000e-04 loss=3.9552 ppl=52.2290 acc=0.3273 tokens=16256.0000
|
| 65 |
+
step=3760 micro_steps=3760 elapsed=4.4s lr=3.000000e-04 loss=3.9726 ppl=53.1452 acc=0.3267 tokens=16256.0000
|
| 66 |
+
step=3780 micro_steps=3780 elapsed=4.4s lr=3.000000e-04 loss=3.9734 ppl=53.2219 acc=0.3265 tokens=16256.0000
|
| 67 |
+
step=3800 micro_steps=3800 elapsed=4.4s lr=3.000000e-04 loss=3.9694 ppl=52.9792 acc=0.3259 tokens=16256.0000
|
| 68 |
+
step=3820 micro_steps=3820 elapsed=4.4s lr=3.000000e-04 loss=3.9546 ppl=52.2044 acc=0.3272 tokens=16256.0000
|
| 69 |
+
step=3840 micro_steps=3840 elapsed=4.4s lr=3.000000e-04 loss=3.9565 ppl=52.3033 acc=0.3292 tokens=16256.0000
|
| 70 |
+
step=3860 micro_steps=3860 elapsed=4.4s lr=3.000000e-04 loss=3.9553 ppl=52.2470 acc=0.3278 tokens=16256.0000
|
| 71 |
+
step=3880 micro_steps=3880 elapsed=4.4s lr=3.000000e-04 loss=3.9563 ppl=52.2977 acc=0.3282 tokens=16256.0000
|
| 72 |
+
step=3900 micro_steps=3900 elapsed=4.4s lr=3.000000e-04 loss=3.9502 ppl=51.9592 acc=0.3286 tokens=16256.0000
|
| 73 |
+
step=3920 micro_steps=3920 elapsed=4.4s lr=3.000000e-04 loss=3.9436 ppl=51.6124 acc=0.3299 tokens=16256.0000
|
| 74 |
+
step=3940 micro_steps=3940 elapsed=4.4s lr=3.000000e-04 loss=3.9517 ppl=52.0682 acc=0.3288 tokens=16256.0000
|
| 75 |
+
step=3960 micro_steps=3960 elapsed=4.4s lr=3.000000e-04 loss=3.9499 ppl=51.9636 acc=0.3275 tokens=16256.0000
|
| 76 |
+
step=3980 micro_steps=3980 elapsed=4.4s lr=3.000000e-04 loss=3.9439 ppl=51.6394 acc=0.3281 tokens=16256.0000
|
| 77 |
+
step=4000 micro_steps=4000 elapsed=4.4s lr=3.000000e-04 loss=3.9314 ppl=51.0094 acc=0.3300 tokens=16256.0000
|
| 78 |
+
[sample step=4000] [CLS] bashir on thursday. [SEP] he had often played cocaine and used a baseball used to talk about the prime minister's success. [SEP] i am glad you put a lot of in my swelling pool and 314 of the same. [SEP] "if the president didn't address that situation [monday], that celebration and he would accept it. [SEP] being an honour will soon be largely symbolic. [SEP] so we haven't experienced much of the season," hamilton was quoted as saying. [SEP] i'm sure that the administration of john f. kennedy has to realize that he's too rotten to move on from working with his [SEP]
|
| 79 |
+
step=4020 micro_steps=4020 elapsed=7.2s lr=3.000000e-04 loss=3.9248 ppl=50.6715 acc=0.3315 tokens=16256.0000
|
| 80 |
+
step=4040 micro_steps=4040 elapsed=4.5s lr=3.000000e-04 loss=3.9372 ppl=51.3072 acc=0.3292 tokens=16256.0000
|
| 81 |
+
step=4060 micro_steps=4060 elapsed=4.4s lr=3.000000e-04 loss=3.9534 ppl=52.1452 acc=0.3275 tokens=16256.0000
|
| 82 |
+
step=4080 micro_steps=4080 elapsed=4.4s lr=3.000000e-04 loss=3.9326 ppl=51.0598 acc=0.3308 tokens=16256.0000
|
| 83 |
+
step=4100 micro_steps=4100 elapsed=4.4s lr=3.000000e-04 loss=3.9261 ppl=50.7547 acc=0.3310 tokens=16256.0000
|
| 84 |
+
step=4120 micro_steps=4120 elapsed=4.4s lr=3.000000e-04 loss=3.9235 ppl=50.6324 acc=0.3313 tokens=16256.0000
|
LTA_openwebtext_dualt/logs/ar_owt_gpt2_len1024_from100k_modelonly_lr1e4_wd0p1_b2p95_cosine_8gpu.log
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
LTA_openwebtext_dualt/logs/bench_lta_dualt_1gpu_b32_len1024_20260428_223957.log
ADDED
|
@@ -0,0 +1,101 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"device": "cuda:0",
|
| 3 |
+
"rank": 0,
|
| 4 |
+
"world_size": 1,
|
| 5 |
+
"samples": "wrapped_streaming",
|
| 6 |
+
"vocab_size": 50257,
|
| 7 |
+
"save_dir": "runs/bench_lta_dualt_1gpu_b32_len1024_20260428_223957",
|
| 8 |
+
"batch_size": 32,
|
| 9 |
+
"grad_accum": 1,
|
| 10 |
+
"effective_batch_size": 32,
|
| 11 |
+
"global_batch_size": 32,
|
| 12 |
+
"lr_schedule": "constant_warmup",
|
| 13 |
+
"warmup_steps": 1,
|
| 14 |
+
"model_type": "ddit",
|
| 15 |
+
"dual_t": true,
|
| 16 |
+
"corrupt_t_mode": "independent",
|
| 17 |
+
"corrupt_min_t": 0.0,
|
| 18 |
+
"corrupt_max_t": 1.0,
|
| 19 |
+
"torch_compile": false,
|
| 20 |
+
"compile_mode": "max-autotune",
|
| 21 |
+
"state_format": "prob",
|
| 22 |
+
"target_loss": "soft_ce",
|
| 23 |
+
"meanflow_weight": 0.0,
|
| 24 |
+
"bridge_noise_init": "logistic_normal",
|
| 25 |
+
"noise_sigma": -1.0,
|
| 26 |
+
"wrap": true,
|
| 27 |
+
"num_workers": 4
|
| 28 |
+
}
|
| 29 |
+
Traceback (most recent call last):
|
| 30 |
+
File "/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/train.py", line 456, in <module>
|
| 31 |
+
main()
|
| 32 |
+
File "/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/train.py", line 367, in main
|
| 33 |
+
batch = next(data_iter)
|
| 34 |
+
^^^^^^^^^^^^^^^
|
| 35 |
+
File "/usr/local/lib/python3.12/dist-packages/torch/utils/data/dataloader.py", line 708, in __next__
|
| 36 |
+
data = self._next_data()
|
| 37 |
+
^^^^^^^^^^^^^^^^^
|
| 38 |
+
File "/usr/local/lib/python3.12/dist-packages/torch/utils/data/dataloader.py", line 1487, in _next_data
|
| 39 |
+
return self._process_data(data, worker_id)
|
| 40 |
+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 41 |
+
File "/usr/local/lib/python3.12/dist-packages/torch/utils/data/dataloader.py", line 1522, in _process_data
|
| 42 |
+
data.reraise()
|
| 43 |
+
File "/usr/local/lib/python3.12/dist-packages/torch/_utils.py", line 733, in reraise
|
| 44 |
+
raise exception
|
| 45 |
+
zipfile.BadZipFile: Caught BadZipFile in DataLoader worker process 0.
|
| 46 |
+
Original Traceback (most recent call last):
|
| 47 |
+
File "/usr/local/lib/python3.12/dist-packages/torch/utils/data/_utils/worker.py", line 349, in _worker_loop
|
| 48 |
+
data = fetcher.fetch(index) # type: ignore[possibly-undefined]
|
| 49 |
+
^^^^^^^^^^^^^^^^^^^^
|
| 50 |
+
File "/usr/local/lib/python3.12/dist-packages/torch/utils/data/_utils/fetch.py", line 33, in fetch
|
| 51 |
+
data.append(next(self.dataset_iter))
|
| 52 |
+
^^^^^^^^^^^^^^^^^^^^^^^
|
| 53 |
+
File "/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/flowtext_lab/data.py", line 230, in __iter__
|
| 54 |
+
for text in iter_text_records(
|
| 55 |
+
File "/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/flowtext_lab/data.py", line 111, in iter_text_records
|
| 56 |
+
yield from _iter_parquet(path, text_column)
|
| 57 |
+
File "/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/flowtext_lab/data.py", line 69, in _iter_parquet
|
| 58 |
+
from datasets import Dataset as HFDataset
|
| 59 |
+
File "/usr/local/lib/python3.12/dist-packages/datasets/__init__.py", line 17, in <module>
|
| 60 |
+
from .arrow_dataset import Dataset
|
| 61 |
+
File "/usr/local/lib/python3.12/dist-packages/datasets/arrow_dataset.py", line 54, in <module>
|
| 62 |
+
import fsspec
|
| 63 |
+
File "/usr/local/lib/python3.12/dist-packages/fsspec/__init__.py", line 69, in <module>
|
| 64 |
+
process_entries()
|
| 65 |
+
File "/usr/local/lib/python3.12/dist-packages/fsspec/__init__.py", line 43, in process_entries
|
| 66 |
+
eps = entry_points()
|
| 67 |
+
^^^^^^^^^^^^^^
|
| 68 |
+
File "/usr/lib/python3.12/importlib/metadata/__init__.py", line 913, in entry_points
|
| 69 |
+
return EntryPoints(eps).select(**params)
|
| 70 |
+
^^^^^^^^^^^^^^^^
|
| 71 |
+
File "/usr/lib/python3.12/importlib/metadata/__init__.py", line 910, in <genexpr>
|
| 72 |
+
eps = itertools.chain.from_iterable(
|
| 73 |
+
^
|
| 74 |
+
File "/usr/lib/python3.12/importlib/metadata/_itertools.py", line 16, in unique_everseen
|
| 75 |
+
k = key(element)
|
| 76 |
+
^^^^^^^^^^^^
|
| 77 |
+
File "/usr/lib/python3.12/importlib/metadata/__init__.py", line 835, in _normalized_name
|
| 78 |
+
or super()._normalized_name
|
| 79 |
+
^^^^^^^^^^^^^^^^^^^^^^^^
|
| 80 |
+
File "/usr/lib/python3.12/importlib/metadata/__init__.py", line 462, in _normalized_name
|
| 81 |
+
return Prepared.normalize(self.name)
|
| 82 |
+
^^^^^^^^^
|
| 83 |
+
File "/usr/lib/python3.12/importlib/metadata/__init__.py", line 457, in name
|
| 84 |
+
return self.metadata['Name']
|
| 85 |
+
^^^^^^^^^^^^^
|
| 86 |
+
File "/usr/lib/python3.12/importlib/metadata/__init__.py", line 445, in metadata
|
| 87 |
+
or self.read_text('PKG-INFO')
|
| 88 |
+
^^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 89 |
+
File "/usr/lib/python3.12/importlib/metadata/__init__.py", line 819, in read_text
|
| 90 |
+
return self._path.joinpath(filename).read_text(encoding='utf-8')
|
| 91 |
+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 92 |
+
File "/usr/lib/python3.12/zipfile/_path/__init__.py", line 339, in read_text
|
| 93 |
+
with self.open('r', encoding, *args, **kwargs) as strm:
|
| 94 |
+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 95 |
+
File "/usr/lib/python3.12/zipfile/_path/__init__.py", line 305, in open
|
| 96 |
+
stream = self.root.open(self.at, zip_mode, pwd=pwd)
|
| 97 |
+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 98 |
+
File "/usr/lib/python3.12/zipfile/__init__.py", line 1625, in open
|
| 99 |
+
raise BadZipFile("Bad magic number for file header")
|
| 100 |
+
zipfile.BadZipFile: Bad magic number for file header
|
| 101 |
+
|
LTA_openwebtext_dualt/logs/compact_gpt2bpe_v2048_stream1024_fullycoupled_mask1_wd0p1_fp32_8gpu/lta_owt_compact_gpt2bpe_v2048_stream1024_fullycoupled_rmsnorm_nobias_adamw_wd0p1_logitnormal_m1p5_s0p8_hardce_mask1p0-1p0_fp32_ddit768x12_gbs512_8gpu_1m_20260517_141027.log
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
LTA_openwebtext_dualt/logs/ctx1024_rollin_sweep_4gpu/20260517_queued_ctx1024_sweep.log
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
[ctx1024-sweep] waiting for run=train8_rollin_len1024_rollin_p50_s4_i32_20260517_1855ctx1024bs128
|
LTA_openwebtext_dualt/logs/ctx1024_rollin_sweep_4gpu/ctx1024_sweep_selected_20260517_210705.log
ADDED
|
@@ -0,0 +1,1306 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[ctx1024-sweep] waiting for run=train8_rollin_len1024_rollin_p50_s4_i32_20260517_1855ctx1024bs128
|
| 2 |
+
[ctx1024-sweep] start stamp=ctx1024_sweep_selected_20260517_210705 len=1024 vocab=2664 out=docs/lta_samples/metrics_20260517/ctx1024_rollin_sweep_bs512_ode128_ctx1024_sweep_selected_20260517_210705
|
| 3 |
+
[ctx1024-sweep] config=p75_s4_i32_outwdm1 run=train8_ctx1024_p75_s4_i32_outwdm1_ctx1024_sweep_selected_20260517_210705 p=0.75 steps=4 infer=32 outwd=-1 sync_t=0
|
| 4 |
+
[ctx1024-sweep] train config=p75_s4_i32_outwdm1 from=0 to=1000
|
| 5 |
+
[ctx1024-sweep] eval config=p75_s4_i32_outwdm1 step=1000
|
| 6 |
+
[eval-decode-acc] train8_ctx1024_p75_s4_i32_outwdm1_ctx1024_sweep_selected_20260517_210705 step=1000 soft=none
|
| 7 |
+
[decode] max_len=1024 generated=64/64
|
| 8 |
+
{
|
| 9 |
+
"num_rows": 1,
|
| 10 |
+
"best_by_run": {
|
| 11 |
+
"train8_ctx1024_p75_s4_i32_outwdm1_ctx1024_sweep_selected_20260517_210705::none": {
|
| 12 |
+
"run": "train8_ctx1024_p75_s4_i32_outwdm1_ctx1024_sweep_selected_20260517_210705",
|
| 13 |
+
"checkpoint": "runs/train8_ctx1024_p75_s4_i32_outwdm1_ctx1024_sweep_selected_20260517_210705/step_0001000.pt",
|
| 14 |
+
"ckpt_step": 1000,
|
| 15 |
+
"endpoint_softening": "none",
|
| 16 |
+
"decode_rule": "flowmap",
|
| 17 |
+
"steps": 128,
|
| 18 |
+
"time_schedule": "logit_normal",
|
| 19 |
+
"model_t_mode": "post",
|
| 20 |
+
"final_from": "state",
|
| 21 |
+
"n_gen": 64,
|
| 22 |
+
"n_refs": 8,
|
| 23 |
+
"token_acc_mean": 0.0558013916015625,
|
| 24 |
+
"token_acc_min": 0.029296875,
|
| 25 |
+
"token_acc_max": 0.119140625,
|
| 26 |
+
"exact_acc": 0.0,
|
| 27 |
+
"exact_count": 0,
|
| 28 |
+
"exact_ref_coverage": 0.0,
|
| 29 |
+
"exact_ref_count": 0,
|
| 30 |
+
"exact_ref_hits": [],
|
| 31 |
+
"best_ref_idx": [
|
| 32 |
+
7,
|
| 33 |
+
7,
|
| 34 |
+
4,
|
| 35 |
+
7,
|
| 36 |
+
6,
|
| 37 |
+
7,
|
| 38 |
+
2,
|
| 39 |
+
6,
|
| 40 |
+
6,
|
| 41 |
+
6,
|
| 42 |
+
6,
|
| 43 |
+
6,
|
| 44 |
+
2,
|
| 45 |
+
6,
|
| 46 |
+
2,
|
| 47 |
+
6,
|
| 48 |
+
6,
|
| 49 |
+
7,
|
| 50 |
+
7,
|
| 51 |
+
6,
|
| 52 |
+
6,
|
| 53 |
+
6,
|
| 54 |
+
6,
|
| 55 |
+
4,
|
| 56 |
+
6,
|
| 57 |
+
6,
|
| 58 |
+
5,
|
| 59 |
+
6,
|
| 60 |
+
6,
|
| 61 |
+
6,
|
| 62 |
+
3,
|
| 63 |
+
6,
|
| 64 |
+
6,
|
| 65 |
+
2,
|
| 66 |
+
4,
|
| 67 |
+
4,
|
| 68 |
+
4,
|
| 69 |
+
6,
|
| 70 |
+
2,
|
| 71 |
+
5,
|
| 72 |
+
6,
|
| 73 |
+
6,
|
| 74 |
+
7,
|
| 75 |
+
6,
|
| 76 |
+
6,
|
| 77 |
+
6,
|
| 78 |
+
1,
|
| 79 |
+
6,
|
| 80 |
+
2,
|
| 81 |
+
4,
|
| 82 |
+
6,
|
| 83 |
+
6,
|
| 84 |
+
7,
|
| 85 |
+
6,
|
| 86 |
+
6,
|
| 87 |
+
5,
|
| 88 |
+
6,
|
| 89 |
+
4,
|
| 90 |
+
2,
|
| 91 |
+
6,
|
| 92 |
+
6,
|
| 93 |
+
7,
|
| 94 |
+
4,
|
| 95 |
+
6
|
| 96 |
+
],
|
| 97 |
+
"best_token_acc": [
|
| 98 |
+
0.04296875,
|
| 99 |
+
0.060546875,
|
| 100 |
+
0.0478515625,
|
| 101 |
+
0.0537109375,
|
| 102 |
+
0.0517578125,
|
| 103 |
+
0.046875,
|
| 104 |
+
0.08203125,
|
| 105 |
+
0.046875,
|
| 106 |
+
0.0341796875,
|
| 107 |
+
0.119140625,
|
| 108 |
+
0.0556640625,
|
| 109 |
+
0.0576171875,
|
| 110 |
+
0.0546875,
|
| 111 |
+
0.0556640625,
|
| 112 |
+
0.052734375,
|
| 113 |
+
0.0703125,
|
| 114 |
+
0.05859375,
|
| 115 |
+
0.0634765625,
|
| 116 |
+
0.046875,
|
| 117 |
+
0.0751953125,
|
| 118 |
+
0.08203125,
|
| 119 |
+
0.0361328125,
|
| 120 |
+
0.044921875,
|
| 121 |
+
0.0341796875,
|
| 122 |
+
0.0517578125,
|
| 123 |
+
0.072265625,
|
| 124 |
+
0.0322265625,
|
| 125 |
+
0.0615234375,
|
| 126 |
+
0.0615234375,
|
| 127 |
+
0.0517578125,
|
| 128 |
+
0.037109375,
|
| 129 |
+
0.0703125,
|
| 130 |
+
0.056640625,
|
| 131 |
+
0.0869140625,
|
| 132 |
+
0.0478515625,
|
| 133 |
+
0.0380859375,
|
| 134 |
+
0.037109375,
|
| 135 |
+
0.0419921875,
|
| 136 |
+
0.0703125,
|
| 137 |
+
0.0341796875,
|
| 138 |
+
0.056640625,
|
| 139 |
+
0.033203125,
|
| 140 |
+
0.072265625,
|
| 141 |
+
0.0380859375,
|
| 142 |
+
0.0478515625,
|
| 143 |
+
0.0380859375,
|
| 144 |
+
0.0322265625,
|
| 145 |
+
0.0693359375,
|
| 146 |
+
0.0712890625,
|
| 147 |
+
0.0419921875,
|
| 148 |
+
0.0576171875,
|
| 149 |
+
0.0869140625,
|
| 150 |
+
0.0615234375,
|
| 151 |
+
0.03515625,
|
| 152 |
+
0.0517578125,
|
| 153 |
+
0.080078125,
|
| 154 |
+
0.0380859375,
|
| 155 |
+
0.0341796875,
|
| 156 |
+
0.1171875,
|
| 157 |
+
0.0791015625,
|
| 158 |
+
0.029296875,
|
| 159 |
+
0.0478515625,
|
| 160 |
+
0.0595703125,
|
| 161 |
+
0.06640625
|
| 162 |
+
]
|
| 163 |
+
}
|
| 164 |
+
},
|
| 165 |
+
"first_exact_by_run": {}
|
| 166 |
+
}
|
| 167 |
+
RESULT config=p75_s4_i32_outwdm1 run=train8_ctx1024_p75_s4_i32_outwdm1_ctx1024_sweep_selected_20260517_210705 ckpt_step=1000 views=512000 token_acc=0.0558 exact=0/64 exact_refs=0 hits=[]
|
| 168 |
+
[ctx1024-sweep] train config=p75_s4_i32_outwdm1 from=1000 to=2000
|
| 169 |
+
[ctx1024-sweep] eval config=p75_s4_i32_outwdm1 step=2000
|
| 170 |
+
[eval-decode-acc] train8_ctx1024_p75_s4_i32_outwdm1_ctx1024_sweep_selected_20260517_210705 step=2000 soft=none
|
| 171 |
+
[decode] max_len=1024 generated=64/64
|
| 172 |
+
{
|
| 173 |
+
"num_rows": 1,
|
| 174 |
+
"best_by_run": {
|
| 175 |
+
"train8_ctx1024_p75_s4_i32_outwdm1_ctx1024_sweep_selected_20260517_210705::none": {
|
| 176 |
+
"run": "train8_ctx1024_p75_s4_i32_outwdm1_ctx1024_sweep_selected_20260517_210705",
|
| 177 |
+
"checkpoint": "runs/train8_ctx1024_p75_s4_i32_outwdm1_ctx1024_sweep_selected_20260517_210705/step_0002000.pt",
|
| 178 |
+
"ckpt_step": 2000,
|
| 179 |
+
"endpoint_softening": "none",
|
| 180 |
+
"decode_rule": "flowmap",
|
| 181 |
+
"steps": 128,
|
| 182 |
+
"time_schedule": "logit_normal",
|
| 183 |
+
"model_t_mode": "post",
|
| 184 |
+
"final_from": "state",
|
| 185 |
+
"n_gen": 64,
|
| 186 |
+
"n_refs": 8,
|
| 187 |
+
"token_acc_mean": 0.970245361328125,
|
| 188 |
+
"token_acc_min": 0.3955078125,
|
| 189 |
+
"token_acc_max": 0.9990234375,
|
| 190 |
+
"exact_acc": 0.0,
|
| 191 |
+
"exact_count": 0,
|
| 192 |
+
"exact_ref_coverage": 0.0,
|
| 193 |
+
"exact_ref_count": 0,
|
| 194 |
+
"exact_ref_hits": [],
|
| 195 |
+
"best_ref_idx": [
|
| 196 |
+
4,
|
| 197 |
+
7,
|
| 198 |
+
4,
|
| 199 |
+
4,
|
| 200 |
+
4,
|
| 201 |
+
7,
|
| 202 |
+
4,
|
| 203 |
+
4,
|
| 204 |
+
7,
|
| 205 |
+
7,
|
| 206 |
+
4,
|
| 207 |
+
1,
|
| 208 |
+
4,
|
| 209 |
+
7,
|
| 210 |
+
4,
|
| 211 |
+
7,
|
| 212 |
+
4,
|
| 213 |
+
7,
|
| 214 |
+
7,
|
| 215 |
+
5,
|
| 216 |
+
4,
|
| 217 |
+
4,
|
| 218 |
+
4,
|
| 219 |
+
4,
|
| 220 |
+
4,
|
| 221 |
+
4,
|
| 222 |
+
7,
|
| 223 |
+
2,
|
| 224 |
+
0,
|
| 225 |
+
4,
|
| 226 |
+
4,
|
| 227 |
+
7,
|
| 228 |
+
4,
|
| 229 |
+
0,
|
| 230 |
+
4,
|
| 231 |
+
4,
|
| 232 |
+
4,
|
| 233 |
+
0,
|
| 234 |
+
4,
|
| 235 |
+
4,
|
| 236 |
+
2,
|
| 237 |
+
4,
|
| 238 |
+
4,
|
| 239 |
+
3,
|
| 240 |
+
0,
|
| 241 |
+
4,
|
| 242 |
+
2,
|
| 243 |
+
0,
|
| 244 |
+
5,
|
| 245 |
+
0,
|
| 246 |
+
0,
|
| 247 |
+
7,
|
| 248 |
+
7,
|
| 249 |
+
4,
|
| 250 |
+
4,
|
| 251 |
+
4,
|
| 252 |
+
7,
|
| 253 |
+
4,
|
| 254 |
+
7,
|
| 255 |
+
0,
|
| 256 |
+
7,
|
| 257 |
+
4,
|
| 258 |
+
7,
|
| 259 |
+
4
|
| 260 |
+
],
|
| 261 |
+
"best_token_acc": [
|
| 262 |
+
0.986328125,
|
| 263 |
+
0.982421875,
|
| 264 |
+
0.92578125,
|
| 265 |
+
0.9404296875,
|
| 266 |
+
0.9599609375,
|
| 267 |
+
0.990234375,
|
| 268 |
+
0.990234375,
|
| 269 |
+
0.98828125,
|
| 270 |
+
0.3955078125,
|
| 271 |
+
0.9853515625,
|
| 272 |
+
0.939453125,
|
| 273 |
+
0.984375,
|
| 274 |
+
0.97265625,
|
| 275 |
+
0.984375,
|
| 276 |
+
0.978515625,
|
| 277 |
+
0.9833984375,
|
| 278 |
+
0.9833984375,
|
| 279 |
+
0.982421875,
|
| 280 |
+
0.8916015625,
|
| 281 |
+
0.998046875,
|
| 282 |
+
0.9794921875,
|
| 283 |
+
0.9873046875,
|
| 284 |
+
0.986328125,
|
| 285 |
+
0.9658203125,
|
| 286 |
+
0.9912109375,
|
| 287 |
+
0.9873046875,
|
| 288 |
+
0.9873046875,
|
| 289 |
+
0.9814453125,
|
| 290 |
+
0.994140625,
|
| 291 |
+
0.8408203125,
|
| 292 |
+
0.98828125,
|
| 293 |
+
0.9912109375,
|
| 294 |
+
0.9658203125,
|
| 295 |
+
0.98046875,
|
| 296 |
+
0.9833984375,
|
| 297 |
+
0.98828125,
|
| 298 |
+
0.990234375,
|
| 299 |
+
0.99609375,
|
| 300 |
+
0.9873046875,
|
| 301 |
+
0.9833984375,
|
| 302 |
+
0.994140625,
|
| 303 |
+
0.9892578125,
|
| 304 |
+
0.9658203125,
|
| 305 |
+
0.9697265625,
|
| 306 |
+
0.9873046875,
|
| 307 |
+
0.9853515625,
|
| 308 |
+
0.982421875,
|
| 309 |
+
0.99609375,
|
| 310 |
+
0.9990234375,
|
| 311 |
+
0.9931640625,
|
| 312 |
+
0.9970703125,
|
| 313 |
+
0.9912109375,
|
| 314 |
+
0.984375,
|
| 315 |
+
0.9892578125,
|
| 316 |
+
0.9677734375,
|
| 317 |
+
0.990234375,
|
| 318 |
+
0.98828125,
|
| 319 |
+
0.990234375,
|
| 320 |
+
0.986328125,
|
| 321 |
+
0.994140625,
|
| 322 |
+
0.98828125,
|
| 323 |
+
0.98828125,
|
| 324 |
+
0.9892578125,
|
| 325 |
+
0.990234375
|
| 326 |
+
]
|
| 327 |
+
}
|
| 328 |
+
},
|
| 329 |
+
"first_exact_by_run": {}
|
| 330 |
+
}
|
| 331 |
+
RESULT config=p75_s4_i32_outwdm1 run=train8_ctx1024_p75_s4_i32_outwdm1_ctx1024_sweep_selected_20260517_210705 ckpt_step=2000 views=1024000 token_acc=0.9702 exact=0/64 exact_refs=0 hits=[]
|
| 332 |
+
[ctx1024-sweep] train config=p75_s4_i32_outwdm1 from=2000 to=3000
|
| 333 |
+
[ctx1024-sweep] eval config=p75_s4_i32_outwdm1 step=3000
|
| 334 |
+
[eval-decode-acc] train8_ctx1024_p75_s4_i32_outwdm1_ctx1024_sweep_selected_20260517_210705 step=3000 soft=none
|
| 335 |
+
[decode] max_len=1024 generated=64/64
|
| 336 |
+
{
|
| 337 |
+
"num_rows": 1,
|
| 338 |
+
"best_by_run": {
|
| 339 |
+
"train8_ctx1024_p75_s4_i32_outwdm1_ctx1024_sweep_selected_20260517_210705::none": {
|
| 340 |
+
"run": "train8_ctx1024_p75_s4_i32_outwdm1_ctx1024_sweep_selected_20260517_210705",
|
| 341 |
+
"checkpoint": "runs/train8_ctx1024_p75_s4_i32_outwdm1_ctx1024_sweep_selected_20260517_210705/step_0003000.pt",
|
| 342 |
+
"ckpt_step": 3000,
|
| 343 |
+
"endpoint_softening": "none",
|
| 344 |
+
"decode_rule": "flowmap",
|
| 345 |
+
"steps": 128,
|
| 346 |
+
"time_schedule": "logit_normal",
|
| 347 |
+
"model_t_mode": "post",
|
| 348 |
+
"final_from": "state",
|
| 349 |
+
"n_gen": 64,
|
| 350 |
+
"n_refs": 8,
|
| 351 |
+
"token_acc_mean": 0.919891357421875,
|
| 352 |
+
"token_acc_min": 0.8154296875,
|
| 353 |
+
"token_acc_max": 0.97265625,
|
| 354 |
+
"exact_acc": 0.0,
|
| 355 |
+
"exact_count": 0,
|
| 356 |
+
"exact_ref_coverage": 0.0,
|
| 357 |
+
"exact_ref_count": 0,
|
| 358 |
+
"exact_ref_hits": [],
|
| 359 |
+
"best_ref_idx": [
|
| 360 |
+
6,
|
| 361 |
+
6,
|
| 362 |
+
5,
|
| 363 |
+
6,
|
| 364 |
+
6,
|
| 365 |
+
1,
|
| 366 |
+
5,
|
| 367 |
+
6,
|
| 368 |
+
6,
|
| 369 |
+
5,
|
| 370 |
+
6,
|
| 371 |
+
3,
|
| 372 |
+
6,
|
| 373 |
+
3,
|
| 374 |
+
1,
|
| 375 |
+
6,
|
| 376 |
+
6,
|
| 377 |
+
6,
|
| 378 |
+
6,
|
| 379 |
+
6,
|
| 380 |
+
3,
|
| 381 |
+
6,
|
| 382 |
+
4,
|
| 383 |
+
6,
|
| 384 |
+
7,
|
| 385 |
+
6,
|
| 386 |
+
5,
|
| 387 |
+
1,
|
| 388 |
+
6,
|
| 389 |
+
7,
|
| 390 |
+
6,
|
| 391 |
+
6,
|
| 392 |
+
6,
|
| 393 |
+
6,
|
| 394 |
+
6,
|
| 395 |
+
6,
|
| 396 |
+
3,
|
| 397 |
+
6,
|
| 398 |
+
6,
|
| 399 |
+
6,
|
| 400 |
+
6,
|
| 401 |
+
3,
|
| 402 |
+
6,
|
| 403 |
+
6,
|
| 404 |
+
6,
|
| 405 |
+
7,
|
| 406 |
+
5,
|
| 407 |
+
6,
|
| 408 |
+
0,
|
| 409 |
+
6,
|
| 410 |
+
6,
|
| 411 |
+
6,
|
| 412 |
+
6,
|
| 413 |
+
6,
|
| 414 |
+
6,
|
| 415 |
+
6,
|
| 416 |
+
5,
|
| 417 |
+
6,
|
| 418 |
+
6,
|
| 419 |
+
6,
|
| 420 |
+
6,
|
| 421 |
+
6,
|
| 422 |
+
6,
|
| 423 |
+
6
|
| 424 |
+
],
|
| 425 |
+
"best_token_acc": [
|
| 426 |
+
0.921875,
|
| 427 |
+
0.908203125,
|
| 428 |
+
0.96484375,
|
| 429 |
+
0.90234375,
|
| 430 |
+
0.912109375,
|
| 431 |
+
0.955078125,
|
| 432 |
+
0.94921875,
|
| 433 |
+
0.91796875,
|
| 434 |
+
0.9111328125,
|
| 435 |
+
0.9404296875,
|
| 436 |
+
0.9130859375,
|
| 437 |
+
0.939453125,
|
| 438 |
+
0.916015625,
|
| 439 |
+
0.955078125,
|
| 440 |
+
0.9189453125,
|
| 441 |
+
0.9267578125,
|
| 442 |
+
0.9091796875,
|
| 443 |
+
0.8955078125,
|
| 444 |
+
0.9296875,
|
| 445 |
+
0.9248046875,
|
| 446 |
+
0.94921875,
|
| 447 |
+
0.9013671875,
|
| 448 |
+
0.9609375,
|
| 449 |
+
0.9091796875,
|
| 450 |
+
0.962890625,
|
| 451 |
+
0.9140625,
|
| 452 |
+
0.9521484375,
|
| 453 |
+
0.9482421875,
|
| 454 |
+
0.8896484375,
|
| 455 |
+
0.966796875,
|
| 456 |
+
0.916015625,
|
| 457 |
+
0.8916015625,
|
| 458 |
+
0.908203125,
|
| 459 |
+
0.912109375,
|
| 460 |
+
0.9228515625,
|
| 461 |
+
0.900390625,
|
| 462 |
+
0.916015625,
|
| 463 |
+
0.89453125,
|
| 464 |
+
0.9091796875,
|
| 465 |
+
0.92578125,
|
| 466 |
+
0.91015625,
|
| 467 |
+
0.919921875,
|
| 468 |
+
0.88671875,
|
| 469 |
+
0.9208984375,
|
| 470 |
+
0.9091796875,
|
| 471 |
+
0.9619140625,
|
| 472 |
+
0.9697265625,
|
| 473 |
+
0.90625,
|
| 474 |
+
0.8154296875,
|
| 475 |
+
0.91796875,
|
| 476 |
+
0.9033203125,
|
| 477 |
+
0.8828125,
|
| 478 |
+
0.90625,
|
| 479 |
+
0.9091796875,
|
| 480 |
+
0.9267578125,
|
| 481 |
+
0.916015625,
|
| 482 |
+
0.97265625,
|
| 483 |
+
0.892578125,
|
| 484 |
+
0.9169921875,
|
| 485 |
+
0.912109375,
|
| 486 |
+
0.923828125,
|
| 487 |
+
0.9033203125,
|
| 488 |
+
0.9150390625,
|
| 489 |
+
0.9111328125
|
| 490 |
+
]
|
| 491 |
+
}
|
| 492 |
+
},
|
| 493 |
+
"first_exact_by_run": {}
|
| 494 |
+
}
|
| 495 |
+
RESULT config=p75_s4_i32_outwdm1 run=train8_ctx1024_p75_s4_i32_outwdm1_ctx1024_sweep_selected_20260517_210705 ckpt_step=3000 views=1536000 token_acc=0.9199 exact=0/64 exact_refs=0 hits=[]
|
| 496 |
+
[ctx1024-sweep] train config=p75_s4_i32_outwdm1 from=3000 to=4000
|
| 497 |
+
[ctx1024-sweep] eval config=p75_s4_i32_outwdm1 step=4000
|
| 498 |
+
[eval-decode-acc] train8_ctx1024_p75_s4_i32_outwdm1_ctx1024_sweep_selected_20260517_210705 step=4000 soft=none
|
| 499 |
+
[decode] max_len=1024 generated=64/64
|
| 500 |
+
{
|
| 501 |
+
"num_rows": 1,
|
| 502 |
+
"best_by_run": {
|
| 503 |
+
"train8_ctx1024_p75_s4_i32_outwdm1_ctx1024_sweep_selected_20260517_210705::none": {
|
| 504 |
+
"run": "train8_ctx1024_p75_s4_i32_outwdm1_ctx1024_sweep_selected_20260517_210705",
|
| 505 |
+
"checkpoint": "runs/train8_ctx1024_p75_s4_i32_outwdm1_ctx1024_sweep_selected_20260517_210705/step_0004000.pt",
|
| 506 |
+
"ckpt_step": 4000,
|
| 507 |
+
"endpoint_softening": "none",
|
| 508 |
+
"decode_rule": "flowmap",
|
| 509 |
+
"steps": 128,
|
| 510 |
+
"time_schedule": "logit_normal",
|
| 511 |
+
"model_t_mode": "post",
|
| 512 |
+
"final_from": "state",
|
| 513 |
+
"n_gen": 64,
|
| 514 |
+
"n_refs": 8,
|
| 515 |
+
"token_acc_mean": 0.972259521484375,
|
| 516 |
+
"token_acc_min": 0.951171875,
|
| 517 |
+
"token_acc_max": 1.0,
|
| 518 |
+
"exact_acc": 0.015625,
|
| 519 |
+
"exact_count": 1,
|
| 520 |
+
"exact_ref_coverage": 0.125,
|
| 521 |
+
"exact_ref_count": 1,
|
| 522 |
+
"exact_ref_hits": [
|
| 523 |
+
4
|
| 524 |
+
],
|
| 525 |
+
"best_ref_idx": [
|
| 526 |
+
6,
|
| 527 |
+
6,
|
| 528 |
+
6,
|
| 529 |
+
6,
|
| 530 |
+
7,
|
| 531 |
+
6,
|
| 532 |
+
6,
|
| 533 |
+
2,
|
| 534 |
+
6,
|
| 535 |
+
6,
|
| 536 |
+
6,
|
| 537 |
+
7,
|
| 538 |
+
7,
|
| 539 |
+
6,
|
| 540 |
+
6,
|
| 541 |
+
7,
|
| 542 |
+
6,
|
| 543 |
+
6,
|
| 544 |
+
5,
|
| 545 |
+
6,
|
| 546 |
+
6,
|
| 547 |
+
6,
|
| 548 |
+
6,
|
| 549 |
+
6,
|
| 550 |
+
6,
|
| 551 |
+
7,
|
| 552 |
+
2,
|
| 553 |
+
7,
|
| 554 |
+
7,
|
| 555 |
+
6,
|
| 556 |
+
6,
|
| 557 |
+
6,
|
| 558 |
+
6,
|
| 559 |
+
6,
|
| 560 |
+
0,
|
| 561 |
+
6,
|
| 562 |
+
6,
|
| 563 |
+
0,
|
| 564 |
+
6,
|
| 565 |
+
6,
|
| 566 |
+
6,
|
| 567 |
+
6,
|
| 568 |
+
6,
|
| 569 |
+
6,
|
| 570 |
+
6,
|
| 571 |
+
6,
|
| 572 |
+
6,
|
| 573 |
+
7,
|
| 574 |
+
6,
|
| 575 |
+
4,
|
| 576 |
+
6,
|
| 577 |
+
7,
|
| 578 |
+
6,
|
| 579 |
+
6,
|
| 580 |
+
6,
|
| 581 |
+
2,
|
| 582 |
+
6,
|
| 583 |
+
6,
|
| 584 |
+
2,
|
| 585 |
+
6,
|
| 586 |
+
6,
|
| 587 |
+
6,
|
| 588 |
+
2,
|
| 589 |
+
0
|
| 590 |
+
],
|
| 591 |
+
"best_token_acc": [
|
| 592 |
+
0.951171875,
|
| 593 |
+
0.9658203125,
|
| 594 |
+
0.955078125,
|
| 595 |
+
0.9658203125,
|
| 596 |
+
0.9931640625,
|
| 597 |
+
0.96484375,
|
| 598 |
+
0.970703125,
|
| 599 |
+
0.9912109375,
|
| 600 |
+
0.970703125,
|
| 601 |
+
0.9677734375,
|
| 602 |
+
0.966796875,
|
| 603 |
+
0.9931640625,
|
| 604 |
+
0.998046875,
|
| 605 |
+
0.96875,
|
| 606 |
+
0.9619140625,
|
| 607 |
+
0.994140625,
|
| 608 |
+
0.9619140625,
|
| 609 |
+
0.9560546875,
|
| 610 |
+
0.998046875,
|
| 611 |
+
0.966796875,
|
| 612 |
+
0.9638671875,
|
| 613 |
+
0.962890625,
|
| 614 |
+
0.9609375,
|
| 615 |
+
0.966796875,
|
| 616 |
+
0.9658203125,
|
| 617 |
+
0.994140625,
|
| 618 |
+
0.98828125,
|
| 619 |
+
0.9951171875,
|
| 620 |
+
0.9921875,
|
| 621 |
+
0.951171875,
|
| 622 |
+
0.9697265625,
|
| 623 |
+
0.9736328125,
|
| 624 |
+
0.966796875,
|
| 625 |
+
0.958984375,
|
| 626 |
+
0.9990234375,
|
| 627 |
+
0.9658203125,
|
| 628 |
+
0.9677734375,
|
| 629 |
+
0.9931640625,
|
| 630 |
+
0.958984375,
|
| 631 |
+
0.9599609375,
|
| 632 |
+
0.966796875,
|
| 633 |
+
0.9638671875,
|
| 634 |
+
0.9658203125,
|
| 635 |
+
0.966796875,
|
| 636 |
+
0.962890625,
|
| 637 |
+
0.96875,
|
| 638 |
+
0.953125,
|
| 639 |
+
0.9970703125,
|
| 640 |
+
0.958984375,
|
| 641 |
+
1.0,
|
| 642 |
+
0.9560546875,
|
| 643 |
+
0.99609375,
|
| 644 |
+
0.9697265625,
|
| 645 |
+
0.953125,
|
| 646 |
+
0.9658203125,
|
| 647 |
+
0.9921875,
|
| 648 |
+
0.96875,
|
| 649 |
+
0.962890625,
|
| 650 |
+
0.9912109375,
|
| 651 |
+
0.9677734375,
|
| 652 |
+
0.9658203125,
|
| 653 |
+
0.9638671875,
|
| 654 |
+
0.9912109375,
|
| 655 |
+
0.958984375
|
| 656 |
+
]
|
| 657 |
+
}
|
| 658 |
+
},
|
| 659 |
+
"first_exact_by_run": {
|
| 660 |
+
"train8_ctx1024_p75_s4_i32_outwdm1_ctx1024_sweep_selected_20260517_210705::none": {
|
| 661 |
+
"run": "train8_ctx1024_p75_s4_i32_outwdm1_ctx1024_sweep_selected_20260517_210705",
|
| 662 |
+
"checkpoint": "runs/train8_ctx1024_p75_s4_i32_outwdm1_ctx1024_sweep_selected_20260517_210705/step_0004000.pt",
|
| 663 |
+
"ckpt_step": 4000,
|
| 664 |
+
"endpoint_softening": "none",
|
| 665 |
+
"decode_rule": "flowmap",
|
| 666 |
+
"steps": 128,
|
| 667 |
+
"time_schedule": "logit_normal",
|
| 668 |
+
"model_t_mode": "post",
|
| 669 |
+
"final_from": "state",
|
| 670 |
+
"n_gen": 64,
|
| 671 |
+
"n_refs": 8,
|
| 672 |
+
"token_acc_mean": 0.972259521484375,
|
| 673 |
+
"token_acc_min": 0.951171875,
|
| 674 |
+
"token_acc_max": 1.0,
|
| 675 |
+
"exact_acc": 0.015625,
|
| 676 |
+
"exact_count": 1,
|
| 677 |
+
"exact_ref_coverage": 0.125,
|
| 678 |
+
"exact_ref_count": 1,
|
| 679 |
+
"exact_ref_hits": [
|
| 680 |
+
4
|
| 681 |
+
],
|
| 682 |
+
"best_ref_idx": [
|
| 683 |
+
6,
|
| 684 |
+
6,
|
| 685 |
+
6,
|
| 686 |
+
6,
|
| 687 |
+
7,
|
| 688 |
+
6,
|
| 689 |
+
6,
|
| 690 |
+
2,
|
| 691 |
+
6,
|
| 692 |
+
6,
|
| 693 |
+
6,
|
| 694 |
+
7,
|
| 695 |
+
7,
|
| 696 |
+
6,
|
| 697 |
+
6,
|
| 698 |
+
7,
|
| 699 |
+
6,
|
| 700 |
+
6,
|
| 701 |
+
5,
|
| 702 |
+
6,
|
| 703 |
+
6,
|
| 704 |
+
6,
|
| 705 |
+
6,
|
| 706 |
+
6,
|
| 707 |
+
6,
|
| 708 |
+
7,
|
| 709 |
+
2,
|
| 710 |
+
7,
|
| 711 |
+
7,
|
| 712 |
+
6,
|
| 713 |
+
6,
|
| 714 |
+
6,
|
| 715 |
+
6,
|
| 716 |
+
6,
|
| 717 |
+
0,
|
| 718 |
+
6,
|
| 719 |
+
6,
|
| 720 |
+
0,
|
| 721 |
+
6,
|
| 722 |
+
6,
|
| 723 |
+
6,
|
| 724 |
+
6,
|
| 725 |
+
6,
|
| 726 |
+
6,
|
| 727 |
+
6,
|
| 728 |
+
6,
|
| 729 |
+
6,
|
| 730 |
+
7,
|
| 731 |
+
6,
|
| 732 |
+
4,
|
| 733 |
+
6,
|
| 734 |
+
7,
|
| 735 |
+
6,
|
| 736 |
+
6,
|
| 737 |
+
6,
|
| 738 |
+
2,
|
| 739 |
+
6,
|
| 740 |
+
6,
|
| 741 |
+
2,
|
| 742 |
+
6,
|
| 743 |
+
6,
|
| 744 |
+
6,
|
| 745 |
+
2,
|
| 746 |
+
0
|
| 747 |
+
],
|
| 748 |
+
"best_token_acc": [
|
| 749 |
+
0.951171875,
|
| 750 |
+
0.9658203125,
|
| 751 |
+
0.955078125,
|
| 752 |
+
0.9658203125,
|
| 753 |
+
0.9931640625,
|
| 754 |
+
0.96484375,
|
| 755 |
+
0.970703125,
|
| 756 |
+
0.9912109375,
|
| 757 |
+
0.970703125,
|
| 758 |
+
0.9677734375,
|
| 759 |
+
0.966796875,
|
| 760 |
+
0.9931640625,
|
| 761 |
+
0.998046875,
|
| 762 |
+
0.96875,
|
| 763 |
+
0.9619140625,
|
| 764 |
+
0.994140625,
|
| 765 |
+
0.9619140625,
|
| 766 |
+
0.9560546875,
|
| 767 |
+
0.998046875,
|
| 768 |
+
0.966796875,
|
| 769 |
+
0.9638671875,
|
| 770 |
+
0.962890625,
|
| 771 |
+
0.9609375,
|
| 772 |
+
0.966796875,
|
| 773 |
+
0.9658203125,
|
| 774 |
+
0.994140625,
|
| 775 |
+
0.98828125,
|
| 776 |
+
0.9951171875,
|
| 777 |
+
0.9921875,
|
| 778 |
+
0.951171875,
|
| 779 |
+
0.9697265625,
|
| 780 |
+
0.9736328125,
|
| 781 |
+
0.966796875,
|
| 782 |
+
0.958984375,
|
| 783 |
+
0.9990234375,
|
| 784 |
+
0.9658203125,
|
| 785 |
+
0.9677734375,
|
| 786 |
+
0.9931640625,
|
| 787 |
+
0.958984375,
|
| 788 |
+
0.9599609375,
|
| 789 |
+
0.966796875,
|
| 790 |
+
0.9638671875,
|
| 791 |
+
0.9658203125,
|
| 792 |
+
0.966796875,
|
| 793 |
+
0.962890625,
|
| 794 |
+
0.96875,
|
| 795 |
+
0.953125,
|
| 796 |
+
0.9970703125,
|
| 797 |
+
0.958984375,
|
| 798 |
+
1.0,
|
| 799 |
+
0.9560546875,
|
| 800 |
+
0.99609375,
|
| 801 |
+
0.9697265625,
|
| 802 |
+
0.953125,
|
| 803 |
+
0.9658203125,
|
| 804 |
+
0.9921875,
|
| 805 |
+
0.96875,
|
| 806 |
+
0.962890625,
|
| 807 |
+
0.9912109375,
|
| 808 |
+
0.9677734375,
|
| 809 |
+
0.9658203125,
|
| 810 |
+
0.9638671875,
|
| 811 |
+
0.9912109375,
|
| 812 |
+
0.958984375
|
| 813 |
+
]
|
| 814 |
+
}
|
| 815 |
+
}
|
| 816 |
+
}
|
| 817 |
+
RESULT config=p75_s4_i32_outwdm1 run=train8_ctx1024_p75_s4_i32_outwdm1_ctx1024_sweep_selected_20260517_210705 ckpt_step=4000 views=2048000 token_acc=0.9723 exact=1/64 exact_refs=1 hits=[4]
|
| 818 |
+
[ctx1024-sweep] train config=p75_s4_i32_outwdm1 from=4000 to=5000
|
| 819 |
+
[ctx1024-sweep] eval config=p75_s4_i32_outwdm1 step=5000
|
| 820 |
+
[eval-decode-acc] train8_ctx1024_p75_s4_i32_outwdm1_ctx1024_sweep_selected_20260517_210705 step=5000 soft=none
|
| 821 |
+
[decode] max_len=1024 generated=64/64
|
| 822 |
+
{
|
| 823 |
+
"num_rows": 1,
|
| 824 |
+
"best_by_run": {
|
| 825 |
+
"train8_ctx1024_p75_s4_i32_outwdm1_ctx1024_sweep_selected_20260517_210705::none": {
|
| 826 |
+
"run": "train8_ctx1024_p75_s4_i32_outwdm1_ctx1024_sweep_selected_20260517_210705",
|
| 827 |
+
"checkpoint": "runs/train8_ctx1024_p75_s4_i32_outwdm1_ctx1024_sweep_selected_20260517_210705/step_0005000.pt",
|
| 828 |
+
"ckpt_step": 5000,
|
| 829 |
+
"endpoint_softening": "none",
|
| 830 |
+
"decode_rule": "flowmap",
|
| 831 |
+
"steps": 128,
|
| 832 |
+
"time_schedule": "logit_normal",
|
| 833 |
+
"model_t_mode": "post",
|
| 834 |
+
"final_from": "state",
|
| 835 |
+
"n_gen": 64,
|
| 836 |
+
"n_refs": 8,
|
| 837 |
+
"token_acc_mean": 0.9835357666015625,
|
| 838 |
+
"token_acc_min": 0.587890625,
|
| 839 |
+
"token_acc_max": 0.9970703125,
|
| 840 |
+
"exact_acc": 0.0,
|
| 841 |
+
"exact_count": 0,
|
| 842 |
+
"exact_ref_coverage": 0.0,
|
| 843 |
+
"exact_ref_count": 0,
|
| 844 |
+
"exact_ref_hits": [],
|
| 845 |
+
"best_ref_idx": [
|
| 846 |
+
6,
|
| 847 |
+
5,
|
| 848 |
+
6,
|
| 849 |
+
6,
|
| 850 |
+
5,
|
| 851 |
+
6,
|
| 852 |
+
6,
|
| 853 |
+
6,
|
| 854 |
+
6,
|
| 855 |
+
6,
|
| 856 |
+
0,
|
| 857 |
+
6,
|
| 858 |
+
5,
|
| 859 |
+
7,
|
| 860 |
+
1,
|
| 861 |
+
6,
|
| 862 |
+
3,
|
| 863 |
+
0,
|
| 864 |
+
6,
|
| 865 |
+
3,
|
| 866 |
+
6,
|
| 867 |
+
6,
|
| 868 |
+
6,
|
| 869 |
+
6,
|
| 870 |
+
5,
|
| 871 |
+
0,
|
| 872 |
+
5,
|
| 873 |
+
5,
|
| 874 |
+
6,
|
| 875 |
+
1,
|
| 876 |
+
6,
|
| 877 |
+
6,
|
| 878 |
+
1,
|
| 879 |
+
2,
|
| 880 |
+
1,
|
| 881 |
+
4,
|
| 882 |
+
6,
|
| 883 |
+
2,
|
| 884 |
+
6,
|
| 885 |
+
4,
|
| 886 |
+
6,
|
| 887 |
+
6,
|
| 888 |
+
6,
|
| 889 |
+
6,
|
| 890 |
+
0,
|
| 891 |
+
6,
|
| 892 |
+
6,
|
| 893 |
+
0,
|
| 894 |
+
6,
|
| 895 |
+
1,
|
| 896 |
+
5,
|
| 897 |
+
1,
|
| 898 |
+
6,
|
| 899 |
+
6,
|
| 900 |
+
6,
|
| 901 |
+
6,
|
| 902 |
+
6,
|
| 903 |
+
6,
|
| 904 |
+
6,
|
| 905 |
+
6,
|
| 906 |
+
1,
|
| 907 |
+
6,
|
| 908 |
+
3,
|
| 909 |
+
0
|
| 910 |
+
],
|
| 911 |
+
"best_token_acc": [
|
| 912 |
+
0.98828125,
|
| 913 |
+
0.99609375,
|
| 914 |
+
0.9921875,
|
| 915 |
+
0.9931640625,
|
| 916 |
+
0.9970703125,
|
| 917 |
+
0.9912109375,
|
| 918 |
+
0.9921875,
|
| 919 |
+
0.9912109375,
|
| 920 |
+
0.986328125,
|
| 921 |
+
0.9931640625,
|
| 922 |
+
0.9931640625,
|
| 923 |
+
0.587890625,
|
| 924 |
+
0.994140625,
|
| 925 |
+
0.9814453125,
|
| 926 |
+
0.9951171875,
|
| 927 |
+
0.9951171875,
|
| 928 |
+
0.97265625,
|
| 929 |
+
0.9541015625,
|
| 930 |
+
0.9833984375,
|
| 931 |
+
0.984375,
|
| 932 |
+
0.9921875,
|
| 933 |
+
0.98828125,
|
| 934 |
+
0.9931640625,
|
| 935 |
+
0.994140625,
|
| 936 |
+
0.9970703125,
|
| 937 |
+
0.9921875,
|
| 938 |
+
0.99609375,
|
| 939 |
+
0.9931640625,
|
| 940 |
+
0.9931640625,
|
| 941 |
+
0.9814453125,
|
| 942 |
+
0.994140625,
|
| 943 |
+
0.9892578125,
|
| 944 |
+
0.9833984375,
|
| 945 |
+
0.9736328125,
|
| 946 |
+
0.9970703125,
|
| 947 |
+
0.984375,
|
| 948 |
+
0.9921875,
|
| 949 |
+
0.98046875,
|
| 950 |
+
0.98828125,
|
| 951 |
+
0.9931640625,
|
| 952 |
+
0.994140625,
|
| 953 |
+
0.9833984375,
|
| 954 |
+
0.9931640625,
|
| 955 |
+
0.9931640625,
|
| 956 |
+
0.9921875,
|
| 957 |
+
0.9931640625,
|
| 958 |
+
0.9921875,
|
| 959 |
+
0.9970703125,
|
| 960 |
+
0.9921875,
|
| 961 |
+
0.994140625,
|
| 962 |
+
0.994140625,
|
| 963 |
+
0.9892578125,
|
| 964 |
+
0.9951171875,
|
| 965 |
+
0.9765625,
|
| 966 |
+
0.9921875,
|
| 967 |
+
0.98828125,
|
| 968 |
+
0.9951171875,
|
| 969 |
+
0.9931640625,
|
| 970 |
+
0.9921875,
|
| 971 |
+
0.990234375,
|
| 972 |
+
0.9970703125,
|
| 973 |
+
0.9794921875,
|
| 974 |
+
0.990234375,
|
| 975 |
+
0.9892578125
|
| 976 |
+
]
|
| 977 |
+
}
|
| 978 |
+
},
|
| 979 |
+
"first_exact_by_run": {}
|
| 980 |
+
}
|
| 981 |
+
RESULT config=p75_s4_i32_outwdm1 run=train8_ctx1024_p75_s4_i32_outwdm1_ctx1024_sweep_selected_20260517_210705 ckpt_step=5000 views=2560000 token_acc=0.9835 exact=0/64 exact_refs=0 hits=[]
|
| 982 |
+
[ctx1024-sweep] train config=p75_s4_i32_outwdm1 from=5000 to=6000
|
| 983 |
+
[ctx1024-sweep] eval config=p75_s4_i32_outwdm1 step=6000
|
| 984 |
+
[eval-decode-acc] train8_ctx1024_p75_s4_i32_outwdm1_ctx1024_sweep_selected_20260517_210705 step=6000 soft=none
|
| 985 |
+
[decode] max_len=1024 generated=64/64
|
| 986 |
+
{
|
| 987 |
+
"num_rows": 1,
|
| 988 |
+
"best_by_run": {
|
| 989 |
+
"train8_ctx1024_p75_s4_i32_outwdm1_ctx1024_sweep_selected_20260517_210705::none": {
|
| 990 |
+
"run": "train8_ctx1024_p75_s4_i32_outwdm1_ctx1024_sweep_selected_20260517_210705",
|
| 991 |
+
"checkpoint": "runs/train8_ctx1024_p75_s4_i32_outwdm1_ctx1024_sweep_selected_20260517_210705/step_0006000.pt",
|
| 992 |
+
"ckpt_step": 6000,
|
| 993 |
+
"endpoint_softening": "none",
|
| 994 |
+
"decode_rule": "flowmap",
|
| 995 |
+
"steps": 128,
|
| 996 |
+
"time_schedule": "logit_normal",
|
| 997 |
+
"model_t_mode": "post",
|
| 998 |
+
"final_from": "state",
|
| 999 |
+
"n_gen": 64,
|
| 1000 |
+
"n_refs": 8,
|
| 1001 |
+
"token_acc_mean": 0.9924468994140625,
|
| 1002 |
+
"token_acc_min": 0.9052734375,
|
| 1003 |
+
"token_acc_max": 1.0,
|
| 1004 |
+
"exact_acc": 0.0625,
|
| 1005 |
+
"exact_count": 4,
|
| 1006 |
+
"exact_ref_coverage": 0.25,
|
| 1007 |
+
"exact_ref_count": 2,
|
| 1008 |
+
"exact_ref_hits": [
|
| 1009 |
+
3,
|
| 1010 |
+
5
|
| 1011 |
+
],
|
| 1012 |
+
"best_ref_idx": [
|
| 1013 |
+
1,
|
| 1014 |
+
6,
|
| 1015 |
+
6,
|
| 1016 |
+
5,
|
| 1017 |
+
4,
|
| 1018 |
+
3,
|
| 1019 |
+
5,
|
| 1020 |
+
6,
|
| 1021 |
+
6,
|
| 1022 |
+
5,
|
| 1023 |
+
6,
|
| 1024 |
+
1,
|
| 1025 |
+
2,
|
| 1026 |
+
3,
|
| 1027 |
+
4,
|
| 1028 |
+
1,
|
| 1029 |
+
1,
|
| 1030 |
+
6,
|
| 1031 |
+
2,
|
| 1032 |
+
3,
|
| 1033 |
+
1,
|
| 1034 |
+
6,
|
| 1035 |
+
6,
|
| 1036 |
+
0,
|
| 1037 |
+
2,
|
| 1038 |
+
6,
|
| 1039 |
+
1,
|
| 1040 |
+
4,
|
| 1041 |
+
6,
|
| 1042 |
+
6,
|
| 1043 |
+
6,
|
| 1044 |
+
1,
|
| 1045 |
+
0,
|
| 1046 |
+
1,
|
| 1047 |
+
6,
|
| 1048 |
+
7,
|
| 1049 |
+
6,
|
| 1050 |
+
1,
|
| 1051 |
+
5,
|
| 1052 |
+
2,
|
| 1053 |
+
6,
|
| 1054 |
+
1,
|
| 1055 |
+
6,
|
| 1056 |
+
6,
|
| 1057 |
+
7,
|
| 1058 |
+
1,
|
| 1059 |
+
6,
|
| 1060 |
+
6,
|
| 1061 |
+
2,
|
| 1062 |
+
2,
|
| 1063 |
+
6,
|
| 1064 |
+
1,
|
| 1065 |
+
6,
|
| 1066 |
+
7,
|
| 1067 |
+
5,
|
| 1068 |
+
5,
|
| 1069 |
+
3,
|
| 1070 |
+
6,
|
| 1071 |
+
1,
|
| 1072 |
+
1,
|
| 1073 |
+
5,
|
| 1074 |
+
6,
|
| 1075 |
+
2,
|
| 1076 |
+
6
|
| 1077 |
+
],
|
| 1078 |
+
"best_token_acc": [
|
| 1079 |
+
0.9951171875,
|
| 1080 |
+
0.99609375,
|
| 1081 |
+
0.998046875,
|
| 1082 |
+
1.0,
|
| 1083 |
+
0.9453125,
|
| 1084 |
+
0.9970703125,
|
| 1085 |
+
0.9990234375,
|
| 1086 |
+
0.9970703125,
|
| 1087 |
+
0.9990234375,
|
| 1088 |
+
1.0,
|
| 1089 |
+
0.9990234375,
|
| 1090 |
+
0.99609375,
|
| 1091 |
+
0.9970703125,
|
| 1092 |
+
0.9990234375,
|
| 1093 |
+
0.9951171875,
|
| 1094 |
+
0.9990234375,
|
| 1095 |
+
0.9990234375,
|
| 1096 |
+
0.998046875,
|
| 1097 |
+
0.9970703125,
|
| 1098 |
+
0.9990234375,
|
| 1099 |
+
0.998046875,
|
| 1100 |
+
0.998046875,
|
| 1101 |
+
0.994140625,
|
| 1102 |
+
0.9052734375,
|
| 1103 |
+
0.9970703125,
|
| 1104 |
+
0.9970703125,
|
| 1105 |
+
0.9990234375,
|
| 1106 |
+
0.994140625,
|
| 1107 |
+
0.998046875,
|
| 1108 |
+
0.9970703125,
|
| 1109 |
+
0.998046875,
|
| 1110 |
+
0.9970703125,
|
| 1111 |
+
0.9189453125,
|
| 1112 |
+
0.9970703125,
|
| 1113 |
+
0.9951171875,
|
| 1114 |
+
0.9736328125,
|
| 1115 |
+
0.998046875,
|
| 1116 |
+
0.9970703125,
|
| 1117 |
+
0.9970703125,
|
| 1118 |
+
0.9970703125,
|
| 1119 |
+
0.9970703125,
|
| 1120 |
+
0.9970703125,
|
| 1121 |
+
0.9970703125,
|
| 1122 |
+
0.9970703125,
|
| 1123 |
+
0.953125,
|
| 1124 |
+
0.9990234375,
|
| 1125 |
+
0.998046875,
|
| 1126 |
+
0.998046875,
|
| 1127 |
+
0.9970703125,
|
| 1128 |
+
0.99609375,
|
| 1129 |
+
0.998046875,
|
| 1130 |
+
0.998046875,
|
| 1131 |
+
0.9970703125,
|
| 1132 |
+
0.962890625,
|
| 1133 |
+
0.994140625,
|
| 1134 |
+
1.0,
|
| 1135 |
+
1.0,
|
| 1136 |
+
0.9970703125,
|
| 1137 |
+
0.9990234375,
|
| 1138 |
+
0.9970703125,
|
| 1139 |
+
0.9990234375,
|
| 1140 |
+
0.998046875,
|
| 1141 |
+
0.99609375,
|
| 1142 |
+
0.998046875
|
| 1143 |
+
]
|
| 1144 |
+
}
|
| 1145 |
+
},
|
| 1146 |
+
"first_exact_by_run": {
|
| 1147 |
+
"train8_ctx1024_p75_s4_i32_outwdm1_ctx1024_sweep_selected_20260517_210705::none": {
|
| 1148 |
+
"run": "train8_ctx1024_p75_s4_i32_outwdm1_ctx1024_sweep_selected_20260517_210705",
|
| 1149 |
+
"checkpoint": "runs/train8_ctx1024_p75_s4_i32_outwdm1_ctx1024_sweep_selected_20260517_210705/step_0006000.pt",
|
| 1150 |
+
"ckpt_step": 6000,
|
| 1151 |
+
"endpoint_softening": "none",
|
| 1152 |
+
"decode_rule": "flowmap",
|
| 1153 |
+
"steps": 128,
|
| 1154 |
+
"time_schedule": "logit_normal",
|
| 1155 |
+
"model_t_mode": "post",
|
| 1156 |
+
"final_from": "state",
|
| 1157 |
+
"n_gen": 64,
|
| 1158 |
+
"n_refs": 8,
|
| 1159 |
+
"token_acc_mean": 0.9924468994140625,
|
| 1160 |
+
"token_acc_min": 0.9052734375,
|
| 1161 |
+
"token_acc_max": 1.0,
|
| 1162 |
+
"exact_acc": 0.0625,
|
| 1163 |
+
"exact_count": 4,
|
| 1164 |
+
"exact_ref_coverage": 0.25,
|
| 1165 |
+
"exact_ref_count": 2,
|
| 1166 |
+
"exact_ref_hits": [
|
| 1167 |
+
3,
|
| 1168 |
+
5
|
| 1169 |
+
],
|
| 1170 |
+
"best_ref_idx": [
|
| 1171 |
+
1,
|
| 1172 |
+
6,
|
| 1173 |
+
6,
|
| 1174 |
+
5,
|
| 1175 |
+
4,
|
| 1176 |
+
3,
|
| 1177 |
+
5,
|
| 1178 |
+
6,
|
| 1179 |
+
6,
|
| 1180 |
+
5,
|
| 1181 |
+
6,
|
| 1182 |
+
1,
|
| 1183 |
+
2,
|
| 1184 |
+
3,
|
| 1185 |
+
4,
|
| 1186 |
+
1,
|
| 1187 |
+
1,
|
| 1188 |
+
6,
|
| 1189 |
+
2,
|
| 1190 |
+
3,
|
| 1191 |
+
1,
|
| 1192 |
+
6,
|
| 1193 |
+
6,
|
| 1194 |
+
0,
|
| 1195 |
+
2,
|
| 1196 |
+
6,
|
| 1197 |
+
1,
|
| 1198 |
+
4,
|
| 1199 |
+
6,
|
| 1200 |
+
6,
|
| 1201 |
+
6,
|
| 1202 |
+
1,
|
| 1203 |
+
0,
|
| 1204 |
+
1,
|
| 1205 |
+
6,
|
| 1206 |
+
7,
|
| 1207 |
+
6,
|
| 1208 |
+
1,
|
| 1209 |
+
5,
|
| 1210 |
+
2,
|
| 1211 |
+
6,
|
| 1212 |
+
1,
|
| 1213 |
+
6,
|
| 1214 |
+
6,
|
| 1215 |
+
7,
|
| 1216 |
+
1,
|
| 1217 |
+
6,
|
| 1218 |
+
6,
|
| 1219 |
+
2,
|
| 1220 |
+
2,
|
| 1221 |
+
6,
|
| 1222 |
+
1,
|
| 1223 |
+
6,
|
| 1224 |
+
7,
|
| 1225 |
+
5,
|
| 1226 |
+
5,
|
| 1227 |
+
3,
|
| 1228 |
+
6,
|
| 1229 |
+
1,
|
| 1230 |
+
1,
|
| 1231 |
+
5,
|
| 1232 |
+
6,
|
| 1233 |
+
2,
|
| 1234 |
+
6
|
| 1235 |
+
],
|
| 1236 |
+
"best_token_acc": [
|
| 1237 |
+
0.9951171875,
|
| 1238 |
+
0.99609375,
|
| 1239 |
+
0.998046875,
|
| 1240 |
+
1.0,
|
| 1241 |
+
0.9453125,
|
| 1242 |
+
0.9970703125,
|
| 1243 |
+
0.9990234375,
|
| 1244 |
+
0.9970703125,
|
| 1245 |
+
0.9990234375,
|
| 1246 |
+
1.0,
|
| 1247 |
+
0.9990234375,
|
| 1248 |
+
0.99609375,
|
| 1249 |
+
0.9970703125,
|
| 1250 |
+
0.9990234375,
|
| 1251 |
+
0.9951171875,
|
| 1252 |
+
0.9990234375,
|
| 1253 |
+
0.9990234375,
|
| 1254 |
+
0.998046875,
|
| 1255 |
+
0.9970703125,
|
| 1256 |
+
0.9990234375,
|
| 1257 |
+
0.998046875,
|
| 1258 |
+
0.998046875,
|
| 1259 |
+
0.994140625,
|
| 1260 |
+
0.9052734375,
|
| 1261 |
+
0.9970703125,
|
| 1262 |
+
0.9970703125,
|
| 1263 |
+
0.9990234375,
|
| 1264 |
+
0.994140625,
|
| 1265 |
+
0.998046875,
|
| 1266 |
+
0.9970703125,
|
| 1267 |
+
0.998046875,
|
| 1268 |
+
0.9970703125,
|
| 1269 |
+
0.9189453125,
|
| 1270 |
+
0.9970703125,
|
| 1271 |
+
0.9951171875,
|
| 1272 |
+
0.9736328125,
|
| 1273 |
+
0.998046875,
|
| 1274 |
+
0.9970703125,
|
| 1275 |
+
0.9970703125,
|
| 1276 |
+
0.9970703125,
|
| 1277 |
+
0.9970703125,
|
| 1278 |
+
0.9970703125,
|
| 1279 |
+
0.9970703125,
|
| 1280 |
+
0.9970703125,
|
| 1281 |
+
0.953125,
|
| 1282 |
+
0.9990234375,
|
| 1283 |
+
0.998046875,
|
| 1284 |
+
0.998046875,
|
| 1285 |
+
0.9970703125,
|
| 1286 |
+
0.99609375,
|
| 1287 |
+
0.998046875,
|
| 1288 |
+
0.998046875,
|
| 1289 |
+
0.9970703125,
|
| 1290 |
+
0.962890625,
|
| 1291 |
+
0.994140625,
|
| 1292 |
+
1.0,
|
| 1293 |
+
1.0,
|
| 1294 |
+
0.9970703125,
|
| 1295 |
+
0.9990234375,
|
| 1296 |
+
0.9970703125,
|
| 1297 |
+
0.9990234375,
|
| 1298 |
+
0.998046875,
|
| 1299 |
+
0.99609375,
|
| 1300 |
+
0.998046875
|
| 1301 |
+
]
|
| 1302 |
+
}
|
| 1303 |
+
}
|
| 1304 |
+
}
|
| 1305 |
+
RESULT config=p75_s4_i32_outwdm1 run=train8_ctx1024_p75_s4_i32_outwdm1_ctx1024_sweep_selected_20260517_210705 ckpt_step=6000 views=3072000 token_acc=0.9924 exact=4/64 exact_refs=2 hits=[3, 5]
|
| 1306 |
+
[ctx1024-sweep] train config=p75_s4_i32_outwdm1 from=6000 to=7000
|
LTA_openwebtext_dualt/logs/ctx1024_rollin_sweep_4gpu/ctx1024_sweep_selected_20260517_210705.nohup
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
LTA_openwebtext_dualt/logs/ctx1024_rollin_sweep_4gpu/queued_ctx1024_sweep.nohup
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
[ctx1024-sweep] waiting for run=train8_rollin_len1024_rollin_p50_s4_i32_20260517_1855ctx1024bs128
|
LTA_openwebtext_dualt/logs/ctx1024_sampledpath_sweep_4gpu/ctx1024_sampledpath_20260517_223933.nohup
ADDED
|
@@ -0,0 +1,246 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[ctx1024-sampleds] start stamp=ctx1024_sampledpath_20260517_223933 len=1024 vocab=2664 out=docs/lta_samples/metrics_20260517/ctx1024_sampleds_sweep_bs512_ode128_ctx1024_sampledpath_20260517_223933
|
| 2 |
+
[ctx1024-sampleds] config=p50_path4_unif0_0p125_outwdm1 run=train8_ctx1024_p50_path4_unif0_0p125_outwdm1_ctx1024_sampledpath_20260517_223933 p=0.50 s_dist=uniform s_frac=0.0->0.125 beta=2.0,6.0 outwd=-1 sync_t=1
|
| 3 |
+
[ctx1024-sampleds] train config=p50_path4_unif0_0p125_outwdm1 from=0 to=1000
|
| 4 |
+
[launch] gpt2 cached OWT soft-endpoint m/n pilot
|
| 5 |
+
[launch] run_name=train8_ctx1024_p50_path4_unif0_0p125_outwdm1_ctx1024_sampledpath_20260517_223933
|
| 6 |
+
[launch] save_dir=runs/train8_ctx1024_p50_path4_unif0_0p125_outwdm1_ctx1024_sampledpath_20260517_223933
|
| 7 |
+
[launch] n=1024 m=0 clean_state_mode=onehot
|
| 8 |
+
[launch] mask_mixture lowk=0.0 all=1.0
|
| 9 |
+
[launch] model d=192 layers=3 heads=3 ff=768 vocab_override=2664
|
| 10 |
+
[launch] optimizer=muon muon_impl=legacy weight_decay=0.1 output_weight_decay=-1
|
| 11 |
+
[launch] target_loss=hard_ce conf=0.0->1.0 power=1.0
|
| 12 |
+
[launch] mask_ratio=1.0->1.0
|
| 13 |
+
[launch] mask_ratio_floor_schedule=none
|
| 14 |
+
[launch] dirichlet C=1.0->1024 endpoint=categorical_dual_t sampler=dirichlet
|
| 15 |
+
[launch] wrong_mix seq_alpha=0.0 wrong_floor=0.0 unigram=0.0 uniform=0.0 basin=0.0 basin_ids=
|
| 16 |
+
[launch] rollout_train prob=0.50 mode=sampled_s steps=1 infer_steps=1 s_dist=uniform s_frac=0.0->0.125 temp=1.45 corrupt_only=1 samplewise=1 selected_only=1 sync_t=1
|
| 17 |
+
[launch] cache=/e2e-data/evad-tech-vla/wanghan58/data/small_benchmarks/langflow_2604_11748/openwebtext_lta_cached_chunks/gpt2_len1024_train8_compact_overfit exact_repeat_per_chunk=64
|
| 18 |
+
NCCL version 2.25.1+cuda12.8
|
| 19 |
+
{
|
| 20 |
+
"device": "cuda:0",
|
| 21 |
+
"rank": 0,
|
| 22 |
+
"world_size": 4,
|
| 23 |
+
"samples": "owt_cached_chunks:8",
|
| 24 |
+
"vocab_size": 2664,
|
| 25 |
+
"tokenizer_vocab_size": 50257,
|
| 26 |
+
"save_dir": "runs/train8_ctx1024_p50_path4_unif0_0p125_outwdm1_ctx1024_sampledpath_20260517_223933",
|
| 27 |
+
"batch_size": 128,
|
| 28 |
+
"grad_accum": 1,
|
| 29 |
+
"effective_batch_size": 512,
|
| 30 |
+
"global_batch_size": 512,
|
| 31 |
+
"lr_schedule": "constant_warmup",
|
| 32 |
+
"optimizer": "muon",
|
| 33 |
+
"epochs": 0.0,
|
| 34 |
+
"steps_per_epoch": 1,
|
| 35 |
+
"total_steps": 1000,
|
| 36 |
+
"warmup_steps": 10,
|
| 37 |
+
"warmup_epochs": -1.0,
|
| 38 |
+
"min_lr": 0.0,
|
| 39 |
+
"weight_decay": 0.1,
|
| 40 |
+
"output_weight_decay": -1.0,
|
| 41 |
+
"adamw_param_groups": "nanogpt",
|
| 42 |
+
"adam_beta1": 0.9,
|
| 43 |
+
"adam_beta2": 0.95,
|
| 44 |
+
"adam_eps": 1e-08,
|
| 45 |
+
"muon_impl": "legacy",
|
| 46 |
+
"muon_momentum": 0.95,
|
| 47 |
+
"muon_ns_steps": 5,
|
| 48 |
+
"muon_update_scale": 1.0,
|
| 49 |
+
"muon_nesterov": false,
|
| 50 |
+
"muon_width_scale": false,
|
| 51 |
+
"muon_grouping": "legacy_dim_ge_2",
|
| 52 |
+
"muon_param_count": 2616320,
|
| 53 |
+
"muon_adam_param_count": 8192,
|
| 54 |
+
"muon_param_names": [
|
| 55 |
+
"vocab_embed.embedding",
|
| 56 |
+
"sigma_map.net.0.weight",
|
| 57 |
+
"sigma_map.net.2.weight",
|
| 58 |
+
"blocks.0.attn_qkv.weight",
|
| 59 |
+
"blocks.0.attn_out.weight",
|
| 60 |
+
"blocks.0.mlp.0.weight",
|
| 61 |
+
"blocks.0.mlp.2.weight",
|
| 62 |
+
"blocks.0.adaLN_modulation.weight",
|
| 63 |
+
"blocks.1.attn_qkv.weight",
|
| 64 |
+
"blocks.1.attn_out.weight",
|
| 65 |
+
"blocks.1.mlp.0.weight",
|
| 66 |
+
"blocks.1.mlp.2.weight",
|
| 67 |
+
"blocks.1.adaLN_modulation.weight",
|
| 68 |
+
"blocks.2.attn_qkv.weight",
|
| 69 |
+
"blocks.2.attn_out.weight",
|
| 70 |
+
"blocks.2.mlp.0.weight",
|
| 71 |
+
"blocks.2.mlp.2.weight",
|
| 72 |
+
"blocks.2.adaLN_modulation.weight",
|
| 73 |
+
"output_layer.linear.weight",
|
| 74 |
+
"output_layer.adaLN_modulation.weight"
|
| 75 |
+
],
|
| 76 |
+
"muon_adam_param_names": [
|
| 77 |
+
"sigma_map.net.0.bias",
|
| 78 |
+
"sigma_map.net.2.bias",
|
| 79 |
+
"blocks.0.norm1.weight",
|
| 80 |
+
"blocks.0.norm2.weight",
|
| 81 |
+
"blocks.0.mlp.0.bias",
|
| 82 |
+
"blocks.0.mlp.2.bias",
|
| 83 |
+
"blocks.0.adaLN_modulation.bias",
|
| 84 |
+
"blocks.1.norm1.weight",
|
| 85 |
+
"blocks.1.norm2.weight",
|
| 86 |
+
"blocks.1.mlp.0.bias",
|
| 87 |
+
"blocks.1.mlp.2.bias",
|
| 88 |
+
"blocks.1.adaLN_modulation.bias",
|
| 89 |
+
"blocks.2.norm1.weight",
|
| 90 |
+
"blocks.2.norm2.weight",
|
| 91 |
+
"blocks.2.mlp.0.bias",
|
| 92 |
+
"blocks.2.mlp.2.bias",
|
| 93 |
+
"blocks.2.adaLN_modulation.bias",
|
| 94 |
+
"output_layer.norm_final.weight",
|
| 95 |
+
"output_layer.adaLN_modulation.bias"
|
| 96 |
+
],
|
| 97 |
+
"muon_effective_nesterov": false,
|
| 98 |
+
"muon_effective_width_scale": false,
|
| 99 |
+
"muon_effective_weight_decay": 0.1,
|
| 100 |
+
"muon_adam_fallback_nesterov": false,
|
| 101 |
+
"muon_adam_fallback_weight_decay": 0.1,
|
| 102 |
+
"ema_decay": 0.9999,
|
| 103 |
+
"ema_start_step": 0,
|
| 104 |
+
"model_type": "ddit",
|
| 105 |
+
"ddit_mlp_type": "gelu",
|
| 106 |
+
"elf_num_time_tokens": 4,
|
| 107 |
+
"elf_num_model_mode_tokens": 0,
|
| 108 |
+
"qk_norm": true,
|
| 109 |
+
"output_bias": false,
|
| 110 |
+
"output_init_std": -1.0,
|
| 111 |
+
"norm_type": "rmsnorm",
|
| 112 |
+
"target_loss": "hard_ce",
|
| 113 |
+
"linear_soft_target_power": 1.0,
|
| 114 |
+
"linear_soft_target_min_conf": 0.0,
|
| 115 |
+
"linear_soft_target_max_conf": 1.0,
|
| 116 |
+
"t_sampling_mode": "logit_normal",
|
| 117 |
+
"t_sampling_power": 1.0,
|
| 118 |
+
"t_sampling_eps": 0.0001,
|
| 119 |
+
"t_sampling_logit_mean": -1.5,
|
| 120 |
+
"t_sampling_logit_std": 0.8,
|
| 121 |
+
"dual_t": true,
|
| 122 |
+
"corrupt_t_mode": "same",
|
| 123 |
+
"corrupt_min_t": 0.0,
|
| 124 |
+
"corrupt_max_t": 1.0,
|
| 125 |
+
"prefix_block_prob": 0.0,
|
| 126 |
+
"prefix_block_len": 128,
|
| 127 |
+
"mask_ratio_floor_schedule": "none",
|
| 128 |
+
"dirichlet_endpoint_mode": "categorical_dual_t",
|
| 129 |
+
"dirichlet_semantic_t_mode": "same",
|
| 130 |
+
"dirichlet_semantic_t_value": 0.0,
|
| 131 |
+
"dirichlet_semantic_t_curve": "linear",
|
| 132 |
+
"dirichlet_semantic_t_power": 1.0,
|
| 133 |
+
"endpoint_sequence_random_prob_alpha": 0.0,
|
| 134 |
+
"categorical_wrong_from_full_vocab": true,
|
| 135 |
+
"categorical_wrong_from_batch_valid_tokens": false,
|
| 136 |
+
"categorical_wrong_basin_token_ids": "",
|
| 137 |
+
"categorical_wrong_basin_prob": 0.0,
|
| 138 |
+
"categorical_wrong_unigram_prob": 0.0,
|
| 139 |
+
"categorical_wrong_uniform_prob": 0.0,
|
| 140 |
+
"categorical_wrong_prob_floor": 0.0,
|
| 141 |
+
"categorical_wrong_corpus_unigram_path": "",
|
| 142 |
+
"categorical_wrong_corpus_unigram_alpha": 1.0,
|
| 143 |
+
"categorical_wrong_basin_shared_prob": 0.0,
|
| 144 |
+
"categorical_wrong_unigram_shared_prob": 0.0,
|
| 145 |
+
"mask_mixture_original_prob": 0.0,
|
| 146 |
+
"mask_mixture_lowk_prob": 0.0,
|
| 147 |
+
"mask_mixture_lowcorrupt_prob": 0.0,
|
| 148 |
+
"mask_mixture_block_prob": 0.0,
|
| 149 |
+
"mask_mixture_all_prob": 1.0,
|
| 150 |
+
"mask_mixture_lowk_clean_tokens": "0",
|
| 151 |
+
"mask_mixture_lowcorrupt_tokens": "1,2,4,8,16,32,64",
|
| 152 |
+
"mask_mixture_block_tokens": "64,128",
|
| 153 |
+
"simplex_bridge_sampler": "dirichlet",
|
| 154 |
+
"logistic_normal_sigma_min": 0.1,
|
| 155 |
+
"logistic_normal_sigma_max": 1.0,
|
| 156 |
+
"logistic_normal_tau_min": 1.0,
|
| 157 |
+
"logistic_normal_tau_max": 1.0,
|
| 158 |
+
"torch_compile": false,
|
| 159 |
+
"compile_mode": "max-autotune",
|
| 160 |
+
"state_format": "prob",
|
| 161 |
+
"meanflow_weight": 0.0,
|
| 162 |
+
"rollout_train_prob": 0.5,
|
| 163 |
+
"rollout_train_steps": 1,
|
| 164 |
+
"rollout_train_infer_steps": 1,
|
| 165 |
+
"rollout_train_time_mode": "sampled_s",
|
| 166 |
+
"rollout_train_s_dist": "uniform",
|
| 167 |
+
"rollout_train_s_min_frac": 0.0,
|
| 168 |
+
"rollout_train_s_max_frac": 0.125,
|
| 169 |
+
"rollout_train_s_beta_alpha": 2.0,
|
| 170 |
+
"rollout_train_s_beta_beta": 6.0,
|
| 171 |
+
"rollout_train_temp": 1.45,
|
| 172 |
+
"rollout_train_max_gamma": 1.0,
|
| 173 |
+
"rollout_train_corrupt_only": true,
|
| 174 |
+
"rollout_train_samplewise": true,
|
| 175 |
+
"rollout_train_compute_always": false,
|
| 176 |
+
"rollout_train_sync_t": true,
|
| 177 |
+
"bridge_noise_init": "logistic_normal",
|
| 178 |
+
"noise_sigma": -1.0,
|
| 179 |
+
"allow_tf32": true,
|
| 180 |
+
"activation_checkpointing": false,
|
| 181 |
+
"activation_checkpoint_interval": 1,
|
| 182 |
+
"activation_checkpoint_scope": "block",
|
| 183 |
+
"ddp_static_graph": false,
|
| 184 |
+
"ddp_gradient_as_bucket_view": true,
|
| 185 |
+
"blocking_data_transfer": false,
|
| 186 |
+
"dataloader_prefetch_factor": 4,
|
| 187 |
+
"full_train_stats": false,
|
| 188 |
+
"tokenized_hf": false,
|
| 189 |
+
"tokenized_pad_token": "pad",
|
| 190 |
+
"elf_conditional_hf": false,
|
| 191 |
+
"record_pad_truncate": false,
|
| 192 |
+
"record_add_eos": false,
|
| 193 |
+
"record_add_special_tokens": false,
|
| 194 |
+
"record_pad_token": "pad",
|
| 195 |
+
"record_shuffle_buffer": 10000,
|
| 196 |
+
"wrap": true,
|
| 197 |
+
"wrap_mode": "stream",
|
| 198 |
+
"wrap_record_buffer_size": 200,
|
| 199 |
+
"owt_cached_chunks": true,
|
| 200 |
+
"owt_chunk_cache_dir": "/e2e-data/evad-tech-vla/wanghan58/data/small_benchmarks/langflow_2604_11748/openwebtext_lta_cached_chunks/gpt2_len1024_train8_compact_overfit",
|
| 201 |
+
"owt_chunk_cache_rebuild": false,
|
| 202 |
+
"owt_chunk_cache_write_batch": 4096,
|
| 203 |
+
"owt_exact_repeat_per_chunk": 64,
|
| 204 |
+
"online_chunk_shuffle": false,
|
| 205 |
+
"online_chunk_shuffle_buffer": 10000,
|
| 206 |
+
"openwebtext_split": "train_minus_100k",
|
| 207 |
+
"detokenizer": "auto",
|
| 208 |
+
"resolved_detokenizer": null,
|
| 209 |
+
"num_workers": 0,
|
| 210 |
+
"latest_every": 1000,
|
| 211 |
+
"resume_path": ""
|
| 212 |
+
}
|
| 213 |
+
W0517 22:40:01.897000 386925 torch/distributed/elastic/agent/server/api.py:719] Received 15 death signal, shutting down workers
|
| 214 |
+
W0517 22:40:01.899000 386925 torch/distributed/elastic/multiprocessing/api.py:898] Sending process 386929 closing signal SIGTERM
|
| 215 |
+
W0517 22:40:01.900000 386925 torch/distributed/elastic/multiprocessing/api.py:898] Sending process 386930 closing signal SIGTERM
|
| 216 |
+
W0517 22:40:01.900000 386925 torch/distributed/elastic/multiprocessing/api.py:898] Sending process 386931 closing signal SIGTERM
|
| 217 |
+
W0517 22:40:01.901000 386925 torch/distributed/elastic/multiprocessing/api.py:898] Sending process 386932 closing signal SIGTERM
|
| 218 |
+
Traceback (most recent call last):
|
| 219 |
+
File "<frozen runpy>", line 198, in _run_module_as_main
|
| 220 |
+
File "<frozen runpy>", line 88, in _run_code
|
| 221 |
+
File "/usr/local/lib/python3.12/dist-packages/torch/distributed/run.py", line 922, in <module>
|
| 222 |
+
main()
|
| 223 |
+
File "/usr/local/lib/python3.12/dist-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 355, in wrapper
|
| 224 |
+
return f(*args, **kwargs)
|
| 225 |
+
^^^^^^^^^^^^^^^^^^
|
| 226 |
+
File "/usr/local/lib/python3.12/dist-packages/torch/distributed/run.py", line 918, in main
|
| 227 |
+
run(args)
|
| 228 |
+
File "/usr/local/lib/python3.12/dist-packages/torch/distributed/run.py", line 909, in run
|
| 229 |
+
elastic_launch(
|
| 230 |
+
File "/usr/local/lib/python3.12/dist-packages/torch/distributed/launcher/api.py", line 139, in __call__
|
| 231 |
+
return launch_agent(self._config, self._entrypoint, list(args))
|
| 232 |
+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 233 |
+
File "/usr/local/lib/python3.12/dist-packages/torch/distributed/launcher/api.py", line 261, in launch_agent
|
| 234 |
+
result = agent.run()
|
| 235 |
+
^^^^^^^^^^^
|
| 236 |
+
File "/usr/local/lib/python3.12/dist-packages/torch/distributed/elastic/metrics/api.py", line 137, in wrapper
|
| 237 |
+
result = f(*args, **kwargs)
|
| 238 |
+
^^^^^^^^^^^^^^^^^^
|
| 239 |
+
File "/usr/local/lib/python3.12/dist-packages/torch/distributed/elastic/agent/server/api.py", line 711, in run
|
| 240 |
+
result = self._invoke_run(role)
|
| 241 |
+
^^^^^^^^^^^^^^^^^^^^^^
|
| 242 |
+
File "/usr/local/lib/python3.12/dist-packages/torch/distributed/elastic/agent/server/api.py", line 870, in _invoke_run
|
| 243 |
+
time.sleep(monitor_interval)
|
| 244 |
+
File "/usr/local/lib/python3.12/dist-packages/torch/distributed/elastic/multiprocessing/api.py", line 84, in _terminate_process_handler
|
| 245 |
+
raise SignalException(f"Process {os.getpid()} got signal: {sigval}", sigval=sigval)
|
| 246 |
+
torch.distributed.elastic.multiprocessing.api.SignalException: Process 386925 got signal: 15
|
LTA_openwebtext_dualt/logs/ctx1024_sampledpath_sweep_4gpu/ctx1024_sampledpath_true_20260517_224139.nohup
ADDED
|
@@ -0,0 +1,985 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[ctx1024-sampleds] start stamp=ctx1024_sampledpath_true_20260517_224139 len=1024 vocab=2664 out=docs/lta_samples/metrics_20260517/ctx1024_sampleds_sweep_bs512_ode128_ctx1024_sampledpath_true_20260517_224139
|
| 2 |
+
[ctx1024-sampleds] config=p50_path4_unif0_0p125_outwdm1 run=train8_ctx1024_p50_path4_unif0_0p125_outwdm1_ctx1024_sampledpath_true_20260517_224139 p=0.50 mode=sampled_path steps=4 s_dist=uniform s_frac=0.0->0.125 beta=2.0,6.0 outwd=-1 sync_t=1
|
| 3 |
+
[ctx1024-sampleds] train config=p50_path4_unif0_0p125_outwdm1 from=0 to=1000
|
| 4 |
+
[launch] gpt2 cached OWT soft-endpoint m/n pilot
|
| 5 |
+
[launch] run_name=train8_ctx1024_p50_path4_unif0_0p125_outwdm1_ctx1024_sampledpath_true_20260517_224139
|
| 6 |
+
[launch] save_dir=runs/train8_ctx1024_p50_path4_unif0_0p125_outwdm1_ctx1024_sampledpath_true_20260517_224139
|
| 7 |
+
[launch] n=1024 m=0 clean_state_mode=onehot
|
| 8 |
+
[launch] mask_mixture lowk=0.0 all=1.0
|
| 9 |
+
[launch] model d=192 layers=3 heads=3 ff=768 vocab_override=2664
|
| 10 |
+
[launch] optimizer=muon muon_impl=legacy weight_decay=0.1 output_weight_decay=-1
|
| 11 |
+
[launch] target_loss=hard_ce conf=0.0->1.0 power=1.0
|
| 12 |
+
[launch] mask_ratio=1.0->1.0
|
| 13 |
+
[launch] mask_ratio_floor_schedule=none
|
| 14 |
+
[launch] dirichlet C=1.0->1024 endpoint=categorical_dual_t sampler=dirichlet
|
| 15 |
+
[launch] wrong_mix seq_alpha=0.0 wrong_floor=0.0 unigram=0.0 uniform=0.0 basin=0.0 basin_ids=
|
| 16 |
+
[launch] rollout_train prob=0.50 mode=sampled_path steps=4 infer_steps=1 s_dist=uniform s_frac=0.0->0.125 temp=1.45 corrupt_only=1 samplewise=1 selected_only=1 sync_t=1
|
| 17 |
+
[launch] cache=/e2e-data/evad-tech-vla/wanghan58/data/small_benchmarks/langflow_2604_11748/openwebtext_lta_cached_chunks/gpt2_len1024_train8_compact_overfit exact_repeat_per_chunk=64
|
| 18 |
+
NCCL version 2.25.1+cuda12.8
|
| 19 |
+
{
|
| 20 |
+
"device": "cuda:0",
|
| 21 |
+
"rank": 0,
|
| 22 |
+
"world_size": 4,
|
| 23 |
+
"samples": "owt_cached_chunks:8",
|
| 24 |
+
"vocab_size": 2664,
|
| 25 |
+
"tokenizer_vocab_size": 50257,
|
| 26 |
+
"save_dir": "runs/train8_ctx1024_p50_path4_unif0_0p125_outwdm1_ctx1024_sampledpath_true_20260517_224139",
|
| 27 |
+
"batch_size": 128,
|
| 28 |
+
"grad_accum": 1,
|
| 29 |
+
"effective_batch_size": 512,
|
| 30 |
+
"global_batch_size": 512,
|
| 31 |
+
"lr_schedule": "constant_warmup",
|
| 32 |
+
"optimizer": "muon",
|
| 33 |
+
"epochs": 0.0,
|
| 34 |
+
"steps_per_epoch": 1,
|
| 35 |
+
"total_steps": 1000,
|
| 36 |
+
"warmup_steps": 10,
|
| 37 |
+
"warmup_epochs": -1.0,
|
| 38 |
+
"min_lr": 0.0,
|
| 39 |
+
"weight_decay": 0.1,
|
| 40 |
+
"output_weight_decay": -1.0,
|
| 41 |
+
"adamw_param_groups": "nanogpt",
|
| 42 |
+
"adam_beta1": 0.9,
|
| 43 |
+
"adam_beta2": 0.95,
|
| 44 |
+
"adam_eps": 1e-08,
|
| 45 |
+
"muon_impl": "legacy",
|
| 46 |
+
"muon_momentum": 0.95,
|
| 47 |
+
"muon_ns_steps": 5,
|
| 48 |
+
"muon_update_scale": 1.0,
|
| 49 |
+
"muon_nesterov": false,
|
| 50 |
+
"muon_width_scale": false,
|
| 51 |
+
"muon_grouping": "legacy_dim_ge_2",
|
| 52 |
+
"muon_param_count": 2616320,
|
| 53 |
+
"muon_adam_param_count": 8192,
|
| 54 |
+
"muon_param_names": [
|
| 55 |
+
"vocab_embed.embedding",
|
| 56 |
+
"sigma_map.net.0.weight",
|
| 57 |
+
"sigma_map.net.2.weight",
|
| 58 |
+
"blocks.0.attn_qkv.weight",
|
| 59 |
+
"blocks.0.attn_out.weight",
|
| 60 |
+
"blocks.0.mlp.0.weight",
|
| 61 |
+
"blocks.0.mlp.2.weight",
|
| 62 |
+
"blocks.0.adaLN_modulation.weight",
|
| 63 |
+
"blocks.1.attn_qkv.weight",
|
| 64 |
+
"blocks.1.attn_out.weight",
|
| 65 |
+
"blocks.1.mlp.0.weight",
|
| 66 |
+
"blocks.1.mlp.2.weight",
|
| 67 |
+
"blocks.1.adaLN_modulation.weight",
|
| 68 |
+
"blocks.2.attn_qkv.weight",
|
| 69 |
+
"blocks.2.attn_out.weight",
|
| 70 |
+
"blocks.2.mlp.0.weight",
|
| 71 |
+
"blocks.2.mlp.2.weight",
|
| 72 |
+
"blocks.2.adaLN_modulation.weight",
|
| 73 |
+
"output_layer.linear.weight",
|
| 74 |
+
"output_layer.adaLN_modulation.weight"
|
| 75 |
+
],
|
| 76 |
+
"muon_adam_param_names": [
|
| 77 |
+
"sigma_map.net.0.bias",
|
| 78 |
+
"sigma_map.net.2.bias",
|
| 79 |
+
"blocks.0.norm1.weight",
|
| 80 |
+
"blocks.0.norm2.weight",
|
| 81 |
+
"blocks.0.mlp.0.bias",
|
| 82 |
+
"blocks.0.mlp.2.bias",
|
| 83 |
+
"blocks.0.adaLN_modulation.bias",
|
| 84 |
+
"blocks.1.norm1.weight",
|
| 85 |
+
"blocks.1.norm2.weight",
|
| 86 |
+
"blocks.1.mlp.0.bias",
|
| 87 |
+
"blocks.1.mlp.2.bias",
|
| 88 |
+
"blocks.1.adaLN_modulation.bias",
|
| 89 |
+
"blocks.2.norm1.weight",
|
| 90 |
+
"blocks.2.norm2.weight",
|
| 91 |
+
"blocks.2.mlp.0.bias",
|
| 92 |
+
"blocks.2.mlp.2.bias",
|
| 93 |
+
"blocks.2.adaLN_modulation.bias",
|
| 94 |
+
"output_layer.norm_final.weight",
|
| 95 |
+
"output_layer.adaLN_modulation.bias"
|
| 96 |
+
],
|
| 97 |
+
"muon_effective_nesterov": false,
|
| 98 |
+
"muon_effective_width_scale": false,
|
| 99 |
+
"muon_effective_weight_decay": 0.1,
|
| 100 |
+
"muon_adam_fallback_nesterov": false,
|
| 101 |
+
"muon_adam_fallback_weight_decay": 0.1,
|
| 102 |
+
"ema_decay": 0.9999,
|
| 103 |
+
"ema_start_step": 0,
|
| 104 |
+
"model_type": "ddit",
|
| 105 |
+
"ddit_mlp_type": "gelu",
|
| 106 |
+
"elf_num_time_tokens": 4,
|
| 107 |
+
"elf_num_model_mode_tokens": 0,
|
| 108 |
+
"qk_norm": true,
|
| 109 |
+
"output_bias": false,
|
| 110 |
+
"output_init_std": -1.0,
|
| 111 |
+
"norm_type": "rmsnorm",
|
| 112 |
+
"target_loss": "hard_ce",
|
| 113 |
+
"linear_soft_target_power": 1.0,
|
| 114 |
+
"linear_soft_target_min_conf": 0.0,
|
| 115 |
+
"linear_soft_target_max_conf": 1.0,
|
| 116 |
+
"t_sampling_mode": "logit_normal",
|
| 117 |
+
"t_sampling_power": 1.0,
|
| 118 |
+
"t_sampling_eps": 0.0001,
|
| 119 |
+
"t_sampling_logit_mean": -1.5,
|
| 120 |
+
"t_sampling_logit_std": 0.8,
|
| 121 |
+
"dual_t": true,
|
| 122 |
+
"corrupt_t_mode": "same",
|
| 123 |
+
"corrupt_min_t": 0.0,
|
| 124 |
+
"corrupt_max_t": 1.0,
|
| 125 |
+
"prefix_block_prob": 0.0,
|
| 126 |
+
"prefix_block_len": 128,
|
| 127 |
+
"mask_ratio_floor_schedule": "none",
|
| 128 |
+
"dirichlet_endpoint_mode": "categorical_dual_t",
|
| 129 |
+
"dirichlet_semantic_t_mode": "same",
|
| 130 |
+
"dirichlet_semantic_t_value": 0.0,
|
| 131 |
+
"dirichlet_semantic_t_curve": "linear",
|
| 132 |
+
"dirichlet_semantic_t_power": 1.0,
|
| 133 |
+
"endpoint_sequence_random_prob_alpha": 0.0,
|
| 134 |
+
"categorical_wrong_from_full_vocab": true,
|
| 135 |
+
"categorical_wrong_from_batch_valid_tokens": false,
|
| 136 |
+
"categorical_wrong_basin_token_ids": "",
|
| 137 |
+
"categorical_wrong_basin_prob": 0.0,
|
| 138 |
+
"categorical_wrong_unigram_prob": 0.0,
|
| 139 |
+
"categorical_wrong_uniform_prob": 0.0,
|
| 140 |
+
"categorical_wrong_prob_floor": 0.0,
|
| 141 |
+
"categorical_wrong_corpus_unigram_path": "",
|
| 142 |
+
"categorical_wrong_corpus_unigram_alpha": 1.0,
|
| 143 |
+
"categorical_wrong_basin_shared_prob": 0.0,
|
| 144 |
+
"categorical_wrong_unigram_shared_prob": 0.0,
|
| 145 |
+
"mask_mixture_original_prob": 0.0,
|
| 146 |
+
"mask_mixture_lowk_prob": 0.0,
|
| 147 |
+
"mask_mixture_lowcorrupt_prob": 0.0,
|
| 148 |
+
"mask_mixture_block_prob": 0.0,
|
| 149 |
+
"mask_mixture_all_prob": 1.0,
|
| 150 |
+
"mask_mixture_lowk_clean_tokens": "0",
|
| 151 |
+
"mask_mixture_lowcorrupt_tokens": "1,2,4,8,16,32,64",
|
| 152 |
+
"mask_mixture_block_tokens": "64,128",
|
| 153 |
+
"simplex_bridge_sampler": "dirichlet",
|
| 154 |
+
"logistic_normal_sigma_min": 0.1,
|
| 155 |
+
"logistic_normal_sigma_max": 1.0,
|
| 156 |
+
"logistic_normal_tau_min": 1.0,
|
| 157 |
+
"logistic_normal_tau_max": 1.0,
|
| 158 |
+
"torch_compile": false,
|
| 159 |
+
"compile_mode": "max-autotune",
|
| 160 |
+
"state_format": "prob",
|
| 161 |
+
"meanflow_weight": 0.0,
|
| 162 |
+
"rollout_train_prob": 0.5,
|
| 163 |
+
"rollout_train_steps": 4,
|
| 164 |
+
"rollout_train_infer_steps": 1,
|
| 165 |
+
"rollout_train_time_mode": "sampled_path",
|
| 166 |
+
"rollout_train_s_dist": "uniform",
|
| 167 |
+
"rollout_train_s_min_frac": 0.0,
|
| 168 |
+
"rollout_train_s_max_frac": 0.125,
|
| 169 |
+
"rollout_train_s_beta_alpha": 2.0,
|
| 170 |
+
"rollout_train_s_beta_beta": 6.0,
|
| 171 |
+
"rollout_train_temp": 1.45,
|
| 172 |
+
"rollout_train_max_gamma": 1.0,
|
| 173 |
+
"rollout_train_corrupt_only": true,
|
| 174 |
+
"rollout_train_samplewise": true,
|
| 175 |
+
"rollout_train_compute_always": false,
|
| 176 |
+
"rollout_train_sync_t": true,
|
| 177 |
+
"bridge_noise_init": "logistic_normal",
|
| 178 |
+
"noise_sigma": -1.0,
|
| 179 |
+
"allow_tf32": true,
|
| 180 |
+
"activation_checkpointing": false,
|
| 181 |
+
"activation_checkpoint_interval": 1,
|
| 182 |
+
"activation_checkpoint_scope": "block",
|
| 183 |
+
"ddp_static_graph": false,
|
| 184 |
+
"ddp_gradient_as_bucket_view": true,
|
| 185 |
+
"blocking_data_transfer": false,
|
| 186 |
+
"dataloader_prefetch_factor": 4,
|
| 187 |
+
"full_train_stats": false,
|
| 188 |
+
"tokenized_hf": false,
|
| 189 |
+
"tokenized_pad_token": "pad",
|
| 190 |
+
"elf_conditional_hf": false,
|
| 191 |
+
"record_pad_truncate": false,
|
| 192 |
+
"record_add_eos": false,
|
| 193 |
+
"record_add_special_tokens": false,
|
| 194 |
+
"record_pad_token": "pad",
|
| 195 |
+
"record_shuffle_buffer": 10000,
|
| 196 |
+
"wrap": true,
|
| 197 |
+
"wrap_mode": "stream",
|
| 198 |
+
"wrap_record_buffer_size": 200,
|
| 199 |
+
"owt_cached_chunks": true,
|
| 200 |
+
"owt_chunk_cache_dir": "/e2e-data/evad-tech-vla/wanghan58/data/small_benchmarks/langflow_2604_11748/openwebtext_lta_cached_chunks/gpt2_len1024_train8_compact_overfit",
|
| 201 |
+
"owt_chunk_cache_rebuild": false,
|
| 202 |
+
"owt_chunk_cache_write_batch": 4096,
|
| 203 |
+
"owt_exact_repeat_per_chunk": 64,
|
| 204 |
+
"online_chunk_shuffle": false,
|
| 205 |
+
"online_chunk_shuffle_buffer": 10000,
|
| 206 |
+
"openwebtext_split": "train_minus_100k",
|
| 207 |
+
"detokenizer": "auto",
|
| 208 |
+
"resolved_detokenizer": null,
|
| 209 |
+
"num_workers": 0,
|
| 210 |
+
"latest_every": 1000,
|
| 211 |
+
"resume_path": ""
|
| 212 |
+
}
|
| 213 |
+
step=100 epoch=100/1000 epoch_step=1/1 micro_steps=100 elapsed=24.8s lr=2.000000e-03 loss=7.7206 loss_recon=7.7206 loss_meanflow=0.0000 mean_model_t=0.2085 mean_corrupt_t=0.2085 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.5077 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.0958 corrupt_frac=1.0000 acc_corrupt=0.0958 loss_corrupt=7.7206 wrong_frac=0.7915 init_acc_corrupt=0.1164 acc_corrupt_t_0p0_0p2=0.0500 corrupt_frac_t_0p0_0p2=0.5640 acc_corrupt_t_0p2_0p4=0.1270 corrupt_frac_t_0p2_0p4=0.3466 acc_corrupt_t_0p4_0p6=0.2493 corrupt_frac_t_0p4_0p6=0.0791 acc_corrupt_t_0p6_0p8=0.3719 corrupt_frac_t_0p6_0p8=0.0136 out_w_norm=1.0047 out_g_norm=1.0928 acc_corrupt_t_0p8_1p0=0.4936 corrupt_frac_t_0p8_1p0=0.0078 loss_all=7.4724 init_gold_top10=0.2003 init_gold_top100=0.4085 rollout_applied_pos_frac=0.4453 init_acc_rollout_applied=0.1056 init_acc_rollout_kept=0.1192 logit_acc_rollout_applied=0.0969 logit_acc_rollout_kept=0.0996
|
| 214 |
+
step=200 epoch=200/1000 epoch_step=1/1 micro_steps=200 elapsed=23.9s lr=2.000000e-03 loss=7.0874 loss_recon=7.0874 loss_meanflow=0.0000 mean_model_t=0.2096 mean_corrupt_t=0.2096 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.4995 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.1036 corrupt_frac=1.0000 acc_corrupt=0.1036 loss_corrupt=7.0874 wrong_frac=0.7905 init_acc_corrupt=0.1172 acc_corrupt_t_0p0_0p2=0.0560 corrupt_frac_t_0p0_0p2=0.5557 acc_corrupt_t_0p2_0p4=0.1392 corrupt_frac_t_0p2_0p4=0.3595 acc_corrupt_t_0p4_0p6=0.2552 corrupt_frac_t_0p4_0p6=0.0762 acc_corrupt_t_0p6_0p8=0.3485 corrupt_frac_t_0p6_0p8=0.0129 out_w_norm=2.8612 out_g_norm=1.7761 acc_corrupt_t_0p8_1p0=0.4243 corrupt_frac_t_0p8_1p0=0.0078 loss_all=6.6891 init_gold_top10=0.2090 init_gold_top100=0.4276 rollout_applied_pos_frac=0.4688 init_acc_rollout_applied=0.1378 init_acc_rollout_kept=0.1215 logit_acc_rollout_applied=0.1143 logit_acc_rollout_kept=0.1146
|
| 215 |
+
step=300 epoch=300/1000 epoch_step=1/1 micro_steps=300 elapsed=24.0s lr=2.000000e-03 loss=6.4546 loss_recon=6.4546 loss_meanflow=0.0000 mean_model_t=0.2098 mean_corrupt_t=0.2098 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.5023 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.1137 corrupt_frac=1.0000 acc_corrupt=0.1137 loss_corrupt=6.4546 wrong_frac=0.7902 init_acc_corrupt=0.1187 acc_corrupt_t_0p0_0p2=0.0592 corrupt_frac_t_0p0_0p2=0.5544 acc_corrupt_t_0p2_0p4=0.1548 corrupt_frac_t_0p2_0p4=0.3617 acc_corrupt_t_0p4_0p6=0.2839 corrupt_frac_t_0p4_0p6=0.0743 out_w_norm=4.3408 out_g_norm=1.3199 acc_corrupt_t_0p6_0p8=0.3901 corrupt_frac_t_0p6_0p8=0.0139 acc_corrupt_t_0p8_1p0=0.5415 corrupt_frac_t_0p8_1p0=0.0078 loss_all=6.2279 init_gold_top10=0.2020 init_gold_top100=0.4348 rollout_applied_pos_frac=0.4375 init_acc_rollout_applied=0.0878 init_acc_rollout_kept=0.1243 logit_acc_rollout_applied=0.1032 logit_acc_rollout_kept=0.1241
|
| 216 |
+
step=400 epoch=400/1000 epoch_step=1/1 micro_steps=400 elapsed=23.9s lr=2.000000e-03 loss=5.9837 loss_recon=5.9837 loss_meanflow=0.0000 mean_model_t=0.2072 mean_corrupt_t=0.2072 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.5030 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.1233 corrupt_frac=1.0000 acc_corrupt=0.1233 loss_corrupt=5.9837 wrong_frac=0.7929 init_acc_corrupt=0.1160 acc_corrupt_t_0p0_0p2=0.0639 corrupt_frac_t_0p0_0p2=0.5638 acc_corrupt_t_0p2_0p4=0.1708 corrupt_frac_t_0p2_0p4=0.3526 acc_corrupt_t_0p4_0p6=0.3106 corrupt_frac_t_0p4_0p6=0.0753 out_w_norm=5.4789 out_g_norm=0.5031 acc_corrupt_t_0p6_0p8=0.4367 corrupt_frac_t_0p6_0p8=0.0128 acc_corrupt_t_0p8_1p0=0.5306 corrupt_frac_t_0p8_1p0=0.0117 loss_all=5.7599 init_gold_top10=0.2017 init_gold_top100=0.4699 rollout_applied_pos_frac=0.4844 init_acc_rollout_applied=0.1182 init_acc_rollout_kept=0.1042 logit_acc_rollout_applied=0.1310 logit_acc_rollout_kept=0.1227
|
| 217 |
+
step=500 epoch=500/1000 epoch_step=1/1 micro_steps=500 elapsed=23.9s lr=2.000000e-03 loss=5.4774 loss_recon=5.4774 loss_meanflow=0.0000 mean_model_t=0.2101 mean_corrupt_t=0.2101 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.4994 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.1378 corrupt_frac=1.0000 acc_corrupt=0.1378 loss_corrupt=5.4774 wrong_frac=0.7898 init_acc_corrupt=0.1197 acc_corrupt_t_0p0_0p2=0.0677 corrupt_frac_t_0p0_0p2=0.5506 acc_corrupt_t_0p2_0p4=0.1918 corrupt_frac_t_0p2_0p4=0.3660 acc_corrupt_t_0p4_0p6=0.3501 corrupt_frac_t_0p4_0p6=0.0752 acc_corrupt_t_0p6_0p8=0.4913 corrupt_frac_t_0p6_0p8=0.0128 out_w_norm=6.7180 out_g_norm=0.4070 acc_corrupt_t_0p8_1p0=0.6104 corrupt_frac_t_0p8_1p0=0.0078 loss_all=5.1003 init_gold_top10=0.2273 init_gold_top100=0.5161 rollout_applied_pos_frac=0.4922 init_acc_rollout_applied=0.1306 init_acc_rollout_kept=0.1486 logit_acc_rollout_applied=0.1487 logit_acc_rollout_kept=0.1648
|
| 218 |
+
step=600 epoch=600/1000 epoch_step=1/1 micro_steps=600 elapsed=23.9s lr=2.000000e-03 loss=4.8712 loss_recon=4.8712 loss_meanflow=0.0000 mean_model_t=0.2082 mean_corrupt_t=0.2082 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.5037 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.1521 corrupt_frac=1.0000 acc_corrupt=0.1521 loss_corrupt=4.8712 wrong_frac=0.7918 init_acc_corrupt=0.1187 acc_corrupt_t_0p0_0p2=0.0719 corrupt_frac_t_0p0_0p2=0.5629 acc_corrupt_t_0p2_0p4=0.2147 corrupt_frac_t_0p2_0p4=0.3502 acc_corrupt_t_0p4_0p6=0.4001 corrupt_frac_t_0p4_0p6=0.0768 out_w_norm=7.9804 out_g_norm=0.4280 acc_corrupt_t_0p6_0p8=0.5625 corrupt_frac_t_0p6_0p8=0.0133 loss_all=4.5656 init_gold_top10=0.2029 init_gold_top100=0.5925 rollout_applied_pos_frac=0.4922 init_acc_rollout_applied=0.1124 init_acc_rollout_kept=0.1078 logit_acc_rollout_applied=0.1600 logit_acc_rollout_kept=0.1533
|
| 219 |
+
step=700 epoch=700/1000 epoch_step=1/1 micro_steps=700 elapsed=24.1s lr=2.000000e-03 loss=4.2343 loss_recon=4.2343 loss_meanflow=0.0000 mean_model_t=0.2085 mean_corrupt_t=0.2085 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.5123 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.1770 corrupt_frac=1.0000 acc_corrupt=0.1770 loss_corrupt=4.2343 wrong_frac=0.7915 init_acc_corrupt=0.1192 acc_corrupt_t_0p0_0p2=0.0787 corrupt_frac_t_0p0_0p2=0.5591 acc_corrupt_t_0p2_0p4=0.2531 corrupt_frac_t_0p2_0p4=0.3563 acc_corrupt_t_0p4_0p6=0.4880 corrupt_frac_t_0p4_0p6=0.0764 out_w_norm=9.1564 out_g_norm=0.4726 acc_corrupt_t_0p6_0p8=0.6622 corrupt_frac_t_0p6_0p8=0.0125 acc_corrupt_t_0p8_1p0=0.8376 corrupt_frac_t_0p8_1p0=0.0078 loss_all=3.9427 init_gold_top10=0.2206 init_gold_top100=0.6230 rollout_applied_pos_frac=0.5156 init_acc_rollout_applied=0.1283 init_acc_rollout_kept=0.1214 logit_acc_rollout_applied=0.1920 logit_acc_rollout_kept=0.1924
|
| 220 |
+
step=800 epoch=800/1000 epoch_step=1/1 micro_steps=800 elapsed=23.9s lr=2.000000e-03 loss=3.7296 loss_recon=3.7296 loss_meanflow=0.0000 mean_model_t=0.2092 mean_corrupt_t=0.2092 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.5020 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.2008 corrupt_frac=1.0000 acc_corrupt=0.2008 loss_corrupt=3.7296 wrong_frac=0.7911 init_acc_corrupt=0.1208 acc_corrupt_t_0p0_0p2=0.0884 corrupt_frac_t_0p0_0p2=0.5509 acc_corrupt_t_0p2_0p4=0.2944 corrupt_frac_t_0p2_0p4=0.3674 acc_corrupt_t_0p4_0p6=0.5237 corrupt_frac_t_0p4_0p6=0.0748 acc_corrupt_t_0p6_0p8=0.6942 corrupt_frac_t_0p6_0p8=0.0126 out_w_norm=9.9997 out_g_norm=0.5989 acc_corrupt_t_0p8_1p0=0.8555 corrupt_frac_t_0p8_1p0=0.0078 loss_all=3.4499 init_gold_top10=0.2440 init_gold_top100=0.5899 rollout_applied_pos_frac=0.4531 init_acc_rollout_applied=0.1209 init_acc_rollout_kept=0.1210 logit_acc_rollout_applied=0.2120 logit_acc_rollout_kept=0.2282
|
| 221 |
+
step=900 epoch=900/1000 epoch_step=1/1 micro_steps=900 elapsed=24.0s lr=2.000000e-03 loss=3.2983 loss_recon=3.2983 loss_meanflow=0.0000 mean_model_t=0.2095 mean_corrupt_t=0.2095 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.5052 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.2379 corrupt_frac=1.0000 acc_corrupt=0.2379 loss_corrupt=3.2983 wrong_frac=0.7905 init_acc_corrupt=0.1230 acc_corrupt_t_0p0_0p2=0.1028 corrupt_frac_t_0p0_0p2=0.5516 acc_corrupt_t_0p2_0p4=0.3571 corrupt_frac_t_0p2_0p4=0.3645 acc_corrupt_t_0p4_0p6=0.5909 corrupt_frac_t_0p4_0p6=0.0748 acc_corrupt_t_0p6_0p8=0.7411 corrupt_frac_t_0p6_0p8=0.0123 out_w_norm=10.4993 out_g_norm=0.8729 acc_corrupt_t_0p8_1p0=0.8477 corrupt_frac_t_0p8_1p0=0.0078 loss_all=3.0155 init_gold_top10=0.2915 init_gold_top100=0.6315 rollout_applied_pos_frac=0.4766 init_acc_rollout_applied=0.1272 init_acc_rollout_kept=0.1170 logit_acc_rollout_applied=0.2742 logit_acc_rollout_kept=0.2656
|
| 222 |
+
step=1000 epoch=1000/1000 epoch_step=1/1 micro_steps=1000 elapsed=23.8s lr=2.000000e-03 loss=2.8778 loss_recon=2.8778 loss_meanflow=0.0000 mean_model_t=0.2077 mean_corrupt_t=0.2077 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.4952 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.2973 corrupt_frac=1.0000 acc_corrupt=0.2973 loss_corrupt=2.8778 wrong_frac=0.7924 init_acc_corrupt=0.1228 acc_corrupt_t_0p0_0p2=0.1297 corrupt_frac_t_0p0_0p2=0.5567 acc_corrupt_t_0p2_0p4=0.4607 corrupt_frac_t_0p2_0p4=0.3598 acc_corrupt_t_0p4_0p6=0.6989 corrupt_frac_t_0p4_0p6=0.0751 acc_corrupt_t_0p6_0p8=0.8142 corrupt_frac_t_0p6_0p8=0.0128 out_w_norm=10.8390 out_g_norm=1.0522 loss_all=2.7461 init_gold_top10=0.3628 init_gold_top100=0.6665 rollout_applied_pos_frac=0.5234 init_acc_rollout_applied=0.1368 init_acc_rollout_kept=0.0965 logit_acc_rollout_applied=0.3638 logit_acc_rollout_kept=0.2733
|
| 223 |
+
[ctx1024-sampleds] eval config=p50_path4_unif0_0p125_outwdm1 step=1000
|
| 224 |
+
[eval-decode-acc] train8_ctx1024_p50_path4_unif0_0p125_outwdm1_ctx1024_sampledpath_true_20260517_224139 step=1000 soft=none
|
| 225 |
+
[decode] max_len=1024 generated=64/64
|
| 226 |
+
{
|
| 227 |
+
"num_rows": 1,
|
| 228 |
+
"best_by_run": {
|
| 229 |
+
"train8_ctx1024_p50_path4_unif0_0p125_outwdm1_ctx1024_sampledpath_true_20260517_224139::none": {
|
| 230 |
+
"run": "train8_ctx1024_p50_path4_unif0_0p125_outwdm1_ctx1024_sampledpath_true_20260517_224139",
|
| 231 |
+
"checkpoint": "runs/train8_ctx1024_p50_path4_unif0_0p125_outwdm1_ctx1024_sampledpath_true_20260517_224139/step_0001000.pt",
|
| 232 |
+
"ckpt_step": 1000,
|
| 233 |
+
"endpoint_softening": "none",
|
| 234 |
+
"decode_rule": "flowmap",
|
| 235 |
+
"steps": 128,
|
| 236 |
+
"time_schedule": "logit_normal",
|
| 237 |
+
"model_t_mode": "post",
|
| 238 |
+
"final_from": "state",
|
| 239 |
+
"n_gen": 64,
|
| 240 |
+
"n_refs": 8,
|
| 241 |
+
"token_acc_mean": 0.0288543701171875,
|
| 242 |
+
"token_acc_min": 0.01953125,
|
| 243 |
+
"token_acc_max": 0.0478515625,
|
| 244 |
+
"exact_acc": 0.0,
|
| 245 |
+
"exact_count": 0,
|
| 246 |
+
"exact_ref_coverage": 0.0,
|
| 247 |
+
"exact_ref_count": 0,
|
| 248 |
+
"exact_ref_hits": [],
|
| 249 |
+
"best_ref_idx": [
|
| 250 |
+
7,
|
| 251 |
+
2,
|
| 252 |
+
5,
|
| 253 |
+
7,
|
| 254 |
+
7,
|
| 255 |
+
7,
|
| 256 |
+
2,
|
| 257 |
+
0,
|
| 258 |
+
7,
|
| 259 |
+
3,
|
| 260 |
+
7,
|
| 261 |
+
4,
|
| 262 |
+
2,
|
| 263 |
+
7,
|
| 264 |
+
7,
|
| 265 |
+
5,
|
| 266 |
+
7,
|
| 267 |
+
2,
|
| 268 |
+
1,
|
| 269 |
+
0,
|
| 270 |
+
7,
|
| 271 |
+
2,
|
| 272 |
+
7,
|
| 273 |
+
5,
|
| 274 |
+
7,
|
| 275 |
+
0,
|
| 276 |
+
5,
|
| 277 |
+
5,
|
| 278 |
+
7,
|
| 279 |
+
7,
|
| 280 |
+
5,
|
| 281 |
+
7,
|
| 282 |
+
4,
|
| 283 |
+
7,
|
| 284 |
+
7,
|
| 285 |
+
5,
|
| 286 |
+
5,
|
| 287 |
+
4,
|
| 288 |
+
7,
|
| 289 |
+
5,
|
| 290 |
+
2,
|
| 291 |
+
7,
|
| 292 |
+
5,
|
| 293 |
+
7,
|
| 294 |
+
7,
|
| 295 |
+
7,
|
| 296 |
+
7,
|
| 297 |
+
4,
|
| 298 |
+
4,
|
| 299 |
+
5,
|
| 300 |
+
7,
|
| 301 |
+
2,
|
| 302 |
+
4,
|
| 303 |
+
7,
|
| 304 |
+
0,
|
| 305 |
+
7,
|
| 306 |
+
7,
|
| 307 |
+
5,
|
| 308 |
+
0,
|
| 309 |
+
4,
|
| 310 |
+
0,
|
| 311 |
+
7,
|
| 312 |
+
4,
|
| 313 |
+
5
|
| 314 |
+
],
|
| 315 |
+
"best_token_acc": [
|
| 316 |
+
0.0263671875,
|
| 317 |
+
0.021484375,
|
| 318 |
+
0.029296875,
|
| 319 |
+
0.01953125,
|
| 320 |
+
0.029296875,
|
| 321 |
+
0.0283203125,
|
| 322 |
+
0.0234375,
|
| 323 |
+
0.021484375,
|
| 324 |
+
0.0263671875,
|
| 325 |
+
0.0283203125,
|
| 326 |
+
0.0234375,
|
| 327 |
+
0.025390625,
|
| 328 |
+
0.0302734375,
|
| 329 |
+
0.0263671875,
|
| 330 |
+
0.0263671875,
|
| 331 |
+
0.0263671875,
|
| 332 |
+
0.0263671875,
|
| 333 |
+
0.0234375,
|
| 334 |
+
0.0283203125,
|
| 335 |
+
0.0302734375,
|
| 336 |
+
0.033203125,
|
| 337 |
+
0.0380859375,
|
| 338 |
+
0.0263671875,
|
| 339 |
+
0.0283203125,
|
| 340 |
+
0.02734375,
|
| 341 |
+
0.03515625,
|
| 342 |
+
0.044921875,
|
| 343 |
+
0.0419921875,
|
| 344 |
+
0.03125,
|
| 345 |
+
0.0283203125,
|
| 346 |
+
0.0478515625,
|
| 347 |
+
0.0224609375,
|
| 348 |
+
0.0263671875,
|
| 349 |
+
0.0283203125,
|
| 350 |
+
0.033203125,
|
| 351 |
+
0.041015625,
|
| 352 |
+
0.03125,
|
| 353 |
+
0.0224609375,
|
| 354 |
+
0.03125,
|
| 355 |
+
0.03515625,
|
| 356 |
+
0.0244140625,
|
| 357 |
+
0.0263671875,
|
| 358 |
+
0.0302734375,
|
| 359 |
+
0.0234375,
|
| 360 |
+
0.01953125,
|
| 361 |
+
0.021484375,
|
| 362 |
+
0.0224609375,
|
| 363 |
+
0.0322265625,
|
| 364 |
+
0.025390625,
|
| 365 |
+
0.0322265625,
|
| 366 |
+
0.029296875,
|
| 367 |
+
0.033203125,
|
| 368 |
+
0.0244140625,
|
| 369 |
+
0.033203125,
|
| 370 |
+
0.02734375,
|
| 371 |
+
0.041015625,
|
| 372 |
+
0.025390625,
|
| 373 |
+
0.0302734375,
|
| 374 |
+
0.0234375,
|
| 375 |
+
0.02734375,
|
| 376 |
+
0.0244140625,
|
| 377 |
+
0.025390625,
|
| 378 |
+
0.025390625,
|
| 379 |
+
0.0439453125
|
| 380 |
+
]
|
| 381 |
+
}
|
| 382 |
+
},
|
| 383 |
+
"first_exact_by_run": {}
|
| 384 |
+
}
|
| 385 |
+
RESULT config=p50_path4_unif0_0p125_outwdm1 run=train8_ctx1024_p50_path4_unif0_0p125_outwdm1_ctx1024_sampledpath_true_20260517_224139 ckpt_step=1000 views=512000 token_acc=0.0289 exact=0/64 exact_refs=0 hits=[]
|
| 386 |
+
[ctx1024-sampleds] train config=p50_path4_unif0_0p125_outwdm1 from=1000 to=2000
|
| 387 |
+
[launch] gpt2 cached OWT soft-endpoint m/n pilot
|
| 388 |
+
[launch] run_name=train8_ctx1024_p50_path4_unif0_0p125_outwdm1_ctx1024_sampledpath_true_20260517_224139
|
| 389 |
+
[launch] save_dir=runs/train8_ctx1024_p50_path4_unif0_0p125_outwdm1_ctx1024_sampledpath_true_20260517_224139
|
| 390 |
+
[launch] n=1024 m=0 clean_state_mode=onehot
|
| 391 |
+
[launch] mask_mixture lowk=0.0 all=1.0
|
| 392 |
+
[launch] model d=192 layers=3 heads=3 ff=768 vocab_override=2664
|
| 393 |
+
[launch] optimizer=muon muon_impl=legacy weight_decay=0.1 output_weight_decay=-1
|
| 394 |
+
[launch] target_loss=hard_ce conf=0.0->1.0 power=1.0
|
| 395 |
+
[launch] mask_ratio=1.0->1.0
|
| 396 |
+
[launch] mask_ratio_floor_schedule=none
|
| 397 |
+
[launch] dirichlet C=1.0->1024 endpoint=categorical_dual_t sampler=dirichlet
|
| 398 |
+
[launch] wrong_mix seq_alpha=0.0 wrong_floor=0.0 unigram=0.0 uniform=0.0 basin=0.0 basin_ids=
|
| 399 |
+
[launch] rollout_train prob=0.50 mode=sampled_path steps=4 infer_steps=1 s_dist=uniform s_frac=0.0->0.125 temp=1.45 corrupt_only=1 samplewise=1 selected_only=1 sync_t=1
|
| 400 |
+
[launch] cache=/e2e-data/evad-tech-vla/wanghan58/data/small_benchmarks/langflow_2604_11748/openwebtext_lta_cached_chunks/gpt2_len1024_train8_compact_overfit exact_repeat_per_chunk=64
|
| 401 |
+
[launch] resume_path=runs/train8_ctx1024_p50_path4_unif0_0p125_outwdm1_ctx1024_sampledpath_true_20260517_224139/latest.pt
|
| 402 |
+
NCCL version 2.25.1+cuda12.8
|
| 403 |
+
resumed_from=runs/train8_ctx1024_p50_path4_unif0_0p125_outwdm1_ctx1024_sampledpath_true_20260517_224139/latest.pt start_step=1001
|
| 404 |
+
{
|
| 405 |
+
"device": "cuda:0",
|
| 406 |
+
"rank": 0,
|
| 407 |
+
"world_size": 4,
|
| 408 |
+
"samples": "owt_cached_chunks:8",
|
| 409 |
+
"vocab_size": 2664,
|
| 410 |
+
"tokenizer_vocab_size": 50257,
|
| 411 |
+
"save_dir": "runs/train8_ctx1024_p50_path4_unif0_0p125_outwdm1_ctx1024_sampledpath_true_20260517_224139",
|
| 412 |
+
"batch_size": 128,
|
| 413 |
+
"grad_accum": 1,
|
| 414 |
+
"effective_batch_size": 512,
|
| 415 |
+
"global_batch_size": 512,
|
| 416 |
+
"lr_schedule": "constant_warmup",
|
| 417 |
+
"optimizer": "muon",
|
| 418 |
+
"epochs": 0.0,
|
| 419 |
+
"steps_per_epoch": 1,
|
| 420 |
+
"total_steps": 2000,
|
| 421 |
+
"warmup_steps": 10,
|
| 422 |
+
"warmup_epochs": -1.0,
|
| 423 |
+
"min_lr": 0.0,
|
| 424 |
+
"weight_decay": 0.1,
|
| 425 |
+
"output_weight_decay": -1.0,
|
| 426 |
+
"adamw_param_groups": "nanogpt",
|
| 427 |
+
"adam_beta1": 0.9,
|
| 428 |
+
"adam_beta2": 0.95,
|
| 429 |
+
"adam_eps": 1e-08,
|
| 430 |
+
"muon_impl": "legacy",
|
| 431 |
+
"muon_momentum": 0.95,
|
| 432 |
+
"muon_ns_steps": 5,
|
| 433 |
+
"muon_update_scale": 1.0,
|
| 434 |
+
"muon_nesterov": false,
|
| 435 |
+
"muon_width_scale": false,
|
| 436 |
+
"muon_grouping": "legacy_dim_ge_2",
|
| 437 |
+
"muon_param_count": 2616320,
|
| 438 |
+
"muon_adam_param_count": 8192,
|
| 439 |
+
"muon_param_names": [
|
| 440 |
+
"vocab_embed.embedding",
|
| 441 |
+
"sigma_map.net.0.weight",
|
| 442 |
+
"sigma_map.net.2.weight",
|
| 443 |
+
"blocks.0.attn_qkv.weight",
|
| 444 |
+
"blocks.0.attn_out.weight",
|
| 445 |
+
"blocks.0.mlp.0.weight",
|
| 446 |
+
"blocks.0.mlp.2.weight",
|
| 447 |
+
"blocks.0.adaLN_modulation.weight",
|
| 448 |
+
"blocks.1.attn_qkv.weight",
|
| 449 |
+
"blocks.1.attn_out.weight",
|
| 450 |
+
"blocks.1.mlp.0.weight",
|
| 451 |
+
"blocks.1.mlp.2.weight",
|
| 452 |
+
"blocks.1.adaLN_modulation.weight",
|
| 453 |
+
"blocks.2.attn_qkv.weight",
|
| 454 |
+
"blocks.2.attn_out.weight",
|
| 455 |
+
"blocks.2.mlp.0.weight",
|
| 456 |
+
"blocks.2.mlp.2.weight",
|
| 457 |
+
"blocks.2.adaLN_modulation.weight",
|
| 458 |
+
"output_layer.linear.weight",
|
| 459 |
+
"output_layer.adaLN_modulation.weight"
|
| 460 |
+
],
|
| 461 |
+
"muon_adam_param_names": [
|
| 462 |
+
"sigma_map.net.0.bias",
|
| 463 |
+
"sigma_map.net.2.bias",
|
| 464 |
+
"blocks.0.norm1.weight",
|
| 465 |
+
"blocks.0.norm2.weight",
|
| 466 |
+
"blocks.0.mlp.0.bias",
|
| 467 |
+
"blocks.0.mlp.2.bias",
|
| 468 |
+
"blocks.0.adaLN_modulation.bias",
|
| 469 |
+
"blocks.1.norm1.weight",
|
| 470 |
+
"blocks.1.norm2.weight",
|
| 471 |
+
"blocks.1.mlp.0.bias",
|
| 472 |
+
"blocks.1.mlp.2.bias",
|
| 473 |
+
"blocks.1.adaLN_modulation.bias",
|
| 474 |
+
"blocks.2.norm1.weight",
|
| 475 |
+
"blocks.2.norm2.weight",
|
| 476 |
+
"blocks.2.mlp.0.bias",
|
| 477 |
+
"blocks.2.mlp.2.bias",
|
| 478 |
+
"blocks.2.adaLN_modulation.bias",
|
| 479 |
+
"output_layer.norm_final.weight",
|
| 480 |
+
"output_layer.adaLN_modulation.bias"
|
| 481 |
+
],
|
| 482 |
+
"muon_effective_nesterov": false,
|
| 483 |
+
"muon_effective_width_scale": false,
|
| 484 |
+
"muon_effective_weight_decay": 0.1,
|
| 485 |
+
"muon_adam_fallback_nesterov": false,
|
| 486 |
+
"muon_adam_fallback_weight_decay": 0.1,
|
| 487 |
+
"ema_decay": 0.9999,
|
| 488 |
+
"ema_start_step": 0,
|
| 489 |
+
"model_type": "ddit",
|
| 490 |
+
"ddit_mlp_type": "gelu",
|
| 491 |
+
"elf_num_time_tokens": 4,
|
| 492 |
+
"elf_num_model_mode_tokens": 0,
|
| 493 |
+
"qk_norm": true,
|
| 494 |
+
"output_bias": false,
|
| 495 |
+
"output_init_std": -1.0,
|
| 496 |
+
"norm_type": "rmsnorm",
|
| 497 |
+
"target_loss": "hard_ce",
|
| 498 |
+
"linear_soft_target_power": 1.0,
|
| 499 |
+
"linear_soft_target_min_conf": 0.0,
|
| 500 |
+
"linear_soft_target_max_conf": 1.0,
|
| 501 |
+
"t_sampling_mode": "logit_normal",
|
| 502 |
+
"t_sampling_power": 1.0,
|
| 503 |
+
"t_sampling_eps": 0.0001,
|
| 504 |
+
"t_sampling_logit_mean": -1.5,
|
| 505 |
+
"t_sampling_logit_std": 0.8,
|
| 506 |
+
"dual_t": true,
|
| 507 |
+
"corrupt_t_mode": "same",
|
| 508 |
+
"corrupt_min_t": 0.0,
|
| 509 |
+
"corrupt_max_t": 1.0,
|
| 510 |
+
"prefix_block_prob": 0.0,
|
| 511 |
+
"prefix_block_len": 128,
|
| 512 |
+
"mask_ratio_floor_schedule": "none",
|
| 513 |
+
"dirichlet_endpoint_mode": "categorical_dual_t",
|
| 514 |
+
"dirichlet_semantic_t_mode": "same",
|
| 515 |
+
"dirichlet_semantic_t_value": 0.0,
|
| 516 |
+
"dirichlet_semantic_t_curve": "linear",
|
| 517 |
+
"dirichlet_semantic_t_power": 1.0,
|
| 518 |
+
"endpoint_sequence_random_prob_alpha": 0.0,
|
| 519 |
+
"categorical_wrong_from_full_vocab": true,
|
| 520 |
+
"categorical_wrong_from_batch_valid_tokens": false,
|
| 521 |
+
"categorical_wrong_basin_token_ids": "",
|
| 522 |
+
"categorical_wrong_basin_prob": 0.0,
|
| 523 |
+
"categorical_wrong_unigram_prob": 0.0,
|
| 524 |
+
"categorical_wrong_uniform_prob": 0.0,
|
| 525 |
+
"categorical_wrong_prob_floor": 0.0,
|
| 526 |
+
"categorical_wrong_corpus_unigram_path": "",
|
| 527 |
+
"categorical_wrong_corpus_unigram_alpha": 1.0,
|
| 528 |
+
"categorical_wrong_basin_shared_prob": 0.0,
|
| 529 |
+
"categorical_wrong_unigram_shared_prob": 0.0,
|
| 530 |
+
"mask_mixture_original_prob": 0.0,
|
| 531 |
+
"mask_mixture_lowk_prob": 0.0,
|
| 532 |
+
"mask_mixture_lowcorrupt_prob": 0.0,
|
| 533 |
+
"mask_mixture_block_prob": 0.0,
|
| 534 |
+
"mask_mixture_all_prob": 1.0,
|
| 535 |
+
"mask_mixture_lowk_clean_tokens": "0",
|
| 536 |
+
"mask_mixture_lowcorrupt_tokens": "1,2,4,8,16,32,64",
|
| 537 |
+
"mask_mixture_block_tokens": "64,128",
|
| 538 |
+
"simplex_bridge_sampler": "dirichlet",
|
| 539 |
+
"logistic_normal_sigma_min": 0.1,
|
| 540 |
+
"logistic_normal_sigma_max": 1.0,
|
| 541 |
+
"logistic_normal_tau_min": 1.0,
|
| 542 |
+
"logistic_normal_tau_max": 1.0,
|
| 543 |
+
"torch_compile": false,
|
| 544 |
+
"compile_mode": "max-autotune",
|
| 545 |
+
"state_format": "prob",
|
| 546 |
+
"meanflow_weight": 0.0,
|
| 547 |
+
"rollout_train_prob": 0.5,
|
| 548 |
+
"rollout_train_steps": 4,
|
| 549 |
+
"rollout_train_infer_steps": 1,
|
| 550 |
+
"rollout_train_time_mode": "sampled_path",
|
| 551 |
+
"rollout_train_s_dist": "uniform",
|
| 552 |
+
"rollout_train_s_min_frac": 0.0,
|
| 553 |
+
"rollout_train_s_max_frac": 0.125,
|
| 554 |
+
"rollout_train_s_beta_alpha": 2.0,
|
| 555 |
+
"rollout_train_s_beta_beta": 6.0,
|
| 556 |
+
"rollout_train_temp": 1.45,
|
| 557 |
+
"rollout_train_max_gamma": 1.0,
|
| 558 |
+
"rollout_train_corrupt_only": true,
|
| 559 |
+
"rollout_train_samplewise": true,
|
| 560 |
+
"rollout_train_compute_always": false,
|
| 561 |
+
"rollout_train_sync_t": true,
|
| 562 |
+
"bridge_noise_init": "logistic_normal",
|
| 563 |
+
"noise_sigma": -1.0,
|
| 564 |
+
"allow_tf32": true,
|
| 565 |
+
"activation_checkpointing": false,
|
| 566 |
+
"activation_checkpoint_interval": 1,
|
| 567 |
+
"activation_checkpoint_scope": "block",
|
| 568 |
+
"ddp_static_graph": false,
|
| 569 |
+
"ddp_gradient_as_bucket_view": true,
|
| 570 |
+
"blocking_data_transfer": false,
|
| 571 |
+
"dataloader_prefetch_factor": 4,
|
| 572 |
+
"full_train_stats": false,
|
| 573 |
+
"tokenized_hf": false,
|
| 574 |
+
"tokenized_pad_token": "pad",
|
| 575 |
+
"elf_conditional_hf": false,
|
| 576 |
+
"record_pad_truncate": false,
|
| 577 |
+
"record_add_eos": false,
|
| 578 |
+
"record_add_special_tokens": false,
|
| 579 |
+
"record_pad_token": "pad",
|
| 580 |
+
"record_shuffle_buffer": 10000,
|
| 581 |
+
"wrap": true,
|
| 582 |
+
"wrap_mode": "stream",
|
| 583 |
+
"wrap_record_buffer_size": 200,
|
| 584 |
+
"owt_cached_chunks": true,
|
| 585 |
+
"owt_chunk_cache_dir": "/e2e-data/evad-tech-vla/wanghan58/data/small_benchmarks/langflow_2604_11748/openwebtext_lta_cached_chunks/gpt2_len1024_train8_compact_overfit",
|
| 586 |
+
"owt_chunk_cache_rebuild": false,
|
| 587 |
+
"owt_chunk_cache_write_batch": 4096,
|
| 588 |
+
"owt_exact_repeat_per_chunk": 64,
|
| 589 |
+
"online_chunk_shuffle": false,
|
| 590 |
+
"online_chunk_shuffle_buffer": 10000,
|
| 591 |
+
"openwebtext_split": "train_minus_100k",
|
| 592 |
+
"detokenizer": "auto",
|
| 593 |
+
"resolved_detokenizer": null,
|
| 594 |
+
"num_workers": 0,
|
| 595 |
+
"latest_every": 1000,
|
| 596 |
+
"resume_path": "runs/train8_ctx1024_p50_path4_unif0_0p125_outwdm1_ctx1024_sampledpath_true_20260517_224139/latest.pt"
|
| 597 |
+
}
|
| 598 |
+
step=1100 epoch=1100/2000 epoch_step=1/1 micro_steps=1100 elapsed=24.6s lr=2.000000e-03 loss=2.4561 loss_recon=2.4561 loss_meanflow=0.0000 mean_model_t=0.2085 mean_corrupt_t=0.2085 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.5077 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.3788 corrupt_frac=1.0000 acc_corrupt=0.3788 loss_corrupt=2.4561 wrong_frac=0.7915 init_acc_corrupt=0.1281 acc_corrupt_t_0p0_0p2=0.1728 corrupt_frac_t_0p0_0p2=0.5640 acc_corrupt_t_0p2_0p4=0.6004 corrupt_frac_t_0p2_0p4=0.3466 acc_corrupt_t_0p4_0p6=0.8111 corrupt_frac_t_0p4_0p6=0.0791 acc_corrupt_t_0p6_0p8=0.8828 corrupt_frac_t_0p6_0p8=0.0136 out_w_norm=11.1283 out_g_norm=1.2622 acc_corrupt_t_0p8_1p0=0.9307 corrupt_frac_t_0p8_1p0=0.0078 loss_all=2.1052 init_gold_top10=0.4033 init_gold_top100=0.6222 rollout_applied_pos_frac=0.4453 init_acc_rollout_applied=0.1418 init_acc_rollout_kept=0.1192 logit_acc_rollout_applied=0.4907 logit_acc_rollout_kept=0.4183
|
| 599 |
+
step=1200 epoch=1200/2000 epoch_step=1/1 micro_steps=1200 elapsed=23.7s lr=2.000000e-03 loss=2.0694 loss_recon=2.0694 loss_meanflow=0.0000 mean_model_t=0.2096 mean_corrupt_t=0.2096 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.4995 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.4665 corrupt_frac=1.0000 acc_corrupt=0.4665 loss_corrupt=2.0694 wrong_frac=0.7905 init_acc_corrupt=0.1362 acc_corrupt_t_0p0_0p2=0.2250 corrupt_frac_t_0p0_0p2=0.5557 acc_corrupt_t_0p2_0p4=0.7365 corrupt_frac_t_0p2_0p4=0.3595 acc_corrupt_t_0p4_0p6=0.9014 corrupt_frac_t_0p4_0p6=0.0762 acc_corrupt_t_0p6_0p8=0.9335 corrupt_frac_t_0p6_0p8=0.0129 out_w_norm=11.3914 out_g_norm=1.4567 acc_corrupt_t_0p8_1p0=0.9663 corrupt_frac_t_0p8_1p0=0.0078 loss_all=1.9323 init_gold_top10=0.4233 init_gold_top100=0.6392 rollout_applied_pos_frac=0.4688 init_acc_rollout_applied=0.1810 init_acc_rollout_kept=0.1215 logit_acc_rollout_applied=0.5332 logit_acc_rollout_kept=0.4872
|
| 600 |
+
step=1300 epoch=1300/2000 epoch_step=1/1 micro_steps=1300 elapsed=23.7s lr=2.000000e-03 loss=1.7590 loss_recon=1.7590 loss_meanflow=0.0000 mean_model_t=0.2098 mean_corrupt_t=0.2098 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.5023 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.5421 corrupt_frac=1.0000 acc_corrupt=0.5421 loss_corrupt=1.7590 wrong_frac=0.7902 init_acc_corrupt=0.1490 acc_corrupt_t_0p0_0p2=0.2864 corrupt_frac_t_0p0_0p2=0.5544 acc_corrupt_t_0p2_0p4=0.8375 corrupt_frac_t_0p2_0p4=0.3617 acc_corrupt_t_0p4_0p6=0.9568 corrupt_frac_t_0p4_0p6=0.0743 out_w_norm=11.5756 out_g_norm=1.5131 acc_corrupt_t_0p6_0p8=0.9688 corrupt_frac_t_0p6_0p8=0.0139 acc_corrupt_t_0p8_1p0=0.9624 corrupt_frac_t_0p8_1p0=0.0078 loss_all=1.4657 init_gold_top10=0.4631 init_gold_top100=0.6198 rollout_applied_pos_frac=0.4375 init_acc_rollout_applied=0.1669 init_acc_rollout_kept=0.1243 logit_acc_rollout_applied=0.6838 logit_acc_rollout_kept=0.5458
|
| 601 |
+
step=1400 epoch=1400/2000 epoch_step=1/1 micro_steps=1400 elapsed=23.7s lr=2.000000e-03 loss=1.5170 loss_recon=1.5170 loss_meanflow=0.0000 mean_model_t=0.2072 mean_corrupt_t=0.2072 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.5030 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.6018 corrupt_frac=1.0000 acc_corrupt=0.6018 loss_corrupt=1.5170 wrong_frac=0.7929 init_acc_corrupt=0.1570 acc_corrupt_t_0p0_0p2=0.3584 corrupt_frac_t_0p0_0p2=0.5638 acc_corrupt_t_0p2_0p4=0.9009 corrupt_frac_t_0p2_0p4=0.3526 acc_corrupt_t_0p4_0p6=0.9811 corrupt_frac_t_0p4_0p6=0.0753 out_w_norm=11.6982 out_g_norm=1.5191 acc_corrupt_t_0p6_0p8=0.9840 corrupt_frac_t_0p6_0p8=0.0128 acc_corrupt_t_0p8_1p0=0.9805 corrupt_frac_t_0p8_1p0=0.0117 loss_all=1.3672 init_gold_top10=0.4951 init_gold_top100=0.6454 rollout_applied_pos_frac=0.4844 init_acc_rollout_applied=0.2290 init_acc_rollout_kept=0.1042 logit_acc_rollout_applied=0.7189 logit_acc_rollout_kept=0.5401
|
| 602 |
+
step=1500 epoch=1500/2000 epoch_step=1/1 micro_steps=1500 elapsed=23.7s lr=2.000000e-03 loss=1.2915 loss_recon=1.2915 loss_meanflow=0.0000 mean_model_t=0.2101 mean_corrupt_t=0.2101 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.4994 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.6540 corrupt_frac=1.0000 acc_corrupt=0.6540 loss_corrupt=1.2915 wrong_frac=0.7898 init_acc_corrupt=0.1734 acc_corrupt_t_0p0_0p2=0.4134 corrupt_frac_t_0p0_0p2=0.5506 acc_corrupt_t_0p2_0p4=0.9392 corrupt_frac_t_0p2_0p4=0.3660 acc_corrupt_t_0p4_0p6=0.9911 corrupt_frac_t_0p4_0p6=0.0752 acc_corrupt_t_0p6_0p8=0.9905 corrupt_frac_t_0p6_0p8=0.0128 out_w_norm=11.7810 out_g_norm=1.4652 acc_corrupt_t_0p8_1p0=0.9917 corrupt_frac_t_0p8_1p0=0.0078 loss_all=1.0025 init_gold_top10=0.5435 init_gold_top100=0.6617 rollout_applied_pos_frac=0.4922 init_acc_rollout_applied=0.2334 init_acc_rollout_kept=0.1486 logit_acc_rollout_applied=0.7939 logit_acc_rollout_kept=0.6810
|
| 603 |
+
step=1600 epoch=1600/2000 epoch_step=1/1 micro_steps=1600 elapsed=23.7s lr=2.000000e-03 loss=1.1774 loss_recon=1.1774 loss_meanflow=0.0000 mean_model_t=0.2082 mean_corrupt_t=0.2082 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.5037 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.6826 corrupt_frac=1.0000 acc_corrupt=0.6826 loss_corrupt=1.1774 wrong_frac=0.7918 init_acc_corrupt=0.1812 acc_corrupt_t_0p0_0p2=0.4607 corrupt_frac_t_0p0_0p2=0.5629 acc_corrupt_t_0p2_0p4=0.9616 corrupt_frac_t_0p2_0p4=0.3502 acc_corrupt_t_0p4_0p6=0.9956 corrupt_frac_t_0p4_0p6=0.0768 out_w_norm=11.8241 out_g_norm=1.3619 acc_corrupt_t_0p6_0p8=0.9946 corrupt_frac_t_0p6_0p8=0.0133 loss_all=1.1961 init_gold_top10=0.5235 init_gold_top100=0.6535 rollout_applied_pos_frac=0.4922 init_acc_rollout_applied=0.2526 init_acc_rollout_kept=0.1078 logit_acc_rollout_applied=0.8093 logit_acc_rollout_kept=0.5727
|
| 604 |
+
step=1700 epoch=1700/2000 epoch_step=1/1 micro_steps=1700 elapsed=23.8s lr=2.000000e-03 loss=1.0426 loss_recon=1.0426 loss_meanflow=0.0000 mean_model_t=0.2085 mean_corrupt_t=0.2085 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.5123 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.7140 corrupt_frac=1.0000 acc_corrupt=0.7140 loss_corrupt=1.0426 wrong_frac=0.7915 init_acc_corrupt=0.1897 acc_corrupt_t_0p0_0p2=0.5042 corrupt_frac_t_0p0_0p2=0.5591 acc_corrupt_t_0p2_0p4=0.9759 corrupt_frac_t_0p2_0p4=0.3563 acc_corrupt_t_0p4_0p6=0.9978 corrupt_frac_t_0p4_0p6=0.0764 out_w_norm=11.8438 out_g_norm=1.2704 acc_corrupt_t_0p6_0p8=0.9969 corrupt_frac_t_0p6_0p8=0.0125 acc_corrupt_t_0p8_1p0=0.9912 corrupt_frac_t_0p8_1p0=0.0078 loss_all=1.0589 init_gold_top10=0.5409 init_gold_top100=0.6740 rollout_applied_pos_frac=0.5156 init_acc_rollout_applied=0.2758 init_acc_rollout_kept=0.1214 logit_acc_rollout_applied=0.7361 logit_acc_rollout_kept=0.6505
|
| 605 |
+
step=1800 epoch=1800/2000 epoch_step=1/1 micro_steps=1800 elapsed=23.7s lr=2.000000e-03 loss=0.9326 loss_recon=0.9326 loss_meanflow=0.0000 mean_model_t=0.2092 mean_corrupt_t=0.2092 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.5020 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.7394 corrupt_frac=1.0000 acc_corrupt=0.7394 loss_corrupt=0.9326 wrong_frac=0.7911 init_acc_corrupt=0.1967 acc_corrupt_t_0p0_0p2=0.5387 corrupt_frac_t_0p0_0p2=0.5509 acc_corrupt_t_0p2_0p4=0.9829 corrupt_frac_t_0p2_0p4=0.3674 acc_corrupt_t_0p4_0p6=0.9988 corrupt_frac_t_0p4_0p6=0.0748 acc_corrupt_t_0p6_0p8=0.9967 corrupt_frac_t_0p6_0p8=0.0126 out_w_norm=11.8643 out_g_norm=1.1264 acc_corrupt_t_0p8_1p0=0.9866 corrupt_frac_t_0p8_1p0=0.0078 loss_all=0.7271 init_gold_top10=0.5306 init_gold_top100=0.6286 rollout_applied_pos_frac=0.4531 init_acc_rollout_applied=0.2711 init_acc_rollout_kept=0.1210 logit_acc_rollout_applied=0.8536 logit_acc_rollout_kept=0.7328
|
| 606 |
+
step=1900 epoch=1900/2000 epoch_step=1/1 micro_steps=1900 elapsed=23.8s lr=2.000000e-03 loss=0.8706 loss_recon=0.8706 loss_meanflow=0.0000 mean_model_t=0.2095 mean_corrupt_t=0.2095 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.5052 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.7491 corrupt_frac=1.0000 acc_corrupt=0.7491 loss_corrupt=0.8706 wrong_frac=0.7905 init_acc_corrupt=0.2009 acc_corrupt_t_0p0_0p2=0.5529 corrupt_frac_t_0p0_0p2=0.5516 acc_corrupt_t_0p2_0p4=0.9885 corrupt_frac_t_0p2_0p4=0.3645 acc_corrupt_t_0p4_0p6=0.9988 corrupt_frac_t_0p4_0p6=0.0748 acc_corrupt_t_0p6_0p8=0.9971 corrupt_frac_t_0p6_0p8=0.0123 out_w_norm=11.8558 out_g_norm=1.0997 acc_corrupt_t_0p8_1p0=0.9964 corrupt_frac_t_0p8_1p0=0.0078 loss_all=0.7654 init_gold_top10=0.5507 init_gold_top100=0.6459 rollout_applied_pos_frac=0.4766 init_acc_rollout_applied=0.2787 init_acc_rollout_kept=0.1170 logit_acc_rollout_applied=0.8564 logit_acc_rollout_kept=0.7075
|
| 607 |
+
step=2000 epoch=2000/2000 epoch_step=1/1 micro_steps=2000 elapsed=23.7s lr=2.000000e-03 loss=0.7684 loss_recon=0.7684 loss_meanflow=0.0000 mean_model_t=0.2077 mean_corrupt_t=0.2077 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.4952 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.7720 corrupt_frac=1.0000 acc_corrupt=0.7720 loss_corrupt=0.7684 wrong_frac=0.7924 init_acc_corrupt=0.1993 acc_corrupt_t_0p0_0p2=0.5971 corrupt_frac_t_0p0_0p2=0.5567 acc_corrupt_t_0p2_0p4=0.9900 corrupt_frac_t_0p2_0p4=0.3598 acc_corrupt_t_0p4_0p6=0.9990 corrupt_frac_t_0p4_0p6=0.0751 acc_corrupt_t_0p6_0p8=0.9969 corrupt_frac_t_0p6_0p8=0.0128 out_w_norm=11.8523 out_g_norm=1.1152 loss_all=0.7562 init_gold_top10=0.5871 init_gold_top100=0.6723 rollout_applied_pos_frac=0.5234 init_acc_rollout_applied=0.3057 init_acc_rollout_kept=0.0965 logit_acc_rollout_applied=0.8930 logit_acc_rollout_kept=0.6352
|
| 608 |
+
[ctx1024-sampleds] eval config=p50_path4_unif0_0p125_outwdm1 step=2000
|
| 609 |
+
[eval-decode-acc] train8_ctx1024_p50_path4_unif0_0p125_outwdm1_ctx1024_sampledpath_true_20260517_224139 step=2000 soft=none
|
| 610 |
+
[decode] max_len=1024 generated=64/64
|
| 611 |
+
{
|
| 612 |
+
"num_rows": 1,
|
| 613 |
+
"best_by_run": {
|
| 614 |
+
"train8_ctx1024_p50_path4_unif0_0p125_outwdm1_ctx1024_sampledpath_true_20260517_224139::none": {
|
| 615 |
+
"run": "train8_ctx1024_p50_path4_unif0_0p125_outwdm1_ctx1024_sampledpath_true_20260517_224139",
|
| 616 |
+
"checkpoint": "runs/train8_ctx1024_p50_path4_unif0_0p125_outwdm1_ctx1024_sampledpath_true_20260517_224139/step_0002000.pt",
|
| 617 |
+
"ckpt_step": 2000,
|
| 618 |
+
"endpoint_softening": "none",
|
| 619 |
+
"decode_rule": "flowmap",
|
| 620 |
+
"steps": 128,
|
| 621 |
+
"time_schedule": "logit_normal",
|
| 622 |
+
"model_t_mode": "post",
|
| 623 |
+
"final_from": "state",
|
| 624 |
+
"n_gen": 64,
|
| 625 |
+
"n_refs": 8,
|
| 626 |
+
"token_acc_mean": 0.86529541015625,
|
| 627 |
+
"token_acc_min": 0.2119140625,
|
| 628 |
+
"token_acc_max": 0.9921875,
|
| 629 |
+
"exact_acc": 0.0,
|
| 630 |
+
"exact_count": 0,
|
| 631 |
+
"exact_ref_coverage": 0.0,
|
| 632 |
+
"exact_ref_count": 0,
|
| 633 |
+
"exact_ref_hits": [],
|
| 634 |
+
"best_ref_idx": [
|
| 635 |
+
4,
|
| 636 |
+
4,
|
| 637 |
+
5,
|
| 638 |
+
4,
|
| 639 |
+
1,
|
| 640 |
+
7,
|
| 641 |
+
4,
|
| 642 |
+
4,
|
| 643 |
+
5,
|
| 644 |
+
1,
|
| 645 |
+
5,
|
| 646 |
+
1,
|
| 647 |
+
4,
|
| 648 |
+
1,
|
| 649 |
+
4,
|
| 650 |
+
4,
|
| 651 |
+
1,
|
| 652 |
+
5,
|
| 653 |
+
4,
|
| 654 |
+
1,
|
| 655 |
+
4,
|
| 656 |
+
4,
|
| 657 |
+
4,
|
| 658 |
+
4,
|
| 659 |
+
4,
|
| 660 |
+
4,
|
| 661 |
+
7,
|
| 662 |
+
4,
|
| 663 |
+
4,
|
| 664 |
+
1,
|
| 665 |
+
4,
|
| 666 |
+
4,
|
| 667 |
+
4,
|
| 668 |
+
1,
|
| 669 |
+
4,
|
| 670 |
+
4,
|
| 671 |
+
4,
|
| 672 |
+
4,
|
| 673 |
+
5,
|
| 674 |
+
4,
|
| 675 |
+
1,
|
| 676 |
+
4,
|
| 677 |
+
4,
|
| 678 |
+
3,
|
| 679 |
+
4,
|
| 680 |
+
5,
|
| 681 |
+
7,
|
| 682 |
+
7,
|
| 683 |
+
5,
|
| 684 |
+
4,
|
| 685 |
+
4,
|
| 686 |
+
4,
|
| 687 |
+
5,
|
| 688 |
+
4,
|
| 689 |
+
4,
|
| 690 |
+
4,
|
| 691 |
+
5,
|
| 692 |
+
4,
|
| 693 |
+
7,
|
| 694 |
+
4,
|
| 695 |
+
5,
|
| 696 |
+
4,
|
| 697 |
+
1,
|
| 698 |
+
4
|
| 699 |
+
],
|
| 700 |
+
"best_token_acc": [
|
| 701 |
+
0.962890625,
|
| 702 |
+
0.966796875,
|
| 703 |
+
0.9912109375,
|
| 704 |
+
0.3701171875,
|
| 705 |
+
0.78125,
|
| 706 |
+
0.7607421875,
|
| 707 |
+
0.9658203125,
|
| 708 |
+
0.9560546875,
|
| 709 |
+
0.9873046875,
|
| 710 |
+
0.376953125,
|
| 711 |
+
0.9921875,
|
| 712 |
+
0.970703125,
|
| 713 |
+
0.95703125,
|
| 714 |
+
0.72265625,
|
| 715 |
+
0.71484375,
|
| 716 |
+
0.7314453125,
|
| 717 |
+
0.9765625,
|
| 718 |
+
0.9892578125,
|
| 719 |
+
0.73828125,
|
| 720 |
+
0.861328125,
|
| 721 |
+
0.9521484375,
|
| 722 |
+
0.9716796875,
|
| 723 |
+
0.8662109375,
|
| 724 |
+
0.8330078125,
|
| 725 |
+
0.96875,
|
| 726 |
+
0.9482421875,
|
| 727 |
+
0.9853515625,
|
| 728 |
+
0.689453125,
|
| 729 |
+
0.962890625,
|
| 730 |
+
0.9775390625,
|
| 731 |
+
0.9580078125,
|
| 732 |
+
0.2119140625,
|
| 733 |
+
0.9345703125,
|
| 734 |
+
0.2138671875,
|
| 735 |
+
0.90625,
|
| 736 |
+
0.5400390625,
|
| 737 |
+
0.9736328125,
|
| 738 |
+
0.685546875,
|
| 739 |
+
0.9716796875,
|
| 740 |
+
0.8994140625,
|
| 741 |
+
0.974609375,
|
| 742 |
+
0.9736328125,
|
| 743 |
+
0.9248046875,
|
| 744 |
+
0.984375,
|
| 745 |
+
0.9677734375,
|
| 746 |
+
0.7060546875,
|
| 747 |
+
0.9345703125,
|
| 748 |
+
0.9228515625,
|
| 749 |
+
0.9912109375,
|
| 750 |
+
0.8466796875,
|
| 751 |
+
0.9599609375,
|
| 752 |
+
0.9638671875,
|
| 753 |
+
0.4013671875,
|
| 754 |
+
0.9033203125,
|
| 755 |
+
0.9736328125,
|
| 756 |
+
0.943359375,
|
| 757 |
+
0.9912109375,
|
| 758 |
+
0.9716796875,
|
| 759 |
+
0.98828125,
|
| 760 |
+
0.9716796875,
|
| 761 |
+
0.97265625,
|
| 762 |
+
0.94921875,
|
| 763 |
+
0.97265625,
|
| 764 |
+
0.9658203125
|
| 765 |
+
]
|
| 766 |
+
}
|
| 767 |
+
},
|
| 768 |
+
"first_exact_by_run": {}
|
| 769 |
+
}
|
| 770 |
+
RESULT config=p50_path4_unif0_0p125_outwdm1 run=train8_ctx1024_p50_path4_unif0_0p125_outwdm1_ctx1024_sampledpath_true_20260517_224139 ckpt_step=2000 views=1024000 token_acc=0.8653 exact=0/64 exact_refs=0 hits=[]
|
| 771 |
+
[ctx1024-sampleds] train config=p50_path4_unif0_0p125_outwdm1 from=2000 to=3000
|
| 772 |
+
[launch] gpt2 cached OWT soft-endpoint m/n pilot
|
| 773 |
+
[launch] run_name=train8_ctx1024_p50_path4_unif0_0p125_outwdm1_ctx1024_sampledpath_true_20260517_224139
|
| 774 |
+
[launch] save_dir=runs/train8_ctx1024_p50_path4_unif0_0p125_outwdm1_ctx1024_sampledpath_true_20260517_224139
|
| 775 |
+
[launch] n=1024 m=0 clean_state_mode=onehot
|
| 776 |
+
[launch] mask_mixture lowk=0.0 all=1.0
|
| 777 |
+
[launch] model d=192 layers=3 heads=3 ff=768 vocab_override=2664
|
| 778 |
+
[launch] optimizer=muon muon_impl=legacy weight_decay=0.1 output_weight_decay=-1
|
| 779 |
+
[launch] target_loss=hard_ce conf=0.0->1.0 power=1.0
|
| 780 |
+
[launch] mask_ratio=1.0->1.0
|
| 781 |
+
[launch] mask_ratio_floor_schedule=none
|
| 782 |
+
[launch] dirichlet C=1.0->1024 endpoint=categorical_dual_t sampler=dirichlet
|
| 783 |
+
[launch] wrong_mix seq_alpha=0.0 wrong_floor=0.0 unigram=0.0 uniform=0.0 basin=0.0 basin_ids=
|
| 784 |
+
[launch] rollout_train prob=0.50 mode=sampled_path steps=4 infer_steps=1 s_dist=uniform s_frac=0.0->0.125 temp=1.45 corrupt_only=1 samplewise=1 selected_only=1 sync_t=1
|
| 785 |
+
[launch] cache=/e2e-data/evad-tech-vla/wanghan58/data/small_benchmarks/langflow_2604_11748/openwebtext_lta_cached_chunks/gpt2_len1024_train8_compact_overfit exact_repeat_per_chunk=64
|
| 786 |
+
[launch] resume_path=runs/train8_ctx1024_p50_path4_unif0_0p125_outwdm1_ctx1024_sampledpath_true_20260517_224139/latest.pt
|
| 787 |
+
NCCL version 2.25.1+cuda12.8
|
| 788 |
+
resumed_from=runs/train8_ctx1024_p50_path4_unif0_0p125_outwdm1_ctx1024_sampledpath_true_20260517_224139/latest.pt start_step=2001
|
| 789 |
+
{
|
| 790 |
+
"device": "cuda:0",
|
| 791 |
+
"rank": 0,
|
| 792 |
+
"world_size": 4,
|
| 793 |
+
"samples": "owt_cached_chunks:8",
|
| 794 |
+
"vocab_size": 2664,
|
| 795 |
+
"tokenizer_vocab_size": 50257,
|
| 796 |
+
"save_dir": "runs/train8_ctx1024_p50_path4_unif0_0p125_outwdm1_ctx1024_sampledpath_true_20260517_224139",
|
| 797 |
+
"batch_size": 128,
|
| 798 |
+
"grad_accum": 1,
|
| 799 |
+
"effective_batch_size": 512,
|
| 800 |
+
"global_batch_size": 512,
|
| 801 |
+
"lr_schedule": "constant_warmup",
|
| 802 |
+
"optimizer": "muon",
|
| 803 |
+
"epochs": 0.0,
|
| 804 |
+
"steps_per_epoch": 1,
|
| 805 |
+
"total_steps": 3000,
|
| 806 |
+
"warmup_steps": 10,
|
| 807 |
+
"warmup_epochs": -1.0,
|
| 808 |
+
"min_lr": 0.0,
|
| 809 |
+
"weight_decay": 0.1,
|
| 810 |
+
"output_weight_decay": -1.0,
|
| 811 |
+
"adamw_param_groups": "nanogpt",
|
| 812 |
+
"adam_beta1": 0.9,
|
| 813 |
+
"adam_beta2": 0.95,
|
| 814 |
+
"adam_eps": 1e-08,
|
| 815 |
+
"muon_impl": "legacy",
|
| 816 |
+
"muon_momentum": 0.95,
|
| 817 |
+
"muon_ns_steps": 5,
|
| 818 |
+
"muon_update_scale": 1.0,
|
| 819 |
+
"muon_nesterov": false,
|
| 820 |
+
"muon_width_scale": false,
|
| 821 |
+
"muon_grouping": "legacy_dim_ge_2",
|
| 822 |
+
"muon_param_count": 2616320,
|
| 823 |
+
"muon_adam_param_count": 8192,
|
| 824 |
+
"muon_param_names": [
|
| 825 |
+
"vocab_embed.embedding",
|
| 826 |
+
"sigma_map.net.0.weight",
|
| 827 |
+
"sigma_map.net.2.weight",
|
| 828 |
+
"blocks.0.attn_qkv.weight",
|
| 829 |
+
"blocks.0.attn_out.weight",
|
| 830 |
+
"blocks.0.mlp.0.weight",
|
| 831 |
+
"blocks.0.mlp.2.weight",
|
| 832 |
+
"blocks.0.adaLN_modulation.weight",
|
| 833 |
+
"blocks.1.attn_qkv.weight",
|
| 834 |
+
"blocks.1.attn_out.weight",
|
| 835 |
+
"blocks.1.mlp.0.weight",
|
| 836 |
+
"blocks.1.mlp.2.weight",
|
| 837 |
+
"blocks.1.adaLN_modulation.weight",
|
| 838 |
+
"blocks.2.attn_qkv.weight",
|
| 839 |
+
"blocks.2.attn_out.weight",
|
| 840 |
+
"blocks.2.mlp.0.weight",
|
| 841 |
+
"blocks.2.mlp.2.weight",
|
| 842 |
+
"blocks.2.adaLN_modulation.weight",
|
| 843 |
+
"output_layer.linear.weight",
|
| 844 |
+
"output_layer.adaLN_modulation.weight"
|
| 845 |
+
],
|
| 846 |
+
"muon_adam_param_names": [
|
| 847 |
+
"sigma_map.net.0.bias",
|
| 848 |
+
"sigma_map.net.2.bias",
|
| 849 |
+
"blocks.0.norm1.weight",
|
| 850 |
+
"blocks.0.norm2.weight",
|
| 851 |
+
"blocks.0.mlp.0.bias",
|
| 852 |
+
"blocks.0.mlp.2.bias",
|
| 853 |
+
"blocks.0.adaLN_modulation.bias",
|
| 854 |
+
"blocks.1.norm1.weight",
|
| 855 |
+
"blocks.1.norm2.weight",
|
| 856 |
+
"blocks.1.mlp.0.bias",
|
| 857 |
+
"blocks.1.mlp.2.bias",
|
| 858 |
+
"blocks.1.adaLN_modulation.bias",
|
| 859 |
+
"blocks.2.norm1.weight",
|
| 860 |
+
"blocks.2.norm2.weight",
|
| 861 |
+
"blocks.2.mlp.0.bias",
|
| 862 |
+
"blocks.2.mlp.2.bias",
|
| 863 |
+
"blocks.2.adaLN_modulation.bias",
|
| 864 |
+
"output_layer.norm_final.weight",
|
| 865 |
+
"output_layer.adaLN_modulation.bias"
|
| 866 |
+
],
|
| 867 |
+
"muon_effective_nesterov": false,
|
| 868 |
+
"muon_effective_width_scale": false,
|
| 869 |
+
"muon_effective_weight_decay": 0.1,
|
| 870 |
+
"muon_adam_fallback_nesterov": false,
|
| 871 |
+
"muon_adam_fallback_weight_decay": 0.1,
|
| 872 |
+
"ema_decay": 0.9999,
|
| 873 |
+
"ema_start_step": 0,
|
| 874 |
+
"model_type": "ddit",
|
| 875 |
+
"ddit_mlp_type": "gelu",
|
| 876 |
+
"elf_num_time_tokens": 4,
|
| 877 |
+
"elf_num_model_mode_tokens": 0,
|
| 878 |
+
"qk_norm": true,
|
| 879 |
+
"output_bias": false,
|
| 880 |
+
"output_init_std": -1.0,
|
| 881 |
+
"norm_type": "rmsnorm",
|
| 882 |
+
"target_loss": "hard_ce",
|
| 883 |
+
"linear_soft_target_power": 1.0,
|
| 884 |
+
"linear_soft_target_min_conf": 0.0,
|
| 885 |
+
"linear_soft_target_max_conf": 1.0,
|
| 886 |
+
"t_sampling_mode": "logit_normal",
|
| 887 |
+
"t_sampling_power": 1.0,
|
| 888 |
+
"t_sampling_eps": 0.0001,
|
| 889 |
+
"t_sampling_logit_mean": -1.5,
|
| 890 |
+
"t_sampling_logit_std": 0.8,
|
| 891 |
+
"dual_t": true,
|
| 892 |
+
"corrupt_t_mode": "same",
|
| 893 |
+
"corrupt_min_t": 0.0,
|
| 894 |
+
"corrupt_max_t": 1.0,
|
| 895 |
+
"prefix_block_prob": 0.0,
|
| 896 |
+
"prefix_block_len": 128,
|
| 897 |
+
"mask_ratio_floor_schedule": "none",
|
| 898 |
+
"dirichlet_endpoint_mode": "categorical_dual_t",
|
| 899 |
+
"dirichlet_semantic_t_mode": "same",
|
| 900 |
+
"dirichlet_semantic_t_value": 0.0,
|
| 901 |
+
"dirichlet_semantic_t_curve": "linear",
|
| 902 |
+
"dirichlet_semantic_t_power": 1.0,
|
| 903 |
+
"endpoint_sequence_random_prob_alpha": 0.0,
|
| 904 |
+
"categorical_wrong_from_full_vocab": true,
|
| 905 |
+
"categorical_wrong_from_batch_valid_tokens": false,
|
| 906 |
+
"categorical_wrong_basin_token_ids": "",
|
| 907 |
+
"categorical_wrong_basin_prob": 0.0,
|
| 908 |
+
"categorical_wrong_unigram_prob": 0.0,
|
| 909 |
+
"categorical_wrong_uniform_prob": 0.0,
|
| 910 |
+
"categorical_wrong_prob_floor": 0.0,
|
| 911 |
+
"categorical_wrong_corpus_unigram_path": "",
|
| 912 |
+
"categorical_wrong_corpus_unigram_alpha": 1.0,
|
| 913 |
+
"categorical_wrong_basin_shared_prob": 0.0,
|
| 914 |
+
"categorical_wrong_unigram_shared_prob": 0.0,
|
| 915 |
+
"mask_mixture_original_prob": 0.0,
|
| 916 |
+
"mask_mixture_lowk_prob": 0.0,
|
| 917 |
+
"mask_mixture_lowcorrupt_prob": 0.0,
|
| 918 |
+
"mask_mixture_block_prob": 0.0,
|
| 919 |
+
"mask_mixture_all_prob": 1.0,
|
| 920 |
+
"mask_mixture_lowk_clean_tokens": "0",
|
| 921 |
+
"mask_mixture_lowcorrupt_tokens": "1,2,4,8,16,32,64",
|
| 922 |
+
"mask_mixture_block_tokens": "64,128",
|
| 923 |
+
"simplex_bridge_sampler": "dirichlet",
|
| 924 |
+
"logistic_normal_sigma_min": 0.1,
|
| 925 |
+
"logistic_normal_sigma_max": 1.0,
|
| 926 |
+
"logistic_normal_tau_min": 1.0,
|
| 927 |
+
"logistic_normal_tau_max": 1.0,
|
| 928 |
+
"torch_compile": false,
|
| 929 |
+
"compile_mode": "max-autotune",
|
| 930 |
+
"state_format": "prob",
|
| 931 |
+
"meanflow_weight": 0.0,
|
| 932 |
+
"rollout_train_prob": 0.5,
|
| 933 |
+
"rollout_train_steps": 4,
|
| 934 |
+
"rollout_train_infer_steps": 1,
|
| 935 |
+
"rollout_train_time_mode": "sampled_path",
|
| 936 |
+
"rollout_train_s_dist": "uniform",
|
| 937 |
+
"rollout_train_s_min_frac": 0.0,
|
| 938 |
+
"rollout_train_s_max_frac": 0.125,
|
| 939 |
+
"rollout_train_s_beta_alpha": 2.0,
|
| 940 |
+
"rollout_train_s_beta_beta": 6.0,
|
| 941 |
+
"rollout_train_temp": 1.45,
|
| 942 |
+
"rollout_train_max_gamma": 1.0,
|
| 943 |
+
"rollout_train_corrupt_only": true,
|
| 944 |
+
"rollout_train_samplewise": true,
|
| 945 |
+
"rollout_train_compute_always": false,
|
| 946 |
+
"rollout_train_sync_t": true,
|
| 947 |
+
"bridge_noise_init": "logistic_normal",
|
| 948 |
+
"noise_sigma": -1.0,
|
| 949 |
+
"allow_tf32": true,
|
| 950 |
+
"activation_checkpointing": false,
|
| 951 |
+
"activation_checkpoint_interval": 1,
|
| 952 |
+
"activation_checkpoint_scope": "block",
|
| 953 |
+
"ddp_static_graph": false,
|
| 954 |
+
"ddp_gradient_as_bucket_view": true,
|
| 955 |
+
"blocking_data_transfer": false,
|
| 956 |
+
"dataloader_prefetch_factor": 4,
|
| 957 |
+
"full_train_stats": false,
|
| 958 |
+
"tokenized_hf": false,
|
| 959 |
+
"tokenized_pad_token": "pad",
|
| 960 |
+
"elf_conditional_hf": false,
|
| 961 |
+
"record_pad_truncate": false,
|
| 962 |
+
"record_add_eos": false,
|
| 963 |
+
"record_add_special_tokens": false,
|
| 964 |
+
"record_pad_token": "pad",
|
| 965 |
+
"record_shuffle_buffer": 10000,
|
| 966 |
+
"wrap": true,
|
| 967 |
+
"wrap_mode": "stream",
|
| 968 |
+
"wrap_record_buffer_size": 200,
|
| 969 |
+
"owt_cached_chunks": true,
|
| 970 |
+
"owt_chunk_cache_dir": "/e2e-data/evad-tech-vla/wanghan58/data/small_benchmarks/langflow_2604_11748/openwebtext_lta_cached_chunks/gpt2_len1024_train8_compact_overfit",
|
| 971 |
+
"owt_chunk_cache_rebuild": false,
|
| 972 |
+
"owt_chunk_cache_write_batch": 4096,
|
| 973 |
+
"owt_exact_repeat_per_chunk": 64,
|
| 974 |
+
"online_chunk_shuffle": false,
|
| 975 |
+
"online_chunk_shuffle_buffer": 10000,
|
| 976 |
+
"openwebtext_split": "train_minus_100k",
|
| 977 |
+
"detokenizer": "auto",
|
| 978 |
+
"resolved_detokenizer": null,
|
| 979 |
+
"num_workers": 0,
|
| 980 |
+
"latest_every": 1000,
|
| 981 |
+
"resume_path": "runs/train8_ctx1024_p50_path4_unif0_0p125_outwdm1_ctx1024_sampledpath_true_20260517_224139/latest.pt"
|
| 982 |
+
}
|
| 983 |
+
step=2100 epoch=2100/3000 epoch_step=1/1 micro_steps=2100 elapsed=24.6s lr=2.000000e-03 loss=0.6727 loss_recon=0.6727 loss_meanflow=0.0000 mean_model_t=0.2085 mean_corrupt_t=0.2085 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.5077 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.7942 corrupt_frac=1.0000 acc_corrupt=0.7942 loss_corrupt=0.6727 wrong_frac=0.7915 init_acc_corrupt=0.2065 acc_corrupt_t_0p0_0p2=0.6391 corrupt_frac_t_0p0_0p2=0.5640 acc_corrupt_t_0p2_0p4=0.9938 corrupt_frac_t_0p2_0p4=0.3466 acc_corrupt_t_0p4_0p6=0.9993 corrupt_frac_t_0p4_0p6=0.0791 acc_corrupt_t_0p6_0p8=0.9977 corrupt_frac_t_0p6_0p8=0.0136 out_w_norm=11.8640 out_g_norm=1.0068 acc_corrupt_t_0p8_1p0=0.9895 corrupt_frac_t_0p8_1p0=0.0078 loss_all=0.5473 init_gold_top10=0.5368 init_gold_top100=0.6265 rollout_applied_pos_frac=0.4453 init_acc_rollout_applied=0.3061 init_acc_rollout_kept=0.1192 logit_acc_rollout_applied=0.8278 logit_acc_rollout_kept=0.8275
|
| 984 |
+
step=2200 epoch=2200/3000 epoch_step=1/1 micro_steps=2200 elapsed=23.6s lr=2.000000e-03 loss=0.5787 loss_recon=0.5787 loss_meanflow=0.0000 mean_model_t=0.2096 mean_corrupt_t=0.2096 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.4995 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.8214 corrupt_frac=1.0000 acc_corrupt=0.8214 loss_corrupt=0.5787 wrong_frac=0.7905 init_acc_corrupt=0.2095 acc_corrupt_t_0p0_0p2=0.6810 corrupt_frac_t_0p0_0p2=0.5557 acc_corrupt_t_0p2_0p4=0.9966 corrupt_frac_t_0p2_0p4=0.3595 acc_corrupt_t_0p4_0p6=0.9994 corrupt_frac_t_0p4_0p6=0.0762 acc_corrupt_t_0p6_0p8=0.9984 corrupt_frac_t_0p6_0p8=0.0129 out_w_norm=11.8582 out_g_norm=0.9122 acc_corrupt_t_0p8_1p0=0.9941 corrupt_frac_t_0p8_1p0=0.0078 loss_all=0.5341 init_gold_top10=0.5514 init_gold_top100=0.6421 rollout_applied_pos_frac=0.4688 init_acc_rollout_applied=0.2945 init_acc_rollout_kept=0.1215 logit_acc_rollout_applied=0.8562 logit_acc_rollout_kept=0.8293
|
| 985 |
+
Terminated
|
LTA_openwebtext_dualt/logs/decode_timegrid_trace_len256_copied_20260517_155402.log
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
LTA_openwebtext_dualt/logs/elf_lm1b_t5small_elfb_aligned_datasetfix_len128_4gpu_tinysmoke_20260513.log
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
|
| 2 |
+
You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
|
| 3 |
+
You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
|
| 4 |
+
You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
|
| 5 |
+
[elf-lm1b] encoder=/e2e-data/evad-tech-vla/wanghan58/models/hf/t5-small enc_dim=512 vocab=32100
|
| 6 |
+
[elf-lm1b] batch=4 world=4 grad_accum=1 gbs~=16
|
| 7 |
+
/usr/local/lib/python3.12/dist-packages/apex/_autocast_utils.py:26: FutureWarning: `torch.cuda.amp.autocast_mode._cast(value, dtype)` is deprecated. Please use `torch.amp.autocast_mode._cast(value, 'cuda', dtype)` instead.
|
| 8 |
+
return torch.cuda.amp.autocast_mode._cast(args, torch.get_autocast_gpu_dtype())
|
| 9 |
+
/usr/local/lib/python3.12/dist-packages/apex/_autocast_utils.py:26: FutureWarning: `torch.cuda.amp.autocast_mode._cast(value, dtype)` is deprecated. Please use `torch.amp.autocast_mode._cast(value, 'cuda', dtype)` instead.
|
| 10 |
+
return torch.cuda.amp.autocast_mode._cast(args, torch.get_autocast_gpu_dtype())
|
| 11 |
+
/usr/local/lib/python3.12/dist-packages/apex/_autocast_utils.py:26: FutureWarning: `torch.cuda.amp.autocast_mode._cast(value, dtype)` is deprecated. Please use `torch.amp.autocast_mode._cast(value, 'cuda', dtype)` instead.
|
| 12 |
+
return torch.cuda.amp.autocast_mode._cast(args, torch.get_autocast_gpu_dtype())
|
| 13 |
+
/usr/local/lib/python3.12/dist-packages/apex/_autocast_utils.py:26: FutureWarning: `torch.cuda.amp.autocast_mode._cast(value, dtype)` is deprecated. Please use `torch.amp.autocast_mode._cast(value, 'cuda', dtype)` instead.
|
| 14 |
+
return torch.cuda.amp.autocast_mode._cast(args, torch.get_autocast_gpu_dtype())
|
| 15 |
+
[2026-05-13 16:43:32] step=1 elapsed=1.0s lr=2.000000e-03 loss=1.0246 l2=1.2808 ce=0.0000 decoder_frac=0.000 t=0.217 tokens=129
|
| 16 |
+
[2026-05-13 16:43:32] step=2 elapsed=0.2s lr=2.000000e-03 loss=1.4248 l2=1.7810 ce=0.0000 decoder_frac=0.000 t=0.233 tokens=108
|
| 17 |
+
[2026-05-13 16:43:32] step=3 elapsed=0.2s lr=2.000000e-03 loss=1.2049 l2=1.5061 ce=0.0000 decoder_frac=0.000 t=0.245 tokens=111
|
| 18 |
+
[2026-05-13 16:43:32] step=4 elapsed=0.2s lr=2.000000e-03 loss=0.9481 l2=1.1851 ce=0.0000 decoder_frac=0.000 t=0.189 tokens=144
|
LTA_openwebtext_dualt/logs/elfaligned_t5record_4gpu/lta_owt_t5record_len1024_elfaligned_dditelf_muon_logitnormal_m1p5_s0p8_none_floor0p0_zeroout_tf32_gbs512_4gpu_20260516_011722.log
ADDED
|
@@ -0,0 +1,296 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
NCCL version 2.25.1+cuda12.8
|
| 2 |
+
{
|
| 3 |
+
"device": "cuda:0",
|
| 4 |
+
"rank": 0,
|
| 5 |
+
"world_size": 4,
|
| 6 |
+
"samples": "record_pad_truncate:pad=0:add_eos=0:add_special=0:shuffle_buffer=10000",
|
| 7 |
+
"vocab_size": 32100,
|
| 8 |
+
"tokenizer_vocab_size": 32100,
|
| 9 |
+
"save_dir": "runs/lta_owt_t5record_len1024_elfaligned_dditelf_muon_logitnormal_m1p5_s0p8_none_floor0p0_zeroout_tf32_gbs512_4gpu_20260516_011722",
|
| 10 |
+
"batch_size": 32,
|
| 11 |
+
"grad_accum": 4,
|
| 12 |
+
"effective_batch_size": 512,
|
| 13 |
+
"global_batch_size": 512,
|
| 14 |
+
"lr_schedule": "constant_warmup",
|
| 15 |
+
"optimizer": "muon",
|
| 16 |
+
"epochs": 5.0,
|
| 17 |
+
"steps_per_epoch": 15457,
|
| 18 |
+
"total_steps": 77285,
|
| 19 |
+
"warmup_steps": 7729,
|
| 20 |
+
"warmup_epochs": 0.5,
|
| 21 |
+
"min_lr": 0.0,
|
| 22 |
+
"weight_decay": 0.0,
|
| 23 |
+
"output_weight_decay": -1.0,
|
| 24 |
+
"adamw_param_groups": "nanogpt",
|
| 25 |
+
"adam_beta1": 0.9,
|
| 26 |
+
"adam_beta2": 0.999,
|
| 27 |
+
"adam_eps": 1e-08,
|
| 28 |
+
"muon_impl": "optax",
|
| 29 |
+
"muon_momentum": 0.95,
|
| 30 |
+
"muon_ns_steps": 5,
|
| 31 |
+
"muon_update_scale": 1.0,
|
| 32 |
+
"muon_nesterov": true,
|
| 33 |
+
"muon_width_scale": true,
|
| 34 |
+
"muon_grouping": "hidden_2d",
|
| 35 |
+
"muon_param_count": 84934656,
|
| 36 |
+
"muon_adam_param_count": 50212608,
|
| 37 |
+
"muon_param_names": [
|
| 38 |
+
"blocks.0.attn_qkv.weight",
|
| 39 |
+
"blocks.0.attn_out.weight",
|
| 40 |
+
"blocks.0.mlp.w12.weight",
|
| 41 |
+
"blocks.0.mlp.w3.weight",
|
| 42 |
+
"blocks.1.attn_qkv.weight",
|
| 43 |
+
"blocks.1.attn_out.weight",
|
| 44 |
+
"blocks.1.mlp.w12.weight",
|
| 45 |
+
"blocks.1.mlp.w3.weight",
|
| 46 |
+
"blocks.2.attn_qkv.weight",
|
| 47 |
+
"blocks.2.attn_out.weight",
|
| 48 |
+
"blocks.2.mlp.w12.weight",
|
| 49 |
+
"blocks.2.mlp.w3.weight",
|
| 50 |
+
"blocks.3.attn_qkv.weight",
|
| 51 |
+
"blocks.3.attn_out.weight",
|
| 52 |
+
"blocks.3.mlp.w12.weight",
|
| 53 |
+
"blocks.3.mlp.w3.weight",
|
| 54 |
+
"blocks.4.attn_qkv.weight",
|
| 55 |
+
"blocks.4.attn_out.weight",
|
| 56 |
+
"blocks.4.mlp.w12.weight",
|
| 57 |
+
"blocks.4.mlp.w3.weight",
|
| 58 |
+
"blocks.5.attn_qkv.weight",
|
| 59 |
+
"blocks.5.attn_out.weight",
|
| 60 |
+
"blocks.5.mlp.w12.weight",
|
| 61 |
+
"blocks.5.mlp.w3.weight",
|
| 62 |
+
"blocks.6.attn_qkv.weight",
|
| 63 |
+
"blocks.6.attn_out.weight",
|
| 64 |
+
"blocks.6.mlp.w12.weight",
|
| 65 |
+
"blocks.6.mlp.w3.weight",
|
| 66 |
+
"blocks.7.attn_qkv.weight",
|
| 67 |
+
"blocks.7.attn_out.weight",
|
| 68 |
+
"blocks.7.mlp.w12.weight",
|
| 69 |
+
"blocks.7.mlp.w3.weight",
|
| 70 |
+
"blocks.8.attn_qkv.weight",
|
| 71 |
+
"blocks.8.attn_out.weight",
|
| 72 |
+
"blocks.8.mlp.w12.weight",
|
| 73 |
+
"blocks.8.mlp.w3.weight",
|
| 74 |
+
"blocks.9.attn_qkv.weight",
|
| 75 |
+
"blocks.9.attn_out.weight",
|
| 76 |
+
"blocks.9.mlp.w12.weight",
|
| 77 |
+
"blocks.9.mlp.w3.weight",
|
| 78 |
+
"blocks.10.attn_qkv.weight",
|
| 79 |
+
"blocks.10.attn_out.weight",
|
| 80 |
+
"blocks.10.mlp.w12.weight",
|
| 81 |
+
"blocks.10.mlp.w3.weight",
|
| 82 |
+
"blocks.11.attn_qkv.weight",
|
| 83 |
+
"blocks.11.attn_out.weight",
|
| 84 |
+
"blocks.11.mlp.w12.weight",
|
| 85 |
+
"blocks.11.mlp.w3.weight"
|
| 86 |
+
],
|
| 87 |
+
"muon_adam_param_names": [
|
| 88 |
+
"time_tokens",
|
| 89 |
+
"vocab_embed.embedding",
|
| 90 |
+
"sigma_map.net.0.weight",
|
| 91 |
+
"sigma_map.net.0.bias",
|
| 92 |
+
"sigma_map.net.2.weight",
|
| 93 |
+
"sigma_map.net.2.bias",
|
| 94 |
+
"blocks.0.norm1.weight",
|
| 95 |
+
"blocks.0.attn_qkv.bias",
|
| 96 |
+
"blocks.0.attn_out.bias",
|
| 97 |
+
"blocks.0.q_norm.weight",
|
| 98 |
+
"blocks.0.k_norm.weight",
|
| 99 |
+
"blocks.0.norm2.weight",
|
| 100 |
+
"blocks.0.mlp.w12.bias",
|
| 101 |
+
"blocks.0.mlp.w3.bias",
|
| 102 |
+
"blocks.1.norm1.weight",
|
| 103 |
+
"blocks.1.attn_qkv.bias",
|
| 104 |
+
"blocks.1.attn_out.bias",
|
| 105 |
+
"blocks.1.q_norm.weight",
|
| 106 |
+
"blocks.1.k_norm.weight",
|
| 107 |
+
"blocks.1.norm2.weight",
|
| 108 |
+
"blocks.1.mlp.w12.bias",
|
| 109 |
+
"blocks.1.mlp.w3.bias",
|
| 110 |
+
"blocks.2.norm1.weight",
|
| 111 |
+
"blocks.2.attn_qkv.bias",
|
| 112 |
+
"blocks.2.attn_out.bias",
|
| 113 |
+
"blocks.2.q_norm.weight",
|
| 114 |
+
"blocks.2.k_norm.weight",
|
| 115 |
+
"blocks.2.norm2.weight",
|
| 116 |
+
"blocks.2.mlp.w12.bias",
|
| 117 |
+
"blocks.2.mlp.w3.bias",
|
| 118 |
+
"blocks.3.norm1.weight",
|
| 119 |
+
"blocks.3.attn_qkv.bias",
|
| 120 |
+
"blocks.3.attn_out.bias",
|
| 121 |
+
"blocks.3.q_norm.weight",
|
| 122 |
+
"blocks.3.k_norm.weight",
|
| 123 |
+
"blocks.3.norm2.weight",
|
| 124 |
+
"blocks.3.mlp.w12.bias",
|
| 125 |
+
"blocks.3.mlp.w3.bias",
|
| 126 |
+
"blocks.4.norm1.weight",
|
| 127 |
+
"blocks.4.attn_qkv.bias",
|
| 128 |
+
"blocks.4.attn_out.bias",
|
| 129 |
+
"blocks.4.q_norm.weight",
|
| 130 |
+
"blocks.4.k_norm.weight",
|
| 131 |
+
"blocks.4.norm2.weight",
|
| 132 |
+
"blocks.4.mlp.w12.bias",
|
| 133 |
+
"blocks.4.mlp.w3.bias",
|
| 134 |
+
"blocks.5.norm1.weight",
|
| 135 |
+
"blocks.5.attn_qkv.bias",
|
| 136 |
+
"blocks.5.attn_out.bias",
|
| 137 |
+
"blocks.5.q_norm.weight",
|
| 138 |
+
"blocks.5.k_norm.weight",
|
| 139 |
+
"blocks.5.norm2.weight",
|
| 140 |
+
"blocks.5.mlp.w12.bias",
|
| 141 |
+
"blocks.5.mlp.w3.bias",
|
| 142 |
+
"blocks.6.norm1.weight",
|
| 143 |
+
"blocks.6.attn_qkv.bias",
|
| 144 |
+
"blocks.6.attn_out.bias",
|
| 145 |
+
"blocks.6.q_norm.weight",
|
| 146 |
+
"blocks.6.k_norm.weight",
|
| 147 |
+
"blocks.6.norm2.weight",
|
| 148 |
+
"blocks.6.mlp.w12.bias",
|
| 149 |
+
"blocks.6.mlp.w3.bias",
|
| 150 |
+
"blocks.7.norm1.weight",
|
| 151 |
+
"blocks.7.attn_qkv.bias",
|
| 152 |
+
"blocks.7.attn_out.bias",
|
| 153 |
+
"blocks.7.q_norm.weight",
|
| 154 |
+
"blocks.7.k_norm.weight",
|
| 155 |
+
"blocks.7.norm2.weight",
|
| 156 |
+
"blocks.7.mlp.w12.bias",
|
| 157 |
+
"blocks.7.mlp.w3.bias",
|
| 158 |
+
"blocks.8.norm1.weight",
|
| 159 |
+
"blocks.8.attn_qkv.bias",
|
| 160 |
+
"blocks.8.attn_out.bias",
|
| 161 |
+
"blocks.8.q_norm.weight",
|
| 162 |
+
"blocks.8.k_norm.weight",
|
| 163 |
+
"blocks.8.norm2.weight",
|
| 164 |
+
"blocks.8.mlp.w12.bias",
|
| 165 |
+
"blocks.8.mlp.w3.bias",
|
| 166 |
+
"blocks.9.norm1.weight",
|
| 167 |
+
"blocks.9.attn_qkv.bias",
|
| 168 |
+
"blocks.9.attn_out.bias",
|
| 169 |
+
"blocks.9.q_norm.weight",
|
| 170 |
+
"blocks.9.k_norm.weight",
|
| 171 |
+
"blocks.9.norm2.weight",
|
| 172 |
+
"blocks.9.mlp.w12.bias",
|
| 173 |
+
"blocks.9.mlp.w3.bias",
|
| 174 |
+
"blocks.10.norm1.weight",
|
| 175 |
+
"blocks.10.attn_qkv.bias",
|
| 176 |
+
"blocks.10.attn_out.bias",
|
| 177 |
+
"blocks.10.q_norm.weight",
|
| 178 |
+
"blocks.10.k_norm.weight",
|
| 179 |
+
"blocks.10.norm2.weight",
|
| 180 |
+
"blocks.10.mlp.w12.bias",
|
| 181 |
+
"blocks.10.mlp.w3.bias",
|
| 182 |
+
"blocks.11.norm1.weight",
|
| 183 |
+
"blocks.11.attn_qkv.bias",
|
| 184 |
+
"blocks.11.attn_out.bias",
|
| 185 |
+
"blocks.11.q_norm.weight",
|
| 186 |
+
"blocks.11.k_norm.weight",
|
| 187 |
+
"blocks.11.norm2.weight",
|
| 188 |
+
"blocks.11.mlp.w12.bias",
|
| 189 |
+
"blocks.11.mlp.w3.bias",
|
| 190 |
+
"output_layer.norm_final.weight",
|
| 191 |
+
"output_layer.linear.weight"
|
| 192 |
+
],
|
| 193 |
+
"muon_effective_nesterov": true,
|
| 194 |
+
"muon_effective_width_scale": true,
|
| 195 |
+
"muon_effective_weight_decay": 0.0,
|
| 196 |
+
"muon_adam_fallback_nesterov": true,
|
| 197 |
+
"muon_adam_fallback_weight_decay": 0.0,
|
| 198 |
+
"ema_decay": 0.9999,
|
| 199 |
+
"ema_start_step": 0,
|
| 200 |
+
"model_type": "ddit_elf",
|
| 201 |
+
"elf_num_time_tokens": 4,
|
| 202 |
+
"elf_num_model_mode_tokens": 0,
|
| 203 |
+
"qk_norm": true,
|
| 204 |
+
"output_bias": false,
|
| 205 |
+
"output_init_std": 0.0,
|
| 206 |
+
"norm_type": "rmsnorm",
|
| 207 |
+
"t_sampling_mode": "logit_normal",
|
| 208 |
+
"t_sampling_power": 1.0,
|
| 209 |
+
"t_sampling_eps": 0.0001,
|
| 210 |
+
"t_sampling_logit_mean": -1.5,
|
| 211 |
+
"t_sampling_logit_std": 0.8,
|
| 212 |
+
"dual_t": true,
|
| 213 |
+
"corrupt_t_mode": "same",
|
| 214 |
+
"corrupt_min_t": 0.0,
|
| 215 |
+
"corrupt_max_t": 1.0,
|
| 216 |
+
"prefix_block_prob": 0.0,
|
| 217 |
+
"prefix_block_len": 128,
|
| 218 |
+
"mask_ratio_floor_schedule": "none",
|
| 219 |
+
"dirichlet_endpoint_mode": "categorical_dual_t",
|
| 220 |
+
"dirichlet_semantic_t_mode": "same",
|
| 221 |
+
"dirichlet_semantic_t_value": 0.0,
|
| 222 |
+
"dirichlet_semantic_t_curve": "linear",
|
| 223 |
+
"dirichlet_semantic_t_power": 1.0,
|
| 224 |
+
"endpoint_sequence_random_prob_alpha": 0.0,
|
| 225 |
+
"categorical_wrong_from_full_vocab": true,
|
| 226 |
+
"categorical_wrong_from_batch_valid_tokens": false,
|
| 227 |
+
"categorical_wrong_basin_token_ids": "",
|
| 228 |
+
"categorical_wrong_basin_prob": 0.0,
|
| 229 |
+
"categorical_wrong_unigram_prob": 0.0,
|
| 230 |
+
"categorical_wrong_uniform_prob": 0.0,
|
| 231 |
+
"categorical_wrong_corpus_unigram_path": "",
|
| 232 |
+
"categorical_wrong_corpus_unigram_alpha": 1.0,
|
| 233 |
+
"categorical_wrong_basin_shared_prob": 0.0,
|
| 234 |
+
"categorical_wrong_unigram_shared_prob": 0.0,
|
| 235 |
+
"mask_mixture_original_prob": 0.0,
|
| 236 |
+
"mask_mixture_lowk_prob": 0.0,
|
| 237 |
+
"mask_mixture_lowcorrupt_prob": 0.0,
|
| 238 |
+
"mask_mixture_block_prob": 0.0,
|
| 239 |
+
"mask_mixture_all_prob": 0.0,
|
| 240 |
+
"mask_mixture_lowk_clean_tokens": "1,2,4,8,16,32,64",
|
| 241 |
+
"mask_mixture_lowcorrupt_tokens": "1,2,4,8,16,32,64",
|
| 242 |
+
"mask_mixture_block_tokens": "64,128",
|
| 243 |
+
"simplex_bridge_sampler": "dirichlet",
|
| 244 |
+
"logistic_normal_sigma_min": 0.18,
|
| 245 |
+
"logistic_normal_sigma_max": 2.2,
|
| 246 |
+
"logistic_normal_tau_min": 0.65,
|
| 247 |
+
"logistic_normal_tau_max": 1.15,
|
| 248 |
+
"torch_compile": false,
|
| 249 |
+
"compile_mode": "max-autotune",
|
| 250 |
+
"state_format": "prob",
|
| 251 |
+
"target_loss": "hard_ce",
|
| 252 |
+
"meanflow_weight": 0.0,
|
| 253 |
+
"rollout_train_prob": 0.0,
|
| 254 |
+
"rollout_train_steps": 1,
|
| 255 |
+
"rollout_train_infer_steps": 64,
|
| 256 |
+
"rollout_train_temp": 1.45,
|
| 257 |
+
"rollout_train_max_gamma": 1.0,
|
| 258 |
+
"rollout_train_corrupt_only": true,
|
| 259 |
+
"rollout_train_samplewise": false,
|
| 260 |
+
"rollout_train_compute_always": false,
|
| 261 |
+
"bridge_noise_init": "logistic_normal",
|
| 262 |
+
"noise_sigma": -1.0,
|
| 263 |
+
"allow_tf32": true,
|
| 264 |
+
"activation_checkpointing": true,
|
| 265 |
+
"activation_checkpoint_interval": 1,
|
| 266 |
+
"activation_checkpoint_scope": "mlp",
|
| 267 |
+
"ddp_static_graph": false,
|
| 268 |
+
"ddp_gradient_as_bucket_view": true,
|
| 269 |
+
"blocking_data_transfer": false,
|
| 270 |
+
"dataloader_prefetch_factor": 4,
|
| 271 |
+
"full_train_stats": false,
|
| 272 |
+
"tokenized_hf": false,
|
| 273 |
+
"tokenized_pad_token": "pad",
|
| 274 |
+
"elf_conditional_hf": false,
|
| 275 |
+
"record_pad_truncate": true,
|
| 276 |
+
"record_add_eos": false,
|
| 277 |
+
"record_add_special_tokens": false,
|
| 278 |
+
"record_pad_token": "pad",
|
| 279 |
+
"record_shuffle_buffer": 10000,
|
| 280 |
+
"wrap": false,
|
| 281 |
+
"wrap_mode": "stream",
|
| 282 |
+
"wrap_record_buffer_size": 200,
|
| 283 |
+
"owt_cached_chunks": false,
|
| 284 |
+
"owt_chunk_cache_dir": "",
|
| 285 |
+
"owt_chunk_cache_rebuild": false,
|
| 286 |
+
"owt_chunk_cache_write_batch": 4096,
|
| 287 |
+
"owt_exact_repeat_per_chunk": 0,
|
| 288 |
+
"online_chunk_shuffle": false,
|
| 289 |
+
"online_chunk_shuffle_buffer": 10000,
|
| 290 |
+
"openwebtext_split": "train_minus_100k",
|
| 291 |
+
"detokenizer": "auto",
|
| 292 |
+
"resolved_detokenizer": null,
|
| 293 |
+
"num_workers": 8,
|
| 294 |
+
"latest_every": 1000,
|
| 295 |
+
"resume_path": ""
|
| 296 |
+
}
|
LTA_openwebtext_dualt/logs/eval_20260506/ar_8gpu_latest_temp_sweep_20260506_110706.log
ADDED
|
@@ -0,0 +1,47 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[load] ar_8gpu_scratch_latest step=1000000 ckpt=runs/ar_lm1b_flmpack_bert_small_len128_gbs512_8gpu_1m_scratch_20260505/latest.pt
|
| 2 |
+
[ar temp=1] generated 32/256
|
| 3 |
+
[ar temp=1] generated 64/256
|
| 4 |
+
[ar temp=1] generated 96/256
|
| 5 |
+
[ar temp=1] generated 128/256
|
| 6 |
+
[ar temp=1] generated 160/256
|
| 7 |
+
[ar temp=1] generated 192/256
|
| 8 |
+
[ar temp=1] generated 224/256
|
| 9 |
+
[ar temp=1] generated 256/256
|
| 10 |
+
[ar temp=0.8] generated 32/256
|
| 11 |
+
[ar temp=0.8] generated 64/256
|
| 12 |
+
[ar temp=0.8] generated 96/256
|
| 13 |
+
[ar temp=0.8] generated 128/256
|
| 14 |
+
[ar temp=0.8] generated 160/256
|
| 15 |
+
[ar temp=0.8] generated 192/256
|
| 16 |
+
[ar temp=0.8] generated 224/256
|
| 17 |
+
[ar temp=0.8] generated 256/256
|
| 18 |
+
[ar temp=0.6] generated 32/256
|
| 19 |
+
[ar temp=0.6] generated 64/256
|
| 20 |
+
[ar temp=0.6] generated 96/256
|
| 21 |
+
[ar temp=0.6] generated 128/256
|
| 22 |
+
[ar temp=0.6] generated 160/256
|
| 23 |
+
[ar temp=0.6] generated 192/256
|
| 24 |
+
[ar temp=0.6] generated 224/256
|
| 25 |
+
[ar temp=0.6] generated 256/256
|
| 26 |
+
[ar temp=0.4] generated 32/256
|
| 27 |
+
[ar temp=0.4] generated 64/256
|
| 28 |
+
[ar temp=0.4] generated 96/256
|
| 29 |
+
[ar temp=0.4] generated 128/256
|
| 30 |
+
[ar temp=0.4] generated 160/256
|
| 31 |
+
[ar temp=0.4] generated 192/256
|
| 32 |
+
[ar temp=0.4] generated 224/256
|
| 33 |
+
[ar temp=0.4] generated 256/256
|
| 34 |
+
[ar temp=0.2] generated 32/256
|
| 35 |
+
[ar temp=0.2] generated 64/256
|
| 36 |
+
[ar temp=0.2] generated 96/256
|
| 37 |
+
[ar temp=0.2] generated 128/256
|
| 38 |
+
[ar temp=0.2] generated 160/256
|
| 39 |
+
[ar temp=0.2] generated 192/256
|
| 40 |
+
[ar temp=0.2] generated 224/256
|
| 41 |
+
[ar temp=0.2] generated 256/256
|
| 42 |
+
[summary] {"type": "summary", "name": "ar_8gpu_scratch_latest_t1p0", "kind": "ar", "checkpoint": "runs/ar_lm1b_flmpack_bert_small_len128_gbs512_8gpu_1m_scratch_20260505/latest.pt", "step": 1000000, "decode": {"kind": "ar_sample", "temp": 1.0, "max_new_tokens": 127, "n_samples": 256, "seed": 20260506}, "raw_genppl": {"ppl": 66.26789796127002, "nll_per_token": 4.193705586286218, "tokens": 38758, "kept_samples": 256, "total_samples": 256, "empty_rate": 0.0, "skipped_samples": 0}, "stripped_genppl": {"ppl": 102.02902966909716, "nll_per_token": 4.625257377391057, "tokens": 32517, "kept_samples": 256, "total_samples": 256, "empty_rate": 0.0, "skipped_samples": 0}, "diversity": {"sample_entropy": 4.3391454366321325, "unique_tokens": 6850, "token_count": 32768, "distinct_1": 0.20904541015625, "distinct_2": 0.721795029527559, "top_token_mass": 0.04486083984375}}
|
| 43 |
+
[summary] {"type": "summary", "name": "ar_8gpu_scratch_latest_t0p8", "kind": "ar", "checkpoint": "runs/ar_lm1b_flmpack_bert_small_len128_gbs512_8gpu_1m_scratch_20260505/latest.pt", "step": 1000000, "decode": {"kind": "ar_sample", "temp": 0.8, "max_new_tokens": 127, "n_samples": 256, "seed": 20260506}, "raw_genppl": {"ppl": 32.32856535877376, "nll_per_token": 3.475951215830808, "tokens": 38613, "kept_samples": 256, "total_samples": 256, "empty_rate": 0.0, "skipped_samples": 0}, "stripped_genppl": {"ppl": 44.806999513308874, "nll_per_token": 3.8023643663532356, "tokens": 32111, "kept_samples": 256, "total_samples": 256, "empty_rate": 0.0, "skipped_samples": 0}, "diversity": {"sample_entropy": 4.198900388253301, "unique_tokens": 5328, "token_count": 32768, "distinct_1": 0.16259765625, "distinct_2": 0.5892285925196851, "top_token_mass": 0.06524658203125}}
|
| 44 |
+
[summary] {"type": "summary", "name": "ar_8gpu_scratch_latest_t0p6", "kind": "ar", "checkpoint": "runs/ar_lm1b_flmpack_bert_small_len128_gbs512_8gpu_1m_scratch_20260505/latest.pt", "step": 1000000, "decode": {"kind": "ar_sample", "temp": 0.6, "max_new_tokens": 127, "n_samples": 256, "seed": 20260506}, "raw_genppl": {"ppl": 20.308583315932225, "nll_per_token": 3.0110436201280355, "tokens": 38710, "kept_samples": 256, "total_samples": 256, "empty_rate": 0.0, "skipped_samples": 0}, "stripped_genppl": {"ppl": 26.539437908520128, "nll_per_token": 3.2786318496488907, "tokens": 31979, "kept_samples": 256, "total_samples": 256, "empty_rate": 0.0, "skipped_samples": 0}, "diversity": {"sample_entropy": 4.087555248966228, "unique_tokens": 4227, "token_count": 32768, "distinct_1": 0.128997802734375, "distinct_2": 0.4760396161417323, "top_token_mass": 0.083648681640625}}
|
| 45 |
+
[summary] {"type": "summary", "name": "ar_8gpu_scratch_latest_t0p4", "kind": "ar", "checkpoint": "runs/ar_lm1b_flmpack_bert_small_len128_gbs512_8gpu_1m_scratch_20260505/latest.pt", "step": 1000000, "decode": {"kind": "ar_sample", "temp": 0.4, "max_new_tokens": 127, "n_samples": 256, "seed": 20260506}, "raw_genppl": {"ppl": 15.039557317205846, "nll_per_token": 2.710683884392094, "tokens": 38573, "kept_samples": 256, "total_samples": 256, "empty_rate": 0.0, "skipped_samples": 0}, "stripped_genppl": {"ppl": 18.999365346672068, "nll_per_token": 2.944405575801821, "tokens": 31826, "kept_samples": 256, "total_samples": 256, "empty_rate": 0.0, "skipped_samples": 0}, "diversity": {"sample_entropy": 4.005193950920615, "unique_tokens": 3146, "token_count": 32768, "distinct_1": 0.09600830078125, "distinct_2": 0.3395669291338583, "top_token_mass": 0.095611572265625}}
|
| 46 |
+
[summary] {"type": "summary", "name": "ar_8gpu_scratch_latest_t0p2", "kind": "ar", "checkpoint": "runs/ar_lm1b_flmpack_bert_small_len128_gbs512_8gpu_1m_scratch_20260505/latest.pt", "step": 1000000, "decode": {"kind": "ar_sample", "temp": 0.2, "max_new_tokens": 127, "n_samples": 256, "seed": 20260506}, "raw_genppl": {"ppl": 9.414447999719362, "nll_per_token": 2.2422455305145195, "tokens": 38474, "kept_samples": 256, "total_samples": 256, "empty_rate": 0.0, "skipped_samples": 0}, "stripped_genppl": {"ppl": 11.492390079715493, "nll_per_token": 2.4416850841291637, "tokens": 31417, "kept_samples": 256, "total_samples": 256, "empty_rate": 0.0, "skipped_samples": 0}, "diversity": {"sample_entropy": 3.78621437718483, "unique_tokens": 1502, "token_count": 32768, "distinct_1": 0.04583740234375, "distinct_2": 0.13379675196850394, "top_token_mass": 0.093292236328125}}
|
| 47 |
+
[done] docs/lta_samples/metrics_20260506/ar_8gpu_latest_temp_sweep
|
LTA_openwebtext_dualt/logs/eval_20260506/categorical_c1024_diffusion_eta1_stateweight_latest_20260506_113031.log
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[load] checkpoint=runs/lta_lm1b_dirichlet_categorical_fullvocab_dualt_flmpack_onehot_hardce_ddit_small_len128_gbs512_8gpu_1m_nw0/latest.pt step=650000
|
| 2 |
+
[decode] diff_t1p3_eta1_sw0p70 generated 16/64
|
| 3 |
+
[decode] diff_t1p3_eta1_sw0p70 generated 32/64
|
| 4 |
+
[decode] diff_t1p3_eta1_sw0p70 generated 48/64
|
| 5 |
+
[decode] diff_t1p3_eta1_sw0p70 generated 64/64
|
| 6 |
+
[summary] diff_t1p3_eta1_sw0p70 raw=28.808 strip=39.590 ent=3.972 d2=0.599
|
| 7 |
+
[decode] diff_t1p3_eta1_sw0p90 generated 16/64
|
| 8 |
+
[decode] diff_t1p3_eta1_sw0p90 generated 32/64
|
| 9 |
+
[decode] diff_t1p3_eta1_sw0p90 generated 48/64
|
| 10 |
+
[decode] diff_t1p3_eta1_sw0p90 generated 64/64
|
| 11 |
+
[summary] diff_t1p3_eta1_sw0p90 raw=28.808 strip=39.590 ent=3.972 d2=0.599
|
| 12 |
+
[decode] diff_t1p3_eta1_state generated 16/64
|
| 13 |
+
[decode] diff_t1p3_eta1_state generated 32/64
|
LTA_openwebtext_dualt/logs/eval_20260506/categorical_c1024_diffusion_finalsample_hightemp_quick_20260506_114232.log
ADDED
|
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[load] checkpoint=runs/lta_lm1b_dirichlet_categorical_fullvocab_dualt_flmpack_onehot_hardce_ddit_small_len128_gbs512_8gpu_1m_nw0/latest.pt step=656000
|
| 2 |
+
[decode] fs_t1p3_eta1_blend_ft1p00 generated 16/32
|
| 3 |
+
[decode] fs_t1p3_eta1_blend_ft1p00 generated 32/32
|
| 4 |
+
[summary] fs_t1p3_eta1_blend_ft1p00 raw=24.742 strip=37.189 ent=3.994 d2=0.643
|
| 5 |
+
[decode] fs_t1p3_eta1_blend_ft1p30 generated 16/32
|
| 6 |
+
[decode] fs_t1p3_eta1_blend_ft1p30 generated 32/32
|
| 7 |
+
[summary] fs_t1p3_eta1_blend_ft1p30 raw=34.585 strip=54.985 ent=4.025 d2=0.663
|
| 8 |
+
[decode] fs_t1p3_eta1_blend_ft1p60 generated 16/32
|
| 9 |
+
[decode] fs_t1p3_eta1_blend_ft1p60 generated 32/32
|
| 10 |
+
[summary] fs_t1p3_eta1_blend_ft1p60 raw=346.345 strip=639.185 ent=4.286 d2=0.825
|
| 11 |
+
[decode] fs_t1p2_eta1_blend_ft1p00 generated 16/32
|
| 12 |
+
[decode] fs_t1p2_eta1_blend_ft1p00 generated 32/32
|
| 13 |
+
[summary] fs_t1p2_eta1_blend_ft1p00 raw=23.393 strip=34.329 ent=3.971 d2=0.610
|
| 14 |
+
[done] docs/lta_samples/metrics_20260506/categorical_c1024_diffusion_finalsample_hightemp_quick
|
LTA_openwebtext_dualt/logs/eval_20260506/categorical_c1024_diffusion_finalsample_latest_20260506_113603.log
ADDED
|
@@ -0,0 +1,41 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[load] checkpoint=runs/lta_lm1b_dirichlet_categorical_fullvocab_dualt_flmpack_onehot_hardce_ddit_small_len128_gbs512_8gpu_1m_nw0/latest.pt step=653000
|
| 2 |
+
[decode] fs_t1p3_eta1_blend_ft0p35 generated 16/64
|
| 3 |
+
[decode] fs_t1p3_eta1_blend_ft0p35 generated 32/64
|
| 4 |
+
[decode] fs_t1p3_eta1_blend_ft0p35 generated 48/64
|
| 5 |
+
[decode] fs_t1p3_eta1_blend_ft0p35 generated 64/64
|
| 6 |
+
[summary] fs_t1p3_eta1_blend_ft0p35 raw=24.791 strip=35.428 ent=4.041 d2=0.563
|
| 7 |
+
[decode] fs_t1p3_eta1_blend_ft0p50 generated 16/64
|
| 8 |
+
[decode] fs_t1p3_eta1_blend_ft0p50 generated 32/64
|
| 9 |
+
[decode] fs_t1p3_eta1_blend_ft0p50 generated 48/64
|
| 10 |
+
[decode] fs_t1p3_eta1_blend_ft0p50 generated 64/64
|
| 11 |
+
[summary] fs_t1p3_eta1_blend_ft0p50 raw=24.791 strip=35.428 ent=4.041 d2=0.563
|
| 12 |
+
[decode] fs_t1p3_eta1_blend_ft0p70 generated 16/64
|
| 13 |
+
[decode] fs_t1p3_eta1_blend_ft0p70 generated 32/64
|
| 14 |
+
[decode] fs_t1p3_eta1_blend_ft0p70 generated 48/64
|
| 15 |
+
[decode] fs_t1p3_eta1_blend_ft0p70 generated 64/64
|
| 16 |
+
[summary] fs_t1p3_eta1_blend_ft0p70 raw=24.791 strip=35.428 ent=4.041 d2=0.563
|
| 17 |
+
[decode] fs_t1p2_eta1_blend_ft0p50 generated 16/64
|
| 18 |
+
[decode] fs_t1p2_eta1_blend_ft0p50 generated 32/64
|
| 19 |
+
[decode] fs_t1p2_eta1_blend_ft0p50 generated 48/64
|
| 20 |
+
[decode] fs_t1p2_eta1_blend_ft0p50 generated 64/64
|
| 21 |
+
[summary] fs_t1p2_eta1_blend_ft0p50 raw=22.928 strip=32.454 ent=4.015 d2=0.529
|
| 22 |
+
[decode] fs_t1p2_eta1_blend_ft0p70 generated 16/64
|
| 23 |
+
[decode] fs_t1p2_eta1_blend_ft0p70 generated 32/64
|
| 24 |
+
[decode] fs_t1p2_eta1_blend_ft0p70 generated 48/64
|
| 25 |
+
[decode] fs_t1p2_eta1_blend_ft0p70 generated 64/64
|
| 26 |
+
[summary] fs_t1p2_eta1_blend_ft0p70 raw=22.928 strip=32.454 ent=4.015 d2=0.529
|
| 27 |
+
[decode] fs_t1p1_eta1_blend_ft0p70 generated 16/64
|
| 28 |
+
[decode] fs_t1p1_eta1_blend_ft0p70 generated 32/64
|
| 29 |
+
[decode] fs_t1p1_eta1_blend_ft0p70 generated 48/64
|
| 30 |
+
[decode] fs_t1p1_eta1_blend_ft0p70 generated 64/64
|
| 31 |
+
[summary] fs_t1p1_eta1_blend_ft0p70 raw=22.678 strip=30.867 ent=3.946 d2=0.521
|
| 32 |
+
[decode] fs_t1p3_eta1_state_ft0p35 generated 16/64
|
| 33 |
+
[decode] fs_t1p3_eta1_state_ft0p35 generated 32/64
|
| 34 |
+
[decode] fs_t1p3_eta1_state_ft0p35 generated 48/64
|
| 35 |
+
[decode] fs_t1p3_eta1_state_ft0p35 generated 64/64
|
| 36 |
+
[summary] fs_t1p3_eta1_state_ft0p35 raw=24.791 strip=35.428 ent=4.041 d2=0.563
|
| 37 |
+
[decode] fs_t1p3_eta1_state_ft0p50 generated 16/64
|
| 38 |
+
[decode] fs_t1p3_eta1_state_ft0p50 generated 32/64
|
| 39 |
+
[decode] fs_t1p3_eta1_state_ft0p50 generated 48/64
|
| 40 |
+
[decode] fs_t1p3_eta1_state_ft0p50 generated 64/64
|
| 41 |
+
[summary] fs_t1p3_eta1_state_ft0p50 raw=24.791 strip=35.428 ent=4.041 d2=0.563
|
LTA_openwebtext_dualt/logs/eval_20260506/categorical_c1024_diffusion_rolling_quick512_20260506_112740.log
ADDED
|
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[load] checkpoint=runs/lta_lm1b_dirichlet_categorical_fullvocab_dualt_flmpack_onehot_hardce_ddit_small_len128_gbs512_8gpu_1m_nw0/latest.pt step=648000
|
| 2 |
+
[decode] diff_t1p1_eta0p50_sw0p70 generated 32/32
|
| 3 |
+
[summary] diff_t1p1_eta0p50_sw0p70 raw=30.088 strip=40.977 ent=3.473 d2=0.531
|
| 4 |
+
[decode] diff_t1p3_eta0p25_sw0p70 generated 32/32
|
| 5 |
+
[summary] diff_t1p3_eta0p25_sw0p70 raw=2.578 strip=2.274 ent=0.672 d2=0.049
|
| 6 |
+
[decode] diff_t1p3_eta0p50_sw0p70 generated 32/32
|
| 7 |
+
[summary] diff_t1p3_eta0p50_sw0p70 raw=18.456 strip=16.892 ent=1.686 d2=0.197
|
| 8 |
+
[decode] diff_t1p3_eta0p75_sw0p70 generated 32/32
|
| 9 |
+
[summary] diff_t1p3_eta0p75_sw0p70 raw=34.201 strip=52.082 ent=4.029 d2=0.675
|
LTA_openwebtext_dualt/logs/eval_20260506/categorical_c1024_diffusion_rolling_sweep_latest_20260506_112546.log
ADDED
|
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[load] checkpoint=runs/lta_lm1b_dirichlet_categorical_fullvocab_dualt_flmpack_onehot_hardce_ddit_small_len128_gbs512_8gpu_1m_nw0/latest.pt step=647000
|
| 2 |
+
[decode] diff_t1p1_eta0p50_sw0p70 generated 16/64
|
| 3 |
+
[decode] diff_t1p1_eta0p50_sw0p70 generated 32/64
|
| 4 |
+
[decode] diff_t1p1_eta0p50_sw0p70 generated 48/64
|
| 5 |
+
[decode] diff_t1p1_eta0p50_sw0p70 generated 64/64
|
| 6 |
+
[summary] diff_t1p1_eta0p50_sw0p70 raw=24.353 strip=33.608 ent=3.641 d2=0.482
|
| 7 |
+
[decode] diff_t1p1_eta0p75_sw0p70 generated 16/64
|
| 8 |
+
[decode] diff_t1p1_eta0p75_sw0p70 generated 32/64
|
| 9 |
+
[decode] diff_t1p1_eta0p75_sw0p70 generated 48/64
|
| 10 |
+
[decode] diff_t1p1_eta0p75_sw0p70 generated 64/64
|
| 11 |
+
[summary] diff_t1p1_eta0p75_sw0p70 raw=24.930 strip=34.618 ent=3.987 d2=0.553
|
LTA_openwebtext_dualt/logs/eval_20260506/categorical_c1024_rolling_noise_focus_latest_20260506_112101.log
ADDED
|
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[load] checkpoint=runs/lta_lm1b_dirichlet_categorical_fullvocab_dualt_flmpack_onehot_hardce_ddit_small_len128_gbs512_8gpu_1m_nw0/latest.pt step=644000
|
| 2 |
+
[decode] sp1p25_sem2p0_temp1p5_eta1_blend generated 64/64
|
| 3 |
+
[summary] sp1p25_sem2p0_temp1p5_eta1_blend raw=36.622 strip=54.520 ent=4.057 d2=0.601
|
| 4 |
+
[decode] sem2p5_temp1p7_eta0p75_blend generated 64/64
|
| 5 |
+
[summary] sem2p5_temp1p7_eta0p75_blend raw=7.537 strip=5.859 ent=0.867 d2=0.074
|
LTA_openwebtext_dualt/logs/eval_20260506/categorical_c1024_rolling_noise_sweep_latest_20260506_110706.log
ADDED
|
@@ -0,0 +1,41 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[load] checkpoint=runs/lta_lm1b_dirichlet_categorical_fullvocab_dualt_flmpack_onehot_hardce_ddit_small_len128_gbs512_8gpu_1m_nw0/latest.pt step=637000
|
| 2 |
+
[decode] baseline_t1p3_sp1_sem1p5_eta1_blend generated 16/128
|
| 3 |
+
[decode] baseline_t1p3_sp1_sem1p5_eta1_blend generated 32/128
|
| 4 |
+
[decode] baseline_t1p3_sp1_sem1p5_eta1_blend generated 48/128
|
| 5 |
+
[decode] baseline_t1p3_sp1_sem1p5_eta1_blend generated 64/128
|
| 6 |
+
[decode] baseline_t1p3_sp1_sem1p5_eta1_blend generated 80/128
|
| 7 |
+
[decode] baseline_t1p3_sp1_sem1p5_eta1_blend generated 96/128
|
| 8 |
+
[decode] baseline_t1p3_sp1_sem1p5_eta1_blend generated 112/128
|
| 9 |
+
[decode] baseline_t1p3_sp1_sem1p5_eta1_blend generated 128/128
|
| 10 |
+
[summary] baseline_t1p3_sp1_sem1p5_eta1_blend raw=27.627 strip=40.194 ent=4.004 d2=0.506
|
| 11 |
+
[decode] temp1p5_eta1_blend generated 16/128
|
| 12 |
+
[decode] temp1p5_eta1_blend generated 32/128
|
| 13 |
+
[decode] temp1p5_eta1_blend generated 48/128
|
| 14 |
+
[decode] temp1p5_eta1_blend generated 64/128
|
| 15 |
+
[decode] temp1p5_eta1_blend generated 80/128
|
| 16 |
+
[decode] temp1p5_eta1_blend generated 96/128
|
| 17 |
+
[decode] temp1p5_eta1_blend generated 112/128
|
| 18 |
+
[decode] temp1p5_eta1_blend generated 128/128
|
| 19 |
+
[summary] temp1p5_eta1_blend raw=32.000 strip=44.321 ent=3.980 d2=0.544
|
| 20 |
+
[decode] temp1p7_eta1_blend generated 16/128
|
| 21 |
+
[decode] temp1p7_eta1_blend generated 32/128
|
| 22 |
+
[decode] temp1p7_eta1_blend generated 48/128
|
| 23 |
+
[decode] temp1p7_eta1_blend generated 64/128
|
| 24 |
+
[decode] temp1p7_eta1_blend generated 80/128
|
| 25 |
+
[decode] temp1p7_eta1_blend generated 96/128
|
| 26 |
+
[decode] temp1p7_eta1_blend generated 112/128
|
| 27 |
+
[decode] temp1p7_eta1_blend generated 128/128
|
| 28 |
+
[summary] temp1p7_eta1_blend raw=16.912 strip=15.385 ent=2.584 d2=0.146
|
| 29 |
+
[decode] temp2p0_eta1_blend generated 16/128
|
| 30 |
+
[decode] temp2p0_eta1_blend generated 32/128
|
| 31 |
+
[decode] temp2p0_eta1_blend generated 48/128
|
| 32 |
+
[decode] temp2p0_eta1_blend generated 64/128
|
| 33 |
+
[decode] temp2p0_eta1_blend generated 80/128
|
| 34 |
+
[decode] temp2p0_eta1_blend generated 96/128
|
| 35 |
+
[decode] temp2p0_eta1_blend generated 112/128
|
| 36 |
+
[decode] temp2p0_eta1_blend generated 128/128
|
| 37 |
+
[summary] temp2p0_eta1_blend raw=15.834 strip=13.898 ent=2.520 d2=0.094
|
| 38 |
+
[decode] sem2p0_temp1p5_eta1_blend generated 16/128
|
| 39 |
+
[decode] sem2p0_temp1p5_eta1_blend generated 32/128
|
| 40 |
+
[decode] sem2p0_temp1p5_eta1_blend generated 48/128
|
| 41 |
+
[decode] sem2p0_temp1p5_eta1_blend generated 64/128
|
LTA_openwebtext_dualt/logs/eval_20260508/dirichlet_step122k_key3_state_n256.log
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
[decode] match_post_sem1_state_c16_t1p3
|
LTA_openwebtext_dualt/logs/eval_20260508/dirichlet_step122k_quick2_128steps_n64.log
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[decode] state_c256_t1p45
|
| 2 |
+
[summary] {"name": "state_c256_t1p45", "step": 123000, "detok_genppl": 34.04571508013546, "sample_entropy": 4.117197060615391, "distinct_2": 0.6318897637795275, "top_token_mass": 0.05859375, "model_t_mode": "post", "support_power": 1.0, "semantic_power": 1.0, "final_from": "state", "endpoint_temp": 1.45, "concentration_max": 256.0, "update": "resample"}
|
| 3 |
+
[decode] state_c1024_t1p45
|
| 4 |
+
[summary] {"name": "state_c1024_t1p45", "step": 123000, "detok_genppl": 32.44883591727502, "sample_entropy": 4.1283035155644505, "distinct_2": 0.6188484251968503, "top_token_mass": 0.0576171875, "model_t_mode": "post", "support_power": 1.0, "semantic_power": 1.0, "final_from": "state", "endpoint_temp": 1.45, "concentration_max": 1024.0, "update": "resample"}
|
LTA_openwebtext_dualt/logs/eval_20260508/dirichlet_step122k_quick2_n64.log
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[decode] state_c256_t1p45
|
| 2 |
+
[summary] {"name": "state_c256_t1p45", "step": 123000, "detok_genppl": 23.76129065779824, "sample_entropy": 4.072567398410477, "distinct_2": 0.562869094488189, "top_token_mass": 0.0634765625, "model_t_mode": "post", "support_power": 1.0, "semantic_power": 1.0, "final_from": "state", "endpoint_temp": 1.45, "concentration_max": 256.0, "update": "resample"}
|
| 3 |
+
[decode] state_c1024_t1p45
|
| 4 |
+
[summary] {"name": "state_c1024_t1p45", "step": 123000, "detok_genppl": 22.850390314662413, "sample_entropy": 4.072407155167, "distinct_2": 0.5417076771653543, "top_token_mass": 0.0562744140625, "model_t_mode": "post", "support_power": 1.0, "semantic_power": 1.0, "final_from": "state", "endpoint_temp": 1.45, "concentration_max": 1024.0, "update": "resample"}
|
LTA_openwebtext_dualt/logs/eval_20260508/dirichlet_step124k_256steps_n64.log
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[decode] state_c256_t1p45
|
| 2 |
+
[summary] {"name": "state_c256_t1p45", "step": 124000, "detok_genppl": 29.81545187350111, "sample_entropy": 4.085741040367163, "distinct_2": 0.5905511811023622, "top_token_mass": 0.0538330078125, "model_t_mode": "post", "support_power": 1.0, "semantic_power": 1.0, "final_from": "state", "endpoint_temp": 1.45, "concentration_max": 256.0, "update": "resample"}
|
| 3 |
+
[decode] state_c1024_t1p45
|
| 4 |
+
[summary] {"name": "state_c1024_t1p45", "step": 124000, "detok_genppl": 28.415282971033125, "sample_entropy": 4.048794239038511, "distinct_2": 0.5878444881889764, "top_token_mass": 0.057861328125, "model_t_mode": "post", "support_power": 1.0, "semantic_power": 1.0, "final_from": "state", "endpoint_temp": 1.45, "concentration_max": 1024.0, "update": "resample"}
|
LTA_openwebtext_dualt/logs/eval_20260508/dirichlet_step124k_4096steps_n64.log
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[decode] state_c256_t1p45
|
| 2 |
+
[summary] {"name": "state_c256_t1p45", "step": 124000, "detok_genppl": 15.863792771486848, "sample_entropy": 3.8477992784844646, "distinct_2": 0.3817667322834646, "top_token_mass": 0.0645751953125, "model_t_mode": "post", "support_power": 1.0, "semantic_power": 1.0, "final_from": "state", "endpoint_temp": 1.45, "concentration_max": 256.0, "update": "resample"}
|
| 3 |
+
[decode] state_c1024_t1p45
|
| 4 |
+
[summary] {"name": "state_c1024_t1p45", "step": 124000, "detok_genppl": 14.379869187838807, "sample_entropy": 3.7611245679001697, "distinct_2": 0.32258858267716534, "top_token_mass": 0.0748291015625, "model_t_mode": "post", "support_power": 1.0, "semantic_power": 1.0, "final_from": "state", "endpoint_temp": 1.45, "concentration_max": 1024.0, "update": "resample"}
|
LTA_openwebtext_dualt/logs/eval_20260508/dirichlet_step124k_steps16_n64.log
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[decode] state_c256_t1p45
|
| 2 |
+
[summary] {"name": "state_c256_t1p45", "step": 124000, "detok_genppl": 42.769466206615945, "sample_entropy": 3.1708587735321347, "distinct_2": 0.41313976377952755, "top_token_mass": 0.22607421875, "model_t_mode": "post", "support_power": 1.0, "semantic_power": 1.0, "final_from": "state", "endpoint_temp": 1.45, "concentration_max": 256.0, "update": "resample"}
|
| 3 |
+
[decode] state_c1024_t1p45
|
| 4 |
+
[summary] {"name": "state_c1024_t1p45", "step": 124000, "detok_genppl": 24.348744185198743, "sample_entropy": 2.0679188483539344, "distinct_2": 0.23363681102362205, "top_token_mass": 0.392578125, "model_t_mode": "post", "support_power": 1.0, "semantic_power": 1.0, "final_from": "state", "endpoint_temp": 1.45, "concentration_max": 1024.0, "update": "resample"}
|
LTA_openwebtext_dualt/logs/eval_20260508/dirichlet_step124k_steps32_n64.log
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[decode] state_c256_t1p45
|
| 2 |
+
[summary] {"name": "state_c256_t1p45", "step": 124000, "detok_genppl": 44.930859242990756, "sample_entropy": 3.7823763722555808, "distinct_2": 0.5745570866141733, "top_token_mass": 0.11572265625, "model_t_mode": "post", "support_power": 1.0, "semantic_power": 1.0, "final_from": "state", "endpoint_temp": 1.45, "concentration_max": 256.0, "update": "resample"}
|
| 3 |
+
[decode] state_c1024_t1p45
|
| 4 |
+
[summary] {"name": "state_c1024_t1p45", "step": 124000, "detok_genppl": 31.13635173493272, "sample_entropy": 3.3146319833825286, "distinct_2": 0.4309793307086614, "top_token_mass": 0.2288818359375, "model_t_mode": "post", "support_power": 1.0, "semantic_power": 1.0, "final_from": "state", "endpoint_temp": 1.45, "concentration_max": 1024.0, "update": "resample"}
|
LTA_openwebtext_dualt/logs/eval_20260508/dirichlet_step124k_steps64_n64.log
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[decode] state_c256_t1p45
|
| 2 |
+
[summary] {"name": "state_c256_t1p45", "step": 124000, "detok_genppl": 42.24168306980635, "sample_entropy": 4.010304198715844, "distinct_2": 0.6353346456692913, "top_token_mass": 0.0780029296875, "model_t_mode": "post", "support_power": 1.0, "semantic_power": 1.0, "final_from": "state", "endpoint_temp": 1.45, "concentration_max": 256.0, "update": "resample"}
|
| 3 |
+
[decode] state_c1024_t1p45
|
| 4 |
+
[summary] {"name": "state_c1024_t1p45", "step": 124000, "detok_genppl": 35.36294023649359, "sample_entropy": 3.884623591993139, "distinct_2": 0.5751722440944882, "top_token_mass": 0.1007080078125, "model_t_mode": "post", "support_power": 1.0, "semantic_power": 1.0, "final_from": "state", "endpoint_temp": 1.45, "concentration_max": 1024.0, "update": "resample"}
|
LTA_openwebtext_dualt/logs/eval_20260508/dirichlet_step124k_steps8_n64.log
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[decode] state_c256_t1p45
|
| 2 |
+
[summary] {"name": "state_c256_t1p45", "step": 124000, "detok_genppl": 146.35828905491644, "sample_entropy": 2.9716103029900554, "distinct_2": 0.45681594488188976, "top_token_mass": 0.2855224609375, "model_t_mode": "post", "support_power": 1.0, "semantic_power": 1.0, "final_from": "state", "endpoint_temp": 1.45, "concentration_max": 256.0, "update": "resample"}
|
| 3 |
+
[decode] state_c1024_t1p45
|
| 4 |
+
[summary] {"name": "state_c1024_t1p45", "step": 124000, "detok_genppl": 92.67198759188139, "sample_entropy": 1.7085331062756133, "distinct_2": 0.1935285433070866, "top_token_mass": 0.3070068359375, "model_t_mode": "post", "support_power": 1.0, "semantic_power": 1.0, "final_from": "state", "endpoint_temp": 1.45, "concentration_max": 1024.0, "update": "resample"}
|
LTA_openwebtext_dualt/logs/eval_20260508/lm1b_8gpu_latest_1024steps_n64.log
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[decode] state_c256_t1p45
|
| 2 |
+
[summary] {"name": "state_c256_t1p45", "step": 156000, "detok_genppl": 26.453125333929982, "sample_entropy": 3.994721810704877, "distinct_2": 0.5850147637795275, "top_token_mass": 0.0726318359375, "model_t_mode": "post", "support_power": 1.0, "semantic_power": 1.0, "final_from": "state", "endpoint_temp": 1.45, "concentration_max": 256.0, "update": "resample"}
|
| 3 |
+
[decode] state_c1024_t1p45
|
| 4 |
+
[summary] {"name": "state_c1024_t1p45", "step": 156000, "detok_genppl": 21.308442953038007, "sample_entropy": 3.9465318229680624, "distinct_2": 0.5394931102362205, "top_token_mass": 0.071533203125, "model_t_mode": "post", "support_power": 1.0, "semantic_power": 1.0, "final_from": "state", "endpoint_temp": 1.45, "concentration_max": 1024.0, "update": "resample"}
|
LTA_openwebtext_dualt/logs/eval_20260508/lm1b_8gpu_latest_diffusion_noise_steps_128steps_n64.log
ADDED
|
@@ -0,0 +1,88 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[decode] base_const1p45_c256
|
| 2 |
+
[summary] {"name": "base_const1p45_c256", "step": 187000, "detok_genppl": 45.86065023558471, "sample_entropy": 4.124430385339583, "distinct_2": 0.6966043307086615, "top_token_mass": 0.0712890625, "temp_start": 1.45, "temp_end": 1.45, "temp_schedule": "const", "t_power": 1.0, "eta0": 0.0, "eta_schedule": "none", "noise_conc": 1.0, "concentration_max": 256.0, "update": "resample"}
|
| 3 |
+
[decode] base_temp2to08_c256
|
| 4 |
+
[summary] {"name": "base_temp2to08_c256", "step": 187000, "detok_genppl": 47.86767440351385, "sample_entropy": 4.196306894865523, "distinct_2": 0.718380905511811, "top_token_mass": 0.0533447265625, "temp_start": 2.0, "temp_end": 0.8, "temp_schedule": "linear", "t_power": 1.0, "eta0": 0.0, "eta_schedule": "none", "noise_conc": 1.0, "concentration_max": 256.0, "update": "resample"}
|
| 5 |
+
[decode] tpow1p5_const1p45_c256
|
| 6 |
+
[summary] {"name": "tpow1p5_const1p45_c256", "step": 187000, "detok_genppl": 47.15461218178996, "sample_entropy": 4.124970969360973, "distinct_2": 0.7017716535433071, "top_token_mass": 0.067138671875, "temp_start": 1.45, "temp_end": 1.45, "temp_schedule": "const", "t_power": 1.5, "eta0": 0.0, "eta_schedule": "none", "noise_conc": 1.0, "concentration_max": 256.0, "update": "resample"}
|
| 7 |
+
[decode] tpow2p0_const1p45_c256
|
| 8 |
+
[summary] {"name": "tpow2p0_const1p45_c256", "step": 187000, "detok_genppl": 50.51946723435765, "sample_entropy": 4.078867688498713, "distinct_2": 0.7000492125984252, "top_token_mass": 0.0865478515625, "temp_start": 1.45, "temp_end": 1.45, "temp_schedule": "const", "t_power": 2.0, "eta0": 0.0, "eta_schedule": "none", "noise_conc": 1.0, "concentration_max": 256.0, "update": "resample"}
|
| 9 |
+
[decode] tpow0p7_const1p45_c256
|
| 10 |
+
[summary] {"name": "tpow0p7_const1p45_c256", "step": 187000, "detok_genppl": 43.35758042812266, "sample_entropy": 4.1828232497490045, "distinct_2": 0.6999261811023622, "top_token_mass": 0.055908203125, "temp_start": 1.45, "temp_end": 1.45, "temp_schedule": "const", "t_power": 0.7, "eta0": 0.0, "eta_schedule": "none", "noise_conc": 1.0, "concentration_max": 256.0, "update": "resample"}
|
| 11 |
+
[decode] noise_const1p45_early_linear_eta0p01_c256
|
| 12 |
+
[summary] {"name": "noise_const1p45_early_linear_eta0p01_c256", "step": 187000, "detok_genppl": 43.2830792498643, "sample_entropy": 4.186505573692286, "distinct_2": 0.7080462598425197, "top_token_mass": 0.055419921875, "temp_start": 1.45, "temp_end": 1.45, "temp_schedule": "const", "t_power": 1.0, "eta0": 0.01, "eta_schedule": "early_linear", "noise_conc": 1.0, "concentration_max": 256.0, "update": "resample"}
|
| 13 |
+
[decode] noise_const1p45_early_cosine_eta0p01_c256
|
| 14 |
+
[summary] {"name": "noise_const1p45_early_cosine_eta0p01_c256", "step": 187000, "detok_genppl": 42.09900669006817, "sample_entropy": 4.165938766811043, "distinct_2": 0.6938976377952756, "top_token_mass": 0.0592041015625, "temp_start": 1.45, "temp_end": 1.45, "temp_schedule": "const", "t_power": 1.0, "eta0": 0.01, "eta_schedule": "early_cosine", "noise_conc": 1.0, "concentration_max": 256.0, "update": "resample"}
|
| 15 |
+
[decode] noise_const1p45_mid_sine_eta0p01_c256
|
| 16 |
+
[summary] {"name": "noise_const1p45_mid_sine_eta0p01_c256", "step": 187000, "detok_genppl": 44.031384454172304, "sample_entropy": 4.192402841950814, "distinct_2": 0.7004183070866141, "top_token_mass": 0.0552978515625, "temp_start": 1.45, "temp_end": 1.45, "temp_schedule": "const", "t_power": 1.0, "eta0": 0.01, "eta_schedule": "mid_sine", "noise_conc": 1.0, "concentration_max": 256.0, "update": "resample"}
|
| 17 |
+
[decode] noise_const1p45_early_linear_eta0p02_c256
|
| 18 |
+
[summary] {"name": "noise_const1p45_early_linear_eta0p02_c256", "step": 187000, "detok_genppl": 44.65909863566894, "sample_entropy": 4.198494329382783, "distinct_2": 0.7079232283464567, "top_token_mass": 0.05419921875, "temp_start": 1.45, "temp_end": 1.45, "temp_schedule": "const", "t_power": 1.0, "eta0": 0.02, "eta_schedule": "early_linear", "noise_conc": 1.0, "concentration_max": 256.0, "update": "resample"}
|
| 19 |
+
[decode] noise_const1p45_early_cosine_eta0p02_c256
|
| 20 |
+
[summary] {"name": "noise_const1p45_early_cosine_eta0p02_c256", "step": 187000, "detok_genppl": 46.54789765723706, "sample_entropy": 4.189696595593963, "distinct_2": 0.7057086614173228, "top_token_mass": 0.0576171875, "temp_start": 1.45, "temp_end": 1.45, "temp_schedule": "const", "t_power": 1.0, "eta0": 0.02, "eta_schedule": "early_cosine", "noise_conc": 1.0, "concentration_max": 256.0, "update": "resample"}
|
| 21 |
+
[decode] noise_const1p45_mid_sine_eta0p02_c256
|
| 22 |
+
[summary] {"name": "noise_const1p45_mid_sine_eta0p02_c256", "step": 187000, "detok_genppl": 46.05345631456055, "sample_entropy": 4.179009616612305, "distinct_2": 0.6916830708661418, "top_token_mass": 0.05712890625, "temp_start": 1.45, "temp_end": 1.45, "temp_schedule": "const", "t_power": 1.0, "eta0": 0.02, "eta_schedule": "mid_sine", "noise_conc": 1.0, "concentration_max": 256.0, "update": "resample"}
|
| 23 |
+
[decode] noise_const1p45_early_linear_eta0p05_c256
|
| 24 |
+
[summary] {"name": "noise_const1p45_early_linear_eta0p05_c256", "step": 187000, "detok_genppl": 45.9198653520818, "sample_entropy": 4.198348275601427, "distinct_2": 0.7081692913385826, "top_token_mass": 0.0552978515625, "temp_start": 1.45, "temp_end": 1.45, "temp_schedule": "const", "t_power": 1.0, "eta0": 0.05, "eta_schedule": "early_linear", "noise_conc": 1.0, "concentration_max": 256.0, "update": "resample"}
|
| 25 |
+
[decode] noise_const1p45_early_cosine_eta0p05_c256
|
| 26 |
+
[summary] {"name": "noise_const1p45_early_cosine_eta0p05_c256", "step": 187000, "detok_genppl": 46.75523684526139, "sample_entropy": 4.197912062162802, "distinct_2": 0.7121062992125984, "top_token_mass": 0.0545654296875, "temp_start": 1.45, "temp_end": 1.45, "temp_schedule": "const", "t_power": 1.0, "eta0": 0.05, "eta_schedule": "early_cosine", "noise_conc": 1.0, "concentration_max": 256.0, "update": "resample"}
|
| 27 |
+
[decode] noise_const1p45_mid_sine_eta0p05_c256
|
| 28 |
+
[summary] {"name": "noise_const1p45_mid_sine_eta0p05_c256", "step": 187000, "detok_genppl": 42.7693226634343, "sample_entropy": 4.183180739045019, "distinct_2": 0.6919291338582677, "top_token_mass": 0.0548095703125, "temp_start": 1.45, "temp_end": 1.45, "temp_schedule": "const", "t_power": 1.0, "eta0": 0.05, "eta_schedule": "mid_sine", "noise_conc": 1.0, "concentration_max": 256.0, "update": "resample"}
|
| 29 |
+
[decode] noise_const1p45_early_linear_eta0p08_c256
|
| 30 |
+
[summary] {"name": "noise_const1p45_early_linear_eta0p08_c256", "step": 187000, "detok_genppl": 44.54583377426971, "sample_entropy": 4.1536400284735615, "distinct_2": 0.6952509842519685, "top_token_mass": 0.0660400390625, "temp_start": 1.45, "temp_end": 1.45, "temp_schedule": "const", "t_power": 1.0, "eta0": 0.08, "eta_schedule": "early_linear", "noise_conc": 1.0, "concentration_max": 256.0, "update": "resample"}
|
| 31 |
+
[decode] noise_const1p45_early_cosine_eta0p08_c256
|
| 32 |
+
[summary] {"name": "noise_const1p45_early_cosine_eta0p08_c256", "step": 187000, "detok_genppl": 42.21974970343552, "sample_entropy": 4.12835095126496, "distinct_2": 0.6998031496062992, "top_token_mass": 0.0679931640625, "temp_start": 1.45, "temp_end": 1.45, "temp_schedule": "const", "t_power": 1.0, "eta0": 0.08, "eta_schedule": "early_cosine", "noise_conc": 1.0, "concentration_max": 256.0, "update": "resample"}
|
| 33 |
+
[decode] noise_const1p45_mid_sine_eta0p08_c256
|
| 34 |
+
[summary] {"name": "noise_const1p45_mid_sine_eta0p08_c256", "step": 187000, "detok_genppl": 44.612465411027486, "sample_entropy": 4.105863199315936, "distinct_2": 0.6919291338582677, "top_token_mass": 0.0728759765625, "temp_start": 1.45, "temp_end": 1.45, "temp_schedule": "const", "t_power": 1.0, "eta0": 0.08, "eta_schedule": "mid_sine", "noise_conc": 1.0, "concentration_max": 256.0, "update": "resample"}
|
| 35 |
+
[decode] noise_const1p45_early_linear_eta0p12_c256
|
| 36 |
+
[summary] {"name": "noise_const1p45_early_linear_eta0p12_c256", "step": 187000, "detok_genppl": 46.894060450372244, "sample_entropy": 4.130338682029675, "distinct_2": 0.7098917322834646, "top_token_mass": 0.0697021484375, "temp_start": 1.45, "temp_end": 1.45, "temp_schedule": "const", "t_power": 1.0, "eta0": 0.12, "eta_schedule": "early_linear", "noise_conc": 1.0, "concentration_max": 256.0, "update": "resample"}
|
| 37 |
+
[decode] noise_const1p45_early_cosine_eta0p12_c256
|
| 38 |
+
[summary] {"name": "noise_const1p45_early_cosine_eta0p12_c256", "step": 187000, "detok_genppl": 44.33293502778378, "sample_entropy": 4.205396118072462, "distinct_2": 0.7105068897637795, "top_token_mass": 0.055908203125, "temp_start": 1.45, "temp_end": 1.45, "temp_schedule": "const", "t_power": 1.0, "eta0": 0.12, "eta_schedule": "early_cosine", "noise_conc": 1.0, "concentration_max": 256.0, "update": "resample"}
|
| 39 |
+
[decode] noise_const1p45_mid_sine_eta0p12_c256
|
| 40 |
+
[summary] {"name": "noise_const1p45_mid_sine_eta0p12_c256", "step": 187000, "detok_genppl": 44.742664272649876, "sample_entropy": 4.208047575760355, "distinct_2": 0.6950049212598425, "top_token_mass": 0.0523681640625, "temp_start": 1.45, "temp_end": 1.45, "temp_schedule": "const", "t_power": 1.0, "eta0": 0.12, "eta_schedule": "mid_sine", "noise_conc": 1.0, "concentration_max": 256.0, "update": "resample"}
|
| 41 |
+
[decode] noise_linear2to08_early_linear_eta0p01_c256
|
| 42 |
+
[summary] {"name": "noise_linear2to08_early_linear_eta0p01_c256", "step": 187000, "detok_genppl": 48.10164682261911, "sample_entropy": 4.200641298969796, "distinct_2": 0.7181348425196851, "top_token_mass": 0.052978515625, "temp_start": 2.0, "temp_end": 0.8, "temp_schedule": "linear", "t_power": 1.0, "eta0": 0.01, "eta_schedule": "early_linear", "noise_conc": 1.0, "concentration_max": 256.0, "update": "resample"}
|
| 43 |
+
[decode] noise_linear2to08_early_cosine_eta0p01_c256
|
| 44 |
+
[summary] {"name": "noise_linear2to08_early_cosine_eta0p01_c256", "step": 187000, "detok_genppl": 48.581707982325014, "sample_entropy": 4.18684190232035, "distinct_2": 0.7247785433070866, "top_token_mass": 0.0528564453125, "temp_start": 2.0, "temp_end": 0.8, "temp_schedule": "linear", "t_power": 1.0, "eta0": 0.01, "eta_schedule": "early_cosine", "noise_conc": 1.0, "concentration_max": 256.0, "update": "resample"}
|
| 45 |
+
[decode] noise_linear2to08_mid_sine_eta0p01_c256
|
| 46 |
+
[summary] {"name": "noise_linear2to08_mid_sine_eta0p01_c256", "step": 187000, "detok_genppl": 48.2817867052147, "sample_entropy": 4.19866648234313, "distinct_2": 0.7100147637795275, "top_token_mass": 0.0546875, "temp_start": 2.0, "temp_end": 0.8, "temp_schedule": "linear", "t_power": 1.0, "eta0": 0.01, "eta_schedule": "mid_sine", "noise_conc": 1.0, "concentration_max": 256.0, "update": "resample"}
|
| 47 |
+
[decode] noise_linear2to08_early_linear_eta0p02_c256
|
| 48 |
+
[summary] {"name": "noise_linear2to08_early_linear_eta0p02_c256", "step": 187000, "detok_genppl": 48.18240875879645, "sample_entropy": 4.200519902464651, "distinct_2": 0.7197342519685039, "top_token_mass": 0.0533447265625, "temp_start": 2.0, "temp_end": 0.8, "temp_schedule": "linear", "t_power": 1.0, "eta0": 0.02, "eta_schedule": "early_linear", "noise_conc": 1.0, "concentration_max": 256.0, "update": "resample"}
|
| 49 |
+
[decode] noise_linear2to08_early_cosine_eta0p02_c256
|
| 50 |
+
[summary] {"name": "noise_linear2to08_early_cosine_eta0p02_c256", "step": 187000, "detok_genppl": 49.24717321473146, "sample_entropy": 4.190932299534418, "distinct_2": 0.7204724409448819, "top_token_mass": 0.0516357421875, "temp_start": 2.0, "temp_end": 0.8, "temp_schedule": "linear", "t_power": 1.0, "eta0": 0.02, "eta_schedule": "early_cosine", "noise_conc": 1.0, "concentration_max": 256.0, "update": "resample"}
|
| 51 |
+
[decode] noise_linear2to08_mid_sine_eta0p02_c256
|
| 52 |
+
[summary] {"name": "noise_linear2to08_mid_sine_eta0p02_c256", "step": 187000, "detok_genppl": 49.82313453731583, "sample_entropy": 4.217693530581953, "distinct_2": 0.7095226377952756, "top_token_mass": 0.05419921875, "temp_start": 2.0, "temp_end": 0.8, "temp_schedule": "linear", "t_power": 1.0, "eta0": 0.02, "eta_schedule": "mid_sine", "noise_conc": 1.0, "concentration_max": 256.0, "update": "resample"}
|
| 53 |
+
[decode] noise_linear2to08_early_linear_eta0p05_c256
|
| 54 |
+
[summary] {"name": "noise_linear2to08_early_linear_eta0p05_c256", "step": 187000, "detok_genppl": 48.83178019589149, "sample_entropy": 4.2050137794841085, "distinct_2": 0.7265009842519685, "top_token_mass": 0.050537109375, "temp_start": 2.0, "temp_end": 0.8, "temp_schedule": "linear", "t_power": 1.0, "eta0": 0.05, "eta_schedule": "early_linear", "noise_conc": 1.0, "concentration_max": 256.0, "update": "resample"}
|
| 55 |
+
[decode] noise_linear2to08_early_cosine_eta0p05_c256
|
| 56 |
+
[summary] {"name": "noise_linear2to08_early_cosine_eta0p05_c256", "step": 187000, "detok_genppl": 49.35257597935552, "sample_entropy": 4.205382906118391, "distinct_2": 0.7299458661417323, "top_token_mass": 0.0531005859375, "temp_start": 2.0, "temp_end": 0.8, "temp_schedule": "linear", "t_power": 1.0, "eta0": 0.05, "eta_schedule": "early_cosine", "noise_conc": 1.0, "concentration_max": 256.0, "update": "resample"}
|
| 57 |
+
[decode] noise_linear2to08_mid_sine_eta0p05_c256
|
| 58 |
+
[summary] {"name": "noise_linear2to08_mid_sine_eta0p05_c256", "step": 187000, "detok_genppl": 49.82670395081913, "sample_entropy": 4.223790486496133, "distinct_2": 0.7219488188976378, "top_token_mass": 0.051025390625, "temp_start": 2.0, "temp_end": 0.8, "temp_schedule": "linear", "t_power": 1.0, "eta0": 0.05, "eta_schedule": "mid_sine", "noise_conc": 1.0, "concentration_max": 256.0, "update": "resample"}
|
| 59 |
+
[decode] noise_linear2to08_early_linear_eta0p08_c256
|
| 60 |
+
[summary] {"name": "noise_linear2to08_early_linear_eta0p08_c256", "step": 187000, "detok_genppl": 49.12591398887004, "sample_entropy": 4.217049731211352, "distinct_2": 0.7287155511811023, "top_token_mass": 0.051513671875, "temp_start": 2.0, "temp_end": 0.8, "temp_schedule": "linear", "t_power": 1.0, "eta0": 0.08, "eta_schedule": "early_linear", "noise_conc": 1.0, "concentration_max": 256.0, "update": "resample"}
|
| 61 |
+
[decode] noise_linear2to08_early_cosine_eta0p08_c256
|
| 62 |
+
[summary] {"name": "noise_linear2to08_early_cosine_eta0p08_c256", "step": 187000, "detok_genppl": 46.47073894768385, "sample_entropy": 4.206427871276884, "distinct_2": 0.718873031496063, "top_token_mass": 0.053466796875, "temp_start": 2.0, "temp_end": 0.8, "temp_schedule": "linear", "t_power": 1.0, "eta0": 0.08, "eta_schedule": "early_cosine", "noise_conc": 1.0, "concentration_max": 256.0, "update": "resample"}
|
| 63 |
+
[decode] noise_linear2to08_mid_sine_eta0p08_c256
|
| 64 |
+
[summary] {"name": "noise_linear2to08_mid_sine_eta0p08_c256", "step": 187000, "detok_genppl": 49.95328576578725, "sample_entropy": 4.222333312252868, "distinct_2": 0.7234251968503937, "top_token_mass": 0.0513916015625, "temp_start": 2.0, "temp_end": 0.8, "temp_schedule": "linear", "t_power": 1.0, "eta0": 0.08, "eta_schedule": "mid_sine", "noise_conc": 1.0, "concentration_max": 256.0, "update": "resample"}
|
| 65 |
+
[decode] noise_linear2to08_early_linear_eta0p12_c256
|
| 66 |
+
[summary] {"name": "noise_linear2to08_early_linear_eta0p12_c256", "step": 187000, "detok_genppl": 45.858783855541894, "sample_entropy": 4.225998514820446, "distinct_2": 0.717888779527559, "top_token_mass": 0.0540771484375, "temp_start": 2.0, "temp_end": 0.8, "temp_schedule": "linear", "t_power": 1.0, "eta0": 0.12, "eta_schedule": "early_linear", "noise_conc": 1.0, "concentration_max": 256.0, "update": "resample"}
|
| 67 |
+
[decode] noise_linear2to08_early_cosine_eta0p12_c256
|
| 68 |
+
[summary] {"name": "noise_linear2to08_early_cosine_eta0p12_c256", "step": 187000, "detok_genppl": 46.82628279884142, "sample_entropy": 4.213082036146694, "distinct_2": 0.7100147637795275, "top_token_mass": 0.0517578125, "temp_start": 2.0, "temp_end": 0.8, "temp_schedule": "linear", "t_power": 1.0, "eta0": 0.12, "eta_schedule": "early_cosine", "noise_conc": 1.0, "concentration_max": 256.0, "update": "resample"}
|
| 69 |
+
[decode] noise_linear2to08_mid_sine_eta0p12_c256
|
| 70 |
+
[summary] {"name": "noise_linear2to08_mid_sine_eta0p12_c256", "step": 187000, "detok_genppl": 47.38313453813254, "sample_entropy": 4.222067649767945, "distinct_2": 0.7165354330708661, "top_token_mass": 0.049560546875, "temp_start": 2.0, "temp_end": 0.8, "temp_schedule": "linear", "t_power": 1.0, "eta0": 0.12, "eta_schedule": "mid_sine", "noise_conc": 1.0, "concentration_max": 256.0, "update": "resample"}
|
| 71 |
+
[decode] tpow1p5_noise_linear2to08_early_cosine_eta0p02_c256
|
| 72 |
+
[summary] {"name": "tpow1p5_noise_linear2to08_early_cosine_eta0p02_c256", "step": 187000, "detok_genppl": 41.43285220336455, "sample_entropy": 4.144170494681995, "distinct_2": 0.6658464566929134, "top_token_mass": 0.058837890625, "temp_start": 2.0, "temp_end": 0.8, "temp_schedule": "linear", "t_power": 1.5, "eta0": 0.02, "eta_schedule": "early_cosine", "noise_conc": 1.0, "concentration_max": 256.0, "update": "resample"}
|
| 73 |
+
[decode] tpow1p5_noise_linear2to08_mid_sine_eta0p02_c256
|
| 74 |
+
[summary] {"name": "tpow1p5_noise_linear2to08_mid_sine_eta0p02_c256", "step": 187000, "detok_genppl": 40.688636689981976, "sample_entropy": 4.138078246400767, "distinct_2": 0.6857775590551181, "top_token_mass": 0.0615234375, "temp_start": 2.0, "temp_end": 0.8, "temp_schedule": "linear", "t_power": 1.5, "eta0": 0.02, "eta_schedule": "mid_sine", "noise_conc": 1.0, "concentration_max": 256.0, "update": "resample"}
|
| 75 |
+
[decode] tpow1p5_noise_linear2to08_early_cosine_eta0p05_c256
|
| 76 |
+
[summary] {"name": "tpow1p5_noise_linear2to08_early_cosine_eta0p05_c256", "step": 187000, "detok_genppl": 43.72608758972266, "sample_entropy": 4.146192054211393, "distinct_2": 0.6766732283464567, "top_token_mass": 0.0582275390625, "temp_start": 2.0, "temp_end": 0.8, "temp_schedule": "linear", "t_power": 1.5, "eta0": 0.05, "eta_schedule": "early_cosine", "noise_conc": 1.0, "concentration_max": 256.0, "update": "resample"}
|
| 77 |
+
[decode] tpow1p5_noise_linear2to08_mid_sine_eta0p05_c256
|
| 78 |
+
[summary] {"name": "tpow1p5_noise_linear2to08_mid_sine_eta0p05_c256", "step": 187000, "detok_genppl": 40.41088438525592, "sample_entropy": 4.153850742833741, "distinct_2": 0.6948818897637795, "top_token_mass": 0.06103515625, "temp_start": 2.0, "temp_end": 0.8, "temp_schedule": "linear", "t_power": 1.5, "eta0": 0.05, "eta_schedule": "mid_sine", "noise_conc": 1.0, "concentration_max": 256.0, "update": "resample"}
|
| 79 |
+
[decode] tpow1p5_noise_linear2to08_early_cosine_eta0p08_c256
|
| 80 |
+
[summary] {"name": "tpow1p5_noise_linear2to08_early_cosine_eta0p08_c256", "step": 187000, "detok_genppl": 42.80336239844838, "sample_entropy": 4.142722771044542, "distinct_2": 0.6817175196850394, "top_token_mass": 0.0576171875, "temp_start": 2.0, "temp_end": 0.8, "temp_schedule": "linear", "t_power": 1.5, "eta0": 0.08, "eta_schedule": "early_cosine", "noise_conc": 1.0, "concentration_max": 256.0, "update": "resample"}
|
| 81 |
+
[decode] tpow1p5_noise_linear2to08_mid_sine_eta0p08_c256
|
| 82 |
+
[summary] {"name": "tpow1p5_noise_linear2to08_mid_sine_eta0p08_c256", "step": 187000, "detok_genppl": 41.16435120941998, "sample_entropy": 4.162179467920619, "distinct_2": 0.6884842519685039, "top_token_mass": 0.0582275390625, "temp_start": 2.0, "temp_end": 0.8, "temp_schedule": "linear", "t_power": 1.5, "eta0": 0.08, "eta_schedule": "mid_sine", "noise_conc": 1.0, "concentration_max": 256.0, "update": "resample"}
|
| 83 |
+
[decode] noise_linear2to08_early_cosine_eta0p01_c1024
|
| 84 |
+
[summary] {"name": "noise_linear2to08_early_cosine_eta0p01_c1024", "step": 187000, "detok_genppl": 46.75827674418147, "sample_entropy": 4.14898724091752, "distinct_2": 0.6947588582677166, "top_token_mass": 0.05419921875, "temp_start": 2.0, "temp_end": 0.8, "temp_schedule": "linear", "t_power": 1.0, "eta0": 0.01, "eta_schedule": "early_cosine", "noise_conc": 1.0, "concentration_max": 1024.0, "update": "resample"}
|
| 85 |
+
[decode] noise_linear2to08_early_cosine_eta0p02_c1024
|
| 86 |
+
[summary] {"name": "noise_linear2to08_early_cosine_eta0p02_c1024", "step": 187000, "detok_genppl": 45.648558001763085, "sample_entropy": 4.153952417528344, "distinct_2": 0.687623031496063, "top_token_mass": 0.0555419921875, "temp_start": 2.0, "temp_end": 0.8, "temp_schedule": "linear", "t_power": 1.0, "eta0": 0.02, "eta_schedule": "early_cosine", "noise_conc": 1.0, "concentration_max": 1024.0, "update": "resample"}
|
| 87 |
+
[decode] noise_linear2to08_early_cosine_eta0p05_c1024
|
| 88 |
+
[summary] {"name": "noise_linear2to08_early_cosine_eta0p05_c1024", "step": 187000, "detok_genppl": 46.266957384208816, "sample_entropy": 4.169058739359141, "distinct_2": 0.6929133858267716, "top_token_mass": 0.0540771484375, "temp_start": 2.0, "temp_end": 0.8, "temp_schedule": "linear", "t_power": 1.0, "eta0": 0.05, "eta_schedule": "early_cosine", "noise_conc": 1.0, "concentration_max": 1024.0, "update": "resample"}
|
LTA_openwebtext_dualt/logs/eval_20260508/lm1b_8gpu_latest_step146k_128steps_n64.log
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[decode] state_c256_t1p45
|
| 2 |
+
[summary] {"name": "state_c256_t1p45", "step": 147000, "detok_genppl": 42.95175406093094, "sample_entropy": 4.177227507708967, "distinct_2": 0.7065698818897638, "top_token_mass": 0.0526123046875, "model_t_mode": "post", "support_power": 1.0, "semantic_power": 1.0, "final_from": "state", "endpoint_temp": 1.45, "concentration_max": 256.0, "update": "resample"}
|
| 3 |
+
[decode] state_c1024_t1p45
|
| 4 |
+
[summary] {"name": "state_c1024_t1p45", "step": 147000, "detok_genppl": 43.46101424967429, "sample_entropy": 4.177106000985858, "distinct_2": 0.7044783464566929, "top_token_mass": 0.063232421875, "model_t_mode": "post", "support_power": 1.0, "semantic_power": 1.0, "final_from": "state", "endpoint_temp": 1.45, "concentration_max": 1024.0, "update": "resample"}
|
LTA_openwebtext_dualt/logs/eval_20260508/lm1b_8gpu_latest_temp_push43_128steps_n64.log
ADDED
|
@@ -0,0 +1,56 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[decode] const1p7_c256
|
| 2 |
+
[summary] {"name": "const1p7_c256", "step": 175000, "detok_genppl": 73.02172589789832, "sample_entropy": 3.883921295576758, "distinct_2": 0.6646161417322834, "top_token_mass": 0.0772705078125, "temp_start": 1.7, "temp_end": 1.7, "temp_schedule": "const", "concentration_max": 256.0, "update": "resample"}
|
| 3 |
+
[decode] const1p9_c256
|
| 4 |
+
[summary] {"name": "const1p9_c256", "step": 175000, "detok_genppl": 14.922162281776664, "sample_entropy": 0.8953066809266682, "distinct_2": 0.06594488188976377, "top_token_mass": 0.535888671875, "temp_start": 1.9, "temp_end": 1.9, "temp_schedule": "const", "concentration_max": 256.0, "update": "resample"}
|
| 5 |
+
[decode] linear_2p2_to_1p0_c256
|
| 6 |
+
[summary] {"name": "linear_2p2_to_1p0_c256", "step": 175000, "detok_genppl": 58.24675350743373, "sample_entropy": 4.1634674784481325, "distinct_2": 0.7332677165354331, "top_token_mass": 0.0516357421875, "temp_start": 2.2, "temp_end": 1.0, "temp_schedule": "linear", "concentration_max": 256.0, "update": "resample"}
|
| 7 |
+
[decode] linear_2p2_to_0p8_c256
|
| 8 |
+
[summary] {"name": "linear_2p2_to_0p8_c256", "step": 175000, "detok_genppl": 56.400704956455975, "sample_entropy": 4.217845318082235, "distinct_2": 0.7203494094488189, "top_token_mass": 0.0545654296875, "temp_start": 2.2, "temp_end": 0.8, "temp_schedule": "linear", "concentration_max": 256.0, "update": "resample"}
|
| 9 |
+
[decode] linear_2p4_to_1p0_c256
|
| 10 |
+
[summary] {"name": "linear_2p4_to_1p0_c256", "step": 175000, "detok_genppl": 64.6726820413709, "sample_entropy": 3.974846120158277, "distinct_2": 0.68626968503937, "top_token_mass": 0.075439453125, "temp_start": 2.4, "temp_end": 1.0, "temp_schedule": "linear", "concentration_max": 256.0, "update": "resample"}
|
| 11 |
+
[decode] linear_2p4_to_0p8_c256
|
| 12 |
+
[summary] {"name": "linear_2p4_to_0p8_c256", "step": 175000, "detok_genppl": 58.89225325764547, "sample_entropy": 4.186939088090959, "distinct_2": 0.7390501968503937, "top_token_mass": 0.0535888671875, "temp_start": 2.4, "temp_end": 0.8, "temp_schedule": "linear", "concentration_max": 256.0, "update": "resample"}
|
| 13 |
+
[decode] linear_2p6_to_1p0_c256
|
| 14 |
+
[summary] {"name": "linear_2p6_to_1p0_c256", "step": 175000, "detok_genppl": 93.22654001417575, "sample_entropy": 3.5502153622269423, "distinct_2": 0.5669291338582677, "top_token_mass": 0.0662841796875, "temp_start": 2.6, "temp_end": 1.0, "temp_schedule": "linear", "concentration_max": 256.0, "update": "resample"}
|
| 15 |
+
[decode] linear_2p6_to_0p8_c256
|
| 16 |
+
[summary] {"name": "linear_2p6_to_0p8_c256", "step": 175000, "detok_genppl": 67.44367329956249, "sample_entropy": 4.013322546727759, "distinct_2": 0.6983267716535433, "top_token_mass": 0.062744140625, "temp_start": 2.6, "temp_end": 0.8, "temp_schedule": "linear", "concentration_max": 256.0, "update": "resample"}
|
| 17 |
+
[decode] cosine_2p2_to_1p0_c256
|
| 18 |
+
[summary] {"name": "cosine_2p2_to_1p0_c256", "step": 175000, "detok_genppl": 59.13250534161515, "sample_entropy": 4.156997882351534, "distinct_2": 0.7245324803149606, "top_token_mass": 0.0587158203125, "temp_start": 2.2, "temp_end": 1.0, "temp_schedule": "cosine", "concentration_max": 256.0, "update": "resample"}
|
| 19 |
+
[decode] cosine_2p2_to_0p8_c256
|
| 20 |
+
[summary] {"name": "cosine_2p2_to_0p8_c256", "step": 175000, "detok_genppl": 53.079358293412945, "sample_entropy": 4.194362495717261, "distinct_2": 0.71751968503937, "top_token_mass": 0.057373046875, "temp_start": 2.2, "temp_end": 0.8, "temp_schedule": "cosine", "concentration_max": 256.0, "update": "resample"}
|
| 21 |
+
[decode] cosine_2p4_to_1p0_c256
|
| 22 |
+
[summary] {"name": "cosine_2p4_to_1p0_c256", "step": 175000, "detok_genppl": 67.63405924691095, "sample_entropy": 3.9118554001672514, "distinct_2": 0.6654773622047244, "top_token_mass": 0.061767578125, "temp_start": 2.4, "temp_end": 1.0, "temp_schedule": "cosine", "concentration_max": 256.0, "update": "resample"}
|
| 23 |
+
[decode] cosine_2p4_to_0p8_c256
|
| 24 |
+
[summary] {"name": "cosine_2p4_to_0p8_c256", "step": 175000, "detok_genppl": 59.268749670223364, "sample_entropy": 4.142414604309686, "distinct_2": 0.7123523622047244, "top_token_mass": 0.0567626953125, "temp_start": 2.4, "temp_end": 0.8, "temp_schedule": "cosine", "concentration_max": 256.0, "update": "resample"}
|
| 25 |
+
[decode] cosine_2p6_to_1p0_c256
|
| 26 |
+
[summary] {"name": "cosine_2p6_to_1p0_c256", "step": 175000, "detok_genppl": 81.90252773757335, "sample_entropy": 3.697331724362402, "distinct_2": 0.609744094488189, "top_token_mass": 0.0743408203125, "temp_start": 2.6, "temp_end": 1.0, "temp_schedule": "cosine", "concentration_max": 256.0, "update": "resample"}
|
| 27 |
+
[decode] cosine_2p6_to_0p8_c256
|
| 28 |
+
[summary] {"name": "cosine_2p6_to_0p8_c256", "step": 175000, "detok_genppl": 66.93942012291302, "sample_entropy": 4.024862423606699, "distinct_2": 0.702632874015748, "top_token_mass": 0.060791015625, "temp_start": 2.6, "temp_end": 0.8, "temp_schedule": "cosine", "concentration_max": 256.0, "update": "resample"}
|
| 29 |
+
[decode] const1p7_c1024
|
| 30 |
+
[summary] {"name": "const1p7_c1024", "step": 175000, "detok_genppl": 24.95490342565739, "sample_entropy": 2.0072551200432747, "distinct_2": 0.19266732283464566, "top_token_mass": 0.1678466796875, "temp_start": 1.7, "temp_end": 1.7, "temp_schedule": "const", "concentration_max": 1024.0, "update": "resample"}
|
| 31 |
+
[decode] const1p9_c1024
|
| 32 |
+
[summary] {"name": "const1p9_c1024", "step": 175000, "detok_genppl": 3.385201792360027, "sample_entropy": 0.2811448039592659, "distinct_2": 0.01439468503937008, "top_token_mass": 0.6204833984375, "temp_start": 1.9, "temp_end": 1.9, "temp_schedule": "const", "concentration_max": 1024.0, "update": "resample"}
|
| 33 |
+
[decode] linear_2p2_to_1p0_c1024
|
| 34 |
+
[summary] {"name": "linear_2p2_to_1p0_c1024", "step": 175000, "detok_genppl": 40.143828038381145, "sample_entropy": 3.0747068927734635, "distinct_2": 0.40760334645669294, "top_token_mass": 0.1370849609375, "temp_start": 2.2, "temp_end": 1.0, "temp_schedule": "linear", "concentration_max": 1024.0, "update": "resample"}
|
| 35 |
+
[decode] linear_2p2_to_0p8_c1024
|
| 36 |
+
[summary] {"name": "linear_2p2_to_0p8_c1024", "step": 175000, "detok_genppl": 46.983933012582675, "sample_entropy": 3.9660751510643766, "distinct_2": 0.6374261811023622, "top_token_mass": 0.0643310546875, "temp_start": 2.2, "temp_end": 0.8, "temp_schedule": "linear", "concentration_max": 1024.0, "update": "resample"}
|
| 37 |
+
[decode] linear_2p4_to_1p0_c1024
|
| 38 |
+
[summary] {"name": "linear_2p4_to_1p0_c1024", "step": 175000, "detok_genppl": 36.51915972403966, "sample_entropy": 2.188378572202313, "distinct_2": 0.17679625984251968, "top_token_mass": 0.1708984375, "temp_start": 2.4, "temp_end": 1.0, "temp_schedule": "linear", "concentration_max": 1024.0, "update": "resample"}
|
| 39 |
+
[decode] linear_2p4_to_0p8_c1024
|
| 40 |
+
[summary] {"name": "linear_2p4_to_0p8_c1024", "step": 175000, "detok_genppl": 42.03332370225938, "sample_entropy": 2.927752437640226, "distinct_2": 0.37893700787401574, "top_token_mass": 0.14404296875, "temp_start": 2.4, "temp_end": 0.8, "temp_schedule": "linear", "concentration_max": 1024.0, "update": "resample"}
|
| 41 |
+
[decode] linear_2p6_to_1p0_c1024
|
| 42 |
+
[summary] {"name": "linear_2p6_to_1p0_c1024", "step": 175000, "detok_genppl": 56.54484225723178, "sample_entropy": 1.6456302306471402, "distinct_2": 0.06975885826771654, "top_token_mass": 0.28125, "temp_start": 2.6, "temp_end": 1.0, "temp_schedule": "linear", "concentration_max": 1024.0, "update": "resample"}
|
| 43 |
+
[decode] linear_2p6_to_0p8_c1024
|
| 44 |
+
[summary] {"name": "linear_2p6_to_0p8_c1024", "step": 175000, "detok_genppl": 39.52693715768492, "sample_entropy": 2.094969797494312, "distinct_2": 0.1622785433070866, "top_token_mass": 0.1690673828125, "temp_start": 2.6, "temp_end": 0.8, "temp_schedule": "linear", "concentration_max": 1024.0, "update": "resample"}
|
| 45 |
+
[decode] cosine_2p2_to_1p0_c1024
|
| 46 |
+
[summary] {"name": "cosine_2p2_to_1p0_c1024", "step": 175000, "detok_genppl": 40.03004349797324, "sample_entropy": 2.982592103912474, "distinct_2": 0.38250492125984253, "top_token_mass": 0.1568603515625, "temp_start": 2.2, "temp_end": 1.0, "temp_schedule": "cosine", "concentration_max": 1024.0, "update": "resample"}
|
| 47 |
+
[decode] cosine_2p2_to_0p8_c1024
|
| 48 |
+
[summary] {"name": "cosine_2p2_to_0p8_c1024", "step": 175000, "detok_genppl": 48.996002380359144, "sample_entropy": 3.7614902426841117, "distinct_2": 0.578986220472441, "top_token_mass": 0.092529296875, "temp_start": 2.2, "temp_end": 0.8, "temp_schedule": "cosine", "concentration_max": 1024.0, "update": "resample"}
|
| 49 |
+
[decode] cosine_2p4_to_1p0_c1024
|
| 50 |
+
[summary] {"name": "cosine_2p4_to_1p0_c1024", "step": 175000, "detok_genppl": 39.082045935365684, "sample_entropy": 2.2156439771111716, "distinct_2": 0.16818405511811024, "top_token_mass": 0.183349609375, "temp_start": 2.4, "temp_end": 1.0, "temp_schedule": "cosine", "concentration_max": 1024.0, "update": "resample"}
|
| 51 |
+
[decode] cosine_2p4_to_0p8_c1024
|
| 52 |
+
[summary] {"name": "cosine_2p4_to_0p8_c1024", "step": 175000, "detok_genppl": 38.684306933753554, "sample_entropy": 2.9657303464806444, "distinct_2": 0.374753937007874, "top_token_mass": 0.1798095703125, "temp_start": 2.4, "temp_end": 0.8, "temp_schedule": "cosine", "concentration_max": 1024.0, "update": "resample"}
|
| 53 |
+
[decode] cosine_2p6_to_1p0_c1024
|
| 54 |
+
[summary] {"name": "cosine_2p6_to_1p0_c1024", "step": 175000, "detok_genppl": 41.033074923366925, "sample_entropy": 1.7315247008660806, "distinct_2": 0.08476870078740158, "top_token_mass": 0.225341796875, "temp_start": 2.6, "temp_end": 1.0, "temp_schedule": "cosine", "concentration_max": 1024.0, "update": "resample"}
|
| 55 |
+
[decode] cosine_2p6_to_0p8_c1024
|
| 56 |
+
[summary] {"name": "cosine_2p6_to_0p8_c1024", "step": 175000, "detok_genppl": 39.47164351744778, "sample_entropy": 2.30810961807286, "distinct_2": 0.1984498031496063, "top_token_mass": 0.1820068359375, "temp_start": 2.6, "temp_end": 0.8, "temp_schedule": "cosine", "concentration_max": 1024.0, "update": "resample"}
|
LTA_openwebtext_dualt/logs/eval_20260508/lm1b_8gpu_latest_temp_schedule_128steps_n64.log
ADDED
|
@@ -0,0 +1,32 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[decode] const1p45_c256
|
| 2 |
+
[summary] {"name": "const1p45_c256", "step": 168000, "detok_genppl": 46.167692609958465, "sample_entropy": 4.203627610836947, "distinct_2": 0.7084153543307087, "top_token_mass": 0.0589599609375, "temp_start": 1.45, "temp_end": 1.45, "temp_schedule": "const", "concentration_max": 256.0, "update": "resample"}
|
| 3 |
+
[decode] linear_2p0_to_1p0_c256
|
| 4 |
+
[summary] {"name": "linear_2p0_to_1p0_c256", "step": 168000, "detok_genppl": 57.89282886876284, "sample_entropy": 4.230719319713245, "distinct_2": 0.7317913385826772, "top_token_mass": 0.0478515625, "temp_start": 2.0, "temp_end": 1.0, "temp_schedule": "linear", "concentration_max": 256.0, "update": "resample"}
|
| 5 |
+
[decode] linear_2p0_to_0p8_c256
|
| 6 |
+
[summary] {"name": "linear_2p0_to_0p8_c256", "step": 168000, "detok_genppl": 50.33995014345255, "sample_entropy": 4.20795476641468, "distinct_2": 0.7055856299212598, "top_token_mass": 0.0501708984375, "temp_start": 2.0, "temp_end": 0.8, "temp_schedule": "linear", "concentration_max": 256.0, "update": "resample"}
|
| 7 |
+
[decode] linear_1p8_to_0p8_c256
|
| 8 |
+
[summary] {"name": "linear_1p8_to_0p8_c256", "step": 168000, "detok_genppl": 42.701768026370516, "sample_entropy": 4.1504656186868445, "distinct_2": 0.6781496062992126, "top_token_mass": 0.0665283203125, "temp_start": 1.8, "temp_end": 0.8, "temp_schedule": "linear", "concentration_max": 256.0, "update": "resample"}
|
| 9 |
+
[decode] cosine_2p0_to_1p0_c256
|
| 10 |
+
[summary] {"name": "cosine_2p0_to_1p0_c256", "step": 168000, "detok_genppl": 57.69433042345068, "sample_entropy": 4.217734038811499, "distinct_2": 0.7210875984251969, "top_token_mass": 0.0491943359375, "temp_start": 2.0, "temp_end": 1.0, "temp_schedule": "cosine", "concentration_max": 256.0, "update": "resample"}
|
| 11 |
+
[decode] cosine_2p0_to_0p8_c256
|
| 12 |
+
[summary] {"name": "cosine_2p0_to_0p8_c256", "step": 168000, "detok_genppl": 52.61771836855152, "sample_entropy": 4.229658640820065, "distinct_2": 0.7153051181102362, "top_token_mass": 0.052490234375, "temp_start": 2.0, "temp_end": 0.8, "temp_schedule": "cosine", "concentration_max": 256.0, "update": "resample"}
|
| 13 |
+
[decode] cosine_1p8_to_0p8_c256
|
| 14 |
+
[summary] {"name": "cosine_1p8_to_0p8_c256", "step": 168000, "detok_genppl": 44.48490287414872, "sample_entropy": 4.189465390460094, "distinct_2": 0.6977116141732284, "top_token_mass": 0.0614013671875, "temp_start": 1.8, "temp_end": 0.8, "temp_schedule": "cosine", "concentration_max": 256.0, "update": "resample"}
|
| 15 |
+
[decode] late_2p0_to_0p8_c256
|
| 16 |
+
[summary] {"name": "late_2p0_to_0p8_c256", "step": 168000, "detok_genppl": 76.25641195724324, "sample_entropy": 4.119732817656021, "distinct_2": 0.6507135826771654, "top_token_mass": 0.050537109375, "temp_start": 2.0, "temp_end": 0.8, "temp_schedule": "late", "concentration_max": 256.0, "update": "resample"}
|
| 17 |
+
[decode] const1p45_c1024
|
| 18 |
+
[summary] {"name": "const1p45_c1024", "step": 168000, "detok_genppl": 44.64688583445828, "sample_entropy": 4.190344770313543, "distinct_2": 0.6918061023622047, "top_token_mass": 0.0584716796875, "temp_start": 1.45, "temp_end": 1.45, "temp_schedule": "const", "concentration_max": 1024.0, "update": "resample"}
|
| 19 |
+
[decode] linear_2p0_to_1p0_c1024
|
| 20 |
+
[summary] {"name": "linear_2p0_to_1p0_c1024", "step": 168000, "detok_genppl": 67.09372257056255, "sample_entropy": 4.212984973676147, "distinct_2": 0.6530511811023622, "top_token_mass": 0.0494384765625, "temp_start": 2.0, "temp_end": 1.0, "temp_schedule": "linear", "concentration_max": 1024.0, "update": "resample"}
|
| 21 |
+
[decode] linear_2p0_to_0p8_c1024
|
| 22 |
+
[summary] {"name": "linear_2p0_to_0p8_c1024", "step": 168000, "detok_genppl": 59.49218961431751, "sample_entropy": 4.231511210178493, "distinct_2": 0.7090305118110236, "top_token_mass": 0.051025390625, "temp_start": 2.0, "temp_end": 0.8, "temp_schedule": "linear", "concentration_max": 1024.0, "update": "resample"}
|
| 23 |
+
[decode] linear_1p8_to_0p8_c1024
|
| 24 |
+
[summary] {"name": "linear_1p8_to_0p8_c1024", "step": 168000, "detok_genppl": 44.10530348094567, "sample_entropy": 4.16083612008407, "distinct_2": 0.6830708661417323, "top_token_mass": 0.0643310546875, "temp_start": 1.8, "temp_end": 0.8, "temp_schedule": "linear", "concentration_max": 1024.0, "update": "resample"}
|
| 25 |
+
[decode] cosine_2p0_to_1p0_c1024
|
| 26 |
+
[summary] {"name": "cosine_2p0_to_1p0_c1024", "step": 168000, "detok_genppl": 69.45955054398952, "sample_entropy": 4.014714437300709, "distinct_2": 0.59251968503937, "top_token_mass": 0.058837890625, "temp_start": 2.0, "temp_end": 1.0, "temp_schedule": "cosine", "concentration_max": 1024.0, "update": "resample"}
|
| 27 |
+
[decode] cosine_2p0_to_0p8_c1024
|
| 28 |
+
[summary] {"name": "cosine_2p0_to_0p8_c1024", "step": 168000, "detok_genppl": 61.62004678653673, "sample_entropy": 4.226612023722781, "distinct_2": 0.68873031496063, "top_token_mass": 0.048095703125, "temp_start": 2.0, "temp_end": 0.8, "temp_schedule": "cosine", "concentration_max": 1024.0, "update": "resample"}
|
| 29 |
+
[decode] cosine_1p8_to_0p8_c1024
|
| 30 |
+
[summary] {"name": "cosine_1p8_to_0p8_c1024", "step": 168000, "detok_genppl": 49.414174907327684, "sample_entropy": 4.191741629686525, "distinct_2": 0.6957431102362205, "top_token_mass": 0.05908203125, "temp_start": 1.8, "temp_end": 0.8, "temp_schedule": "cosine", "concentration_max": 1024.0, "update": "resample"}
|
| 31 |
+
[decode] late_2p0_to_0p8_c1024
|
| 32 |
+
[summary] {"name": "late_2p0_to_0p8_c1024", "step": 168000, "detok_genppl": 30.292676715795903, "sample_entropy": 1.950918831735721, "distinct_2": 0.21345964566929135, "top_token_mass": 0.4271240234375, "temp_start": 2.0, "temp_end": 0.8, "temp_schedule": "late", "concentration_max": 1024.0, "update": "resample"}
|
LTA_openwebtext_dualt/logs/eval_20260508/mauve_step124k_n64_features.log
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
loaded s64_c256 64
|
| 2 |
+
loaded s64_c1024 64
|
| 3 |
+
loaded s128_c256 64
|
| 4 |
+
loaded s128_c1024 64
|
| 5 |
+
loaded s512_c256 64
|
| 6 |
+
loaded s512_c1024 64
|
| 7 |
+
feat gen_s64_c256_raw (64, 1280) time 1.0
|
| 8 |
+
feat gen_s64_c256_detok (64, 1280) time 0.4
|
| 9 |
+
feat gen_s64_c1024_raw (64, 1280) time 0.6
|
| 10 |
+
feat gen_s64_c1024_detok (64, 1280) time 0.5
|
| 11 |
+
feat gen_s128_c256_raw (64, 1280) time 0.5
|
| 12 |
+
feat gen_s128_c256_detok (64, 1280) time 0.5
|
| 13 |
+
feat gen_s128_c1024_raw (64, 1280) time 0.6
|
| 14 |
+
feat gen_s128_c1024_detok (64, 1280) time 0.4
|
| 15 |
+
feat gen_s512_c256_raw (64, 1280) time 0.6
|
| 16 |
+
feat gen_s512_c256_detok (64, 1280) time 0.5
|
| 17 |
+
feat gen_s512_c1024_raw (64, 1280) time 0.5
|
| 18 |
+
feat gen_s512_c1024_detok (64, 1280) time 0.5
|
| 19 |
+
feat ref_raw (64, 1280) time 0.6
|
| 20 |
+
feat ref_detok (64, 1280) time 0.5
|
| 21 |
+
DONE docs/lta_samples/metrics_20260508/mauve_step124k_n64_features.npz docs/lta_samples/metrics_20260508/mauve_step124k_n64_meta.json
|
LTA_openwebtext_dualt/logs/eval_selfcond/selfcond_step1000_dirres_n16_s256_20260514_023314.log
ADDED
|
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[ckpt] runs/lta_owt_gpt2cached_len1024_selfcond_p05_rollout1_autocastfix_c1024_ddit768x12_muon_ema_gbs512_4gpu_50k_20260514_005426/step_0001000.pt step=1000
|
| 2 |
+
[decode-base] n=16 max_len=1024 steps=256 model_t=flow
|
| 3 |
+
[decode] temp=1.30 final=state rule=dirichlet_resample support=1 semantic=1.5 anchor=state cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=argmax/1/k64/p0.95 freq_penalty=0/0/0 start_t=0 start_init=noise generated 4/16
|
| 4 |
+
[decode] temp=1.30 final=state rule=dirichlet_resample support=1 semantic=1.5 anchor=state cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=argmax/1/k64/p0.95 freq_penalty=0/0/0 start_t=0 start_init=noise generated 8/16
|
| 5 |
+
[decode] temp=1.30 final=state rule=dirichlet_resample support=1 semantic=1.5 anchor=state cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=argmax/1/k64/p0.95 freq_penalty=0/0/0 start_t=0 start_init=noise generated 12/16
|
LTA_openwebtext_dualt/logs/eval_selfcond/selfcond_step1000_online_dirres_n16_s256.log
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[ckpt] runs/lta_owt_gpt2cached_len1024_selfcond_p05_rollout1_autocastfix_c1024_ddit768x12_muon_ema_gbs512_4gpu_50k_20260514_005426/step_0001000.pt step=1000
|
| 2 |
+
[decode-base] n=16 max_len=1024 steps=256 model_t=flow
|
| 3 |
+
[decode] temp=1.30 final=state rule=dirichlet_resample support=1 semantic=1.5 anchor=state cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=argmax/1/k64/p0.95 freq_penalty=0/0/0 start_t=0 start_init=noise generated 4/16
|
| 4 |
+
[decode] temp=1.30 final=state rule=dirichlet_resample support=1 semantic=1.5 anchor=state cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=argmax/1/k64/p0.95 freq_penalty=0/0/0 start_t=0 start_init=noise generated 8/16
|
LTA_openwebtext_dualt/logs/eval_selfcond/selfcond_step1000_online_dirres_n8_s128_smoke.log
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[ckpt] runs/lta_owt_gpt2cached_len1024_selfcond_p05_rollout1_autocastfix_c1024_ddit768x12_muon_ema_gbs512_4gpu_50k_20260514_005426/step_0001000.pt step=1000
|
| 2 |
+
[decode-base] n=8 max_len=1024 steps=128 model_t=flow
|
| 3 |
+
[decode] temp=1.45 final=blend rule=dirichlet_resample support=1 semantic=1.5 anchor=state cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=argmax/1/k64/p0.95 freq_penalty=0/0/0 start_t=0 start_init=noise generated 4/8
|
| 4 |
+
[decode] temp=1.45 final=blend rule=dirichlet_resample support=1 semantic=1.5 anchor=state cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=argmax/1/k64/p0.95 freq_penalty=0/0/0 start_t=0 start_init=noise generated 8/8
|
| 5 |
+
[summary] {"type": "summary", "checkpoint": "runs/lta_owt_gpt2cached_len1024_selfcond_p05_rollout1_autocastfix_c1024_ddit768x12_muon_ema_gbs512_4gpu_50k_20260514_005426/step_0001000.pt", "step": 1000, "decode": {"steps": 128, "model_t_mode": "flow", "decode_rule": "dirichlet_resample", "support_power": 1.0, "semantic_power": 1.5, "anchor_mode": "state", "cfg_scale": 0.0, "cfg_power": 1.0, "cfg_start": 0.0, "cfg_prior": "uniform", "decode_freq_penalty_alpha": 0.0, "decode_freq_penalty_beta": 0.0, "decode_freq_penalty_floor": 0.0, "decode_freq_penalty_start": 0.0, "decode_freq_penalty_end": 1.0, "decode_freq_penalty_power": 1.0, "start_t": 0.0, "start_init": "noise", "noise_init": "dirichlet", "noise_sigma": -1.0, "dirichlet_concentration": 1.0, "concentration_min": 1.0, "concentration_max": 1024.0, "target_prob": 1.0, "endpoint_temp": 1.45, "final_from": "blend", "final_sample_mode": "argmax", "final_sample_temp": 1.0, "final_top_k": 64, "final_top_p": 0.95, "final_freq_penalty_alpha": 0.0, "final_freq_penalty_beta": 0.0, "final_freq_penalty_floor": 0.0, "lock_bos": false, "n_samples": 8, "seed": 20260514}, "raw_genppl": {"ppl": 5.401778570796268, "nll_per_token": 1.6867282643037684, "tokens": 2040, "kept_samples": 8, "total_samples": 8, "empty_rate": 0.0, "skipped_samples": 0}, "stripped_genppl": {"ppl": 4.281976989862169, "nll_per_token": 1.4544148164636947, "tokens": 2040, "kept_samples": 8, "total_samples": 8, "empty_rate": 0.0, "skipped_samples": 0}, "diversity": {"sample_entropy": 0.8891367845472721, "unique_tokens": 8, "token_count": 8192, "distinct_1": 0.0009765625, "distinct_2": 0.0050097751710654935, "top_token_mass": 0.750732421875}}
|
| 6 |
+
[done] docs/lta_samples/metrics_20260514/selfcond_step1000_quick/selfcond_step1000_online_dirres_n8_s128_smoke.jsonl
|
LTA_openwebtext_dualt/logs/fullycoupled_tpow2_wd0p1_fp32_8gpu/lta_owt_gpt2cached_len1024_fullycoupled_rmsnorm_nobias_adamw_wd0p1_tpow2_nanogpt_tf32_ddit768x12_gbs512_8gpu_1m_20260515_003246.log
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
LTA_openwebtext_dualt/logs/fullycoupled_uniform_mask1_swiglu_wd0p1_fp32_4gpu/lta_owt_gpt2cached_len1024_fullycoupled_rmsnorm_nobias_swiglu_adamw_wd0p1_uniformt_hardce_mask1p0-1p0_nanogpt_fp32_ddit768x12_gbs512_4gpu_1m_20260517_133638.log
ADDED
|
@@ -0,0 +1,163 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
NCCL version 2.25.1+cuda12.8
|
| 2 |
+
{
|
| 3 |
+
"device": "cuda:0",
|
| 4 |
+
"rank": 0,
|
| 5 |
+
"world_size": 4,
|
| 6 |
+
"samples": "owt_cached_chunks:8734897",
|
| 7 |
+
"vocab_size": 50257,
|
| 8 |
+
"tokenizer_vocab_size": 50257,
|
| 9 |
+
"save_dir": "runs/lta_owt_gpt2cached_len1024_fullycoupled_rmsnorm_nobias_swiglu_adamw_wd0p1_uniformt_hardce_mask1p0-1p0_nanogpt_fp32_ddit768x12_gbs512_4gpu_1m_20260517_133638",
|
| 10 |
+
"batch_size": 32,
|
| 11 |
+
"grad_accum": 4,
|
| 12 |
+
"effective_batch_size": 512,
|
| 13 |
+
"global_batch_size": 512,
|
| 14 |
+
"lr_schedule": "cosine",
|
| 15 |
+
"optimizer": "adamw",
|
| 16 |
+
"epochs": 0.0,
|
| 17 |
+
"steps_per_epoch": 17061,
|
| 18 |
+
"total_steps": 1000000,
|
| 19 |
+
"warmup_steps": 2000,
|
| 20 |
+
"warmup_epochs": -1.0,
|
| 21 |
+
"min_lr": 6e-05,
|
| 22 |
+
"weight_decay": 0.1,
|
| 23 |
+
"output_weight_decay": -1.0,
|
| 24 |
+
"adamw_param_groups": "nanogpt",
|
| 25 |
+
"adam_beta1": 0.9,
|
| 26 |
+
"adam_beta2": 0.95,
|
| 27 |
+
"adam_eps": 1e-08,
|
| 28 |
+
"muon_impl": "legacy",
|
| 29 |
+
"muon_momentum": 0.95,
|
| 30 |
+
"muon_ns_steps": 5,
|
| 31 |
+
"muon_update_scale": 1.0,
|
| 32 |
+
"muon_nesterov": false,
|
| 33 |
+
"muon_width_scale": false,
|
| 34 |
+
"muon_grouping": "",
|
| 35 |
+
"muon_param_count": 0,
|
| 36 |
+
"muon_adam_param_count": 0,
|
| 37 |
+
"muon_param_names": [],
|
| 38 |
+
"muon_adam_param_names": [],
|
| 39 |
+
"muon_effective_nesterov": false,
|
| 40 |
+
"muon_effective_width_scale": false,
|
| 41 |
+
"muon_effective_weight_decay": 0.1,
|
| 42 |
+
"muon_adam_fallback_nesterov": false,
|
| 43 |
+
"muon_adam_fallback_weight_decay": 0.1,
|
| 44 |
+
"ema_decay": 0.0,
|
| 45 |
+
"ema_start_step": 0,
|
| 46 |
+
"model_type": "ddit",
|
| 47 |
+
"ddit_mlp_type": "swiglu",
|
| 48 |
+
"elf_num_time_tokens": 4,
|
| 49 |
+
"elf_num_model_mode_tokens": 0,
|
| 50 |
+
"qk_norm": true,
|
| 51 |
+
"output_bias": false,
|
| 52 |
+
"output_init_std": -1.0,
|
| 53 |
+
"norm_type": "rmsnorm",
|
| 54 |
+
"target_loss": "hard_ce",
|
| 55 |
+
"linear_soft_target_power": 1.0,
|
| 56 |
+
"linear_soft_target_min_conf": 0.0,
|
| 57 |
+
"linear_soft_target_max_conf": 1.0,
|
| 58 |
+
"t_sampling_mode": "uniform",
|
| 59 |
+
"t_sampling_power": 1.0,
|
| 60 |
+
"t_sampling_eps": 0.0001,
|
| 61 |
+
"t_sampling_logit_mean": -0.22,
|
| 62 |
+
"t_sampling_logit_std": 0.5,
|
| 63 |
+
"dual_t": true,
|
| 64 |
+
"corrupt_t_mode": "same",
|
| 65 |
+
"corrupt_min_t": 0.0,
|
| 66 |
+
"corrupt_max_t": 1.0,
|
| 67 |
+
"prefix_block_prob": 0.0,
|
| 68 |
+
"prefix_block_len": 128,
|
| 69 |
+
"mask_ratio_floor_schedule": "none",
|
| 70 |
+
"dirichlet_endpoint_mode": "categorical_dual_t",
|
| 71 |
+
"dirichlet_semantic_t_mode": "same",
|
| 72 |
+
"dirichlet_semantic_t_value": 0.0,
|
| 73 |
+
"dirichlet_semantic_t_curve": "linear",
|
| 74 |
+
"dirichlet_semantic_t_power": 1.0,
|
| 75 |
+
"endpoint_sequence_random_prob_alpha": 0.0,
|
| 76 |
+
"categorical_wrong_from_full_vocab": true,
|
| 77 |
+
"categorical_wrong_from_batch_valid_tokens": false,
|
| 78 |
+
"categorical_wrong_basin_token_ids": "",
|
| 79 |
+
"categorical_wrong_basin_prob": 0.0,
|
| 80 |
+
"categorical_wrong_unigram_prob": 0.0,
|
| 81 |
+
"categorical_wrong_uniform_prob": 0.0,
|
| 82 |
+
"categorical_wrong_corpus_unigram_path": "",
|
| 83 |
+
"categorical_wrong_corpus_unigram_alpha": 1.0,
|
| 84 |
+
"categorical_wrong_basin_shared_prob": 0.0,
|
| 85 |
+
"categorical_wrong_unigram_shared_prob": 0.0,
|
| 86 |
+
"mask_mixture_original_prob": 0.0,
|
| 87 |
+
"mask_mixture_lowk_prob": 0.0,
|
| 88 |
+
"mask_mixture_lowcorrupt_prob": 0.0,
|
| 89 |
+
"mask_mixture_block_prob": 0.0,
|
| 90 |
+
"mask_mixture_all_prob": 0.0,
|
| 91 |
+
"mask_mixture_lowk_clean_tokens": "1,2,4,8,16,32,64",
|
| 92 |
+
"mask_mixture_lowcorrupt_tokens": "1,2,4,8,16,32,64",
|
| 93 |
+
"mask_mixture_block_tokens": "64,128",
|
| 94 |
+
"simplex_bridge_sampler": "dirichlet",
|
| 95 |
+
"logistic_normal_sigma_min": 0.18,
|
| 96 |
+
"logistic_normal_sigma_max": 2.2,
|
| 97 |
+
"logistic_normal_tau_min": 0.65,
|
| 98 |
+
"logistic_normal_tau_max": 1.15,
|
| 99 |
+
"torch_compile": false,
|
| 100 |
+
"compile_mode": "max-autotune",
|
| 101 |
+
"state_format": "prob",
|
| 102 |
+
"meanflow_weight": 0.0,
|
| 103 |
+
"rollout_train_prob": 0.0,
|
| 104 |
+
"rollout_train_steps": 1,
|
| 105 |
+
"rollout_train_infer_steps": 64,
|
| 106 |
+
"rollout_train_temp": 1.45,
|
| 107 |
+
"rollout_train_max_gamma": 1.0,
|
| 108 |
+
"rollout_train_corrupt_only": true,
|
| 109 |
+
"rollout_train_samplewise": false,
|
| 110 |
+
"rollout_train_compute_always": false,
|
| 111 |
+
"bridge_noise_init": "logistic_normal",
|
| 112 |
+
"noise_sigma": -1.0,
|
| 113 |
+
"allow_tf32": true,
|
| 114 |
+
"activation_checkpointing": false,
|
| 115 |
+
"activation_checkpoint_interval": 1,
|
| 116 |
+
"activation_checkpoint_scope": "block",
|
| 117 |
+
"ddp_static_graph": false,
|
| 118 |
+
"ddp_gradient_as_bucket_view": true,
|
| 119 |
+
"blocking_data_transfer": false,
|
| 120 |
+
"dataloader_prefetch_factor": 4,
|
| 121 |
+
"full_train_stats": false,
|
| 122 |
+
"tokenized_hf": false,
|
| 123 |
+
"tokenized_pad_token": "pad",
|
| 124 |
+
"elf_conditional_hf": false,
|
| 125 |
+
"record_pad_truncate": false,
|
| 126 |
+
"record_add_eos": false,
|
| 127 |
+
"record_add_special_tokens": false,
|
| 128 |
+
"record_pad_token": "pad",
|
| 129 |
+
"record_shuffle_buffer": 10000,
|
| 130 |
+
"wrap": true,
|
| 131 |
+
"wrap_mode": "stream",
|
| 132 |
+
"wrap_record_buffer_size": 200,
|
| 133 |
+
"owt_cached_chunks": true,
|
| 134 |
+
"owt_chunk_cache_dir": "/e2e-data/evad-tech-vla/wanghan58/data/small_benchmarks/langflow_2604_11748/openwebtext_lta_cached_chunks/gpt2_len1024_train_minus_100k",
|
| 135 |
+
"owt_chunk_cache_rebuild": false,
|
| 136 |
+
"owt_chunk_cache_write_batch": 4096,
|
| 137 |
+
"owt_exact_repeat_per_chunk": 0,
|
| 138 |
+
"online_chunk_shuffle": false,
|
| 139 |
+
"online_chunk_shuffle_buffer": 10000,
|
| 140 |
+
"openwebtext_split": "train_minus_100k",
|
| 141 |
+
"detokenizer": "auto",
|
| 142 |
+
"resolved_detokenizer": null,
|
| 143 |
+
"num_workers": 8,
|
| 144 |
+
"latest_every": 1000,
|
| 145 |
+
"resume_path": ""
|
| 146 |
+
}
|
| 147 |
+
step=50 epoch=1/59 epoch_step=50/17061 micro_steps=200 elapsed=388.8s lr=1.530000e-05 loss=10.7861 loss_recon=10.7861 loss_meanflow=0.0000 mean_model_t=0.4957 mean_corrupt_t=0.4957 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.4577 corrupt_frac=1.0000 acc_corrupt=0.4577 loss_corrupt=10.7861 wrong_frac=0.5041 init_acc_corrupt=0.4610 acc_corrupt_t_0p0_0p2=0.0278 corrupt_frac_t_0p0_0p2=0.2087 acc_corrupt_t_0p2_0p4=0.2440 corrupt_frac_t_0p2_0p4=0.1978 acc_corrupt_t_0p4_0p6=0.4847 corrupt_frac_t_0p4_0p6=0.1991 acc_corrupt_t_0p6_0p8=0.6804 corrupt_frac_t_0p6_0p8=0.1936 acc_corrupt_t_0p8_1p0=0.8739 corrupt_frac_t_0p8_1p0=0.2008 out_w_norm=0.4686 out_g_norm=0.7110 loss_all=10.6822 init_gold_top10=0.5175 init_gold_top100=0.5356
|
| 148 |
+
step=100 epoch=1/59 epoch_step=100/17061 micro_steps=400 elapsed=426.5s lr=3.030000e-05 loss=10.2172 loss_recon=10.2172 loss_meanflow=0.0000 mean_model_t=0.4967 mean_corrupt_t=0.4967 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.2252 corrupt_frac=1.0000 acc_corrupt=0.2252 loss_corrupt=10.2172 wrong_frac=0.5029 init_acc_corrupt=0.4624 acc_corrupt_t_0p0_0p2=0.0406 corrupt_frac_t_0p0_0p2=0.1973 acc_corrupt_t_0p2_0p4=0.0829 corrupt_frac_t_0p2_0p4=0.2069 acc_corrupt_t_0p4_0p6=0.1761 corrupt_frac_t_0p4_0p6=0.2028 acc_corrupt_t_0p6_0p8=0.3049 corrupt_frac_t_0p6_0p8=0.1972 acc_corrupt_t_0p8_1p0=0.5321 corrupt_frac_t_0p8_1p0=0.1958 out_w_norm=4.1655 out_g_norm=1.4201 loss_all=9.6351 init_gold_top10=0.5489 init_gold_top100=0.5677
|
| 149 |
+
step=150 epoch=1/59 epoch_step=150/17061 micro_steps=600 elapsed=454.5s lr=4.530000e-05 loss=8.9059 loss_recon=8.9059 loss_meanflow=0.0000 mean_model_t=0.4954 mean_corrupt_t=0.4954 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.1046 corrupt_frac=1.0000 acc_corrupt=0.1046 loss_corrupt=8.9059 wrong_frac=0.5044 init_acc_corrupt=0.4602 acc_corrupt_t_0p0_0p2=0.0359 corrupt_frac_t_0p0_0p2=0.2023 acc_corrupt_t_0p2_0p4=0.0433 corrupt_frac_t_0p2_0p4=0.2061 acc_corrupt_t_0p4_0p6=0.0754 corrupt_frac_t_0p4_0p6=0.2009 acc_corrupt_t_0p6_0p8=0.1354 corrupt_frac_t_0p6_0p8=0.1908 acc_corrupt_t_0p8_1p0=0.2374 corrupt_frac_t_0p8_1p0=0.1998 out_w_norm=12.0903 out_g_norm=1.5895 loss_all=8.1304 init_gold_top10=0.5795 init_gold_top100=0.6040
|
| 150 |
+
step=200 epoch=1/59 epoch_step=200/17061 micro_steps=800 elapsed=405.5s lr=6.030000e-05 loss=7.5686 loss_recon=7.5686 loss_meanflow=0.0000 mean_model_t=0.4978 mean_corrupt_t=0.4978 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.0820 corrupt_frac=1.0000 acc_corrupt=0.0820 loss_corrupt=7.5686 wrong_frac=0.5022 init_acc_corrupt=0.4627 acc_corrupt_t_0p0_0p2=0.0357 corrupt_frac_t_0p0_0p2=0.2045 acc_corrupt_t_0p2_0p4=0.0444 corrupt_frac_t_0p2_0p4=0.1994 acc_corrupt_t_0p4_0p6=0.0623 corrupt_frac_t_0p4_0p6=0.2011 acc_corrupt_t_0p6_0p8=0.1070 corrupt_frac_t_0p6_0p8=0.1956 acc_corrupt_t_0p8_1p0=0.1626 corrupt_frac_t_0p8_1p0=0.1994 out_w_norm=21.3144 out_g_norm=1.2391 loss_all=7.0871 init_gold_top10=0.4736 init_gold_top100=0.5074
|
| 151 |
+
step=250 epoch=1/59 epoch_step=250/17061 micro_steps=1000 elapsed=280.9s lr=7.530000e-05 loss=6.2367 loss_recon=6.2367 loss_meanflow=0.0000 mean_model_t=0.5006 mean_corrupt_t=0.5006 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.2156 corrupt_frac=1.0000 acc_corrupt=0.2156 loss_corrupt=6.2367 wrong_frac=0.4997 init_acc_corrupt=0.4660 acc_corrupt_t_0p0_0p2=0.0445 corrupt_frac_t_0p0_0p2=0.1991 acc_corrupt_t_0p2_0p4=0.1035 corrupt_frac_t_0p2_0p4=0.2019 acc_corrupt_t_0p4_0p6=0.2063 corrupt_frac_t_0p4_0p6=0.1980 acc_corrupt_t_0p6_0p8=0.3031 corrupt_frac_t_0p6_0p8=0.2026 acc_corrupt_t_0p8_1p0=0.4207 corrupt_frac_t_0p8_1p0=0.1995 out_w_norm=30.2056 out_g_norm=0.5883 loss_all=4.9413 init_gold_top10=0.5498 init_gold_top100=0.5749
|
| 152 |
+
step=300 epoch=1/59 epoch_step=300/17061 micro_steps=1200 elapsed=280.4s lr=9.030000e-05 loss=4.7166 loss_recon=4.7166 loss_meanflow=0.0000 mean_model_t=0.4985 mean_corrupt_t=0.4985 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.4289 corrupt_frac=1.0000 acc_corrupt=0.4289 loss_corrupt=4.7166 wrong_frac=0.5015 init_acc_corrupt=0.4639 acc_corrupt_t_0p0_0p2=0.0575 corrupt_frac_t_0p0_0p2=0.1995 acc_corrupt_t_0p2_0p4=0.2283 corrupt_frac_t_0p2_0p4=0.2008 acc_corrupt_t_0p4_0p6=0.4425 corrupt_frac_t_0p4_0p6=0.2034 acc_corrupt_t_0p6_0p8=0.6262 corrupt_frac_t_0p6_0p8=0.2002 acc_corrupt_t_0p8_1p0=0.7964 corrupt_frac_t_0p8_1p0=0.1961 out_w_norm=38.0298 out_g_norm=0.2246 loss_all=4.2610 init_gold_top10=0.5237 init_gold_top100=0.5499
|
| 153 |
+
step=350 epoch=1/59 epoch_step=350/17061 micro_steps=1400 elapsed=330.1s lr=1.053000e-04 loss=4.3317 loss_recon=4.3317 loss_meanflow=0.0000 mean_model_t=0.5043 mean_corrupt_t=0.5043 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.4660 corrupt_frac=1.0000 acc_corrupt=0.4660 loss_corrupt=4.3317 wrong_frac=0.4958 init_acc_corrupt=0.4701 acc_corrupt_t_0p0_0p2=0.0577 corrupt_frac_t_0p0_0p2=0.1980 acc_corrupt_t_0p2_0p4=0.2453 corrupt_frac_t_0p2_0p4=0.1964 acc_corrupt_t_0p4_0p6=0.4762 corrupt_frac_t_0p4_0p6=0.1986 acc_corrupt_t_0p6_0p8=0.6668 corrupt_frac_t_0p6_0p8=0.2000 acc_corrupt_t_0p8_1p0=0.8604 corrupt_frac_t_0p8_1p0=0.2090 out_w_norm=42.6140 out_g_norm=0.2474 loss_all=5.1987 init_gold_top10=0.3885 init_gold_top100=0.4312
|
| 154 |
+
step=400 epoch=1/59 epoch_step=400/17061 micro_steps=1600 elapsed=285.2s lr=1.203000e-04 loss=4.2408 loss_recon=4.2408 loss_meanflow=0.0000 mean_model_t=0.5011 mean_corrupt_t=0.5011 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.4764 corrupt_frac=1.0000 acc_corrupt=0.4764 loss_corrupt=4.2408 wrong_frac=0.4990 init_acc_corrupt=0.4663 acc_corrupt_t_0p0_0p2=0.0571 corrupt_frac_t_0p0_0p2=0.1959 acc_corrupt_t_0p2_0p4=0.2487 corrupt_frac_t_0p2_0p4=0.2062 acc_corrupt_t_0p4_0p6=0.4926 corrupt_frac_t_0p4_0p6=0.1989 acc_corrupt_t_0p6_0p8=0.6913 corrupt_frac_t_0p6_0p8=0.1939 acc_corrupt_t_0p8_1p0=0.8873 corrupt_frac_t_0p8_1p0=0.2050 out_w_norm=45.7084 out_g_norm=0.2444 loss_all=4.2385 init_gold_top10=0.4913 init_gold_top100=0.5316
|
| 155 |
+
step=450 epoch=1/59 epoch_step=450/17061 micro_steps=1800 elapsed=277.1s lr=1.353000e-04 loss=4.1617 loss_recon=4.1617 loss_meanflow=0.0000 mean_model_t=0.5048 mean_corrupt_t=0.5048 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.4857 corrupt_frac=1.0000 acc_corrupt=0.4857 loss_corrupt=4.1617 wrong_frac=0.4951 init_acc_corrupt=0.4704 acc_corrupt_t_0p0_0p2=0.0586 corrupt_frac_t_0p0_0p2=0.1971 acc_corrupt_t_0p2_0p4=0.2521 corrupt_frac_t_0p2_0p4=0.2003 acc_corrupt_t_0p4_0p6=0.4975 corrupt_frac_t_0p4_0p6=0.1991 acc_corrupt_t_0p6_0p8=0.6955 corrupt_frac_t_0p6_0p8=0.1892 acc_corrupt_t_0p8_1p0=0.8958 corrupt_frac_t_0p8_1p0=0.2162 out_w_norm=47.9240 out_g_norm=0.2380 loss_all=3.6068 init_gold_top10=0.5674 init_gold_top100=0.5807
|
| 156 |
+
step=500 epoch=1/59 epoch_step=500/17061 micro_steps=2000 elapsed=300.5s lr=1.503000e-04 loss=4.1949 loss_recon=4.1949 loss_meanflow=0.0000 mean_model_t=0.4975 mean_corrupt_t=0.4975 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.4796 corrupt_frac=1.0000 acc_corrupt=0.4796 loss_corrupt=4.1949 wrong_frac=0.5028 init_acc_corrupt=0.4625 acc_corrupt_t_0p0_0p2=0.0590 corrupt_frac_t_0p0_0p2=0.2006 acc_corrupt_t_0p2_0p4=0.2582 corrupt_frac_t_0p2_0p4=0.2053 acc_corrupt_t_0p4_0p6=0.4971 corrupt_frac_t_0p4_0p6=0.1923 acc_corrupt_t_0p6_0p8=0.6952 corrupt_frac_t_0p6_0p8=0.2052 acc_corrupt_t_0p8_1p0=0.8972 corrupt_frac_t_0p8_1p0=0.1975 out_w_norm=49.5350 out_g_norm=0.2347 loss_all=3.9710 init_gold_top10=0.5273 init_gold_top100=0.5571
|
| 157 |
+
step=550 epoch=1/59 epoch_step=550/17061 micro_steps=2200 elapsed=287.1s lr=1.653000e-04 loss=4.1088 loss_recon=4.1088 loss_meanflow=0.0000 mean_model_t=0.5066 mean_corrupt_t=0.5066 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.4899 corrupt_frac=1.0000 acc_corrupt=0.4899 loss_corrupt=4.1088 wrong_frac=0.4935 init_acc_corrupt=0.4726 acc_corrupt_t_0p0_0p2=0.0611 corrupt_frac_t_0p0_0p2=0.1905 acc_corrupt_t_0p2_0p4=0.2599 corrupt_frac_t_0p2_0p4=0.2028 acc_corrupt_t_0p4_0p6=0.4995 corrupt_frac_t_0p4_0p6=0.2010 acc_corrupt_t_0p6_0p8=0.7002 corrupt_frac_t_0p6_0p8=0.2051 acc_corrupt_t_0p8_1p0=0.9015 corrupt_frac_t_0p8_1p0=0.2027 out_w_norm=51.0302 out_g_norm=0.2278 loss_all=4.3138 init_gold_top10=0.4809 init_gold_top100=0.5140
|
| 158 |
+
step=600 epoch=1/59 epoch_step=600/17061 micro_steps=2400 elapsed=327.2s lr=1.803000e-04 loss=4.1122 loss_recon=4.1122 loss_meanflow=0.0000 mean_model_t=0.5025 mean_corrupt_t=0.5025 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.4889 corrupt_frac=1.0000 acc_corrupt=0.4889 loss_corrupt=4.1122 wrong_frac=0.4977 init_acc_corrupt=0.4679 acc_corrupt_t_0p0_0p2=0.0605 corrupt_frac_t_0p0_0p2=0.2006 acc_corrupt_t_0p2_0p4=0.2628 corrupt_frac_t_0p2_0p4=0.1970 acc_corrupt_t_0p4_0p6=0.5033 corrupt_frac_t_0p4_0p6=0.1941 acc_corrupt_t_0p6_0p8=0.7018 corrupt_frac_t_0p6_0p8=0.2026 acc_corrupt_t_0p8_1p0=0.8992 corrupt_frac_t_0p8_1p0=0.2078 out_w_norm=52.4291 out_g_norm=0.2511 loss_all=4.0709 init_gold_top10=0.4996 init_gold_top100=0.5313
|
| 159 |
+
step=650 epoch=1/59 epoch_step=650/17061 micro_steps=2600 elapsed=268.4s lr=1.953000e-04 loss=4.0279 loss_recon=4.0279 loss_meanflow=0.0000 mean_model_t=0.5036 mean_corrupt_t=0.5036 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.4964 corrupt_frac=1.0000 acc_corrupt=0.4964 loss_corrupt=4.0279 wrong_frac=0.4964 init_acc_corrupt=0.4702 acc_corrupt_t_0p0_0p2=0.0627 corrupt_frac_t_0p0_0p2=0.1948 acc_corrupt_t_0p2_0p4=0.2751 corrupt_frac_t_0p2_0p4=0.1997 acc_corrupt_t_0p4_0p6=0.5146 corrupt_frac_t_0p4_0p6=0.2028 acc_corrupt_t_0p6_0p8=0.7119 corrupt_frac_t_0p6_0p8=0.2009 acc_corrupt_t_0p8_1p0=0.9017 corrupt_frac_t_0p8_1p0=0.2017 out_w_norm=53.9562 out_g_norm=0.2470 loss_all=3.3140 init_gold_top10=0.5782 init_gold_top100=0.5980
|
| 160 |
+
step=700 epoch=1/59 epoch_step=700/17061 micro_steps=2800 elapsed=286.2s lr=2.103000e-04 loss=3.9270 loss_recon=3.9270 loss_meanflow=0.0000 mean_model_t=0.5033 mean_corrupt_t=0.5033 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.5030 corrupt_frac=1.0000 acc_corrupt=0.5030 loss_corrupt=3.9270 wrong_frac=0.4966 init_acc_corrupt=0.4694 acc_corrupt_t_0p0_0p2=0.0662 corrupt_frac_t_0p0_0p2=0.1959 acc_corrupt_t_0p2_0p4=0.2824 corrupt_frac_t_0p2_0p4=0.1964 acc_corrupt_t_0p4_0p6=0.5242 corrupt_frac_t_0p4_0p6=0.2042 acc_corrupt_t_0p6_0p8=0.7141 corrupt_frac_t_0p6_0p8=0.1988 acc_corrupt_t_0p8_1p0=0.9067 corrupt_frac_t_0p8_1p0=0.2047 out_w_norm=55.6574 out_g_norm=0.2686 loss_all=3.8679 init_gold_top10=0.4979 init_gold_top100=0.5239
|
| 161 |
+
step=750 epoch=1/59 epoch_step=750/17061 micro_steps=3000 elapsed=268.1s lr=2.253000e-04 loss=3.7913 loss_recon=3.7913 loss_meanflow=0.0000 mean_model_t=0.5050 mean_corrupt_t=0.5050 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.5137 corrupt_frac=1.0000 acc_corrupt=0.5137 loss_corrupt=3.7913 wrong_frac=0.4948 init_acc_corrupt=0.4710 acc_corrupt_t_0p0_0p2=0.0713 corrupt_frac_t_0p0_0p2=0.1955 acc_corrupt_t_0p2_0p4=0.2947 corrupt_frac_t_0p2_0p4=0.1966 acc_corrupt_t_0p4_0p6=0.5423 corrupt_frac_t_0p4_0p6=0.2014 acc_corrupt_t_0p6_0p8=0.7290 corrupt_frac_t_0p6_0p8=0.2086 acc_corrupt_t_0p8_1p0=0.9119 corrupt_frac_t_0p8_1p0=0.1980 out_w_norm=57.6091 out_g_norm=0.2684 loss_all=4.1275 init_gold_top10=0.4555 init_gold_top100=0.4853
|
| 162 |
+
step=800 epoch=1/59 epoch_step=800/17061 micro_steps=3200 elapsed=242.0s lr=2.403000e-04 loss=3.7452 loss_recon=3.7452 loss_meanflow=0.0000 mean_model_t=0.5002 mean_corrupt_t=0.5002 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.5149 corrupt_frac=1.0000 acc_corrupt=0.5149 loss_corrupt=3.7452 wrong_frac=0.4996 init_acc_corrupt=0.4653 acc_corrupt_t_0p0_0p2=0.0719 corrupt_frac_t_0p0_0p2=0.2013 acc_corrupt_t_0p2_0p4=0.3060 corrupt_frac_t_0p2_0p4=0.2027 acc_corrupt_t_0p4_0p6=0.5500 corrupt_frac_t_0p4_0p6=0.1976 acc_corrupt_t_0p6_0p8=0.7398 corrupt_frac_t_0p6_0p8=0.2008 acc_corrupt_t_0p8_1p0=0.9144 corrupt_frac_t_0p8_1p0=0.1988 out_w_norm=59.5190 out_g_norm=0.2504 loss_all=3.3060 init_gold_top10=0.5482 init_gold_top100=0.5781
|
| 163 |
+
step=850 epoch=1/59 epoch_step=850/17061 micro_steps=3400 elapsed=221.9s lr=2.553000e-04 loss=3.6764 loss_recon=3.6764 loss_meanflow=0.0000 mean_model_t=0.5017 mean_corrupt_t=0.5017 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.5207 corrupt_frac=1.0000 acc_corrupt=0.5207 loss_corrupt=3.6764 wrong_frac=0.4984 init_acc_corrupt=0.4672 acc_corrupt_t_0p0_0p2=0.0725 corrupt_frac_t_0p0_0p2=0.1967 acc_corrupt_t_0p2_0p4=0.3063 corrupt_frac_t_0p2_0p4=0.1995 acc_corrupt_t_0p4_0p6=0.5525 corrupt_frac_t_0p4_0p6=0.1994 acc_corrupt_t_0p6_0p8=0.7426 corrupt_frac_t_0p6_0p8=0.2030 acc_corrupt_t_0p8_1p0=0.9156 corrupt_frac_t_0p8_1p0=0.2014 out_w_norm=61.4423 out_g_norm=0.2535 loss_all=3.6387 init_gold_top10=0.4996 init_gold_top100=0.5321
|
LTA_openwebtext_dualt/logs/fullycoupled_uniform_mask1_swiglu_wd0p1_fp32_4gpu/lta_owt_gpt2cached_len1024_fullycoupled_rmsnorm_nobias_swiglu_adamw_wd0p1_uniformt_hardce_mask1p0-1p0_nanogpt_fp32_ddit768x12_gbs512_4gpu_1m_20260517_133638.outer.log
ADDED
|
@@ -0,0 +1,174 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[launch] method=owt_fullycoupled_adamw_wd0p1_nanogpt_fp32 host=di-20260411014000-djqhq time=2026-05-17T13:36:38+00:00
|
| 2 |
+
[launch] run_name=lta_owt_gpt2cached_len1024_fullycoupled_rmsnorm_nobias_swiglu_adamw_wd0p1_uniformt_hardce_mask1p0-1p0_nanogpt_fp32_ddit768x12_gbs512_4gpu_1m_20260517_133638
|
| 3 |
+
[launch] save_dir=runs/lta_owt_gpt2cached_len1024_fullycoupled_rmsnorm_nobias_swiglu_adamw_wd0p1_uniformt_hardce_mask1p0-1p0_nanogpt_fp32_ddit768x12_gbs512_4gpu_1m_20260517_133638
|
| 4 |
+
[launch] log_file=logs/fullycoupled_uniform_mask1_swiglu_wd0p1_fp32_4gpu/lta_owt_gpt2cached_len1024_fullycoupled_rmsnorm_nobias_swiglu_adamw_wd0p1_uniformt_hardce_mask1p0-1p0_nanogpt_fp32_ddit768x12_gbs512_4gpu_1m_20260517_133638.log
|
| 5 |
+
[launch] data_path=/e2e-data/evad-tech-vla/wanghan58/data/small_benchmarks/langflow_2604_11748/openwebtext
|
| 6 |
+
[launch] owt_cache=/e2e-data/evad-tech-vla/wanghan58/data/small_benchmarks/langflow_2604_11748/openwebtext_lta_cached_chunks/gpt2_len1024_train_minus_100k
|
| 7 |
+
[launch] optimizer=adamw lr=6e-4 min_lr=6e-5 wd=0.1 output_wd=-1 param_groups=nanogpt ema=0.0
|
| 8 |
+
[launch] fp32=true bf16=false tf32=true norm_type=rmsnorm output_bias=false ddit_mlp_type=swiglu batch=512 per_gpu=32
|
| 9 |
+
[launch] loss_t_weight_mode=none loss_t_min_weight=0.0 loss_t_drop_below=0.2
|
| 10 |
+
[launch] target_loss=hard_ce t_sampling_mode=uniform t_sampling_logit_mean=-0.22 t_sampling_logit_std=0.5 t_sampling_power=1.0 t_sampling_eps=1e-4 mask_ratio=1.0->1.0
|
| 11 |
+
NCCL version 2.25.1+cuda12.8
|
| 12 |
+
{
|
| 13 |
+
"device": "cuda:0",
|
| 14 |
+
"rank": 0,
|
| 15 |
+
"world_size": 4,
|
| 16 |
+
"samples": "owt_cached_chunks:8734897",
|
| 17 |
+
"vocab_size": 50257,
|
| 18 |
+
"tokenizer_vocab_size": 50257,
|
| 19 |
+
"save_dir": "runs/lta_owt_gpt2cached_len1024_fullycoupled_rmsnorm_nobias_swiglu_adamw_wd0p1_uniformt_hardce_mask1p0-1p0_nanogpt_fp32_ddit768x12_gbs512_4gpu_1m_20260517_133638",
|
| 20 |
+
"batch_size": 32,
|
| 21 |
+
"grad_accum": 4,
|
| 22 |
+
"effective_batch_size": 512,
|
| 23 |
+
"global_batch_size": 512,
|
| 24 |
+
"lr_schedule": "cosine",
|
| 25 |
+
"optimizer": "adamw",
|
| 26 |
+
"epochs": 0.0,
|
| 27 |
+
"steps_per_epoch": 17061,
|
| 28 |
+
"total_steps": 1000000,
|
| 29 |
+
"warmup_steps": 2000,
|
| 30 |
+
"warmup_epochs": -1.0,
|
| 31 |
+
"min_lr": 6e-05,
|
| 32 |
+
"weight_decay": 0.1,
|
| 33 |
+
"output_weight_decay": -1.0,
|
| 34 |
+
"adamw_param_groups": "nanogpt",
|
| 35 |
+
"adam_beta1": 0.9,
|
| 36 |
+
"adam_beta2": 0.95,
|
| 37 |
+
"adam_eps": 1e-08,
|
| 38 |
+
"muon_impl": "legacy",
|
| 39 |
+
"muon_momentum": 0.95,
|
| 40 |
+
"muon_ns_steps": 5,
|
| 41 |
+
"muon_update_scale": 1.0,
|
| 42 |
+
"muon_nesterov": false,
|
| 43 |
+
"muon_width_scale": false,
|
| 44 |
+
"muon_grouping": "",
|
| 45 |
+
"muon_param_count": 0,
|
| 46 |
+
"muon_adam_param_count": 0,
|
| 47 |
+
"muon_param_names": [],
|
| 48 |
+
"muon_adam_param_names": [],
|
| 49 |
+
"muon_effective_nesterov": false,
|
| 50 |
+
"muon_effective_width_scale": false,
|
| 51 |
+
"muon_effective_weight_decay": 0.1,
|
| 52 |
+
"muon_adam_fallback_nesterov": false,
|
| 53 |
+
"muon_adam_fallback_weight_decay": 0.1,
|
| 54 |
+
"ema_decay": 0.0,
|
| 55 |
+
"ema_start_step": 0,
|
| 56 |
+
"model_type": "ddit",
|
| 57 |
+
"ddit_mlp_type": "swiglu",
|
| 58 |
+
"elf_num_time_tokens": 4,
|
| 59 |
+
"elf_num_model_mode_tokens": 0,
|
| 60 |
+
"qk_norm": true,
|
| 61 |
+
"output_bias": false,
|
| 62 |
+
"output_init_std": -1.0,
|
| 63 |
+
"norm_type": "rmsnorm",
|
| 64 |
+
"target_loss": "hard_ce",
|
| 65 |
+
"linear_soft_target_power": 1.0,
|
| 66 |
+
"linear_soft_target_min_conf": 0.0,
|
| 67 |
+
"linear_soft_target_max_conf": 1.0,
|
| 68 |
+
"t_sampling_mode": "uniform",
|
| 69 |
+
"t_sampling_power": 1.0,
|
| 70 |
+
"t_sampling_eps": 0.0001,
|
| 71 |
+
"t_sampling_logit_mean": -0.22,
|
| 72 |
+
"t_sampling_logit_std": 0.5,
|
| 73 |
+
"dual_t": true,
|
| 74 |
+
"corrupt_t_mode": "same",
|
| 75 |
+
"corrupt_min_t": 0.0,
|
| 76 |
+
"corrupt_max_t": 1.0,
|
| 77 |
+
"prefix_block_prob": 0.0,
|
| 78 |
+
"prefix_block_len": 128,
|
| 79 |
+
"mask_ratio_floor_schedule": "none",
|
| 80 |
+
"dirichlet_endpoint_mode": "categorical_dual_t",
|
| 81 |
+
"dirichlet_semantic_t_mode": "same",
|
| 82 |
+
"dirichlet_semantic_t_value": 0.0,
|
| 83 |
+
"dirichlet_semantic_t_curve": "linear",
|
| 84 |
+
"dirichlet_semantic_t_power": 1.0,
|
| 85 |
+
"endpoint_sequence_random_prob_alpha": 0.0,
|
| 86 |
+
"categorical_wrong_from_full_vocab": true,
|
| 87 |
+
"categorical_wrong_from_batch_valid_tokens": false,
|
| 88 |
+
"categorical_wrong_basin_token_ids": "",
|
| 89 |
+
"categorical_wrong_basin_prob": 0.0,
|
| 90 |
+
"categorical_wrong_unigram_prob": 0.0,
|
| 91 |
+
"categorical_wrong_uniform_prob": 0.0,
|
| 92 |
+
"categorical_wrong_corpus_unigram_path": "",
|
| 93 |
+
"categorical_wrong_corpus_unigram_alpha": 1.0,
|
| 94 |
+
"categorical_wrong_basin_shared_prob": 0.0,
|
| 95 |
+
"categorical_wrong_unigram_shared_prob": 0.0,
|
| 96 |
+
"mask_mixture_original_prob": 0.0,
|
| 97 |
+
"mask_mixture_lowk_prob": 0.0,
|
| 98 |
+
"mask_mixture_lowcorrupt_prob": 0.0,
|
| 99 |
+
"mask_mixture_block_prob": 0.0,
|
| 100 |
+
"mask_mixture_all_prob": 0.0,
|
| 101 |
+
"mask_mixture_lowk_clean_tokens": "1,2,4,8,16,32,64",
|
| 102 |
+
"mask_mixture_lowcorrupt_tokens": "1,2,4,8,16,32,64",
|
| 103 |
+
"mask_mixture_block_tokens": "64,128",
|
| 104 |
+
"simplex_bridge_sampler": "dirichlet",
|
| 105 |
+
"logistic_normal_sigma_min": 0.18,
|
| 106 |
+
"logistic_normal_sigma_max": 2.2,
|
| 107 |
+
"logistic_normal_tau_min": 0.65,
|
| 108 |
+
"logistic_normal_tau_max": 1.15,
|
| 109 |
+
"torch_compile": false,
|
| 110 |
+
"compile_mode": "max-autotune",
|
| 111 |
+
"state_format": "prob",
|
| 112 |
+
"meanflow_weight": 0.0,
|
| 113 |
+
"rollout_train_prob": 0.0,
|
| 114 |
+
"rollout_train_steps": 1,
|
| 115 |
+
"rollout_train_infer_steps": 64,
|
| 116 |
+
"rollout_train_temp": 1.45,
|
| 117 |
+
"rollout_train_max_gamma": 1.0,
|
| 118 |
+
"rollout_train_corrupt_only": true,
|
| 119 |
+
"rollout_train_samplewise": false,
|
| 120 |
+
"rollout_train_compute_always": false,
|
| 121 |
+
"bridge_noise_init": "logistic_normal",
|
| 122 |
+
"noise_sigma": -1.0,
|
| 123 |
+
"allow_tf32": true,
|
| 124 |
+
"activation_checkpointing": false,
|
| 125 |
+
"activation_checkpoint_interval": 1,
|
| 126 |
+
"activation_checkpoint_scope": "block",
|
| 127 |
+
"ddp_static_graph": false,
|
| 128 |
+
"ddp_gradient_as_bucket_view": true,
|
| 129 |
+
"blocking_data_transfer": false,
|
| 130 |
+
"dataloader_prefetch_factor": 4,
|
| 131 |
+
"full_train_stats": false,
|
| 132 |
+
"tokenized_hf": false,
|
| 133 |
+
"tokenized_pad_token": "pad",
|
| 134 |
+
"elf_conditional_hf": false,
|
| 135 |
+
"record_pad_truncate": false,
|
| 136 |
+
"record_add_eos": false,
|
| 137 |
+
"record_add_special_tokens": false,
|
| 138 |
+
"record_pad_token": "pad",
|
| 139 |
+
"record_shuffle_buffer": 10000,
|
| 140 |
+
"wrap": true,
|
| 141 |
+
"wrap_mode": "stream",
|
| 142 |
+
"wrap_record_buffer_size": 200,
|
| 143 |
+
"owt_cached_chunks": true,
|
| 144 |
+
"owt_chunk_cache_dir": "/e2e-data/evad-tech-vla/wanghan58/data/small_benchmarks/langflow_2604_11748/openwebtext_lta_cached_chunks/gpt2_len1024_train_minus_100k",
|
| 145 |
+
"owt_chunk_cache_rebuild": false,
|
| 146 |
+
"owt_chunk_cache_write_batch": 4096,
|
| 147 |
+
"owt_exact_repeat_per_chunk": 0,
|
| 148 |
+
"online_chunk_shuffle": false,
|
| 149 |
+
"online_chunk_shuffle_buffer": 10000,
|
| 150 |
+
"openwebtext_split": "train_minus_100k",
|
| 151 |
+
"detokenizer": "auto",
|
| 152 |
+
"resolved_detokenizer": null,
|
| 153 |
+
"num_workers": 8,
|
| 154 |
+
"latest_every": 1000,
|
| 155 |
+
"resume_path": ""
|
| 156 |
+
}
|
| 157 |
+
step=50 epoch=1/59 epoch_step=50/17061 micro_steps=200 elapsed=388.8s lr=1.530000e-05 loss=10.7861 loss_recon=10.7861 loss_meanflow=0.0000 mean_model_t=0.4957 mean_corrupt_t=0.4957 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.4577 corrupt_frac=1.0000 acc_corrupt=0.4577 loss_corrupt=10.7861 wrong_frac=0.5041 init_acc_corrupt=0.4610 acc_corrupt_t_0p0_0p2=0.0278 corrupt_frac_t_0p0_0p2=0.2087 acc_corrupt_t_0p2_0p4=0.2440 corrupt_frac_t_0p2_0p4=0.1978 acc_corrupt_t_0p4_0p6=0.4847 corrupt_frac_t_0p4_0p6=0.1991 acc_corrupt_t_0p6_0p8=0.6804 corrupt_frac_t_0p6_0p8=0.1936 acc_corrupt_t_0p8_1p0=0.8739 corrupt_frac_t_0p8_1p0=0.2008 out_w_norm=0.4686 out_g_norm=0.7110 loss_all=10.6822 init_gold_top10=0.5175 init_gold_top100=0.5356
|
| 158 |
+
step=100 epoch=1/59 epoch_step=100/17061 micro_steps=400 elapsed=426.5s lr=3.030000e-05 loss=10.2172 loss_recon=10.2172 loss_meanflow=0.0000 mean_model_t=0.4967 mean_corrupt_t=0.4967 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.2252 corrupt_frac=1.0000 acc_corrupt=0.2252 loss_corrupt=10.2172 wrong_frac=0.5029 init_acc_corrupt=0.4624 acc_corrupt_t_0p0_0p2=0.0406 corrupt_frac_t_0p0_0p2=0.1973 acc_corrupt_t_0p2_0p4=0.0829 corrupt_frac_t_0p2_0p4=0.2069 acc_corrupt_t_0p4_0p6=0.1761 corrupt_frac_t_0p4_0p6=0.2028 acc_corrupt_t_0p6_0p8=0.3049 corrupt_frac_t_0p6_0p8=0.1972 acc_corrupt_t_0p8_1p0=0.5321 corrupt_frac_t_0p8_1p0=0.1958 out_w_norm=4.1655 out_g_norm=1.4201 loss_all=9.6351 init_gold_top10=0.5489 init_gold_top100=0.5677
|
| 159 |
+
step=150 epoch=1/59 epoch_step=150/17061 micro_steps=600 elapsed=454.5s lr=4.530000e-05 loss=8.9059 loss_recon=8.9059 loss_meanflow=0.0000 mean_model_t=0.4954 mean_corrupt_t=0.4954 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.1046 corrupt_frac=1.0000 acc_corrupt=0.1046 loss_corrupt=8.9059 wrong_frac=0.5044 init_acc_corrupt=0.4602 acc_corrupt_t_0p0_0p2=0.0359 corrupt_frac_t_0p0_0p2=0.2023 acc_corrupt_t_0p2_0p4=0.0433 corrupt_frac_t_0p2_0p4=0.2061 acc_corrupt_t_0p4_0p6=0.0754 corrupt_frac_t_0p4_0p6=0.2009 acc_corrupt_t_0p6_0p8=0.1354 corrupt_frac_t_0p6_0p8=0.1908 acc_corrupt_t_0p8_1p0=0.2374 corrupt_frac_t_0p8_1p0=0.1998 out_w_norm=12.0903 out_g_norm=1.5895 loss_all=8.1304 init_gold_top10=0.5795 init_gold_top100=0.6040
|
| 160 |
+
step=200 epoch=1/59 epoch_step=200/17061 micro_steps=800 elapsed=405.5s lr=6.030000e-05 loss=7.5686 loss_recon=7.5686 loss_meanflow=0.0000 mean_model_t=0.4978 mean_corrupt_t=0.4978 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.0820 corrupt_frac=1.0000 acc_corrupt=0.0820 loss_corrupt=7.5686 wrong_frac=0.5022 init_acc_corrupt=0.4627 acc_corrupt_t_0p0_0p2=0.0357 corrupt_frac_t_0p0_0p2=0.2045 acc_corrupt_t_0p2_0p4=0.0444 corrupt_frac_t_0p2_0p4=0.1994 acc_corrupt_t_0p4_0p6=0.0623 corrupt_frac_t_0p4_0p6=0.2011 acc_corrupt_t_0p6_0p8=0.1070 corrupt_frac_t_0p6_0p8=0.1956 acc_corrupt_t_0p8_1p0=0.1626 corrupt_frac_t_0p8_1p0=0.1994 out_w_norm=21.3144 out_g_norm=1.2391 loss_all=7.0871 init_gold_top10=0.4736 init_gold_top100=0.5074
|
| 161 |
+
step=250 epoch=1/59 epoch_step=250/17061 micro_steps=1000 elapsed=280.9s lr=7.530000e-05 loss=6.2367 loss_recon=6.2367 loss_meanflow=0.0000 mean_model_t=0.5006 mean_corrupt_t=0.5006 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.2156 corrupt_frac=1.0000 acc_corrupt=0.2156 loss_corrupt=6.2367 wrong_frac=0.4997 init_acc_corrupt=0.4660 acc_corrupt_t_0p0_0p2=0.0445 corrupt_frac_t_0p0_0p2=0.1991 acc_corrupt_t_0p2_0p4=0.1035 corrupt_frac_t_0p2_0p4=0.2019 acc_corrupt_t_0p4_0p6=0.2063 corrupt_frac_t_0p4_0p6=0.1980 acc_corrupt_t_0p6_0p8=0.3031 corrupt_frac_t_0p6_0p8=0.2026 acc_corrupt_t_0p8_1p0=0.4207 corrupt_frac_t_0p8_1p0=0.1995 out_w_norm=30.2056 out_g_norm=0.5883 loss_all=4.9413 init_gold_top10=0.5498 init_gold_top100=0.5749
|
| 162 |
+
step=300 epoch=1/59 epoch_step=300/17061 micro_steps=1200 elapsed=280.4s lr=9.030000e-05 loss=4.7166 loss_recon=4.7166 loss_meanflow=0.0000 mean_model_t=0.4985 mean_corrupt_t=0.4985 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.4289 corrupt_frac=1.0000 acc_corrupt=0.4289 loss_corrupt=4.7166 wrong_frac=0.5015 init_acc_corrupt=0.4639 acc_corrupt_t_0p0_0p2=0.0575 corrupt_frac_t_0p0_0p2=0.1995 acc_corrupt_t_0p2_0p4=0.2283 corrupt_frac_t_0p2_0p4=0.2008 acc_corrupt_t_0p4_0p6=0.4425 corrupt_frac_t_0p4_0p6=0.2034 acc_corrupt_t_0p6_0p8=0.6262 corrupt_frac_t_0p6_0p8=0.2002 acc_corrupt_t_0p8_1p0=0.7964 corrupt_frac_t_0p8_1p0=0.1961 out_w_norm=38.0298 out_g_norm=0.2246 loss_all=4.2610 init_gold_top10=0.5237 init_gold_top100=0.5499
|
| 163 |
+
step=350 epoch=1/59 epoch_step=350/17061 micro_steps=1400 elapsed=330.1s lr=1.053000e-04 loss=4.3317 loss_recon=4.3317 loss_meanflow=0.0000 mean_model_t=0.5043 mean_corrupt_t=0.5043 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.4660 corrupt_frac=1.0000 acc_corrupt=0.4660 loss_corrupt=4.3317 wrong_frac=0.4958 init_acc_corrupt=0.4701 acc_corrupt_t_0p0_0p2=0.0577 corrupt_frac_t_0p0_0p2=0.1980 acc_corrupt_t_0p2_0p4=0.2453 corrupt_frac_t_0p2_0p4=0.1964 acc_corrupt_t_0p4_0p6=0.4762 corrupt_frac_t_0p4_0p6=0.1986 acc_corrupt_t_0p6_0p8=0.6668 corrupt_frac_t_0p6_0p8=0.2000 acc_corrupt_t_0p8_1p0=0.8604 corrupt_frac_t_0p8_1p0=0.2090 out_w_norm=42.6140 out_g_norm=0.2474 loss_all=5.1987 init_gold_top10=0.3885 init_gold_top100=0.4312
|
| 164 |
+
step=400 epoch=1/59 epoch_step=400/17061 micro_steps=1600 elapsed=285.2s lr=1.203000e-04 loss=4.2408 loss_recon=4.2408 loss_meanflow=0.0000 mean_model_t=0.5011 mean_corrupt_t=0.5011 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.4764 corrupt_frac=1.0000 acc_corrupt=0.4764 loss_corrupt=4.2408 wrong_frac=0.4990 init_acc_corrupt=0.4663 acc_corrupt_t_0p0_0p2=0.0571 corrupt_frac_t_0p0_0p2=0.1959 acc_corrupt_t_0p2_0p4=0.2487 corrupt_frac_t_0p2_0p4=0.2062 acc_corrupt_t_0p4_0p6=0.4926 corrupt_frac_t_0p4_0p6=0.1989 acc_corrupt_t_0p6_0p8=0.6913 corrupt_frac_t_0p6_0p8=0.1939 acc_corrupt_t_0p8_1p0=0.8873 corrupt_frac_t_0p8_1p0=0.2050 out_w_norm=45.7084 out_g_norm=0.2444 loss_all=4.2385 init_gold_top10=0.4913 init_gold_top100=0.5316
|
| 165 |
+
step=450 epoch=1/59 epoch_step=450/17061 micro_steps=1800 elapsed=277.1s lr=1.353000e-04 loss=4.1617 loss_recon=4.1617 loss_meanflow=0.0000 mean_model_t=0.5048 mean_corrupt_t=0.5048 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.4857 corrupt_frac=1.0000 acc_corrupt=0.4857 loss_corrupt=4.1617 wrong_frac=0.4951 init_acc_corrupt=0.4704 acc_corrupt_t_0p0_0p2=0.0586 corrupt_frac_t_0p0_0p2=0.1971 acc_corrupt_t_0p2_0p4=0.2521 corrupt_frac_t_0p2_0p4=0.2003 acc_corrupt_t_0p4_0p6=0.4975 corrupt_frac_t_0p4_0p6=0.1991 acc_corrupt_t_0p6_0p8=0.6955 corrupt_frac_t_0p6_0p8=0.1892 acc_corrupt_t_0p8_1p0=0.8958 corrupt_frac_t_0p8_1p0=0.2162 out_w_norm=47.9240 out_g_norm=0.2380 loss_all=3.6068 init_gold_top10=0.5674 init_gold_top100=0.5807
|
| 166 |
+
step=500 epoch=1/59 epoch_step=500/17061 micro_steps=2000 elapsed=300.5s lr=1.503000e-04 loss=4.1949 loss_recon=4.1949 loss_meanflow=0.0000 mean_model_t=0.4975 mean_corrupt_t=0.4975 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.4796 corrupt_frac=1.0000 acc_corrupt=0.4796 loss_corrupt=4.1949 wrong_frac=0.5028 init_acc_corrupt=0.4625 acc_corrupt_t_0p0_0p2=0.0590 corrupt_frac_t_0p0_0p2=0.2006 acc_corrupt_t_0p2_0p4=0.2582 corrupt_frac_t_0p2_0p4=0.2053 acc_corrupt_t_0p4_0p6=0.4971 corrupt_frac_t_0p4_0p6=0.1923 acc_corrupt_t_0p6_0p8=0.6952 corrupt_frac_t_0p6_0p8=0.2052 acc_corrupt_t_0p8_1p0=0.8972 corrupt_frac_t_0p8_1p0=0.1975 out_w_norm=49.5350 out_g_norm=0.2347 loss_all=3.9710 init_gold_top10=0.5273 init_gold_top100=0.5571
|
| 167 |
+
step=550 epoch=1/59 epoch_step=550/17061 micro_steps=2200 elapsed=287.1s lr=1.653000e-04 loss=4.1088 loss_recon=4.1088 loss_meanflow=0.0000 mean_model_t=0.5066 mean_corrupt_t=0.5066 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.4899 corrupt_frac=1.0000 acc_corrupt=0.4899 loss_corrupt=4.1088 wrong_frac=0.4935 init_acc_corrupt=0.4726 acc_corrupt_t_0p0_0p2=0.0611 corrupt_frac_t_0p0_0p2=0.1905 acc_corrupt_t_0p2_0p4=0.2599 corrupt_frac_t_0p2_0p4=0.2028 acc_corrupt_t_0p4_0p6=0.4995 corrupt_frac_t_0p4_0p6=0.2010 acc_corrupt_t_0p6_0p8=0.7002 corrupt_frac_t_0p6_0p8=0.2051 acc_corrupt_t_0p8_1p0=0.9015 corrupt_frac_t_0p8_1p0=0.2027 out_w_norm=51.0302 out_g_norm=0.2278 loss_all=4.3138 init_gold_top10=0.4809 init_gold_top100=0.5140
|
| 168 |
+
step=600 epoch=1/59 epoch_step=600/17061 micro_steps=2400 elapsed=327.2s lr=1.803000e-04 loss=4.1122 loss_recon=4.1122 loss_meanflow=0.0000 mean_model_t=0.5025 mean_corrupt_t=0.5025 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.4889 corrupt_frac=1.0000 acc_corrupt=0.4889 loss_corrupt=4.1122 wrong_frac=0.4977 init_acc_corrupt=0.4679 acc_corrupt_t_0p0_0p2=0.0605 corrupt_frac_t_0p0_0p2=0.2006 acc_corrupt_t_0p2_0p4=0.2628 corrupt_frac_t_0p2_0p4=0.1970 acc_corrupt_t_0p4_0p6=0.5033 corrupt_frac_t_0p4_0p6=0.1941 acc_corrupt_t_0p6_0p8=0.7018 corrupt_frac_t_0p6_0p8=0.2026 acc_corrupt_t_0p8_1p0=0.8992 corrupt_frac_t_0p8_1p0=0.2078 out_w_norm=52.4291 out_g_norm=0.2511 loss_all=4.0709 init_gold_top10=0.4996 init_gold_top100=0.5313
|
| 169 |
+
step=650 epoch=1/59 epoch_step=650/17061 micro_steps=2600 elapsed=268.4s lr=1.953000e-04 loss=4.0279 loss_recon=4.0279 loss_meanflow=0.0000 mean_model_t=0.5036 mean_corrupt_t=0.5036 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.4964 corrupt_frac=1.0000 acc_corrupt=0.4964 loss_corrupt=4.0279 wrong_frac=0.4964 init_acc_corrupt=0.4702 acc_corrupt_t_0p0_0p2=0.0627 corrupt_frac_t_0p0_0p2=0.1948 acc_corrupt_t_0p2_0p4=0.2751 corrupt_frac_t_0p2_0p4=0.1997 acc_corrupt_t_0p4_0p6=0.5146 corrupt_frac_t_0p4_0p6=0.2028 acc_corrupt_t_0p6_0p8=0.7119 corrupt_frac_t_0p6_0p8=0.2009 acc_corrupt_t_0p8_1p0=0.9017 corrupt_frac_t_0p8_1p0=0.2017 out_w_norm=53.9562 out_g_norm=0.2470 loss_all=3.3140 init_gold_top10=0.5782 init_gold_top100=0.5980
|
| 170 |
+
step=700 epoch=1/59 epoch_step=700/17061 micro_steps=2800 elapsed=286.2s lr=2.103000e-04 loss=3.9270 loss_recon=3.9270 loss_meanflow=0.0000 mean_model_t=0.5033 mean_corrupt_t=0.5033 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.5030 corrupt_frac=1.0000 acc_corrupt=0.5030 loss_corrupt=3.9270 wrong_frac=0.4966 init_acc_corrupt=0.4694 acc_corrupt_t_0p0_0p2=0.0662 corrupt_frac_t_0p0_0p2=0.1959 acc_corrupt_t_0p2_0p4=0.2824 corrupt_frac_t_0p2_0p4=0.1964 acc_corrupt_t_0p4_0p6=0.5242 corrupt_frac_t_0p4_0p6=0.2042 acc_corrupt_t_0p6_0p8=0.7141 corrupt_frac_t_0p6_0p8=0.1988 acc_corrupt_t_0p8_1p0=0.9067 corrupt_frac_t_0p8_1p0=0.2047 out_w_norm=55.6574 out_g_norm=0.2686 loss_all=3.8679 init_gold_top10=0.4979 init_gold_top100=0.5239
|
| 171 |
+
step=750 epoch=1/59 epoch_step=750/17061 micro_steps=3000 elapsed=268.1s lr=2.253000e-04 loss=3.7913 loss_recon=3.7913 loss_meanflow=0.0000 mean_model_t=0.5050 mean_corrupt_t=0.5050 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.5137 corrupt_frac=1.0000 acc_corrupt=0.5137 loss_corrupt=3.7913 wrong_frac=0.4948 init_acc_corrupt=0.4710 acc_corrupt_t_0p0_0p2=0.0713 corrupt_frac_t_0p0_0p2=0.1955 acc_corrupt_t_0p2_0p4=0.2947 corrupt_frac_t_0p2_0p4=0.1966 acc_corrupt_t_0p4_0p6=0.5423 corrupt_frac_t_0p4_0p6=0.2014 acc_corrupt_t_0p6_0p8=0.7290 corrupt_frac_t_0p6_0p8=0.2086 acc_corrupt_t_0p8_1p0=0.9119 corrupt_frac_t_0p8_1p0=0.1980 out_w_norm=57.6091 out_g_norm=0.2684 loss_all=4.1275 init_gold_top10=0.4555 init_gold_top100=0.4853
|
| 172 |
+
step=800 epoch=1/59 epoch_step=800/17061 micro_steps=3200 elapsed=242.0s lr=2.403000e-04 loss=3.7452 loss_recon=3.7452 loss_meanflow=0.0000 mean_model_t=0.5002 mean_corrupt_t=0.5002 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.5149 corrupt_frac=1.0000 acc_corrupt=0.5149 loss_corrupt=3.7452 wrong_frac=0.4996 init_acc_corrupt=0.4653 acc_corrupt_t_0p0_0p2=0.0719 corrupt_frac_t_0p0_0p2=0.2013 acc_corrupt_t_0p2_0p4=0.3060 corrupt_frac_t_0p2_0p4=0.2027 acc_corrupt_t_0p4_0p6=0.5500 corrupt_frac_t_0p4_0p6=0.1976 acc_corrupt_t_0p6_0p8=0.7398 corrupt_frac_t_0p6_0p8=0.2008 acc_corrupt_t_0p8_1p0=0.9144 corrupt_frac_t_0p8_1p0=0.1988 out_w_norm=59.5190 out_g_norm=0.2504 loss_all=3.3060 init_gold_top10=0.5482 init_gold_top100=0.5781
|
| 173 |
+
step=850 epoch=1/59 epoch_step=850/17061 micro_steps=3400 elapsed=221.9s lr=2.553000e-04 loss=3.6764 loss_recon=3.6764 loss_meanflow=0.0000 mean_model_t=0.5017 mean_corrupt_t=0.5017 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.5207 corrupt_frac=1.0000 acc_corrupt=0.5207 loss_corrupt=3.6764 wrong_frac=0.4984 init_acc_corrupt=0.4672 acc_corrupt_t_0p0_0p2=0.0725 corrupt_frac_t_0p0_0p2=0.1967 acc_corrupt_t_0p2_0p4=0.3063 corrupt_frac_t_0p2_0p4=0.1995 acc_corrupt_t_0p4_0p6=0.5525 corrupt_frac_t_0p4_0p6=0.1994 acc_corrupt_t_0p6_0p8=0.7426 corrupt_frac_t_0p6_0p8=0.2030 acc_corrupt_t_0p8_1p0=0.9156 corrupt_frac_t_0p8_1p0=0.2014 out_w_norm=61.4423 out_g_norm=0.2535 loss_all=3.6387 init_gold_top10=0.4996 init_gold_top100=0.5321
|
| 174 |
+
Terminated
|
LTA_openwebtext_dualt/logs/genppl_lm1b_step_latest_k1024_s128_flm.log
ADDED
|
File without changes
|
LTA_openwebtext_dualt/logs/infer_owt_compact_v2048_ckpt_sweep_steps128_c256_temps_n8_large_20260520_205159.log
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
LTA_openwebtext_dualt/logs/infer_owt_compact_v8192_probe_flow_onehot_steps128_c1024_t1p45_n8_large_20260520_201801.log
ADDED
|
@@ -0,0 +1,42 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[infer] step=20359 out=docs/lta_samples/metrics_20260520/owt_compact_v8192_ckpt_probe_flow_onehot_steps128_c1024_t1p45_n8_large/step20359_flow_onehot_steps128_c1024_t1p45.jsonl
|
| 2 |
+
[ckpt] runs/lta_owt_compact_gpt2bpe_v8192_stream1024_fullycoupled_rmsnorm_nobias_adamw_wd0p1_uniformt_hardce_mask0p1-1p0_fp32_ddit768x12_gbs512_8gpu_1m_20260519_201817/step_0020359.pt step=20359
|
| 3 |
+
[decode-base] n=8 max_len=1024 steps=128 model_t=flow
|
| 4 |
+
[decode-time] schedule=linear s=[0.0,0.25] gumbel=(2.2,0.8) force_final=True t0=0.000000 t_mid=0.500000 t_end=1.000000 dt_mean=0.007812 dt_max=0.007812
|
| 5 |
+
[decode] temp=1.45 final=state rule=dual_line_resample support=1 semantic=1 anchor=onehot cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=argmax/1/k64/p0.95 freq_penalty=0/0/0 start_t=0 start_init=noise time_path=0.0000->1.0000 generated 1/8
|
| 6 |
+
[decode] temp=1.45 final=state rule=dual_line_resample support=1 semantic=1 anchor=onehot cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=argmax/1/k64/p0.95 freq_penalty=0/0/0 start_t=0 start_init=noise time_path=0.0000->1.0000 generated 2/8
|
| 7 |
+
[decode] temp=1.45 final=state rule=dual_line_resample support=1 semantic=1 anchor=onehot cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=argmax/1/k64/p0.95 freq_penalty=0/0/0 start_t=0 start_init=noise time_path=0.0000->1.0000 generated 3/8
|
| 8 |
+
[decode] temp=1.45 final=state rule=dual_line_resample support=1 semantic=1 anchor=onehot cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=argmax/1/k64/p0.95 freq_penalty=0/0/0 start_t=0 start_init=noise time_path=0.0000->1.0000 generated 4/8
|
| 9 |
+
[decode] temp=1.45 final=state rule=dual_line_resample support=1 semantic=1 anchor=onehot cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=argmax/1/k64/p0.95 freq_penalty=0/0/0 start_t=0 start_init=noise time_path=0.0000->1.0000 generated 5/8
|
| 10 |
+
[decode] temp=1.45 final=state rule=dual_line_resample support=1 semantic=1 anchor=onehot cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=argmax/1/k64/p0.95 freq_penalty=0/0/0 start_t=0 start_init=noise time_path=0.0000->1.0000 generated 6/8
|
| 11 |
+
[decode] temp=1.45 final=state rule=dual_line_resample support=1 semantic=1 anchor=onehot cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=argmax/1/k64/p0.95 freq_penalty=0/0/0 start_t=0 start_init=noise time_path=0.0000->1.0000 generated 7/8
|
| 12 |
+
[decode] temp=1.45 final=state rule=dual_line_resample support=1 semantic=1 anchor=onehot cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=argmax/1/k64/p0.95 freq_penalty=0/0/0 start_t=0 start_init=noise time_path=0.0000->1.0000 generated 8/8
|
| 13 |
+
[summary] {"type": "summary", "checkpoint": "runs/lta_owt_compact_gpt2bpe_v8192_stream1024_fullycoupled_rmsnorm_nobias_adamw_wd0p1_uniformt_hardce_mask0p1-1p0_fp32_ddit768x12_gbs512_8gpu_1m_20260519_201817/step_0020359.pt", "step": 20359, "decode": {"steps": 128, "model_t_mode": "flow", "decode_time_schedule": "linear", "decode_s_min_frac": 0.0, "decode_s_max_frac": 0.25, "decode_force_final_t": true, "decode_time_grid": [0.0, 0.0078125, 0.015625, 0.0234375, 0.03125, 0.0390625, 0.046875, 0.0546875, 0.0625, 0.0703125, 0.078125, 0.0859375, 0.09375, 0.1015625, 0.109375, 0.1171875, 0.125, 0.1328125, 0.140625, 0.1484375, 0.15625, 0.1640625, 0.171875, 0.1796875, 0.1875, 0.1953125, 0.203125, 0.2109375, 0.21875, 0.2265625, 0.234375, 0.2421875, 0.25, 0.2578125, 0.265625, 0.2734375, 0.28125, 0.2890625, 0.296875, 0.3046875, 0.3125, 0.3203125, 0.328125, 0.3359375, 0.34375, 0.3515625, 0.359375, 0.3671875, 0.375, 0.3828125, 0.390625, 0.3984375, 0.40625, 0.4140625, 0.421875, 0.4296875, 0.4375, 0.4453125, 0.453125, 0.4609375, 0.46875, 0.4765625, 0.484375, 0.4921875, 0.5, 0.5078125, 0.515625, 0.5234375, 0.53125, 0.5390625, 0.546875, 0.5546875, 0.5625, 0.5703125, 0.578125, 0.5859375, 0.59375, 0.6015625, 0.609375, 0.6171875, 0.625, 0.6328125, 0.640625, 0.6484375, 0.65625, 0.6640625, 0.671875, 0.6796875, 0.6875, 0.6953125, 0.703125, 0.7109375, 0.71875, 0.7265625, 0.734375, 0.7421875, 0.75, 0.7578125, 0.765625, 0.7734375, 0.78125, 0.7890625, 0.796875, 0.8046875, 0.8125, 0.8203125, 0.828125, 0.8359375, 0.84375, 0.8515625, 0.859375, 0.8671875, 0.875, 0.8828125, 0.890625, 0.8984375, 0.90625, 0.9140625, 0.921875, 0.9296875, 0.9375, 0.9453125, 0.953125, 0.9609375, 0.96875, 0.9765625, 0.984375, 0.9921875, 1.0], "decode_rule": "dual_line_resample", "support_power": 1.0, "semantic_power": 1.0, "anchor_mode": "onehot", "cfg_scale": 0.0, "cfg_power": 1.0, "cfg_start": 0.0, "cfg_prior": "uniform", "decode_freq_penalty_alpha": 0.0, "decode_freq_penalty_beta": 0.0, "decode_freq_penalty_floor": 0.0, "decode_freq_penalty_start": 0.0, "decode_freq_penalty_end": 1.0, "decode_freq_penalty_power": 1.0, "start_t": 0.0, "start_init": "noise", "noise_init": "dirichlet", "noise_sigma": -1.0, "dirichlet_concentration": 1.0, "concentration_min": 1.0, "concentration_max": 1024.0, "target_prob": 1.0, "endpoint_temp": 1.45, "final_from": "state", "final_sample_mode": "argmax", "final_sample_temp": 1.0, "final_top_k": 64, "final_top_p": 0.95, "final_freq_penalty_alpha": 0.0, "final_freq_penalty_beta": 0.0, "final_freq_penalty_floor": 0.0, "lock_bos": false, "n_samples": 8, "seed": 20260520}, "raw_genppl": {"ppl": 81.02696275879805, "nll_per_token": 4.394781972847733, "tokens": 2040, "kept_samples": 8, "total_samples": 8, "empty_rate": 0.0, "skipped_samples": 0}, "stripped_genppl": {"ppl": 89.86994499964428, "nll_per_token": 4.4983635696710325, "tokens": 2040, "kept_samples": 8, "total_samples": 8, "empty_rate": 0.0, "skipped_samples": 0}, "diversity": {"sample_entropy": 5.328659805364104, "unique_tokens": 1826, "token_count": 8192, "distinct_1": 0.222900390625, "distinct_2": 0.6592130987292277, "top_token_mass": 0.03173828125}}
|
| 14 |
+
[done] docs/lta_samples/metrics_20260520/owt_compact_v8192_ckpt_probe_flow_onehot_steps128_c1024_t1p45_n8_large/step20359_flow_onehot_steps128_c1024_t1p45.jsonl
|
| 15 |
+
[infer] step=81436 out=docs/lta_samples/metrics_20260520/owt_compact_v8192_ckpt_probe_flow_onehot_steps128_c1024_t1p45_n8_large/step81436_flow_onehot_steps128_c1024_t1p45.jsonl
|
| 16 |
+
[ckpt] runs/lta_owt_compact_gpt2bpe_v8192_stream1024_fullycoupled_rmsnorm_nobias_adamw_wd0p1_uniformt_hardce_mask0p1-1p0_fp32_ddit768x12_gbs512_8gpu_1m_20260519_201817/step_0081436.pt step=81436
|
| 17 |
+
[decode-base] n=8 max_len=1024 steps=128 model_t=flow
|
| 18 |
+
[decode-time] schedule=linear s=[0.0,0.25] gumbel=(2.2,0.8) force_final=True t0=0.000000 t_mid=0.500000 t_end=1.000000 dt_mean=0.007812 dt_max=0.007812
|
| 19 |
+
[decode] temp=1.45 final=state rule=dual_line_resample support=1 semantic=1 anchor=onehot cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=argmax/1/k64/p0.95 freq_penalty=0/0/0 start_t=0 start_init=noise time_path=0.0000->1.0000 generated 1/8
|
| 20 |
+
[decode] temp=1.45 final=state rule=dual_line_resample support=1 semantic=1 anchor=onehot cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=argmax/1/k64/p0.95 freq_penalty=0/0/0 start_t=0 start_init=noise time_path=0.0000->1.0000 generated 2/8
|
| 21 |
+
[decode] temp=1.45 final=state rule=dual_line_resample support=1 semantic=1 anchor=onehot cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=argmax/1/k64/p0.95 freq_penalty=0/0/0 start_t=0 start_init=noise time_path=0.0000->1.0000 generated 3/8
|
| 22 |
+
[decode] temp=1.45 final=state rule=dual_line_resample support=1 semantic=1 anchor=onehot cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=argmax/1/k64/p0.95 freq_penalty=0/0/0 start_t=0 start_init=noise time_path=0.0000->1.0000 generated 4/8
|
| 23 |
+
[decode] temp=1.45 final=state rule=dual_line_resample support=1 semantic=1 anchor=onehot cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=argmax/1/k64/p0.95 freq_penalty=0/0/0 start_t=0 start_init=noise time_path=0.0000->1.0000 generated 5/8
|
| 24 |
+
[decode] temp=1.45 final=state rule=dual_line_resample support=1 semantic=1 anchor=onehot cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=argmax/1/k64/p0.95 freq_penalty=0/0/0 start_t=0 start_init=noise time_path=0.0000->1.0000 generated 6/8
|
| 25 |
+
[decode] temp=1.45 final=state rule=dual_line_resample support=1 semantic=1 anchor=onehot cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=argmax/1/k64/p0.95 freq_penalty=0/0/0 start_t=0 start_init=noise time_path=0.0000->1.0000 generated 7/8
|
| 26 |
+
[decode] temp=1.45 final=state rule=dual_line_resample support=1 semantic=1 anchor=onehot cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=argmax/1/k64/p0.95 freq_penalty=0/0/0 start_t=0 start_init=noise time_path=0.0000->1.0000 generated 8/8
|
| 27 |
+
[summary] {"type": "summary", "checkpoint": "runs/lta_owt_compact_gpt2bpe_v8192_stream1024_fullycoupled_rmsnorm_nobias_adamw_wd0p1_uniformt_hardce_mask0p1-1p0_fp32_ddit768x12_gbs512_8gpu_1m_20260519_201817/step_0081436.pt", "step": 81436, "decode": {"steps": 128, "model_t_mode": "flow", "decode_time_schedule": "linear", "decode_s_min_frac": 0.0, "decode_s_max_frac": 0.25, "decode_force_final_t": true, "decode_time_grid": [0.0, 0.0078125, 0.015625, 0.0234375, 0.03125, 0.0390625, 0.046875, 0.0546875, 0.0625, 0.0703125, 0.078125, 0.0859375, 0.09375, 0.1015625, 0.109375, 0.1171875, 0.125, 0.1328125, 0.140625, 0.1484375, 0.15625, 0.1640625, 0.171875, 0.1796875, 0.1875, 0.1953125, 0.203125, 0.2109375, 0.21875, 0.2265625, 0.234375, 0.2421875, 0.25, 0.2578125, 0.265625, 0.2734375, 0.28125, 0.2890625, 0.296875, 0.3046875, 0.3125, 0.3203125, 0.328125, 0.3359375, 0.34375, 0.3515625, 0.359375, 0.3671875, 0.375, 0.3828125, 0.390625, 0.3984375, 0.40625, 0.4140625, 0.421875, 0.4296875, 0.4375, 0.4453125, 0.453125, 0.4609375, 0.46875, 0.4765625, 0.484375, 0.4921875, 0.5, 0.5078125, 0.515625, 0.5234375, 0.53125, 0.5390625, 0.546875, 0.5546875, 0.5625, 0.5703125, 0.578125, 0.5859375, 0.59375, 0.6015625, 0.609375, 0.6171875, 0.625, 0.6328125, 0.640625, 0.6484375, 0.65625, 0.6640625, 0.671875, 0.6796875, 0.6875, 0.6953125, 0.703125, 0.7109375, 0.71875, 0.7265625, 0.734375, 0.7421875, 0.75, 0.7578125, 0.765625, 0.7734375, 0.78125, 0.7890625, 0.796875, 0.8046875, 0.8125, 0.8203125, 0.828125, 0.8359375, 0.84375, 0.8515625, 0.859375, 0.8671875, 0.875, 0.8828125, 0.890625, 0.8984375, 0.90625, 0.9140625, 0.921875, 0.9296875, 0.9375, 0.9453125, 0.953125, 0.9609375, 0.96875, 0.9765625, 0.984375, 0.9921875, 1.0], "decode_rule": "dual_line_resample", "support_power": 1.0, "semantic_power": 1.0, "anchor_mode": "onehot", "cfg_scale": 0.0, "cfg_power": 1.0, "cfg_start": 0.0, "cfg_prior": "uniform", "decode_freq_penalty_alpha": 0.0, "decode_freq_penalty_beta": 0.0, "decode_freq_penalty_floor": 0.0, "decode_freq_penalty_start": 0.0, "decode_freq_penalty_end": 1.0, "decode_freq_penalty_power": 1.0, "start_t": 0.0, "start_init": "noise", "noise_init": "dirichlet", "noise_sigma": -1.0, "dirichlet_concentration": 1.0, "concentration_min": 1.0, "concentration_max": 1024.0, "target_prob": 1.0, "endpoint_temp": 1.45, "final_from": "state", "final_sample_mode": "argmax", "final_sample_temp": 1.0, "final_top_k": 64, "final_top_p": 0.95, "final_freq_penalty_alpha": 0.0, "final_freq_penalty_beta": 0.0, "final_freq_penalty_floor": 0.0, "lock_bos": false, "n_samples": 8, "seed": 20260520}, "raw_genppl": {"ppl": 59.11577517968969, "nll_per_token": 4.079497812308517, "tokens": 2040, "kept_samples": 8, "total_samples": 8, "empty_rate": 0.0, "skipped_samples": 0}, "stripped_genppl": {"ppl": 68.59219993052562, "nll_per_token": 4.228178824630438, "tokens": 2040, "kept_samples": 8, "total_samples": 8, "empty_rate": 0.0, "skipped_samples": 0}, "diversity": {"sample_entropy": 4.876995635387916, "unique_tokens": 1754, "token_count": 8192, "distinct_1": 0.214111328125, "distinct_2": 0.5983626588465298, "top_token_mass": 0.0494384765625}}
|
| 28 |
+
[done] docs/lta_samples/metrics_20260520/owt_compact_v8192_ckpt_probe_flow_onehot_steps128_c1024_t1p45_n8_large/step81436_flow_onehot_steps128_c1024_t1p45.jsonl
|
| 29 |
+
[infer] step=142513 out=docs/lta_samples/metrics_20260520/owt_compact_v8192_ckpt_probe_flow_onehot_steps128_c1024_t1p45_n8_large/step142513_flow_onehot_steps128_c1024_t1p45.jsonl
|
| 30 |
+
[ckpt] runs/lta_owt_compact_gpt2bpe_v8192_stream1024_fullycoupled_rmsnorm_nobias_adamw_wd0p1_uniformt_hardce_mask0p1-1p0_fp32_ddit768x12_gbs512_8gpu_1m_20260519_201817/step_0142513.pt step=142513
|
| 31 |
+
[decode-base] n=8 max_len=1024 steps=128 model_t=flow
|
| 32 |
+
[decode-time] schedule=linear s=[0.0,0.25] gumbel=(2.2,0.8) force_final=True t0=0.000000 t_mid=0.500000 t_end=1.000000 dt_mean=0.007812 dt_max=0.007812
|
| 33 |
+
[decode] temp=1.45 final=state rule=dual_line_resample support=1 semantic=1 anchor=onehot cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=argmax/1/k64/p0.95 freq_penalty=0/0/0 start_t=0 start_init=noise time_path=0.0000->1.0000 generated 1/8
|
| 34 |
+
[decode] temp=1.45 final=state rule=dual_line_resample support=1 semantic=1 anchor=onehot cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=argmax/1/k64/p0.95 freq_penalty=0/0/0 start_t=0 start_init=noise time_path=0.0000->1.0000 generated 2/8
|
| 35 |
+
[decode] temp=1.45 final=state rule=dual_line_resample support=1 semantic=1 anchor=onehot cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=argmax/1/k64/p0.95 freq_penalty=0/0/0 start_t=0 start_init=noise time_path=0.0000->1.0000 generated 3/8
|
| 36 |
+
[decode] temp=1.45 final=state rule=dual_line_resample support=1 semantic=1 anchor=onehot cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=argmax/1/k64/p0.95 freq_penalty=0/0/0 start_t=0 start_init=noise time_path=0.0000->1.0000 generated 4/8
|
| 37 |
+
[decode] temp=1.45 final=state rule=dual_line_resample support=1 semantic=1 anchor=onehot cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=argmax/1/k64/p0.95 freq_penalty=0/0/0 start_t=0 start_init=noise time_path=0.0000->1.0000 generated 5/8
|
| 38 |
+
[decode] temp=1.45 final=state rule=dual_line_resample support=1 semantic=1 anchor=onehot cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=argmax/1/k64/p0.95 freq_penalty=0/0/0 start_t=0 start_init=noise time_path=0.0000->1.0000 generated 6/8
|
| 39 |
+
[decode] temp=1.45 final=state rule=dual_line_resample support=1 semantic=1 anchor=onehot cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=argmax/1/k64/p0.95 freq_penalty=0/0/0 start_t=0 start_init=noise time_path=0.0000->1.0000 generated 7/8
|
| 40 |
+
[decode] temp=1.45 final=state rule=dual_line_resample support=1 semantic=1 anchor=onehot cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=argmax/1/k64/p0.95 freq_penalty=0/0/0 start_t=0 start_init=noise time_path=0.0000->1.0000 generated 8/8
|
| 41 |
+
[summary] {"type": "summary", "checkpoint": "runs/lta_owt_compact_gpt2bpe_v8192_stream1024_fullycoupled_rmsnorm_nobias_adamw_wd0p1_uniformt_hardce_mask0p1-1p0_fp32_ddit768x12_gbs512_8gpu_1m_20260519_201817/step_0142513.pt", "step": 142513, "decode": {"steps": 128, "model_t_mode": "flow", "decode_time_schedule": "linear", "decode_s_min_frac": 0.0, "decode_s_max_frac": 0.25, "decode_force_final_t": true, "decode_time_grid": [0.0, 0.0078125, 0.015625, 0.0234375, 0.03125, 0.0390625, 0.046875, 0.0546875, 0.0625, 0.0703125, 0.078125, 0.0859375, 0.09375, 0.1015625, 0.109375, 0.1171875, 0.125, 0.1328125, 0.140625, 0.1484375, 0.15625, 0.1640625, 0.171875, 0.1796875, 0.1875, 0.1953125, 0.203125, 0.2109375, 0.21875, 0.2265625, 0.234375, 0.2421875, 0.25, 0.2578125, 0.265625, 0.2734375, 0.28125, 0.2890625, 0.296875, 0.3046875, 0.3125, 0.3203125, 0.328125, 0.3359375, 0.34375, 0.3515625, 0.359375, 0.3671875, 0.375, 0.3828125, 0.390625, 0.3984375, 0.40625, 0.4140625, 0.421875, 0.4296875, 0.4375, 0.4453125, 0.453125, 0.4609375, 0.46875, 0.4765625, 0.484375, 0.4921875, 0.5, 0.5078125, 0.515625, 0.5234375, 0.53125, 0.5390625, 0.546875, 0.5546875, 0.5625, 0.5703125, 0.578125, 0.5859375, 0.59375, 0.6015625, 0.609375, 0.6171875, 0.625, 0.6328125, 0.640625, 0.6484375, 0.65625, 0.6640625, 0.671875, 0.6796875, 0.6875, 0.6953125, 0.703125, 0.7109375, 0.71875, 0.7265625, 0.734375, 0.7421875, 0.75, 0.7578125, 0.765625, 0.7734375, 0.78125, 0.7890625, 0.796875, 0.8046875, 0.8125, 0.8203125, 0.828125, 0.8359375, 0.84375, 0.8515625, 0.859375, 0.8671875, 0.875, 0.8828125, 0.890625, 0.8984375, 0.90625, 0.9140625, 0.921875, 0.9296875, 0.9375, 0.9453125, 0.953125, 0.9609375, 0.96875, 0.9765625, 0.984375, 0.9921875, 1.0], "decode_rule": "dual_line_resample", "support_power": 1.0, "semantic_power": 1.0, "anchor_mode": "onehot", "cfg_scale": 0.0, "cfg_power": 1.0, "cfg_start": 0.0, "cfg_prior": "uniform", "decode_freq_penalty_alpha": 0.0, "decode_freq_penalty_beta": 0.0, "decode_freq_penalty_floor": 0.0, "decode_freq_penalty_start": 0.0, "decode_freq_penalty_end": 1.0, "decode_freq_penalty_power": 1.0, "start_t": 0.0, "start_init": "noise", "noise_init": "dirichlet", "noise_sigma": -1.0, "dirichlet_concentration": 1.0, "concentration_min": 1.0, "concentration_max": 1024.0, "target_prob": 1.0, "endpoint_temp": 1.45, "final_from": "state", "final_sample_mode": "argmax", "final_sample_temp": 1.0, "final_top_k": 64, "final_top_p": 0.95, "final_freq_penalty_alpha": 0.0, "final_freq_penalty_beta": 0.0, "final_freq_penalty_floor": 0.0, "lock_bos": false, "n_samples": 8, "seed": 20260520}, "raw_genppl": {"ppl": 24.558909049838665, "nll_per_token": 3.2010746824975107, "tokens": 2040, "kept_samples": 8, "total_samples": 8, "empty_rate": 0.0, "skipped_samples": 0}, "stripped_genppl": {"ppl": 24.95213814549325, "nll_per_token": 3.2169595157398896, "tokens": 2040, "kept_samples": 8, "total_samples": 8, "empty_rate": 0.0, "skipped_samples": 0}, "diversity": {"sample_entropy": 3.367480129322778, "unique_tokens": 750, "token_count": 8192, "distinct_1": 0.091552734375, "distinct_2": 0.34616324535679377, "top_token_mass": 0.0933837890625}}
|
| 42 |
+
[done] docs/lta_samples/metrics_20260520/owt_compact_v8192_ckpt_probe_flow_onehot_steps128_c1024_t1p45_n8_large/step142513_flow_onehot_steps128_c1024_t1p45.jsonl
|
LTA_openwebtext_dualt/logs/infer_owt_compact_v8192_step142k_flow_onehot_argmax_sweep_n8_large_20260520_202516.log
ADDED
|
@@ -0,0 +1,184 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[sweep] ckpt=runs/lta_owt_compact_gpt2bpe_v8192_stream1024_fullycoupled_rmsnorm_nobias_adamw_wd0p1_uniformt_hardce_mask0p1-1p0_fp32_ddit768x12_gbs512_8gpu_1m_20260519_201817/step_0142513.pt
|
| 2 |
+
[infer] c=1024 temps=1.45,1.60,1.80,2.00 out=docs/lta_samples/metrics_20260520/owt_compact_v8192_step142k_flow_onehot_argmax_sweep_n8_large/c1024_temps.jsonl
|
| 3 |
+
[ckpt] runs/lta_owt_compact_gpt2bpe_v8192_stream1024_fullycoupled_rmsnorm_nobias_adamw_wd0p1_uniformt_hardce_mask0p1-1p0_fp32_ddit768x12_gbs512_8gpu_1m_20260519_201817/step_0142513.pt step=142513
|
| 4 |
+
[decode-base] n=8 max_len=1024 steps=128 model_t=flow
|
| 5 |
+
[decode-time] schedule=linear s=[0.0,0.25] gumbel=(2.2,0.8) force_final=True t0=0.000000 t_mid=0.500000 t_end=1.000000 dt_mean=0.007812 dt_max=0.007812
|
| 6 |
+
[decode] temp=1.45 final=state rule=dual_line_resample support=1 semantic=1 anchor=onehot cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=argmax/1/k64/p0.95 freq_penalty=0/0/0 start_t=0 start_init=noise time_path=0.0000->1.0000 generated 1/8
|
| 7 |
+
[decode] temp=1.45 final=state rule=dual_line_resample support=1 semantic=1 anchor=onehot cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=argmax/1/k64/p0.95 freq_penalty=0/0/0 start_t=0 start_init=noise time_path=0.0000->1.0000 generated 2/8
|
| 8 |
+
[decode] temp=1.45 final=state rule=dual_line_resample support=1 semantic=1 anchor=onehot cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=argmax/1/k64/p0.95 freq_penalty=0/0/0 start_t=0 start_init=noise time_path=0.0000->1.0000 generated 3/8
|
| 9 |
+
[decode] temp=1.45 final=state rule=dual_line_resample support=1 semantic=1 anchor=onehot cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=argmax/1/k64/p0.95 freq_penalty=0/0/0 start_t=0 start_init=noise time_path=0.0000->1.0000 generated 4/8
|
| 10 |
+
[decode] temp=1.45 final=state rule=dual_line_resample support=1 semantic=1 anchor=onehot cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=argmax/1/k64/p0.95 freq_penalty=0/0/0 start_t=0 start_init=noise time_path=0.0000->1.0000 generated 5/8
|
| 11 |
+
[decode] temp=1.45 final=state rule=dual_line_resample support=1 semantic=1 anchor=onehot cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=argmax/1/k64/p0.95 freq_penalty=0/0/0 start_t=0 start_init=noise time_path=0.0000->1.0000 generated 6/8
|
| 12 |
+
[decode] temp=1.45 final=state rule=dual_line_resample support=1 semantic=1 anchor=onehot cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=argmax/1/k64/p0.95 freq_penalty=0/0/0 start_t=0 start_init=noise time_path=0.0000->1.0000 generated 7/8
|
| 13 |
+
[decode] temp=1.45 final=state rule=dual_line_resample support=1 semantic=1 anchor=onehot cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=argmax/1/k64/p0.95 freq_penalty=0/0/0 start_t=0 start_init=noise time_path=0.0000->1.0000 generated 8/8
|
| 14 |
+
[decode] temp=1.60 final=state rule=dual_line_resample support=1 semantic=1 anchor=onehot cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=argmax/1/k64/p0.95 freq_penalty=0/0/0 start_t=0 start_init=noise time_path=0.0000->1.0000 generated 1/8
|
| 15 |
+
[decode] temp=1.60 final=state rule=dual_line_resample support=1 semantic=1 anchor=onehot cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=argmax/1/k64/p0.95 freq_penalty=0/0/0 start_t=0 start_init=noise time_path=0.0000->1.0000 generated 2/8
|
| 16 |
+
[decode] temp=1.60 final=state rule=dual_line_resample support=1 semantic=1 anchor=onehot cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=argmax/1/k64/p0.95 freq_penalty=0/0/0 start_t=0 start_init=noise time_path=0.0000->1.0000 generated 3/8
|
| 17 |
+
[decode] temp=1.60 final=state rule=dual_line_resample support=1 semantic=1 anchor=onehot cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=argmax/1/k64/p0.95 freq_penalty=0/0/0 start_t=0 start_init=noise time_path=0.0000->1.0000 generated 4/8
|
| 18 |
+
[decode] temp=1.60 final=state rule=dual_line_resample support=1 semantic=1 anchor=onehot cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=argmax/1/k64/p0.95 freq_penalty=0/0/0 start_t=0 start_init=noise time_path=0.0000->1.0000 generated 5/8
|
| 19 |
+
[decode] temp=1.60 final=state rule=dual_line_resample support=1 semantic=1 anchor=onehot cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=argmax/1/k64/p0.95 freq_penalty=0/0/0 start_t=0 start_init=noise time_path=0.0000->1.0000 generated 6/8
|
| 20 |
+
[decode] temp=1.60 final=state rule=dual_line_resample support=1 semantic=1 anchor=onehot cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=argmax/1/k64/p0.95 freq_penalty=0/0/0 start_t=0 start_init=noise time_path=0.0000->1.0000 generated 7/8
|
| 21 |
+
[decode] temp=1.60 final=state rule=dual_line_resample support=1 semantic=1 anchor=onehot cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=argmax/1/k64/p0.95 freq_penalty=0/0/0 start_t=0 start_init=noise time_path=0.0000->1.0000 generated 8/8
|
| 22 |
+
[decode] temp=1.80 final=state rule=dual_line_resample support=1 semantic=1 anchor=onehot cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=argmax/1/k64/p0.95 freq_penalty=0/0/0 start_t=0 start_init=noise time_path=0.0000->1.0000 generated 1/8
|
| 23 |
+
[decode] temp=1.80 final=state rule=dual_line_resample support=1 semantic=1 anchor=onehot cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=argmax/1/k64/p0.95 freq_penalty=0/0/0 start_t=0 start_init=noise time_path=0.0000->1.0000 generated 2/8
|
| 24 |
+
[decode] temp=1.80 final=state rule=dual_line_resample support=1 semantic=1 anchor=onehot cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=argmax/1/k64/p0.95 freq_penalty=0/0/0 start_t=0 start_init=noise time_path=0.0000->1.0000 generated 3/8
|
| 25 |
+
[decode] temp=1.80 final=state rule=dual_line_resample support=1 semantic=1 anchor=onehot cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=argmax/1/k64/p0.95 freq_penalty=0/0/0 start_t=0 start_init=noise time_path=0.0000->1.0000 generated 4/8
|
| 26 |
+
[decode] temp=1.80 final=state rule=dual_line_resample support=1 semantic=1 anchor=onehot cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=argmax/1/k64/p0.95 freq_penalty=0/0/0 start_t=0 start_init=noise time_path=0.0000->1.0000 generated 5/8
|
| 27 |
+
[decode] temp=1.80 final=state rule=dual_line_resample support=1 semantic=1 anchor=onehot cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=argmax/1/k64/p0.95 freq_penalty=0/0/0 start_t=0 start_init=noise time_path=0.0000->1.0000 generated 6/8
|
| 28 |
+
[decode] temp=1.80 final=state rule=dual_line_resample support=1 semantic=1 anchor=onehot cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=argmax/1/k64/p0.95 freq_penalty=0/0/0 start_t=0 start_init=noise time_path=0.0000->1.0000 generated 7/8
|
| 29 |
+
[decode] temp=1.80 final=state rule=dual_line_resample support=1 semantic=1 anchor=onehot cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=argmax/1/k64/p0.95 freq_penalty=0/0/0 start_t=0 start_init=noise time_path=0.0000->1.0000 generated 8/8
|
| 30 |
+
[decode] temp=2.00 final=state rule=dual_line_resample support=1 semantic=1 anchor=onehot cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=argmax/1/k64/p0.95 freq_penalty=0/0/0 start_t=0 start_init=noise time_path=0.0000->1.0000 generated 1/8
|
| 31 |
+
[decode] temp=2.00 final=state rule=dual_line_resample support=1 semantic=1 anchor=onehot cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=argmax/1/k64/p0.95 freq_penalty=0/0/0 start_t=0 start_init=noise time_path=0.0000->1.0000 generated 2/8
|
| 32 |
+
[decode] temp=2.00 final=state rule=dual_line_resample support=1 semantic=1 anchor=onehot cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=argmax/1/k64/p0.95 freq_penalty=0/0/0 start_t=0 start_init=noise time_path=0.0000->1.0000 generated 3/8
|
| 33 |
+
[decode] temp=2.00 final=state rule=dual_line_resample support=1 semantic=1 anchor=onehot cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=argmax/1/k64/p0.95 freq_penalty=0/0/0 start_t=0 start_init=noise time_path=0.0000->1.0000 generated 4/8
|
| 34 |
+
[decode] temp=2.00 final=state rule=dual_line_resample support=1 semantic=1 anchor=onehot cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=argmax/1/k64/p0.95 freq_penalty=0/0/0 start_t=0 start_init=noise time_path=0.0000->1.0000 generated 5/8
|
| 35 |
+
[decode] temp=2.00 final=state rule=dual_line_resample support=1 semantic=1 anchor=onehot cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=argmax/1/k64/p0.95 freq_penalty=0/0/0 start_t=0 start_init=noise time_path=0.0000->1.0000 generated 6/8
|
| 36 |
+
[decode] temp=2.00 final=state rule=dual_line_resample support=1 semantic=1 anchor=onehot cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=argmax/1/k64/p0.95 freq_penalty=0/0/0 start_t=0 start_init=noise time_path=0.0000->1.0000 generated 7/8
|
| 37 |
+
[decode] temp=2.00 final=state rule=dual_line_resample support=1 semantic=1 anchor=onehot cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=argmax/1/k64/p0.95 freq_penalty=0/0/0 start_t=0 start_init=noise time_path=0.0000->1.0000 generated 8/8
|
| 38 |
+
[summary] {"type": "summary", "checkpoint": "runs/lta_owt_compact_gpt2bpe_v8192_stream1024_fullycoupled_rmsnorm_nobias_adamw_wd0p1_uniformt_hardce_mask0p1-1p0_fp32_ddit768x12_gbs512_8gpu_1m_20260519_201817/step_0142513.pt", "step": 142513, "decode": {"steps": 128, "model_t_mode": "flow", "decode_time_schedule": "linear", "decode_s_min_frac": 0.0, "decode_s_max_frac": 0.25, "decode_force_final_t": true, "decode_time_grid": [0.0, 0.0078125, 0.015625, 0.0234375, 0.03125, 0.0390625, 0.046875, 0.0546875, 0.0625, 0.0703125, 0.078125, 0.0859375, 0.09375, 0.1015625, 0.109375, 0.1171875, 0.125, 0.1328125, 0.140625, 0.1484375, 0.15625, 0.1640625, 0.171875, 0.1796875, 0.1875, 0.1953125, 0.203125, 0.2109375, 0.21875, 0.2265625, 0.234375, 0.2421875, 0.25, 0.2578125, 0.265625, 0.2734375, 0.28125, 0.2890625, 0.296875, 0.3046875, 0.3125, 0.3203125, 0.328125, 0.3359375, 0.34375, 0.3515625, 0.359375, 0.3671875, 0.375, 0.3828125, 0.390625, 0.3984375, 0.40625, 0.4140625, 0.421875, 0.4296875, 0.4375, 0.4453125, 0.453125, 0.4609375, 0.46875, 0.4765625, 0.484375, 0.4921875, 0.5, 0.5078125, 0.515625, 0.5234375, 0.53125, 0.5390625, 0.546875, 0.5546875, 0.5625, 0.5703125, 0.578125, 0.5859375, 0.59375, 0.6015625, 0.609375, 0.6171875, 0.625, 0.6328125, 0.640625, 0.6484375, 0.65625, 0.6640625, 0.671875, 0.6796875, 0.6875, 0.6953125, 0.703125, 0.7109375, 0.71875, 0.7265625, 0.734375, 0.7421875, 0.75, 0.7578125, 0.765625, 0.7734375, 0.78125, 0.7890625, 0.796875, 0.8046875, 0.8125, 0.8203125, 0.828125, 0.8359375, 0.84375, 0.8515625, 0.859375, 0.8671875, 0.875, 0.8828125, 0.890625, 0.8984375, 0.90625, 0.9140625, 0.921875, 0.9296875, 0.9375, 0.9453125, 0.953125, 0.9609375, 0.96875, 0.9765625, 0.984375, 0.9921875, 1.0], "decode_rule": "dual_line_resample", "support_power": 1.0, "semantic_power": 1.0, "anchor_mode": "onehot", "cfg_scale": 0.0, "cfg_power": 1.0, "cfg_start": 0.0, "cfg_prior": "uniform", "decode_freq_penalty_alpha": 0.0, "decode_freq_penalty_beta": 0.0, "decode_freq_penalty_floor": 0.0, "decode_freq_penalty_start": 0.0, "decode_freq_penalty_end": 1.0, "decode_freq_penalty_power": 1.0, "start_t": 0.0, "start_init": "noise", "noise_init": "dirichlet", "noise_sigma": -1.0, "dirichlet_concentration": 1.0, "concentration_min": 1.0, "concentration_max": 1024.0, "target_prob": 1.0, "endpoint_temp": 1.45, "final_from": "state", "final_sample_mode": "argmax", "final_sample_temp": 1.0, "final_top_k": 64, "final_top_p": 0.95, "final_freq_penalty_alpha": 0.0, "final_freq_penalty_beta": 0.0, "final_freq_penalty_floor": 0.0, "lock_bos": false, "n_samples": 8, "seed": 20260520}, "raw_genppl": {"ppl": 24.558909049838665, "nll_per_token": 3.2010746824975107, "tokens": 2040, "kept_samples": 8, "total_samples": 8, "empty_rate": 0.0, "skipped_samples": 0}, "stripped_genppl": {"ppl": 24.95213814549325, "nll_per_token": 3.2169595157398896, "tokens": 2040, "kept_samples": 8, "total_samples": 8, "empty_rate": 0.0, "skipped_samples": 0}, "diversity": {"sample_entropy": 3.367480129322778, "unique_tokens": 750, "token_count": 8192, "distinct_1": 0.091552734375, "distinct_2": 0.34616324535679377, "top_token_mass": 0.0933837890625}}
|
| 39 |
+
[summary] {"type": "summary", "checkpoint": "runs/lta_owt_compact_gpt2bpe_v8192_stream1024_fullycoupled_rmsnorm_nobias_adamw_wd0p1_uniformt_hardce_mask0p1-1p0_fp32_ddit768x12_gbs512_8gpu_1m_20260519_201817/step_0142513.pt", "step": 142513, "decode": {"steps": 128, "model_t_mode": "flow", "decode_time_schedule": "linear", "decode_s_min_frac": 0.0, "decode_s_max_frac": 0.25, "decode_force_final_t": true, "decode_time_grid": [0.0, 0.0078125, 0.015625, 0.0234375, 0.03125, 0.0390625, 0.046875, 0.0546875, 0.0625, 0.0703125, 0.078125, 0.0859375, 0.09375, 0.1015625, 0.109375, 0.1171875, 0.125, 0.1328125, 0.140625, 0.1484375, 0.15625, 0.1640625, 0.171875, 0.1796875, 0.1875, 0.1953125, 0.203125, 0.2109375, 0.21875, 0.2265625, 0.234375, 0.2421875, 0.25, 0.2578125, 0.265625, 0.2734375, 0.28125, 0.2890625, 0.296875, 0.3046875, 0.3125, 0.3203125, 0.328125, 0.3359375, 0.34375, 0.3515625, 0.359375, 0.3671875, 0.375, 0.3828125, 0.390625, 0.3984375, 0.40625, 0.4140625, 0.421875, 0.4296875, 0.4375, 0.4453125, 0.453125, 0.4609375, 0.46875, 0.4765625, 0.484375, 0.4921875, 0.5, 0.5078125, 0.515625, 0.5234375, 0.53125, 0.5390625, 0.546875, 0.5546875, 0.5625, 0.5703125, 0.578125, 0.5859375, 0.59375, 0.6015625, 0.609375, 0.6171875, 0.625, 0.6328125, 0.640625, 0.6484375, 0.65625, 0.6640625, 0.671875, 0.6796875, 0.6875, 0.6953125, 0.703125, 0.7109375, 0.71875, 0.7265625, 0.734375, 0.7421875, 0.75, 0.7578125, 0.765625, 0.7734375, 0.78125, 0.7890625, 0.796875, 0.8046875, 0.8125, 0.8203125, 0.828125, 0.8359375, 0.84375, 0.8515625, 0.859375, 0.8671875, 0.875, 0.8828125, 0.890625, 0.8984375, 0.90625, 0.9140625, 0.921875, 0.9296875, 0.9375, 0.9453125, 0.953125, 0.9609375, 0.96875, 0.9765625, 0.984375, 0.9921875, 1.0], "decode_rule": "dual_line_resample", "support_power": 1.0, "semantic_power": 1.0, "anchor_mode": "onehot", "cfg_scale": 0.0, "cfg_power": 1.0, "cfg_start": 0.0, "cfg_prior": "uniform", "decode_freq_penalty_alpha": 0.0, "decode_freq_penalty_beta": 0.0, "decode_freq_penalty_floor": 0.0, "decode_freq_penalty_start": 0.0, "decode_freq_penalty_end": 1.0, "decode_freq_penalty_power": 1.0, "start_t": 0.0, "start_init": "noise", "noise_init": "dirichlet", "noise_sigma": -1.0, "dirichlet_concentration": 1.0, "concentration_min": 1.0, "concentration_max": 1024.0, "target_prob": 1.0, "endpoint_temp": 1.6, "final_from": "state", "final_sample_mode": "argmax", "final_sample_temp": 1.0, "final_top_k": 64, "final_top_p": 0.95, "final_freq_penalty_alpha": 0.0, "final_freq_penalty_beta": 0.0, "final_freq_penalty_floor": 0.0, "lock_bos": false, "n_samples": 8, "seed": 20260520}, "raw_genppl": {"ppl": 61.90123501159806, "nll_per_token": 4.125540131213619, "tokens": 2040, "kept_samples": 8, "total_samples": 8, "empty_rate": 0.0, "skipped_samples": 0}, "stripped_genppl": {"ppl": 61.82879988490115, "nll_per_token": 4.124369273466223, "tokens": 2040, "kept_samples": 8, "total_samples": 8, "empty_rate": 0.0, "skipped_samples": 0}, "diversity": {"sample_entropy": 2.8654221065259846, "unique_tokens": 241, "token_count": 8192, "distinct_1": 0.0294189453125, "distinct_2": 0.2380254154447703, "top_token_mass": 0.1497802734375}}
|
| 40 |
+
[summary] {"type": "summary", "checkpoint": "runs/lta_owt_compact_gpt2bpe_v8192_stream1024_fullycoupled_rmsnorm_nobias_adamw_wd0p1_uniformt_hardce_mask0p1-1p0_fp32_ddit768x12_gbs512_8gpu_1m_20260519_201817/step_0142513.pt", "step": 142513, "decode": {"steps": 128, "model_t_mode": "flow", "decode_time_schedule": "linear", "decode_s_min_frac": 0.0, "decode_s_max_frac": 0.25, "decode_force_final_t": true, "decode_time_grid": [0.0, 0.0078125, 0.015625, 0.0234375, 0.03125, 0.0390625, 0.046875, 0.0546875, 0.0625, 0.0703125, 0.078125, 0.0859375, 0.09375, 0.1015625, 0.109375, 0.1171875, 0.125, 0.1328125, 0.140625, 0.1484375, 0.15625, 0.1640625, 0.171875, 0.1796875, 0.1875, 0.1953125, 0.203125, 0.2109375, 0.21875, 0.2265625, 0.234375, 0.2421875, 0.25, 0.2578125, 0.265625, 0.2734375, 0.28125, 0.2890625, 0.296875, 0.3046875, 0.3125, 0.3203125, 0.328125, 0.3359375, 0.34375, 0.3515625, 0.359375, 0.3671875, 0.375, 0.3828125, 0.390625, 0.3984375, 0.40625, 0.4140625, 0.421875, 0.4296875, 0.4375, 0.4453125, 0.453125, 0.4609375, 0.46875, 0.4765625, 0.484375, 0.4921875, 0.5, 0.5078125, 0.515625, 0.5234375, 0.53125, 0.5390625, 0.546875, 0.5546875, 0.5625, 0.5703125, 0.578125, 0.5859375, 0.59375, 0.6015625, 0.609375, 0.6171875, 0.625, 0.6328125, 0.640625, 0.6484375, 0.65625, 0.6640625, 0.671875, 0.6796875, 0.6875, 0.6953125, 0.703125, 0.7109375, 0.71875, 0.7265625, 0.734375, 0.7421875, 0.75, 0.7578125, 0.765625, 0.7734375, 0.78125, 0.7890625, 0.796875, 0.8046875, 0.8125, 0.8203125, 0.828125, 0.8359375, 0.84375, 0.8515625, 0.859375, 0.8671875, 0.875, 0.8828125, 0.890625, 0.8984375, 0.90625, 0.9140625, 0.921875, 0.9296875, 0.9375, 0.9453125, 0.953125, 0.9609375, 0.96875, 0.9765625, 0.984375, 0.9921875, 1.0], "decode_rule": "dual_line_resample", "support_power": 1.0, "semantic_power": 1.0, "anchor_mode": "onehot", "cfg_scale": 0.0, "cfg_power": 1.0, "cfg_start": 0.0, "cfg_prior": "uniform", "decode_freq_penalty_alpha": 0.0, "decode_freq_penalty_beta": 0.0, "decode_freq_penalty_floor": 0.0, "decode_freq_penalty_start": 0.0, "decode_freq_penalty_end": 1.0, "decode_freq_penalty_power": 1.0, "start_t": 0.0, "start_init": "noise", "noise_init": "dirichlet", "noise_sigma": -1.0, "dirichlet_concentration": 1.0, "concentration_min": 1.0, "concentration_max": 1024.0, "target_prob": 1.0, "endpoint_temp": 1.8, "final_from": "state", "final_sample_mode": "argmax", "final_sample_temp": 1.0, "final_top_k": 64, "final_top_p": 0.95, "final_freq_penalty_alpha": 0.0, "final_freq_penalty_beta": 0.0, "final_freq_penalty_floor": 0.0, "lock_bos": false, "n_samples": 8, "seed": 20260520}, "raw_genppl": {"ppl": 236.10354018277863, "nll_per_token": 5.46427043839997, "tokens": 2040, "kept_samples": 8, "total_samples": 8, "empty_rate": 0.0, "skipped_samples": 0}, "stripped_genppl": {"ppl": 236.38662406204426, "nll_per_token": 5.465468702129289, "tokens": 2040, "kept_samples": 8, "total_samples": 8, "empty_rate": 0.0, "skipped_samples": 0}, "diversity": {"sample_entropy": 3.426329846830298, "unique_tokens": 334, "token_count": 8192, "distinct_1": 0.040771484375, "distinct_2": 0.29056695992179865, "top_token_mass": 0.1021728515625}}
|
| 41 |
+
[summary] {"type": "summary", "checkpoint": "runs/lta_owt_compact_gpt2bpe_v8192_stream1024_fullycoupled_rmsnorm_nobias_adamw_wd0p1_uniformt_hardce_mask0p1-1p0_fp32_ddit768x12_gbs512_8gpu_1m_20260519_201817/step_0142513.pt", "step": 142513, "decode": {"steps": 128, "model_t_mode": "flow", "decode_time_schedule": "linear", "decode_s_min_frac": 0.0, "decode_s_max_frac": 0.25, "decode_force_final_t": true, "decode_time_grid": [0.0, 0.0078125, 0.015625, 0.0234375, 0.03125, 0.0390625, 0.046875, 0.0546875, 0.0625, 0.0703125, 0.078125, 0.0859375, 0.09375, 0.1015625, 0.109375, 0.1171875, 0.125, 0.1328125, 0.140625, 0.1484375, 0.15625, 0.1640625, 0.171875, 0.1796875, 0.1875, 0.1953125, 0.203125, 0.2109375, 0.21875, 0.2265625, 0.234375, 0.2421875, 0.25, 0.2578125, 0.265625, 0.2734375, 0.28125, 0.2890625, 0.296875, 0.3046875, 0.3125, 0.3203125, 0.328125, 0.3359375, 0.34375, 0.3515625, 0.359375, 0.3671875, 0.375, 0.3828125, 0.390625, 0.3984375, 0.40625, 0.4140625, 0.421875, 0.4296875, 0.4375, 0.4453125, 0.453125, 0.4609375, 0.46875, 0.4765625, 0.484375, 0.4921875, 0.5, 0.5078125, 0.515625, 0.5234375, 0.53125, 0.5390625, 0.546875, 0.5546875, 0.5625, 0.5703125, 0.578125, 0.5859375, 0.59375, 0.6015625, 0.609375, 0.6171875, 0.625, 0.6328125, 0.640625, 0.6484375, 0.65625, 0.6640625, 0.671875, 0.6796875, 0.6875, 0.6953125, 0.703125, 0.7109375, 0.71875, 0.7265625, 0.734375, 0.7421875, 0.75, 0.7578125, 0.765625, 0.7734375, 0.78125, 0.7890625, 0.796875, 0.8046875, 0.8125, 0.8203125, 0.828125, 0.8359375, 0.84375, 0.8515625, 0.859375, 0.8671875, 0.875, 0.8828125, 0.890625, 0.8984375, 0.90625, 0.9140625, 0.921875, 0.9296875, 0.9375, 0.9453125, 0.953125, 0.9609375, 0.96875, 0.9765625, 0.984375, 0.9921875, 1.0], "decode_rule": "dual_line_resample", "support_power": 1.0, "semantic_power": 1.0, "anchor_mode": "onehot", "cfg_scale": 0.0, "cfg_power": 1.0, "cfg_start": 0.0, "cfg_prior": "uniform", "decode_freq_penalty_alpha": 0.0, "decode_freq_penalty_beta": 0.0, "decode_freq_penalty_floor": 0.0, "decode_freq_penalty_start": 0.0, "decode_freq_penalty_end": 1.0, "decode_freq_penalty_power": 1.0, "start_t": 0.0, "start_init": "noise", "noise_init": "dirichlet", "noise_sigma": -1.0, "dirichlet_concentration": 1.0, "concentration_min": 1.0, "concentration_max": 1024.0, "target_prob": 1.0, "endpoint_temp": 2.0, "final_from": "state", "final_sample_mode": "argmax", "final_sample_temp": 1.0, "final_top_k": 64, "final_top_p": 0.95, "final_freq_penalty_alpha": 0.0, "final_freq_penalty_beta": 0.0, "final_freq_penalty_floor": 0.0, "lock_bos": false, "n_samples": 8, "seed": 20260520}, "raw_genppl": {"ppl": 797.2969082577021, "nll_per_token": 6.681227141735601, "tokens": 2040, "kept_samples": 8, "total_samples": 8, "empty_rate": 0.0, "skipped_samples": 0}, "stripped_genppl": {"ppl": 797.2969082577021, "nll_per_token": 6.681227141735601, "tokens": 2040, "kept_samples": 8, "total_samples": 8, "empty_rate": 0.0, "skipped_samples": 0}, "diversity": {"sample_entropy": 3.9980454244051566, "unique_tokens": 594, "token_count": 8192, "distinct_1": 0.072509765625, "distinct_2": 0.43169599217986315, "top_token_mass": 0.0872802734375}}
|
| 42 |
+
[done] docs/lta_samples/metrics_20260520/owt_compact_v8192_step142k_flow_onehot_argmax_sweep_n8_large/c1024_temps.jsonl
|
| 43 |
+
[infer] c=512 temps=1.45,1.60,1.80,2.00 out=docs/lta_samples/metrics_20260520/owt_compact_v8192_step142k_flow_onehot_argmax_sweep_n8_large/c512_temps.jsonl
|
| 44 |
+
[ckpt] runs/lta_owt_compact_gpt2bpe_v8192_stream1024_fullycoupled_rmsnorm_nobias_adamw_wd0p1_uniformt_hardce_mask0p1-1p0_fp32_ddit768x12_gbs512_8gpu_1m_20260519_201817/step_0142513.pt step=142513
|
| 45 |
+
[decode-base] n=8 max_len=1024 steps=128 model_t=flow
|
| 46 |
+
[decode-time] schedule=linear s=[0.0,0.25] gumbel=(2.2,0.8) force_final=True t0=0.000000 t_mid=0.500000 t_end=1.000000 dt_mean=0.007812 dt_max=0.007812
|
| 47 |
+
[decode] temp=1.45 final=state rule=dual_line_resample support=1 semantic=1 anchor=onehot cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=argmax/1/k64/p0.95 freq_penalty=0/0/0 start_t=0 start_init=noise time_path=0.0000->1.0000 generated 1/8
|
| 48 |
+
[decode] temp=1.45 final=state rule=dual_line_resample support=1 semantic=1 anchor=onehot cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=argmax/1/k64/p0.95 freq_penalty=0/0/0 start_t=0 start_init=noise time_path=0.0000->1.0000 generated 2/8
|
| 49 |
+
[decode] temp=1.45 final=state rule=dual_line_resample support=1 semantic=1 anchor=onehot cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=argmax/1/k64/p0.95 freq_penalty=0/0/0 start_t=0 start_init=noise time_path=0.0000->1.0000 generated 3/8
|
| 50 |
+
[decode] temp=1.45 final=state rule=dual_line_resample support=1 semantic=1 anchor=onehot cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=argmax/1/k64/p0.95 freq_penalty=0/0/0 start_t=0 start_init=noise time_path=0.0000->1.0000 generated 4/8
|
| 51 |
+
[decode] temp=1.45 final=state rule=dual_line_resample support=1 semantic=1 anchor=onehot cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=argmax/1/k64/p0.95 freq_penalty=0/0/0 start_t=0 start_init=noise time_path=0.0000->1.0000 generated 5/8
|
| 52 |
+
[decode] temp=1.45 final=state rule=dual_line_resample support=1 semantic=1 anchor=onehot cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=argmax/1/k64/p0.95 freq_penalty=0/0/0 start_t=0 start_init=noise time_path=0.0000->1.0000 generated 6/8
|
| 53 |
+
[decode] temp=1.45 final=state rule=dual_line_resample support=1 semantic=1 anchor=onehot cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=argmax/1/k64/p0.95 freq_penalty=0/0/0 start_t=0 start_init=noise time_path=0.0000->1.0000 generated 7/8
|
| 54 |
+
[decode] temp=1.45 final=state rule=dual_line_resample support=1 semantic=1 anchor=onehot cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=argmax/1/k64/p0.95 freq_penalty=0/0/0 start_t=0 start_init=noise time_path=0.0000->1.0000 generated 8/8
|
| 55 |
+
[decode] temp=1.60 final=state rule=dual_line_resample support=1 semantic=1 anchor=onehot cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=argmax/1/k64/p0.95 freq_penalty=0/0/0 start_t=0 start_init=noise time_path=0.0000->1.0000 generated 1/8
|
| 56 |
+
[decode] temp=1.60 final=state rule=dual_line_resample support=1 semantic=1 anchor=onehot cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=argmax/1/k64/p0.95 freq_penalty=0/0/0 start_t=0 start_init=noise time_path=0.0000->1.0000 generated 2/8
|
| 57 |
+
[decode] temp=1.60 final=state rule=dual_line_resample support=1 semantic=1 anchor=onehot cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=argmax/1/k64/p0.95 freq_penalty=0/0/0 start_t=0 start_init=noise time_path=0.0000->1.0000 generated 3/8
|
| 58 |
+
[decode] temp=1.60 final=state rule=dual_line_resample support=1 semantic=1 anchor=onehot cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=argmax/1/k64/p0.95 freq_penalty=0/0/0 start_t=0 start_init=noise time_path=0.0000->1.0000 generated 4/8
|
| 59 |
+
[decode] temp=1.60 final=state rule=dual_line_resample support=1 semantic=1 anchor=onehot cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=argmax/1/k64/p0.95 freq_penalty=0/0/0 start_t=0 start_init=noise time_path=0.0000->1.0000 generated 5/8
|
| 60 |
+
[decode] temp=1.60 final=state rule=dual_line_resample support=1 semantic=1 anchor=onehot cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=argmax/1/k64/p0.95 freq_penalty=0/0/0 start_t=0 start_init=noise time_path=0.0000->1.0000 generated 6/8
|
| 61 |
+
[decode] temp=1.60 final=state rule=dual_line_resample support=1 semantic=1 anchor=onehot cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=argmax/1/k64/p0.95 freq_penalty=0/0/0 start_t=0 start_init=noise time_path=0.0000->1.0000 generated 7/8
|
| 62 |
+
[decode] temp=1.60 final=state rule=dual_line_resample support=1 semantic=1 anchor=onehot cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=argmax/1/k64/p0.95 freq_penalty=0/0/0 start_t=0 start_init=noise time_path=0.0000->1.0000 generated 8/8
|
| 63 |
+
[decode] temp=1.80 final=state rule=dual_line_resample support=1 semantic=1 anchor=onehot cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=argmax/1/k64/p0.95 freq_penalty=0/0/0 start_t=0 start_init=noise time_path=0.0000->1.0000 generated 1/8
|
| 64 |
+
[decode] temp=1.80 final=state rule=dual_line_resample support=1 semantic=1 anchor=onehot cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=argmax/1/k64/p0.95 freq_penalty=0/0/0 start_t=0 start_init=noise time_path=0.0000->1.0000 generated 2/8
|
| 65 |
+
[decode] temp=1.80 final=state rule=dual_line_resample support=1 semantic=1 anchor=onehot cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=argmax/1/k64/p0.95 freq_penalty=0/0/0 start_t=0 start_init=noise time_path=0.0000->1.0000 generated 3/8
|
| 66 |
+
[decode] temp=1.80 final=state rule=dual_line_resample support=1 semantic=1 anchor=onehot cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=argmax/1/k64/p0.95 freq_penalty=0/0/0 start_t=0 start_init=noise time_path=0.0000->1.0000 generated 4/8
|
| 67 |
+
[decode] temp=1.80 final=state rule=dual_line_resample support=1 semantic=1 anchor=onehot cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=argmax/1/k64/p0.95 freq_penalty=0/0/0 start_t=0 start_init=noise time_path=0.0000->1.0000 generated 5/8
|
| 68 |
+
[decode] temp=1.80 final=state rule=dual_line_resample support=1 semantic=1 anchor=onehot cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=argmax/1/k64/p0.95 freq_penalty=0/0/0 start_t=0 start_init=noise time_path=0.0000->1.0000 generated 6/8
|
| 69 |
+
[decode] temp=1.80 final=state rule=dual_line_resample support=1 semantic=1 anchor=onehot cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=argmax/1/k64/p0.95 freq_penalty=0/0/0 start_t=0 start_init=noise time_path=0.0000->1.0000 generated 7/8
|
| 70 |
+
[decode] temp=1.80 final=state rule=dual_line_resample support=1 semantic=1 anchor=onehot cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=argmax/1/k64/p0.95 freq_penalty=0/0/0 start_t=0 start_init=noise time_path=0.0000->1.0000 generated 8/8
|
| 71 |
+
[decode] temp=2.00 final=state rule=dual_line_resample support=1 semantic=1 anchor=onehot cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=argmax/1/k64/p0.95 freq_penalty=0/0/0 start_t=0 start_init=noise time_path=0.0000->1.0000 generated 1/8
|
| 72 |
+
[decode] temp=2.00 final=state rule=dual_line_resample support=1 semantic=1 anchor=onehot cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=argmax/1/k64/p0.95 freq_penalty=0/0/0 start_t=0 start_init=noise time_path=0.0000->1.0000 generated 2/8
|
| 73 |
+
[decode] temp=2.00 final=state rule=dual_line_resample support=1 semantic=1 anchor=onehot cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=argmax/1/k64/p0.95 freq_penalty=0/0/0 start_t=0 start_init=noise time_path=0.0000->1.0000 generated 3/8
|
| 74 |
+
[decode] temp=2.00 final=state rule=dual_line_resample support=1 semantic=1 anchor=onehot cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=argmax/1/k64/p0.95 freq_penalty=0/0/0 start_t=0 start_init=noise time_path=0.0000->1.0000 generated 4/8
|
| 75 |
+
[decode] temp=2.00 final=state rule=dual_line_resample support=1 semantic=1 anchor=onehot cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=argmax/1/k64/p0.95 freq_penalty=0/0/0 start_t=0 start_init=noise time_path=0.0000->1.0000 generated 5/8
|
| 76 |
+
[decode] temp=2.00 final=state rule=dual_line_resample support=1 semantic=1 anchor=onehot cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=argmax/1/k64/p0.95 freq_penalty=0/0/0 start_t=0 start_init=noise time_path=0.0000->1.0000 generated 6/8
|
| 77 |
+
[decode] temp=2.00 final=state rule=dual_line_resample support=1 semantic=1 anchor=onehot cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=argmax/1/k64/p0.95 freq_penalty=0/0/0 start_t=0 start_init=noise time_path=0.0000->1.0000 generated 7/8
|
| 78 |
+
[decode] temp=2.00 final=state rule=dual_line_resample support=1 semantic=1 anchor=onehot cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=argmax/1/k64/p0.95 freq_penalty=0/0/0 start_t=0 start_init=noise time_path=0.0000->1.0000 generated 8/8
|
| 79 |
+
[summary] {"type": "summary", "checkpoint": "runs/lta_owt_compact_gpt2bpe_v8192_stream1024_fullycoupled_rmsnorm_nobias_adamw_wd0p1_uniformt_hardce_mask0p1-1p0_fp32_ddit768x12_gbs512_8gpu_1m_20260519_201817/step_0142513.pt", "step": 142513, "decode": {"steps": 128, "model_t_mode": "flow", "decode_time_schedule": "linear", "decode_s_min_frac": 0.0, "decode_s_max_frac": 0.25, "decode_force_final_t": true, "decode_time_grid": [0.0, 0.0078125, 0.015625, 0.0234375, 0.03125, 0.0390625, 0.046875, 0.0546875, 0.0625, 0.0703125, 0.078125, 0.0859375, 0.09375, 0.1015625, 0.109375, 0.1171875, 0.125, 0.1328125, 0.140625, 0.1484375, 0.15625, 0.1640625, 0.171875, 0.1796875, 0.1875, 0.1953125, 0.203125, 0.2109375, 0.21875, 0.2265625, 0.234375, 0.2421875, 0.25, 0.2578125, 0.265625, 0.2734375, 0.28125, 0.2890625, 0.296875, 0.3046875, 0.3125, 0.3203125, 0.328125, 0.3359375, 0.34375, 0.3515625, 0.359375, 0.3671875, 0.375, 0.3828125, 0.390625, 0.3984375, 0.40625, 0.4140625, 0.421875, 0.4296875, 0.4375, 0.4453125, 0.453125, 0.4609375, 0.46875, 0.4765625, 0.484375, 0.4921875, 0.5, 0.5078125, 0.515625, 0.5234375, 0.53125, 0.5390625, 0.546875, 0.5546875, 0.5625, 0.5703125, 0.578125, 0.5859375, 0.59375, 0.6015625, 0.609375, 0.6171875, 0.625, 0.6328125, 0.640625, 0.6484375, 0.65625, 0.6640625, 0.671875, 0.6796875, 0.6875, 0.6953125, 0.703125, 0.7109375, 0.71875, 0.7265625, 0.734375, 0.7421875, 0.75, 0.7578125, 0.765625, 0.7734375, 0.78125, 0.7890625, 0.796875, 0.8046875, 0.8125, 0.8203125, 0.828125, 0.8359375, 0.84375, 0.8515625, 0.859375, 0.8671875, 0.875, 0.8828125, 0.890625, 0.8984375, 0.90625, 0.9140625, 0.921875, 0.9296875, 0.9375, 0.9453125, 0.953125, 0.9609375, 0.96875, 0.9765625, 0.984375, 0.9921875, 1.0], "decode_rule": "dual_line_resample", "support_power": 1.0, "semantic_power": 1.0, "anchor_mode": "onehot", "cfg_scale": 0.0, "cfg_power": 1.0, "cfg_start": 0.0, "cfg_prior": "uniform", "decode_freq_penalty_alpha": 0.0, "decode_freq_penalty_beta": 0.0, "decode_freq_penalty_floor": 0.0, "decode_freq_penalty_start": 0.0, "decode_freq_penalty_end": 1.0, "decode_freq_penalty_power": 1.0, "start_t": 0.0, "start_init": "noise", "noise_init": "dirichlet", "noise_sigma": -1.0, "dirichlet_concentration": 1.0, "concentration_min": 1.0, "concentration_max": 512.0, "target_prob": 1.0, "endpoint_temp": 1.45, "final_from": "state", "final_sample_mode": "argmax", "final_sample_temp": 1.0, "final_top_k": 64, "final_top_p": 0.95, "final_freq_penalty_alpha": 0.0, "final_freq_penalty_beta": 0.0, "final_freq_penalty_floor": 0.0, "lock_bos": false, "n_samples": 8, "seed": 20260520}, "raw_genppl": {"ppl": 35.595751158665244, "nll_per_token": 3.572226281259574, "tokens": 2040, "kept_samples": 8, "total_samples": 8, "empty_rate": 0.0, "skipped_samples": 0}, "stripped_genppl": {"ppl": 36.84510754064624, "nll_per_token": 3.6067228429457723, "tokens": 2040, "kept_samples": 8, "total_samples": 8, "empty_rate": 0.0, "skipped_samples": 0}, "diversity": {"sample_entropy": 3.6511284538790205, "unique_tokens": 651, "token_count": 8192, "distinct_1": 0.0794677734375, "distinct_2": 0.35373900293255134, "top_token_mass": 0.039306640625}}
|
| 80 |
+
[summary] {"type": "summary", "checkpoint": "runs/lta_owt_compact_gpt2bpe_v8192_stream1024_fullycoupled_rmsnorm_nobias_adamw_wd0p1_uniformt_hardce_mask0p1-1p0_fp32_ddit768x12_gbs512_8gpu_1m_20260519_201817/step_0142513.pt", "step": 142513, "decode": {"steps": 128, "model_t_mode": "flow", "decode_time_schedule": "linear", "decode_s_min_frac": 0.0, "decode_s_max_frac": 0.25, "decode_force_final_t": true, "decode_time_grid": [0.0, 0.0078125, 0.015625, 0.0234375, 0.03125, 0.0390625, 0.046875, 0.0546875, 0.0625, 0.0703125, 0.078125, 0.0859375, 0.09375, 0.1015625, 0.109375, 0.1171875, 0.125, 0.1328125, 0.140625, 0.1484375, 0.15625, 0.1640625, 0.171875, 0.1796875, 0.1875, 0.1953125, 0.203125, 0.2109375, 0.21875, 0.2265625, 0.234375, 0.2421875, 0.25, 0.2578125, 0.265625, 0.2734375, 0.28125, 0.2890625, 0.296875, 0.3046875, 0.3125, 0.3203125, 0.328125, 0.3359375, 0.34375, 0.3515625, 0.359375, 0.3671875, 0.375, 0.3828125, 0.390625, 0.3984375, 0.40625, 0.4140625, 0.421875, 0.4296875, 0.4375, 0.4453125, 0.453125, 0.4609375, 0.46875, 0.4765625, 0.484375, 0.4921875, 0.5, 0.5078125, 0.515625, 0.5234375, 0.53125, 0.5390625, 0.546875, 0.5546875, 0.5625, 0.5703125, 0.578125, 0.5859375, 0.59375, 0.6015625, 0.609375, 0.6171875, 0.625, 0.6328125, 0.640625, 0.6484375, 0.65625, 0.6640625, 0.671875, 0.6796875, 0.6875, 0.6953125, 0.703125, 0.7109375, 0.71875, 0.7265625, 0.734375, 0.7421875, 0.75, 0.7578125, 0.765625, 0.7734375, 0.78125, 0.7890625, 0.796875, 0.8046875, 0.8125, 0.8203125, 0.828125, 0.8359375, 0.84375, 0.8515625, 0.859375, 0.8671875, 0.875, 0.8828125, 0.890625, 0.8984375, 0.90625, 0.9140625, 0.921875, 0.9296875, 0.9375, 0.9453125, 0.953125, 0.9609375, 0.96875, 0.9765625, 0.984375, 0.9921875, 1.0], "decode_rule": "dual_line_resample", "support_power": 1.0, "semantic_power": 1.0, "anchor_mode": "onehot", "cfg_scale": 0.0, "cfg_power": 1.0, "cfg_start": 0.0, "cfg_prior": "uniform", "decode_freq_penalty_alpha": 0.0, "decode_freq_penalty_beta": 0.0, "decode_freq_penalty_floor": 0.0, "decode_freq_penalty_start": 0.0, "decode_freq_penalty_end": 1.0, "decode_freq_penalty_power": 1.0, "start_t": 0.0, "start_init": "noise", "noise_init": "dirichlet", "noise_sigma": -1.0, "dirichlet_concentration": 1.0, "concentration_min": 1.0, "concentration_max": 512.0, "target_prob": 1.0, "endpoint_temp": 1.6, "final_from": "state", "final_sample_mode": "argmax", "final_sample_temp": 1.0, "final_top_k": 64, "final_top_p": 0.95, "final_freq_penalty_alpha": 0.0, "final_freq_penalty_beta": 0.0, "final_freq_penalty_floor": 0.0, "lock_bos": false, "n_samples": 8, "seed": 20260520}, "raw_genppl": {"ppl": 125.87374709240041, "nll_per_token": 4.835279397403492, "tokens": 2040, "kept_samples": 8, "total_samples": 8, "empty_rate": 0.0, "skipped_samples": 0}, "stripped_genppl": {"ppl": 126.45687922544968, "nll_per_token": 4.8399013743681065, "tokens": 2040, "kept_samples": 8, "total_samples": 8, "empty_rate": 0.0, "skipped_samples": 0}, "diversity": {"sample_entropy": 3.827819755664881, "unique_tokens": 429, "token_count": 8192, "distinct_1": 0.0523681640625, "distinct_2": 0.34787390029325516, "top_token_mass": 0.0482177734375}}
|
| 81 |
+
[summary] {"type": "summary", "checkpoint": "runs/lta_owt_compact_gpt2bpe_v8192_stream1024_fullycoupled_rmsnorm_nobias_adamw_wd0p1_uniformt_hardce_mask0p1-1p0_fp32_ddit768x12_gbs512_8gpu_1m_20260519_201817/step_0142513.pt", "step": 142513, "decode": {"steps": 128, "model_t_mode": "flow", "decode_time_schedule": "linear", "decode_s_min_frac": 0.0, "decode_s_max_frac": 0.25, "decode_force_final_t": true, "decode_time_grid": [0.0, 0.0078125, 0.015625, 0.0234375, 0.03125, 0.0390625, 0.046875, 0.0546875, 0.0625, 0.0703125, 0.078125, 0.0859375, 0.09375, 0.1015625, 0.109375, 0.1171875, 0.125, 0.1328125, 0.140625, 0.1484375, 0.15625, 0.1640625, 0.171875, 0.1796875, 0.1875, 0.1953125, 0.203125, 0.2109375, 0.21875, 0.2265625, 0.234375, 0.2421875, 0.25, 0.2578125, 0.265625, 0.2734375, 0.28125, 0.2890625, 0.296875, 0.3046875, 0.3125, 0.3203125, 0.328125, 0.3359375, 0.34375, 0.3515625, 0.359375, 0.3671875, 0.375, 0.3828125, 0.390625, 0.3984375, 0.40625, 0.4140625, 0.421875, 0.4296875, 0.4375, 0.4453125, 0.453125, 0.4609375, 0.46875, 0.4765625, 0.484375, 0.4921875, 0.5, 0.5078125, 0.515625, 0.5234375, 0.53125, 0.5390625, 0.546875, 0.5546875, 0.5625, 0.5703125, 0.578125, 0.5859375, 0.59375, 0.6015625, 0.609375, 0.6171875, 0.625, 0.6328125, 0.640625, 0.6484375, 0.65625, 0.6640625, 0.671875, 0.6796875, 0.6875, 0.6953125, 0.703125, 0.7109375, 0.71875, 0.7265625, 0.734375, 0.7421875, 0.75, 0.7578125, 0.765625, 0.7734375, 0.78125, 0.7890625, 0.796875, 0.8046875, 0.8125, 0.8203125, 0.828125, 0.8359375, 0.84375, 0.8515625, 0.859375, 0.8671875, 0.875, 0.8828125, 0.890625, 0.8984375, 0.90625, 0.9140625, 0.921875, 0.9296875, 0.9375, 0.9453125, 0.953125, 0.9609375, 0.96875, 0.9765625, 0.984375, 0.9921875, 1.0], "decode_rule": "dual_line_resample", "support_power": 1.0, "semantic_power": 1.0, "anchor_mode": "onehot", "cfg_scale": 0.0, "cfg_power": 1.0, "cfg_start": 0.0, "cfg_prior": "uniform", "decode_freq_penalty_alpha": 0.0, "decode_freq_penalty_beta": 0.0, "decode_freq_penalty_floor": 0.0, "decode_freq_penalty_start": 0.0, "decode_freq_penalty_end": 1.0, "decode_freq_penalty_power": 1.0, "start_t": 0.0, "start_init": "noise", "noise_init": "dirichlet", "noise_sigma": -1.0, "dirichlet_concentration": 1.0, "concentration_min": 1.0, "concentration_max": 512.0, "target_prob": 1.0, "endpoint_temp": 1.8, "final_from": "state", "final_sample_mode": "argmax", "final_sample_temp": 1.0, "final_top_k": 64, "final_top_p": 0.95, "final_freq_penalty_alpha": 0.0, "final_freq_penalty_beta": 0.0, "final_freq_penalty_floor": 0.0, "lock_bos": false, "n_samples": 8, "seed": 20260520}, "raw_genppl": {"ppl": 494.19662154369934, "nll_per_token": 6.2029334573184745, "tokens": 2040, "kept_samples": 8, "total_samples": 8, "empty_rate": 0.0, "skipped_samples": 0}, "stripped_genppl": {"ppl": 495.8881854021738, "nll_per_token": 6.206350468654259, "tokens": 2040, "kept_samples": 8, "total_samples": 8, "empty_rate": 0.0, "skipped_samples": 0}, "diversity": {"sample_entropy": 4.102994734668988, "unique_tokens": 806, "token_count": 8192, "distinct_1": 0.098388671875, "distinct_2": 0.5232160312805474, "top_token_mass": 0.0655517578125}}
|
| 82 |
+
[summary] {"type": "summary", "checkpoint": "runs/lta_owt_compact_gpt2bpe_v8192_stream1024_fullycoupled_rmsnorm_nobias_adamw_wd0p1_uniformt_hardce_mask0p1-1p0_fp32_ddit768x12_gbs512_8gpu_1m_20260519_201817/step_0142513.pt", "step": 142513, "decode": {"steps": 128, "model_t_mode": "flow", "decode_time_schedule": "linear", "decode_s_min_frac": 0.0, "decode_s_max_frac": 0.25, "decode_force_final_t": true, "decode_time_grid": [0.0, 0.0078125, 0.015625, 0.0234375, 0.03125, 0.0390625, 0.046875, 0.0546875, 0.0625, 0.0703125, 0.078125, 0.0859375, 0.09375, 0.1015625, 0.109375, 0.1171875, 0.125, 0.1328125, 0.140625, 0.1484375, 0.15625, 0.1640625, 0.171875, 0.1796875, 0.1875, 0.1953125, 0.203125, 0.2109375, 0.21875, 0.2265625, 0.234375, 0.2421875, 0.25, 0.2578125, 0.265625, 0.2734375, 0.28125, 0.2890625, 0.296875, 0.3046875, 0.3125, 0.3203125, 0.328125, 0.3359375, 0.34375, 0.3515625, 0.359375, 0.3671875, 0.375, 0.3828125, 0.390625, 0.3984375, 0.40625, 0.4140625, 0.421875, 0.4296875, 0.4375, 0.4453125, 0.453125, 0.4609375, 0.46875, 0.4765625, 0.484375, 0.4921875, 0.5, 0.5078125, 0.515625, 0.5234375, 0.53125, 0.5390625, 0.546875, 0.5546875, 0.5625, 0.5703125, 0.578125, 0.5859375, 0.59375, 0.6015625, 0.609375, 0.6171875, 0.625, 0.6328125, 0.640625, 0.6484375, 0.65625, 0.6640625, 0.671875, 0.6796875, 0.6875, 0.6953125, 0.703125, 0.7109375, 0.71875, 0.7265625, 0.734375, 0.7421875, 0.75, 0.7578125, 0.765625, 0.7734375, 0.78125, 0.7890625, 0.796875, 0.8046875, 0.8125, 0.8203125, 0.828125, 0.8359375, 0.84375, 0.8515625, 0.859375, 0.8671875, 0.875, 0.8828125, 0.890625, 0.8984375, 0.90625, 0.9140625, 0.921875, 0.9296875, 0.9375, 0.9453125, 0.953125, 0.9609375, 0.96875, 0.9765625, 0.984375, 0.9921875, 1.0], "decode_rule": "dual_line_resample", "support_power": 1.0, "semantic_power": 1.0, "anchor_mode": "onehot", "cfg_scale": 0.0, "cfg_power": 1.0, "cfg_start": 0.0, "cfg_prior": "uniform", "decode_freq_penalty_alpha": 0.0, "decode_freq_penalty_beta": 0.0, "decode_freq_penalty_floor": 0.0, "decode_freq_penalty_start": 0.0, "decode_freq_penalty_end": 1.0, "decode_freq_penalty_power": 1.0, "start_t": 0.0, "start_init": "noise", "noise_init": "dirichlet", "noise_sigma": -1.0, "dirichlet_concentration": 1.0, "concentration_min": 1.0, "concentration_max": 512.0, "target_prob": 1.0, "endpoint_temp": 2.0, "final_from": "state", "final_sample_mode": "argmax", "final_sample_temp": 1.0, "final_top_k": 64, "final_top_p": 0.95, "final_freq_penalty_alpha": 0.0, "final_freq_penalty_beta": 0.0, "final_freq_penalty_floor": 0.0, "lock_bos": false, "n_samples": 8, "seed": 20260520}, "raw_genppl": {"ppl": 2772.9985691507204, "nll_per_token": 7.927684529622396, "tokens": 2040, "kept_samples": 8, "total_samples": 8, "empty_rate": 0.0, "skipped_samples": 0}, "stripped_genppl": {"ppl": 2757.659277476166, "nll_per_token": 7.922137511010264, "tokens": 2040, "kept_samples": 8, "total_samples": 8, "empty_rate": 0.0, "skipped_samples": 0}, "diversity": {"sample_entropy": 5.166956872628071, "unique_tokens": 1542, "token_count": 8192, "distinct_1": 0.188232421875, "distinct_2": 0.7913000977517106, "top_token_mass": 0.0462646484375}}
|
| 83 |
+
[done] docs/lta_samples/metrics_20260520/owt_compact_v8192_step142k_flow_onehot_argmax_sweep_n8_large/c512_temps.jsonl
|
| 84 |
+
[infer] c=256 temps=1.45,1.60,1.80,2.00 out=docs/lta_samples/metrics_20260520/owt_compact_v8192_step142k_flow_onehot_argmax_sweep_n8_large/c256_temps.jsonl
|
| 85 |
+
[ckpt] runs/lta_owt_compact_gpt2bpe_v8192_stream1024_fullycoupled_rmsnorm_nobias_adamw_wd0p1_uniformt_hardce_mask0p1-1p0_fp32_ddit768x12_gbs512_8gpu_1m_20260519_201817/step_0142513.pt step=142513
|
| 86 |
+
[decode-base] n=8 max_len=1024 steps=128 model_t=flow
|
| 87 |
+
[decode-time] schedule=linear s=[0.0,0.25] gumbel=(2.2,0.8) force_final=True t0=0.000000 t_mid=0.500000 t_end=1.000000 dt_mean=0.007812 dt_max=0.007812
|
| 88 |
+
[decode] temp=1.45 final=state rule=dual_line_resample support=1 semantic=1 anchor=onehot cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=argmax/1/k64/p0.95 freq_penalty=0/0/0 start_t=0 start_init=noise time_path=0.0000->1.0000 generated 1/8
|
| 89 |
+
[decode] temp=1.45 final=state rule=dual_line_resample support=1 semantic=1 anchor=onehot cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=argmax/1/k64/p0.95 freq_penalty=0/0/0 start_t=0 start_init=noise time_path=0.0000->1.0000 generated 2/8
|
| 90 |
+
[decode] temp=1.45 final=state rule=dual_line_resample support=1 semantic=1 anchor=onehot cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=argmax/1/k64/p0.95 freq_penalty=0/0/0 start_t=0 start_init=noise time_path=0.0000->1.0000 generated 3/8
|
| 91 |
+
[decode] temp=1.45 final=state rule=dual_line_resample support=1 semantic=1 anchor=onehot cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=argmax/1/k64/p0.95 freq_penalty=0/0/0 start_t=0 start_init=noise time_path=0.0000->1.0000 generated 4/8
|
| 92 |
+
[decode] temp=1.45 final=state rule=dual_line_resample support=1 semantic=1 anchor=onehot cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=argmax/1/k64/p0.95 freq_penalty=0/0/0 start_t=0 start_init=noise time_path=0.0000->1.0000 generated 5/8
|
| 93 |
+
[decode] temp=1.45 final=state rule=dual_line_resample support=1 semantic=1 anchor=onehot cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=argmax/1/k64/p0.95 freq_penalty=0/0/0 start_t=0 start_init=noise time_path=0.0000->1.0000 generated 6/8
|
| 94 |
+
[decode] temp=1.45 final=state rule=dual_line_resample support=1 semantic=1 anchor=onehot cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=argmax/1/k64/p0.95 freq_penalty=0/0/0 start_t=0 start_init=noise time_path=0.0000->1.0000 generated 7/8
|
| 95 |
+
[decode] temp=1.45 final=state rule=dual_line_resample support=1 semantic=1 anchor=onehot cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=argmax/1/k64/p0.95 freq_penalty=0/0/0 start_t=0 start_init=noise time_path=0.0000->1.0000 generated 8/8
|
| 96 |
+
[decode] temp=1.60 final=state rule=dual_line_resample support=1 semantic=1 anchor=onehot cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=argmax/1/k64/p0.95 freq_penalty=0/0/0 start_t=0 start_init=noise time_path=0.0000->1.0000 generated 1/8
|
| 97 |
+
[decode] temp=1.60 final=state rule=dual_line_resample support=1 semantic=1 anchor=onehot cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=argmax/1/k64/p0.95 freq_penalty=0/0/0 start_t=0 start_init=noise time_path=0.0000->1.0000 generated 2/8
|
| 98 |
+
[decode] temp=1.60 final=state rule=dual_line_resample support=1 semantic=1 anchor=onehot cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=argmax/1/k64/p0.95 freq_penalty=0/0/0 start_t=0 start_init=noise time_path=0.0000->1.0000 generated 3/8
|
| 99 |
+
[decode] temp=1.60 final=state rule=dual_line_resample support=1 semantic=1 anchor=onehot cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=argmax/1/k64/p0.95 freq_penalty=0/0/0 start_t=0 start_init=noise time_path=0.0000->1.0000 generated 4/8
|
| 100 |
+
[decode] temp=1.60 final=state rule=dual_line_resample support=1 semantic=1 anchor=onehot cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=argmax/1/k64/p0.95 freq_penalty=0/0/0 start_t=0 start_init=noise time_path=0.0000->1.0000 generated 5/8
|
| 101 |
+
[decode] temp=1.60 final=state rule=dual_line_resample support=1 semantic=1 anchor=onehot cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=argmax/1/k64/p0.95 freq_penalty=0/0/0 start_t=0 start_init=noise time_path=0.0000->1.0000 generated 6/8
|
| 102 |
+
[decode] temp=1.60 final=state rule=dual_line_resample support=1 semantic=1 anchor=onehot cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=argmax/1/k64/p0.95 freq_penalty=0/0/0 start_t=0 start_init=noise time_path=0.0000->1.0000 generated 7/8
|
| 103 |
+
[decode] temp=1.60 final=state rule=dual_line_resample support=1 semantic=1 anchor=onehot cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=argmax/1/k64/p0.95 freq_penalty=0/0/0 start_t=0 start_init=noise time_path=0.0000->1.0000 generated 8/8
|
| 104 |
+
[decode] temp=1.80 final=state rule=dual_line_resample support=1 semantic=1 anchor=onehot cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=argmax/1/k64/p0.95 freq_penalty=0/0/0 start_t=0 start_init=noise time_path=0.0000->1.0000 generated 1/8
|
| 105 |
+
[decode] temp=1.80 final=state rule=dual_line_resample support=1 semantic=1 anchor=onehot cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=argmax/1/k64/p0.95 freq_penalty=0/0/0 start_t=0 start_init=noise time_path=0.0000->1.0000 generated 2/8
|
| 106 |
+
[decode] temp=1.80 final=state rule=dual_line_resample support=1 semantic=1 anchor=onehot cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=argmax/1/k64/p0.95 freq_penalty=0/0/0 start_t=0 start_init=noise time_path=0.0000->1.0000 generated 3/8
|
| 107 |
+
[decode] temp=1.80 final=state rule=dual_line_resample support=1 semantic=1 anchor=onehot cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=argmax/1/k64/p0.95 freq_penalty=0/0/0 start_t=0 start_init=noise time_path=0.0000->1.0000 generated 4/8
|
| 108 |
+
[decode] temp=1.80 final=state rule=dual_line_resample support=1 semantic=1 anchor=onehot cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=argmax/1/k64/p0.95 freq_penalty=0/0/0 start_t=0 start_init=noise time_path=0.0000->1.0000 generated 5/8
|
| 109 |
+
[decode] temp=1.80 final=state rule=dual_line_resample support=1 semantic=1 anchor=onehot cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=argmax/1/k64/p0.95 freq_penalty=0/0/0 start_t=0 start_init=noise time_path=0.0000->1.0000 generated 6/8
|
| 110 |
+
[decode] temp=1.80 final=state rule=dual_line_resample support=1 semantic=1 anchor=onehot cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=argmax/1/k64/p0.95 freq_penalty=0/0/0 start_t=0 start_init=noise time_path=0.0000->1.0000 generated 7/8
|
| 111 |
+
[decode] temp=1.80 final=state rule=dual_line_resample support=1 semantic=1 anchor=onehot cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=argmax/1/k64/p0.95 freq_penalty=0/0/0 start_t=0 start_init=noise time_path=0.0000->1.0000 generated 8/8
|
| 112 |
+
[decode] temp=2.00 final=state rule=dual_line_resample support=1 semantic=1 anchor=onehot cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=argmax/1/k64/p0.95 freq_penalty=0/0/0 start_t=0 start_init=noise time_path=0.0000->1.0000 generated 1/8
|
| 113 |
+
[decode] temp=2.00 final=state rule=dual_line_resample support=1 semantic=1 anchor=onehot cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=argmax/1/k64/p0.95 freq_penalty=0/0/0 start_t=0 start_init=noise time_path=0.0000->1.0000 generated 2/8
|
| 114 |
+
[decode] temp=2.00 final=state rule=dual_line_resample support=1 semantic=1 anchor=onehot cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=argmax/1/k64/p0.95 freq_penalty=0/0/0 start_t=0 start_init=noise time_path=0.0000->1.0000 generated 3/8
|
| 115 |
+
[decode] temp=2.00 final=state rule=dual_line_resample support=1 semantic=1 anchor=onehot cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=argmax/1/k64/p0.95 freq_penalty=0/0/0 start_t=0 start_init=noise time_path=0.0000->1.0000 generated 4/8
|
| 116 |
+
[decode] temp=2.00 final=state rule=dual_line_resample support=1 semantic=1 anchor=onehot cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=argmax/1/k64/p0.95 freq_penalty=0/0/0 start_t=0 start_init=noise time_path=0.0000->1.0000 generated 5/8
|
| 117 |
+
[decode] temp=2.00 final=state rule=dual_line_resample support=1 semantic=1 anchor=onehot cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=argmax/1/k64/p0.95 freq_penalty=0/0/0 start_t=0 start_init=noise time_path=0.0000->1.0000 generated 6/8
|
| 118 |
+
[decode] temp=2.00 final=state rule=dual_line_resample support=1 semantic=1 anchor=onehot cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=argmax/1/k64/p0.95 freq_penalty=0/0/0 start_t=0 start_init=noise time_path=0.0000->1.0000 generated 7/8
|
| 119 |
+
[decode] temp=2.00 final=state rule=dual_line_resample support=1 semantic=1 anchor=onehot cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=argmax/1/k64/p0.95 freq_penalty=0/0/0 start_t=0 start_init=noise time_path=0.0000->1.0000 generated 8/8
|
| 120 |
+
[summary] {"type": "summary", "checkpoint": "runs/lta_owt_compact_gpt2bpe_v8192_stream1024_fullycoupled_rmsnorm_nobias_adamw_wd0p1_uniformt_hardce_mask0p1-1p0_fp32_ddit768x12_gbs512_8gpu_1m_20260519_201817/step_0142513.pt", "step": 142513, "decode": {"steps": 128, "model_t_mode": "flow", "decode_time_schedule": "linear", "decode_s_min_frac": 0.0, "decode_s_max_frac": 0.25, "decode_force_final_t": true, "decode_time_grid": [0.0, 0.0078125, 0.015625, 0.0234375, 0.03125, 0.0390625, 0.046875, 0.0546875, 0.0625, 0.0703125, 0.078125, 0.0859375, 0.09375, 0.1015625, 0.109375, 0.1171875, 0.125, 0.1328125, 0.140625, 0.1484375, 0.15625, 0.1640625, 0.171875, 0.1796875, 0.1875, 0.1953125, 0.203125, 0.2109375, 0.21875, 0.2265625, 0.234375, 0.2421875, 0.25, 0.2578125, 0.265625, 0.2734375, 0.28125, 0.2890625, 0.296875, 0.3046875, 0.3125, 0.3203125, 0.328125, 0.3359375, 0.34375, 0.3515625, 0.359375, 0.3671875, 0.375, 0.3828125, 0.390625, 0.3984375, 0.40625, 0.4140625, 0.421875, 0.4296875, 0.4375, 0.4453125, 0.453125, 0.4609375, 0.46875, 0.4765625, 0.484375, 0.4921875, 0.5, 0.5078125, 0.515625, 0.5234375, 0.53125, 0.5390625, 0.546875, 0.5546875, 0.5625, 0.5703125, 0.578125, 0.5859375, 0.59375, 0.6015625, 0.609375, 0.6171875, 0.625, 0.6328125, 0.640625, 0.6484375, 0.65625, 0.6640625, 0.671875, 0.6796875, 0.6875, 0.6953125, 0.703125, 0.7109375, 0.71875, 0.7265625, 0.734375, 0.7421875, 0.75, 0.7578125, 0.765625, 0.7734375, 0.78125, 0.7890625, 0.796875, 0.8046875, 0.8125, 0.8203125, 0.828125, 0.8359375, 0.84375, 0.8515625, 0.859375, 0.8671875, 0.875, 0.8828125, 0.890625, 0.8984375, 0.90625, 0.9140625, 0.921875, 0.9296875, 0.9375, 0.9453125, 0.953125, 0.9609375, 0.96875, 0.9765625, 0.984375, 0.9921875, 1.0], "decode_rule": "dual_line_resample", "support_power": 1.0, "semantic_power": 1.0, "anchor_mode": "onehot", "cfg_scale": 0.0, "cfg_power": 1.0, "cfg_start": 0.0, "cfg_prior": "uniform", "decode_freq_penalty_alpha": 0.0, "decode_freq_penalty_beta": 0.0, "decode_freq_penalty_floor": 0.0, "decode_freq_penalty_start": 0.0, "decode_freq_penalty_end": 1.0, "decode_freq_penalty_power": 1.0, "start_t": 0.0, "start_init": "noise", "noise_init": "dirichlet", "noise_sigma": -1.0, "dirichlet_concentration": 1.0, "concentration_min": 1.0, "concentration_max": 256.0, "target_prob": 1.0, "endpoint_temp": 1.45, "final_from": "state", "final_sample_mode": "argmax", "final_sample_temp": 1.0, "final_top_k": 64, "final_top_p": 0.95, "final_freq_penalty_alpha": 0.0, "final_freq_penalty_beta": 0.0, "final_freq_penalty_floor": 0.0, "lock_bos": false, "n_samples": 8, "seed": 20260520}, "raw_genppl": {"ppl": 93.24024737790586, "nll_per_token": 4.53517946729473, "tokens": 2040, "kept_samples": 8, "total_samples": 8, "empty_rate": 0.0, "skipped_samples": 0}, "stripped_genppl": {"ppl": 99.5832169983887, "nll_per_token": 4.600993646359911, "tokens": 2040, "kept_samples": 8, "total_samples": 8, "empty_rate": 0.0, "skipped_samples": 0}, "diversity": {"sample_entropy": 4.271051344525033, "unique_tokens": 634, "token_count": 8192, "distinct_1": 0.077392578125, "distinct_2": 0.5084310850439883, "top_token_mass": 0.04736328125}}
|
| 121 |
+
[summary] {"type": "summary", "checkpoint": "runs/lta_owt_compact_gpt2bpe_v8192_stream1024_fullycoupled_rmsnorm_nobias_adamw_wd0p1_uniformt_hardce_mask0p1-1p0_fp32_ddit768x12_gbs512_8gpu_1m_20260519_201817/step_0142513.pt", "step": 142513, "decode": {"steps": 128, "model_t_mode": "flow", "decode_time_schedule": "linear", "decode_s_min_frac": 0.0, "decode_s_max_frac": 0.25, "decode_force_final_t": true, "decode_time_grid": [0.0, 0.0078125, 0.015625, 0.0234375, 0.03125, 0.0390625, 0.046875, 0.0546875, 0.0625, 0.0703125, 0.078125, 0.0859375, 0.09375, 0.1015625, 0.109375, 0.1171875, 0.125, 0.1328125, 0.140625, 0.1484375, 0.15625, 0.1640625, 0.171875, 0.1796875, 0.1875, 0.1953125, 0.203125, 0.2109375, 0.21875, 0.2265625, 0.234375, 0.2421875, 0.25, 0.2578125, 0.265625, 0.2734375, 0.28125, 0.2890625, 0.296875, 0.3046875, 0.3125, 0.3203125, 0.328125, 0.3359375, 0.34375, 0.3515625, 0.359375, 0.3671875, 0.375, 0.3828125, 0.390625, 0.3984375, 0.40625, 0.4140625, 0.421875, 0.4296875, 0.4375, 0.4453125, 0.453125, 0.4609375, 0.46875, 0.4765625, 0.484375, 0.4921875, 0.5, 0.5078125, 0.515625, 0.5234375, 0.53125, 0.5390625, 0.546875, 0.5546875, 0.5625, 0.5703125, 0.578125, 0.5859375, 0.59375, 0.6015625, 0.609375, 0.6171875, 0.625, 0.6328125, 0.640625, 0.6484375, 0.65625, 0.6640625, 0.671875, 0.6796875, 0.6875, 0.6953125, 0.703125, 0.7109375, 0.71875, 0.7265625, 0.734375, 0.7421875, 0.75, 0.7578125, 0.765625, 0.7734375, 0.78125, 0.7890625, 0.796875, 0.8046875, 0.8125, 0.8203125, 0.828125, 0.8359375, 0.84375, 0.8515625, 0.859375, 0.8671875, 0.875, 0.8828125, 0.890625, 0.8984375, 0.90625, 0.9140625, 0.921875, 0.9296875, 0.9375, 0.9453125, 0.953125, 0.9609375, 0.96875, 0.9765625, 0.984375, 0.9921875, 1.0], "decode_rule": "dual_line_resample", "support_power": 1.0, "semantic_power": 1.0, "anchor_mode": "onehot", "cfg_scale": 0.0, "cfg_power": 1.0, "cfg_start": 0.0, "cfg_prior": "uniform", "decode_freq_penalty_alpha": 0.0, "decode_freq_penalty_beta": 0.0, "decode_freq_penalty_floor": 0.0, "decode_freq_penalty_start": 0.0, "decode_freq_penalty_end": 1.0, "decode_freq_penalty_power": 1.0, "start_t": 0.0, "start_init": "noise", "noise_init": "dirichlet", "noise_sigma": -1.0, "dirichlet_concentration": 1.0, "concentration_min": 1.0, "concentration_max": 256.0, "target_prob": 1.0, "endpoint_temp": 1.6, "final_from": "state", "final_sample_mode": "argmax", "final_sample_temp": 1.0, "final_top_k": 64, "final_top_p": 0.95, "final_freq_penalty_alpha": 0.0, "final_freq_penalty_beta": 0.0, "final_freq_penalty_floor": 0.0, "lock_bos": false, "n_samples": 8, "seed": 20260520}, "raw_genppl": {"ppl": 82.17690506325746, "nll_per_token": 4.40887430228439, "tokens": 2040, "kept_samples": 8, "total_samples": 8, "empty_rate": 0.0, "skipped_samples": 0}, "stripped_genppl": {"ppl": 82.90081842761556, "nll_per_token": 4.417644934560738, "tokens": 2040, "kept_samples": 8, "total_samples": 8, "empty_rate": 0.0, "skipped_samples": 0}, "diversity": {"sample_entropy": 3.701395777792879, "unique_tokens": 371, "token_count": 8192, "distinct_1": 0.0452880859375, "distinct_2": 0.3543499511241447, "top_token_mass": 0.0709228515625}}
|
| 122 |
+
[summary] {"type": "summary", "checkpoint": "runs/lta_owt_compact_gpt2bpe_v8192_stream1024_fullycoupled_rmsnorm_nobias_adamw_wd0p1_uniformt_hardce_mask0p1-1p0_fp32_ddit768x12_gbs512_8gpu_1m_20260519_201817/step_0142513.pt", "step": 142513, "decode": {"steps": 128, "model_t_mode": "flow", "decode_time_schedule": "linear", "decode_s_min_frac": 0.0, "decode_s_max_frac": 0.25, "decode_force_final_t": true, "decode_time_grid": [0.0, 0.0078125, 0.015625, 0.0234375, 0.03125, 0.0390625, 0.046875, 0.0546875, 0.0625, 0.0703125, 0.078125, 0.0859375, 0.09375, 0.1015625, 0.109375, 0.1171875, 0.125, 0.1328125, 0.140625, 0.1484375, 0.15625, 0.1640625, 0.171875, 0.1796875, 0.1875, 0.1953125, 0.203125, 0.2109375, 0.21875, 0.2265625, 0.234375, 0.2421875, 0.25, 0.2578125, 0.265625, 0.2734375, 0.28125, 0.2890625, 0.296875, 0.3046875, 0.3125, 0.3203125, 0.328125, 0.3359375, 0.34375, 0.3515625, 0.359375, 0.3671875, 0.375, 0.3828125, 0.390625, 0.3984375, 0.40625, 0.4140625, 0.421875, 0.4296875, 0.4375, 0.4453125, 0.453125, 0.4609375, 0.46875, 0.4765625, 0.484375, 0.4921875, 0.5, 0.5078125, 0.515625, 0.5234375, 0.53125, 0.5390625, 0.546875, 0.5546875, 0.5625, 0.5703125, 0.578125, 0.5859375, 0.59375, 0.6015625, 0.609375, 0.6171875, 0.625, 0.6328125, 0.640625, 0.6484375, 0.65625, 0.6640625, 0.671875, 0.6796875, 0.6875, 0.6953125, 0.703125, 0.7109375, 0.71875, 0.7265625, 0.734375, 0.7421875, 0.75, 0.7578125, 0.765625, 0.7734375, 0.78125, 0.7890625, 0.796875, 0.8046875, 0.8125, 0.8203125, 0.828125, 0.8359375, 0.84375, 0.8515625, 0.859375, 0.8671875, 0.875, 0.8828125, 0.890625, 0.8984375, 0.90625, 0.9140625, 0.921875, 0.9296875, 0.9375, 0.9453125, 0.953125, 0.9609375, 0.96875, 0.9765625, 0.984375, 0.9921875, 1.0], "decode_rule": "dual_line_resample", "support_power": 1.0, "semantic_power": 1.0, "anchor_mode": "onehot", "cfg_scale": 0.0, "cfg_power": 1.0, "cfg_start": 0.0, "cfg_prior": "uniform", "decode_freq_penalty_alpha": 0.0, "decode_freq_penalty_beta": 0.0, "decode_freq_penalty_floor": 0.0, "decode_freq_penalty_start": 0.0, "decode_freq_penalty_end": 1.0, "decode_freq_penalty_power": 1.0, "start_t": 0.0, "start_init": "noise", "noise_init": "dirichlet", "noise_sigma": -1.0, "dirichlet_concentration": 1.0, "concentration_min": 1.0, "concentration_max": 256.0, "target_prob": 1.0, "endpoint_temp": 1.8, "final_from": "state", "final_sample_mode": "argmax", "final_sample_temp": 1.0, "final_top_k": 64, "final_top_p": 0.95, "final_freq_penalty_alpha": 0.0, "final_freq_penalty_beta": 0.0, "final_freq_penalty_floor": 0.0, "lock_bos": false, "n_samples": 8, "seed": 20260520}, "raw_genppl": {"ppl": 415.99116937865665, "nll_per_token": 6.030664032580805, "tokens": 2040, "kept_samples": 8, "total_samples": 8, "empty_rate": 0.0, "skipped_samples": 0}, "stripped_genppl": {"ppl": 417.34936172966326, "nll_per_token": 6.033923668954887, "tokens": 2040, "kept_samples": 8, "total_samples": 8, "empty_rate": 0.0, "skipped_samples": 0}, "diversity": {"sample_entropy": 4.6335167480868, "unique_tokens": 1248, "token_count": 8192, "distinct_1": 0.15234375, "distinct_2": 0.6768084066471163, "top_token_mass": 0.0369873046875}}
|
| 123 |
+
[summary] {"type": "summary", "checkpoint": "runs/lta_owt_compact_gpt2bpe_v8192_stream1024_fullycoupled_rmsnorm_nobias_adamw_wd0p1_uniformt_hardce_mask0p1-1p0_fp32_ddit768x12_gbs512_8gpu_1m_20260519_201817/step_0142513.pt", "step": 142513, "decode": {"steps": 128, "model_t_mode": "flow", "decode_time_schedule": "linear", "decode_s_min_frac": 0.0, "decode_s_max_frac": 0.25, "decode_force_final_t": true, "decode_time_grid": [0.0, 0.0078125, 0.015625, 0.0234375, 0.03125, 0.0390625, 0.046875, 0.0546875, 0.0625, 0.0703125, 0.078125, 0.0859375, 0.09375, 0.1015625, 0.109375, 0.1171875, 0.125, 0.1328125, 0.140625, 0.1484375, 0.15625, 0.1640625, 0.171875, 0.1796875, 0.1875, 0.1953125, 0.203125, 0.2109375, 0.21875, 0.2265625, 0.234375, 0.2421875, 0.25, 0.2578125, 0.265625, 0.2734375, 0.28125, 0.2890625, 0.296875, 0.3046875, 0.3125, 0.3203125, 0.328125, 0.3359375, 0.34375, 0.3515625, 0.359375, 0.3671875, 0.375, 0.3828125, 0.390625, 0.3984375, 0.40625, 0.4140625, 0.421875, 0.4296875, 0.4375, 0.4453125, 0.453125, 0.4609375, 0.46875, 0.4765625, 0.484375, 0.4921875, 0.5, 0.5078125, 0.515625, 0.5234375, 0.53125, 0.5390625, 0.546875, 0.5546875, 0.5625, 0.5703125, 0.578125, 0.5859375, 0.59375, 0.6015625, 0.609375, 0.6171875, 0.625, 0.6328125, 0.640625, 0.6484375, 0.65625, 0.6640625, 0.671875, 0.6796875, 0.6875, 0.6953125, 0.703125, 0.7109375, 0.71875, 0.7265625, 0.734375, 0.7421875, 0.75, 0.7578125, 0.765625, 0.7734375, 0.78125, 0.7890625, 0.796875, 0.8046875, 0.8125, 0.8203125, 0.828125, 0.8359375, 0.84375, 0.8515625, 0.859375, 0.8671875, 0.875, 0.8828125, 0.890625, 0.8984375, 0.90625, 0.9140625, 0.921875, 0.9296875, 0.9375, 0.9453125, 0.953125, 0.9609375, 0.96875, 0.9765625, 0.984375, 0.9921875, 1.0], "decode_rule": "dual_line_resample", "support_power": 1.0, "semantic_power": 1.0, "anchor_mode": "onehot", "cfg_scale": 0.0, "cfg_power": 1.0, "cfg_start": 0.0, "cfg_prior": "uniform", "decode_freq_penalty_alpha": 0.0, "decode_freq_penalty_beta": 0.0, "decode_freq_penalty_floor": 0.0, "decode_freq_penalty_start": 0.0, "decode_freq_penalty_end": 1.0, "decode_freq_penalty_power": 1.0, "start_t": 0.0, "start_init": "noise", "noise_init": "dirichlet", "noise_sigma": -1.0, "dirichlet_concentration": 1.0, "concentration_min": 1.0, "concentration_max": 256.0, "target_prob": 1.0, "endpoint_temp": 2.0, "final_from": "state", "final_sample_mode": "argmax", "final_sample_temp": 1.0, "final_top_k": 64, "final_top_p": 0.95, "final_freq_penalty_alpha": 0.0, "final_freq_penalty_beta": 0.0, "final_freq_penalty_floor": 0.0, "lock_bos": false, "n_samples": 8, "seed": 20260520}, "raw_genppl": {"ppl": 5750.184435097303, "nll_per_token": 8.656987208946079, "tokens": 2040, "kept_samples": 8, "total_samples": 8, "empty_rate": 0.0, "skipped_samples": 0}, "stripped_genppl": {"ppl": 5745.7928768823385, "nll_per_token": 8.65622319240196, "tokens": 2040, "kept_samples": 8, "total_samples": 8, "empty_rate": 0.0, "skipped_samples": 0}, "diversity": {"sample_entropy": 5.908193959743645, "unique_tokens": 2292, "token_count": 8192, "distinct_1": 0.27978515625, "distinct_2": 0.9499022482893451, "top_token_mass": 0.0211181640625}}
|
| 124 |
+
[done] docs/lta_samples/metrics_20260520/owt_compact_v8192_step142k_flow_onehot_argmax_sweep_n8_large/c256_temps.jsonl
|
| 125 |
+
[infer] c=128 temps=1.45,1.60,1.80,2.00 out=docs/lta_samples/metrics_20260520/owt_compact_v8192_step142k_flow_onehot_argmax_sweep_n8_large/c128_temps.jsonl
|
| 126 |
+
[ckpt] runs/lta_owt_compact_gpt2bpe_v8192_stream1024_fullycoupled_rmsnorm_nobias_adamw_wd0p1_uniformt_hardce_mask0p1-1p0_fp32_ddit768x12_gbs512_8gpu_1m_20260519_201817/step_0142513.pt step=142513
|
| 127 |
+
[decode-base] n=8 max_len=1024 steps=128 model_t=flow
|
| 128 |
+
[decode-time] schedule=linear s=[0.0,0.25] gumbel=(2.2,0.8) force_final=True t0=0.000000 t_mid=0.500000 t_end=1.000000 dt_mean=0.007812 dt_max=0.007812
|
| 129 |
+
[decode] temp=1.45 final=state rule=dual_line_resample support=1 semantic=1 anchor=onehot cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=argmax/1/k64/p0.95 freq_penalty=0/0/0 start_t=0 start_init=noise time_path=0.0000->1.0000 generated 1/8
|
| 130 |
+
[decode] temp=1.45 final=state rule=dual_line_resample support=1 semantic=1 anchor=onehot cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=argmax/1/k64/p0.95 freq_penalty=0/0/0 start_t=0 start_init=noise time_path=0.0000->1.0000 generated 2/8
|
| 131 |
+
[decode] temp=1.45 final=state rule=dual_line_resample support=1 semantic=1 anchor=onehot cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=argmax/1/k64/p0.95 freq_penalty=0/0/0 start_t=0 start_init=noise time_path=0.0000->1.0000 generated 3/8
|
| 132 |
+
[decode] temp=1.45 final=state rule=dual_line_resample support=1 semantic=1 anchor=onehot cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=argmax/1/k64/p0.95 freq_penalty=0/0/0 start_t=0 start_init=noise time_path=0.0000->1.0000 generated 4/8
|
| 133 |
+
[decode] temp=1.45 final=state rule=dual_line_resample support=1 semantic=1 anchor=onehot cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=argmax/1/k64/p0.95 freq_penalty=0/0/0 start_t=0 start_init=noise time_path=0.0000->1.0000 generated 5/8
|
| 134 |
+
[decode] temp=1.45 final=state rule=dual_line_resample support=1 semantic=1 anchor=onehot cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=argmax/1/k64/p0.95 freq_penalty=0/0/0 start_t=0 start_init=noise time_path=0.0000->1.0000 generated 6/8
|
| 135 |
+
[decode] temp=1.45 final=state rule=dual_line_resample support=1 semantic=1 anchor=onehot cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=argmax/1/k64/p0.95 freq_penalty=0/0/0 start_t=0 start_init=noise time_path=0.0000->1.0000 generated 7/8
|
| 136 |
+
[decode] temp=1.45 final=state rule=dual_line_resample support=1 semantic=1 anchor=onehot cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=argmax/1/k64/p0.95 freq_penalty=0/0/0 start_t=0 start_init=noise time_path=0.0000->1.0000 generated 8/8
|
| 137 |
+
[decode] temp=1.60 final=state rule=dual_line_resample support=1 semantic=1 anchor=onehot cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=argmax/1/k64/p0.95 freq_penalty=0/0/0 start_t=0 start_init=noise time_path=0.0000->1.0000 generated 1/8
|
| 138 |
+
[decode] temp=1.60 final=state rule=dual_line_resample support=1 semantic=1 anchor=onehot cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=argmax/1/k64/p0.95 freq_penalty=0/0/0 start_t=0 start_init=noise time_path=0.0000->1.0000 generated 2/8
|
| 139 |
+
[decode] temp=1.60 final=state rule=dual_line_resample support=1 semantic=1 anchor=onehot cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=argmax/1/k64/p0.95 freq_penalty=0/0/0 start_t=0 start_init=noise time_path=0.0000->1.0000 generated 3/8
|
| 140 |
+
[decode] temp=1.60 final=state rule=dual_line_resample support=1 semantic=1 anchor=onehot cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=argmax/1/k64/p0.95 freq_penalty=0/0/0 start_t=0 start_init=noise time_path=0.0000->1.0000 generated 4/8
|
| 141 |
+
[decode] temp=1.60 final=state rule=dual_line_resample support=1 semantic=1 anchor=onehot cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=argmax/1/k64/p0.95 freq_penalty=0/0/0 start_t=0 start_init=noise time_path=0.0000->1.0000 generated 5/8
|
| 142 |
+
[decode] temp=1.60 final=state rule=dual_line_resample support=1 semantic=1 anchor=onehot cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=argmax/1/k64/p0.95 freq_penalty=0/0/0 start_t=0 start_init=noise time_path=0.0000->1.0000 generated 6/8
|
| 143 |
+
[decode] temp=1.60 final=state rule=dual_line_resample support=1 semantic=1 anchor=onehot cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=argmax/1/k64/p0.95 freq_penalty=0/0/0 start_t=0 start_init=noise time_path=0.0000->1.0000 generated 7/8
|
| 144 |
+
[decode] temp=1.60 final=state rule=dual_line_resample support=1 semantic=1 anchor=onehot cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=argmax/1/k64/p0.95 freq_penalty=0/0/0 start_t=0 start_init=noise time_path=0.0000->1.0000 generated 8/8
|
| 145 |
+
[decode] temp=1.80 final=state rule=dual_line_resample support=1 semantic=1 anchor=onehot cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=argmax/1/k64/p0.95 freq_penalty=0/0/0 start_t=0 start_init=noise time_path=0.0000->1.0000 generated 1/8
|
| 146 |
+
[decode] temp=1.80 final=state rule=dual_line_resample support=1 semantic=1 anchor=onehot cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=argmax/1/k64/p0.95 freq_penalty=0/0/0 start_t=0 start_init=noise time_path=0.0000->1.0000 generated 2/8
|
| 147 |
+
[decode] temp=1.80 final=state rule=dual_line_resample support=1 semantic=1 anchor=onehot cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=argmax/1/k64/p0.95 freq_penalty=0/0/0 start_t=0 start_init=noise time_path=0.0000->1.0000 generated 3/8
|
| 148 |
+
[decode] temp=1.80 final=state rule=dual_line_resample support=1 semantic=1 anchor=onehot cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=argmax/1/k64/p0.95 freq_penalty=0/0/0 start_t=0 start_init=noise time_path=0.0000->1.0000 generated 4/8
|
| 149 |
+
[decode] temp=1.80 final=state rule=dual_line_resample support=1 semantic=1 anchor=onehot cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=argmax/1/k64/p0.95 freq_penalty=0/0/0 start_t=0 start_init=noise time_path=0.0000->1.0000 generated 5/8
|
| 150 |
+
[decode] temp=1.80 final=state rule=dual_line_resample support=1 semantic=1 anchor=onehot cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=argmax/1/k64/p0.95 freq_penalty=0/0/0 start_t=0 start_init=noise time_path=0.0000->1.0000 generated 6/8
|
| 151 |
+
[decode] temp=1.80 final=state rule=dual_line_resample support=1 semantic=1 anchor=onehot cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=argmax/1/k64/p0.95 freq_penalty=0/0/0 start_t=0 start_init=noise time_path=0.0000->1.0000 generated 7/8
|
| 152 |
+
[decode] temp=1.80 final=state rule=dual_line_resample support=1 semantic=1 anchor=onehot cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=argmax/1/k64/p0.95 freq_penalty=0/0/0 start_t=0 start_init=noise time_path=0.0000->1.0000 generated 8/8
|
| 153 |
+
[decode] temp=2.00 final=state rule=dual_line_resample support=1 semantic=1 anchor=onehot cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=argmax/1/k64/p0.95 freq_penalty=0/0/0 start_t=0 start_init=noise time_path=0.0000->1.0000 generated 1/8
|
| 154 |
+
[decode] temp=2.00 final=state rule=dual_line_resample support=1 semantic=1 anchor=onehot cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=argmax/1/k64/p0.95 freq_penalty=0/0/0 start_t=0 start_init=noise time_path=0.0000->1.0000 generated 2/8
|
| 155 |
+
[decode] temp=2.00 final=state rule=dual_line_resample support=1 semantic=1 anchor=onehot cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=argmax/1/k64/p0.95 freq_penalty=0/0/0 start_t=0 start_init=noise time_path=0.0000->1.0000 generated 3/8
|
| 156 |
+
[decode] temp=2.00 final=state rule=dual_line_resample support=1 semantic=1 anchor=onehot cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=argmax/1/k64/p0.95 freq_penalty=0/0/0 start_t=0 start_init=noise time_path=0.0000->1.0000 generated 4/8
|
| 157 |
+
[decode] temp=2.00 final=state rule=dual_line_resample support=1 semantic=1 anchor=onehot cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=argmax/1/k64/p0.95 freq_penalty=0/0/0 start_t=0 start_init=noise time_path=0.0000->1.0000 generated 5/8
|
| 158 |
+
[decode] temp=2.00 final=state rule=dual_line_resample support=1 semantic=1 anchor=onehot cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=argmax/1/k64/p0.95 freq_penalty=0/0/0 start_t=0 start_init=noise time_path=0.0000->1.0000 generated 6/8
|
| 159 |
+
[decode] temp=2.00 final=state rule=dual_line_resample support=1 semantic=1 anchor=onehot cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=argmax/1/k64/p0.95 freq_penalty=0/0/0 start_t=0 start_init=noise time_path=0.0000->1.0000 generated 7/8
|
| 160 |
+
[decode] temp=2.00 final=state rule=dual_line_resample support=1 semantic=1 anchor=onehot cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=argmax/1/k64/p0.95 freq_penalty=0/0/0 start_t=0 start_init=noise time_path=0.0000->1.0000 generated 8/8
|
| 161 |
+
[summary] {"type": "summary", "checkpoint": "runs/lta_owt_compact_gpt2bpe_v8192_stream1024_fullycoupled_rmsnorm_nobias_adamw_wd0p1_uniformt_hardce_mask0p1-1p0_fp32_ddit768x12_gbs512_8gpu_1m_20260519_201817/step_0142513.pt", "step": 142513, "decode": {"steps": 128, "model_t_mode": "flow", "decode_time_schedule": "linear", "decode_s_min_frac": 0.0, "decode_s_max_frac": 0.25, "decode_force_final_t": true, "decode_time_grid": [0.0, 0.0078125, 0.015625, 0.0234375, 0.03125, 0.0390625, 0.046875, 0.0546875, 0.0625, 0.0703125, 0.078125, 0.0859375, 0.09375, 0.1015625, 0.109375, 0.1171875, 0.125, 0.1328125, 0.140625, 0.1484375, 0.15625, 0.1640625, 0.171875, 0.1796875, 0.1875, 0.1953125, 0.203125, 0.2109375, 0.21875, 0.2265625, 0.234375, 0.2421875, 0.25, 0.2578125, 0.265625, 0.2734375, 0.28125, 0.2890625, 0.296875, 0.3046875, 0.3125, 0.3203125, 0.328125, 0.3359375, 0.34375, 0.3515625, 0.359375, 0.3671875, 0.375, 0.3828125, 0.390625, 0.3984375, 0.40625, 0.4140625, 0.421875, 0.4296875, 0.4375, 0.4453125, 0.453125, 0.4609375, 0.46875, 0.4765625, 0.484375, 0.4921875, 0.5, 0.5078125, 0.515625, 0.5234375, 0.53125, 0.5390625, 0.546875, 0.5546875, 0.5625, 0.5703125, 0.578125, 0.5859375, 0.59375, 0.6015625, 0.609375, 0.6171875, 0.625, 0.6328125, 0.640625, 0.6484375, 0.65625, 0.6640625, 0.671875, 0.6796875, 0.6875, 0.6953125, 0.703125, 0.7109375, 0.71875, 0.7265625, 0.734375, 0.7421875, 0.75, 0.7578125, 0.765625, 0.7734375, 0.78125, 0.7890625, 0.796875, 0.8046875, 0.8125, 0.8203125, 0.828125, 0.8359375, 0.84375, 0.8515625, 0.859375, 0.8671875, 0.875, 0.8828125, 0.890625, 0.8984375, 0.90625, 0.9140625, 0.921875, 0.9296875, 0.9375, 0.9453125, 0.953125, 0.9609375, 0.96875, 0.9765625, 0.984375, 0.9921875, 1.0], "decode_rule": "dual_line_resample", "support_power": 1.0, "semantic_power": 1.0, "anchor_mode": "onehot", "cfg_scale": 0.0, "cfg_power": 1.0, "cfg_start": 0.0, "cfg_prior": "uniform", "decode_freq_penalty_alpha": 0.0, "decode_freq_penalty_beta": 0.0, "decode_freq_penalty_floor": 0.0, "decode_freq_penalty_start": 0.0, "decode_freq_penalty_end": 1.0, "decode_freq_penalty_power": 1.0, "start_t": 0.0, "start_init": "noise", "noise_init": "dirichlet", "noise_sigma": -1.0, "dirichlet_concentration": 1.0, "concentration_min": 1.0, "concentration_max": 128.0, "target_prob": 1.0, "endpoint_temp": 1.45, "final_from": "state", "final_sample_mode": "argmax", "final_sample_temp": 1.0, "final_top_k": 64, "final_top_p": 0.95, "final_freq_penalty_alpha": 0.0, "final_freq_penalty_beta": 0.0, "final_freq_penalty_floor": 0.0, "lock_bos": false, "n_samples": 8, "seed": 20260520}, "raw_genppl": {"ppl": 160.43586427465166, "nll_per_token": 5.07789426317402, "tokens": 2040, "kept_samples": 8, "total_samples": 8, "empty_rate": 0.0, "skipped_samples": 0}, "stripped_genppl": {"ppl": 165.1494624806168, "nll_per_token": 5.10685089709712, "tokens": 2040, "kept_samples": 8, "total_samples": 8, "empty_rate": 0.0, "skipped_samples": 0}, "diversity": {"sample_entropy": 4.445461443708831, "unique_tokens": 847, "token_count": 8192, "distinct_1": 0.1033935546875, "distinct_2": 0.5334799608993157, "top_token_mass": 0.029541015625}}
|
| 162 |
+
[summary] {"type": "summary", "checkpoint": "runs/lta_owt_compact_gpt2bpe_v8192_stream1024_fullycoupled_rmsnorm_nobias_adamw_wd0p1_uniformt_hardce_mask0p1-1p0_fp32_ddit768x12_gbs512_8gpu_1m_20260519_201817/step_0142513.pt", "step": 142513, "decode": {"steps": 128, "model_t_mode": "flow", "decode_time_schedule": "linear", "decode_s_min_frac": 0.0, "decode_s_max_frac": 0.25, "decode_force_final_t": true, "decode_time_grid": [0.0, 0.0078125, 0.015625, 0.0234375, 0.03125, 0.0390625, 0.046875, 0.0546875, 0.0625, 0.0703125, 0.078125, 0.0859375, 0.09375, 0.1015625, 0.109375, 0.1171875, 0.125, 0.1328125, 0.140625, 0.1484375, 0.15625, 0.1640625, 0.171875, 0.1796875, 0.1875, 0.1953125, 0.203125, 0.2109375, 0.21875, 0.2265625, 0.234375, 0.2421875, 0.25, 0.2578125, 0.265625, 0.2734375, 0.28125, 0.2890625, 0.296875, 0.3046875, 0.3125, 0.3203125, 0.328125, 0.3359375, 0.34375, 0.3515625, 0.359375, 0.3671875, 0.375, 0.3828125, 0.390625, 0.3984375, 0.40625, 0.4140625, 0.421875, 0.4296875, 0.4375, 0.4453125, 0.453125, 0.4609375, 0.46875, 0.4765625, 0.484375, 0.4921875, 0.5, 0.5078125, 0.515625, 0.5234375, 0.53125, 0.5390625, 0.546875, 0.5546875, 0.5625, 0.5703125, 0.578125, 0.5859375, 0.59375, 0.6015625, 0.609375, 0.6171875, 0.625, 0.6328125, 0.640625, 0.6484375, 0.65625, 0.6640625, 0.671875, 0.6796875, 0.6875, 0.6953125, 0.703125, 0.7109375, 0.71875, 0.7265625, 0.734375, 0.7421875, 0.75, 0.7578125, 0.765625, 0.7734375, 0.78125, 0.7890625, 0.796875, 0.8046875, 0.8125, 0.8203125, 0.828125, 0.8359375, 0.84375, 0.8515625, 0.859375, 0.8671875, 0.875, 0.8828125, 0.890625, 0.8984375, 0.90625, 0.9140625, 0.921875, 0.9296875, 0.9375, 0.9453125, 0.953125, 0.9609375, 0.96875, 0.9765625, 0.984375, 0.9921875, 1.0], "decode_rule": "dual_line_resample", "support_power": 1.0, "semantic_power": 1.0, "anchor_mode": "onehot", "cfg_scale": 0.0, "cfg_power": 1.0, "cfg_start": 0.0, "cfg_prior": "uniform", "decode_freq_penalty_alpha": 0.0, "decode_freq_penalty_beta": 0.0, "decode_freq_penalty_floor": 0.0, "decode_freq_penalty_start": 0.0, "decode_freq_penalty_end": 1.0, "decode_freq_penalty_power": 1.0, "start_t": 0.0, "start_init": "noise", "noise_init": "dirichlet", "noise_sigma": -1.0, "dirichlet_concentration": 1.0, "concentration_min": 1.0, "concentration_max": 128.0, "target_prob": 1.0, "endpoint_temp": 1.6, "final_from": "state", "final_sample_mode": "argmax", "final_sample_temp": 1.0, "final_top_k": 64, "final_top_p": 0.95, "final_freq_penalty_alpha": 0.0, "final_freq_penalty_beta": 0.0, "final_freq_penalty_floor": 0.0, "lock_bos": false, "n_samples": 8, "seed": 20260520}, "raw_genppl": {"ppl": 216.4485436312606, "nll_per_token": 5.377352845435049, "tokens": 2040, "kept_samples": 8, "total_samples": 8, "empty_rate": 0.0, "skipped_samples": 0}, "stripped_genppl": {"ppl": 218.31928970818086, "nll_per_token": 5.385958622951134, "tokens": 2040, "kept_samples": 8, "total_samples": 8, "empty_rate": 0.0, "skipped_samples": 0}, "diversity": {"sample_entropy": 3.9082948069564583, "unique_tokens": 565, "token_count": 8192, "distinct_1": 0.0689697265625, "distinct_2": 0.4560117302052786, "top_token_mass": 0.1065673828125}}
|
| 163 |
+
[summary] {"type": "summary", "checkpoint": "runs/lta_owt_compact_gpt2bpe_v8192_stream1024_fullycoupled_rmsnorm_nobias_adamw_wd0p1_uniformt_hardce_mask0p1-1p0_fp32_ddit768x12_gbs512_8gpu_1m_20260519_201817/step_0142513.pt", "step": 142513, "decode": {"steps": 128, "model_t_mode": "flow", "decode_time_schedule": "linear", "decode_s_min_frac": 0.0, "decode_s_max_frac": 0.25, "decode_force_final_t": true, "decode_time_grid": [0.0, 0.0078125, 0.015625, 0.0234375, 0.03125, 0.0390625, 0.046875, 0.0546875, 0.0625, 0.0703125, 0.078125, 0.0859375, 0.09375, 0.1015625, 0.109375, 0.1171875, 0.125, 0.1328125, 0.140625, 0.1484375, 0.15625, 0.1640625, 0.171875, 0.1796875, 0.1875, 0.1953125, 0.203125, 0.2109375, 0.21875, 0.2265625, 0.234375, 0.2421875, 0.25, 0.2578125, 0.265625, 0.2734375, 0.28125, 0.2890625, 0.296875, 0.3046875, 0.3125, 0.3203125, 0.328125, 0.3359375, 0.34375, 0.3515625, 0.359375, 0.3671875, 0.375, 0.3828125, 0.390625, 0.3984375, 0.40625, 0.4140625, 0.421875, 0.4296875, 0.4375, 0.4453125, 0.453125, 0.4609375, 0.46875, 0.4765625, 0.484375, 0.4921875, 0.5, 0.5078125, 0.515625, 0.5234375, 0.53125, 0.5390625, 0.546875, 0.5546875, 0.5625, 0.5703125, 0.578125, 0.5859375, 0.59375, 0.6015625, 0.609375, 0.6171875, 0.625, 0.6328125, 0.640625, 0.6484375, 0.65625, 0.6640625, 0.671875, 0.6796875, 0.6875, 0.6953125, 0.703125, 0.7109375, 0.71875, 0.7265625, 0.734375, 0.7421875, 0.75, 0.7578125, 0.765625, 0.7734375, 0.78125, 0.7890625, 0.796875, 0.8046875, 0.8125, 0.8203125, 0.828125, 0.8359375, 0.84375, 0.8515625, 0.859375, 0.8671875, 0.875, 0.8828125, 0.890625, 0.8984375, 0.90625, 0.9140625, 0.921875, 0.9296875, 0.9375, 0.9453125, 0.953125, 0.9609375, 0.96875, 0.9765625, 0.984375, 0.9921875, 1.0], "decode_rule": "dual_line_resample", "support_power": 1.0, "semantic_power": 1.0, "anchor_mode": "onehot", "cfg_scale": 0.0, "cfg_power": 1.0, "cfg_start": 0.0, "cfg_prior": "uniform", "decode_freq_penalty_alpha": 0.0, "decode_freq_penalty_beta": 0.0, "decode_freq_penalty_floor": 0.0, "decode_freq_penalty_start": 0.0, "decode_freq_penalty_end": 1.0, "decode_freq_penalty_power": 1.0, "start_t": 0.0, "start_init": "noise", "noise_init": "dirichlet", "noise_sigma": -1.0, "dirichlet_concentration": 1.0, "concentration_min": 1.0, "concentration_max": 128.0, "target_prob": 1.0, "endpoint_temp": 1.8, "final_from": "state", "final_sample_mode": "argmax", "final_sample_temp": 1.0, "final_top_k": 64, "final_top_p": 0.95, "final_freq_penalty_alpha": 0.0, "final_freq_penalty_beta": 0.0, "final_freq_penalty_floor": 0.0, "lock_bos": false, "n_samples": 8, "seed": 20260520}, "raw_genppl": {"ppl": 428.9966791804292, "nll_per_token": 6.061449178059896, "tokens": 2040, "kept_samples": 8, "total_samples": 8, "empty_rate": 0.0, "skipped_samples": 0}, "stripped_genppl": {"ppl": 432.2821554908295, "nll_per_token": 6.069078512752758, "tokens": 2040, "kept_samples": 8, "total_samples": 8, "empty_rate": 0.0, "skipped_samples": 0}, "diversity": {"sample_entropy": 4.355772572908551, "unique_tokens": 1079, "token_count": 8192, "distinct_1": 0.1317138671875, "distinct_2": 0.5505865102639296, "top_token_mass": 0.0419921875}}
|
| 164 |
+
[summary] {"type": "summary", "checkpoint": "runs/lta_owt_compact_gpt2bpe_v8192_stream1024_fullycoupled_rmsnorm_nobias_adamw_wd0p1_uniformt_hardce_mask0p1-1p0_fp32_ddit768x12_gbs512_8gpu_1m_20260519_201817/step_0142513.pt", "step": 142513, "decode": {"steps": 128, "model_t_mode": "flow", "decode_time_schedule": "linear", "decode_s_min_frac": 0.0, "decode_s_max_frac": 0.25, "decode_force_final_t": true, "decode_time_grid": [0.0, 0.0078125, 0.015625, 0.0234375, 0.03125, 0.0390625, 0.046875, 0.0546875, 0.0625, 0.0703125, 0.078125, 0.0859375, 0.09375, 0.1015625, 0.109375, 0.1171875, 0.125, 0.1328125, 0.140625, 0.1484375, 0.15625, 0.1640625, 0.171875, 0.1796875, 0.1875, 0.1953125, 0.203125, 0.2109375, 0.21875, 0.2265625, 0.234375, 0.2421875, 0.25, 0.2578125, 0.265625, 0.2734375, 0.28125, 0.2890625, 0.296875, 0.3046875, 0.3125, 0.3203125, 0.328125, 0.3359375, 0.34375, 0.3515625, 0.359375, 0.3671875, 0.375, 0.3828125, 0.390625, 0.3984375, 0.40625, 0.4140625, 0.421875, 0.4296875, 0.4375, 0.4453125, 0.453125, 0.4609375, 0.46875, 0.4765625, 0.484375, 0.4921875, 0.5, 0.5078125, 0.515625, 0.5234375, 0.53125, 0.5390625, 0.546875, 0.5546875, 0.5625, 0.5703125, 0.578125, 0.5859375, 0.59375, 0.6015625, 0.609375, 0.6171875, 0.625, 0.6328125, 0.640625, 0.6484375, 0.65625, 0.6640625, 0.671875, 0.6796875, 0.6875, 0.6953125, 0.703125, 0.7109375, 0.71875, 0.7265625, 0.734375, 0.7421875, 0.75, 0.7578125, 0.765625, 0.7734375, 0.78125, 0.7890625, 0.796875, 0.8046875, 0.8125, 0.8203125, 0.828125, 0.8359375, 0.84375, 0.8515625, 0.859375, 0.8671875, 0.875, 0.8828125, 0.890625, 0.8984375, 0.90625, 0.9140625, 0.921875, 0.9296875, 0.9375, 0.9453125, 0.953125, 0.9609375, 0.96875, 0.9765625, 0.984375, 0.9921875, 1.0], "decode_rule": "dual_line_resample", "support_power": 1.0, "semantic_power": 1.0, "anchor_mode": "onehot", "cfg_scale": 0.0, "cfg_power": 1.0, "cfg_start": 0.0, "cfg_prior": "uniform", "decode_freq_penalty_alpha": 0.0, "decode_freq_penalty_beta": 0.0, "decode_freq_penalty_floor": 0.0, "decode_freq_penalty_start": 0.0, "decode_freq_penalty_end": 1.0, "decode_freq_penalty_power": 1.0, "start_t": 0.0, "start_init": "noise", "noise_init": "dirichlet", "noise_sigma": -1.0, "dirichlet_concentration": 1.0, "concentration_min": 1.0, "concentration_max": 128.0, "target_prob": 1.0, "endpoint_temp": 2.0, "final_from": "state", "final_sample_mode": "argmax", "final_sample_temp": 1.0, "final_top_k": 64, "final_top_p": 0.95, "final_freq_penalty_alpha": 0.0, "final_freq_penalty_beta": 0.0, "final_freq_penalty_floor": 0.0, "lock_bos": false, "n_samples": 8, "seed": 20260520}, "raw_genppl": {"ppl": 8196.449066792626, "nll_per_token": 9.011456298828126, "tokens": 2040, "kept_samples": 8, "total_samples": 8, "empty_rate": 0.0, "skipped_samples": 0}, "stripped_genppl": {"ppl": 8202.819703826315, "nll_per_token": 9.012233240464154, "tokens": 2040, "kept_samples": 8, "total_samples": 8, "empty_rate": 0.0, "skipped_samples": 0}, "diversity": {"sample_entropy": 6.2585507689933495, "unique_tokens": 2703, "token_count": 8192, "distinct_1": 0.3299560546875, "distinct_2": 0.9870478983382209, "top_token_mass": 0.0123291015625}}
|
| 165 |
+
[done] docs/lta_samples/metrics_20260520/owt_compact_v8192_step142k_flow_onehot_argmax_sweep_n8_large/c128_temps.jsonl
|
| 166 |
+
cmax temp raw stripped entropy unique top_mass tokens kept
|
| 167 |
+
128.0 1.45 160.4359 165.1495 4.4455 847 0.0295 2040 8
|
| 168 |
+
128.0 1.60 216.4485 218.3193 3.9083 565 0.1066 2040 8
|
| 169 |
+
128.0 1.80 428.9967 432.2822 4.3558 1079 0.0420 2040 8
|
| 170 |
+
128.0 2.00 8196.4491 8202.8197 6.2586 2703 0.0123 2040 8
|
| 171 |
+
256.0 1.45 93.2402 99.5832 4.2711 634 0.0474 2040 8
|
| 172 |
+
256.0 1.60 82.1769 82.9008 3.7014 371 0.0709 2040 8
|
| 173 |
+
256.0 1.80 415.9912 417.3494 4.6335 1248 0.0370 2040 8
|
| 174 |
+
256.0 2.00 5750.1844 5745.7929 5.9082 2292 0.0211 2040 8
|
| 175 |
+
512.0 1.45 35.5958 36.8451 3.6511 651 0.0393 2040 8
|
| 176 |
+
512.0 1.60 125.8737 126.4569 3.8278 429 0.0482 2040 8
|
| 177 |
+
512.0 1.80 494.1966 495.8882 4.1030 806 0.0656 2040 8
|
| 178 |
+
512.0 2.00 2772.9986 2757.6593 5.1670 1542 0.0463 2040 8
|
| 179 |
+
1024.0 1.45 24.5589 24.9521 3.3675 750 0.0934 2040 8
|
| 180 |
+
1024.0 1.60 61.9012 61.8288 2.8654 241 0.1498 2040 8
|
| 181 |
+
1024.0 1.80 236.1035 236.3866 3.4263 334 0.1022 2040 8
|
| 182 |
+
1024.0 2.00 797.2969 797.2969 3.9980 594 0.0873 2040 8
|
| 183 |
+
|
| 184 |
+
[rank target-ish entropy>=4.5 raw<80]
|
LTA_openwebtext_dualt/logs/infer_owt_t5_2node_latest_trainmatched_dirres_c128_lowtemp_n8.log
ADDED
|
@@ -0,0 +1,58 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[ckpt] eval_ckpts/lta_owt_t5_adaln_adamw_wd0p1_rollin_p50_randk0_3_uniformt_temp1_synct_gbs512_2node8gpu_1m_t-20260518224737-tftgw/latest_frozen_step101200_20260519_092441.pt step=101000
|
| 2 |
+
[decode-base] n=8 max_len=1024 steps=1024 model_t=post
|
| 3 |
+
[decode-time] schedule=linear s=[0.0,0.25] force_final=True t0=0.000000 t_mid=0.500000 t_end=1.000000 dt_mean=0.000977 dt_max=0.000977
|
| 4 |
+
[decode] temp=1.55 final=state rule=dirichlet_resample support=1 semantic=1 anchor=onehot cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=argmax/1/k64/p0.95 freq_penalty=0/0/0 start_t=0 start_init=noise time_path=0.0000->1.0000 generated 1/8
|
| 5 |
+
[decode] temp=1.55 final=state rule=dirichlet_resample support=1 semantic=1 anchor=onehot cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=argmax/1/k64/p0.95 freq_penalty=0/0/0 start_t=0 start_init=noise time_path=0.0000->1.0000 generated 2/8
|
| 6 |
+
[decode] temp=1.55 final=state rule=dirichlet_resample support=1 semantic=1 anchor=onehot cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=argmax/1/k64/p0.95 freq_penalty=0/0/0 start_t=0 start_init=noise time_path=0.0000->1.0000 generated 3/8
|
| 7 |
+
[decode] temp=1.55 final=state rule=dirichlet_resample support=1 semantic=1 anchor=onehot cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=argmax/1/k64/p0.95 freq_penalty=0/0/0 start_t=0 start_init=noise time_path=0.0000->1.0000 generated 4/8
|
| 8 |
+
[decode] temp=1.55 final=state rule=dirichlet_resample support=1 semantic=1 anchor=onehot cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=argmax/1/k64/p0.95 freq_penalty=0/0/0 start_t=0 start_init=noise time_path=0.0000->1.0000 generated 5/8
|
| 9 |
+
[decode] temp=1.55 final=state rule=dirichlet_resample support=1 semantic=1 anchor=onehot cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=argmax/1/k64/p0.95 freq_penalty=0/0/0 start_t=0 start_init=noise time_path=0.0000->1.0000 generated 6/8
|
| 10 |
+
[decode] temp=1.55 final=state rule=dirichlet_resample support=1 semantic=1 anchor=onehot cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=argmax/1/k64/p0.95 freq_penalty=0/0/0 start_t=0 start_init=noise time_path=0.0000->1.0000 generated 7/8
|
| 11 |
+
[decode] temp=1.55 final=state rule=dirichlet_resample support=1 semantic=1 anchor=onehot cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=argmax/1/k64/p0.95 freq_penalty=0/0/0 start_t=0 start_init=noise time_path=0.0000->1.0000 generated 8/8
|
| 12 |
+
[summary] {"type": "summary", "checkpoint": "eval_ckpts/lta_owt_t5_adaln_adamw_wd0p1_rollin_p50_randk0_3_uniformt_temp1_synct_gbs512_2node8gpu_1m_t-20260518224737-tftgw/latest_frozen_step101200_20260519_092441.pt", "step": 101000, "decode": {"steps": 1024, "model_t_mode": "post", "decode_time_schedule": "linear", "decode_s_min_frac": 0.0, "decode_s_max_frac": 0.25, "decode_force_final_t": true, "decode_time_grid": [0.0, 0.0009765625, 0.001953125, 0.0029296875, 0.00390625, 0.0048828125, 0.005859375, 0.0068359375, 0.0078125, 0.0087890625, 0.009765625, 0.0107421875, 0.01171875, 0.0126953125, 0.013671875, 0.0146484375, 0.015625, 0.0166015625, 0.017578125, 0.0185546875, 0.01953125, 0.0205078125, 0.021484375, 0.0224609375, 0.0234375, 0.0244140625, 0.025390625, 0.0263671875, 0.02734375, 0.0283203125, 0.029296875, 0.0302734375, 0.03125, 0.0322265625, 0.033203125, 0.0341796875, 0.03515625, 0.0361328125, 0.037109375, 0.0380859375, 0.0390625, 0.0400390625, 0.041015625, 0.0419921875, 0.04296875, 0.0439453125, 0.044921875, 0.0458984375, 0.046875, 0.0478515625, 0.048828125, 0.0498046875, 0.05078125, 0.0517578125, 0.052734375, 0.0537109375, 0.0546875, 0.0556640625, 0.056640625, 0.0576171875, 0.05859375, 0.0595703125, 0.060546875, 0.0615234375, 0.0625, 0.0634765625, 0.064453125, 0.0654296875, 0.06640625, 0.0673828125, 0.068359375, 0.0693359375, 0.0703125, 0.0712890625, 0.072265625, 0.0732421875, 0.07421875, 0.0751953125, 0.076171875, 0.0771484375, 0.078125, 0.0791015625, 0.080078125, 0.0810546875, 0.08203125, 0.0830078125, 0.083984375, 0.0849609375, 0.0859375, 0.0869140625, 0.087890625, 0.0888671875, 0.08984375, 0.0908203125, 0.091796875, 0.0927734375, 0.09375, 0.0947265625, 0.095703125, 0.0966796875, 0.09765625, 0.0986328125, 0.099609375, 0.1005859375, 0.1015625, 0.1025390625, 0.103515625, 0.1044921875, 0.10546875, 0.1064453125, 0.107421875, 0.1083984375, 0.109375, 0.1103515625, 0.111328125, 0.1123046875, 0.11328125, 0.1142578125, 0.115234375, 0.1162109375, 0.1171875, 0.1181640625, 0.119140625, 0.1201171875, 0.12109375, 0.1220703125, 0.123046875, 0.1240234375, 0.125, 0.1259765625, 0.126953125, 0.1279296875, 0.12890625, 0.1298828125, 0.130859375, 0.1318359375, 0.1328125, 0.1337890625, 0.134765625, 0.1357421875, 0.13671875, 0.1376953125, 0.138671875, 0.1396484375, 0.140625, 0.1416015625, 0.142578125, 0.1435546875, 0.14453125, 0.1455078125, 0.146484375, 0.1474609375, 0.1484375, 0.1494140625, 0.150390625, 0.1513671875, 0.15234375, 0.1533203125, 0.154296875, 0.1552734375, 0.15625, 0.1572265625, 0.158203125, 0.1591796875, 0.16015625, 0.1611328125, 0.162109375, 0.1630859375, 0.1640625, 0.1650390625, 0.166015625, 0.1669921875, 0.16796875, 0.1689453125, 0.169921875, 0.1708984375, 0.171875, 0.1728515625, 0.173828125, 0.1748046875, 0.17578125, 0.1767578125, 0.177734375, 0.1787109375, 0.1796875, 0.1806640625, 0.181640625, 0.1826171875, 0.18359375, 0.1845703125, 0.185546875, 0.1865234375, 0.1875, 0.1884765625, 0.189453125, 0.1904296875, 0.19140625, 0.1923828125, 0.193359375, 0.1943359375, 0.1953125, 0.1962890625, 0.197265625, 0.1982421875, 0.19921875, 0.2001953125, 0.201171875, 0.2021484375, 0.203125, 0.2041015625, 0.205078125, 0.2060546875, 0.20703125, 0.2080078125, 0.208984375, 0.2099609375, 0.2109375, 0.2119140625, 0.212890625, 0.2138671875, 0.21484375, 0.2158203125, 0.216796875, 0.2177734375, 0.21875, 0.2197265625, 0.220703125, 0.2216796875, 0.22265625, 0.2236328125, 0.224609375, 0.2255859375, 0.2265625, 0.2275390625, 0.228515625, 0.2294921875, 0.23046875, 0.2314453125, 0.232421875, 0.2333984375, 0.234375, 0.2353515625, 0.236328125, 0.2373046875, 0.23828125, 0.2392578125, 0.240234375, 0.2412109375, 0.2421875, 0.2431640625, 0.244140625, 0.2451171875, 0.24609375, 0.2470703125, 0.248046875, 0.2490234375, 0.25, 0.2509765625, 0.251953125, 0.2529296875, 0.25390625, 0.2548828125, 0.255859375, 0.2568359375, 0.2578125, 0.2587890625, 0.259765625, 0.2607421875, 0.26171875, 0.2626953125, 0.263671875, 0.2646484375, 0.265625, 0.2666015625, 0.267578125, 0.2685546875, 0.26953125, 0.2705078125, 0.271484375, 0.2724609375, 0.2734375, 0.2744140625, 0.275390625, 0.2763671875, 0.27734375, 0.2783203125, 0.279296875, 0.2802734375, 0.28125, 0.2822265625, 0.283203125, 0.2841796875, 0.28515625, 0.2861328125, 0.287109375, 0.2880859375, 0.2890625, 0.2900390625, 0.291015625, 0.2919921875, 0.29296875, 0.2939453125, 0.294921875, 0.2958984375, 0.296875, 0.2978515625, 0.298828125, 0.2998046875, 0.30078125, 0.3017578125, 0.302734375, 0.3037109375, 0.3046875, 0.3056640625, 0.306640625, 0.3076171875, 0.30859375, 0.3095703125, 0.310546875, 0.3115234375, 0.3125, 0.3134765625, 0.314453125, 0.3154296875, 0.31640625, 0.3173828125, 0.318359375, 0.3193359375, 0.3203125, 0.3212890625, 0.322265625, 0.3232421875, 0.32421875, 0.3251953125, 0.326171875, 0.3271484375, 0.328125, 0.3291015625, 0.330078125, 0.3310546875, 0.33203125, 0.3330078125, 0.333984375, 0.3349609375, 0.3359375, 0.3369140625, 0.337890625, 0.3388671875, 0.33984375, 0.3408203125, 0.341796875, 0.3427734375, 0.34375, 0.3447265625, 0.345703125, 0.3466796875, 0.34765625, 0.3486328125, 0.349609375, 0.3505859375, 0.3515625, 0.3525390625, 0.353515625, 0.3544921875, 0.35546875, 0.3564453125, 0.357421875, 0.3583984375, 0.359375, 0.3603515625, 0.361328125, 0.3623046875, 0.36328125, 0.3642578125, 0.365234375, 0.3662109375, 0.3671875, 0.3681640625, 0.369140625, 0.3701171875, 0.37109375, 0.3720703125, 0.373046875, 0.3740234375, 0.375, 0.3759765625, 0.376953125, 0.3779296875, 0.37890625, 0.3798828125, 0.380859375, 0.3818359375, 0.3828125, 0.3837890625, 0.384765625, 0.3857421875, 0.38671875, 0.3876953125, 0.388671875, 0.3896484375, 0.390625, 0.3916015625, 0.392578125, 0.3935546875, 0.39453125, 0.3955078125, 0.396484375, 0.3974609375, 0.3984375, 0.3994140625, 0.400390625, 0.4013671875, 0.40234375, 0.4033203125, 0.404296875, 0.4052734375, 0.40625, 0.4072265625, 0.408203125, 0.4091796875, 0.41015625, 0.4111328125, 0.412109375, 0.4130859375, 0.4140625, 0.4150390625, 0.416015625, 0.4169921875, 0.41796875, 0.4189453125, 0.419921875, 0.4208984375, 0.421875, 0.4228515625, 0.423828125, 0.4248046875, 0.42578125, 0.4267578125, 0.427734375, 0.4287109375, 0.4296875, 0.4306640625, 0.431640625, 0.4326171875, 0.43359375, 0.4345703125, 0.435546875, 0.4365234375, 0.4375, 0.4384765625, 0.439453125, 0.4404296875, 0.44140625, 0.4423828125, 0.443359375, 0.4443359375, 0.4453125, 0.4462890625, 0.447265625, 0.4482421875, 0.44921875, 0.4501953125, 0.451171875, 0.4521484375, 0.453125, 0.4541015625, 0.455078125, 0.4560546875, 0.45703125, 0.4580078125, 0.458984375, 0.4599609375, 0.4609375, 0.4619140625, 0.462890625, 0.4638671875, 0.46484375, 0.4658203125, 0.466796875, 0.4677734375, 0.46875, 0.4697265625, 0.470703125, 0.4716796875, 0.47265625, 0.4736328125, 0.474609375, 0.4755859375, 0.4765625, 0.4775390625, 0.478515625, 0.4794921875, 0.48046875, 0.4814453125, 0.482421875, 0.4833984375, 0.484375, 0.4853515625, 0.486328125, 0.4873046875, 0.48828125, 0.4892578125, 0.490234375, 0.4912109375, 0.4921875, 0.4931640625, 0.494140625, 0.4951171875, 0.49609375, 0.4970703125, 0.498046875, 0.4990234375, 0.5, 0.5009765625, 0.501953125, 0.5029296875, 0.50390625, 0.5048828125, 0.505859375, 0.5068359375, 0.5078125, 0.5087890625, 0.509765625, 0.5107421875, 0.51171875, 0.5126953125, 0.513671875, 0.5146484375, 0.515625, 0.5166015625, 0.517578125, 0.5185546875, 0.51953125, 0.5205078125, 0.521484375, 0.5224609375, 0.5234375, 0.5244140625, 0.525390625, 0.5263671875, 0.52734375, 0.5283203125, 0.529296875, 0.5302734375, 0.53125, 0.5322265625, 0.533203125, 0.5341796875, 0.53515625, 0.5361328125, 0.537109375, 0.5380859375, 0.5390625, 0.5400390625, 0.541015625, 0.5419921875, 0.54296875, 0.5439453125, 0.544921875, 0.5458984375, 0.546875, 0.5478515625, 0.548828125, 0.5498046875, 0.55078125, 0.5517578125, 0.552734375, 0.5537109375, 0.5546875, 0.5556640625, 0.556640625, 0.5576171875, 0.55859375, 0.5595703125, 0.560546875, 0.5615234375, 0.5625, 0.5634765625, 0.564453125, 0.5654296875, 0.56640625, 0.5673828125, 0.568359375, 0.5693359375, 0.5703125, 0.5712890625, 0.572265625, 0.5732421875, 0.57421875, 0.5751953125, 0.576171875, 0.5771484375, 0.578125, 0.5791015625, 0.580078125, 0.5810546875, 0.58203125, 0.5830078125, 0.583984375, 0.5849609375, 0.5859375, 0.5869140625, 0.587890625, 0.5888671875, 0.58984375, 0.5908203125, 0.591796875, 0.5927734375, 0.59375, 0.5947265625, 0.595703125, 0.5966796875, 0.59765625, 0.5986328125, 0.599609375, 0.6005859375, 0.6015625, 0.6025390625, 0.603515625, 0.6044921875, 0.60546875, 0.6064453125, 0.607421875, 0.6083984375, 0.609375, 0.6103515625, 0.611328125, 0.6123046875, 0.61328125, 0.6142578125, 0.615234375, 0.6162109375, 0.6171875, 0.6181640625, 0.619140625, 0.6201171875, 0.62109375, 0.6220703125, 0.623046875, 0.6240234375, 0.625, 0.6259765625, 0.626953125, 0.6279296875, 0.62890625, 0.6298828125, 0.630859375, 0.6318359375, 0.6328125, 0.6337890625, 0.634765625, 0.6357421875, 0.63671875, 0.6376953125, 0.638671875, 0.6396484375, 0.640625, 0.6416015625, 0.642578125, 0.6435546875, 0.64453125, 0.6455078125, 0.646484375, 0.6474609375, 0.6484375, 0.6494140625, 0.650390625, 0.6513671875, 0.65234375, 0.6533203125, 0.654296875, 0.6552734375, 0.65625, 0.6572265625, 0.658203125, 0.6591796875, 0.66015625, 0.6611328125, 0.662109375, 0.6630859375, 0.6640625, 0.6650390625, 0.666015625, 0.6669921875, 0.66796875, 0.6689453125, 0.669921875, 0.6708984375, 0.671875, 0.6728515625, 0.673828125, 0.6748046875, 0.67578125, 0.6767578125, 0.677734375, 0.6787109375, 0.6796875, 0.6806640625, 0.681640625, 0.6826171875, 0.68359375, 0.6845703125, 0.685546875, 0.6865234375, 0.6875, 0.6884765625, 0.689453125, 0.6904296875, 0.69140625, 0.6923828125, 0.693359375, 0.6943359375, 0.6953125, 0.6962890625, 0.697265625, 0.6982421875, 0.69921875, 0.7001953125, 0.701171875, 0.7021484375, 0.703125, 0.7041015625, 0.705078125, 0.7060546875, 0.70703125, 0.7080078125, 0.708984375, 0.7099609375, 0.7109375, 0.7119140625, 0.712890625, 0.7138671875, 0.71484375, 0.7158203125, 0.716796875, 0.7177734375, 0.71875, 0.7197265625, 0.720703125, 0.7216796875, 0.72265625, 0.7236328125, 0.724609375, 0.7255859375, 0.7265625, 0.7275390625, 0.728515625, 0.7294921875, 0.73046875, 0.7314453125, 0.732421875, 0.7333984375, 0.734375, 0.7353515625, 0.736328125, 0.7373046875, 0.73828125, 0.7392578125, 0.740234375, 0.7412109375, 0.7421875, 0.7431640625, 0.744140625, 0.7451171875, 0.74609375, 0.7470703125, 0.748046875, 0.7490234375, 0.75, 0.7509765625, 0.751953125, 0.7529296875, 0.75390625, 0.7548828125, 0.755859375, 0.7568359375, 0.7578125, 0.7587890625, 0.759765625, 0.7607421875, 0.76171875, 0.7626953125, 0.763671875, 0.7646484375, 0.765625, 0.7666015625, 0.767578125, 0.7685546875, 0.76953125, 0.7705078125, 0.771484375, 0.7724609375, 0.7734375, 0.7744140625, 0.775390625, 0.7763671875, 0.77734375, 0.7783203125, 0.779296875, 0.7802734375, 0.78125, 0.7822265625, 0.783203125, 0.7841796875, 0.78515625, 0.7861328125, 0.787109375, 0.7880859375, 0.7890625, 0.7900390625, 0.791015625, 0.7919921875, 0.79296875, 0.7939453125, 0.794921875, 0.7958984375, 0.796875, 0.7978515625, 0.798828125, 0.7998046875, 0.80078125, 0.8017578125, 0.802734375, 0.8037109375, 0.8046875, 0.8056640625, 0.806640625, 0.8076171875, 0.80859375, 0.8095703125, 0.810546875, 0.8115234375, 0.8125, 0.8134765625, 0.814453125, 0.8154296875, 0.81640625, 0.8173828125, 0.818359375, 0.8193359375, 0.8203125, 0.8212890625, 0.822265625, 0.8232421875, 0.82421875, 0.8251953125, 0.826171875, 0.8271484375, 0.828125, 0.8291015625, 0.830078125, 0.8310546875, 0.83203125, 0.8330078125, 0.833984375, 0.8349609375, 0.8359375, 0.8369140625, 0.837890625, 0.8388671875, 0.83984375, 0.8408203125, 0.841796875, 0.8427734375, 0.84375, 0.8447265625, 0.845703125, 0.8466796875, 0.84765625, 0.8486328125, 0.849609375, 0.8505859375, 0.8515625, 0.8525390625, 0.853515625, 0.8544921875, 0.85546875, 0.8564453125, 0.857421875, 0.8583984375, 0.859375, 0.8603515625, 0.861328125, 0.8623046875, 0.86328125, 0.8642578125, 0.865234375, 0.8662109375, 0.8671875, 0.8681640625, 0.869140625, 0.8701171875, 0.87109375, 0.8720703125, 0.873046875, 0.8740234375, 0.875, 0.8759765625, 0.876953125, 0.8779296875, 0.87890625, 0.8798828125, 0.880859375, 0.8818359375, 0.8828125, 0.8837890625, 0.884765625, 0.8857421875, 0.88671875, 0.8876953125, 0.888671875, 0.8896484375, 0.890625, 0.8916015625, 0.892578125, 0.8935546875, 0.89453125, 0.8955078125, 0.896484375, 0.8974609375, 0.8984375, 0.8994140625, 0.900390625, 0.9013671875, 0.90234375, 0.9033203125, 0.904296875, 0.9052734375, 0.90625, 0.9072265625, 0.908203125, 0.9091796875, 0.91015625, 0.9111328125, 0.912109375, 0.9130859375, 0.9140625, 0.9150390625, 0.916015625, 0.9169921875, 0.91796875, 0.9189453125, 0.919921875, 0.9208984375, 0.921875, 0.9228515625, 0.923828125, 0.9248046875, 0.92578125, 0.9267578125, 0.927734375, 0.9287109375, 0.9296875, 0.9306640625, 0.931640625, 0.9326171875, 0.93359375, 0.9345703125, 0.935546875, 0.9365234375, 0.9375, 0.9384765625, 0.939453125, 0.9404296875, 0.94140625, 0.9423828125, 0.943359375, 0.9443359375, 0.9453125, 0.9462890625, 0.947265625, 0.9482421875, 0.94921875, 0.9501953125, 0.951171875, 0.9521484375, 0.953125, 0.9541015625, 0.955078125, 0.9560546875, 0.95703125, 0.9580078125, 0.958984375, 0.9599609375, 0.9609375, 0.9619140625, 0.962890625, 0.9638671875, 0.96484375, 0.9658203125, 0.966796875, 0.9677734375, 0.96875, 0.9697265625, 0.970703125, 0.9716796875, 0.97265625, 0.9736328125, 0.974609375, 0.9755859375, 0.9765625, 0.9775390625, 0.978515625, 0.9794921875, 0.98046875, 0.9814453125, 0.982421875, 0.9833984375, 0.984375, 0.9853515625, 0.986328125, 0.9873046875, 0.98828125, 0.9892578125, 0.990234375, 0.9912109375, 0.9921875, 0.9931640625, 0.994140625, 0.9951171875, 0.99609375, 0.9970703125, 0.998046875, 0.9990234375, 1.0], "decode_rule": "dirichlet_resample", "support_power": 1.0, "semantic_power": 1.0, "anchor_mode": "onehot", "cfg_scale": 0.0, "cfg_power": 1.0, "cfg_start": 0.0, "cfg_prior": "uniform", "decode_freq_penalty_alpha": 0.0, "decode_freq_penalty_beta": 0.0, "decode_freq_penalty_floor": 0.0, "decode_freq_penalty_start": 0.0, "decode_freq_penalty_end": 1.0, "decode_freq_penalty_power": 1.0, "start_t": 0.0, "start_init": "noise", "noise_init": "dirichlet", "noise_sigma": -1.0, "dirichlet_concentration": 1.0, "concentration_min": 1.0, "concentration_max": 128.0, "target_prob": 1.0, "endpoint_temp": 1.55, "final_from": "state", "final_sample_mode": "argmax", "final_sample_temp": 1.0, "final_top_k": 64, "final_top_p": 0.95, "final_freq_penalty_alpha": 0.0, "final_freq_penalty_beta": 0.0, "final_freq_penalty_floor": 0.0, "lock_bos": false, "n_samples": 8, "seed": 20260519}, "raw_genppl": {"ppl": 54.220414987338124, "nll_per_token": 3.993057497809915, "tokens": 2040, "kept_samples": 8, "total_samples": 8, "empty_rate": 0.0, "skipped_samples": 0}, "stripped_genppl": {"ppl": 54.851453523563, "nll_per_token": 4.004628686343922, "tokens": 2040, "kept_samples": 8, "total_samples": 8, "empty_rate": 0.0, "skipped_samples": 0}, "diversity": {"sample_entropy": 3.7687910224042147, "unique_tokens": 2329, "token_count": 8192, "distinct_1": 0.2843017578125, "distinct_2": 0.38428641251221896, "top_token_mass": 0.0831298828125}}
|
| 13 |
+
[done] docs/lta_samples/metrics_20260519/owt_t5_2node_latest_trainmatched_dirres_c128_lowtemp_n8/t1p55.jsonl
|
| 14 |
+
[ckpt] eval_ckpts/lta_owt_t5_adaln_adamw_wd0p1_rollin_p50_randk0_3_uniformt_temp1_synct_gbs512_2node8gpu_1m_t-20260518224737-tftgw/latest_frozen_step101200_20260519_092441.pt step=101000
|
| 15 |
+
[decode-base] n=8 max_len=1024 steps=1024 model_t=post
|
| 16 |
+
[decode-time] schedule=linear s=[0.0,0.25] force_final=True t0=0.000000 t_mid=0.500000 t_end=1.000000 dt_mean=0.000977 dt_max=0.000977
|
| 17 |
+
[decode] temp=1.60 final=state rule=dirichlet_resample support=1 semantic=1 anchor=onehot cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=argmax/1/k64/p0.95 freq_penalty=0/0/0 start_t=0 start_init=noise time_path=0.0000->1.0000 generated 1/8
|
| 18 |
+
[decode] temp=1.60 final=state rule=dirichlet_resample support=1 semantic=1 anchor=onehot cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=argmax/1/k64/p0.95 freq_penalty=0/0/0 start_t=0 start_init=noise time_path=0.0000->1.0000 generated 2/8
|
| 19 |
+
[decode] temp=1.60 final=state rule=dirichlet_resample support=1 semantic=1 anchor=onehot cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=argmax/1/k64/p0.95 freq_penalty=0/0/0 start_t=0 start_init=noise time_path=0.0000->1.0000 generated 3/8
|
| 20 |
+
[decode] temp=1.60 final=state rule=dirichlet_resample support=1 semantic=1 anchor=onehot cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=argmax/1/k64/p0.95 freq_penalty=0/0/0 start_t=0 start_init=noise time_path=0.0000->1.0000 generated 4/8
|
| 21 |
+
[decode] temp=1.60 final=state rule=dirichlet_resample support=1 semantic=1 anchor=onehot cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=argmax/1/k64/p0.95 freq_penalty=0/0/0 start_t=0 start_init=noise time_path=0.0000->1.0000 generated 5/8
|
| 22 |
+
[decode] temp=1.60 final=state rule=dirichlet_resample support=1 semantic=1 anchor=onehot cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=argmax/1/k64/p0.95 freq_penalty=0/0/0 start_t=0 start_init=noise time_path=0.0000->1.0000 generated 6/8
|
| 23 |
+
[decode] temp=1.60 final=state rule=dirichlet_resample support=1 semantic=1 anchor=onehot cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=argmax/1/k64/p0.95 freq_penalty=0/0/0 start_t=0 start_init=noise time_path=0.0000->1.0000 generated 7/8
|
| 24 |
+
[decode] temp=1.60 final=state rule=dirichlet_resample support=1 semantic=1 anchor=onehot cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=argmax/1/k64/p0.95 freq_penalty=0/0/0 start_t=0 start_init=noise time_path=0.0000->1.0000 generated 8/8
|
| 25 |
+
[summary] {"type": "summary", "checkpoint": "eval_ckpts/lta_owt_t5_adaln_adamw_wd0p1_rollin_p50_randk0_3_uniformt_temp1_synct_gbs512_2node8gpu_1m_t-20260518224737-tftgw/latest_frozen_step101200_20260519_092441.pt", "step": 101000, "decode": {"steps": 1024, "model_t_mode": "post", "decode_time_schedule": "linear", "decode_s_min_frac": 0.0, "decode_s_max_frac": 0.25, "decode_force_final_t": true, "decode_time_grid": [0.0, 0.0009765625, 0.001953125, 0.0029296875, 0.00390625, 0.0048828125, 0.005859375, 0.0068359375, 0.0078125, 0.0087890625, 0.009765625, 0.0107421875, 0.01171875, 0.0126953125, 0.013671875, 0.0146484375, 0.015625, 0.0166015625, 0.017578125, 0.0185546875, 0.01953125, 0.0205078125, 0.021484375, 0.0224609375, 0.0234375, 0.0244140625, 0.025390625, 0.0263671875, 0.02734375, 0.0283203125, 0.029296875, 0.0302734375, 0.03125, 0.0322265625, 0.033203125, 0.0341796875, 0.03515625, 0.0361328125, 0.037109375, 0.0380859375, 0.0390625, 0.0400390625, 0.041015625, 0.0419921875, 0.04296875, 0.0439453125, 0.044921875, 0.0458984375, 0.046875, 0.0478515625, 0.048828125, 0.0498046875, 0.05078125, 0.0517578125, 0.052734375, 0.0537109375, 0.0546875, 0.0556640625, 0.056640625, 0.0576171875, 0.05859375, 0.0595703125, 0.060546875, 0.0615234375, 0.0625, 0.0634765625, 0.064453125, 0.0654296875, 0.06640625, 0.0673828125, 0.068359375, 0.0693359375, 0.0703125, 0.0712890625, 0.072265625, 0.0732421875, 0.07421875, 0.0751953125, 0.076171875, 0.0771484375, 0.078125, 0.0791015625, 0.080078125, 0.0810546875, 0.08203125, 0.0830078125, 0.083984375, 0.0849609375, 0.0859375, 0.0869140625, 0.087890625, 0.0888671875, 0.08984375, 0.0908203125, 0.091796875, 0.0927734375, 0.09375, 0.0947265625, 0.095703125, 0.0966796875, 0.09765625, 0.0986328125, 0.099609375, 0.1005859375, 0.1015625, 0.1025390625, 0.103515625, 0.1044921875, 0.10546875, 0.1064453125, 0.107421875, 0.1083984375, 0.109375, 0.1103515625, 0.111328125, 0.1123046875, 0.11328125, 0.1142578125, 0.115234375, 0.1162109375, 0.1171875, 0.1181640625, 0.119140625, 0.1201171875, 0.12109375, 0.1220703125, 0.123046875, 0.1240234375, 0.125, 0.1259765625, 0.126953125, 0.1279296875, 0.12890625, 0.1298828125, 0.130859375, 0.1318359375, 0.1328125, 0.1337890625, 0.134765625, 0.1357421875, 0.13671875, 0.1376953125, 0.138671875, 0.1396484375, 0.140625, 0.1416015625, 0.142578125, 0.1435546875, 0.14453125, 0.1455078125, 0.146484375, 0.1474609375, 0.1484375, 0.1494140625, 0.150390625, 0.1513671875, 0.15234375, 0.1533203125, 0.154296875, 0.1552734375, 0.15625, 0.1572265625, 0.158203125, 0.1591796875, 0.16015625, 0.1611328125, 0.162109375, 0.1630859375, 0.1640625, 0.1650390625, 0.166015625, 0.1669921875, 0.16796875, 0.1689453125, 0.169921875, 0.1708984375, 0.171875, 0.1728515625, 0.173828125, 0.1748046875, 0.17578125, 0.1767578125, 0.177734375, 0.1787109375, 0.1796875, 0.1806640625, 0.181640625, 0.1826171875, 0.18359375, 0.1845703125, 0.185546875, 0.1865234375, 0.1875, 0.1884765625, 0.189453125, 0.1904296875, 0.19140625, 0.1923828125, 0.193359375, 0.1943359375, 0.1953125, 0.1962890625, 0.197265625, 0.1982421875, 0.19921875, 0.2001953125, 0.201171875, 0.2021484375, 0.203125, 0.2041015625, 0.205078125, 0.2060546875, 0.20703125, 0.2080078125, 0.208984375, 0.2099609375, 0.2109375, 0.2119140625, 0.212890625, 0.2138671875, 0.21484375, 0.2158203125, 0.216796875, 0.2177734375, 0.21875, 0.2197265625, 0.220703125, 0.2216796875, 0.22265625, 0.2236328125, 0.224609375, 0.2255859375, 0.2265625, 0.2275390625, 0.228515625, 0.2294921875, 0.23046875, 0.2314453125, 0.232421875, 0.2333984375, 0.234375, 0.2353515625, 0.236328125, 0.2373046875, 0.23828125, 0.2392578125, 0.240234375, 0.2412109375, 0.2421875, 0.2431640625, 0.244140625, 0.2451171875, 0.24609375, 0.2470703125, 0.248046875, 0.2490234375, 0.25, 0.2509765625, 0.251953125, 0.2529296875, 0.25390625, 0.2548828125, 0.255859375, 0.2568359375, 0.2578125, 0.2587890625, 0.259765625, 0.2607421875, 0.26171875, 0.2626953125, 0.263671875, 0.2646484375, 0.265625, 0.2666015625, 0.267578125, 0.2685546875, 0.26953125, 0.2705078125, 0.271484375, 0.2724609375, 0.2734375, 0.2744140625, 0.275390625, 0.2763671875, 0.27734375, 0.2783203125, 0.279296875, 0.2802734375, 0.28125, 0.2822265625, 0.283203125, 0.2841796875, 0.28515625, 0.2861328125, 0.287109375, 0.2880859375, 0.2890625, 0.2900390625, 0.291015625, 0.2919921875, 0.29296875, 0.2939453125, 0.294921875, 0.2958984375, 0.296875, 0.2978515625, 0.298828125, 0.2998046875, 0.30078125, 0.3017578125, 0.302734375, 0.3037109375, 0.3046875, 0.3056640625, 0.306640625, 0.3076171875, 0.30859375, 0.3095703125, 0.310546875, 0.3115234375, 0.3125, 0.3134765625, 0.314453125, 0.3154296875, 0.31640625, 0.3173828125, 0.318359375, 0.3193359375, 0.3203125, 0.3212890625, 0.322265625, 0.3232421875, 0.32421875, 0.3251953125, 0.326171875, 0.3271484375, 0.328125, 0.3291015625, 0.330078125, 0.3310546875, 0.33203125, 0.3330078125, 0.333984375, 0.3349609375, 0.3359375, 0.3369140625, 0.337890625, 0.3388671875, 0.33984375, 0.3408203125, 0.341796875, 0.3427734375, 0.34375, 0.3447265625, 0.345703125, 0.3466796875, 0.34765625, 0.3486328125, 0.349609375, 0.3505859375, 0.3515625, 0.3525390625, 0.353515625, 0.3544921875, 0.35546875, 0.3564453125, 0.357421875, 0.3583984375, 0.359375, 0.3603515625, 0.361328125, 0.3623046875, 0.36328125, 0.3642578125, 0.365234375, 0.3662109375, 0.3671875, 0.3681640625, 0.369140625, 0.3701171875, 0.37109375, 0.3720703125, 0.373046875, 0.3740234375, 0.375, 0.3759765625, 0.376953125, 0.3779296875, 0.37890625, 0.3798828125, 0.380859375, 0.3818359375, 0.3828125, 0.3837890625, 0.384765625, 0.3857421875, 0.38671875, 0.3876953125, 0.388671875, 0.3896484375, 0.390625, 0.3916015625, 0.392578125, 0.3935546875, 0.39453125, 0.3955078125, 0.396484375, 0.3974609375, 0.3984375, 0.3994140625, 0.400390625, 0.4013671875, 0.40234375, 0.4033203125, 0.404296875, 0.4052734375, 0.40625, 0.4072265625, 0.408203125, 0.4091796875, 0.41015625, 0.4111328125, 0.412109375, 0.4130859375, 0.4140625, 0.4150390625, 0.416015625, 0.4169921875, 0.41796875, 0.4189453125, 0.419921875, 0.4208984375, 0.421875, 0.4228515625, 0.423828125, 0.4248046875, 0.42578125, 0.4267578125, 0.427734375, 0.4287109375, 0.4296875, 0.4306640625, 0.431640625, 0.4326171875, 0.43359375, 0.4345703125, 0.435546875, 0.4365234375, 0.4375, 0.4384765625, 0.439453125, 0.4404296875, 0.44140625, 0.4423828125, 0.443359375, 0.4443359375, 0.4453125, 0.4462890625, 0.447265625, 0.4482421875, 0.44921875, 0.4501953125, 0.451171875, 0.4521484375, 0.453125, 0.4541015625, 0.455078125, 0.4560546875, 0.45703125, 0.4580078125, 0.458984375, 0.4599609375, 0.4609375, 0.4619140625, 0.462890625, 0.4638671875, 0.46484375, 0.4658203125, 0.466796875, 0.4677734375, 0.46875, 0.4697265625, 0.470703125, 0.4716796875, 0.47265625, 0.4736328125, 0.474609375, 0.4755859375, 0.4765625, 0.4775390625, 0.478515625, 0.4794921875, 0.48046875, 0.4814453125, 0.482421875, 0.4833984375, 0.484375, 0.4853515625, 0.486328125, 0.4873046875, 0.48828125, 0.4892578125, 0.490234375, 0.4912109375, 0.4921875, 0.4931640625, 0.494140625, 0.4951171875, 0.49609375, 0.4970703125, 0.498046875, 0.4990234375, 0.5, 0.5009765625, 0.501953125, 0.5029296875, 0.50390625, 0.5048828125, 0.505859375, 0.5068359375, 0.5078125, 0.5087890625, 0.509765625, 0.5107421875, 0.51171875, 0.5126953125, 0.513671875, 0.5146484375, 0.515625, 0.5166015625, 0.517578125, 0.5185546875, 0.51953125, 0.5205078125, 0.521484375, 0.5224609375, 0.5234375, 0.5244140625, 0.525390625, 0.5263671875, 0.52734375, 0.5283203125, 0.529296875, 0.5302734375, 0.53125, 0.5322265625, 0.533203125, 0.5341796875, 0.53515625, 0.5361328125, 0.537109375, 0.5380859375, 0.5390625, 0.5400390625, 0.541015625, 0.5419921875, 0.54296875, 0.5439453125, 0.544921875, 0.5458984375, 0.546875, 0.5478515625, 0.548828125, 0.5498046875, 0.55078125, 0.5517578125, 0.552734375, 0.5537109375, 0.5546875, 0.5556640625, 0.556640625, 0.5576171875, 0.55859375, 0.5595703125, 0.560546875, 0.5615234375, 0.5625, 0.5634765625, 0.564453125, 0.5654296875, 0.56640625, 0.5673828125, 0.568359375, 0.5693359375, 0.5703125, 0.5712890625, 0.572265625, 0.5732421875, 0.57421875, 0.5751953125, 0.576171875, 0.5771484375, 0.578125, 0.5791015625, 0.580078125, 0.5810546875, 0.58203125, 0.5830078125, 0.583984375, 0.5849609375, 0.5859375, 0.5869140625, 0.587890625, 0.5888671875, 0.58984375, 0.5908203125, 0.591796875, 0.5927734375, 0.59375, 0.5947265625, 0.595703125, 0.5966796875, 0.59765625, 0.5986328125, 0.599609375, 0.6005859375, 0.6015625, 0.6025390625, 0.603515625, 0.6044921875, 0.60546875, 0.6064453125, 0.607421875, 0.6083984375, 0.609375, 0.6103515625, 0.611328125, 0.6123046875, 0.61328125, 0.6142578125, 0.615234375, 0.6162109375, 0.6171875, 0.6181640625, 0.619140625, 0.6201171875, 0.62109375, 0.6220703125, 0.623046875, 0.6240234375, 0.625, 0.6259765625, 0.626953125, 0.6279296875, 0.62890625, 0.6298828125, 0.630859375, 0.6318359375, 0.6328125, 0.6337890625, 0.634765625, 0.6357421875, 0.63671875, 0.6376953125, 0.638671875, 0.6396484375, 0.640625, 0.6416015625, 0.642578125, 0.6435546875, 0.64453125, 0.6455078125, 0.646484375, 0.6474609375, 0.6484375, 0.6494140625, 0.650390625, 0.6513671875, 0.65234375, 0.6533203125, 0.654296875, 0.6552734375, 0.65625, 0.6572265625, 0.658203125, 0.6591796875, 0.66015625, 0.6611328125, 0.662109375, 0.6630859375, 0.6640625, 0.6650390625, 0.666015625, 0.6669921875, 0.66796875, 0.6689453125, 0.669921875, 0.6708984375, 0.671875, 0.6728515625, 0.673828125, 0.6748046875, 0.67578125, 0.6767578125, 0.677734375, 0.6787109375, 0.6796875, 0.6806640625, 0.681640625, 0.6826171875, 0.68359375, 0.6845703125, 0.685546875, 0.6865234375, 0.6875, 0.6884765625, 0.689453125, 0.6904296875, 0.69140625, 0.6923828125, 0.693359375, 0.6943359375, 0.6953125, 0.6962890625, 0.697265625, 0.6982421875, 0.69921875, 0.7001953125, 0.701171875, 0.7021484375, 0.703125, 0.7041015625, 0.705078125, 0.7060546875, 0.70703125, 0.7080078125, 0.708984375, 0.7099609375, 0.7109375, 0.7119140625, 0.712890625, 0.7138671875, 0.71484375, 0.7158203125, 0.716796875, 0.7177734375, 0.71875, 0.7197265625, 0.720703125, 0.7216796875, 0.72265625, 0.7236328125, 0.724609375, 0.7255859375, 0.7265625, 0.7275390625, 0.728515625, 0.7294921875, 0.73046875, 0.7314453125, 0.732421875, 0.7333984375, 0.734375, 0.7353515625, 0.736328125, 0.7373046875, 0.73828125, 0.7392578125, 0.740234375, 0.7412109375, 0.7421875, 0.7431640625, 0.744140625, 0.7451171875, 0.74609375, 0.7470703125, 0.748046875, 0.7490234375, 0.75, 0.7509765625, 0.751953125, 0.7529296875, 0.75390625, 0.7548828125, 0.755859375, 0.7568359375, 0.7578125, 0.7587890625, 0.759765625, 0.7607421875, 0.76171875, 0.7626953125, 0.763671875, 0.7646484375, 0.765625, 0.7666015625, 0.767578125, 0.7685546875, 0.76953125, 0.7705078125, 0.771484375, 0.7724609375, 0.7734375, 0.7744140625, 0.775390625, 0.7763671875, 0.77734375, 0.7783203125, 0.779296875, 0.7802734375, 0.78125, 0.7822265625, 0.783203125, 0.7841796875, 0.78515625, 0.7861328125, 0.787109375, 0.7880859375, 0.7890625, 0.7900390625, 0.791015625, 0.7919921875, 0.79296875, 0.7939453125, 0.794921875, 0.7958984375, 0.796875, 0.7978515625, 0.798828125, 0.7998046875, 0.80078125, 0.8017578125, 0.802734375, 0.8037109375, 0.8046875, 0.8056640625, 0.806640625, 0.8076171875, 0.80859375, 0.8095703125, 0.810546875, 0.8115234375, 0.8125, 0.8134765625, 0.814453125, 0.8154296875, 0.81640625, 0.8173828125, 0.818359375, 0.8193359375, 0.8203125, 0.8212890625, 0.822265625, 0.8232421875, 0.82421875, 0.8251953125, 0.826171875, 0.8271484375, 0.828125, 0.8291015625, 0.830078125, 0.8310546875, 0.83203125, 0.8330078125, 0.833984375, 0.8349609375, 0.8359375, 0.8369140625, 0.837890625, 0.8388671875, 0.83984375, 0.8408203125, 0.841796875, 0.8427734375, 0.84375, 0.8447265625, 0.845703125, 0.8466796875, 0.84765625, 0.8486328125, 0.849609375, 0.8505859375, 0.8515625, 0.8525390625, 0.853515625, 0.8544921875, 0.85546875, 0.8564453125, 0.857421875, 0.8583984375, 0.859375, 0.8603515625, 0.861328125, 0.8623046875, 0.86328125, 0.8642578125, 0.865234375, 0.8662109375, 0.8671875, 0.8681640625, 0.869140625, 0.8701171875, 0.87109375, 0.8720703125, 0.873046875, 0.8740234375, 0.875, 0.8759765625, 0.876953125, 0.8779296875, 0.87890625, 0.8798828125, 0.880859375, 0.8818359375, 0.8828125, 0.8837890625, 0.884765625, 0.8857421875, 0.88671875, 0.8876953125, 0.888671875, 0.8896484375, 0.890625, 0.8916015625, 0.892578125, 0.8935546875, 0.89453125, 0.8955078125, 0.896484375, 0.8974609375, 0.8984375, 0.8994140625, 0.900390625, 0.9013671875, 0.90234375, 0.9033203125, 0.904296875, 0.9052734375, 0.90625, 0.9072265625, 0.908203125, 0.9091796875, 0.91015625, 0.9111328125, 0.912109375, 0.9130859375, 0.9140625, 0.9150390625, 0.916015625, 0.9169921875, 0.91796875, 0.9189453125, 0.919921875, 0.9208984375, 0.921875, 0.9228515625, 0.923828125, 0.9248046875, 0.92578125, 0.9267578125, 0.927734375, 0.9287109375, 0.9296875, 0.9306640625, 0.931640625, 0.9326171875, 0.93359375, 0.9345703125, 0.935546875, 0.9365234375, 0.9375, 0.9384765625, 0.939453125, 0.9404296875, 0.94140625, 0.9423828125, 0.943359375, 0.9443359375, 0.9453125, 0.9462890625, 0.947265625, 0.9482421875, 0.94921875, 0.9501953125, 0.951171875, 0.9521484375, 0.953125, 0.9541015625, 0.955078125, 0.9560546875, 0.95703125, 0.9580078125, 0.958984375, 0.9599609375, 0.9609375, 0.9619140625, 0.962890625, 0.9638671875, 0.96484375, 0.9658203125, 0.966796875, 0.9677734375, 0.96875, 0.9697265625, 0.970703125, 0.9716796875, 0.97265625, 0.9736328125, 0.974609375, 0.9755859375, 0.9765625, 0.9775390625, 0.978515625, 0.9794921875, 0.98046875, 0.9814453125, 0.982421875, 0.9833984375, 0.984375, 0.9853515625, 0.986328125, 0.9873046875, 0.98828125, 0.9892578125, 0.990234375, 0.9912109375, 0.9921875, 0.9931640625, 0.994140625, 0.9951171875, 0.99609375, 0.9970703125, 0.998046875, 0.9990234375, 1.0], "decode_rule": "dirichlet_resample", "support_power": 1.0, "semantic_power": 1.0, "anchor_mode": "onehot", "cfg_scale": 0.0, "cfg_power": 1.0, "cfg_start": 0.0, "cfg_prior": "uniform", "decode_freq_penalty_alpha": 0.0, "decode_freq_penalty_beta": 0.0, "decode_freq_penalty_floor": 0.0, "decode_freq_penalty_start": 0.0, "decode_freq_penalty_end": 1.0, "decode_freq_penalty_power": 1.0, "start_t": 0.0, "start_init": "noise", "noise_init": "dirichlet", "noise_sigma": -1.0, "dirichlet_concentration": 1.0, "concentration_min": 1.0, "concentration_max": 128.0, "target_prob": 1.0, "endpoint_temp": 1.6, "final_from": "state", "final_sample_mode": "argmax", "final_sample_temp": 1.0, "final_top_k": 64, "final_top_p": 0.95, "final_freq_penalty_alpha": 0.0, "final_freq_penalty_beta": 0.0, "final_freq_penalty_floor": 0.0, "lock_bos": false, "n_samples": 8, "seed": 20260519}, "raw_genppl": {"ppl": 6.049422227527874, "nll_per_token": 1.799962767900205, "tokens": 2040, "kept_samples": 8, "total_samples": 8, "empty_rate": 0.0, "skipped_samples": 0}, "stripped_genppl": {"ppl": 6.08581982781651, "nll_per_token": 1.8059614466685874, "tokens": 2040, "kept_samples": 8, "total_samples": 8, "empty_rate": 0.0, "skipped_samples": 0}, "diversity": {"sample_entropy": 2.3043278884277236, "unique_tokens": 915, "token_count": 8192, "distinct_1": 0.1116943359375, "distinct_2": 0.13734115347018572, "top_token_mass": 0.125}}
|
| 26 |
+
[done] docs/lta_samples/metrics_20260519/owt_t5_2node_latest_trainmatched_dirres_c128_lowtemp_n8/t1p60.jsonl
|
| 27 |
+
[ckpt] eval_ckpts/lta_owt_t5_adaln_adamw_wd0p1_rollin_p50_randk0_3_uniformt_temp1_synct_gbs512_2node8gpu_1m_t-20260518224737-tftgw/latest_frozen_step101200_20260519_092441.pt step=101000
|
| 28 |
+
[decode-base] n=8 max_len=1024 steps=1024 model_t=post
|
| 29 |
+
[decode-time] schedule=linear s=[0.0,0.25] force_final=True t0=0.000000 t_mid=0.500000 t_end=1.000000 dt_mean=0.000977 dt_max=0.000977
|
| 30 |
+
[decode] temp=1.65 final=state rule=dirichlet_resample support=1 semantic=1 anchor=onehot cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=argmax/1/k64/p0.95 freq_penalty=0/0/0 start_t=0 start_init=noise time_path=0.0000->1.0000 generated 1/8
|
| 31 |
+
[decode] temp=1.65 final=state rule=dirichlet_resample support=1 semantic=1 anchor=onehot cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=argmax/1/k64/p0.95 freq_penalty=0/0/0 start_t=0 start_init=noise time_path=0.0000->1.0000 generated 2/8
|
| 32 |
+
[decode] temp=1.65 final=state rule=dirichlet_resample support=1 semantic=1 anchor=onehot cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=argmax/1/k64/p0.95 freq_penalty=0/0/0 start_t=0 start_init=noise time_path=0.0000->1.0000 generated 3/8
|
| 33 |
+
[decode] temp=1.65 final=state rule=dirichlet_resample support=1 semantic=1 anchor=onehot cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=argmax/1/k64/p0.95 freq_penalty=0/0/0 start_t=0 start_init=noise time_path=0.0000->1.0000 generated 4/8
|
| 34 |
+
[decode] temp=1.65 final=state rule=dirichlet_resample support=1 semantic=1 anchor=onehot cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=argmax/1/k64/p0.95 freq_penalty=0/0/0 start_t=0 start_init=noise time_path=0.0000->1.0000 generated 5/8
|
| 35 |
+
[decode] temp=1.65 final=state rule=dirichlet_resample support=1 semantic=1 anchor=onehot cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=argmax/1/k64/p0.95 freq_penalty=0/0/0 start_t=0 start_init=noise time_path=0.0000->1.0000 generated 6/8
|
| 36 |
+
[decode] temp=1.65 final=state rule=dirichlet_resample support=1 semantic=1 anchor=onehot cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=argmax/1/k64/p0.95 freq_penalty=0/0/0 start_t=0 start_init=noise time_path=0.0000->1.0000 generated 7/8
|
| 37 |
+
[decode] temp=1.65 final=state rule=dirichlet_resample support=1 semantic=1 anchor=onehot cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=argmax/1/k64/p0.95 freq_penalty=0/0/0 start_t=0 start_init=noise time_path=0.0000->1.0000 generated 8/8
|
| 38 |
+
[summary] {"type": "summary", "checkpoint": "eval_ckpts/lta_owt_t5_adaln_adamw_wd0p1_rollin_p50_randk0_3_uniformt_temp1_synct_gbs512_2node8gpu_1m_t-20260518224737-tftgw/latest_frozen_step101200_20260519_092441.pt", "step": 101000, "decode": {"steps": 1024, "model_t_mode": "post", "decode_time_schedule": "linear", "decode_s_min_frac": 0.0, "decode_s_max_frac": 0.25, "decode_force_final_t": true, "decode_time_grid": [0.0, 0.0009765625, 0.001953125, 0.0029296875, 0.00390625, 0.0048828125, 0.005859375, 0.0068359375, 0.0078125, 0.0087890625, 0.009765625, 0.0107421875, 0.01171875, 0.0126953125, 0.013671875, 0.0146484375, 0.015625, 0.0166015625, 0.017578125, 0.0185546875, 0.01953125, 0.0205078125, 0.021484375, 0.0224609375, 0.0234375, 0.0244140625, 0.025390625, 0.0263671875, 0.02734375, 0.0283203125, 0.029296875, 0.0302734375, 0.03125, 0.0322265625, 0.033203125, 0.0341796875, 0.03515625, 0.0361328125, 0.037109375, 0.0380859375, 0.0390625, 0.0400390625, 0.041015625, 0.0419921875, 0.04296875, 0.0439453125, 0.044921875, 0.0458984375, 0.046875, 0.0478515625, 0.048828125, 0.0498046875, 0.05078125, 0.0517578125, 0.052734375, 0.0537109375, 0.0546875, 0.0556640625, 0.056640625, 0.0576171875, 0.05859375, 0.0595703125, 0.060546875, 0.0615234375, 0.0625, 0.0634765625, 0.064453125, 0.0654296875, 0.06640625, 0.0673828125, 0.068359375, 0.0693359375, 0.0703125, 0.0712890625, 0.072265625, 0.0732421875, 0.07421875, 0.0751953125, 0.076171875, 0.0771484375, 0.078125, 0.0791015625, 0.080078125, 0.0810546875, 0.08203125, 0.0830078125, 0.083984375, 0.0849609375, 0.0859375, 0.0869140625, 0.087890625, 0.0888671875, 0.08984375, 0.0908203125, 0.091796875, 0.0927734375, 0.09375, 0.0947265625, 0.095703125, 0.0966796875, 0.09765625, 0.0986328125, 0.099609375, 0.1005859375, 0.1015625, 0.1025390625, 0.103515625, 0.1044921875, 0.10546875, 0.1064453125, 0.107421875, 0.1083984375, 0.109375, 0.1103515625, 0.111328125, 0.1123046875, 0.11328125, 0.1142578125, 0.115234375, 0.1162109375, 0.1171875, 0.1181640625, 0.119140625, 0.1201171875, 0.12109375, 0.1220703125, 0.123046875, 0.1240234375, 0.125, 0.1259765625, 0.126953125, 0.1279296875, 0.12890625, 0.1298828125, 0.130859375, 0.1318359375, 0.1328125, 0.1337890625, 0.134765625, 0.1357421875, 0.13671875, 0.1376953125, 0.138671875, 0.1396484375, 0.140625, 0.1416015625, 0.142578125, 0.1435546875, 0.14453125, 0.1455078125, 0.146484375, 0.1474609375, 0.1484375, 0.1494140625, 0.150390625, 0.1513671875, 0.15234375, 0.1533203125, 0.154296875, 0.1552734375, 0.15625, 0.1572265625, 0.158203125, 0.1591796875, 0.16015625, 0.1611328125, 0.162109375, 0.1630859375, 0.1640625, 0.1650390625, 0.166015625, 0.1669921875, 0.16796875, 0.1689453125, 0.169921875, 0.1708984375, 0.171875, 0.1728515625, 0.173828125, 0.1748046875, 0.17578125, 0.1767578125, 0.177734375, 0.1787109375, 0.1796875, 0.1806640625, 0.181640625, 0.1826171875, 0.18359375, 0.1845703125, 0.185546875, 0.1865234375, 0.1875, 0.1884765625, 0.189453125, 0.1904296875, 0.19140625, 0.1923828125, 0.193359375, 0.1943359375, 0.1953125, 0.1962890625, 0.197265625, 0.1982421875, 0.19921875, 0.2001953125, 0.201171875, 0.2021484375, 0.203125, 0.2041015625, 0.205078125, 0.2060546875, 0.20703125, 0.2080078125, 0.208984375, 0.2099609375, 0.2109375, 0.2119140625, 0.212890625, 0.2138671875, 0.21484375, 0.2158203125, 0.216796875, 0.2177734375, 0.21875, 0.2197265625, 0.220703125, 0.2216796875, 0.22265625, 0.2236328125, 0.224609375, 0.2255859375, 0.2265625, 0.2275390625, 0.228515625, 0.2294921875, 0.23046875, 0.2314453125, 0.232421875, 0.2333984375, 0.234375, 0.2353515625, 0.236328125, 0.2373046875, 0.23828125, 0.2392578125, 0.240234375, 0.2412109375, 0.2421875, 0.2431640625, 0.244140625, 0.2451171875, 0.24609375, 0.2470703125, 0.248046875, 0.2490234375, 0.25, 0.2509765625, 0.251953125, 0.2529296875, 0.25390625, 0.2548828125, 0.255859375, 0.2568359375, 0.2578125, 0.2587890625, 0.259765625, 0.2607421875, 0.26171875, 0.2626953125, 0.263671875, 0.2646484375, 0.265625, 0.2666015625, 0.267578125, 0.2685546875, 0.26953125, 0.2705078125, 0.271484375, 0.2724609375, 0.2734375, 0.2744140625, 0.275390625, 0.2763671875, 0.27734375, 0.2783203125, 0.279296875, 0.2802734375, 0.28125, 0.2822265625, 0.283203125, 0.2841796875, 0.28515625, 0.2861328125, 0.287109375, 0.2880859375, 0.2890625, 0.2900390625, 0.291015625, 0.2919921875, 0.29296875, 0.2939453125, 0.294921875, 0.2958984375, 0.296875, 0.2978515625, 0.298828125, 0.2998046875, 0.30078125, 0.3017578125, 0.302734375, 0.3037109375, 0.3046875, 0.3056640625, 0.306640625, 0.3076171875, 0.30859375, 0.3095703125, 0.310546875, 0.3115234375, 0.3125, 0.3134765625, 0.314453125, 0.3154296875, 0.31640625, 0.3173828125, 0.318359375, 0.3193359375, 0.3203125, 0.3212890625, 0.322265625, 0.3232421875, 0.32421875, 0.3251953125, 0.326171875, 0.3271484375, 0.328125, 0.3291015625, 0.330078125, 0.3310546875, 0.33203125, 0.3330078125, 0.333984375, 0.3349609375, 0.3359375, 0.3369140625, 0.337890625, 0.3388671875, 0.33984375, 0.3408203125, 0.341796875, 0.3427734375, 0.34375, 0.3447265625, 0.345703125, 0.3466796875, 0.34765625, 0.3486328125, 0.349609375, 0.3505859375, 0.3515625, 0.3525390625, 0.353515625, 0.3544921875, 0.35546875, 0.3564453125, 0.357421875, 0.3583984375, 0.359375, 0.3603515625, 0.361328125, 0.3623046875, 0.36328125, 0.3642578125, 0.365234375, 0.3662109375, 0.3671875, 0.3681640625, 0.369140625, 0.3701171875, 0.37109375, 0.3720703125, 0.373046875, 0.3740234375, 0.375, 0.3759765625, 0.376953125, 0.3779296875, 0.37890625, 0.3798828125, 0.380859375, 0.3818359375, 0.3828125, 0.3837890625, 0.384765625, 0.3857421875, 0.38671875, 0.3876953125, 0.388671875, 0.3896484375, 0.390625, 0.3916015625, 0.392578125, 0.3935546875, 0.39453125, 0.3955078125, 0.396484375, 0.3974609375, 0.3984375, 0.3994140625, 0.400390625, 0.4013671875, 0.40234375, 0.4033203125, 0.404296875, 0.4052734375, 0.40625, 0.4072265625, 0.408203125, 0.4091796875, 0.41015625, 0.4111328125, 0.412109375, 0.4130859375, 0.4140625, 0.4150390625, 0.416015625, 0.4169921875, 0.41796875, 0.4189453125, 0.419921875, 0.4208984375, 0.421875, 0.4228515625, 0.423828125, 0.4248046875, 0.42578125, 0.4267578125, 0.427734375, 0.4287109375, 0.4296875, 0.4306640625, 0.431640625, 0.4326171875, 0.43359375, 0.4345703125, 0.435546875, 0.4365234375, 0.4375, 0.4384765625, 0.439453125, 0.4404296875, 0.44140625, 0.4423828125, 0.443359375, 0.4443359375, 0.4453125, 0.4462890625, 0.447265625, 0.4482421875, 0.44921875, 0.4501953125, 0.451171875, 0.4521484375, 0.453125, 0.4541015625, 0.455078125, 0.4560546875, 0.45703125, 0.4580078125, 0.458984375, 0.4599609375, 0.4609375, 0.4619140625, 0.462890625, 0.4638671875, 0.46484375, 0.4658203125, 0.466796875, 0.4677734375, 0.46875, 0.4697265625, 0.470703125, 0.4716796875, 0.47265625, 0.4736328125, 0.474609375, 0.4755859375, 0.4765625, 0.4775390625, 0.478515625, 0.4794921875, 0.48046875, 0.4814453125, 0.482421875, 0.4833984375, 0.484375, 0.4853515625, 0.486328125, 0.4873046875, 0.48828125, 0.4892578125, 0.490234375, 0.4912109375, 0.4921875, 0.4931640625, 0.494140625, 0.4951171875, 0.49609375, 0.4970703125, 0.498046875, 0.4990234375, 0.5, 0.5009765625, 0.501953125, 0.5029296875, 0.50390625, 0.5048828125, 0.505859375, 0.5068359375, 0.5078125, 0.5087890625, 0.509765625, 0.5107421875, 0.51171875, 0.5126953125, 0.513671875, 0.5146484375, 0.515625, 0.5166015625, 0.517578125, 0.5185546875, 0.51953125, 0.5205078125, 0.521484375, 0.5224609375, 0.5234375, 0.5244140625, 0.525390625, 0.5263671875, 0.52734375, 0.5283203125, 0.529296875, 0.5302734375, 0.53125, 0.5322265625, 0.533203125, 0.5341796875, 0.53515625, 0.5361328125, 0.537109375, 0.5380859375, 0.5390625, 0.5400390625, 0.541015625, 0.5419921875, 0.54296875, 0.5439453125, 0.544921875, 0.5458984375, 0.546875, 0.5478515625, 0.548828125, 0.5498046875, 0.55078125, 0.5517578125, 0.552734375, 0.5537109375, 0.5546875, 0.5556640625, 0.556640625, 0.5576171875, 0.55859375, 0.5595703125, 0.560546875, 0.5615234375, 0.5625, 0.5634765625, 0.564453125, 0.5654296875, 0.56640625, 0.5673828125, 0.568359375, 0.5693359375, 0.5703125, 0.5712890625, 0.572265625, 0.5732421875, 0.57421875, 0.5751953125, 0.576171875, 0.5771484375, 0.578125, 0.5791015625, 0.580078125, 0.5810546875, 0.58203125, 0.5830078125, 0.583984375, 0.5849609375, 0.5859375, 0.5869140625, 0.587890625, 0.5888671875, 0.58984375, 0.5908203125, 0.591796875, 0.5927734375, 0.59375, 0.5947265625, 0.595703125, 0.5966796875, 0.59765625, 0.5986328125, 0.599609375, 0.6005859375, 0.6015625, 0.6025390625, 0.603515625, 0.6044921875, 0.60546875, 0.6064453125, 0.607421875, 0.6083984375, 0.609375, 0.6103515625, 0.611328125, 0.6123046875, 0.61328125, 0.6142578125, 0.615234375, 0.6162109375, 0.6171875, 0.6181640625, 0.619140625, 0.6201171875, 0.62109375, 0.6220703125, 0.623046875, 0.6240234375, 0.625, 0.6259765625, 0.626953125, 0.6279296875, 0.62890625, 0.6298828125, 0.630859375, 0.6318359375, 0.6328125, 0.6337890625, 0.634765625, 0.6357421875, 0.63671875, 0.6376953125, 0.638671875, 0.6396484375, 0.640625, 0.6416015625, 0.642578125, 0.6435546875, 0.64453125, 0.6455078125, 0.646484375, 0.6474609375, 0.6484375, 0.6494140625, 0.650390625, 0.6513671875, 0.65234375, 0.6533203125, 0.654296875, 0.6552734375, 0.65625, 0.6572265625, 0.658203125, 0.6591796875, 0.66015625, 0.6611328125, 0.662109375, 0.6630859375, 0.6640625, 0.6650390625, 0.666015625, 0.6669921875, 0.66796875, 0.6689453125, 0.669921875, 0.6708984375, 0.671875, 0.6728515625, 0.673828125, 0.6748046875, 0.67578125, 0.6767578125, 0.677734375, 0.6787109375, 0.6796875, 0.6806640625, 0.681640625, 0.6826171875, 0.68359375, 0.6845703125, 0.685546875, 0.6865234375, 0.6875, 0.6884765625, 0.689453125, 0.6904296875, 0.69140625, 0.6923828125, 0.693359375, 0.6943359375, 0.6953125, 0.6962890625, 0.697265625, 0.6982421875, 0.69921875, 0.7001953125, 0.701171875, 0.7021484375, 0.703125, 0.7041015625, 0.705078125, 0.7060546875, 0.70703125, 0.7080078125, 0.708984375, 0.7099609375, 0.7109375, 0.7119140625, 0.712890625, 0.7138671875, 0.71484375, 0.7158203125, 0.716796875, 0.7177734375, 0.71875, 0.7197265625, 0.720703125, 0.7216796875, 0.72265625, 0.7236328125, 0.724609375, 0.7255859375, 0.7265625, 0.7275390625, 0.728515625, 0.7294921875, 0.73046875, 0.7314453125, 0.732421875, 0.7333984375, 0.734375, 0.7353515625, 0.736328125, 0.7373046875, 0.73828125, 0.7392578125, 0.740234375, 0.7412109375, 0.7421875, 0.7431640625, 0.744140625, 0.7451171875, 0.74609375, 0.7470703125, 0.748046875, 0.7490234375, 0.75, 0.7509765625, 0.751953125, 0.7529296875, 0.75390625, 0.7548828125, 0.755859375, 0.7568359375, 0.7578125, 0.7587890625, 0.759765625, 0.7607421875, 0.76171875, 0.7626953125, 0.763671875, 0.7646484375, 0.765625, 0.7666015625, 0.767578125, 0.7685546875, 0.76953125, 0.7705078125, 0.771484375, 0.7724609375, 0.7734375, 0.7744140625, 0.775390625, 0.7763671875, 0.77734375, 0.7783203125, 0.779296875, 0.7802734375, 0.78125, 0.7822265625, 0.783203125, 0.7841796875, 0.78515625, 0.7861328125, 0.787109375, 0.7880859375, 0.7890625, 0.7900390625, 0.791015625, 0.7919921875, 0.79296875, 0.7939453125, 0.794921875, 0.7958984375, 0.796875, 0.7978515625, 0.798828125, 0.7998046875, 0.80078125, 0.8017578125, 0.802734375, 0.8037109375, 0.8046875, 0.8056640625, 0.806640625, 0.8076171875, 0.80859375, 0.8095703125, 0.810546875, 0.8115234375, 0.8125, 0.8134765625, 0.814453125, 0.8154296875, 0.81640625, 0.8173828125, 0.818359375, 0.8193359375, 0.8203125, 0.8212890625, 0.822265625, 0.8232421875, 0.82421875, 0.8251953125, 0.826171875, 0.8271484375, 0.828125, 0.8291015625, 0.830078125, 0.8310546875, 0.83203125, 0.8330078125, 0.833984375, 0.8349609375, 0.8359375, 0.8369140625, 0.837890625, 0.8388671875, 0.83984375, 0.8408203125, 0.841796875, 0.8427734375, 0.84375, 0.8447265625, 0.845703125, 0.8466796875, 0.84765625, 0.8486328125, 0.849609375, 0.8505859375, 0.8515625, 0.8525390625, 0.853515625, 0.8544921875, 0.85546875, 0.8564453125, 0.857421875, 0.8583984375, 0.859375, 0.8603515625, 0.861328125, 0.8623046875, 0.86328125, 0.8642578125, 0.865234375, 0.8662109375, 0.8671875, 0.8681640625, 0.869140625, 0.8701171875, 0.87109375, 0.8720703125, 0.873046875, 0.8740234375, 0.875, 0.8759765625, 0.876953125, 0.8779296875, 0.87890625, 0.8798828125, 0.880859375, 0.8818359375, 0.8828125, 0.8837890625, 0.884765625, 0.8857421875, 0.88671875, 0.8876953125, 0.888671875, 0.8896484375, 0.890625, 0.8916015625, 0.892578125, 0.8935546875, 0.89453125, 0.8955078125, 0.896484375, 0.8974609375, 0.8984375, 0.8994140625, 0.900390625, 0.9013671875, 0.90234375, 0.9033203125, 0.904296875, 0.9052734375, 0.90625, 0.9072265625, 0.908203125, 0.9091796875, 0.91015625, 0.9111328125, 0.912109375, 0.9130859375, 0.9140625, 0.9150390625, 0.916015625, 0.9169921875, 0.91796875, 0.9189453125, 0.919921875, 0.9208984375, 0.921875, 0.9228515625, 0.923828125, 0.9248046875, 0.92578125, 0.9267578125, 0.927734375, 0.9287109375, 0.9296875, 0.9306640625, 0.931640625, 0.9326171875, 0.93359375, 0.9345703125, 0.935546875, 0.9365234375, 0.9375, 0.9384765625, 0.939453125, 0.9404296875, 0.94140625, 0.9423828125, 0.943359375, 0.9443359375, 0.9453125, 0.9462890625, 0.947265625, 0.9482421875, 0.94921875, 0.9501953125, 0.951171875, 0.9521484375, 0.953125, 0.9541015625, 0.955078125, 0.9560546875, 0.95703125, 0.9580078125, 0.958984375, 0.9599609375, 0.9609375, 0.9619140625, 0.962890625, 0.9638671875, 0.96484375, 0.9658203125, 0.966796875, 0.9677734375, 0.96875, 0.9697265625, 0.970703125, 0.9716796875, 0.97265625, 0.9736328125, 0.974609375, 0.9755859375, 0.9765625, 0.9775390625, 0.978515625, 0.9794921875, 0.98046875, 0.9814453125, 0.982421875, 0.9833984375, 0.984375, 0.9853515625, 0.986328125, 0.9873046875, 0.98828125, 0.9892578125, 0.990234375, 0.9912109375, 0.9921875, 0.9931640625, 0.994140625, 0.9951171875, 0.99609375, 0.9970703125, 0.998046875, 0.9990234375, 1.0], "decode_rule": "dirichlet_resample", "support_power": 1.0, "semantic_power": 1.0, "anchor_mode": "onehot", "cfg_scale": 0.0, "cfg_power": 1.0, "cfg_start": 0.0, "cfg_prior": "uniform", "decode_freq_penalty_alpha": 0.0, "decode_freq_penalty_beta": 0.0, "decode_freq_penalty_floor": 0.0, "decode_freq_penalty_start": 0.0, "decode_freq_penalty_end": 1.0, "decode_freq_penalty_power": 1.0, "start_t": 0.0, "start_init": "noise", "noise_init": "dirichlet", "noise_sigma": -1.0, "dirichlet_concentration": 1.0, "concentration_min": 1.0, "concentration_max": 128.0, "target_prob": 1.0, "endpoint_temp": 1.65, "final_from": "state", "final_sample_mode": "argmax", "final_sample_temp": 1.0, "final_top_k": 64, "final_top_p": 0.95, "final_freq_penalty_alpha": 0.0, "final_freq_penalty_beta": 0.0, "final_freq_penalty_floor": 0.0, "lock_bos": false, "n_samples": 8, "seed": 20260519}, "raw_genppl": {"ppl": 75.77343953176887, "nll_per_token": 4.32774782928766, "tokens": 2040, "kept_samples": 8, "total_samples": 8, "empty_rate": 0.0, "skipped_samples": 0}, "stripped_genppl": {"ppl": 76.36644019355695, "nll_per_token": 4.335543335185331, "tokens": 2040, "kept_samples": 8, "total_samples": 8, "empty_rate": 0.0, "skipped_samples": 0}, "diversity": {"sample_entropy": 3.274294560163754, "unique_tokens": 2246, "token_count": 8192, "distinct_1": 0.274169921875, "distinct_2": 0.36656891495601174, "top_token_mass": 0.1500244140625}}
|
| 39 |
+
[done] docs/lta_samples/metrics_20260519/owt_t5_2node_latest_trainmatched_dirres_c128_lowtemp_n8/t1p65.jsonl
|
| 40 |
+
[ckpt] eval_ckpts/lta_owt_t5_adaln_adamw_wd0p1_rollin_p50_randk0_3_uniformt_temp1_synct_gbs512_2node8gpu_1m_t-20260518224737-tftgw/latest_frozen_step101200_20260519_092441.pt step=101000
|
| 41 |
+
[decode-base] n=8 max_len=1024 steps=1024 model_t=post
|
| 42 |
+
[decode-time] schedule=linear s=[0.0,0.25] gumbel=(2.2,0.8) force_final=True t0=0.000000 t_mid=0.500000 t_end=1.000000 dt_mean=0.000977 dt_max=0.000977
|
| 43 |
+
[decode] temp=1.70 final=state rule=dirichlet_resample support=1 semantic=1 anchor=onehot cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=argmax/1/k64/p0.95 freq_penalty=0/0/0 start_t=0 start_init=noise time_path=0.0000->1.0000 generated 1/8
|
| 44 |
+
[decode] temp=1.70 final=state rule=dirichlet_resample support=1 semantic=1 anchor=onehot cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=argmax/1/k64/p0.95 freq_penalty=0/0/0 start_t=0 start_init=noise time_path=0.0000->1.0000 generated 2/8
|
| 45 |
+
[decode] temp=1.70 final=state rule=dirichlet_resample support=1 semantic=1 anchor=onehot cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=argmax/1/k64/p0.95 freq_penalty=0/0/0 start_t=0 start_init=noise time_path=0.0000->1.0000 generated 3/8
|
| 46 |
+
[decode] temp=1.70 final=state rule=dirichlet_resample support=1 semantic=1 anchor=onehot cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=argmax/1/k64/p0.95 freq_penalty=0/0/0 start_t=0 start_init=noise time_path=0.0000->1.0000 generated 4/8
|
| 47 |
+
[decode] temp=1.70 final=state rule=dirichlet_resample support=1 semantic=1 anchor=onehot cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=argmax/1/k64/p0.95 freq_penalty=0/0/0 start_t=0 start_init=noise time_path=0.0000->1.0000 generated 5/8
|
| 48 |
+
[decode] temp=1.70 final=state rule=dirichlet_resample support=1 semantic=1 anchor=onehot cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=argmax/1/k64/p0.95 freq_penalty=0/0/0 start_t=0 start_init=noise time_path=0.0000->1.0000 generated 6/8
|
| 49 |
+
[decode] temp=1.70 final=state rule=dirichlet_resample support=1 semantic=1 anchor=onehot cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=argmax/1/k64/p0.95 freq_penalty=0/0/0 start_t=0 start_init=noise time_path=0.0000->1.0000 generated 7/8
|
| 50 |
+
[decode] temp=1.70 final=state rule=dirichlet_resample support=1 semantic=1 anchor=onehot cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=argmax/1/k64/p0.95 freq_penalty=0/0/0 start_t=0 start_init=noise time_path=0.0000->1.0000 generated 8/8
|
| 51 |
+
[summary] {"type": "summary", "checkpoint": "eval_ckpts/lta_owt_t5_adaln_adamw_wd0p1_rollin_p50_randk0_3_uniformt_temp1_synct_gbs512_2node8gpu_1m_t-20260518224737-tftgw/latest_frozen_step101200_20260519_092441.pt", "step": 101000, "decode": {"steps": 1024, "model_t_mode": "post", "decode_time_schedule": "linear", "decode_s_min_frac": 0.0, "decode_s_max_frac": 0.25, "decode_force_final_t": true, "decode_time_grid": [0.0, 0.0009765625, 0.001953125, 0.0029296875, 0.00390625, 0.0048828125, 0.005859375, 0.0068359375, 0.0078125, 0.0087890625, 0.009765625, 0.0107421875, 0.01171875, 0.0126953125, 0.013671875, 0.0146484375, 0.015625, 0.0166015625, 0.017578125, 0.0185546875, 0.01953125, 0.0205078125, 0.021484375, 0.0224609375, 0.0234375, 0.0244140625, 0.025390625, 0.0263671875, 0.02734375, 0.0283203125, 0.029296875, 0.0302734375, 0.03125, 0.0322265625, 0.033203125, 0.0341796875, 0.03515625, 0.0361328125, 0.037109375, 0.0380859375, 0.0390625, 0.0400390625, 0.041015625, 0.0419921875, 0.04296875, 0.0439453125, 0.044921875, 0.0458984375, 0.046875, 0.0478515625, 0.048828125, 0.0498046875, 0.05078125, 0.0517578125, 0.052734375, 0.0537109375, 0.0546875, 0.0556640625, 0.056640625, 0.0576171875, 0.05859375, 0.0595703125, 0.060546875, 0.0615234375, 0.0625, 0.0634765625, 0.064453125, 0.0654296875, 0.06640625, 0.0673828125, 0.068359375, 0.0693359375, 0.0703125, 0.0712890625, 0.072265625, 0.0732421875, 0.07421875, 0.0751953125, 0.076171875, 0.0771484375, 0.078125, 0.0791015625, 0.080078125, 0.0810546875, 0.08203125, 0.0830078125, 0.083984375, 0.0849609375, 0.0859375, 0.0869140625, 0.087890625, 0.0888671875, 0.08984375, 0.0908203125, 0.091796875, 0.0927734375, 0.09375, 0.0947265625, 0.095703125, 0.0966796875, 0.09765625, 0.0986328125, 0.099609375, 0.1005859375, 0.1015625, 0.1025390625, 0.103515625, 0.1044921875, 0.10546875, 0.1064453125, 0.107421875, 0.1083984375, 0.109375, 0.1103515625, 0.111328125, 0.1123046875, 0.11328125, 0.1142578125, 0.115234375, 0.1162109375, 0.1171875, 0.1181640625, 0.119140625, 0.1201171875, 0.12109375, 0.1220703125, 0.123046875, 0.1240234375, 0.125, 0.1259765625, 0.126953125, 0.1279296875, 0.12890625, 0.1298828125, 0.130859375, 0.1318359375, 0.1328125, 0.1337890625, 0.134765625, 0.1357421875, 0.13671875, 0.1376953125, 0.138671875, 0.1396484375, 0.140625, 0.1416015625, 0.142578125, 0.1435546875, 0.14453125, 0.1455078125, 0.146484375, 0.1474609375, 0.1484375, 0.1494140625, 0.150390625, 0.1513671875, 0.15234375, 0.1533203125, 0.154296875, 0.1552734375, 0.15625, 0.1572265625, 0.158203125, 0.1591796875, 0.16015625, 0.1611328125, 0.162109375, 0.1630859375, 0.1640625, 0.1650390625, 0.166015625, 0.1669921875, 0.16796875, 0.1689453125, 0.169921875, 0.1708984375, 0.171875, 0.1728515625, 0.173828125, 0.1748046875, 0.17578125, 0.1767578125, 0.177734375, 0.1787109375, 0.1796875, 0.1806640625, 0.181640625, 0.1826171875, 0.18359375, 0.1845703125, 0.185546875, 0.1865234375, 0.1875, 0.1884765625, 0.189453125, 0.1904296875, 0.19140625, 0.1923828125, 0.193359375, 0.1943359375, 0.1953125, 0.1962890625, 0.197265625, 0.1982421875, 0.19921875, 0.2001953125, 0.201171875, 0.2021484375, 0.203125, 0.2041015625, 0.205078125, 0.2060546875, 0.20703125, 0.2080078125, 0.208984375, 0.2099609375, 0.2109375, 0.2119140625, 0.212890625, 0.2138671875, 0.21484375, 0.2158203125, 0.216796875, 0.2177734375, 0.21875, 0.2197265625, 0.220703125, 0.2216796875, 0.22265625, 0.2236328125, 0.224609375, 0.2255859375, 0.2265625, 0.2275390625, 0.228515625, 0.2294921875, 0.23046875, 0.2314453125, 0.232421875, 0.2333984375, 0.234375, 0.2353515625, 0.236328125, 0.2373046875, 0.23828125, 0.2392578125, 0.240234375, 0.2412109375, 0.2421875, 0.2431640625, 0.244140625, 0.2451171875, 0.24609375, 0.2470703125, 0.248046875, 0.2490234375, 0.25, 0.2509765625, 0.251953125, 0.2529296875, 0.25390625, 0.2548828125, 0.255859375, 0.2568359375, 0.2578125, 0.2587890625, 0.259765625, 0.2607421875, 0.26171875, 0.2626953125, 0.263671875, 0.2646484375, 0.265625, 0.2666015625, 0.267578125, 0.2685546875, 0.26953125, 0.2705078125, 0.271484375, 0.2724609375, 0.2734375, 0.2744140625, 0.275390625, 0.2763671875, 0.27734375, 0.2783203125, 0.279296875, 0.2802734375, 0.28125, 0.2822265625, 0.283203125, 0.2841796875, 0.28515625, 0.2861328125, 0.287109375, 0.2880859375, 0.2890625, 0.2900390625, 0.291015625, 0.2919921875, 0.29296875, 0.2939453125, 0.294921875, 0.2958984375, 0.296875, 0.2978515625, 0.298828125, 0.2998046875, 0.30078125, 0.3017578125, 0.302734375, 0.3037109375, 0.3046875, 0.3056640625, 0.306640625, 0.3076171875, 0.30859375, 0.3095703125, 0.310546875, 0.3115234375, 0.3125, 0.3134765625, 0.314453125, 0.3154296875, 0.31640625, 0.3173828125, 0.318359375, 0.3193359375, 0.3203125, 0.3212890625, 0.322265625, 0.3232421875, 0.32421875, 0.3251953125, 0.326171875, 0.3271484375, 0.328125, 0.3291015625, 0.330078125, 0.3310546875, 0.33203125, 0.3330078125, 0.333984375, 0.3349609375, 0.3359375, 0.3369140625, 0.337890625, 0.3388671875, 0.33984375, 0.3408203125, 0.341796875, 0.3427734375, 0.34375, 0.3447265625, 0.345703125, 0.3466796875, 0.34765625, 0.3486328125, 0.349609375, 0.3505859375, 0.3515625, 0.3525390625, 0.353515625, 0.3544921875, 0.35546875, 0.3564453125, 0.357421875, 0.3583984375, 0.359375, 0.3603515625, 0.361328125, 0.3623046875, 0.36328125, 0.3642578125, 0.365234375, 0.3662109375, 0.3671875, 0.3681640625, 0.369140625, 0.3701171875, 0.37109375, 0.3720703125, 0.373046875, 0.3740234375, 0.375, 0.3759765625, 0.376953125, 0.3779296875, 0.37890625, 0.3798828125, 0.380859375, 0.3818359375, 0.3828125, 0.3837890625, 0.384765625, 0.3857421875, 0.38671875, 0.3876953125, 0.388671875, 0.3896484375, 0.390625, 0.3916015625, 0.392578125, 0.3935546875, 0.39453125, 0.3955078125, 0.396484375, 0.3974609375, 0.3984375, 0.3994140625, 0.400390625, 0.4013671875, 0.40234375, 0.4033203125, 0.404296875, 0.4052734375, 0.40625, 0.4072265625, 0.408203125, 0.4091796875, 0.41015625, 0.4111328125, 0.412109375, 0.4130859375, 0.4140625, 0.4150390625, 0.416015625, 0.4169921875, 0.41796875, 0.4189453125, 0.419921875, 0.4208984375, 0.421875, 0.4228515625, 0.423828125, 0.4248046875, 0.42578125, 0.4267578125, 0.427734375, 0.4287109375, 0.4296875, 0.4306640625, 0.431640625, 0.4326171875, 0.43359375, 0.4345703125, 0.435546875, 0.4365234375, 0.4375, 0.4384765625, 0.439453125, 0.4404296875, 0.44140625, 0.4423828125, 0.443359375, 0.4443359375, 0.4453125, 0.4462890625, 0.447265625, 0.4482421875, 0.44921875, 0.4501953125, 0.451171875, 0.4521484375, 0.453125, 0.4541015625, 0.455078125, 0.4560546875, 0.45703125, 0.4580078125, 0.458984375, 0.4599609375, 0.4609375, 0.4619140625, 0.462890625, 0.4638671875, 0.46484375, 0.4658203125, 0.466796875, 0.4677734375, 0.46875, 0.4697265625, 0.470703125, 0.4716796875, 0.47265625, 0.4736328125, 0.474609375, 0.4755859375, 0.4765625, 0.4775390625, 0.478515625, 0.4794921875, 0.48046875, 0.4814453125, 0.482421875, 0.4833984375, 0.484375, 0.4853515625, 0.486328125, 0.4873046875, 0.48828125, 0.4892578125, 0.490234375, 0.4912109375, 0.4921875, 0.4931640625, 0.494140625, 0.4951171875, 0.49609375, 0.4970703125, 0.498046875, 0.4990234375, 0.5, 0.5009765625, 0.501953125, 0.5029296875, 0.50390625, 0.5048828125, 0.505859375, 0.5068359375, 0.5078125, 0.5087890625, 0.509765625, 0.5107421875, 0.51171875, 0.5126953125, 0.513671875, 0.5146484375, 0.515625, 0.5166015625, 0.517578125, 0.5185546875, 0.51953125, 0.5205078125, 0.521484375, 0.5224609375, 0.5234375, 0.5244140625, 0.525390625, 0.5263671875, 0.52734375, 0.5283203125, 0.529296875, 0.5302734375, 0.53125, 0.5322265625, 0.533203125, 0.5341796875, 0.53515625, 0.5361328125, 0.537109375, 0.5380859375, 0.5390625, 0.5400390625, 0.541015625, 0.5419921875, 0.54296875, 0.5439453125, 0.544921875, 0.5458984375, 0.546875, 0.5478515625, 0.548828125, 0.5498046875, 0.55078125, 0.5517578125, 0.552734375, 0.5537109375, 0.5546875, 0.5556640625, 0.556640625, 0.5576171875, 0.55859375, 0.5595703125, 0.560546875, 0.5615234375, 0.5625, 0.5634765625, 0.564453125, 0.5654296875, 0.56640625, 0.5673828125, 0.568359375, 0.5693359375, 0.5703125, 0.5712890625, 0.572265625, 0.5732421875, 0.57421875, 0.5751953125, 0.576171875, 0.5771484375, 0.578125, 0.5791015625, 0.580078125, 0.5810546875, 0.58203125, 0.5830078125, 0.583984375, 0.5849609375, 0.5859375, 0.5869140625, 0.587890625, 0.5888671875, 0.58984375, 0.5908203125, 0.591796875, 0.5927734375, 0.59375, 0.5947265625, 0.595703125, 0.5966796875, 0.59765625, 0.5986328125, 0.599609375, 0.6005859375, 0.6015625, 0.6025390625, 0.603515625, 0.6044921875, 0.60546875, 0.6064453125, 0.607421875, 0.6083984375, 0.609375, 0.6103515625, 0.611328125, 0.6123046875, 0.61328125, 0.6142578125, 0.615234375, 0.6162109375, 0.6171875, 0.6181640625, 0.619140625, 0.6201171875, 0.62109375, 0.6220703125, 0.623046875, 0.6240234375, 0.625, 0.6259765625, 0.626953125, 0.6279296875, 0.62890625, 0.6298828125, 0.630859375, 0.6318359375, 0.6328125, 0.6337890625, 0.634765625, 0.6357421875, 0.63671875, 0.6376953125, 0.638671875, 0.6396484375, 0.640625, 0.6416015625, 0.642578125, 0.6435546875, 0.64453125, 0.6455078125, 0.646484375, 0.6474609375, 0.6484375, 0.6494140625, 0.650390625, 0.6513671875, 0.65234375, 0.6533203125, 0.654296875, 0.6552734375, 0.65625, 0.6572265625, 0.658203125, 0.6591796875, 0.66015625, 0.6611328125, 0.662109375, 0.6630859375, 0.6640625, 0.6650390625, 0.666015625, 0.6669921875, 0.66796875, 0.6689453125, 0.669921875, 0.6708984375, 0.671875, 0.6728515625, 0.673828125, 0.6748046875, 0.67578125, 0.6767578125, 0.677734375, 0.6787109375, 0.6796875, 0.6806640625, 0.681640625, 0.6826171875, 0.68359375, 0.6845703125, 0.685546875, 0.6865234375, 0.6875, 0.6884765625, 0.689453125, 0.6904296875, 0.69140625, 0.6923828125, 0.693359375, 0.6943359375, 0.6953125, 0.6962890625, 0.697265625, 0.6982421875, 0.69921875, 0.7001953125, 0.701171875, 0.7021484375, 0.703125, 0.7041015625, 0.705078125, 0.7060546875, 0.70703125, 0.7080078125, 0.708984375, 0.7099609375, 0.7109375, 0.7119140625, 0.712890625, 0.7138671875, 0.71484375, 0.7158203125, 0.716796875, 0.7177734375, 0.71875, 0.7197265625, 0.720703125, 0.7216796875, 0.72265625, 0.7236328125, 0.724609375, 0.7255859375, 0.7265625, 0.7275390625, 0.728515625, 0.7294921875, 0.73046875, 0.7314453125, 0.732421875, 0.7333984375, 0.734375, 0.7353515625, 0.736328125, 0.7373046875, 0.73828125, 0.7392578125, 0.740234375, 0.7412109375, 0.7421875, 0.7431640625, 0.744140625, 0.7451171875, 0.74609375, 0.7470703125, 0.748046875, 0.7490234375, 0.75, 0.7509765625, 0.751953125, 0.7529296875, 0.75390625, 0.7548828125, 0.755859375, 0.7568359375, 0.7578125, 0.7587890625, 0.759765625, 0.7607421875, 0.76171875, 0.7626953125, 0.763671875, 0.7646484375, 0.765625, 0.7666015625, 0.767578125, 0.7685546875, 0.76953125, 0.7705078125, 0.771484375, 0.7724609375, 0.7734375, 0.7744140625, 0.775390625, 0.7763671875, 0.77734375, 0.7783203125, 0.779296875, 0.7802734375, 0.78125, 0.7822265625, 0.783203125, 0.7841796875, 0.78515625, 0.7861328125, 0.787109375, 0.7880859375, 0.7890625, 0.7900390625, 0.791015625, 0.7919921875, 0.79296875, 0.7939453125, 0.794921875, 0.7958984375, 0.796875, 0.7978515625, 0.798828125, 0.7998046875, 0.80078125, 0.8017578125, 0.802734375, 0.8037109375, 0.8046875, 0.8056640625, 0.806640625, 0.8076171875, 0.80859375, 0.8095703125, 0.810546875, 0.8115234375, 0.8125, 0.8134765625, 0.814453125, 0.8154296875, 0.81640625, 0.8173828125, 0.818359375, 0.8193359375, 0.8203125, 0.8212890625, 0.822265625, 0.8232421875, 0.82421875, 0.8251953125, 0.826171875, 0.8271484375, 0.828125, 0.8291015625, 0.830078125, 0.8310546875, 0.83203125, 0.8330078125, 0.833984375, 0.8349609375, 0.8359375, 0.8369140625, 0.837890625, 0.8388671875, 0.83984375, 0.8408203125, 0.841796875, 0.8427734375, 0.84375, 0.8447265625, 0.845703125, 0.8466796875, 0.84765625, 0.8486328125, 0.849609375, 0.8505859375, 0.8515625, 0.8525390625, 0.853515625, 0.8544921875, 0.85546875, 0.8564453125, 0.857421875, 0.8583984375, 0.859375, 0.8603515625, 0.861328125, 0.8623046875, 0.86328125, 0.8642578125, 0.865234375, 0.8662109375, 0.8671875, 0.8681640625, 0.869140625, 0.8701171875, 0.87109375, 0.8720703125, 0.873046875, 0.8740234375, 0.875, 0.8759765625, 0.876953125, 0.8779296875, 0.87890625, 0.8798828125, 0.880859375, 0.8818359375, 0.8828125, 0.8837890625, 0.884765625, 0.8857421875, 0.88671875, 0.8876953125, 0.888671875, 0.8896484375, 0.890625, 0.8916015625, 0.892578125, 0.8935546875, 0.89453125, 0.8955078125, 0.896484375, 0.8974609375, 0.8984375, 0.8994140625, 0.900390625, 0.9013671875, 0.90234375, 0.9033203125, 0.904296875, 0.9052734375, 0.90625, 0.9072265625, 0.908203125, 0.9091796875, 0.91015625, 0.9111328125, 0.912109375, 0.9130859375, 0.9140625, 0.9150390625, 0.916015625, 0.9169921875, 0.91796875, 0.9189453125, 0.919921875, 0.9208984375, 0.921875, 0.9228515625, 0.923828125, 0.9248046875, 0.92578125, 0.9267578125, 0.927734375, 0.9287109375, 0.9296875, 0.9306640625, 0.931640625, 0.9326171875, 0.93359375, 0.9345703125, 0.935546875, 0.9365234375, 0.9375, 0.9384765625, 0.939453125, 0.9404296875, 0.94140625, 0.9423828125, 0.943359375, 0.9443359375, 0.9453125, 0.9462890625, 0.947265625, 0.9482421875, 0.94921875, 0.9501953125, 0.951171875, 0.9521484375, 0.953125, 0.9541015625, 0.955078125, 0.9560546875, 0.95703125, 0.9580078125, 0.958984375, 0.9599609375, 0.9609375, 0.9619140625, 0.962890625, 0.9638671875, 0.96484375, 0.9658203125, 0.966796875, 0.9677734375, 0.96875, 0.9697265625, 0.970703125, 0.9716796875, 0.97265625, 0.9736328125, 0.974609375, 0.9755859375, 0.9765625, 0.9775390625, 0.978515625, 0.9794921875, 0.98046875, 0.9814453125, 0.982421875, 0.9833984375, 0.984375, 0.9853515625, 0.986328125, 0.9873046875, 0.98828125, 0.9892578125, 0.990234375, 0.9912109375, 0.9921875, 0.9931640625, 0.994140625, 0.9951171875, 0.99609375, 0.9970703125, 0.998046875, 0.9990234375, 1.0], "decode_rule": "dirichlet_resample", "support_power": 1.0, "semantic_power": 1.0, "anchor_mode": "onehot", "cfg_scale": 0.0, "cfg_power": 1.0, "cfg_start": 0.0, "cfg_prior": "uniform", "decode_freq_penalty_alpha": 0.0, "decode_freq_penalty_beta": 0.0, "decode_freq_penalty_floor": 0.0, "decode_freq_penalty_start": 0.0, "decode_freq_penalty_end": 1.0, "decode_freq_penalty_power": 1.0, "start_t": 0.0, "start_init": "noise", "noise_init": "dirichlet", "noise_sigma": -1.0, "dirichlet_concentration": 1.0, "concentration_min": 1.0, "concentration_max": 128.0, "target_prob": 1.0, "endpoint_temp": 1.7, "final_from": "state", "final_sample_mode": "argmax", "final_sample_temp": 1.0, "final_top_k": 64, "final_top_p": 0.95, "final_freq_penalty_alpha": 0.0, "final_freq_penalty_beta": 0.0, "final_freq_penalty_floor": 0.0, "lock_bos": false, "n_samples": 8, "seed": 20260519}, "raw_genppl": {"ppl": 1397.261927910019, "nll_per_token": 7.242269834817624, "tokens": 2040, "kept_samples": 8, "total_samples": 8, "empty_rate": 0.0, "skipped_samples": 0}, "stripped_genppl": {"ppl": 1403.9901107955734, "nll_per_token": 7.247073540968054, "tokens": 2040, "kept_samples": 8, "total_samples": 8, "empty_rate": 0.0, "skipped_samples": 0}, "diversity": {"sample_entropy": 5.037653085441597, "unique_tokens": 4476, "token_count": 8192, "distinct_1": 0.54638671875, "distinct_2": 0.7487781036168133, "top_token_mass": 0.25}}
|
| 52 |
+
[done] docs/lta_samples/metrics_20260519/owt_t5_2node_latest_trainmatched_dirres_c128_lowtemp_n8/t1p70.jsonl
|
| 53 |
+
[ckpt] eval_ckpts/lta_owt_t5_adaln_adamw_wd0p1_rollin_p50_randk0_3_uniformt_temp1_synct_gbs512_2node8gpu_1m_t-20260518224737-tftgw/latest_frozen_step101200_20260519_092441.pt step=101000
|
| 54 |
+
[decode-base] n=8 max_len=1024 steps=1024 model_t=post
|
| 55 |
+
[decode-time] schedule=linear s=[0.0,0.25] gumbel=(2.2,0.8) force_final=True t0=0.000000 t_mid=0.500000 t_end=1.000000 dt_mean=0.000977 dt_max=0.000977
|
| 56 |
+
[decode] temp=1.75 final=state rule=dirichlet_resample support=1 semantic=1 anchor=onehot cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=argmax/1/k64/p0.95 freq_penalty=0/0/0 start_t=0 start_init=noise time_path=0.0000->1.0000 generated 1/8
|
| 57 |
+
[decode] temp=1.75 final=state rule=dirichlet_resample support=1 semantic=1 anchor=onehot cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=argmax/1/k64/p0.95 freq_penalty=0/0/0 start_t=0 start_init=noise time_path=0.0000->1.0000 generated 2/8
|
| 58 |
+
[decode] temp=1.75 final=state rule=dirichlet_resample support=1 semantic=1 anchor=onehot cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=argmax/1/k64/p0.95 freq_penalty=0/0/0 start_t=0 start_init=noise time_path=0.0000->1.0000 generated 3/8
|
LTA_openwebtext_dualt/logs/infer_owt_t5_2node_latest_trainmatched_dirres_grid_n8.log
ADDED
|
@@ -0,0 +1,32 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[ckpt] eval_ckpts/lta_owt_t5_adaln_adamw_wd0p1_rollin_p50_randk0_3_uniformt_temp1_synct_gbs512_2node8gpu_1m_t-20260518224737-tftgw/latest_frozen_step101200_20260519_092441.pt step=101000
|
| 2 |
+
[decode-base] n=8 max_len=1024 steps=1024 model_t=post
|
| 3 |
+
[decode-time] schedule=linear s=[0.0,0.25] force_final=True t0=0.000000 t_mid=0.500000 t_end=1.000000 dt_mean=0.000977 dt_max=0.000977
|
| 4 |
+
[decode] temp=1.50 final=state rule=dirichlet_resample support=1 semantic=1 anchor=onehot cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=argmax/1/k64/p0.95 freq_penalty=0/0/0 start_t=0 start_init=noise time_path=0.0000->1.0000 generated 1/8
|
| 5 |
+
[decode] temp=1.50 final=state rule=dirichlet_resample support=1 semantic=1 anchor=onehot cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=argmax/1/k64/p0.95 freq_penalty=0/0/0 start_t=0 start_init=noise time_path=0.0000->1.0000 generated 2/8
|
| 6 |
+
[decode] temp=1.50 final=state rule=dirichlet_resample support=1 semantic=1 anchor=onehot cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=argmax/1/k64/p0.95 freq_penalty=0/0/0 start_t=0 start_init=noise time_path=0.0000->1.0000 generated 3/8
|
| 7 |
+
[decode] temp=1.50 final=state rule=dirichlet_resample support=1 semantic=1 anchor=onehot cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=argmax/1/k64/p0.95 freq_penalty=0/0/0 start_t=0 start_init=noise time_path=0.0000->1.0000 generated 4/8
|
| 8 |
+
[decode] temp=1.50 final=state rule=dirichlet_resample support=1 semantic=1 anchor=onehot cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=argmax/1/k64/p0.95 freq_penalty=0/0/0 start_t=0 start_init=noise time_path=0.0000->1.0000 generated 5/8
|
| 9 |
+
[decode] temp=1.50 final=state rule=dirichlet_resample support=1 semantic=1 anchor=onehot cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=argmax/1/k64/p0.95 freq_penalty=0/0/0 start_t=0 start_init=noise time_path=0.0000->1.0000 generated 6/8
|
| 10 |
+
[decode] temp=1.50 final=state rule=dirichlet_resample support=1 semantic=1 anchor=onehot cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=argmax/1/k64/p0.95 freq_penalty=0/0/0 start_t=0 start_init=noise time_path=0.0000->1.0000 generated 7/8
|
| 11 |
+
[decode] temp=1.50 final=state rule=dirichlet_resample support=1 semantic=1 anchor=onehot cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=argmax/1/k64/p0.95 freq_penalty=0/0/0 start_t=0 start_init=noise time_path=0.0000->1.0000 generated 8/8
|
| 12 |
+
[summary] {"type": "summary", "checkpoint": "eval_ckpts/lta_owt_t5_adaln_adamw_wd0p1_rollin_p50_randk0_3_uniformt_temp1_synct_gbs512_2node8gpu_1m_t-20260518224737-tftgw/latest_frozen_step101200_20260519_092441.pt", "step": 101000, "decode": {"steps": 1024, "model_t_mode": "post", "decode_time_schedule": "linear", "decode_s_min_frac": 0.0, "decode_s_max_frac": 0.25, "decode_force_final_t": true, "decode_time_grid": [0.0, 0.0009765625, 0.001953125, 0.0029296875, 0.00390625, 0.0048828125, 0.005859375, 0.0068359375, 0.0078125, 0.0087890625, 0.009765625, 0.0107421875, 0.01171875, 0.0126953125, 0.013671875, 0.0146484375, 0.015625, 0.0166015625, 0.017578125, 0.0185546875, 0.01953125, 0.0205078125, 0.021484375, 0.0224609375, 0.0234375, 0.0244140625, 0.025390625, 0.0263671875, 0.02734375, 0.0283203125, 0.029296875, 0.0302734375, 0.03125, 0.0322265625, 0.033203125, 0.0341796875, 0.03515625, 0.0361328125, 0.037109375, 0.0380859375, 0.0390625, 0.0400390625, 0.041015625, 0.0419921875, 0.04296875, 0.0439453125, 0.044921875, 0.0458984375, 0.046875, 0.0478515625, 0.048828125, 0.0498046875, 0.05078125, 0.0517578125, 0.052734375, 0.0537109375, 0.0546875, 0.0556640625, 0.056640625, 0.0576171875, 0.05859375, 0.0595703125, 0.060546875, 0.0615234375, 0.0625, 0.0634765625, 0.064453125, 0.0654296875, 0.06640625, 0.0673828125, 0.068359375, 0.0693359375, 0.0703125, 0.0712890625, 0.072265625, 0.0732421875, 0.07421875, 0.0751953125, 0.076171875, 0.0771484375, 0.078125, 0.0791015625, 0.080078125, 0.0810546875, 0.08203125, 0.0830078125, 0.083984375, 0.0849609375, 0.0859375, 0.0869140625, 0.087890625, 0.0888671875, 0.08984375, 0.0908203125, 0.091796875, 0.0927734375, 0.09375, 0.0947265625, 0.095703125, 0.0966796875, 0.09765625, 0.0986328125, 0.099609375, 0.1005859375, 0.1015625, 0.1025390625, 0.103515625, 0.1044921875, 0.10546875, 0.1064453125, 0.107421875, 0.1083984375, 0.109375, 0.1103515625, 0.111328125, 0.1123046875, 0.11328125, 0.1142578125, 0.115234375, 0.1162109375, 0.1171875, 0.1181640625, 0.119140625, 0.1201171875, 0.12109375, 0.1220703125, 0.123046875, 0.1240234375, 0.125, 0.1259765625, 0.126953125, 0.1279296875, 0.12890625, 0.1298828125, 0.130859375, 0.1318359375, 0.1328125, 0.1337890625, 0.134765625, 0.1357421875, 0.13671875, 0.1376953125, 0.138671875, 0.1396484375, 0.140625, 0.1416015625, 0.142578125, 0.1435546875, 0.14453125, 0.1455078125, 0.146484375, 0.1474609375, 0.1484375, 0.1494140625, 0.150390625, 0.1513671875, 0.15234375, 0.1533203125, 0.154296875, 0.1552734375, 0.15625, 0.1572265625, 0.158203125, 0.1591796875, 0.16015625, 0.1611328125, 0.162109375, 0.1630859375, 0.1640625, 0.1650390625, 0.166015625, 0.1669921875, 0.16796875, 0.1689453125, 0.169921875, 0.1708984375, 0.171875, 0.1728515625, 0.173828125, 0.1748046875, 0.17578125, 0.1767578125, 0.177734375, 0.1787109375, 0.1796875, 0.1806640625, 0.181640625, 0.1826171875, 0.18359375, 0.1845703125, 0.185546875, 0.1865234375, 0.1875, 0.1884765625, 0.189453125, 0.1904296875, 0.19140625, 0.1923828125, 0.193359375, 0.1943359375, 0.1953125, 0.1962890625, 0.197265625, 0.1982421875, 0.19921875, 0.2001953125, 0.201171875, 0.2021484375, 0.203125, 0.2041015625, 0.205078125, 0.2060546875, 0.20703125, 0.2080078125, 0.208984375, 0.2099609375, 0.2109375, 0.2119140625, 0.212890625, 0.2138671875, 0.21484375, 0.2158203125, 0.216796875, 0.2177734375, 0.21875, 0.2197265625, 0.220703125, 0.2216796875, 0.22265625, 0.2236328125, 0.224609375, 0.2255859375, 0.2265625, 0.2275390625, 0.228515625, 0.2294921875, 0.23046875, 0.2314453125, 0.232421875, 0.2333984375, 0.234375, 0.2353515625, 0.236328125, 0.2373046875, 0.23828125, 0.2392578125, 0.240234375, 0.2412109375, 0.2421875, 0.2431640625, 0.244140625, 0.2451171875, 0.24609375, 0.2470703125, 0.248046875, 0.2490234375, 0.25, 0.2509765625, 0.251953125, 0.2529296875, 0.25390625, 0.2548828125, 0.255859375, 0.2568359375, 0.2578125, 0.2587890625, 0.259765625, 0.2607421875, 0.26171875, 0.2626953125, 0.263671875, 0.2646484375, 0.265625, 0.2666015625, 0.267578125, 0.2685546875, 0.26953125, 0.2705078125, 0.271484375, 0.2724609375, 0.2734375, 0.2744140625, 0.275390625, 0.2763671875, 0.27734375, 0.2783203125, 0.279296875, 0.2802734375, 0.28125, 0.2822265625, 0.283203125, 0.2841796875, 0.28515625, 0.2861328125, 0.287109375, 0.2880859375, 0.2890625, 0.2900390625, 0.291015625, 0.2919921875, 0.29296875, 0.2939453125, 0.294921875, 0.2958984375, 0.296875, 0.2978515625, 0.298828125, 0.2998046875, 0.30078125, 0.3017578125, 0.302734375, 0.3037109375, 0.3046875, 0.3056640625, 0.306640625, 0.3076171875, 0.30859375, 0.3095703125, 0.310546875, 0.3115234375, 0.3125, 0.3134765625, 0.314453125, 0.3154296875, 0.31640625, 0.3173828125, 0.318359375, 0.3193359375, 0.3203125, 0.3212890625, 0.322265625, 0.3232421875, 0.32421875, 0.3251953125, 0.326171875, 0.3271484375, 0.328125, 0.3291015625, 0.330078125, 0.3310546875, 0.33203125, 0.3330078125, 0.333984375, 0.3349609375, 0.3359375, 0.3369140625, 0.337890625, 0.3388671875, 0.33984375, 0.3408203125, 0.341796875, 0.3427734375, 0.34375, 0.3447265625, 0.345703125, 0.3466796875, 0.34765625, 0.3486328125, 0.349609375, 0.3505859375, 0.3515625, 0.3525390625, 0.353515625, 0.3544921875, 0.35546875, 0.3564453125, 0.357421875, 0.3583984375, 0.359375, 0.3603515625, 0.361328125, 0.3623046875, 0.36328125, 0.3642578125, 0.365234375, 0.3662109375, 0.3671875, 0.3681640625, 0.369140625, 0.3701171875, 0.37109375, 0.3720703125, 0.373046875, 0.3740234375, 0.375, 0.3759765625, 0.376953125, 0.3779296875, 0.37890625, 0.3798828125, 0.380859375, 0.3818359375, 0.3828125, 0.3837890625, 0.384765625, 0.3857421875, 0.38671875, 0.3876953125, 0.388671875, 0.3896484375, 0.390625, 0.3916015625, 0.392578125, 0.3935546875, 0.39453125, 0.3955078125, 0.396484375, 0.3974609375, 0.3984375, 0.3994140625, 0.400390625, 0.4013671875, 0.40234375, 0.4033203125, 0.404296875, 0.4052734375, 0.40625, 0.4072265625, 0.408203125, 0.4091796875, 0.41015625, 0.4111328125, 0.412109375, 0.4130859375, 0.4140625, 0.4150390625, 0.416015625, 0.4169921875, 0.41796875, 0.4189453125, 0.419921875, 0.4208984375, 0.421875, 0.4228515625, 0.423828125, 0.4248046875, 0.42578125, 0.4267578125, 0.427734375, 0.4287109375, 0.4296875, 0.4306640625, 0.431640625, 0.4326171875, 0.43359375, 0.4345703125, 0.435546875, 0.4365234375, 0.4375, 0.4384765625, 0.439453125, 0.4404296875, 0.44140625, 0.4423828125, 0.443359375, 0.4443359375, 0.4453125, 0.4462890625, 0.447265625, 0.4482421875, 0.44921875, 0.4501953125, 0.451171875, 0.4521484375, 0.453125, 0.4541015625, 0.455078125, 0.4560546875, 0.45703125, 0.4580078125, 0.458984375, 0.4599609375, 0.4609375, 0.4619140625, 0.462890625, 0.4638671875, 0.46484375, 0.4658203125, 0.466796875, 0.4677734375, 0.46875, 0.4697265625, 0.470703125, 0.4716796875, 0.47265625, 0.4736328125, 0.474609375, 0.4755859375, 0.4765625, 0.4775390625, 0.478515625, 0.4794921875, 0.48046875, 0.4814453125, 0.482421875, 0.4833984375, 0.484375, 0.4853515625, 0.486328125, 0.4873046875, 0.48828125, 0.4892578125, 0.490234375, 0.4912109375, 0.4921875, 0.4931640625, 0.494140625, 0.4951171875, 0.49609375, 0.4970703125, 0.498046875, 0.4990234375, 0.5, 0.5009765625, 0.501953125, 0.5029296875, 0.50390625, 0.5048828125, 0.505859375, 0.5068359375, 0.5078125, 0.5087890625, 0.509765625, 0.5107421875, 0.51171875, 0.5126953125, 0.513671875, 0.5146484375, 0.515625, 0.5166015625, 0.517578125, 0.5185546875, 0.51953125, 0.5205078125, 0.521484375, 0.5224609375, 0.5234375, 0.5244140625, 0.525390625, 0.5263671875, 0.52734375, 0.5283203125, 0.529296875, 0.5302734375, 0.53125, 0.5322265625, 0.533203125, 0.5341796875, 0.53515625, 0.5361328125, 0.537109375, 0.5380859375, 0.5390625, 0.5400390625, 0.541015625, 0.5419921875, 0.54296875, 0.5439453125, 0.544921875, 0.5458984375, 0.546875, 0.5478515625, 0.548828125, 0.5498046875, 0.55078125, 0.5517578125, 0.552734375, 0.5537109375, 0.5546875, 0.5556640625, 0.556640625, 0.5576171875, 0.55859375, 0.5595703125, 0.560546875, 0.5615234375, 0.5625, 0.5634765625, 0.564453125, 0.5654296875, 0.56640625, 0.5673828125, 0.568359375, 0.5693359375, 0.5703125, 0.5712890625, 0.572265625, 0.5732421875, 0.57421875, 0.5751953125, 0.576171875, 0.5771484375, 0.578125, 0.5791015625, 0.580078125, 0.5810546875, 0.58203125, 0.5830078125, 0.583984375, 0.5849609375, 0.5859375, 0.5869140625, 0.587890625, 0.5888671875, 0.58984375, 0.5908203125, 0.591796875, 0.5927734375, 0.59375, 0.5947265625, 0.595703125, 0.5966796875, 0.59765625, 0.5986328125, 0.599609375, 0.6005859375, 0.6015625, 0.6025390625, 0.603515625, 0.6044921875, 0.60546875, 0.6064453125, 0.607421875, 0.6083984375, 0.609375, 0.6103515625, 0.611328125, 0.6123046875, 0.61328125, 0.6142578125, 0.615234375, 0.6162109375, 0.6171875, 0.6181640625, 0.619140625, 0.6201171875, 0.62109375, 0.6220703125, 0.623046875, 0.6240234375, 0.625, 0.6259765625, 0.626953125, 0.6279296875, 0.62890625, 0.6298828125, 0.630859375, 0.6318359375, 0.6328125, 0.6337890625, 0.634765625, 0.6357421875, 0.63671875, 0.6376953125, 0.638671875, 0.6396484375, 0.640625, 0.6416015625, 0.642578125, 0.6435546875, 0.64453125, 0.6455078125, 0.646484375, 0.6474609375, 0.6484375, 0.6494140625, 0.650390625, 0.6513671875, 0.65234375, 0.6533203125, 0.654296875, 0.6552734375, 0.65625, 0.6572265625, 0.658203125, 0.6591796875, 0.66015625, 0.6611328125, 0.662109375, 0.6630859375, 0.6640625, 0.6650390625, 0.666015625, 0.6669921875, 0.66796875, 0.6689453125, 0.669921875, 0.6708984375, 0.671875, 0.6728515625, 0.673828125, 0.6748046875, 0.67578125, 0.6767578125, 0.677734375, 0.6787109375, 0.6796875, 0.6806640625, 0.681640625, 0.6826171875, 0.68359375, 0.6845703125, 0.685546875, 0.6865234375, 0.6875, 0.6884765625, 0.689453125, 0.6904296875, 0.69140625, 0.6923828125, 0.693359375, 0.6943359375, 0.6953125, 0.6962890625, 0.697265625, 0.6982421875, 0.69921875, 0.7001953125, 0.701171875, 0.7021484375, 0.703125, 0.7041015625, 0.705078125, 0.7060546875, 0.70703125, 0.7080078125, 0.708984375, 0.7099609375, 0.7109375, 0.7119140625, 0.712890625, 0.7138671875, 0.71484375, 0.7158203125, 0.716796875, 0.7177734375, 0.71875, 0.7197265625, 0.720703125, 0.7216796875, 0.72265625, 0.7236328125, 0.724609375, 0.7255859375, 0.7265625, 0.7275390625, 0.728515625, 0.7294921875, 0.73046875, 0.7314453125, 0.732421875, 0.7333984375, 0.734375, 0.7353515625, 0.736328125, 0.7373046875, 0.73828125, 0.7392578125, 0.740234375, 0.7412109375, 0.7421875, 0.7431640625, 0.744140625, 0.7451171875, 0.74609375, 0.7470703125, 0.748046875, 0.7490234375, 0.75, 0.7509765625, 0.751953125, 0.7529296875, 0.75390625, 0.7548828125, 0.755859375, 0.7568359375, 0.7578125, 0.7587890625, 0.759765625, 0.7607421875, 0.76171875, 0.7626953125, 0.763671875, 0.7646484375, 0.765625, 0.7666015625, 0.767578125, 0.7685546875, 0.76953125, 0.7705078125, 0.771484375, 0.7724609375, 0.7734375, 0.7744140625, 0.775390625, 0.7763671875, 0.77734375, 0.7783203125, 0.779296875, 0.7802734375, 0.78125, 0.7822265625, 0.783203125, 0.7841796875, 0.78515625, 0.7861328125, 0.787109375, 0.7880859375, 0.7890625, 0.7900390625, 0.791015625, 0.7919921875, 0.79296875, 0.7939453125, 0.794921875, 0.7958984375, 0.796875, 0.7978515625, 0.798828125, 0.7998046875, 0.80078125, 0.8017578125, 0.802734375, 0.8037109375, 0.8046875, 0.8056640625, 0.806640625, 0.8076171875, 0.80859375, 0.8095703125, 0.810546875, 0.8115234375, 0.8125, 0.8134765625, 0.814453125, 0.8154296875, 0.81640625, 0.8173828125, 0.818359375, 0.8193359375, 0.8203125, 0.8212890625, 0.822265625, 0.8232421875, 0.82421875, 0.8251953125, 0.826171875, 0.8271484375, 0.828125, 0.8291015625, 0.830078125, 0.8310546875, 0.83203125, 0.8330078125, 0.833984375, 0.8349609375, 0.8359375, 0.8369140625, 0.837890625, 0.8388671875, 0.83984375, 0.8408203125, 0.841796875, 0.8427734375, 0.84375, 0.8447265625, 0.845703125, 0.8466796875, 0.84765625, 0.8486328125, 0.849609375, 0.8505859375, 0.8515625, 0.8525390625, 0.853515625, 0.8544921875, 0.85546875, 0.8564453125, 0.857421875, 0.8583984375, 0.859375, 0.8603515625, 0.861328125, 0.8623046875, 0.86328125, 0.8642578125, 0.865234375, 0.8662109375, 0.8671875, 0.8681640625, 0.869140625, 0.8701171875, 0.87109375, 0.8720703125, 0.873046875, 0.8740234375, 0.875, 0.8759765625, 0.876953125, 0.8779296875, 0.87890625, 0.8798828125, 0.880859375, 0.8818359375, 0.8828125, 0.8837890625, 0.884765625, 0.8857421875, 0.88671875, 0.8876953125, 0.888671875, 0.8896484375, 0.890625, 0.8916015625, 0.892578125, 0.8935546875, 0.89453125, 0.8955078125, 0.896484375, 0.8974609375, 0.8984375, 0.8994140625, 0.900390625, 0.9013671875, 0.90234375, 0.9033203125, 0.904296875, 0.9052734375, 0.90625, 0.9072265625, 0.908203125, 0.9091796875, 0.91015625, 0.9111328125, 0.912109375, 0.9130859375, 0.9140625, 0.9150390625, 0.916015625, 0.9169921875, 0.91796875, 0.9189453125, 0.919921875, 0.9208984375, 0.921875, 0.9228515625, 0.923828125, 0.9248046875, 0.92578125, 0.9267578125, 0.927734375, 0.9287109375, 0.9296875, 0.9306640625, 0.931640625, 0.9326171875, 0.93359375, 0.9345703125, 0.935546875, 0.9365234375, 0.9375, 0.9384765625, 0.939453125, 0.9404296875, 0.94140625, 0.9423828125, 0.943359375, 0.9443359375, 0.9453125, 0.9462890625, 0.947265625, 0.9482421875, 0.94921875, 0.9501953125, 0.951171875, 0.9521484375, 0.953125, 0.9541015625, 0.955078125, 0.9560546875, 0.95703125, 0.9580078125, 0.958984375, 0.9599609375, 0.9609375, 0.9619140625, 0.962890625, 0.9638671875, 0.96484375, 0.9658203125, 0.966796875, 0.9677734375, 0.96875, 0.9697265625, 0.970703125, 0.9716796875, 0.97265625, 0.9736328125, 0.974609375, 0.9755859375, 0.9765625, 0.9775390625, 0.978515625, 0.9794921875, 0.98046875, 0.9814453125, 0.982421875, 0.9833984375, 0.984375, 0.9853515625, 0.986328125, 0.9873046875, 0.98828125, 0.9892578125, 0.990234375, 0.9912109375, 0.9921875, 0.9931640625, 0.994140625, 0.9951171875, 0.99609375, 0.9970703125, 0.998046875, 0.9990234375, 1.0], "decode_rule": "dirichlet_resample", "support_power": 1.0, "semantic_power": 1.0, "anchor_mode": "onehot", "cfg_scale": 0.0, "cfg_power": 1.0, "cfg_start": 0.0, "cfg_prior": "uniform", "decode_freq_penalty_alpha": 0.0, "decode_freq_penalty_beta": 0.0, "decode_freq_penalty_floor": 0.0, "decode_freq_penalty_start": 0.0, "decode_freq_penalty_end": 1.0, "decode_freq_penalty_power": 1.0, "start_t": 0.0, "start_init": "noise", "noise_init": "dirichlet", "noise_sigma": -1.0, "dirichlet_concentration": 1.0, "concentration_min": 1.0, "concentration_max": 128.0, "target_prob": 1.0, "endpoint_temp": 1.5, "final_from": "state", "final_sample_mode": "argmax", "final_sample_temp": 1.0, "final_top_k": 64, "final_top_p": 0.95, "final_freq_penalty_alpha": 0.0, "final_freq_penalty_beta": 0.0, "final_freq_penalty_floor": 0.0, "lock_bos": false, "n_samples": 8, "seed": 20260519}, "raw_genppl": {"ppl": 2.3705850238950044, "nll_per_token": 0.8631367702110141, "tokens": 2040, "kept_samples": 8, "total_samples": 8, "empty_rate": 0.0, "skipped_samples": 0}, "stripped_genppl": {"ppl": 2.3705850238950044, "nll_per_token": 0.8631367702110141, "tokens": 2040, "kept_samples": 8, "total_samples": 8, "empty_rate": 0.0, "skipped_samples": 0}, "diversity": {"sample_entropy": 1.9894434601248572, "unique_tokens": 194, "token_count": 8192, "distinct_1": 0.023681640625, "distinct_2": 0.050342130987292275, "top_token_mass": 0.1419677734375}}
|
| 13 |
+
[done] docs/lta_samples/metrics_20260519/owt_t5_2node_latest_trainmatched_dirres_grid_n8/cmax128_t1p50.jsonl
|
| 14 |
+
[ckpt] eval_ckpts/lta_owt_t5_adaln_adamw_wd0p1_rollin_p50_randk0_3_uniformt_temp1_synct_gbs512_2node8gpu_1m_t-20260518224737-tftgw/latest_frozen_step101200_20260519_092441.pt step=101000
|
| 15 |
+
[decode-base] n=8 max_len=1024 steps=1024 model_t=post
|
| 16 |
+
[decode-time] schedule=linear s=[0.0,0.25] force_final=True t0=0.000000 t_mid=0.500000 t_end=1.000000 dt_mean=0.000977 dt_max=0.000977
|
| 17 |
+
[decode] temp=1.80 final=state rule=dirichlet_resample support=1 semantic=1 anchor=onehot cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=argmax/1/k64/p0.95 freq_penalty=0/0/0 start_t=0 start_init=noise time_path=0.0000->1.0000 generated 1/8
|
| 18 |
+
[decode] temp=1.80 final=state rule=dirichlet_resample support=1 semantic=1 anchor=onehot cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=argmax/1/k64/p0.95 freq_penalty=0/0/0 start_t=0 start_init=noise time_path=0.0000->1.0000 generated 2/8
|
| 19 |
+
[decode] temp=1.80 final=state rule=dirichlet_resample support=1 semantic=1 anchor=onehot cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=argmax/1/k64/p0.95 freq_penalty=0/0/0 start_t=0 start_init=noise time_path=0.0000->1.0000 generated 3/8
|
| 20 |
+
[decode] temp=1.80 final=state rule=dirichlet_resample support=1 semantic=1 anchor=onehot cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=argmax/1/k64/p0.95 freq_penalty=0/0/0 start_t=0 start_init=noise time_path=0.0000->1.0000 generated 4/8
|
| 21 |
+
[decode] temp=1.80 final=state rule=dirichlet_resample support=1 semantic=1 anchor=onehot cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=argmax/1/k64/p0.95 freq_penalty=0/0/0 start_t=0 start_init=noise time_path=0.0000->1.0000 generated 5/8
|
| 22 |
+
[decode] temp=1.80 final=state rule=dirichlet_resample support=1 semantic=1 anchor=onehot cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=argmax/1/k64/p0.95 freq_penalty=0/0/0 start_t=0 start_init=noise time_path=0.0000->1.0000 generated 6/8
|
| 23 |
+
[decode] temp=1.80 final=state rule=dirichlet_resample support=1 semantic=1 anchor=onehot cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=argmax/1/k64/p0.95 freq_penalty=0/0/0 start_t=0 start_init=noise time_path=0.0000->1.0000 generated 7/8
|
| 24 |
+
[decode] temp=1.80 final=state rule=dirichlet_resample support=1 semantic=1 anchor=onehot cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=argmax/1/k64/p0.95 freq_penalty=0/0/0 start_t=0 start_init=noise time_path=0.0000->1.0000 generated 8/8
|
| 25 |
+
[summary] {"type": "summary", "checkpoint": "eval_ckpts/lta_owt_t5_adaln_adamw_wd0p1_rollin_p50_randk0_3_uniformt_temp1_synct_gbs512_2node8gpu_1m_t-20260518224737-tftgw/latest_frozen_step101200_20260519_092441.pt", "step": 101000, "decode": {"steps": 1024, "model_t_mode": "post", "decode_time_schedule": "linear", "decode_s_min_frac": 0.0, "decode_s_max_frac": 0.25, "decode_force_final_t": true, "decode_time_grid": [0.0, 0.0009765625, 0.001953125, 0.0029296875, 0.00390625, 0.0048828125, 0.005859375, 0.0068359375, 0.0078125, 0.0087890625, 0.009765625, 0.0107421875, 0.01171875, 0.0126953125, 0.013671875, 0.0146484375, 0.015625, 0.0166015625, 0.017578125, 0.0185546875, 0.01953125, 0.0205078125, 0.021484375, 0.0224609375, 0.0234375, 0.0244140625, 0.025390625, 0.0263671875, 0.02734375, 0.0283203125, 0.029296875, 0.0302734375, 0.03125, 0.0322265625, 0.033203125, 0.0341796875, 0.03515625, 0.0361328125, 0.037109375, 0.0380859375, 0.0390625, 0.0400390625, 0.041015625, 0.0419921875, 0.04296875, 0.0439453125, 0.044921875, 0.0458984375, 0.046875, 0.0478515625, 0.048828125, 0.0498046875, 0.05078125, 0.0517578125, 0.052734375, 0.0537109375, 0.0546875, 0.0556640625, 0.056640625, 0.0576171875, 0.05859375, 0.0595703125, 0.060546875, 0.0615234375, 0.0625, 0.0634765625, 0.064453125, 0.0654296875, 0.06640625, 0.0673828125, 0.068359375, 0.0693359375, 0.0703125, 0.0712890625, 0.072265625, 0.0732421875, 0.07421875, 0.0751953125, 0.076171875, 0.0771484375, 0.078125, 0.0791015625, 0.080078125, 0.0810546875, 0.08203125, 0.0830078125, 0.083984375, 0.0849609375, 0.0859375, 0.0869140625, 0.087890625, 0.0888671875, 0.08984375, 0.0908203125, 0.091796875, 0.0927734375, 0.09375, 0.0947265625, 0.095703125, 0.0966796875, 0.09765625, 0.0986328125, 0.099609375, 0.1005859375, 0.1015625, 0.1025390625, 0.103515625, 0.1044921875, 0.10546875, 0.1064453125, 0.107421875, 0.1083984375, 0.109375, 0.1103515625, 0.111328125, 0.1123046875, 0.11328125, 0.1142578125, 0.115234375, 0.1162109375, 0.1171875, 0.1181640625, 0.119140625, 0.1201171875, 0.12109375, 0.1220703125, 0.123046875, 0.1240234375, 0.125, 0.1259765625, 0.126953125, 0.1279296875, 0.12890625, 0.1298828125, 0.130859375, 0.1318359375, 0.1328125, 0.1337890625, 0.134765625, 0.1357421875, 0.13671875, 0.1376953125, 0.138671875, 0.1396484375, 0.140625, 0.1416015625, 0.142578125, 0.1435546875, 0.14453125, 0.1455078125, 0.146484375, 0.1474609375, 0.1484375, 0.1494140625, 0.150390625, 0.1513671875, 0.15234375, 0.1533203125, 0.154296875, 0.1552734375, 0.15625, 0.1572265625, 0.158203125, 0.1591796875, 0.16015625, 0.1611328125, 0.162109375, 0.1630859375, 0.1640625, 0.1650390625, 0.166015625, 0.1669921875, 0.16796875, 0.1689453125, 0.169921875, 0.1708984375, 0.171875, 0.1728515625, 0.173828125, 0.1748046875, 0.17578125, 0.1767578125, 0.177734375, 0.1787109375, 0.1796875, 0.1806640625, 0.181640625, 0.1826171875, 0.18359375, 0.1845703125, 0.185546875, 0.1865234375, 0.1875, 0.1884765625, 0.189453125, 0.1904296875, 0.19140625, 0.1923828125, 0.193359375, 0.1943359375, 0.1953125, 0.1962890625, 0.197265625, 0.1982421875, 0.19921875, 0.2001953125, 0.201171875, 0.2021484375, 0.203125, 0.2041015625, 0.205078125, 0.2060546875, 0.20703125, 0.2080078125, 0.208984375, 0.2099609375, 0.2109375, 0.2119140625, 0.212890625, 0.2138671875, 0.21484375, 0.2158203125, 0.216796875, 0.2177734375, 0.21875, 0.2197265625, 0.220703125, 0.2216796875, 0.22265625, 0.2236328125, 0.224609375, 0.2255859375, 0.2265625, 0.2275390625, 0.228515625, 0.2294921875, 0.23046875, 0.2314453125, 0.232421875, 0.2333984375, 0.234375, 0.2353515625, 0.236328125, 0.2373046875, 0.23828125, 0.2392578125, 0.240234375, 0.2412109375, 0.2421875, 0.2431640625, 0.244140625, 0.2451171875, 0.24609375, 0.2470703125, 0.248046875, 0.2490234375, 0.25, 0.2509765625, 0.251953125, 0.2529296875, 0.25390625, 0.2548828125, 0.255859375, 0.2568359375, 0.2578125, 0.2587890625, 0.259765625, 0.2607421875, 0.26171875, 0.2626953125, 0.263671875, 0.2646484375, 0.265625, 0.2666015625, 0.267578125, 0.2685546875, 0.26953125, 0.2705078125, 0.271484375, 0.2724609375, 0.2734375, 0.2744140625, 0.275390625, 0.2763671875, 0.27734375, 0.2783203125, 0.279296875, 0.2802734375, 0.28125, 0.2822265625, 0.283203125, 0.2841796875, 0.28515625, 0.2861328125, 0.287109375, 0.2880859375, 0.2890625, 0.2900390625, 0.291015625, 0.2919921875, 0.29296875, 0.2939453125, 0.294921875, 0.2958984375, 0.296875, 0.2978515625, 0.298828125, 0.2998046875, 0.30078125, 0.3017578125, 0.302734375, 0.3037109375, 0.3046875, 0.3056640625, 0.306640625, 0.3076171875, 0.30859375, 0.3095703125, 0.310546875, 0.3115234375, 0.3125, 0.3134765625, 0.314453125, 0.3154296875, 0.31640625, 0.3173828125, 0.318359375, 0.3193359375, 0.3203125, 0.3212890625, 0.322265625, 0.3232421875, 0.32421875, 0.3251953125, 0.326171875, 0.3271484375, 0.328125, 0.3291015625, 0.330078125, 0.3310546875, 0.33203125, 0.3330078125, 0.333984375, 0.3349609375, 0.3359375, 0.3369140625, 0.337890625, 0.3388671875, 0.33984375, 0.3408203125, 0.341796875, 0.3427734375, 0.34375, 0.3447265625, 0.345703125, 0.3466796875, 0.34765625, 0.3486328125, 0.349609375, 0.3505859375, 0.3515625, 0.3525390625, 0.353515625, 0.3544921875, 0.35546875, 0.3564453125, 0.357421875, 0.3583984375, 0.359375, 0.3603515625, 0.361328125, 0.3623046875, 0.36328125, 0.3642578125, 0.365234375, 0.3662109375, 0.3671875, 0.3681640625, 0.369140625, 0.3701171875, 0.37109375, 0.3720703125, 0.373046875, 0.3740234375, 0.375, 0.3759765625, 0.376953125, 0.3779296875, 0.37890625, 0.3798828125, 0.380859375, 0.3818359375, 0.3828125, 0.3837890625, 0.384765625, 0.3857421875, 0.38671875, 0.3876953125, 0.388671875, 0.3896484375, 0.390625, 0.3916015625, 0.392578125, 0.3935546875, 0.39453125, 0.3955078125, 0.396484375, 0.3974609375, 0.3984375, 0.3994140625, 0.400390625, 0.4013671875, 0.40234375, 0.4033203125, 0.404296875, 0.4052734375, 0.40625, 0.4072265625, 0.408203125, 0.4091796875, 0.41015625, 0.4111328125, 0.412109375, 0.4130859375, 0.4140625, 0.4150390625, 0.416015625, 0.4169921875, 0.41796875, 0.4189453125, 0.419921875, 0.4208984375, 0.421875, 0.4228515625, 0.423828125, 0.4248046875, 0.42578125, 0.4267578125, 0.427734375, 0.4287109375, 0.4296875, 0.4306640625, 0.431640625, 0.4326171875, 0.43359375, 0.4345703125, 0.435546875, 0.4365234375, 0.4375, 0.4384765625, 0.439453125, 0.4404296875, 0.44140625, 0.4423828125, 0.443359375, 0.4443359375, 0.4453125, 0.4462890625, 0.447265625, 0.4482421875, 0.44921875, 0.4501953125, 0.451171875, 0.4521484375, 0.453125, 0.4541015625, 0.455078125, 0.4560546875, 0.45703125, 0.4580078125, 0.458984375, 0.4599609375, 0.4609375, 0.4619140625, 0.462890625, 0.4638671875, 0.46484375, 0.4658203125, 0.466796875, 0.4677734375, 0.46875, 0.4697265625, 0.470703125, 0.4716796875, 0.47265625, 0.4736328125, 0.474609375, 0.4755859375, 0.4765625, 0.4775390625, 0.478515625, 0.4794921875, 0.48046875, 0.4814453125, 0.482421875, 0.4833984375, 0.484375, 0.4853515625, 0.486328125, 0.4873046875, 0.48828125, 0.4892578125, 0.490234375, 0.4912109375, 0.4921875, 0.4931640625, 0.494140625, 0.4951171875, 0.49609375, 0.4970703125, 0.498046875, 0.4990234375, 0.5, 0.5009765625, 0.501953125, 0.5029296875, 0.50390625, 0.5048828125, 0.505859375, 0.5068359375, 0.5078125, 0.5087890625, 0.509765625, 0.5107421875, 0.51171875, 0.5126953125, 0.513671875, 0.5146484375, 0.515625, 0.5166015625, 0.517578125, 0.5185546875, 0.51953125, 0.5205078125, 0.521484375, 0.5224609375, 0.5234375, 0.5244140625, 0.525390625, 0.5263671875, 0.52734375, 0.5283203125, 0.529296875, 0.5302734375, 0.53125, 0.5322265625, 0.533203125, 0.5341796875, 0.53515625, 0.5361328125, 0.537109375, 0.5380859375, 0.5390625, 0.5400390625, 0.541015625, 0.5419921875, 0.54296875, 0.5439453125, 0.544921875, 0.5458984375, 0.546875, 0.5478515625, 0.548828125, 0.5498046875, 0.55078125, 0.5517578125, 0.552734375, 0.5537109375, 0.5546875, 0.5556640625, 0.556640625, 0.5576171875, 0.55859375, 0.5595703125, 0.560546875, 0.5615234375, 0.5625, 0.5634765625, 0.564453125, 0.5654296875, 0.56640625, 0.5673828125, 0.568359375, 0.5693359375, 0.5703125, 0.5712890625, 0.572265625, 0.5732421875, 0.57421875, 0.5751953125, 0.576171875, 0.5771484375, 0.578125, 0.5791015625, 0.580078125, 0.5810546875, 0.58203125, 0.5830078125, 0.583984375, 0.5849609375, 0.5859375, 0.5869140625, 0.587890625, 0.5888671875, 0.58984375, 0.5908203125, 0.591796875, 0.5927734375, 0.59375, 0.5947265625, 0.595703125, 0.5966796875, 0.59765625, 0.5986328125, 0.599609375, 0.6005859375, 0.6015625, 0.6025390625, 0.603515625, 0.6044921875, 0.60546875, 0.6064453125, 0.607421875, 0.6083984375, 0.609375, 0.6103515625, 0.611328125, 0.6123046875, 0.61328125, 0.6142578125, 0.615234375, 0.6162109375, 0.6171875, 0.6181640625, 0.619140625, 0.6201171875, 0.62109375, 0.6220703125, 0.623046875, 0.6240234375, 0.625, 0.6259765625, 0.626953125, 0.6279296875, 0.62890625, 0.6298828125, 0.630859375, 0.6318359375, 0.6328125, 0.6337890625, 0.634765625, 0.6357421875, 0.63671875, 0.6376953125, 0.638671875, 0.6396484375, 0.640625, 0.6416015625, 0.642578125, 0.6435546875, 0.64453125, 0.6455078125, 0.646484375, 0.6474609375, 0.6484375, 0.6494140625, 0.650390625, 0.6513671875, 0.65234375, 0.6533203125, 0.654296875, 0.6552734375, 0.65625, 0.6572265625, 0.658203125, 0.6591796875, 0.66015625, 0.6611328125, 0.662109375, 0.6630859375, 0.6640625, 0.6650390625, 0.666015625, 0.6669921875, 0.66796875, 0.6689453125, 0.669921875, 0.6708984375, 0.671875, 0.6728515625, 0.673828125, 0.6748046875, 0.67578125, 0.6767578125, 0.677734375, 0.6787109375, 0.6796875, 0.6806640625, 0.681640625, 0.6826171875, 0.68359375, 0.6845703125, 0.685546875, 0.6865234375, 0.6875, 0.6884765625, 0.689453125, 0.6904296875, 0.69140625, 0.6923828125, 0.693359375, 0.6943359375, 0.6953125, 0.6962890625, 0.697265625, 0.6982421875, 0.69921875, 0.7001953125, 0.701171875, 0.7021484375, 0.703125, 0.7041015625, 0.705078125, 0.7060546875, 0.70703125, 0.7080078125, 0.708984375, 0.7099609375, 0.7109375, 0.7119140625, 0.712890625, 0.7138671875, 0.71484375, 0.7158203125, 0.716796875, 0.7177734375, 0.71875, 0.7197265625, 0.720703125, 0.7216796875, 0.72265625, 0.7236328125, 0.724609375, 0.7255859375, 0.7265625, 0.7275390625, 0.728515625, 0.7294921875, 0.73046875, 0.7314453125, 0.732421875, 0.7333984375, 0.734375, 0.7353515625, 0.736328125, 0.7373046875, 0.73828125, 0.7392578125, 0.740234375, 0.7412109375, 0.7421875, 0.7431640625, 0.744140625, 0.7451171875, 0.74609375, 0.7470703125, 0.748046875, 0.7490234375, 0.75, 0.7509765625, 0.751953125, 0.7529296875, 0.75390625, 0.7548828125, 0.755859375, 0.7568359375, 0.7578125, 0.7587890625, 0.759765625, 0.7607421875, 0.76171875, 0.7626953125, 0.763671875, 0.7646484375, 0.765625, 0.7666015625, 0.767578125, 0.7685546875, 0.76953125, 0.7705078125, 0.771484375, 0.7724609375, 0.7734375, 0.7744140625, 0.775390625, 0.7763671875, 0.77734375, 0.7783203125, 0.779296875, 0.7802734375, 0.78125, 0.7822265625, 0.783203125, 0.7841796875, 0.78515625, 0.7861328125, 0.787109375, 0.7880859375, 0.7890625, 0.7900390625, 0.791015625, 0.7919921875, 0.79296875, 0.7939453125, 0.794921875, 0.7958984375, 0.796875, 0.7978515625, 0.798828125, 0.7998046875, 0.80078125, 0.8017578125, 0.802734375, 0.8037109375, 0.8046875, 0.8056640625, 0.806640625, 0.8076171875, 0.80859375, 0.8095703125, 0.810546875, 0.8115234375, 0.8125, 0.8134765625, 0.814453125, 0.8154296875, 0.81640625, 0.8173828125, 0.818359375, 0.8193359375, 0.8203125, 0.8212890625, 0.822265625, 0.8232421875, 0.82421875, 0.8251953125, 0.826171875, 0.8271484375, 0.828125, 0.8291015625, 0.830078125, 0.8310546875, 0.83203125, 0.8330078125, 0.833984375, 0.8349609375, 0.8359375, 0.8369140625, 0.837890625, 0.8388671875, 0.83984375, 0.8408203125, 0.841796875, 0.8427734375, 0.84375, 0.8447265625, 0.845703125, 0.8466796875, 0.84765625, 0.8486328125, 0.849609375, 0.8505859375, 0.8515625, 0.8525390625, 0.853515625, 0.8544921875, 0.85546875, 0.8564453125, 0.857421875, 0.8583984375, 0.859375, 0.8603515625, 0.861328125, 0.8623046875, 0.86328125, 0.8642578125, 0.865234375, 0.8662109375, 0.8671875, 0.8681640625, 0.869140625, 0.8701171875, 0.87109375, 0.8720703125, 0.873046875, 0.8740234375, 0.875, 0.8759765625, 0.876953125, 0.8779296875, 0.87890625, 0.8798828125, 0.880859375, 0.8818359375, 0.8828125, 0.8837890625, 0.884765625, 0.8857421875, 0.88671875, 0.8876953125, 0.888671875, 0.8896484375, 0.890625, 0.8916015625, 0.892578125, 0.8935546875, 0.89453125, 0.8955078125, 0.896484375, 0.8974609375, 0.8984375, 0.8994140625, 0.900390625, 0.9013671875, 0.90234375, 0.9033203125, 0.904296875, 0.9052734375, 0.90625, 0.9072265625, 0.908203125, 0.9091796875, 0.91015625, 0.9111328125, 0.912109375, 0.9130859375, 0.9140625, 0.9150390625, 0.916015625, 0.9169921875, 0.91796875, 0.9189453125, 0.919921875, 0.9208984375, 0.921875, 0.9228515625, 0.923828125, 0.9248046875, 0.92578125, 0.9267578125, 0.927734375, 0.9287109375, 0.9296875, 0.9306640625, 0.931640625, 0.9326171875, 0.93359375, 0.9345703125, 0.935546875, 0.9365234375, 0.9375, 0.9384765625, 0.939453125, 0.9404296875, 0.94140625, 0.9423828125, 0.943359375, 0.9443359375, 0.9453125, 0.9462890625, 0.947265625, 0.9482421875, 0.94921875, 0.9501953125, 0.951171875, 0.9521484375, 0.953125, 0.9541015625, 0.955078125, 0.9560546875, 0.95703125, 0.9580078125, 0.958984375, 0.9599609375, 0.9609375, 0.9619140625, 0.962890625, 0.9638671875, 0.96484375, 0.9658203125, 0.966796875, 0.9677734375, 0.96875, 0.9697265625, 0.970703125, 0.9716796875, 0.97265625, 0.9736328125, 0.974609375, 0.9755859375, 0.9765625, 0.9775390625, 0.978515625, 0.9794921875, 0.98046875, 0.9814453125, 0.982421875, 0.9833984375, 0.984375, 0.9853515625, 0.986328125, 0.9873046875, 0.98828125, 0.9892578125, 0.990234375, 0.9912109375, 0.9921875, 0.9931640625, 0.994140625, 0.9951171875, 0.99609375, 0.9970703125, 0.998046875, 0.9990234375, 1.0], "decode_rule": "dirichlet_resample", "support_power": 1.0, "semantic_power": 1.0, "anchor_mode": "onehot", "cfg_scale": 0.0, "cfg_power": 1.0, "cfg_start": 0.0, "cfg_prior": "uniform", "decode_freq_penalty_alpha": 0.0, "decode_freq_penalty_beta": 0.0, "decode_freq_penalty_floor": 0.0, "decode_freq_penalty_start": 0.0, "decode_freq_penalty_end": 1.0, "decode_freq_penalty_power": 1.0, "start_t": 0.0, "start_init": "noise", "noise_init": "dirichlet", "noise_sigma": -1.0, "dirichlet_concentration": 1.0, "concentration_min": 1.0, "concentration_max": 128.0, "target_prob": 1.0, "endpoint_temp": 1.8, "final_from": "state", "final_sample_mode": "argmax", "final_sample_temp": 1.0, "final_top_k": 64, "final_top_p": 0.95, "final_freq_penalty_alpha": 0.0, "final_freq_penalty_beta": 0.0, "final_freq_penalty_floor": 0.0, "lock_bos": false, "n_samples": 8, "seed": 20260519}, "raw_genppl": {"ppl": 15174.468645389643, "nll_per_token": 9.627369600183824, "tokens": 2040, "kept_samples": 8, "total_samples": 8, "empty_rate": 0.0, "skipped_samples": 0}, "stripped_genppl": {"ppl": 15291.225085679736, "nll_per_token": 9.635034419041054, "tokens": 2040, "kept_samples": 8, "total_samples": 8, "empty_rate": 0.0, "skipped_samples": 0}, "diversity": {"sample_entropy": 6.7776062571987445, "unique_tokens": 5795, "token_count": 8192, "distinct_1": 0.7073974609375, "distinct_2": 0.9993890518084066, "top_token_mass": 0.0107421875}}
|
| 26 |
+
[done] docs/lta_samples/metrics_20260519/owt_t5_2node_latest_trainmatched_dirres_grid_n8/cmax128_t1p80.jsonl
|
| 27 |
+
[ckpt] eval_ckpts/lta_owt_t5_adaln_adamw_wd0p1_rollin_p50_randk0_3_uniformt_temp1_synct_gbs512_2node8gpu_1m_t-20260518224737-tftgw/latest_frozen_step101200_20260519_092441.pt step=101000
|
| 28 |
+
[decode-base] n=8 max_len=1024 steps=1024 model_t=post
|
| 29 |
+
[decode-time] schedule=linear s=[0.0,0.25] force_final=True t0=0.000000 t_mid=0.500000 t_end=1.000000 dt_mean=0.000977 dt_max=0.000977
|
| 30 |
+
[decode] temp=2.00 final=state rule=dirichlet_resample support=1 semantic=1 anchor=onehot cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=argmax/1/k64/p0.95 freq_penalty=0/0/0 start_t=0 start_init=noise time_path=0.0000->1.0000 generated 1/8
|
| 31 |
+
[decode] temp=2.00 final=state rule=dirichlet_resample support=1 semantic=1 anchor=onehot cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=argmax/1/k64/p0.95 freq_penalty=0/0/0 start_t=0 start_init=noise time_path=0.0000->1.0000 generated 2/8
|
| 32 |
+
[decode] temp=2.00 final=state rule=dirichlet_resample support=1 semantic=1 anchor=onehot cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=argmax/1/k64/p0.95 freq_penalty=0/0/0 start_t=0 start_init=noise time_path=0.0000->1.0000 generated 3/8
|
LTA_openwebtext_dualt/logs/infer_owt_t5_2node_step290000_compare_n8_20260520_200659.log
ADDED
|
@@ -0,0 +1,44 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[infer] src=runs/lta_owt_t5_adaln_adamw_wd0p1_rollin_p50_randk0_3_uniformt_temp1_synct_gbs512_2node8gpu_1m_t-20260518224737-tftgw/latest.pt
|
| 2 |
+
[infer] frozen=eval_ckpts/lta_owt_t5_adaln_adamw_wd0p1_rollin_p50_randk0_3_uniformt_temp1_synct_gbs512_2node8gpu_1m_t-20260518224737-tftgw/latest_frozen_step290000_20260520_200659.pt step=290000
|
| 3 |
+
[infer] tag=dual_state_c1024_t1p45 rule=dual_line_resample anchor=state cmax=1024 temp=1.45 out=docs/lta_samples/metrics_20260520/owt_t5_2node_step290000_infer_compare_n8/dual_state_c1024_t1p45.jsonl
|
| 4 |
+
[ckpt] eval_ckpts/lta_owt_t5_adaln_adamw_wd0p1_rollin_p50_randk0_3_uniformt_temp1_synct_gbs512_2node8gpu_1m_t-20260518224737-tftgw/latest_frozen_step290000_20260520_200659.pt step=290000
|
| 5 |
+
[decode-base] n=8 max_len=1024 steps=1024 model_t=post
|
| 6 |
+
[decode-time] schedule=linear s=[0.0,0.25] gumbel=(2.2,0.8) force_final=True t0=0.000000 t_mid=0.500000 t_end=1.000000 dt_mean=0.000977 dt_max=0.000977
|
| 7 |
+
[decode] temp=1.45 final=state rule=dual_line_resample support=1 semantic=1 anchor=state cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=argmax/1/k64/p0.95 freq_penalty=0/0/0 start_t=0 start_init=noise time_path=0.0000->1.0000 generated 1/8
|
| 8 |
+
[decode] temp=1.45 final=state rule=dual_line_resample support=1 semantic=1 anchor=state cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=argmax/1/k64/p0.95 freq_penalty=0/0/0 start_t=0 start_init=noise time_path=0.0000->1.0000 generated 2/8
|
| 9 |
+
[decode] temp=1.45 final=state rule=dual_line_resample support=1 semantic=1 anchor=state cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=argmax/1/k64/p0.95 freq_penalty=0/0/0 start_t=0 start_init=noise time_path=0.0000->1.0000 generated 3/8
|
| 10 |
+
[decode] temp=1.45 final=state rule=dual_line_resample support=1 semantic=1 anchor=state cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=argmax/1/k64/p0.95 freq_penalty=0/0/0 start_t=0 start_init=noise time_path=0.0000->1.0000 generated 4/8
|
| 11 |
+
[decode] temp=1.45 final=state rule=dual_line_resample support=1 semantic=1 anchor=state cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=argmax/1/k64/p0.95 freq_penalty=0/0/0 start_t=0 start_init=noise time_path=0.0000->1.0000 generated 5/8
|
| 12 |
+
[decode] temp=1.45 final=state rule=dual_line_resample support=1 semantic=1 anchor=state cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=argmax/1/k64/p0.95 freq_penalty=0/0/0 start_t=0 start_init=noise time_path=0.0000->1.0000 generated 6/8
|
| 13 |
+
[decode] temp=1.45 final=state rule=dual_line_resample support=1 semantic=1 anchor=state cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=argmax/1/k64/p0.95 freq_penalty=0/0/0 start_t=0 start_init=noise time_path=0.0000->1.0000 generated 7/8
|
| 14 |
+
[decode] temp=1.45 final=state rule=dual_line_resample support=1 semantic=1 anchor=state cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=argmax/1/k64/p0.95 freq_penalty=0/0/0 start_t=0 start_init=noise time_path=0.0000->1.0000 generated 8/8
|
| 15 |
+
[summary] {"type": "summary", "checkpoint": "eval_ckpts/lta_owt_t5_adaln_adamw_wd0p1_rollin_p50_randk0_3_uniformt_temp1_synct_gbs512_2node8gpu_1m_t-20260518224737-tftgw/latest_frozen_step290000_20260520_200659.pt", "step": 290000, "decode": {"steps": 1024, "model_t_mode": "post", "decode_time_schedule": "linear", "decode_s_min_frac": 0.0, "decode_s_max_frac": 0.25, "decode_force_final_t": true, "decode_time_grid": [0.0, 0.0009765625, 0.001953125, 0.0029296875, 0.00390625, 0.0048828125, 0.005859375, 0.0068359375, 0.0078125, 0.0087890625, 0.009765625, 0.0107421875, 0.01171875, 0.0126953125, 0.013671875, 0.0146484375, 0.015625, 0.0166015625, 0.017578125, 0.0185546875, 0.01953125, 0.0205078125, 0.021484375, 0.0224609375, 0.0234375, 0.0244140625, 0.025390625, 0.0263671875, 0.02734375, 0.0283203125, 0.029296875, 0.0302734375, 0.03125, 0.0322265625, 0.033203125, 0.0341796875, 0.03515625, 0.0361328125, 0.037109375, 0.0380859375, 0.0390625, 0.0400390625, 0.041015625, 0.0419921875, 0.04296875, 0.0439453125, 0.044921875, 0.0458984375, 0.046875, 0.0478515625, 0.048828125, 0.0498046875, 0.05078125, 0.0517578125, 0.052734375, 0.0537109375, 0.0546875, 0.0556640625, 0.056640625, 0.0576171875, 0.05859375, 0.0595703125, 0.060546875, 0.0615234375, 0.0625, 0.0634765625, 0.064453125, 0.0654296875, 0.06640625, 0.0673828125, 0.068359375, 0.0693359375, 0.0703125, 0.0712890625, 0.072265625, 0.0732421875, 0.07421875, 0.0751953125, 0.076171875, 0.0771484375, 0.078125, 0.0791015625, 0.080078125, 0.0810546875, 0.08203125, 0.0830078125, 0.083984375, 0.0849609375, 0.0859375, 0.0869140625, 0.087890625, 0.0888671875, 0.08984375, 0.0908203125, 0.091796875, 0.0927734375, 0.09375, 0.0947265625, 0.095703125, 0.0966796875, 0.09765625, 0.0986328125, 0.099609375, 0.1005859375, 0.1015625, 0.1025390625, 0.103515625, 0.1044921875, 0.10546875, 0.1064453125, 0.107421875, 0.1083984375, 0.109375, 0.1103515625, 0.111328125, 0.1123046875, 0.11328125, 0.1142578125, 0.115234375, 0.1162109375, 0.1171875, 0.1181640625, 0.119140625, 0.1201171875, 0.12109375, 0.1220703125, 0.123046875, 0.1240234375, 0.125, 0.1259765625, 0.126953125, 0.1279296875, 0.12890625, 0.1298828125, 0.130859375, 0.1318359375, 0.1328125, 0.1337890625, 0.134765625, 0.1357421875, 0.13671875, 0.1376953125, 0.138671875, 0.1396484375, 0.140625, 0.1416015625, 0.142578125, 0.1435546875, 0.14453125, 0.1455078125, 0.146484375, 0.1474609375, 0.1484375, 0.1494140625, 0.150390625, 0.1513671875, 0.15234375, 0.1533203125, 0.154296875, 0.1552734375, 0.15625, 0.1572265625, 0.158203125, 0.1591796875, 0.16015625, 0.1611328125, 0.162109375, 0.1630859375, 0.1640625, 0.1650390625, 0.166015625, 0.1669921875, 0.16796875, 0.1689453125, 0.169921875, 0.1708984375, 0.171875, 0.1728515625, 0.173828125, 0.1748046875, 0.17578125, 0.1767578125, 0.177734375, 0.1787109375, 0.1796875, 0.1806640625, 0.181640625, 0.1826171875, 0.18359375, 0.1845703125, 0.185546875, 0.1865234375, 0.1875, 0.1884765625, 0.189453125, 0.1904296875, 0.19140625, 0.1923828125, 0.193359375, 0.1943359375, 0.1953125, 0.1962890625, 0.197265625, 0.1982421875, 0.19921875, 0.2001953125, 0.201171875, 0.2021484375, 0.203125, 0.2041015625, 0.205078125, 0.2060546875, 0.20703125, 0.2080078125, 0.208984375, 0.2099609375, 0.2109375, 0.2119140625, 0.212890625, 0.2138671875, 0.21484375, 0.2158203125, 0.216796875, 0.2177734375, 0.21875, 0.2197265625, 0.220703125, 0.2216796875, 0.22265625, 0.2236328125, 0.224609375, 0.2255859375, 0.2265625, 0.2275390625, 0.228515625, 0.2294921875, 0.23046875, 0.2314453125, 0.232421875, 0.2333984375, 0.234375, 0.2353515625, 0.236328125, 0.2373046875, 0.23828125, 0.2392578125, 0.240234375, 0.2412109375, 0.2421875, 0.2431640625, 0.244140625, 0.2451171875, 0.24609375, 0.2470703125, 0.248046875, 0.2490234375, 0.25, 0.2509765625, 0.251953125, 0.2529296875, 0.25390625, 0.2548828125, 0.255859375, 0.2568359375, 0.2578125, 0.2587890625, 0.259765625, 0.2607421875, 0.26171875, 0.2626953125, 0.263671875, 0.2646484375, 0.265625, 0.2666015625, 0.267578125, 0.2685546875, 0.26953125, 0.2705078125, 0.271484375, 0.2724609375, 0.2734375, 0.2744140625, 0.275390625, 0.2763671875, 0.27734375, 0.2783203125, 0.279296875, 0.2802734375, 0.28125, 0.2822265625, 0.283203125, 0.2841796875, 0.28515625, 0.2861328125, 0.287109375, 0.2880859375, 0.2890625, 0.2900390625, 0.291015625, 0.2919921875, 0.29296875, 0.2939453125, 0.294921875, 0.2958984375, 0.296875, 0.2978515625, 0.298828125, 0.2998046875, 0.30078125, 0.3017578125, 0.302734375, 0.3037109375, 0.3046875, 0.3056640625, 0.306640625, 0.3076171875, 0.30859375, 0.3095703125, 0.310546875, 0.3115234375, 0.3125, 0.3134765625, 0.314453125, 0.3154296875, 0.31640625, 0.3173828125, 0.318359375, 0.3193359375, 0.3203125, 0.3212890625, 0.322265625, 0.3232421875, 0.32421875, 0.3251953125, 0.326171875, 0.3271484375, 0.328125, 0.3291015625, 0.330078125, 0.3310546875, 0.33203125, 0.3330078125, 0.333984375, 0.3349609375, 0.3359375, 0.3369140625, 0.337890625, 0.3388671875, 0.33984375, 0.3408203125, 0.341796875, 0.3427734375, 0.34375, 0.3447265625, 0.345703125, 0.3466796875, 0.34765625, 0.3486328125, 0.349609375, 0.3505859375, 0.3515625, 0.3525390625, 0.353515625, 0.3544921875, 0.35546875, 0.3564453125, 0.357421875, 0.3583984375, 0.359375, 0.3603515625, 0.361328125, 0.3623046875, 0.36328125, 0.3642578125, 0.365234375, 0.3662109375, 0.3671875, 0.3681640625, 0.369140625, 0.3701171875, 0.37109375, 0.3720703125, 0.373046875, 0.3740234375, 0.375, 0.3759765625, 0.376953125, 0.3779296875, 0.37890625, 0.3798828125, 0.380859375, 0.3818359375, 0.3828125, 0.3837890625, 0.384765625, 0.3857421875, 0.38671875, 0.3876953125, 0.388671875, 0.3896484375, 0.390625, 0.3916015625, 0.392578125, 0.3935546875, 0.39453125, 0.3955078125, 0.396484375, 0.3974609375, 0.3984375, 0.3994140625, 0.400390625, 0.4013671875, 0.40234375, 0.4033203125, 0.404296875, 0.4052734375, 0.40625, 0.4072265625, 0.408203125, 0.4091796875, 0.41015625, 0.4111328125, 0.412109375, 0.4130859375, 0.4140625, 0.4150390625, 0.416015625, 0.4169921875, 0.41796875, 0.4189453125, 0.419921875, 0.4208984375, 0.421875, 0.4228515625, 0.423828125, 0.4248046875, 0.42578125, 0.4267578125, 0.427734375, 0.4287109375, 0.4296875, 0.4306640625, 0.431640625, 0.4326171875, 0.43359375, 0.4345703125, 0.435546875, 0.4365234375, 0.4375, 0.4384765625, 0.439453125, 0.4404296875, 0.44140625, 0.4423828125, 0.443359375, 0.4443359375, 0.4453125, 0.4462890625, 0.447265625, 0.4482421875, 0.44921875, 0.4501953125, 0.451171875, 0.4521484375, 0.453125, 0.4541015625, 0.455078125, 0.4560546875, 0.45703125, 0.4580078125, 0.458984375, 0.4599609375, 0.4609375, 0.4619140625, 0.462890625, 0.4638671875, 0.46484375, 0.4658203125, 0.466796875, 0.4677734375, 0.46875, 0.4697265625, 0.470703125, 0.4716796875, 0.47265625, 0.4736328125, 0.474609375, 0.4755859375, 0.4765625, 0.4775390625, 0.478515625, 0.4794921875, 0.48046875, 0.4814453125, 0.482421875, 0.4833984375, 0.484375, 0.4853515625, 0.486328125, 0.4873046875, 0.48828125, 0.4892578125, 0.490234375, 0.4912109375, 0.4921875, 0.4931640625, 0.494140625, 0.4951171875, 0.49609375, 0.4970703125, 0.498046875, 0.4990234375, 0.5, 0.5009765625, 0.501953125, 0.5029296875, 0.50390625, 0.5048828125, 0.505859375, 0.5068359375, 0.5078125, 0.5087890625, 0.509765625, 0.5107421875, 0.51171875, 0.5126953125, 0.513671875, 0.5146484375, 0.515625, 0.5166015625, 0.517578125, 0.5185546875, 0.51953125, 0.5205078125, 0.521484375, 0.5224609375, 0.5234375, 0.5244140625, 0.525390625, 0.5263671875, 0.52734375, 0.5283203125, 0.529296875, 0.5302734375, 0.53125, 0.5322265625, 0.533203125, 0.5341796875, 0.53515625, 0.5361328125, 0.537109375, 0.5380859375, 0.5390625, 0.5400390625, 0.541015625, 0.5419921875, 0.54296875, 0.5439453125, 0.544921875, 0.5458984375, 0.546875, 0.5478515625, 0.548828125, 0.5498046875, 0.55078125, 0.5517578125, 0.552734375, 0.5537109375, 0.5546875, 0.5556640625, 0.556640625, 0.5576171875, 0.55859375, 0.5595703125, 0.560546875, 0.5615234375, 0.5625, 0.5634765625, 0.564453125, 0.5654296875, 0.56640625, 0.5673828125, 0.568359375, 0.5693359375, 0.5703125, 0.5712890625, 0.572265625, 0.5732421875, 0.57421875, 0.5751953125, 0.576171875, 0.5771484375, 0.578125, 0.5791015625, 0.580078125, 0.5810546875, 0.58203125, 0.5830078125, 0.583984375, 0.5849609375, 0.5859375, 0.5869140625, 0.587890625, 0.5888671875, 0.58984375, 0.5908203125, 0.591796875, 0.5927734375, 0.59375, 0.5947265625, 0.595703125, 0.5966796875, 0.59765625, 0.5986328125, 0.599609375, 0.6005859375, 0.6015625, 0.6025390625, 0.603515625, 0.6044921875, 0.60546875, 0.6064453125, 0.607421875, 0.6083984375, 0.609375, 0.6103515625, 0.611328125, 0.6123046875, 0.61328125, 0.6142578125, 0.615234375, 0.6162109375, 0.6171875, 0.6181640625, 0.619140625, 0.6201171875, 0.62109375, 0.6220703125, 0.623046875, 0.6240234375, 0.625, 0.6259765625, 0.626953125, 0.6279296875, 0.62890625, 0.6298828125, 0.630859375, 0.6318359375, 0.6328125, 0.6337890625, 0.634765625, 0.6357421875, 0.63671875, 0.6376953125, 0.638671875, 0.6396484375, 0.640625, 0.6416015625, 0.642578125, 0.6435546875, 0.64453125, 0.6455078125, 0.646484375, 0.6474609375, 0.6484375, 0.6494140625, 0.650390625, 0.6513671875, 0.65234375, 0.6533203125, 0.654296875, 0.6552734375, 0.65625, 0.6572265625, 0.658203125, 0.6591796875, 0.66015625, 0.6611328125, 0.662109375, 0.6630859375, 0.6640625, 0.6650390625, 0.666015625, 0.6669921875, 0.66796875, 0.6689453125, 0.669921875, 0.6708984375, 0.671875, 0.6728515625, 0.673828125, 0.6748046875, 0.67578125, 0.6767578125, 0.677734375, 0.6787109375, 0.6796875, 0.6806640625, 0.681640625, 0.6826171875, 0.68359375, 0.6845703125, 0.685546875, 0.6865234375, 0.6875, 0.6884765625, 0.689453125, 0.6904296875, 0.69140625, 0.6923828125, 0.693359375, 0.6943359375, 0.6953125, 0.6962890625, 0.697265625, 0.6982421875, 0.69921875, 0.7001953125, 0.701171875, 0.7021484375, 0.703125, 0.7041015625, 0.705078125, 0.7060546875, 0.70703125, 0.7080078125, 0.708984375, 0.7099609375, 0.7109375, 0.7119140625, 0.712890625, 0.7138671875, 0.71484375, 0.7158203125, 0.716796875, 0.7177734375, 0.71875, 0.7197265625, 0.720703125, 0.7216796875, 0.72265625, 0.7236328125, 0.724609375, 0.7255859375, 0.7265625, 0.7275390625, 0.728515625, 0.7294921875, 0.73046875, 0.7314453125, 0.732421875, 0.7333984375, 0.734375, 0.7353515625, 0.736328125, 0.7373046875, 0.73828125, 0.7392578125, 0.740234375, 0.7412109375, 0.7421875, 0.7431640625, 0.744140625, 0.7451171875, 0.74609375, 0.7470703125, 0.748046875, 0.7490234375, 0.75, 0.7509765625, 0.751953125, 0.7529296875, 0.75390625, 0.7548828125, 0.755859375, 0.7568359375, 0.7578125, 0.7587890625, 0.759765625, 0.7607421875, 0.76171875, 0.7626953125, 0.763671875, 0.7646484375, 0.765625, 0.7666015625, 0.767578125, 0.7685546875, 0.76953125, 0.7705078125, 0.771484375, 0.7724609375, 0.7734375, 0.7744140625, 0.775390625, 0.7763671875, 0.77734375, 0.7783203125, 0.779296875, 0.7802734375, 0.78125, 0.7822265625, 0.783203125, 0.7841796875, 0.78515625, 0.7861328125, 0.787109375, 0.7880859375, 0.7890625, 0.7900390625, 0.791015625, 0.7919921875, 0.79296875, 0.7939453125, 0.794921875, 0.7958984375, 0.796875, 0.7978515625, 0.798828125, 0.7998046875, 0.80078125, 0.8017578125, 0.802734375, 0.8037109375, 0.8046875, 0.8056640625, 0.806640625, 0.8076171875, 0.80859375, 0.8095703125, 0.810546875, 0.8115234375, 0.8125, 0.8134765625, 0.814453125, 0.8154296875, 0.81640625, 0.8173828125, 0.818359375, 0.8193359375, 0.8203125, 0.8212890625, 0.822265625, 0.8232421875, 0.82421875, 0.8251953125, 0.826171875, 0.8271484375, 0.828125, 0.8291015625, 0.830078125, 0.8310546875, 0.83203125, 0.8330078125, 0.833984375, 0.8349609375, 0.8359375, 0.8369140625, 0.837890625, 0.8388671875, 0.83984375, 0.8408203125, 0.841796875, 0.8427734375, 0.84375, 0.8447265625, 0.845703125, 0.8466796875, 0.84765625, 0.8486328125, 0.849609375, 0.8505859375, 0.8515625, 0.8525390625, 0.853515625, 0.8544921875, 0.85546875, 0.8564453125, 0.857421875, 0.8583984375, 0.859375, 0.8603515625, 0.861328125, 0.8623046875, 0.86328125, 0.8642578125, 0.865234375, 0.8662109375, 0.8671875, 0.8681640625, 0.869140625, 0.8701171875, 0.87109375, 0.8720703125, 0.873046875, 0.8740234375, 0.875, 0.8759765625, 0.876953125, 0.8779296875, 0.87890625, 0.8798828125, 0.880859375, 0.8818359375, 0.8828125, 0.8837890625, 0.884765625, 0.8857421875, 0.88671875, 0.8876953125, 0.888671875, 0.8896484375, 0.890625, 0.8916015625, 0.892578125, 0.8935546875, 0.89453125, 0.8955078125, 0.896484375, 0.8974609375, 0.8984375, 0.8994140625, 0.900390625, 0.9013671875, 0.90234375, 0.9033203125, 0.904296875, 0.9052734375, 0.90625, 0.9072265625, 0.908203125, 0.9091796875, 0.91015625, 0.9111328125, 0.912109375, 0.9130859375, 0.9140625, 0.9150390625, 0.916015625, 0.9169921875, 0.91796875, 0.9189453125, 0.919921875, 0.9208984375, 0.921875, 0.9228515625, 0.923828125, 0.9248046875, 0.92578125, 0.9267578125, 0.927734375, 0.9287109375, 0.9296875, 0.9306640625, 0.931640625, 0.9326171875, 0.93359375, 0.9345703125, 0.935546875, 0.9365234375, 0.9375, 0.9384765625, 0.939453125, 0.9404296875, 0.94140625, 0.9423828125, 0.943359375, 0.9443359375, 0.9453125, 0.9462890625, 0.947265625, 0.9482421875, 0.94921875, 0.9501953125, 0.951171875, 0.9521484375, 0.953125, 0.9541015625, 0.955078125, 0.9560546875, 0.95703125, 0.9580078125, 0.958984375, 0.9599609375, 0.9609375, 0.9619140625, 0.962890625, 0.9638671875, 0.96484375, 0.9658203125, 0.966796875, 0.9677734375, 0.96875, 0.9697265625, 0.970703125, 0.9716796875, 0.97265625, 0.9736328125, 0.974609375, 0.9755859375, 0.9765625, 0.9775390625, 0.978515625, 0.9794921875, 0.98046875, 0.9814453125, 0.982421875, 0.9833984375, 0.984375, 0.9853515625, 0.986328125, 0.9873046875, 0.98828125, 0.9892578125, 0.990234375, 0.9912109375, 0.9921875, 0.9931640625, 0.994140625, 0.9951171875, 0.99609375, 0.9970703125, 0.998046875, 0.9990234375, 1.0], "decode_rule": "dual_line_resample", "support_power": 1.0, "semantic_power": 1.0, "anchor_mode": "state", "cfg_scale": 0.0, "cfg_power": 1.0, "cfg_start": 0.0, "cfg_prior": "uniform", "decode_freq_penalty_alpha": 0.0, "decode_freq_penalty_beta": 0.0, "decode_freq_penalty_floor": 0.0, "decode_freq_penalty_start": 0.0, "decode_freq_penalty_end": 1.0, "decode_freq_penalty_power": 1.0, "start_t": 0.0, "start_init": "noise", "noise_init": "dirichlet", "noise_sigma": -1.0, "dirichlet_concentration": 1.0, "concentration_min": 1.0, "concentration_max": 1024.0, "target_prob": 1.0, "endpoint_temp": 1.45, "final_from": "state", "final_sample_mode": "argmax", "final_sample_temp": 1.0, "final_top_k": 64, "final_top_p": 0.95, "final_freq_penalty_alpha": 0.0, "final_freq_penalty_beta": 0.0, "final_freq_penalty_floor": 0.0, "lock_bos": false, "n_samples": 8, "seed": 20260520}, "raw_genppl": {"ppl": 49.921613025080326, "nll_per_token": 3.9104540357402726, "tokens": 2040, "kept_samples": 8, "total_samples": 8, "empty_rate": 0.0, "skipped_samples": 0}, "stripped_genppl": {"ppl": 80.28644877394166, "nll_per_token": 4.385600849226409, "tokens": 2040, "kept_samples": 8, "total_samples": 8, "empty_rate": 0.0, "skipped_samples": 0}, "diversity": {"sample_entropy": 1.834606444575173, "unique_tokens": 256, "token_count": 8192, "distinct_1": 0.03125, "distinct_2": 0.08870967741935484, "top_token_mass": 0.475830078125}}
|
| 16 |
+
[done] docs/lta_samples/metrics_20260520/owt_t5_2node_step290000_infer_compare_n8/dual_state_c1024_t1p45.jsonl
|
| 17 |
+
[infer] tag=dual_onehot_c1024_t1p45 rule=dual_line_resample anchor=onehot cmax=1024 temp=1.45 out=docs/lta_samples/metrics_20260520/owt_t5_2node_step290000_infer_compare_n8/dual_onehot_c1024_t1p45.jsonl
|
| 18 |
+
[ckpt] eval_ckpts/lta_owt_t5_adaln_adamw_wd0p1_rollin_p50_randk0_3_uniformt_temp1_synct_gbs512_2node8gpu_1m_t-20260518224737-tftgw/latest_frozen_step290000_20260520_200659.pt step=290000
|
| 19 |
+
[decode-base] n=8 max_len=1024 steps=1024 model_t=post
|
| 20 |
+
[decode-time] schedule=linear s=[0.0,0.25] gumbel=(2.2,0.8) force_final=True t0=0.000000 t_mid=0.500000 t_end=1.000000 dt_mean=0.000977 dt_max=0.000977
|
| 21 |
+
[decode] temp=1.45 final=state rule=dual_line_resample support=1 semantic=1 anchor=onehot cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=argmax/1/k64/p0.95 freq_penalty=0/0/0 start_t=0 start_init=noise time_path=0.0000->1.0000 generated 1/8
|
| 22 |
+
[decode] temp=1.45 final=state rule=dual_line_resample support=1 semantic=1 anchor=onehot cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=argmax/1/k64/p0.95 freq_penalty=0/0/0 start_t=0 start_init=noise time_path=0.0000->1.0000 generated 2/8
|
| 23 |
+
[decode] temp=1.45 final=state rule=dual_line_resample support=1 semantic=1 anchor=onehot cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=argmax/1/k64/p0.95 freq_penalty=0/0/0 start_t=0 start_init=noise time_path=0.0000->1.0000 generated 3/8
|
| 24 |
+
[decode] temp=1.45 final=state rule=dual_line_resample support=1 semantic=1 anchor=onehot cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=argmax/1/k64/p0.95 freq_penalty=0/0/0 start_t=0 start_init=noise time_path=0.0000->1.0000 generated 4/8
|
| 25 |
+
[decode] temp=1.45 final=state rule=dual_line_resample support=1 semantic=1 anchor=onehot cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=argmax/1/k64/p0.95 freq_penalty=0/0/0 start_t=0 start_init=noise time_path=0.0000->1.0000 generated 5/8
|
| 26 |
+
[decode] temp=1.45 final=state rule=dual_line_resample support=1 semantic=1 anchor=onehot cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=argmax/1/k64/p0.95 freq_penalty=0/0/0 start_t=0 start_init=noise time_path=0.0000->1.0000 generated 6/8
|
| 27 |
+
[decode] temp=1.45 final=state rule=dual_line_resample support=1 semantic=1 anchor=onehot cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=argmax/1/k64/p0.95 freq_penalty=0/0/0 start_t=0 start_init=noise time_path=0.0000->1.0000 generated 7/8
|
| 28 |
+
[decode] temp=1.45 final=state rule=dual_line_resample support=1 semantic=1 anchor=onehot cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=argmax/1/k64/p0.95 freq_penalty=0/0/0 start_t=0 start_init=noise time_path=0.0000->1.0000 generated 8/8
|
| 29 |
+
[summary] {"type": "summary", "checkpoint": "eval_ckpts/lta_owt_t5_adaln_adamw_wd0p1_rollin_p50_randk0_3_uniformt_temp1_synct_gbs512_2node8gpu_1m_t-20260518224737-tftgw/latest_frozen_step290000_20260520_200659.pt", "step": 290000, "decode": {"steps": 1024, "model_t_mode": "post", "decode_time_schedule": "linear", "decode_s_min_frac": 0.0, "decode_s_max_frac": 0.25, "decode_force_final_t": true, "decode_time_grid": [0.0, 0.0009765625, 0.001953125, 0.0029296875, 0.00390625, 0.0048828125, 0.005859375, 0.0068359375, 0.0078125, 0.0087890625, 0.009765625, 0.0107421875, 0.01171875, 0.0126953125, 0.013671875, 0.0146484375, 0.015625, 0.0166015625, 0.017578125, 0.0185546875, 0.01953125, 0.0205078125, 0.021484375, 0.0224609375, 0.0234375, 0.0244140625, 0.025390625, 0.0263671875, 0.02734375, 0.0283203125, 0.029296875, 0.0302734375, 0.03125, 0.0322265625, 0.033203125, 0.0341796875, 0.03515625, 0.0361328125, 0.037109375, 0.0380859375, 0.0390625, 0.0400390625, 0.041015625, 0.0419921875, 0.04296875, 0.0439453125, 0.044921875, 0.0458984375, 0.046875, 0.0478515625, 0.048828125, 0.0498046875, 0.05078125, 0.0517578125, 0.052734375, 0.0537109375, 0.0546875, 0.0556640625, 0.056640625, 0.0576171875, 0.05859375, 0.0595703125, 0.060546875, 0.0615234375, 0.0625, 0.0634765625, 0.064453125, 0.0654296875, 0.06640625, 0.0673828125, 0.068359375, 0.0693359375, 0.0703125, 0.0712890625, 0.072265625, 0.0732421875, 0.07421875, 0.0751953125, 0.076171875, 0.0771484375, 0.078125, 0.0791015625, 0.080078125, 0.0810546875, 0.08203125, 0.0830078125, 0.083984375, 0.0849609375, 0.0859375, 0.0869140625, 0.087890625, 0.0888671875, 0.08984375, 0.0908203125, 0.091796875, 0.0927734375, 0.09375, 0.0947265625, 0.095703125, 0.0966796875, 0.09765625, 0.0986328125, 0.099609375, 0.1005859375, 0.1015625, 0.1025390625, 0.103515625, 0.1044921875, 0.10546875, 0.1064453125, 0.107421875, 0.1083984375, 0.109375, 0.1103515625, 0.111328125, 0.1123046875, 0.11328125, 0.1142578125, 0.115234375, 0.1162109375, 0.1171875, 0.1181640625, 0.119140625, 0.1201171875, 0.12109375, 0.1220703125, 0.123046875, 0.1240234375, 0.125, 0.1259765625, 0.126953125, 0.1279296875, 0.12890625, 0.1298828125, 0.130859375, 0.1318359375, 0.1328125, 0.1337890625, 0.134765625, 0.1357421875, 0.13671875, 0.1376953125, 0.138671875, 0.1396484375, 0.140625, 0.1416015625, 0.142578125, 0.1435546875, 0.14453125, 0.1455078125, 0.146484375, 0.1474609375, 0.1484375, 0.1494140625, 0.150390625, 0.1513671875, 0.15234375, 0.1533203125, 0.154296875, 0.1552734375, 0.15625, 0.1572265625, 0.158203125, 0.1591796875, 0.16015625, 0.1611328125, 0.162109375, 0.1630859375, 0.1640625, 0.1650390625, 0.166015625, 0.1669921875, 0.16796875, 0.1689453125, 0.169921875, 0.1708984375, 0.171875, 0.1728515625, 0.173828125, 0.1748046875, 0.17578125, 0.1767578125, 0.177734375, 0.1787109375, 0.1796875, 0.1806640625, 0.181640625, 0.1826171875, 0.18359375, 0.1845703125, 0.185546875, 0.1865234375, 0.1875, 0.1884765625, 0.189453125, 0.1904296875, 0.19140625, 0.1923828125, 0.193359375, 0.1943359375, 0.1953125, 0.1962890625, 0.197265625, 0.1982421875, 0.19921875, 0.2001953125, 0.201171875, 0.2021484375, 0.203125, 0.2041015625, 0.205078125, 0.2060546875, 0.20703125, 0.2080078125, 0.208984375, 0.2099609375, 0.2109375, 0.2119140625, 0.212890625, 0.2138671875, 0.21484375, 0.2158203125, 0.216796875, 0.2177734375, 0.21875, 0.2197265625, 0.220703125, 0.2216796875, 0.22265625, 0.2236328125, 0.224609375, 0.2255859375, 0.2265625, 0.2275390625, 0.228515625, 0.2294921875, 0.23046875, 0.2314453125, 0.232421875, 0.2333984375, 0.234375, 0.2353515625, 0.236328125, 0.2373046875, 0.23828125, 0.2392578125, 0.240234375, 0.2412109375, 0.2421875, 0.2431640625, 0.244140625, 0.2451171875, 0.24609375, 0.2470703125, 0.248046875, 0.2490234375, 0.25, 0.2509765625, 0.251953125, 0.2529296875, 0.25390625, 0.2548828125, 0.255859375, 0.2568359375, 0.2578125, 0.2587890625, 0.259765625, 0.2607421875, 0.26171875, 0.2626953125, 0.263671875, 0.2646484375, 0.265625, 0.2666015625, 0.267578125, 0.2685546875, 0.26953125, 0.2705078125, 0.271484375, 0.2724609375, 0.2734375, 0.2744140625, 0.275390625, 0.2763671875, 0.27734375, 0.2783203125, 0.279296875, 0.2802734375, 0.28125, 0.2822265625, 0.283203125, 0.2841796875, 0.28515625, 0.2861328125, 0.287109375, 0.2880859375, 0.2890625, 0.2900390625, 0.291015625, 0.2919921875, 0.29296875, 0.2939453125, 0.294921875, 0.2958984375, 0.296875, 0.2978515625, 0.298828125, 0.2998046875, 0.30078125, 0.3017578125, 0.302734375, 0.3037109375, 0.3046875, 0.3056640625, 0.306640625, 0.3076171875, 0.30859375, 0.3095703125, 0.310546875, 0.3115234375, 0.3125, 0.3134765625, 0.314453125, 0.3154296875, 0.31640625, 0.3173828125, 0.318359375, 0.3193359375, 0.3203125, 0.3212890625, 0.322265625, 0.3232421875, 0.32421875, 0.3251953125, 0.326171875, 0.3271484375, 0.328125, 0.3291015625, 0.330078125, 0.3310546875, 0.33203125, 0.3330078125, 0.333984375, 0.3349609375, 0.3359375, 0.3369140625, 0.337890625, 0.3388671875, 0.33984375, 0.3408203125, 0.341796875, 0.3427734375, 0.34375, 0.3447265625, 0.345703125, 0.3466796875, 0.34765625, 0.3486328125, 0.349609375, 0.3505859375, 0.3515625, 0.3525390625, 0.353515625, 0.3544921875, 0.35546875, 0.3564453125, 0.357421875, 0.3583984375, 0.359375, 0.3603515625, 0.361328125, 0.3623046875, 0.36328125, 0.3642578125, 0.365234375, 0.3662109375, 0.3671875, 0.3681640625, 0.369140625, 0.3701171875, 0.37109375, 0.3720703125, 0.373046875, 0.3740234375, 0.375, 0.3759765625, 0.376953125, 0.3779296875, 0.37890625, 0.3798828125, 0.380859375, 0.3818359375, 0.3828125, 0.3837890625, 0.384765625, 0.3857421875, 0.38671875, 0.3876953125, 0.388671875, 0.3896484375, 0.390625, 0.3916015625, 0.392578125, 0.3935546875, 0.39453125, 0.3955078125, 0.396484375, 0.3974609375, 0.3984375, 0.3994140625, 0.400390625, 0.4013671875, 0.40234375, 0.4033203125, 0.404296875, 0.4052734375, 0.40625, 0.4072265625, 0.408203125, 0.4091796875, 0.41015625, 0.4111328125, 0.412109375, 0.4130859375, 0.4140625, 0.4150390625, 0.416015625, 0.4169921875, 0.41796875, 0.4189453125, 0.419921875, 0.4208984375, 0.421875, 0.4228515625, 0.423828125, 0.4248046875, 0.42578125, 0.4267578125, 0.427734375, 0.4287109375, 0.4296875, 0.4306640625, 0.431640625, 0.4326171875, 0.43359375, 0.4345703125, 0.435546875, 0.4365234375, 0.4375, 0.4384765625, 0.439453125, 0.4404296875, 0.44140625, 0.4423828125, 0.443359375, 0.4443359375, 0.4453125, 0.4462890625, 0.447265625, 0.4482421875, 0.44921875, 0.4501953125, 0.451171875, 0.4521484375, 0.453125, 0.4541015625, 0.455078125, 0.4560546875, 0.45703125, 0.4580078125, 0.458984375, 0.4599609375, 0.4609375, 0.4619140625, 0.462890625, 0.4638671875, 0.46484375, 0.4658203125, 0.466796875, 0.4677734375, 0.46875, 0.4697265625, 0.470703125, 0.4716796875, 0.47265625, 0.4736328125, 0.474609375, 0.4755859375, 0.4765625, 0.4775390625, 0.478515625, 0.4794921875, 0.48046875, 0.4814453125, 0.482421875, 0.4833984375, 0.484375, 0.4853515625, 0.486328125, 0.4873046875, 0.48828125, 0.4892578125, 0.490234375, 0.4912109375, 0.4921875, 0.4931640625, 0.494140625, 0.4951171875, 0.49609375, 0.4970703125, 0.498046875, 0.4990234375, 0.5, 0.5009765625, 0.501953125, 0.5029296875, 0.50390625, 0.5048828125, 0.505859375, 0.5068359375, 0.5078125, 0.5087890625, 0.509765625, 0.5107421875, 0.51171875, 0.5126953125, 0.513671875, 0.5146484375, 0.515625, 0.5166015625, 0.517578125, 0.5185546875, 0.51953125, 0.5205078125, 0.521484375, 0.5224609375, 0.5234375, 0.5244140625, 0.525390625, 0.5263671875, 0.52734375, 0.5283203125, 0.529296875, 0.5302734375, 0.53125, 0.5322265625, 0.533203125, 0.5341796875, 0.53515625, 0.5361328125, 0.537109375, 0.5380859375, 0.5390625, 0.5400390625, 0.541015625, 0.5419921875, 0.54296875, 0.5439453125, 0.544921875, 0.5458984375, 0.546875, 0.5478515625, 0.548828125, 0.5498046875, 0.55078125, 0.5517578125, 0.552734375, 0.5537109375, 0.5546875, 0.5556640625, 0.556640625, 0.5576171875, 0.55859375, 0.5595703125, 0.560546875, 0.5615234375, 0.5625, 0.5634765625, 0.564453125, 0.5654296875, 0.56640625, 0.5673828125, 0.568359375, 0.5693359375, 0.5703125, 0.5712890625, 0.572265625, 0.5732421875, 0.57421875, 0.5751953125, 0.576171875, 0.5771484375, 0.578125, 0.5791015625, 0.580078125, 0.5810546875, 0.58203125, 0.5830078125, 0.583984375, 0.5849609375, 0.5859375, 0.5869140625, 0.587890625, 0.5888671875, 0.58984375, 0.5908203125, 0.591796875, 0.5927734375, 0.59375, 0.5947265625, 0.595703125, 0.5966796875, 0.59765625, 0.5986328125, 0.599609375, 0.6005859375, 0.6015625, 0.6025390625, 0.603515625, 0.6044921875, 0.60546875, 0.6064453125, 0.607421875, 0.6083984375, 0.609375, 0.6103515625, 0.611328125, 0.6123046875, 0.61328125, 0.6142578125, 0.615234375, 0.6162109375, 0.6171875, 0.6181640625, 0.619140625, 0.6201171875, 0.62109375, 0.6220703125, 0.623046875, 0.6240234375, 0.625, 0.6259765625, 0.626953125, 0.6279296875, 0.62890625, 0.6298828125, 0.630859375, 0.6318359375, 0.6328125, 0.6337890625, 0.634765625, 0.6357421875, 0.63671875, 0.6376953125, 0.638671875, 0.6396484375, 0.640625, 0.6416015625, 0.642578125, 0.6435546875, 0.64453125, 0.6455078125, 0.646484375, 0.6474609375, 0.6484375, 0.6494140625, 0.650390625, 0.6513671875, 0.65234375, 0.6533203125, 0.654296875, 0.6552734375, 0.65625, 0.6572265625, 0.658203125, 0.6591796875, 0.66015625, 0.6611328125, 0.662109375, 0.6630859375, 0.6640625, 0.6650390625, 0.666015625, 0.6669921875, 0.66796875, 0.6689453125, 0.669921875, 0.6708984375, 0.671875, 0.6728515625, 0.673828125, 0.6748046875, 0.67578125, 0.6767578125, 0.677734375, 0.6787109375, 0.6796875, 0.6806640625, 0.681640625, 0.6826171875, 0.68359375, 0.6845703125, 0.685546875, 0.6865234375, 0.6875, 0.6884765625, 0.689453125, 0.6904296875, 0.69140625, 0.6923828125, 0.693359375, 0.6943359375, 0.6953125, 0.6962890625, 0.697265625, 0.6982421875, 0.69921875, 0.7001953125, 0.701171875, 0.7021484375, 0.703125, 0.7041015625, 0.705078125, 0.7060546875, 0.70703125, 0.7080078125, 0.708984375, 0.7099609375, 0.7109375, 0.7119140625, 0.712890625, 0.7138671875, 0.71484375, 0.7158203125, 0.716796875, 0.7177734375, 0.71875, 0.7197265625, 0.720703125, 0.7216796875, 0.72265625, 0.7236328125, 0.724609375, 0.7255859375, 0.7265625, 0.7275390625, 0.728515625, 0.7294921875, 0.73046875, 0.7314453125, 0.732421875, 0.7333984375, 0.734375, 0.7353515625, 0.736328125, 0.7373046875, 0.73828125, 0.7392578125, 0.740234375, 0.7412109375, 0.7421875, 0.7431640625, 0.744140625, 0.7451171875, 0.74609375, 0.7470703125, 0.748046875, 0.7490234375, 0.75, 0.7509765625, 0.751953125, 0.7529296875, 0.75390625, 0.7548828125, 0.755859375, 0.7568359375, 0.7578125, 0.7587890625, 0.759765625, 0.7607421875, 0.76171875, 0.7626953125, 0.763671875, 0.7646484375, 0.765625, 0.7666015625, 0.767578125, 0.7685546875, 0.76953125, 0.7705078125, 0.771484375, 0.7724609375, 0.7734375, 0.7744140625, 0.775390625, 0.7763671875, 0.77734375, 0.7783203125, 0.779296875, 0.7802734375, 0.78125, 0.7822265625, 0.783203125, 0.7841796875, 0.78515625, 0.7861328125, 0.787109375, 0.7880859375, 0.7890625, 0.7900390625, 0.791015625, 0.7919921875, 0.79296875, 0.7939453125, 0.794921875, 0.7958984375, 0.796875, 0.7978515625, 0.798828125, 0.7998046875, 0.80078125, 0.8017578125, 0.802734375, 0.8037109375, 0.8046875, 0.8056640625, 0.806640625, 0.8076171875, 0.80859375, 0.8095703125, 0.810546875, 0.8115234375, 0.8125, 0.8134765625, 0.814453125, 0.8154296875, 0.81640625, 0.8173828125, 0.818359375, 0.8193359375, 0.8203125, 0.8212890625, 0.822265625, 0.8232421875, 0.82421875, 0.8251953125, 0.826171875, 0.8271484375, 0.828125, 0.8291015625, 0.830078125, 0.8310546875, 0.83203125, 0.8330078125, 0.833984375, 0.8349609375, 0.8359375, 0.8369140625, 0.837890625, 0.8388671875, 0.83984375, 0.8408203125, 0.841796875, 0.8427734375, 0.84375, 0.8447265625, 0.845703125, 0.8466796875, 0.84765625, 0.8486328125, 0.849609375, 0.8505859375, 0.8515625, 0.8525390625, 0.853515625, 0.8544921875, 0.85546875, 0.8564453125, 0.857421875, 0.8583984375, 0.859375, 0.8603515625, 0.861328125, 0.8623046875, 0.86328125, 0.8642578125, 0.865234375, 0.8662109375, 0.8671875, 0.8681640625, 0.869140625, 0.8701171875, 0.87109375, 0.8720703125, 0.873046875, 0.8740234375, 0.875, 0.8759765625, 0.876953125, 0.8779296875, 0.87890625, 0.8798828125, 0.880859375, 0.8818359375, 0.8828125, 0.8837890625, 0.884765625, 0.8857421875, 0.88671875, 0.8876953125, 0.888671875, 0.8896484375, 0.890625, 0.8916015625, 0.892578125, 0.8935546875, 0.89453125, 0.8955078125, 0.896484375, 0.8974609375, 0.8984375, 0.8994140625, 0.900390625, 0.9013671875, 0.90234375, 0.9033203125, 0.904296875, 0.9052734375, 0.90625, 0.9072265625, 0.908203125, 0.9091796875, 0.91015625, 0.9111328125, 0.912109375, 0.9130859375, 0.9140625, 0.9150390625, 0.916015625, 0.9169921875, 0.91796875, 0.9189453125, 0.919921875, 0.9208984375, 0.921875, 0.9228515625, 0.923828125, 0.9248046875, 0.92578125, 0.9267578125, 0.927734375, 0.9287109375, 0.9296875, 0.9306640625, 0.931640625, 0.9326171875, 0.93359375, 0.9345703125, 0.935546875, 0.9365234375, 0.9375, 0.9384765625, 0.939453125, 0.9404296875, 0.94140625, 0.9423828125, 0.943359375, 0.9443359375, 0.9453125, 0.9462890625, 0.947265625, 0.9482421875, 0.94921875, 0.9501953125, 0.951171875, 0.9521484375, 0.953125, 0.9541015625, 0.955078125, 0.9560546875, 0.95703125, 0.9580078125, 0.958984375, 0.9599609375, 0.9609375, 0.9619140625, 0.962890625, 0.9638671875, 0.96484375, 0.9658203125, 0.966796875, 0.9677734375, 0.96875, 0.9697265625, 0.970703125, 0.9716796875, 0.97265625, 0.9736328125, 0.974609375, 0.9755859375, 0.9765625, 0.9775390625, 0.978515625, 0.9794921875, 0.98046875, 0.9814453125, 0.982421875, 0.9833984375, 0.984375, 0.9853515625, 0.986328125, 0.9873046875, 0.98828125, 0.9892578125, 0.990234375, 0.9912109375, 0.9921875, 0.9931640625, 0.994140625, 0.9951171875, 0.99609375, 0.9970703125, 0.998046875, 0.9990234375, 1.0], "decode_rule": "dual_line_resample", "support_power": 1.0, "semantic_power": 1.0, "anchor_mode": "onehot", "cfg_scale": 0.0, "cfg_power": 1.0, "cfg_start": 0.0, "cfg_prior": "uniform", "decode_freq_penalty_alpha": 0.0, "decode_freq_penalty_beta": 0.0, "decode_freq_penalty_floor": 0.0, "decode_freq_penalty_start": 0.0, "decode_freq_penalty_end": 1.0, "decode_freq_penalty_power": 1.0, "start_t": 0.0, "start_init": "noise", "noise_init": "dirichlet", "noise_sigma": -1.0, "dirichlet_concentration": 1.0, "concentration_min": 1.0, "concentration_max": 1024.0, "target_prob": 1.0, "endpoint_temp": 1.45, "final_from": "state", "final_sample_mode": "argmax", "final_sample_temp": 1.0, "final_top_k": 64, "final_top_p": 0.95, "final_freq_penalty_alpha": 0.0, "final_freq_penalty_beta": 0.0, "final_freq_penalty_floor": 0.0, "lock_bos": false, "n_samples": 8, "seed": 20260520}, "raw_genppl": {"ppl": 21.517578062657247, "nll_per_token": 3.068870185403263, "tokens": 2040, "kept_samples": 8, "total_samples": 8, "empty_rate": 0.0, "skipped_samples": 0}, "stripped_genppl": {"ppl": 38.67711208881378, "nll_per_token": 3.655248006184896, "tokens": 2040, "kept_samples": 8, "total_samples": 8, "empty_rate": 0.0, "skipped_samples": 0}, "diversity": {"sample_entropy": 1.5010731185605346, "unique_tokens": 253, "token_count": 8192, "distinct_1": 0.0308837890625, "distinct_2": 0.0846774193548387, "top_token_mass": 0.4466552734375}}
|
| 30 |
+
[done] docs/lta_samples/metrics_20260520/owt_t5_2node_step290000_infer_compare_n8/dual_onehot_c1024_t1p45.jsonl
|
| 31 |
+
[infer] tag=dirres_c128_t1p55 rule=dirichlet_resample anchor=onehot cmax=128 temp=1.55 out=docs/lta_samples/metrics_20260520/owt_t5_2node_step290000_infer_compare_n8/dirres_c128_t1p55.jsonl
|
| 32 |
+
[ckpt] eval_ckpts/lta_owt_t5_adaln_adamw_wd0p1_rollin_p50_randk0_3_uniformt_temp1_synct_gbs512_2node8gpu_1m_t-20260518224737-tftgw/latest_frozen_step290000_20260520_200659.pt step=290000
|
| 33 |
+
[decode-base] n=8 max_len=1024 steps=1024 model_t=post
|
| 34 |
+
[decode-time] schedule=linear s=[0.0,0.25] gumbel=(2.2,0.8) force_final=True t0=0.000000 t_mid=0.500000 t_end=1.000000 dt_mean=0.000977 dt_max=0.000977
|
| 35 |
+
[decode] temp=1.55 final=state rule=dirichlet_resample support=1 semantic=1 anchor=onehot cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=argmax/1/k64/p0.95 freq_penalty=0/0/0 start_t=0 start_init=noise time_path=0.0000->1.0000 generated 1/8
|
| 36 |
+
[decode] temp=1.55 final=state rule=dirichlet_resample support=1 semantic=1 anchor=onehot cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=argmax/1/k64/p0.95 freq_penalty=0/0/0 start_t=0 start_init=noise time_path=0.0000->1.0000 generated 2/8
|
| 37 |
+
[decode] temp=1.55 final=state rule=dirichlet_resample support=1 semantic=1 anchor=onehot cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=argmax/1/k64/p0.95 freq_penalty=0/0/0 start_t=0 start_init=noise time_path=0.0000->1.0000 generated 3/8
|
| 38 |
+
[decode] temp=1.55 final=state rule=dirichlet_resample support=1 semantic=1 anchor=onehot cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=argmax/1/k64/p0.95 freq_penalty=0/0/0 start_t=0 start_init=noise time_path=0.0000->1.0000 generated 4/8
|
| 39 |
+
[decode] temp=1.55 final=state rule=dirichlet_resample support=1 semantic=1 anchor=onehot cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=argmax/1/k64/p0.95 freq_penalty=0/0/0 start_t=0 start_init=noise time_path=0.0000->1.0000 generated 5/8
|
| 40 |
+
[decode] temp=1.55 final=state rule=dirichlet_resample support=1 semantic=1 anchor=onehot cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=argmax/1/k64/p0.95 freq_penalty=0/0/0 start_t=0 start_init=noise time_path=0.0000->1.0000 generated 6/8
|
| 41 |
+
[decode] temp=1.55 final=state rule=dirichlet_resample support=1 semantic=1 anchor=onehot cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=argmax/1/k64/p0.95 freq_penalty=0/0/0 start_t=0 start_init=noise time_path=0.0000->1.0000 generated 7/8
|
| 42 |
+
[decode] temp=1.55 final=state rule=dirichlet_resample support=1 semantic=1 anchor=onehot cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=argmax/1/k64/p0.95 freq_penalty=0/0/0 start_t=0 start_init=noise time_path=0.0000->1.0000 generated 8/8
|
| 43 |
+
[summary] {"type": "summary", "checkpoint": "eval_ckpts/lta_owt_t5_adaln_adamw_wd0p1_rollin_p50_randk0_3_uniformt_temp1_synct_gbs512_2node8gpu_1m_t-20260518224737-tftgw/latest_frozen_step290000_20260520_200659.pt", "step": 290000, "decode": {"steps": 1024, "model_t_mode": "post", "decode_time_schedule": "linear", "decode_s_min_frac": 0.0, "decode_s_max_frac": 0.25, "decode_force_final_t": true, "decode_time_grid": [0.0, 0.0009765625, 0.001953125, 0.0029296875, 0.00390625, 0.0048828125, 0.005859375, 0.0068359375, 0.0078125, 0.0087890625, 0.009765625, 0.0107421875, 0.01171875, 0.0126953125, 0.013671875, 0.0146484375, 0.015625, 0.0166015625, 0.017578125, 0.0185546875, 0.01953125, 0.0205078125, 0.021484375, 0.0224609375, 0.0234375, 0.0244140625, 0.025390625, 0.0263671875, 0.02734375, 0.0283203125, 0.029296875, 0.0302734375, 0.03125, 0.0322265625, 0.033203125, 0.0341796875, 0.03515625, 0.0361328125, 0.037109375, 0.0380859375, 0.0390625, 0.0400390625, 0.041015625, 0.0419921875, 0.04296875, 0.0439453125, 0.044921875, 0.0458984375, 0.046875, 0.0478515625, 0.048828125, 0.0498046875, 0.05078125, 0.0517578125, 0.052734375, 0.0537109375, 0.0546875, 0.0556640625, 0.056640625, 0.0576171875, 0.05859375, 0.0595703125, 0.060546875, 0.0615234375, 0.0625, 0.0634765625, 0.064453125, 0.0654296875, 0.06640625, 0.0673828125, 0.068359375, 0.0693359375, 0.0703125, 0.0712890625, 0.072265625, 0.0732421875, 0.07421875, 0.0751953125, 0.076171875, 0.0771484375, 0.078125, 0.0791015625, 0.080078125, 0.0810546875, 0.08203125, 0.0830078125, 0.083984375, 0.0849609375, 0.0859375, 0.0869140625, 0.087890625, 0.0888671875, 0.08984375, 0.0908203125, 0.091796875, 0.0927734375, 0.09375, 0.0947265625, 0.095703125, 0.0966796875, 0.09765625, 0.0986328125, 0.099609375, 0.1005859375, 0.1015625, 0.1025390625, 0.103515625, 0.1044921875, 0.10546875, 0.1064453125, 0.107421875, 0.1083984375, 0.109375, 0.1103515625, 0.111328125, 0.1123046875, 0.11328125, 0.1142578125, 0.115234375, 0.1162109375, 0.1171875, 0.1181640625, 0.119140625, 0.1201171875, 0.12109375, 0.1220703125, 0.123046875, 0.1240234375, 0.125, 0.1259765625, 0.126953125, 0.1279296875, 0.12890625, 0.1298828125, 0.130859375, 0.1318359375, 0.1328125, 0.1337890625, 0.134765625, 0.1357421875, 0.13671875, 0.1376953125, 0.138671875, 0.1396484375, 0.140625, 0.1416015625, 0.142578125, 0.1435546875, 0.14453125, 0.1455078125, 0.146484375, 0.1474609375, 0.1484375, 0.1494140625, 0.150390625, 0.1513671875, 0.15234375, 0.1533203125, 0.154296875, 0.1552734375, 0.15625, 0.1572265625, 0.158203125, 0.1591796875, 0.16015625, 0.1611328125, 0.162109375, 0.1630859375, 0.1640625, 0.1650390625, 0.166015625, 0.1669921875, 0.16796875, 0.1689453125, 0.169921875, 0.1708984375, 0.171875, 0.1728515625, 0.173828125, 0.1748046875, 0.17578125, 0.1767578125, 0.177734375, 0.1787109375, 0.1796875, 0.1806640625, 0.181640625, 0.1826171875, 0.18359375, 0.1845703125, 0.185546875, 0.1865234375, 0.1875, 0.1884765625, 0.189453125, 0.1904296875, 0.19140625, 0.1923828125, 0.193359375, 0.1943359375, 0.1953125, 0.1962890625, 0.197265625, 0.1982421875, 0.19921875, 0.2001953125, 0.201171875, 0.2021484375, 0.203125, 0.2041015625, 0.205078125, 0.2060546875, 0.20703125, 0.2080078125, 0.208984375, 0.2099609375, 0.2109375, 0.2119140625, 0.212890625, 0.2138671875, 0.21484375, 0.2158203125, 0.216796875, 0.2177734375, 0.21875, 0.2197265625, 0.220703125, 0.2216796875, 0.22265625, 0.2236328125, 0.224609375, 0.2255859375, 0.2265625, 0.2275390625, 0.228515625, 0.2294921875, 0.23046875, 0.2314453125, 0.232421875, 0.2333984375, 0.234375, 0.2353515625, 0.236328125, 0.2373046875, 0.23828125, 0.2392578125, 0.240234375, 0.2412109375, 0.2421875, 0.2431640625, 0.244140625, 0.2451171875, 0.24609375, 0.2470703125, 0.248046875, 0.2490234375, 0.25, 0.2509765625, 0.251953125, 0.2529296875, 0.25390625, 0.2548828125, 0.255859375, 0.2568359375, 0.2578125, 0.2587890625, 0.259765625, 0.2607421875, 0.26171875, 0.2626953125, 0.263671875, 0.2646484375, 0.265625, 0.2666015625, 0.267578125, 0.2685546875, 0.26953125, 0.2705078125, 0.271484375, 0.2724609375, 0.2734375, 0.2744140625, 0.275390625, 0.2763671875, 0.27734375, 0.2783203125, 0.279296875, 0.2802734375, 0.28125, 0.2822265625, 0.283203125, 0.2841796875, 0.28515625, 0.2861328125, 0.287109375, 0.2880859375, 0.2890625, 0.2900390625, 0.291015625, 0.2919921875, 0.29296875, 0.2939453125, 0.294921875, 0.2958984375, 0.296875, 0.2978515625, 0.298828125, 0.2998046875, 0.30078125, 0.3017578125, 0.302734375, 0.3037109375, 0.3046875, 0.3056640625, 0.306640625, 0.3076171875, 0.30859375, 0.3095703125, 0.310546875, 0.3115234375, 0.3125, 0.3134765625, 0.314453125, 0.3154296875, 0.31640625, 0.3173828125, 0.318359375, 0.3193359375, 0.3203125, 0.3212890625, 0.322265625, 0.3232421875, 0.32421875, 0.3251953125, 0.326171875, 0.3271484375, 0.328125, 0.3291015625, 0.330078125, 0.3310546875, 0.33203125, 0.3330078125, 0.333984375, 0.3349609375, 0.3359375, 0.3369140625, 0.337890625, 0.3388671875, 0.33984375, 0.3408203125, 0.341796875, 0.3427734375, 0.34375, 0.3447265625, 0.345703125, 0.3466796875, 0.34765625, 0.3486328125, 0.349609375, 0.3505859375, 0.3515625, 0.3525390625, 0.353515625, 0.3544921875, 0.35546875, 0.3564453125, 0.357421875, 0.3583984375, 0.359375, 0.3603515625, 0.361328125, 0.3623046875, 0.36328125, 0.3642578125, 0.365234375, 0.3662109375, 0.3671875, 0.3681640625, 0.369140625, 0.3701171875, 0.37109375, 0.3720703125, 0.373046875, 0.3740234375, 0.375, 0.3759765625, 0.376953125, 0.3779296875, 0.37890625, 0.3798828125, 0.380859375, 0.3818359375, 0.3828125, 0.3837890625, 0.384765625, 0.3857421875, 0.38671875, 0.3876953125, 0.388671875, 0.3896484375, 0.390625, 0.3916015625, 0.392578125, 0.3935546875, 0.39453125, 0.3955078125, 0.396484375, 0.3974609375, 0.3984375, 0.3994140625, 0.400390625, 0.4013671875, 0.40234375, 0.4033203125, 0.404296875, 0.4052734375, 0.40625, 0.4072265625, 0.408203125, 0.4091796875, 0.41015625, 0.4111328125, 0.412109375, 0.4130859375, 0.4140625, 0.4150390625, 0.416015625, 0.4169921875, 0.41796875, 0.4189453125, 0.419921875, 0.4208984375, 0.421875, 0.4228515625, 0.423828125, 0.4248046875, 0.42578125, 0.4267578125, 0.427734375, 0.4287109375, 0.4296875, 0.4306640625, 0.431640625, 0.4326171875, 0.43359375, 0.4345703125, 0.435546875, 0.4365234375, 0.4375, 0.4384765625, 0.439453125, 0.4404296875, 0.44140625, 0.4423828125, 0.443359375, 0.4443359375, 0.4453125, 0.4462890625, 0.447265625, 0.4482421875, 0.44921875, 0.4501953125, 0.451171875, 0.4521484375, 0.453125, 0.4541015625, 0.455078125, 0.4560546875, 0.45703125, 0.4580078125, 0.458984375, 0.4599609375, 0.4609375, 0.4619140625, 0.462890625, 0.4638671875, 0.46484375, 0.4658203125, 0.466796875, 0.4677734375, 0.46875, 0.4697265625, 0.470703125, 0.4716796875, 0.47265625, 0.4736328125, 0.474609375, 0.4755859375, 0.4765625, 0.4775390625, 0.478515625, 0.4794921875, 0.48046875, 0.4814453125, 0.482421875, 0.4833984375, 0.484375, 0.4853515625, 0.486328125, 0.4873046875, 0.48828125, 0.4892578125, 0.490234375, 0.4912109375, 0.4921875, 0.4931640625, 0.494140625, 0.4951171875, 0.49609375, 0.4970703125, 0.498046875, 0.4990234375, 0.5, 0.5009765625, 0.501953125, 0.5029296875, 0.50390625, 0.5048828125, 0.505859375, 0.5068359375, 0.5078125, 0.5087890625, 0.509765625, 0.5107421875, 0.51171875, 0.5126953125, 0.513671875, 0.5146484375, 0.515625, 0.5166015625, 0.517578125, 0.5185546875, 0.51953125, 0.5205078125, 0.521484375, 0.5224609375, 0.5234375, 0.5244140625, 0.525390625, 0.5263671875, 0.52734375, 0.5283203125, 0.529296875, 0.5302734375, 0.53125, 0.5322265625, 0.533203125, 0.5341796875, 0.53515625, 0.5361328125, 0.537109375, 0.5380859375, 0.5390625, 0.5400390625, 0.541015625, 0.5419921875, 0.54296875, 0.5439453125, 0.544921875, 0.5458984375, 0.546875, 0.5478515625, 0.548828125, 0.5498046875, 0.55078125, 0.5517578125, 0.552734375, 0.5537109375, 0.5546875, 0.5556640625, 0.556640625, 0.5576171875, 0.55859375, 0.5595703125, 0.560546875, 0.5615234375, 0.5625, 0.5634765625, 0.564453125, 0.5654296875, 0.56640625, 0.5673828125, 0.568359375, 0.5693359375, 0.5703125, 0.5712890625, 0.572265625, 0.5732421875, 0.57421875, 0.5751953125, 0.576171875, 0.5771484375, 0.578125, 0.5791015625, 0.580078125, 0.5810546875, 0.58203125, 0.5830078125, 0.583984375, 0.5849609375, 0.5859375, 0.5869140625, 0.587890625, 0.5888671875, 0.58984375, 0.5908203125, 0.591796875, 0.5927734375, 0.59375, 0.5947265625, 0.595703125, 0.5966796875, 0.59765625, 0.5986328125, 0.599609375, 0.6005859375, 0.6015625, 0.6025390625, 0.603515625, 0.6044921875, 0.60546875, 0.6064453125, 0.607421875, 0.6083984375, 0.609375, 0.6103515625, 0.611328125, 0.6123046875, 0.61328125, 0.6142578125, 0.615234375, 0.6162109375, 0.6171875, 0.6181640625, 0.619140625, 0.6201171875, 0.62109375, 0.6220703125, 0.623046875, 0.6240234375, 0.625, 0.6259765625, 0.626953125, 0.6279296875, 0.62890625, 0.6298828125, 0.630859375, 0.6318359375, 0.6328125, 0.6337890625, 0.634765625, 0.6357421875, 0.63671875, 0.6376953125, 0.638671875, 0.6396484375, 0.640625, 0.6416015625, 0.642578125, 0.6435546875, 0.64453125, 0.6455078125, 0.646484375, 0.6474609375, 0.6484375, 0.6494140625, 0.650390625, 0.6513671875, 0.65234375, 0.6533203125, 0.654296875, 0.6552734375, 0.65625, 0.6572265625, 0.658203125, 0.6591796875, 0.66015625, 0.6611328125, 0.662109375, 0.6630859375, 0.6640625, 0.6650390625, 0.666015625, 0.6669921875, 0.66796875, 0.6689453125, 0.669921875, 0.6708984375, 0.671875, 0.6728515625, 0.673828125, 0.6748046875, 0.67578125, 0.6767578125, 0.677734375, 0.6787109375, 0.6796875, 0.6806640625, 0.681640625, 0.6826171875, 0.68359375, 0.6845703125, 0.685546875, 0.6865234375, 0.6875, 0.6884765625, 0.689453125, 0.6904296875, 0.69140625, 0.6923828125, 0.693359375, 0.6943359375, 0.6953125, 0.6962890625, 0.697265625, 0.6982421875, 0.69921875, 0.7001953125, 0.701171875, 0.7021484375, 0.703125, 0.7041015625, 0.705078125, 0.7060546875, 0.70703125, 0.7080078125, 0.708984375, 0.7099609375, 0.7109375, 0.7119140625, 0.712890625, 0.7138671875, 0.71484375, 0.7158203125, 0.716796875, 0.7177734375, 0.71875, 0.7197265625, 0.720703125, 0.7216796875, 0.72265625, 0.7236328125, 0.724609375, 0.7255859375, 0.7265625, 0.7275390625, 0.728515625, 0.7294921875, 0.73046875, 0.7314453125, 0.732421875, 0.7333984375, 0.734375, 0.7353515625, 0.736328125, 0.7373046875, 0.73828125, 0.7392578125, 0.740234375, 0.7412109375, 0.7421875, 0.7431640625, 0.744140625, 0.7451171875, 0.74609375, 0.7470703125, 0.748046875, 0.7490234375, 0.75, 0.7509765625, 0.751953125, 0.7529296875, 0.75390625, 0.7548828125, 0.755859375, 0.7568359375, 0.7578125, 0.7587890625, 0.759765625, 0.7607421875, 0.76171875, 0.7626953125, 0.763671875, 0.7646484375, 0.765625, 0.7666015625, 0.767578125, 0.7685546875, 0.76953125, 0.7705078125, 0.771484375, 0.7724609375, 0.7734375, 0.7744140625, 0.775390625, 0.7763671875, 0.77734375, 0.7783203125, 0.779296875, 0.7802734375, 0.78125, 0.7822265625, 0.783203125, 0.7841796875, 0.78515625, 0.7861328125, 0.787109375, 0.7880859375, 0.7890625, 0.7900390625, 0.791015625, 0.7919921875, 0.79296875, 0.7939453125, 0.794921875, 0.7958984375, 0.796875, 0.7978515625, 0.798828125, 0.7998046875, 0.80078125, 0.8017578125, 0.802734375, 0.8037109375, 0.8046875, 0.8056640625, 0.806640625, 0.8076171875, 0.80859375, 0.8095703125, 0.810546875, 0.8115234375, 0.8125, 0.8134765625, 0.814453125, 0.8154296875, 0.81640625, 0.8173828125, 0.818359375, 0.8193359375, 0.8203125, 0.8212890625, 0.822265625, 0.8232421875, 0.82421875, 0.8251953125, 0.826171875, 0.8271484375, 0.828125, 0.8291015625, 0.830078125, 0.8310546875, 0.83203125, 0.8330078125, 0.833984375, 0.8349609375, 0.8359375, 0.8369140625, 0.837890625, 0.8388671875, 0.83984375, 0.8408203125, 0.841796875, 0.8427734375, 0.84375, 0.8447265625, 0.845703125, 0.8466796875, 0.84765625, 0.8486328125, 0.849609375, 0.8505859375, 0.8515625, 0.8525390625, 0.853515625, 0.8544921875, 0.85546875, 0.8564453125, 0.857421875, 0.8583984375, 0.859375, 0.8603515625, 0.861328125, 0.8623046875, 0.86328125, 0.8642578125, 0.865234375, 0.8662109375, 0.8671875, 0.8681640625, 0.869140625, 0.8701171875, 0.87109375, 0.8720703125, 0.873046875, 0.8740234375, 0.875, 0.8759765625, 0.876953125, 0.8779296875, 0.87890625, 0.8798828125, 0.880859375, 0.8818359375, 0.8828125, 0.8837890625, 0.884765625, 0.8857421875, 0.88671875, 0.8876953125, 0.888671875, 0.8896484375, 0.890625, 0.8916015625, 0.892578125, 0.8935546875, 0.89453125, 0.8955078125, 0.896484375, 0.8974609375, 0.8984375, 0.8994140625, 0.900390625, 0.9013671875, 0.90234375, 0.9033203125, 0.904296875, 0.9052734375, 0.90625, 0.9072265625, 0.908203125, 0.9091796875, 0.91015625, 0.9111328125, 0.912109375, 0.9130859375, 0.9140625, 0.9150390625, 0.916015625, 0.9169921875, 0.91796875, 0.9189453125, 0.919921875, 0.9208984375, 0.921875, 0.9228515625, 0.923828125, 0.9248046875, 0.92578125, 0.9267578125, 0.927734375, 0.9287109375, 0.9296875, 0.9306640625, 0.931640625, 0.9326171875, 0.93359375, 0.9345703125, 0.935546875, 0.9365234375, 0.9375, 0.9384765625, 0.939453125, 0.9404296875, 0.94140625, 0.9423828125, 0.943359375, 0.9443359375, 0.9453125, 0.9462890625, 0.947265625, 0.9482421875, 0.94921875, 0.9501953125, 0.951171875, 0.9521484375, 0.953125, 0.9541015625, 0.955078125, 0.9560546875, 0.95703125, 0.9580078125, 0.958984375, 0.9599609375, 0.9609375, 0.9619140625, 0.962890625, 0.9638671875, 0.96484375, 0.9658203125, 0.966796875, 0.9677734375, 0.96875, 0.9697265625, 0.970703125, 0.9716796875, 0.97265625, 0.9736328125, 0.974609375, 0.9755859375, 0.9765625, 0.9775390625, 0.978515625, 0.9794921875, 0.98046875, 0.9814453125, 0.982421875, 0.9833984375, 0.984375, 0.9853515625, 0.986328125, 0.9873046875, 0.98828125, 0.9892578125, 0.990234375, 0.9912109375, 0.9921875, 0.9931640625, 0.994140625, 0.9951171875, 0.99609375, 0.9970703125, 0.998046875, 0.9990234375, 1.0], "decode_rule": "dirichlet_resample", "support_power": 1.0, "semantic_power": 1.0, "anchor_mode": "onehot", "cfg_scale": 0.0, "cfg_power": 1.0, "cfg_start": 0.0, "cfg_prior": "uniform", "decode_freq_penalty_alpha": 0.0, "decode_freq_penalty_beta": 0.0, "decode_freq_penalty_floor": 0.0, "decode_freq_penalty_start": 0.0, "decode_freq_penalty_end": 1.0, "decode_freq_penalty_power": 1.0, "start_t": 0.0, "start_init": "noise", "noise_init": "dirichlet", "noise_sigma": -1.0, "dirichlet_concentration": 1.0, "concentration_min": 1.0, "concentration_max": 128.0, "target_prob": 1.0, "endpoint_temp": 1.55, "final_from": "state", "final_sample_mode": "argmax", "final_sample_temp": 1.0, "final_top_k": 64, "final_top_p": 0.95, "final_freq_penalty_alpha": 0.0, "final_freq_penalty_beta": 0.0, "final_freq_penalty_floor": 0.0, "lock_bos": false, "n_samples": 8, "seed": 20260520}, "raw_genppl": {"ppl": 1.05319821217009, "nll_per_token": 0.05183145111682368, "tokens": 2040, "kept_samples": 8, "total_samples": 8, "empty_rate": 0.0, "skipped_samples": 0}, "stripped_genppl": {"ppl": 1.05319821217009, "nll_per_token": 0.05183145111682368, "tokens": 2040, "kept_samples": 8, "total_samples": 8, "empty_rate": 0.0, "skipped_samples": 0}, "diversity": {"sample_entropy": 0.0, "unique_tokens": 1, "token_count": 8192, "distinct_1": 0.0001220703125, "distinct_2": 0.00012218963831867058, "top_token_mass": 1.0}}
|
| 44 |
+
[done] docs/lta_samples/metrics_20260520/owt_t5_2node_step290000_infer_compare_n8/dirres_c128_t1p55.jsonl
|