| {"event": "run_start", "step": 4000, "max_steps": 5764, "model_params_M": 9.853, "eff_batch": 1024, "amp_dtype": "torch.float16", "attn_backend": "chunked", "started_at": "2026-06-18T03:45:02.631395"} |
| {"step": 4020, "epoch": 0.697, "loss": 2.14847, "ppl": 8.572, "lr": 9.2227e-05, "lr_sched": "cosine", "grad_norm": 0.7808, "tokens": 4211159040, "tok_s": 58598.7, "elapsed_s": 417.4, "vram_gb": 0.67, "ram_pct": 19.2, "disk_free_gb": 20.94, "attn_backend": "chunked", "amp_dtype": "torch.float16"} |
| {"step": 4040, "epoch": 0.701, "loss": 2.17968, "ppl": 8.843, "lr": 9.0927e-05, "lr_sched": "cosine", "grad_norm": 0.7191, "tokens": 4232110080, "tok_s": 59027.6, "elapsed_s": 773.6, "vram_gb": 0.67, "ram_pct": 19.3, "disk_free_gb": 20.94, "attn_backend": "chunked", "amp_dtype": "torch.float16"} |
| {"step": 4060, "epoch": 0.704, "loss": 2.16665, "ppl": 8.729, "lr": 8.9636e-05, "lr_sched": "cosine", "grad_norm": 0.7424, "tokens": 4253061120, "tok_s": 59005.5, "elapsed_s": 1128.8, "vram_gb": 0.67, "ram_pct": 19.4, "disk_free_gb": 20.94, "attn_backend": "chunked", "amp_dtype": "torch.float16"} |
| {"step": 4080, "epoch": 0.708, "loss": 2.18001, "ppl": 8.846, "lr": 8.8355e-05, "lr_sched": "cosine", "grad_norm": 0.697, "tokens": 4274012160, "tok_s": 58909.2, "elapsed_s": 1484.2, "vram_gb": 0.67, "ram_pct": 19.4, "disk_free_gb": 20.94, "attn_backend": "chunked", "amp_dtype": "torch.float16"} |
| {"step": 4100, "epoch": 0.711, "loss": 2.18284, "ppl": 8.871, "lr": 8.7085e-05, "lr_sched": "cosine", "grad_norm": 0.7439, "tokens": 4294963200, "tok_s": 59003.2, "elapsed_s": 1839.6, "vram_gb": 0.67, "ram_pct": 19.4, "disk_free_gb": 20.94, "attn_backend": "chunked", "amp_dtype": "torch.float16"} |
| {"step": 4120, "epoch": 0.715, "loss": 2.15899, "ppl": 8.662, "lr": 8.5825e-05, "lr_sched": "cosine", "grad_norm": 0.6785, "tokens": 4315914240, "tok_s": 59082.4, "elapsed_s": 2195.4, "vram_gb": 0.67, "ram_pct": 19.4, "disk_free_gb": 20.94, "attn_backend": "chunked", "amp_dtype": "torch.float16"} |
| {"step": 4140, "epoch": 0.718, "loss": 2.18484, "ppl": 8.889, "lr": 8.4575e-05, "lr_sched": "cosine", "grad_norm": 0.8037, "tokens": 4336865280, "tok_s": 59063.7, "elapsed_s": 2550.6, "vram_gb": 0.67, "ram_pct": 19.4, "disk_free_gb": 20.94, "attn_backend": "chunked", "amp_dtype": "torch.float16"} |
| {"step": 4160, "epoch": 0.722, "loss": 2.20062, "ppl": 9.031, "lr": 8.3336e-05, "lr_sched": "cosine", "grad_norm": 0.7413, "tokens": 4357816320, "tok_s": 58955.2, "elapsed_s": 2906.0, "vram_gb": 0.67, "ram_pct": 19.4, "disk_free_gb": 20.94, "attn_backend": "chunked", "amp_dtype": "torch.float16"} |
| {"step": 4180, "epoch": 0.725, "loss": 2.17473, "ppl": 8.8, "lr": 8.2107e-05, "lr_sched": "cosine", "grad_norm": 0.7454, "tokens": 4378767360, "tok_s": 58924.2, "elapsed_s": 3261.6, "vram_gb": 0.67, "ram_pct": 19.4, "disk_free_gb": 20.94, "attn_backend": "chunked", "amp_dtype": "torch.float16"} |
| {"step": 4200, "epoch": 0.729, "loss": 2.17931, "ppl": 8.84, "lr": 8.0889e-05, "lr_sched": "cosine", "grad_norm": 0.6631, "tokens": 4399718400, "tok_s": 58807.9, "elapsed_s": 3617.2, "vram_gb": 0.67, "ram_pct": 19.4, "disk_free_gb": 20.94, "attn_backend": "chunked", "amp_dtype": "torch.float16"} |
| {"step": 4220, "epoch": 0.732, "loss": 2.16724, "ppl": 8.734, "lr": 7.9683e-05, "lr_sched": "cosine", "grad_norm": 0.7733, "tokens": 4420669440, "tok_s": 58358.0, "elapsed_s": 3973.1, "vram_gb": 0.67, "ram_pct": 19.4, "disk_free_gb": 20.94, "attn_backend": "chunked", "amp_dtype": "torch.float16"} |
| {"step": 4240, "epoch": 0.736, "loss": 2.18435, "ppl": 8.885, "lr": 7.8488e-05, "lr_sched": "cosine", "grad_norm": 0.6586, "tokens": 4441620480, "tok_s": 58951.5, "elapsed_s": 4328.7, "vram_gb": 0.67, "ram_pct": 19.3, "disk_free_gb": 20.94, "attn_backend": "chunked", "amp_dtype": "torch.float16"} |
| {"step": 4260, "epoch": 0.739, "loss": 2.17627, "ppl": 8.813, "lr": 7.7304e-05, "lr_sched": "cosine", "grad_norm": 0.6069, "tokens": 4462571520, "tok_s": 58936.3, "elapsed_s": 4684.3, "vram_gb": 0.67, "ram_pct": 19.5, "disk_free_gb": 20.94, "attn_backend": "chunked", "amp_dtype": "torch.float16"} |
| {"step": 4280, "epoch": 0.743, "loss": 2.17232, "ppl": 8.779, "lr": 7.6132e-05, "lr_sched": "cosine", "grad_norm": 0.6585, "tokens": 4483522560, "tok_s": 58878.6, "elapsed_s": 5040.0, "vram_gb": 0.67, "ram_pct": 19.4, "disk_free_gb": 20.94, "attn_backend": "chunked", "amp_dtype": "torch.float16"} |
| {"step": 4300, "epoch": 0.746, "loss": 2.16187, "ppl": 8.687, "lr": 7.4971e-05, "lr_sched": "cosine", "grad_norm": 0.8019, "tokens": 4504473600, "tok_s": 58945.9, "elapsed_s": 5395.6, "vram_gb": 0.67, "ram_pct": 19.4, "disk_free_gb": 20.94, "attn_backend": "chunked", "amp_dtype": "torch.float16"} |
| {"step": 4320, "epoch": 0.749, "loss": 2.15217, "ppl": 8.603, "lr": 7.3822e-05, "lr_sched": "cosine", "grad_norm": 0.7391, "tokens": 4525424640, "tok_s": 58947.5, "elapsed_s": 5751.2, "vram_gb": 0.67, "ram_pct": 19.4, "disk_free_gb": 20.94, "attn_backend": "chunked", "amp_dtype": "torch.float16"} |
| {"step": 4340, "epoch": 0.753, "loss": 2.17517, "ppl": 8.804, "lr": 7.2686e-05, "lr_sched": "cosine", "grad_norm": 0.7394, "tokens": 4546375680, "tok_s": 58890.0, "elapsed_s": 6106.9, "vram_gb": 0.67, "ram_pct": 19.4, "disk_free_gb": 20.94, "attn_backend": "chunked", "amp_dtype": "torch.float16"} |
| {"step": 4360, "epoch": 0.756, "loss": 2.17134, "ppl": 8.77, "lr": 7.1561e-05, "lr_sched": "cosine", "grad_norm": 0.67, "tokens": 4567326720, "tok_s": 58941.4, "elapsed_s": 6462.4, "vram_gb": 0.67, "ram_pct": 19.4, "disk_free_gb": 20.94, "attn_backend": "chunked", "amp_dtype": "torch.float16"} |
| {"step": 4380, "epoch": 0.76, "loss": 2.15233, "ppl": 8.605, "lr": 7.0449e-05, "lr_sched": "cosine", "grad_norm": 0.7253, "tokens": 4588277760, "tok_s": 58925.3, "elapsed_s": 6817.9, "vram_gb": 0.67, "ram_pct": 19.4, "disk_free_gb": 20.94, "attn_backend": "chunked", "amp_dtype": "torch.float16"} |
| {"step": 4400, "epoch": 0.763, "loss": 2.14953, "ppl": 8.581, "lr": 6.9349e-05, "lr_sched": "cosine", "grad_norm": 0.6971, "tokens": 4609228800, "tok_s": 58902.1, "elapsed_s": 7173.4, "vram_gb": 0.67, "ram_pct": 19.4, "disk_free_gb": 20.94, "attn_backend": "chunked", "amp_dtype": "torch.float16"} |
| {"step": 4400, "epoch": 0.763, "val_loss": 2.42352, "val_ppl": 11.286, "best_val_ppl": 11.286, "is_best": true} |
| {"step": 4420, "epoch": 0.767, "loss": 2.16445, "ppl": 8.71, "lr": 6.8262e-05, "lr_sched": "cosine", "grad_norm": 0.6837, "tokens": 4630179840, "tok_s": 58881.2, "elapsed_s": 7687.3, "vram_gb": 0.67, "ram_pct": 21.7, "disk_free_gb": 20.62, "attn_backend": "chunked", "amp_dtype": "torch.float16"} |
| {"step": 4440, "epoch": 0.77, "loss": 2.17495, "ppl": 8.802, "lr": 6.7187e-05, "lr_sched": "cosine", "grad_norm": 0.7276, "tokens": 4651130880, "tok_s": 58949.4, "elapsed_s": 8042.7, "vram_gb": 0.67, "ram_pct": 21.8, "disk_free_gb": 20.62, "attn_backend": "chunked", "amp_dtype": "torch.float16"} |
| {"step": 4460, "epoch": 0.774, "loss": 2.18319, "ppl": 8.875, "lr": 6.6126e-05, "lr_sched": "cosine", "grad_norm": 0.65, "tokens": 4672081920, "tok_s": 58906.7, "elapsed_s": 8398.7, "vram_gb": 0.67, "ram_pct": 21.7, "disk_free_gb": 20.62, "attn_backend": "chunked", "amp_dtype": "torch.float16"} |
| {"step": 4480, "epoch": 0.777, "loss": 2.14039, "ppl": 8.503, "lr": 6.5077e-05, "lr_sched": "cosine", "grad_norm": 0.639, "tokens": 4693032960, "tok_s": 58853.6, "elapsed_s": 8754.4, "vram_gb": 0.67, "ram_pct": 21.8, "disk_free_gb": 20.62, "attn_backend": "chunked", "amp_dtype": "torch.float16"} |
| {"step": 4500, "epoch": 0.781, "loss": 2.15079, "ppl": 8.592, "lr": 6.4042e-05, "lr_sched": "cosine", "grad_norm": 0.7281, "tokens": 4713984000, "tok_s": 58901.7, "elapsed_s": 9110.1, "vram_gb": 0.67, "ram_pct": 21.9, "disk_free_gb": 20.62, "attn_backend": "chunked", "amp_dtype": "torch.float16"} |
| {"step": 4520, "epoch": 0.784, "loss": 2.16979, "ppl": 8.756, "lr": 6.302e-05, "lr_sched": "cosine", "grad_norm": 0.6345, "tokens": 4734935040, "tok_s": 58855.0, "elapsed_s": 9465.9, "vram_gb": 0.67, "ram_pct": 21.7, "disk_free_gb": 20.62, "attn_backend": "chunked", "amp_dtype": "torch.float16"} |
| {"step": 4540, "epoch": 0.788, "loss": 2.15761, "ppl": 8.65, "lr": 6.2011e-05, "lr_sched": "cosine", "grad_norm": 0.733, "tokens": 4755886080, "tok_s": 58876.9, "elapsed_s": 9821.8, "vram_gb": 0.67, "ram_pct": 21.7, "disk_free_gb": 20.62, "attn_backend": "chunked", "amp_dtype": "torch.float16"} |
| {"step": 4560, "epoch": 0.791, "loss": 2.14629, "ppl": 8.553, "lr": 6.1016e-05, "lr_sched": "cosine", "grad_norm": 0.6776, "tokens": 4776837120, "tok_s": 58941.3, "elapsed_s": 10177.5, "vram_gb": 0.67, "ram_pct": 21.7, "disk_free_gb": 20.62, "attn_backend": "chunked", "amp_dtype": "torch.float16"} |
| {"step": 4580, "epoch": 0.795, "loss": 2.16362, "ppl": 8.703, "lr": 6.0035e-05, "lr_sched": "cosine", "grad_norm": 0.633, "tokens": 4797788160, "tok_s": 58933.9, "elapsed_s": 10533.2, "vram_gb": 0.67, "ram_pct": 21.8, "disk_free_gb": 20.62, "attn_backend": "chunked", "amp_dtype": "torch.float16"} |
| {"step": 4600, "epoch": 0.798, "loss": 2.15012, "ppl": 8.586, "lr": 5.9067e-05, "lr_sched": "cosine", "grad_norm": 0.621, "tokens": 4818739200, "tok_s": 58889.4, "elapsed_s": 10889.1, "vram_gb": 0.67, "ram_pct": 21.7, "disk_free_gb": 20.62, "attn_backend": "chunked", "amp_dtype": "torch.float16"} |
| {"step": 4620, "epoch": 0.802, "loss": 2.17037, "ppl": 8.762, "lr": 5.8114e-05, "lr_sched": "cosine", "grad_norm": 0.6526, "tokens": 4839690240, "tok_s": 58782.3, "elapsed_s": 11244.8, "vram_gb": 0.67, "ram_pct": 21.7, "disk_free_gb": 20.62, "attn_backend": "chunked", "amp_dtype": "torch.float16"} |
| {"step": 4640, "epoch": 0.805, "loss": 2.14699, "ppl": 8.559, "lr": 5.7174e-05, "lr_sched": "cosine", "grad_norm": 0.6952, "tokens": 4860641280, "tok_s": 58926.5, "elapsed_s": 11601.2, "vram_gb": 0.67, "ram_pct": 22.0, "disk_free_gb": 20.62, "attn_backend": "chunked", "amp_dtype": "torch.float16"} |
| {"step": 4660, "epoch": 0.808, "loss": 2.15575, "ppl": 8.634, "lr": 5.6249e-05, "lr_sched": "cosine", "grad_norm": 0.6711, "tokens": 4881592320, "tok_s": 58969.1, "elapsed_s": 11957.0, "vram_gb": 0.67, "ram_pct": 21.7, "disk_free_gb": 20.62, "attn_backend": "chunked", "amp_dtype": "torch.float16"} |
| {"step": 4680, "epoch": 0.812, "loss": 2.15461, "ppl": 8.625, "lr": 5.5338e-05, "lr_sched": "cosine", "grad_norm": 0.6872, "tokens": 4902543360, "tok_s": 58887.5, "elapsed_s": 12312.9, "vram_gb": 0.67, "ram_pct": 21.8, "disk_free_gb": 20.62, "attn_backend": "chunked", "amp_dtype": "torch.float16"} |
| {"step": 4700, "epoch": 0.815, "loss": 2.17043, "ppl": 8.762, "lr": 5.4442e-05, "lr_sched": "cosine", "grad_norm": 0.6113, "tokens": 4923494400, "tok_s": 58955.5, "elapsed_s": 12668.5, "vram_gb": 0.67, "ram_pct": 21.7, "disk_free_gb": 20.62, "attn_backend": "chunked", "amp_dtype": "torch.float16"} |
| {"step": 4720, "epoch": 0.819, "loss": 2.17314, "ppl": 8.786, "lr": 5.356e-05, "lr_sched": "cosine", "grad_norm": 0.6569, "tokens": 4944445440, "tok_s": 58873.6, "elapsed_s": 13024.4, "vram_gb": 0.67, "ram_pct": 21.8, "disk_free_gb": 20.62, "attn_backend": "chunked", "amp_dtype": "torch.float16"} |
| {"step": 4740, "epoch": 0.822, "loss": 2.14656, "ppl": 8.555, "lr": 5.2692e-05, "lr_sched": "cosine", "grad_norm": 0.6693, "tokens": 4965396480, "tok_s": 58873.5, "elapsed_s": 13380.2, "vram_gb": 0.67, "ram_pct": 21.7, "disk_free_gb": 20.62, "attn_backend": "chunked", "amp_dtype": "torch.float16"} |
| {"step": 4760, "epoch": 0.826, "loss": 2.15693, "ppl": 8.645, "lr": 5.184e-05, "lr_sched": "cosine", "grad_norm": 0.6573, "tokens": 4986347520, "tok_s": 58914.9, "elapsed_s": 13736.0, "vram_gb": 0.67, "ram_pct": 21.8, "disk_free_gb": 20.62, "attn_backend": "chunked", "amp_dtype": "torch.float16"} |
| {"step": 4780, "epoch": 0.829, "loss": 2.14654, "ppl": 8.555, "lr": 5.1002e-05, "lr_sched": "cosine", "grad_norm": 0.6675, "tokens": 5007298560, "tok_s": 58825.0, "elapsed_s": 14091.9, "vram_gb": 0.67, "ram_pct": 21.7, "disk_free_gb": 20.62, "attn_backend": "chunked", "amp_dtype": "torch.float16"} |
| {"step": 4800, "epoch": 0.833, "loss": 2.1479, "ppl": 8.567, "lr": 5.018e-05, "lr_sched": "cosine", "grad_norm": 0.6503, "tokens": 5028249600, "tok_s": 58800.7, "elapsed_s": 14447.9, "vram_gb": 0.67, "ram_pct": 21.8, "disk_free_gb": 20.62, "attn_backend": "chunked", "amp_dtype": "torch.float16"} |
| {"step": 4800, "epoch": 0.833, "val_loss": 2.41446, "val_ppl": 11.184, "best_val_ppl": 11.184, "is_best": true} |
| {"step": 4820, "epoch": 0.836, "loss": 2.15034, "ppl": 8.588, "lr": 4.9373e-05, "lr_sched": "cosine", "grad_norm": 0.6353, "tokens": 5049200640, "tok_s": 58942.5, "elapsed_s": 14931.7, "vram_gb": 0.67, "ram_pct": 21.9, "disk_free_gb": 20.5, "attn_backend": "chunked", "amp_dtype": "torch.float16"} |
| {"step": 4840, "epoch": 0.84, "loss": 2.14222, "ppl": 8.518, "lr": 4.858e-05, "lr_sched": "cosine", "grad_norm": 0.6764, "tokens": 5070151680, "tok_s": 58982.8, "elapsed_s": 15286.8, "vram_gb": 0.67, "ram_pct": 21.9, "disk_free_gb": 20.5, "attn_backend": "chunked", "amp_dtype": "torch.float16"} |
| {"step": 4860, "epoch": 0.843, "loss": 2.12803, "ppl": 8.398, "lr": 4.7804e-05, "lr_sched": "cosine", "grad_norm": 0.6354, "tokens": 5091102720, "tok_s": 58930.9, "elapsed_s": 15643.5, "vram_gb": 0.67, "ram_pct": 21.9, "disk_free_gb": 20.5, "attn_backend": "chunked", "amp_dtype": "torch.float16"} |
| {"step": 4880, "epoch": 0.847, "loss": 2.14507, "ppl": 8.543, "lr": 4.7042e-05, "lr_sched": "cosine", "grad_norm": 0.7113, "tokens": 5112053760, "tok_s": 58854.6, "elapsed_s": 15999.1, "vram_gb": 0.67, "ram_pct": 21.9, "disk_free_gb": 20.5, "attn_backend": "chunked", "amp_dtype": "torch.float16"} |
| {"step": 4900, "epoch": 0.85, "loss": 2.1485, "ppl": 8.572, "lr": 4.6296e-05, "lr_sched": "cosine", "grad_norm": 0.8371, "tokens": 5133004800, "tok_s": 58987.3, "elapsed_s": 16354.6, "vram_gb": 0.67, "ram_pct": 21.9, "disk_free_gb": 20.5, "attn_backend": "chunked", "amp_dtype": "torch.float16"} |
| {"step": 4920, "epoch": 0.854, "loss": 2.1604, "ppl": 8.675, "lr": 4.5566e-05, "lr_sched": "cosine", "grad_norm": 0.6075, "tokens": 5153955840, "tok_s": 59029.6, "elapsed_s": 16709.8, "vram_gb": 0.67, "ram_pct": 21.9, "disk_free_gb": 20.5, "attn_backend": "chunked", "amp_dtype": "torch.float16"} |
| {"step": 4940, "epoch": 0.857, "loss": 2.14375, "ppl": 8.531, "lr": 4.4852e-05, "lr_sched": "cosine", "grad_norm": 0.6353, "tokens": 5174906880, "tok_s": 58970.9, "elapsed_s": 17065.1, "vram_gb": 0.67, "ram_pct": 21.9, "disk_free_gb": 20.5, "attn_backend": "chunked", "amp_dtype": "torch.float16"} |
| {"step": 4960, "epoch": 0.861, "loss": 2.15293, "ppl": 8.61, "lr": 4.4153e-05, "lr_sched": "cosine", "grad_norm": 0.5997, "tokens": 5195857920, "tok_s": 58959.8, "elapsed_s": 17420.4, "vram_gb": 0.67, "ram_pct": 21.9, "disk_free_gb": 20.5, "attn_backend": "chunked", "amp_dtype": "torch.float16"} |
| {"step": 4980, "epoch": 0.864, "loss": 2.15777, "ppl": 8.652, "lr": 4.347e-05, "lr_sched": "cosine", "grad_norm": 0.5908, "tokens": 5216808960, "tok_s": 58901.2, "elapsed_s": 17776.4, "vram_gb": 0.67, "ram_pct": 21.9, "disk_free_gb": 20.5, "attn_backend": "chunked", "amp_dtype": "torch.float16"} |
| {"step": 5000, "epoch": 0.867, "loss": 2.12994, "ppl": 8.414, "lr": 4.2804e-05, "lr_sched": "cosine", "grad_norm": 0.5993, "tokens": 5237760000, "tok_s": 58769.5, "elapsed_s": 18132.3, "vram_gb": 0.67, "ram_pct": 21.9, "disk_free_gb": 20.5, "attn_backend": "chunked", "amp_dtype": "torch.float16"} |
| {"step": 5020, "epoch": 0.871, "loss": 2.13178, "ppl": 8.43, "lr": 4.2153e-05, "lr_sched": "cosine", "grad_norm": 0.59, "tokens": 5258711040, "tok_s": 58713.3, "elapsed_s": 18488.7, "vram_gb": 0.67, "ram_pct": 21.9, "disk_free_gb": 20.5, "attn_backend": "chunked", "amp_dtype": "torch.float16"} |
| {"step": 5040, "epoch": 0.874, "loss": 2.15913, "ppl": 8.664, "lr": 4.1518e-05, "lr_sched": "cosine", "grad_norm": 0.5585, "tokens": 5279662080, "tok_s": 58696.4, "elapsed_s": 18845.4, "vram_gb": 0.67, "ram_pct": 21.9, "disk_free_gb": 20.5, "attn_backend": "chunked", "amp_dtype": "torch.float16"} |
| {"step": 5060, "epoch": 0.878, "loss": 2.15324, "ppl": 8.613, "lr": 4.09e-05, "lr_sched": "cosine", "grad_norm": 0.6678, "tokens": 5300613120, "tok_s": 58744.4, "elapsed_s": 19202.0, "vram_gb": 0.67, "ram_pct": 21.9, "disk_free_gb": 20.5, "attn_backend": "chunked", "amp_dtype": "torch.float16"} |
| {"step": 5080, "epoch": 0.881, "loss": 2.14146, "ppl": 8.512, "lr": 4.0299e-05, "lr_sched": "cosine", "grad_norm": 0.6728, "tokens": 5321564160, "tok_s": 58840.1, "elapsed_s": 19558.2, "vram_gb": 0.67, "ram_pct": 21.9, "disk_free_gb": 20.5, "attn_backend": "chunked", "amp_dtype": "torch.float16"} |
| {"step": 5100, "epoch": 0.885, "loss": 2.15999, "ppl": 8.671, "lr": 3.9713e-05, "lr_sched": "cosine", "grad_norm": 0.6077, "tokens": 5342515200, "tok_s": 58887.7, "elapsed_s": 19913.8, "vram_gb": 0.67, "ram_pct": 22.0, "disk_free_gb": 20.5, "attn_backend": "chunked", "amp_dtype": "torch.float16"} |
| {"step": 5120, "epoch": 0.888, "loss": 2.16784, "ppl": 8.739, "lr": 3.9144e-05, "lr_sched": "cosine", "grad_norm": 0.6918, "tokens": 5363466240, "tok_s": 58505.2, "elapsed_s": 20271.0, "vram_gb": 0.67, "ram_pct": 21.9, "disk_free_gb": 20.5, "attn_backend": "chunked", "amp_dtype": "torch.float16"} |
| {"step": 5140, "epoch": 0.892, "loss": 2.14604, "ppl": 8.551, "lr": 3.8592e-05, "lr_sched": "cosine", "grad_norm": 0.5367, "tokens": 5384417280, "tok_s": 58833.4, "elapsed_s": 20627.0, "vram_gb": 0.67, "ram_pct": 22.0, "disk_free_gb": 20.5, "attn_backend": "chunked", "amp_dtype": "torch.float16"} |
| {"step": 5160, "epoch": 0.895, "loss": 2.14643, "ppl": 8.554, "lr": 3.8056e-05, "lr_sched": "cosine", "grad_norm": 0.5998, "tokens": 5405368320, "tok_s": 59149.2, "elapsed_s": 20982.8, "vram_gb": 0.67, "ram_pct": 21.9, "disk_free_gb": 20.5, "attn_backend": "chunked", "amp_dtype": "torch.float16"} |
| {"step": 5180, "epoch": 0.899, "loss": 2.14637, "ppl": 8.554, "lr": 3.7537e-05, "lr_sched": "cosine", "grad_norm": 0.605, "tokens": 5426319360, "tok_s": 58878.2, "elapsed_s": 21339.1, "vram_gb": 0.67, "ram_pct": 21.9, "disk_free_gb": 20.5, "attn_backend": "chunked", "amp_dtype": "torch.float16"} |
| {"step": 5200, "epoch": 0.902, "loss": 2.14077, "ppl": 8.506, "lr": 3.7035e-05, "lr_sched": "cosine", "grad_norm": 0.6609, "tokens": 5447270400, "tok_s": 58868.1, "elapsed_s": 21694.8, "vram_gb": 0.67, "ram_pct": 21.9, "disk_free_gb": 20.5, "attn_backend": "chunked", "amp_dtype": "torch.float16"} |
| {"step": 5200, "epoch": 0.902, "val_loss": 2.4072, "val_ppl": 11.103, "best_val_ppl": 11.103, "is_best": true} |
| {"step": 5220, "epoch": 0.906, "loss": 2.15635, "ppl": 8.64, "lr": 3.655e-05, "lr_sched": "cosine", "grad_norm": 0.6798, "tokens": 5468221440, "tok_s": 58915.8, "elapsed_s": 22176.3, "vram_gb": 0.67, "ram_pct": 22.0, "disk_free_gb": 20.5, "attn_backend": "chunked", "amp_dtype": "torch.float16"} |
| {"step": 5240, "epoch": 0.909, "loss": 2.14071, "ppl": 8.505, "lr": 3.6082e-05, "lr_sched": "cosine", "grad_norm": 0.6052, "tokens": 5489172480, "tok_s": 57784.1, "elapsed_s": 22532.4, "vram_gb": 0.67, "ram_pct": 22.0, "disk_free_gb": 20.5, "attn_backend": "chunked", "amp_dtype": "torch.float16"} |
| {"step": 5260, "epoch": 0.913, "loss": 2.12207, "ppl": 8.348, "lr": 3.563e-05, "lr_sched": "cosine", "grad_norm": 0.5592, "tokens": 5510123520, "tok_s": 58884.2, "elapsed_s": 22888.9, "vram_gb": 0.67, "ram_pct": 22.0, "disk_free_gb": 20.5, "attn_backend": "chunked", "amp_dtype": "torch.float16"} |
| {"step": 5280, "epoch": 0.916, "loss": 2.13748, "ppl": 8.478, "lr": 3.5196e-05, "lr_sched": "cosine", "grad_norm": 0.5881, "tokens": 5531074560, "tok_s": 58963.0, "elapsed_s": 23244.4, "vram_gb": 0.67, "ram_pct": 22.0, "disk_free_gb": 20.5, "attn_backend": "chunked", "amp_dtype": "torch.float16"} |
| {"step": 5300, "epoch": 0.92, "loss": 2.13813, "ppl": 8.484, "lr": 3.4779e-05, "lr_sched": "cosine", "grad_norm": 0.7004, "tokens": 5552025600, "tok_s": 58931.1, "elapsed_s": 23599.8, "vram_gb": 0.67, "ram_pct": 22.0, "disk_free_gb": 20.5, "attn_backend": "chunked", "amp_dtype": "torch.float16"} |
| {"step": 5320, "epoch": 0.923, "loss": 2.14746, "ppl": 8.563, "lr": 3.4379e-05, "lr_sched": "cosine", "grad_norm": 0.6691, "tokens": 5572976640, "tok_s": 58948.2, "elapsed_s": 23955.2, "vram_gb": 0.67, "ram_pct": 22.0, "disk_free_gb": 20.5, "attn_backend": "chunked", "amp_dtype": "torch.float16"} |
| {"step": 5340, "epoch": 0.926, "loss": 2.13766, "ppl": 8.48, "lr": 3.3996e-05, "lr_sched": "cosine", "grad_norm": 0.5861, "tokens": 5593927680, "tok_s": 58988.6, "elapsed_s": 24310.5, "vram_gb": 0.67, "ram_pct": 22.0, "disk_free_gb": 20.5, "attn_backend": "chunked", "amp_dtype": "torch.float16"} |
| {"step": 5360, "epoch": 0.93, "loss": 2.16387, "ppl": 8.705, "lr": 3.363e-05, "lr_sched": "cosine", "grad_norm": 0.5905, "tokens": 5614878720, "tok_s": 59127.9, "elapsed_s": 24665.8, "vram_gb": 0.67, "ram_pct": 22.0, "disk_free_gb": 20.5, "attn_backend": "chunked", "amp_dtype": "torch.float16"} |
| {"step": 5380, "epoch": 0.933, "loss": 2.1365, "ppl": 8.47, "lr": 3.3282e-05, "lr_sched": "cosine", "grad_norm": 0.5865, "tokens": 5635829760, "tok_s": 58470.1, "elapsed_s": 25021.6, "vram_gb": 0.67, "ram_pct": 22.0, "disk_free_gb": 20.5, "attn_backend": "chunked", "amp_dtype": "torch.float16"} |
| {"step": 5400, "epoch": 0.937, "loss": 2.14116, "ppl": 8.509, "lr": 3.2951e-05, "lr_sched": "cosine", "grad_norm": 0.6316, "tokens": 5656780800, "tok_s": 59087.3, "elapsed_s": 25377.0, "vram_gb": 0.67, "ram_pct": 22.0, "disk_free_gb": 20.5, "attn_backend": "chunked", "amp_dtype": "torch.float16"} |
| {"step": 5420, "epoch": 0.94, "loss": 2.1404, "ppl": 8.503, "lr": 3.2638e-05, "lr_sched": "cosine", "grad_norm": 0.5867, "tokens": 5677731840, "tok_s": 59015.4, "elapsed_s": 25731.4, "vram_gb": 0.67, "ram_pct": 22.0, "disk_free_gb": 20.5, "attn_backend": "chunked", "amp_dtype": "torch.float16"} |
| {"step": 5440, "epoch": 0.944, "loss": 2.14753, "ppl": 8.564, "lr": 3.2342e-05, "lr_sched": "cosine", "grad_norm": 0.6242, "tokens": 5698682880, "tok_s": 59042.9, "elapsed_s": 26085.9, "vram_gb": 0.67, "ram_pct": 22.0, "disk_free_gb": 20.5, "attn_backend": "chunked", "amp_dtype": "torch.float16"} |
| {"step": 5460, "epoch": 0.947, "loss": 2.13541, "ppl": 8.461, "lr": 3.2063e-05, "lr_sched": "cosine", "grad_norm": 0.5893, "tokens": 5719633920, "tok_s": 58978.4, "elapsed_s": 26440.5, "vram_gb": 0.67, "ram_pct": 22.0, "disk_free_gb": 20.5, "attn_backend": "chunked", "amp_dtype": "torch.float16"} |
| {"step": 5480, "epoch": 0.951, "loss": 2.15495, "ppl": 8.627, "lr": 3.1802e-05, "lr_sched": "cosine", "grad_norm": 0.6576, "tokens": 5740584960, "tok_s": 58887.8, "elapsed_s": 26795.6, "vram_gb": 0.67, "ram_pct": 22.0, "disk_free_gb": 20.5, "attn_backend": "chunked", "amp_dtype": "torch.float16"} |
| {"step": 5500, "epoch": 0.954, "loss": 2.12464, "ppl": 8.37, "lr": 3.1558e-05, "lr_sched": "cosine", "grad_norm": 0.5399, "tokens": 5761536000, "tok_s": 58967.2, "elapsed_s": 27151.2, "vram_gb": 0.67, "ram_pct": 22.0, "disk_free_gb": 20.5, "attn_backend": "chunked", "amp_dtype": "torch.float16"} |
| {"step": 5520, "epoch": 0.958, "loss": 2.13203, "ppl": 8.432, "lr": 3.1332e-05, "lr_sched": "cosine", "grad_norm": 0.5828, "tokens": 5782487040, "tok_s": 58942.8, "elapsed_s": 27506.9, "vram_gb": 0.67, "ram_pct": 22.0, "disk_free_gb": 20.5, "attn_backend": "chunked", "amp_dtype": "torch.float16"} |
| {"step": 5540, "epoch": 0.961, "loss": 2.13436, "ppl": 8.452, "lr": 3.1124e-05, "lr_sched": "cosine", "grad_norm": 0.5575, "tokens": 5803438080, "tok_s": 58939.0, "elapsed_s": 27862.2, "vram_gb": 0.67, "ram_pct": 22.0, "disk_free_gb": 20.5, "attn_backend": "chunked", "amp_dtype": "torch.float16"} |
| {"step": 5560, "epoch": 0.965, "loss": 2.12371, "ppl": 8.362, "lr": 3.0933e-05, "lr_sched": "cosine", "grad_norm": 0.6898, "tokens": 5824389120, "tok_s": 58881.7, "elapsed_s": 28217.8, "vram_gb": 0.67, "ram_pct": 22.0, "disk_free_gb": 20.5, "attn_backend": "chunked", "amp_dtype": "torch.float16"} |
| {"step": 5580, "epoch": 0.968, "loss": 2.15706, "ppl": 8.646, "lr": 3.076e-05, "lr_sched": "cosine", "grad_norm": 0.5962, "tokens": 5845340160, "tok_s": 58733.3, "elapsed_s": 28574.0, "vram_gb": 0.67, "ram_pct": 22.0, "disk_free_gb": 20.5, "attn_backend": "chunked", "amp_dtype": "torch.float16"} |
| {"step": 5600, "epoch": 0.972, "loss": 2.14281, "ppl": 8.523, "lr": 3.0605e-05, "lr_sched": "cosine", "grad_norm": 0.6329, "tokens": 5866291200, "tok_s": 58128.9, "elapsed_s": 28930.7, "vram_gb": 0.67, "ram_pct": 22.0, "disk_free_gb": 20.5, "attn_backend": "chunked", "amp_dtype": "torch.float16"} |
| {"step": 5600, "epoch": 0.972, "val_loss": 2.40238, "val_ppl": 11.049, "best_val_ppl": 11.049, "is_best": true} |
| {"step": 5620, "epoch": 0.975, "loss": 2.14105, "ppl": 8.508, "lr": 3.0467e-05, "lr_sched": "cosine", "grad_norm": 0.6983, "tokens": 5887242240, "tok_s": 59725.4, "elapsed_s": 29410.3, "vram_gb": 0.67, "ram_pct": 22.1, "disk_free_gb": 20.5, "attn_backend": "chunked", "amp_dtype": "torch.float16"} |
| {"step": 5640, "epoch": 0.978, "loss": 2.12945, "ppl": 8.41, "lr": 3.0347e-05, "lr_sched": "cosine", "grad_norm": 0.5881, "tokens": 5908193280, "tok_s": 58918.4, "elapsed_s": 29765.7, "vram_gb": 0.67, "ram_pct": 22.0, "disk_free_gb": 20.5, "attn_backend": "chunked", "amp_dtype": "torch.float16"} |
| {"step": 5660, "epoch": 0.982, "loss": 2.14416, "ppl": 8.535, "lr": 3.0245e-05, "lr_sched": "cosine", "grad_norm": 0.5546, "tokens": 5929144320, "tok_s": 58866.5, "elapsed_s": 30121.3, "vram_gb": 0.67, "ram_pct": 22.1, "disk_free_gb": 20.5, "attn_backend": "chunked", "amp_dtype": "torch.float16"} |
| {"step": 5680, "epoch": 0.985, "loss": 2.16154, "ppl": 8.684, "lr": 3.0161e-05, "lr_sched": "cosine", "grad_norm": 0.6755, "tokens": 5950095360, "tok_s": 58720.8, "elapsed_s": 30477.9, "vram_gb": 0.67, "ram_pct": 22.1, "disk_free_gb": 20.5, "attn_backend": "chunked", "amp_dtype": "torch.float16"} |
| {"step": 5700, "epoch": 0.989, "loss": 2.12027, "ppl": 8.333, "lr": 3.0094e-05, "lr_sched": "cosine", "grad_norm": 0.5491, "tokens": 5971046400, "tok_s": 58725.1, "elapsed_s": 30834.9, "vram_gb": 0.67, "ram_pct": 22.3, "disk_free_gb": 20.5, "attn_backend": "chunked", "amp_dtype": "torch.float16"} |
| {"step": 5720, "epoch": 0.992, "loss": 2.13713, "ppl": 8.475, "lr": 3.0045e-05, "lr_sched": "cosine", "grad_norm": 0.619, "tokens": 5991997440, "tok_s": 59179.1, "elapsed_s": 31191.0, "vram_gb": 0.67, "ram_pct": 22.0, "disk_free_gb": 20.5, "attn_backend": "chunked", "amp_dtype": "torch.float16"} |
| {"step": 5740, "epoch": 0.996, "loss": 2.12801, "ppl": 8.398, "lr": 3.0014e-05, "lr_sched": "cosine", "grad_norm": 0.6878, "tokens": 6012948480, "tok_s": 58874.3, "elapsed_s": 31547.5, "vram_gb": 0.67, "ram_pct": 22.1, "disk_free_gb": 20.5, "attn_backend": "chunked", "amp_dtype": "torch.float16"} |
| {"step": 5760, "epoch": 0.999, "loss": 2.12224, "ppl": 8.35, "lr": 3.0001e-05, "lr_sched": "cosine", "grad_norm": 0.6238, "tokens": 6033899520, "tok_s": 58688.9, "elapsed_s": 31903.8, "vram_gb": 0.67, "ram_pct": 22.1, "disk_free_gb": 20.5, "attn_backend": "chunked", "amp_dtype": "torch.float16"} |
|
|