{"event": "run_start", "step": 60, "max_steps": 1264, "model_params_M": 9.902, "eff_batch": 2304, "amp_dtype": "torch.float16", "attn_backend": "chunked", "started_at": "2026-06-11T06:31:39.618880"} {"step": 80, "epoch": 0.063, "loss": 5.08373, "ppl": 161.375, "lr": 0.000246875, "lr_sched": "cosine", "grad_norm": 2.0964, "tokens": 188559360, "tok_s": 300995.5, "elapsed_s": 626.5, "vram_gb": 0.59, "ram_pct": 16.5, "disk_free_gb": 20.94, "attn_backend": "chunked", "amp_dtype": "torch.float16"} {"step": 100, "epoch": 0.079, "loss": 4.72502, "ppl": 112.733, "lr": 0.000309375, "lr_sched": "cosine", "grad_norm": 1.062, "tokens": 235699200, "tok_s": 193022.6, "elapsed_s": 1221.1, "vram_gb": 0.59, "ram_pct": 16.5, "disk_free_gb": 20.94, "attn_backend": "chunked", "amp_dtype": "torch.float16"} {"step": 120, "epoch": 0.095, "loss": 4.51705, "ppl": 91.565, "lr": 0.000371875, "lr_sched": "cosine", "grad_norm": 0.5691, "tokens": 282839040, "tok_s": 155225.5, "elapsed_s": 1822.1, "vram_gb": 0.59, "ram_pct": 16.5, "disk_free_gb": 20.94, "attn_backend": "chunked", "amp_dtype": "torch.float16"} {"step": 140, "epoch": 0.111, "loss": 4.35544, "ppl": 77.901, "lr": 0.000399917, "lr_sched": "cosine", "grad_norm": 1.0289, "tokens": 329978880, "tok_s": 136127.2, "elapsed_s": 2424.0, "vram_gb": 0.59, "ram_pct": 16.5, "disk_free_gb": 20.94, "attn_backend": "chunked", "amp_dtype": "torch.float16"} {"step": 160, "epoch": 0.127, "loss": 4.17712, "ppl": 65.178, "lr": 0.000399339, "lr_sched": "cosine", "grad_norm": 0.9668, "tokens": 377118720, "tok_s": 124668.0, "elapsed_s": 3025.0, "vram_gb": 0.59, "ram_pct": 16.5, "disk_free_gb": 20.94, "attn_backend": "chunked", "amp_dtype": "torch.float16"} {"step": 180, "epoch": 0.142, "loss": 4.00303, "ppl": 54.764, "lr": 0.000398213, "lr_sched": "cosine", "grad_norm": 1.5329, "tokens": 424258560, "tok_s": 116944.5, "elapsed_s": 3627.9, "vram_gb": 0.59, "ram_pct": 16.4, "disk_free_gb": 20.94, "attn_backend": "chunked", "amp_dtype": "torch.float16"} {"step": 200, "epoch": 0.158, "loss": 3.81103, "ppl": 45.197, "lr": 0.000396541, "lr_sched": "cosine", "grad_norm": 0.8057, "tokens": 471398400, "tok_s": 111401.8, "elapsed_s": 4231.5, "vram_gb": 0.59, "ram_pct": 16.5, "disk_free_gb": 20.94, "attn_backend": "chunked", "amp_dtype": "torch.float16"} {"step": 200, "epoch": 0.158, "val_loss": 3.81748, "val_ppl": 45.489, "best_val_ppl": 45.489, "is_best": true} {"step": 220, "epoch": 0.174, "loss": 3.65134, "ppl": 38.526, "lr": 0.00039433, "lr_sched": "cosine", "grad_norm": 1.0934, "tokens": 518538240, "tok_s": 106264.9, "elapsed_s": 4879.7, "vram_gb": 0.59, "ram_pct": 18.4, "disk_free_gb": 20.74, "attn_backend": "chunked", "amp_dtype": "torch.float16"} {"step": 240, "epoch": 0.19, "loss": 3.52499, "ppl": 33.953, "lr": 0.000391586, "lr_sched": "cosine", "grad_norm": 1.079, "tokens": 565678080, "tok_s": 103073.1, "elapsed_s": 5488.1, "vram_gb": 0.59, "ram_pct": 18.4, "disk_free_gb": 20.74, "attn_backend": "chunked", "amp_dtype": "torch.float16"} {"step": 260, "epoch": 0.206, "loss": 3.42217, "ppl": 30.636, "lr": 0.000388316, "lr_sched": "cosine", "grad_norm": 1.1757, "tokens": 612817920, "tok_s": 100537.3, "elapsed_s": 6095.4, "vram_gb": 0.59, "ram_pct": 18.4, "disk_free_gb": 20.74, "attn_backend": "chunked", "amp_dtype": "torch.float16"} {"step": 280, "epoch": 0.222, "loss": 3.33892, "ppl": 28.189, "lr": 0.000384533, "lr_sched": "cosine", "grad_norm": 1.1653, "tokens": 659957760, "tok_s": 98485.9, "elapsed_s": 6701.0, "vram_gb": 0.59, "ram_pct": 18.4, "disk_free_gb": 20.74, "attn_backend": "chunked", "amp_dtype": "torch.float16"} {"step": 300, "epoch": 0.237, "loss": 3.24706, "ppl": 25.715, "lr": 0.000380245, "lr_sched": "cosine", "grad_norm": 1.0345, "tokens": 707097600, "tok_s": 96760.4, "elapsed_s": 7307.7, "vram_gb": 0.59, "ram_pct": 18.4, "disk_free_gb": 20.74, "attn_backend": "chunked", "amp_dtype": "torch.float16"} {"step": 320, "epoch": 0.253, "loss": 3.18689, "ppl": 24.213, "lr": 0.000375468, "lr_sched": "cosine", "grad_norm": 1.0814, "tokens": 754237440, "tok_s": 95305.2, "elapsed_s": 7913.9, "vram_gb": 0.59, "ram_pct": 18.5, "disk_free_gb": 20.74, "attn_backend": "chunked", "amp_dtype": "torch.float16"} {"step": 340, "epoch": 0.269, "loss": 3.10296, "ppl": 22.264, "lr": 0.000370215, "lr_sched": "cosine", "grad_norm": 1.1011, "tokens": 801377280, "tok_s": 94040.1, "elapsed_s": 8521.7, "vram_gb": 0.59, "ram_pct": 18.5, "disk_free_gb": 20.74, "attn_backend": "chunked", "amp_dtype": "torch.float16"} {"step": 360, "epoch": 0.285, "loss": 3.05335, "ppl": 21.186, "lr": 0.000364503, "lr_sched": "cosine", "grad_norm": 1.0818, "tokens": 848517120, "tok_s": 92932.0, "elapsed_s": 9130.5, "vram_gb": 0.59, "ram_pct": 18.5, "disk_free_gb": 20.74, "attn_backend": "chunked", "amp_dtype": "torch.float16"} {"step": 380, "epoch": 0.301, "loss": 3.00831, "ppl": 20.253, "lr": 0.000358349, "lr_sched": "cosine", "grad_norm": 0.9618, "tokens": 895656960, "tok_s": 91956.6, "elapsed_s": 9740.0, "vram_gb": 0.59, "ram_pct": 18.5, "disk_free_gb": 20.74, "attn_backend": "chunked", "amp_dtype": "torch.float16"} {"step": 400, "epoch": 0.316, "loss": 2.95097, "ppl": 19.124, "lr": 0.000351772, "lr_sched": "cosine", "grad_norm": 0.8669, "tokens": 942796800, "tok_s": 91115.9, "elapsed_s": 10347.2, "vram_gb": 0.59, "ram_pct": 18.5, "disk_free_gb": 20.74, "attn_backend": "chunked", "amp_dtype": "torch.float16"} {"step": 400, "epoch": 0.316, "val_loss": 2.96685, "val_ppl": 19.431, "best_val_ppl": 19.431, "is_best": true} {"step": 420, "epoch": 0.332, "loss": 2.92597, "ppl": 18.652, "lr": 0.000344792, "lr_sched": "cosine", "grad_norm": 1.256, "tokens": 989936640, "tok_s": 90238.0, "elapsed_s": 10970.3, "vram_gb": 0.59, "ram_pct": 18.6, "disk_free_gb": 20.61, "attn_backend": "chunked", "amp_dtype": "torch.float16"} {"step": 440, "epoch": 0.348, "loss": 2.89415, "ppl": 18.068, "lr": 0.00033743, "lr_sched": "cosine", "grad_norm": 1.0429, "tokens": 1037076480, "tok_s": 89573.8, "elapsed_s": 11577.9, "vram_gb": 0.59, "ram_pct": 18.7, "disk_free_gb": 20.61, "attn_backend": "chunked", "amp_dtype": "torch.float16"} {"step": 460, "epoch": 0.364, "loss": 2.8546, "ppl": 17.368, "lr": 0.000329709, "lr_sched": "cosine", "grad_norm": 1.1318, "tokens": 1084216320, "tok_s": 88970.2, "elapsed_s": 12186.3, "vram_gb": 0.59, "ram_pct": 18.6, "disk_free_gb": 20.61, "attn_backend": "chunked", "amp_dtype": "torch.float16"} {"step": 480, "epoch": 0.38, "loss": 2.82379, "ppl": 16.841, "lr": 0.000321652, "lr_sched": "cosine", "grad_norm": 1.166, "tokens": 1131356160, "tok_s": 88407.9, "elapsed_s": 12797.0, "vram_gb": 0.59, "ram_pct": 18.7, "disk_free_gb": 20.61, "attn_backend": "chunked", "amp_dtype": "torch.float16"} {"step": 500, "epoch": 0.396, "loss": 2.80073, "ppl": 16.457, "lr": 0.000313285, "lr_sched": "cosine", "grad_norm": 0.9405, "tokens": 1178496000, "tok_s": 87904.7, "elapsed_s": 13406.5, "vram_gb": 0.59, "ram_pct": 18.7, "disk_free_gb": 20.61, "attn_backend": "chunked", "amp_dtype": "torch.float16"} {"step": 520, "epoch": 0.411, "loss": 2.77799, "ppl": 16.087, "lr": 0.000304632, "lr_sched": "cosine", "grad_norm": 1.0248, "tokens": 1225635840, "tok_s": 87461.3, "elapsed_s": 14013.5, "vram_gb": 0.59, "ram_pct": 18.7, "disk_free_gb": 20.61, "attn_backend": "chunked", "amp_dtype": "torch.float16"} {"step": 540, "epoch": 0.427, "loss": 2.75765, "ppl": 15.763, "lr": 0.00029572, "lr_sched": "cosine", "grad_norm": 0.7223, "tokens": 1272775680, "tok_s": 87054.8, "elapsed_s": 14620.4, "vram_gb": 0.59, "ram_pct": 18.7, "disk_free_gb": 20.61, "attn_backend": "chunked", "amp_dtype": "torch.float16"} {"step": 560, "epoch": 0.443, "loss": 2.75917, "ppl": 15.787, "lr": 0.000286577, "lr_sched": "cosine", "grad_norm": 1.0008, "tokens": 1319915520, "tok_s": 86692.0, "elapsed_s": 15225.3, "vram_gb": 0.59, "ram_pct": 18.7, "disk_free_gb": 20.61, "attn_backend": "chunked", "amp_dtype": "torch.float16"} {"step": 580, "epoch": 0.459, "loss": 2.72373, "ppl": 15.237, "lr": 0.00027723, "lr_sched": "cosine", "grad_norm": 0.8632, "tokens": 1367055360, "tok_s": 86345.9, "elapsed_s": 15832.3, "vram_gb": 0.59, "ram_pct": 18.8, "disk_free_gb": 20.61, "attn_backend": "chunked", "amp_dtype": "torch.float16"} {"step": 600, "epoch": 0.475, "loss": 2.71745, "ppl": 15.142, "lr": 0.000267708, "lr_sched": "cosine", "grad_norm": 1.0309, "tokens": 1414195200, "tok_s": 85995.5, "elapsed_s": 16445.0, "vram_gb": 0.59, "ram_pct": 18.8, "disk_free_gb": 20.61, "attn_backend": "chunked", "amp_dtype": "torch.float16"} {"step": 600, "epoch": 0.475, "val_loss": 2.72172, "val_ppl": 15.206, "best_val_ppl": 15.206, "is_best": true} {"step": 620, "epoch": 0.491, "loss": 2.70782, "ppl": 14.997, "lr": 0.000258041, "lr_sched": "cosine", "grad_norm": 0.9506, "tokens": 1461335040, "tok_s": 85624.3, "elapsed_s": 17066.8, "vram_gb": 0.59, "ram_pct": 18.8, "disk_free_gb": 20.61, "attn_backend": "chunked", "amp_dtype": "torch.float16"} {"step": 640, "epoch": 0.506, "loss": 2.69895, "ppl": 14.864, "lr": 0.000248257, "lr_sched": "cosine", "grad_norm": 1.1735, "tokens": 1508474880, "tok_s": 85357.9, "elapsed_s": 17672.4, "vram_gb": 0.59, "ram_pct": 18.8, "disk_free_gb": 20.61, "attn_backend": "chunked", "amp_dtype": "torch.float16"} {"step": 660, "epoch": 0.522, "loss": 2.66671, "ppl": 14.392, "lr": 0.000238386, "lr_sched": "cosine", "grad_norm": 0.8704, "tokens": 1555614720, "tok_s": 85108.2, "elapsed_s": 18278.1, "vram_gb": 0.59, "ram_pct": 18.9, "disk_free_gb": 20.61, "attn_backend": "chunked", "amp_dtype": "torch.float16"} {"step": 680, "epoch": 0.538, "loss": 2.6486, "ppl": 14.134, "lr": 0.000228459, "lr_sched": "cosine", "grad_norm": 0.638, "tokens": 1602754560, "tok_s": 84878.7, "elapsed_s": 18882.9, "vram_gb": 0.59, "ram_pct": 18.8, "disk_free_gb": 20.61, "attn_backend": "chunked", "amp_dtype": "torch.float16"} {"step": 700, "epoch": 0.554, "loss": 2.65948, "ppl": 14.289, "lr": 0.000218507, "lr_sched": "cosine", "grad_norm": 0.8346, "tokens": 1649894400, "tok_s": 84661.6, "elapsed_s": 19488.1, "vram_gb": 0.59, "ram_pct": 18.8, "disk_free_gb": 20.61, "attn_backend": "chunked", "amp_dtype": "torch.float16"} {"step": 720, "epoch": 0.57, "loss": 2.64185, "ppl": 14.039, "lr": 0.000208559, "lr_sched": "cosine", "grad_norm": 0.7506, "tokens": 1697034240, "tok_s": 84454.2, "elapsed_s": 20094.1, "vram_gb": 0.59, "ram_pct": 18.8, "disk_free_gb": 20.61, "attn_backend": "chunked", "amp_dtype": "torch.float16"} {"step": 740, "epoch": 0.585, "loss": 2.63378, "ppl": 13.926, "lr": 0.000198646, "lr_sched": "cosine", "grad_norm": 0.7747, "tokens": 1744174080, "tok_s": 84261.2, "elapsed_s": 20699.6, "vram_gb": 0.59, "ram_pct": 18.9, "disk_free_gb": 20.61, "attn_backend": "chunked", "amp_dtype": "torch.float16"} {"step": 760, "epoch": 0.601, "loss": 2.60751, "ppl": 13.565, "lr": 0.000188798, "lr_sched": "cosine", "grad_norm": 0.7297, "tokens": 1791313920, "tok_s": 84076.3, "elapsed_s": 21305.8, "vram_gb": 0.59, "ram_pct": 18.9, "disk_free_gb": 20.61, "attn_backend": "chunked", "amp_dtype": "torch.float16"} {"step": 780, "epoch": 0.617, "loss": 2.61043, "ppl": 13.605, "lr": 0.000179045, "lr_sched": "cosine", "grad_norm": 0.578, "tokens": 1838453760, "tok_s": 83899.8, "elapsed_s": 21912.5, "vram_gb": 0.59, "ram_pct": 18.9, "disk_free_gb": 20.61, "attn_backend": "chunked", "amp_dtype": "torch.float16"} {"step": 800, "epoch": 0.633, "loss": 2.60656, "ppl": 13.552, "lr": 0.000169418, "lr_sched": "cosine", "grad_norm": 0.807, "tokens": 1885593600, "tok_s": 83729.7, "elapsed_s": 22520.0, "vram_gb": 0.59, "ram_pct": 18.9, "disk_free_gb": 20.61, "attn_backend": "chunked", "amp_dtype": "torch.float16"} {"step": 800, "epoch": 0.633, "val_loss": 2.612, "val_ppl": 13.626, "best_val_ppl": 13.626, "is_best": true} {"step": 820, "epoch": 0.649, "loss": 2.60156, "ppl": 13.485, "lr": 0.000159946, "lr_sched": "cosine", "grad_norm": 0.7742, "tokens": 1932733440, "tok_s": 83511.8, "elapsed_s": 23143.2, "vram_gb": 0.59, "ram_pct": 19.0, "disk_free_gb": 20.49, "attn_backend": "chunked", "amp_dtype": "torch.float16"} {"step": 840, "epoch": 0.665, "loss": 2.58708, "ppl": 13.291, "lr": 0.000150657, "lr_sched": "cosine", "grad_norm": 0.7649, "tokens": 1979873280, "tok_s": 83361.5, "elapsed_s": 23750.4, "vram_gb": 0.59, "ram_pct": 19.0, "disk_free_gb": 20.49, "attn_backend": "chunked", "amp_dtype": "torch.float16"} {"step": 860, "epoch": 0.68, "loss": 2.58372, "ppl": 13.246, "lr": 0.000141581, "lr_sched": "cosine", "grad_norm": 0.7413, "tokens": 2027013120, "tok_s": 83226.7, "elapsed_s": 24355.3, "vram_gb": 0.59, "ram_pct": 19.1, "disk_free_gb": 20.49, "attn_backend": "chunked", "amp_dtype": "torch.float16"} {"step": 880, "epoch": 0.696, "loss": 2.58026, "ppl": 13.201, "lr": 0.000132744, "lr_sched": "cosine", "grad_norm": 0.8835, "tokens": 2074152960, "tok_s": 83104.9, "elapsed_s": 24958.2, "vram_gb": 0.59, "ram_pct": 19.0, "disk_free_gb": 20.49, "attn_backend": "chunked", "amp_dtype": "torch.float16"} {"step": 900, "epoch": 0.712, "loss": 2.57021, "ppl": 13.069, "lr": 0.000124174, "lr_sched": "cosine", "grad_norm": 0.5684, "tokens": 2121292800, "tok_s": 82976.5, "elapsed_s": 25565.0, "vram_gb": 0.59, "ram_pct": 19.0, "disk_free_gb": 20.49, "attn_backend": "chunked", "amp_dtype": "torch.float16"} {"step": 920, "epoch": 0.728, "loss": 2.57673, "ppl": 13.154, "lr": 0.000115897, "lr_sched": "cosine", "grad_norm": 0.5326, "tokens": 2168432640, "tok_s": 82856.0, "elapsed_s": 26171.1, "vram_gb": 0.59, "ram_pct": 19.1, "disk_free_gb": 20.49, "attn_backend": "chunked", "amp_dtype": "torch.float16"} {"step": 940, "epoch": 0.744, "loss": 2.55775, "ppl": 12.907, "lr": 0.000107939, "lr_sched": "cosine", "grad_norm": 0.7495, "tokens": 2215572480, "tok_s": 82735.0, "elapsed_s": 26779.2, "vram_gb": 0.59, "ram_pct": 19.1, "disk_free_gb": 20.49, "attn_backend": "chunked", "amp_dtype": "torch.float16"} {"step": 960, "epoch": 0.759, "loss": 2.56307, "ppl": 12.976, "lr": 0.000100323, "lr_sched": "cosine", "grad_norm": 0.5409, "tokens": 2262712320, "tok_s": 82604.2, "elapsed_s": 27392.2, "vram_gb": 0.59, "ram_pct": 19.1, "disk_free_gb": 20.49, "attn_backend": "chunked", "amp_dtype": "torch.float16"} {"step": 980, "epoch": 0.775, "loss": 2.54577, "ppl": 12.753, "lr": 9.3073e-05, "lr_sched": "cosine", "grad_norm": 0.588, "tokens": 2309852160, "tok_s": 82500.8, "elapsed_s": 27997.9, "vram_gb": 0.59, "ram_pct": 19.1, "disk_free_gb": 20.49, "attn_backend": "chunked", "amp_dtype": "torch.float16"} {"step": 1000, "epoch": 0.791, "loss": 2.55359, "ppl": 12.853, "lr": 8.6212e-05, "lr_sched": "cosine", "grad_norm": 0.6079, "tokens": 2356992000, "tok_s": 82402.7, "elapsed_s": 28603.3, "vram_gb": 0.59, "ram_pct": 19.1, "disk_free_gb": 20.49, "attn_backend": "chunked", "amp_dtype": "torch.float16"} {"step": 1000, "epoch": 0.791, "val_loss": 2.55587, "val_ppl": 12.883, "best_val_ppl": 12.883, "is_best": true} {"step": 1020, "epoch": 0.807, "loss": 2.30778, "ppl": 10.052, "lr": 7.9759e-05, "lr_sched": "cosine", "grad_norm": 1.9184, "tokens": 2404131840, "tok_s": 82260.5, "elapsed_s": 29225.8, "vram_gb": 0.59, "ram_pct": 19.1, "disk_free_gb": 20.49, "attn_backend": "chunked", "amp_dtype": "torch.float16"} {"step": 1040, "epoch": 0.823, "loss": 2.2168, "ppl": 9.178, "lr": 7.3736e-05, "lr_sched": "cosine", "grad_norm": 0.7887, "tokens": 2451271680, "tok_s": 82169.8, "elapsed_s": 29831.8, "vram_gb": 0.59, "ram_pct": 19.2, "disk_free_gb": 20.49, "attn_backend": "chunked", "amp_dtype": "torch.float16"} {"step": 1060, "epoch": 0.839, "loss": 2.56972, "ppl": 13.062, "lr": 6.816e-05, "lr_sched": "cosine", "grad_norm": 0.5087, "tokens": 2498411520, "tok_s": 82073.2, "elapsed_s": 30441.3, "vram_gb": 0.59, "ram_pct": 19.1, "disk_free_gb": 20.49, "attn_backend": "chunked", "amp_dtype": "torch.float16"} {"step": 1080, "epoch": 0.854, "loss": 2.55791, "ppl": 12.909, "lr": 6.3048e-05, "lr_sched": "cosine", "grad_norm": 0.4667, "tokens": 2545551360, "tok_s": 81982.1, "elapsed_s": 31050.1, "vram_gb": 0.59, "ram_pct": 19.1, "disk_free_gb": 20.49, "attn_backend": "chunked", "amp_dtype": "torch.float16"} {"step": 1100, "epoch": 0.87, "loss": 2.54725, "ppl": 12.772, "lr": 5.8416e-05, "lr_sched": "cosine", "grad_norm": 0.4463, "tokens": 2592691200, "tok_s": 81900.2, "elapsed_s": 31656.7, "vram_gb": 0.59, "ram_pct": 19.2, "disk_free_gb": 20.49, "attn_backend": "chunked", "amp_dtype": "torch.float16"} {"step": 1120, "epoch": 0.886, "loss": 2.52993, "ppl": 12.553, "lr": 5.4279e-05, "lr_sched": "cosine", "grad_norm": 0.4327, "tokens": 2639831040, "tok_s": 81821.8, "elapsed_s": 32263.2, "vram_gb": 0.59, "ram_pct": 19.2, "disk_free_gb": 20.49, "attn_backend": "chunked", "amp_dtype": "torch.float16"} {"step": 1140, "epoch": 0.902, "loss": 2.53771, "ppl": 12.651, "lr": 5.0648e-05, "lr_sched": "cosine", "grad_norm": 0.3917, "tokens": 2686970880, "tok_s": 81743.1, "elapsed_s": 32870.9, "vram_gb": 0.59, "ram_pct": 19.2, "disk_free_gb": 20.49, "attn_backend": "chunked", "amp_dtype": "torch.float16"} {"step": 1160, "epoch": 0.918, "loss": 2.53772, "ppl": 12.651, "lr": 4.7535e-05, "lr_sched": "cosine", "grad_norm": 0.4292, "tokens": 2734110720, "tok_s": 81671.3, "elapsed_s": 33477.0, "vram_gb": 0.59, "ram_pct": 19.2, "disk_free_gb": 20.49, "attn_backend": "chunked", "amp_dtype": "torch.float16"} {"step": 1180, "epoch": 0.934, "loss": 2.53827, "ppl": 12.658, "lr": 4.495e-05, "lr_sched": "cosine", "grad_norm": 0.4724, "tokens": 2781250560, "tok_s": 81602.0, "elapsed_s": 34083.1, "vram_gb": 0.59, "ram_pct": 19.2, "disk_free_gb": 20.49, "attn_backend": "chunked", "amp_dtype": "torch.float16"} {"step": 1200, "epoch": 0.949, "loss": 2.53036, "ppl": 12.558, "lr": 4.29e-05, "lr_sched": "cosine", "grad_norm": 0.3189, "tokens": 2828390400, "tok_s": 81520.6, "elapsed_s": 34695.4, "vram_gb": 0.59, "ram_pct": 19.2, "disk_free_gb": 20.49, "attn_backend": "chunked", "amp_dtype": "torch.float16"} {"step": 1200, "epoch": 0.949, "val_loss": 2.53144, "val_ppl": 12.572, "best_val_ppl": 12.572, "is_best": true} {"step": 1220, "epoch": 0.965, "loss": 2.52293, "ppl": 12.465, "lr": 4.1392e-05, "lr_sched": "cosine", "grad_norm": 0.402, "tokens": 2875530240, "tok_s": 81412.0, "elapsed_s": 35320.7, "vram_gb": 0.59, "ram_pct": 19.3, "disk_free_gb": 20.49, "attn_backend": "chunked", "amp_dtype": "torch.float16"} {"step": 1240, "epoch": 0.981, "loss": 2.53116, "ppl": 12.568, "lr": 4.043e-05, "lr_sched": "cosine", "grad_norm": 0.3883, "tokens": 2922670080, "tok_s": 81346.2, "elapsed_s": 35928.8, "vram_gb": 0.59, "ram_pct": 19.3, "disk_free_gb": 20.49, "attn_backend": "chunked", "amp_dtype": "torch.float16"} {"step": 1260, "epoch": 0.997, "loss": 2.52288, "ppl": 12.464, "lr": 4.0017e-05, "lr_sched": "cosine", "grad_norm": 0.4029, "tokens": 2969809920, "tok_s": 81286.3, "elapsed_s": 36535.2, "vram_gb": 0.59, "ram_pct": 19.3, "disk_free_gb": 20.49, "attn_backend": "chunked", "amp_dtype": "torch.float16"}