{ "train_runtime": 15528.6205, "train_samples_per_second": 0.103, "train_steps_per_second": 0.103, "train_loss": 0.19463951266836374, "epoch": 1.0, "config": { "vein_name": "html_r64_v2", "start_step": 600, "end_step": 2200, "batch_size": 1, "grad_accum": 1, "lora_r": 64, "lora_alpha": 64, "warmup_steps": 10, "rs_lora": false, "learning_rate": 0.0002, "max_length": 23401, "dataset_path": "dataset.jsonl", "dataset_index_start": 1618, "dataset_index_end": 1436, "total_samples_processed": 5000 }, "gpu_memory": { "peak_reserved_gb": 19.139 }, "step_by_step_metrics": [ { "step": 601, "timestamp": "2025-12-28T08:58:29.750718", "elapsed_time": 24.035809755325317, "loss": 0.2155, "grad_norm": 0.1604655683040619, "learning_rate": 0.0, "epoch": 0.000625 }, { "step": 602, "timestamp": "2025-12-28T08:58:36.494748", "elapsed_time": 30.779838800430298, "loss": 0.152, "grad_norm": 0.1441534161567688, "learning_rate": 2e-05, "epoch": 0.00125 }, { "step": 603, "timestamp": "2025-12-28T08:58:46.390445", "elapsed_time": 40.67553472518921, "loss": 0.1768, "grad_norm": 0.14501173794269562, "learning_rate": 4e-05, "epoch": 0.001875 }, { "step": 604, "timestamp": "2025-12-28T08:58:52.949267", "elapsed_time": 47.234357595443726, "loss": 0.3154, "grad_norm": 0.20344674587249756, "learning_rate": 6e-05, "epoch": 0.0025 }, { "step": 605, "timestamp": "2025-12-28T08:59:02.976637", "elapsed_time": 57.26172733306885, "loss": 0.1981, "grad_norm": 0.1389472484588623, "learning_rate": 8e-05, "epoch": 0.003125 }, { "step": 606, "timestamp": "2025-12-28T08:59:14.217365", "elapsed_time": 68.50245547294617, "loss": 0.1454, "grad_norm": 0.12427599728107452, "learning_rate": 0.0001, "epoch": 0.00375 }, { "step": 607, "timestamp": "2025-12-28T08:59:19.569352", "elapsed_time": 73.85444259643555, "loss": 0.3674, "grad_norm": 0.21498622000217438, "learning_rate": 0.00012, "epoch": 0.004375 }, { "step": 608, "timestamp": "2025-12-28T08:59:32.803346", "elapsed_time": 87.08843612670898, "loss": 0.1358, "grad_norm": 0.1065230667591095, "learning_rate": 0.00014, "epoch": 0.005 }, { "step": 609, "timestamp": "2025-12-28T08:59:43.029459", "elapsed_time": 97.31454968452454, "loss": 0.1995, "grad_norm": 0.19915856420993805, "learning_rate": 0.00016, "epoch": 0.005625 }, { "step": 610, "timestamp": "2025-12-28T08:59:47.972807", "elapsed_time": 102.25789737701416, "loss": 0.2193, "grad_norm": 0.18390510976314545, "learning_rate": 0.00018, "epoch": 0.00625 }, { "step": 611, "timestamp": "2025-12-28T08:59:56.025895", "elapsed_time": 110.31098580360413, "loss": 0.1661, "grad_norm": 0.14340169727802277, "learning_rate": 0.0002, "epoch": 0.006875 }, { "step": 612, "timestamp": "2025-12-28T09:00:04.192186", "elapsed_time": 118.47727656364441, "loss": 0.1867, "grad_norm": 0.17027480900287628, "learning_rate": 0.000199874213836478, "epoch": 0.0075 }, { "step": 613, "timestamp": "2025-12-28T09:00:15.363335", "elapsed_time": 129.64842581748962, "loss": 0.1322, "grad_norm": 0.12804803252220154, "learning_rate": 0.000199748427672956, "epoch": 0.008125 }, { "step": 614, "timestamp": "2025-12-28T09:00:22.801954", "elapsed_time": 137.0870442390442, "loss": 0.3998, "grad_norm": 0.2402307242155075, "learning_rate": 0.00019962264150943397, "epoch": 0.00875 }, { "step": 615, "timestamp": "2025-12-28T09:00:41.397275", "elapsed_time": 155.68236541748047, "loss": 0.167, "grad_norm": 0.12692536413669586, "learning_rate": 0.00019949685534591195, "epoch": 0.009375 }, { "step": 616, "timestamp": "2025-12-28T09:00:57.925605", "elapsed_time": 172.2106957435608, "loss": 0.1226, "grad_norm": 0.1139947846531868, "learning_rate": 0.00019937106918238996, "epoch": 0.01 }, { "step": 617, "timestamp": "2025-12-28T09:01:03.902990", "elapsed_time": 178.1880807876587, "loss": 0.2534, "grad_norm": 0.2593400478363037, "learning_rate": 0.00019924528301886794, "epoch": 0.010625 }, { "step": 618, "timestamp": "2025-12-28T09:01:19.300000", "elapsed_time": 193.58509039878845, "loss": 0.1422, "grad_norm": 0.12689131498336792, "learning_rate": 0.00019911949685534592, "epoch": 0.01125 }, { "step": 619, "timestamp": "2025-12-28T09:01:25.737299", "elapsed_time": 200.0223891735077, "loss": 0.1783, "grad_norm": 0.16301578283309937, "learning_rate": 0.0001989937106918239, "epoch": 0.011875 }, { "step": 620, "timestamp": "2025-12-28T09:01:38.363267", "elapsed_time": 212.64835739135742, "loss": 0.1295, "grad_norm": 0.13712646067142487, "learning_rate": 0.0001988679245283019, "epoch": 0.0125 }, { "step": 621, "timestamp": "2025-12-28T09:01:43.657825", "elapsed_time": 217.9429154396057, "loss": 0.2815, "grad_norm": 0.21860435605049133, "learning_rate": 0.00019874213836477988, "epoch": 0.013125 }, { "step": 622, "timestamp": "2025-12-28T09:01:52.793187", "elapsed_time": 227.07827711105347, "loss": 0.3943, "grad_norm": 0.20962341129779816, "learning_rate": 0.00019861635220125786, "epoch": 0.01375 }, { "step": 623, "timestamp": "2025-12-28T09:02:00.534713", "elapsed_time": 234.81980347633362, "loss": 0.3958, "grad_norm": 0.17121204733848572, "learning_rate": 0.00019849056603773587, "epoch": 0.014375 }, { "step": 624, "timestamp": "2025-12-28T09:02:09.703390", "elapsed_time": 243.98848390579224, "loss": 0.2354, "grad_norm": 0.16160890460014343, "learning_rate": 0.00019836477987421385, "epoch": 0.015 }, { "step": 625, "timestamp": "2025-12-28T09:02:15.193169", "elapsed_time": 249.47825932502747, "loss": 0.2254, "grad_norm": 0.2050420343875885, "learning_rate": 0.00019823899371069183, "epoch": 0.015625 }, { "step": 626, "timestamp": "2025-12-28T09:02:23.621480", "elapsed_time": 257.9065706729889, "loss": 0.3533, "grad_norm": 0.16951978206634521, "learning_rate": 0.00019811320754716983, "epoch": 0.01625 }, { "step": 627, "timestamp": "2025-12-28T09:02:40.925797", "elapsed_time": 275.21088790893555, "loss": 0.1368, "grad_norm": 0.12023195624351501, "learning_rate": 0.0001979874213836478, "epoch": 0.016875 }, { "step": 628, "timestamp": "2025-12-28T09:02:47.264513", "elapsed_time": 281.54960775375366, "loss": 0.1796, "grad_norm": 0.3685634434223175, "learning_rate": 0.0001978616352201258, "epoch": 0.0175 }, { "step": 629, "timestamp": "2025-12-28T09:02:54.630161", "elapsed_time": 288.9152555465698, "loss": 0.1811, "grad_norm": 0.1793571263551712, "learning_rate": 0.0001977358490566038, "epoch": 0.018125 }, { "step": 630, "timestamp": "2025-12-28T09:03:01.180041", "elapsed_time": 295.4651312828064, "loss": 0.2804, "grad_norm": 0.23330993950366974, "learning_rate": 0.00019761006289308177, "epoch": 0.01875 }, { "step": 631, "timestamp": "2025-12-28T09:03:11.837408", "elapsed_time": 306.12249875068665, "loss": 0.2404, "grad_norm": 0.20730850100517273, "learning_rate": 0.00019748427672955975, "epoch": 0.019375 }, { "step": 632, "timestamp": "2025-12-28T09:03:16.778563", "elapsed_time": 311.0636534690857, "loss": 0.3396, "grad_norm": 0.22279325127601624, "learning_rate": 0.00019735849056603773, "epoch": 0.02 }, { "step": 633, "timestamp": "2025-12-28T09:03:25.959314", "elapsed_time": 320.2444043159485, "loss": 0.161, "grad_norm": 0.17951494455337524, "learning_rate": 0.00019723270440251574, "epoch": 0.020625 }, { "step": 634, "timestamp": "2025-12-28T09:03:42.789283", "elapsed_time": 337.07437324523926, "loss": 0.1691, "grad_norm": 0.12446315586566925, "learning_rate": 0.00019710691823899372, "epoch": 0.02125 }, { "step": 635, "timestamp": "2025-12-28T09:03:54.312342", "elapsed_time": 348.5974371433258, "loss": 0.1439, "grad_norm": 0.19256795942783356, "learning_rate": 0.0001969811320754717, "epoch": 0.021875 }, { "step": 636, "timestamp": "2025-12-28T09:04:03.177467", "elapsed_time": 357.46255707740784, "loss": 0.2008, "grad_norm": 0.13904313743114471, "learning_rate": 0.0001968553459119497, "epoch": 0.0225 }, { "step": 637, "timestamp": "2025-12-28T09:04:10.990148", "elapsed_time": 365.2752380371094, "loss": 0.1878, "grad_norm": 0.1624336838722229, "learning_rate": 0.00019672955974842768, "epoch": 0.023125 }, { "step": 638, "timestamp": "2025-12-28T09:04:17.546438", "elapsed_time": 371.83152866363525, "loss": 0.1937, "grad_norm": 0.17041485011577606, "learning_rate": 0.00019660377358490566, "epoch": 0.02375 }, { "step": 639, "timestamp": "2025-12-28T09:04:21.596846", "elapsed_time": 375.88193678855896, "loss": 0.2582, "grad_norm": 0.20617352426052094, "learning_rate": 0.00019647798742138367, "epoch": 0.024375 }, { "step": 640, "timestamp": "2025-12-28T09:04:32.606180", "elapsed_time": 386.8912706375122, "loss": 0.1758, "grad_norm": 0.1377355009317398, "learning_rate": 0.00019635220125786165, "epoch": 0.025 }, { "step": 641, "timestamp": "2025-12-28T09:04:43.506700", "elapsed_time": 397.7917902469635, "loss": 0.3234, "grad_norm": 0.1705474555492401, "learning_rate": 0.00019622641509433963, "epoch": 0.025625 }, { "step": 642, "timestamp": "2025-12-28T09:04:51.282799", "elapsed_time": 405.5678901672363, "loss": 0.2184, "grad_norm": 0.2311810404062271, "learning_rate": 0.00019610062893081763, "epoch": 0.02625 }, { "step": 643, "timestamp": "2025-12-28T09:04:57.361632", "elapsed_time": 411.6467225551605, "loss": 0.188, "grad_norm": 0.1845218986272812, "learning_rate": 0.0001959748427672956, "epoch": 0.026875 }, { "step": 644, "timestamp": "2025-12-28T09:05:06.307667", "elapsed_time": 420.5927574634552, "loss": 0.2218, "grad_norm": 0.18984770774841309, "learning_rate": 0.0001958490566037736, "epoch": 0.0275 }, { "step": 645, "timestamp": "2025-12-28T09:05:20.346611", "elapsed_time": 434.6317012310028, "loss": 0.1774, "grad_norm": 0.1433676928281784, "learning_rate": 0.00019572327044025157, "epoch": 0.028125 }, { "step": 646, "timestamp": "2025-12-28T09:05:25.128238", "elapsed_time": 439.4133291244507, "loss": 0.3369, "grad_norm": 0.22627809643745422, "learning_rate": 0.00019559748427672958, "epoch": 0.02875 }, { "step": 647, "timestamp": "2025-12-28T09:05:36.797870", "elapsed_time": 451.08296036720276, "loss": 0.1735, "grad_norm": 0.16844865679740906, "learning_rate": 0.00019547169811320755, "epoch": 0.029375 }, { "step": 648, "timestamp": "2025-12-28T09:05:57.683316", "elapsed_time": 471.96840620040894, "loss": 0.1199, "grad_norm": 0.11961396783590317, "learning_rate": 0.00019534591194968553, "epoch": 0.03 }, { "step": 649, "timestamp": "2025-12-28T09:06:05.657286", "elapsed_time": 479.9423773288727, "loss": 0.2306, "grad_norm": 0.1685563325881958, "learning_rate": 0.00019522012578616354, "epoch": 0.030625 }, { "step": 650, "timestamp": "2025-12-28T09:06:20.462627", "elapsed_time": 494.7477180957794, "loss": 0.1896, "grad_norm": 0.12992839515209198, "learning_rate": 0.00019509433962264152, "epoch": 0.03125 }, { "step": 651, "timestamp": "2025-12-28T09:06:31.640433", "elapsed_time": 505.92552399635315, "loss": 0.135, "grad_norm": 0.12081814557313919, "learning_rate": 0.0001949685534591195, "epoch": 0.031875 }, { "step": 652, "timestamp": "2025-12-28T09:06:42.823577", "elapsed_time": 517.1086673736572, "loss": 0.1369, "grad_norm": 0.11642343550920486, "learning_rate": 0.0001948427672955975, "epoch": 0.0325 }, { "step": 653, "timestamp": "2025-12-28T09:06:51.916795", "elapsed_time": 526.2018857002258, "loss": 0.1889, "grad_norm": 0.13910572230815887, "learning_rate": 0.00019471698113207548, "epoch": 0.033125 }, { "step": 654, "timestamp": "2025-12-28T09:07:02.155700", "elapsed_time": 536.4407908916473, "loss": 0.3567, "grad_norm": 0.1917208731174469, "learning_rate": 0.00019459119496855346, "epoch": 0.03375 }, { "step": 655, "timestamp": "2025-12-28T09:07:15.706369", "elapsed_time": 549.9914598464966, "loss": 0.1709, "grad_norm": 0.12028059363365173, "learning_rate": 0.00019446540880503147, "epoch": 0.034375 }, { "step": 656, "timestamp": "2025-12-28T09:07:32.250659", "elapsed_time": 566.535749912262, "loss": 0.1228, "grad_norm": 0.19263313710689545, "learning_rate": 0.00019433962264150945, "epoch": 0.035 }, { "step": 657, "timestamp": "2025-12-28T09:07:39.316479", "elapsed_time": 573.601569890976, "loss": 0.2289, "grad_norm": 0.2995343506336212, "learning_rate": 0.00019421383647798743, "epoch": 0.035625 }, { "step": 658, "timestamp": "2025-12-28T09:07:54.181478", "elapsed_time": 588.4665679931641, "loss": 0.1498, "grad_norm": 0.13047035038471222, "learning_rate": 0.00019408805031446543, "epoch": 0.03625 }, { "step": 659, "timestamp": "2025-12-28T09:08:01.427506", "elapsed_time": 595.7125968933105, "loss": 0.1957, "grad_norm": 0.16221193969249725, "learning_rate": 0.0001939622641509434, "epoch": 0.036875 }, { "step": 660, "timestamp": "2025-12-28T09:08:07.362296", "elapsed_time": 601.6473863124847, "loss": 0.1755, "grad_norm": 0.1594441831111908, "learning_rate": 0.0001938364779874214, "epoch": 0.0375 }, { "step": 661, "timestamp": "2025-12-28T09:08:13.438049", "elapsed_time": 607.7231397628784, "loss": 0.2449, "grad_norm": 0.18605917692184448, "learning_rate": 0.00019371069182389937, "epoch": 0.038125 }, { "step": 662, "timestamp": "2025-12-28T09:08:25.983999", "elapsed_time": 620.2690892219543, "loss": 0.1445, "grad_norm": 0.11945810914039612, "learning_rate": 0.00019358490566037738, "epoch": 0.03875 }, { "step": 663, "timestamp": "2025-12-28T09:08:34.320553", "elapsed_time": 628.6056432723999, "loss": 0.1952, "grad_norm": 0.15212717652320862, "learning_rate": 0.00019345911949685536, "epoch": 0.039375 }, { "step": 664, "timestamp": "2025-12-28T09:08:47.204012", "elapsed_time": 641.4891028404236, "loss": 0.1264, "grad_norm": 0.1063343733549118, "learning_rate": 0.00019333333333333333, "epoch": 0.04 }, { "step": 665, "timestamp": "2025-12-28T09:08:55.268548", "elapsed_time": 649.5536382198334, "loss": 0.1998, "grad_norm": 0.1476004421710968, "learning_rate": 0.00019320754716981134, "epoch": 0.040625 }, { "step": 666, "timestamp": "2025-12-28T09:09:11.683318", "elapsed_time": 665.9684083461761, "loss": 0.1221, "grad_norm": 0.1132006123661995, "learning_rate": 0.00019308176100628932, "epoch": 0.04125 }, { "step": 667, "timestamp": "2025-12-28T09:09:22.208240", "elapsed_time": 676.493331193924, "loss": 0.1504, "grad_norm": 0.13121221959590912, "learning_rate": 0.0001929559748427673, "epoch": 0.041875 }, { "step": 668, "timestamp": "2025-12-28T09:09:42.700506", "elapsed_time": 696.9855964183807, "loss": 0.121, "grad_norm": 0.0900096669793129, "learning_rate": 0.0001928301886792453, "epoch": 0.0425 }, { "step": 669, "timestamp": "2025-12-28T09:09:57.623005", "elapsed_time": 711.9080958366394, "loss": 0.1276, "grad_norm": 0.11430996656417847, "learning_rate": 0.00019270440251572328, "epoch": 0.043125 }, { "step": 670, "timestamp": "2025-12-28T09:10:04.247302", "elapsed_time": 718.5323920249939, "loss": 0.1923, "grad_norm": 0.19039106369018555, "learning_rate": 0.00019257861635220126, "epoch": 0.04375 }, { "step": 671, "timestamp": "2025-12-28T09:10:16.730363", "elapsed_time": 731.0154540538788, "loss": 0.314, "grad_norm": 0.1691213995218277, "learning_rate": 0.00019245283018867927, "epoch": 0.044375 }, { "step": 672, "timestamp": "2025-12-28T09:10:26.112523", "elapsed_time": 740.3976130485535, "loss": 0.164, "grad_norm": 0.13402487337589264, "learning_rate": 0.00019232704402515725, "epoch": 0.045 }, { "step": 673, "timestamp": "2025-12-28T09:10:34.372277", "elapsed_time": 748.6573669910431, "loss": 0.1849, "grad_norm": 0.14694516360759735, "learning_rate": 0.00019220125786163523, "epoch": 0.045625 }, { "step": 674, "timestamp": "2025-12-28T09:10:44.792607", "elapsed_time": 759.0776975154877, "loss": 0.1962, "grad_norm": 0.15182200074195862, "learning_rate": 0.0001920754716981132, "epoch": 0.04625 }, { "step": 675, "timestamp": "2025-12-28T09:10:54.996775", "elapsed_time": 769.2818658351898, "loss": 0.1313, "grad_norm": 0.11995584517717361, "learning_rate": 0.0001919496855345912, "epoch": 0.046875 }, { "step": 676, "timestamp": "2025-12-28T09:11:03.629001", "elapsed_time": 777.9140913486481, "loss": 0.2438, "grad_norm": 0.14396265149116516, "learning_rate": 0.0001918238993710692, "epoch": 0.0475 }, { "step": 677, "timestamp": "2025-12-28T09:11:12.607355", "elapsed_time": 786.8924453258514, "loss": 0.2567, "grad_norm": 0.1717308908700943, "learning_rate": 0.00019169811320754717, "epoch": 0.048125 }, { "step": 678, "timestamp": "2025-12-28T09:11:25.911563", "elapsed_time": 800.196653842926, "loss": 0.1644, "grad_norm": 0.14998085796833038, "learning_rate": 0.00019157232704402518, "epoch": 0.04875 }, { "step": 679, "timestamp": "2025-12-28T09:11:33.380615", "elapsed_time": 807.6657054424286, "loss": 0.3003, "grad_norm": 0.16212862730026245, "learning_rate": 0.00019144654088050316, "epoch": 0.049375 }, { "step": 680, "timestamp": "2025-12-28T09:11:39.286109", "elapsed_time": 813.5711998939514, "loss": 0.261, "grad_norm": 0.19972242414951324, "learning_rate": 0.00019132075471698114, "epoch": 0.05 }, { "step": 681, "timestamp": "2025-12-28T09:11:48.163985", "elapsed_time": 822.4490756988525, "loss": 0.3926, "grad_norm": 0.17470583319664001, "learning_rate": 0.00019119496855345914, "epoch": 0.050625 }, { "step": 682, "timestamp": "2025-12-28T09:12:01.357382", "elapsed_time": 835.6424729824066, "loss": 0.1516, "grad_norm": 0.1337703913450241, "learning_rate": 0.00019106918238993712, "epoch": 0.05125 }, { "step": 683, "timestamp": "2025-12-28T09:12:11.393661", "elapsed_time": 845.678751707077, "loss": 0.2966, "grad_norm": 0.14767307043075562, "learning_rate": 0.0001909433962264151, "epoch": 0.051875 }, { "step": 684, "timestamp": "2025-12-28T09:12:17.843729", "elapsed_time": 852.1288199424744, "loss": 0.217, "grad_norm": 0.16958969831466675, "learning_rate": 0.0001908176100628931, "epoch": 0.0525 }, { "step": 685, "timestamp": "2025-12-28T09:12:25.829441", "elapsed_time": 860.1145317554474, "loss": 0.1656, "grad_norm": 0.14111389219760895, "learning_rate": 0.00019069182389937108, "epoch": 0.053125 }, { "step": 686, "timestamp": "2025-12-28T09:12:37.674364", "elapsed_time": 871.9594547748566, "loss": 0.1438, "grad_norm": 0.11669319868087769, "learning_rate": 0.00019056603773584906, "epoch": 0.05375 }, { "step": 687, "timestamp": "2025-12-28T09:12:56.652637", "elapsed_time": 890.9377274513245, "loss": 0.1478, "grad_norm": 0.12338529527187347, "learning_rate": 0.00019044025157232704, "epoch": 0.054375 }, { "step": 688, "timestamp": "2025-12-28T09:13:02.953778", "elapsed_time": 897.2388682365417, "loss": 0.198, "grad_norm": 0.1538494974374771, "learning_rate": 0.00019031446540880505, "epoch": 0.055 }, { "step": 689, "timestamp": "2025-12-28T09:13:11.878858", "elapsed_time": 906.1639485359192, "loss": 0.1424, "grad_norm": 0.12444575875997543, "learning_rate": 0.00019018867924528303, "epoch": 0.055625 }, { "step": 690, "timestamp": "2025-12-28T09:13:20.334671", "elapsed_time": 914.61976146698, "loss": 0.1823, "grad_norm": 0.19447259604930878, "learning_rate": 0.000190062893081761, "epoch": 0.05625 }, { "step": 691, "timestamp": "2025-12-28T09:13:27.616304", "elapsed_time": 921.9013941287994, "loss": 0.1547, "grad_norm": 0.14511334896087646, "learning_rate": 0.00018993710691823901, "epoch": 0.056875 }, { "step": 692, "timestamp": "2025-12-28T09:13:41.557565", "elapsed_time": 935.8426558971405, "loss": 0.1594, "grad_norm": 0.11656054854393005, "learning_rate": 0.000189811320754717, "epoch": 0.0575 }, { "step": 693, "timestamp": "2025-12-28T09:13:51.373998", "elapsed_time": 945.6590890884399, "loss": 0.2355, "grad_norm": 0.1568254977464676, "learning_rate": 0.00018968553459119497, "epoch": 0.058125 }, { "step": 694, "timestamp": "2025-12-28T09:14:06.517264", "elapsed_time": 960.8023540973663, "loss": 0.2098, "grad_norm": 0.12919512391090393, "learning_rate": 0.00018955974842767298, "epoch": 0.05875 }, { "step": 695, "timestamp": "2025-12-28T09:14:14.033362", "elapsed_time": 968.3184523582458, "loss": 0.2264, "grad_norm": 0.21954892575740814, "learning_rate": 0.00018943396226415096, "epoch": 0.059375 }, { "step": 696, "timestamp": "2025-12-28T09:14:23.154345", "elapsed_time": 977.4394352436066, "loss": 0.197, "grad_norm": 0.1527150720357895, "learning_rate": 0.00018930817610062894, "epoch": 0.06 }, { "step": 697, "timestamp": "2025-12-28T09:14:30.973861", "elapsed_time": 985.258951663971, "loss": 0.2027, "grad_norm": 0.16911649703979492, "learning_rate": 0.00018918238993710694, "epoch": 0.060625 }, { "step": 698, "timestamp": "2025-12-28T09:14:38.748666", "elapsed_time": 993.0337567329407, "loss": 0.2625, "grad_norm": 0.15294122695922852, "learning_rate": 0.00018905660377358492, "epoch": 0.06125 }, { "step": 699, "timestamp": "2025-12-28T09:14:46.484732", "elapsed_time": 1000.7698221206665, "loss": 0.2257, "grad_norm": 0.16110914945602417, "learning_rate": 0.0001889308176100629, "epoch": 0.061875 }, { "step": 700, "timestamp": "2025-12-28T09:14:52.017443", "elapsed_time": 1006.3025336265564, "loss": 0.2229, "grad_norm": 0.19415181875228882, "learning_rate": 0.00018880503144654088, "epoch": 0.0625 }, { "step": 701, "timestamp": "2025-12-28T09:15:00.051403", "elapsed_time": 1014.336493730545, "loss": 0.1913, "grad_norm": 0.1673995554447174, "learning_rate": 0.00018867924528301889, "epoch": 0.063125 }, { "step": 702, "timestamp": "2025-12-28T09:15:16.601964", "elapsed_time": 1030.8870537281036, "loss": 0.1377, "grad_norm": 0.09899748116731644, "learning_rate": 0.00018855345911949686, "epoch": 0.06375 }, { "step": 703, "timestamp": "2025-12-28T09:15:30.488977", "elapsed_time": 1044.7740678787231, "loss": 0.1405, "grad_norm": 0.11365586519241333, "learning_rate": 0.00018842767295597484, "epoch": 0.064375 }, { "step": 704, "timestamp": "2025-12-28T09:15:36.750160", "elapsed_time": 1051.0352516174316, "loss": 0.1852, "grad_norm": 0.1651289463043213, "learning_rate": 0.00018830188679245285, "epoch": 0.065 }, { "step": 705, "timestamp": "2025-12-28T09:15:49.379817", "elapsed_time": 1063.6649072170258, "loss": 0.1602, "grad_norm": 0.12038940191268921, "learning_rate": 0.00018817610062893083, "epoch": 0.065625 }, { "step": 706, "timestamp": "2025-12-28T09:15:54.708518", "elapsed_time": 1068.9936089515686, "loss": 0.3003, "grad_norm": 0.26949650049209595, "learning_rate": 0.0001880503144654088, "epoch": 0.06625 }, { "step": 707, "timestamp": "2025-12-28T09:16:05.981736", "elapsed_time": 1080.2668268680573, "loss": 0.1754, "grad_norm": 0.133348748087883, "learning_rate": 0.00018792452830188681, "epoch": 0.066875 }, { "step": 708, "timestamp": "2025-12-28T09:16:11.541142", "elapsed_time": 1085.8262326717377, "loss": 0.1986, "grad_norm": 0.1636824756860733, "learning_rate": 0.0001877987421383648, "epoch": 0.0675 }, { "step": 709, "timestamp": "2025-12-28T09:16:17.744602", "elapsed_time": 1092.0296926498413, "loss": 0.2282, "grad_norm": 0.18413999676704407, "learning_rate": 0.00018767295597484277, "epoch": 0.068125 }, { "step": 710, "timestamp": "2025-12-28T09:16:28.622419", "elapsed_time": 1102.9075095653534, "loss": 0.1403, "grad_norm": 0.1099625825881958, "learning_rate": 0.00018754716981132078, "epoch": 0.06875 }, { "step": 711, "timestamp": "2025-12-28T09:16:38.899862", "elapsed_time": 1113.1849522590637, "loss": 0.3365, "grad_norm": 0.1869092434644699, "learning_rate": 0.00018742138364779876, "epoch": 0.069375 }, { "step": 712, "timestamp": "2025-12-28T09:16:58.976813", "elapsed_time": 1133.2619035243988, "loss": 0.1024, "grad_norm": 0.08866474777460098, "learning_rate": 0.00018729559748427674, "epoch": 0.07 }, { "step": 713, "timestamp": "2025-12-28T09:17:05.528653", "elapsed_time": 1139.8137435913086, "loss": 0.2417, "grad_norm": 0.1919388324022293, "learning_rate": 0.00018716981132075472, "epoch": 0.070625 }, { "step": 714, "timestamp": "2025-12-28T09:17:16.181514", "elapsed_time": 1150.466604232788, "loss": 0.1628, "grad_norm": 0.12882591784000397, "learning_rate": 0.00018704402515723272, "epoch": 0.07125 }, { "step": 715, "timestamp": "2025-12-28T09:17:21.921176", "elapsed_time": 1156.2062666416168, "loss": 0.2503, "grad_norm": 0.17477668821811676, "learning_rate": 0.0001869182389937107, "epoch": 0.071875 }, { "step": 716, "timestamp": "2025-12-28T09:17:31.060009", "elapsed_time": 1165.3450994491577, "loss": 0.19, "grad_norm": 0.15098389983177185, "learning_rate": 0.00018679245283018868, "epoch": 0.0725 }, { "step": 717, "timestamp": "2025-12-28T09:17:51.950466", "elapsed_time": 1186.235556602478, "loss": 0.1353, "grad_norm": 0.12708379328250885, "learning_rate": 0.0001866666666666667, "epoch": 0.073125 }, { "step": 718, "timestamp": "2025-12-28T09:17:57.215989", "elapsed_time": 1191.5010793209076, "loss": 0.2758, "grad_norm": 0.20731349289417267, "learning_rate": 0.00018654088050314467, "epoch": 0.07375 }, { "step": 719, "timestamp": "2025-12-28T09:18:05.159270", "elapsed_time": 1199.4443612098694, "loss": 0.226, "grad_norm": 0.16173875331878662, "learning_rate": 0.00018641509433962264, "epoch": 0.074375 }, { "step": 720, "timestamp": "2025-12-28T09:18:10.909682", "elapsed_time": 1205.1947722434998, "loss": 0.2709, "grad_norm": 0.21631106734275818, "learning_rate": 0.00018628930817610065, "epoch": 0.075 }, { "step": 721, "timestamp": "2025-12-28T09:18:23.590226", "elapsed_time": 1217.875316619873, "loss": 0.2306, "grad_norm": 0.12810084223747253, "learning_rate": 0.00018616352201257863, "epoch": 0.075625 }, { "step": 722, "timestamp": "2025-12-28T09:18:38.922154", "elapsed_time": 1233.2072455883026, "loss": 0.1657, "grad_norm": 0.10849788039922714, "learning_rate": 0.0001860377358490566, "epoch": 0.07625 }, { "step": 723, "timestamp": "2025-12-28T09:18:48.500418", "elapsed_time": 1242.7855124473572, "loss": 0.1657, "grad_norm": 0.17049764096736908, "learning_rate": 0.00018591194968553462, "epoch": 0.076875 }, { "step": 724, "timestamp": "2025-12-28T09:18:57.124927", "elapsed_time": 1251.4100172519684, "loss": 0.1443, "grad_norm": 0.1467668116092682, "learning_rate": 0.0001857861635220126, "epoch": 0.0775 }, { "step": 725, "timestamp": "2025-12-28T09:19:04.646596", "elapsed_time": 1258.9316868782043, "loss": 0.2106, "grad_norm": 0.16145600378513336, "learning_rate": 0.00018566037735849057, "epoch": 0.078125 }, { "step": 726, "timestamp": "2025-12-28T09:19:13.415801", "elapsed_time": 1267.7008922100067, "loss": 0.1683, "grad_norm": 0.13938355445861816, "learning_rate": 0.00018553459119496855, "epoch": 0.07875 }, { "step": 727, "timestamp": "2025-12-28T09:19:19.791168", "elapsed_time": 1274.0762577056885, "loss": 0.1834, "grad_norm": 0.16343441605567932, "learning_rate": 0.00018540880503144656, "epoch": 0.079375 }, { "step": 728, "timestamp": "2025-12-28T09:19:26.780643", "elapsed_time": 1281.065733909607, "loss": 0.3605, "grad_norm": 0.18578511476516724, "learning_rate": 0.00018528301886792454, "epoch": 0.08 }, { "step": 729, "timestamp": "2025-12-28T09:19:40.770904", "elapsed_time": 1295.0559949874878, "loss": 0.1423, "grad_norm": 0.10042066127061844, "learning_rate": 0.00018515723270440252, "epoch": 0.080625 }, { "step": 730, "timestamp": "2025-12-28T09:19:46.875385", "elapsed_time": 1301.1604759693146, "loss": 0.2364, "grad_norm": 0.2259899079799652, "learning_rate": 0.00018503144654088052, "epoch": 0.08125 }, { "step": 731, "timestamp": "2025-12-28T09:19:57.704231", "elapsed_time": 1311.9893221855164, "loss": 0.1339, "grad_norm": 0.15420939028263092, "learning_rate": 0.0001849056603773585, "epoch": 0.081875 }, { "step": 732, "timestamp": "2025-12-28T09:20:05.181888", "elapsed_time": 1319.4669790267944, "loss": 0.1448, "grad_norm": 0.13649825751781464, "learning_rate": 0.00018477987421383648, "epoch": 0.0825 }, { "step": 733, "timestamp": "2025-12-28T09:20:22.787205", "elapsed_time": 1337.0722954273224, "loss": 0.1804, "grad_norm": 0.1483648419380188, "learning_rate": 0.0001846540880503145, "epoch": 0.083125 }, { "step": 734, "timestamp": "2025-12-28T09:20:32.228745", "elapsed_time": 1346.5138351917267, "loss": 0.1621, "grad_norm": 0.1357170045375824, "learning_rate": 0.00018452830188679247, "epoch": 0.08375 }, { "step": 735, "timestamp": "2025-12-28T09:20:39.593318", "elapsed_time": 1353.8784093856812, "loss": 0.1985, "grad_norm": 0.18031319975852966, "learning_rate": 0.00018440251572327045, "epoch": 0.084375 }, { "step": 736, "timestamp": "2025-12-28T09:20:46.467101", "elapsed_time": 1360.752191543579, "loss": 0.4623, "grad_norm": 0.2359013855457306, "learning_rate": 0.00018427672955974845, "epoch": 0.085 }, { "step": 737, "timestamp": "2025-12-28T09:20:53.603967", "elapsed_time": 1367.8890571594238, "loss": 0.2163, "grad_norm": 0.16415388882160187, "learning_rate": 0.00018415094339622643, "epoch": 0.085625 }, { "step": 738, "timestamp": "2025-12-28T09:20:59.877010", "elapsed_time": 1374.1621007919312, "loss": 0.2737, "grad_norm": 0.18287940323352814, "learning_rate": 0.0001840251572327044, "epoch": 0.08625 }, { "step": 739, "timestamp": "2025-12-28T09:21:17.007779", "elapsed_time": 1391.2928698062897, "loss": 0.144, "grad_norm": 0.10500773787498474, "learning_rate": 0.0001838993710691824, "epoch": 0.086875 }, { "step": 740, "timestamp": "2025-12-28T09:21:27.416174", "elapsed_time": 1401.70126414299, "loss": 0.1542, "grad_norm": 0.11694104224443436, "learning_rate": 0.0001837735849056604, "epoch": 0.0875 }, { "step": 741, "timestamp": "2025-12-28T09:21:32.682215", "elapsed_time": 1406.9673054218292, "loss": 0.2283, "grad_norm": 0.17158034443855286, "learning_rate": 0.00018364779874213837, "epoch": 0.088125 }, { "step": 742, "timestamp": "2025-12-28T09:21:42.891669", "elapsed_time": 1417.17675948143, "loss": 0.1413, "grad_norm": 0.12412276864051819, "learning_rate": 0.00018352201257861635, "epoch": 0.08875 }, { "step": 743, "timestamp": "2025-12-28T09:21:50.263790", "elapsed_time": 1424.5488805770874, "loss": 0.322, "grad_norm": 0.2420777678489685, "learning_rate": 0.00018339622641509436, "epoch": 0.089375 }, { "step": 744, "timestamp": "2025-12-28T09:22:01.677433", "elapsed_time": 1435.9625227451324, "loss": 0.1475, "grad_norm": 0.10151813179254532, "learning_rate": 0.00018327044025157234, "epoch": 0.09 }, { "step": 745, "timestamp": "2025-12-28T09:22:11.434022", "elapsed_time": 1445.7191128730774, "loss": 0.3527, "grad_norm": 0.16911114752292633, "learning_rate": 0.00018314465408805032, "epoch": 0.090625 }, { "step": 746, "timestamp": "2025-12-28T09:22:23.572820", "elapsed_time": 1457.8579106330872, "loss": 0.1674, "grad_norm": 0.12217868864536285, "learning_rate": 0.00018301886792452832, "epoch": 0.09125 }, { "step": 747, "timestamp": "2025-12-28T09:22:36.319867", "elapsed_time": 1470.6049571037292, "loss": 0.136, "grad_norm": 0.10214618593454361, "learning_rate": 0.0001828930817610063, "epoch": 0.091875 }, { "step": 748, "timestamp": "2025-12-28T09:22:46.737961", "elapsed_time": 1481.023051738739, "loss": 0.2034, "grad_norm": 0.14041507244110107, "learning_rate": 0.00018276729559748428, "epoch": 0.0925 }, { "step": 749, "timestamp": "2025-12-28T09:22:54.486010", "elapsed_time": 1488.7711000442505, "loss": 0.203, "grad_norm": 0.23749075829982758, "learning_rate": 0.0001826415094339623, "epoch": 0.093125 }, { "step": 750, "timestamp": "2025-12-28T09:23:15.381836", "elapsed_time": 1509.6669268608093, "loss": 0.1252, "grad_norm": 0.09619986265897751, "learning_rate": 0.00018251572327044027, "epoch": 0.09375 }, { "step": 751, "timestamp": "2025-12-28T09:23:26.138595", "elapsed_time": 1520.4236857891083, "loss": 0.1759, "grad_norm": 0.1735697239637375, "learning_rate": 0.00018238993710691825, "epoch": 0.094375 }, { "step": 752, "timestamp": "2025-12-28T09:23:31.779100", "elapsed_time": 1526.064194202423, "loss": 0.4079, "grad_norm": 0.2254641205072403, "learning_rate": 0.00018226415094339625, "epoch": 0.095 }, { "step": 753, "timestamp": "2025-12-28T09:23:48.621990", "elapsed_time": 1542.9070808887482, "loss": 0.1389, "grad_norm": 0.09768623113632202, "learning_rate": 0.00018213836477987423, "epoch": 0.095625 }, { "step": 754, "timestamp": "2025-12-28T09:23:56.146839", "elapsed_time": 1550.4319293498993, "loss": 0.3221, "grad_norm": 0.17994655668735504, "learning_rate": 0.0001820125786163522, "epoch": 0.09625 }, { "step": 755, "timestamp": "2025-12-28T09:24:17.041799", "elapsed_time": 1571.3268892765045, "loss": 0.1407, "grad_norm": 0.09752582013607025, "learning_rate": 0.0001818867924528302, "epoch": 0.096875 }, { "step": 756, "timestamp": "2025-12-28T09:24:27.104362", "elapsed_time": 1581.389452457428, "loss": 0.1681, "grad_norm": 0.21668905019760132, "learning_rate": 0.0001817610062893082, "epoch": 0.0975 }, { "step": 757, "timestamp": "2025-12-28T09:24:38.052954", "elapsed_time": 1592.3380448818207, "loss": 0.2357, "grad_norm": 0.14341306686401367, "learning_rate": 0.00018163522012578617, "epoch": 0.098125 }, { "step": 758, "timestamp": "2025-12-28T09:24:46.883893", "elapsed_time": 1601.168983221054, "loss": 0.1544, "grad_norm": 0.12626543641090393, "learning_rate": 0.00018150943396226415, "epoch": 0.09875 }, { "step": 759, "timestamp": "2025-12-28T09:24:56.091568", "elapsed_time": 1610.3766589164734, "loss": 0.3366, "grad_norm": 0.1486847847700119, "learning_rate": 0.00018138364779874216, "epoch": 0.099375 }, { "step": 760, "timestamp": "2025-12-28T09:25:02.464933", "elapsed_time": 1616.7500236034393, "loss": 0.2171, "grad_norm": 0.1620669662952423, "learning_rate": 0.00018125786163522014, "epoch": 0.1 }, { "step": 761, "timestamp": "2025-12-28T09:25:12.378673", "elapsed_time": 1626.6637637615204, "loss": 0.1958, "grad_norm": 0.13596412539482117, "learning_rate": 0.00018113207547169812, "epoch": 0.100625 }, { "step": 762, "timestamp": "2025-12-28T09:25:24.048320", "elapsed_time": 1638.3334102630615, "loss": 0.1543, "grad_norm": 0.19701559841632843, "learning_rate": 0.00018100628930817612, "epoch": 0.10125 }, { "step": 763, "timestamp": "2025-12-28T09:25:31.137409", "elapsed_time": 1645.4224989414215, "loss": 0.3084, "grad_norm": 0.16929014027118683, "learning_rate": 0.0001808805031446541, "epoch": 0.101875 }, { "step": 764, "timestamp": "2025-12-28T09:25:37.301475", "elapsed_time": 1651.5865650177002, "loss": 0.2045, "grad_norm": 0.1783858984708786, "learning_rate": 0.00018075471698113208, "epoch": 0.1025 }, { "step": 765, "timestamp": "2025-12-28T09:25:42.794771", "elapsed_time": 1657.0798616409302, "loss": 0.2026, "grad_norm": 0.1741485893726349, "learning_rate": 0.0001806289308176101, "epoch": 0.103125 }, { "step": 766, "timestamp": "2025-12-28T09:25:53.503557", "elapsed_time": 1667.788647890091, "loss": 0.1651, "grad_norm": 0.39134204387664795, "learning_rate": 0.00018050314465408807, "epoch": 0.10375 }, { "step": 767, "timestamp": "2025-12-28T09:26:03.591001", "elapsed_time": 1677.8760917186737, "loss": 0.2385, "grad_norm": 0.19072601199150085, "learning_rate": 0.00018037735849056605, "epoch": 0.104375 }, { "step": 768, "timestamp": "2025-12-28T09:26:12.895494", "elapsed_time": 1687.180584192276, "loss": 0.2075, "grad_norm": 0.14767950773239136, "learning_rate": 0.00018025157232704403, "epoch": 0.105 }, { "step": 769, "timestamp": "2025-12-28T09:26:28.488803", "elapsed_time": 1702.7738931179047, "loss": 0.1246, "grad_norm": 0.11118397116661072, "learning_rate": 0.00018012578616352203, "epoch": 0.105625 }, { "step": 770, "timestamp": "2025-12-28T09:26:35.655701", "elapsed_time": 1709.9407913684845, "loss": 0.2173, "grad_norm": 0.15238629281520844, "learning_rate": 0.00018, "epoch": 0.10625 }, { "step": 771, "timestamp": "2025-12-28T09:26:42.135442", "elapsed_time": 1716.420532464981, "loss": 0.4145, "grad_norm": 0.17681153118610382, "learning_rate": 0.000179874213836478, "epoch": 0.106875 }, { "step": 772, "timestamp": "2025-12-28T09:26:54.831081", "elapsed_time": 1729.1161713600159, "loss": 0.2015, "grad_norm": 0.17193950712680817, "learning_rate": 0.000179748427672956, "epoch": 0.1075 }, { "step": 773, "timestamp": "2025-12-28T09:27:05.543585", "elapsed_time": 1739.8286757469177, "loss": 0.1725, "grad_norm": 0.11407608538866043, "learning_rate": 0.00017962264150943398, "epoch": 0.108125 }, { "step": 774, "timestamp": "2025-12-28T09:27:09.428086", "elapsed_time": 1743.713176727295, "loss": 0.3777, "grad_norm": 0.2451329529285431, "learning_rate": 0.00017949685534591195, "epoch": 0.10875 }, { "step": 775, "timestamp": "2025-12-28T09:27:23.100645", "elapsed_time": 1757.3857352733612, "loss": 0.1818, "grad_norm": 0.11372605711221695, "learning_rate": 0.00017937106918238996, "epoch": 0.109375 }, { "step": 776, "timestamp": "2025-12-28T09:27:33.162588", "elapsed_time": 1767.447678565979, "loss": 0.1858, "grad_norm": 0.1400919407606125, "learning_rate": 0.00017924528301886794, "epoch": 0.11 }, { "step": 777, "timestamp": "2025-12-28T09:27:39.454731", "elapsed_time": 1773.7398252487183, "loss": 0.2219, "grad_norm": 0.16964372992515564, "learning_rate": 0.00017911949685534592, "epoch": 0.110625 }, { "step": 778, "timestamp": "2025-12-28T09:27:44.743958", "elapsed_time": 1779.0290484428406, "loss": 0.2554, "grad_norm": 0.17479604482650757, "learning_rate": 0.00017899371069182393, "epoch": 0.11125 }, { "step": 779, "timestamp": "2025-12-28T09:27:53.377780", "elapsed_time": 1787.6628749370575, "loss": 0.1559, "grad_norm": 0.12139801681041718, "learning_rate": 0.0001788679245283019, "epoch": 0.111875 }, { "step": 780, "timestamp": "2025-12-28T09:28:09.536850", "elapsed_time": 1803.821940422058, "loss": 0.1215, "grad_norm": 0.11527290940284729, "learning_rate": 0.00017874213836477988, "epoch": 0.1125 }, { "step": 781, "timestamp": "2025-12-28T09:28:16.916052", "elapsed_time": 1811.2011427879333, "loss": 0.191, "grad_norm": 0.2477898895740509, "learning_rate": 0.00017861635220125786, "epoch": 0.113125 }, { "step": 782, "timestamp": "2025-12-28T09:28:24.055187", "elapsed_time": 1818.3402771949768, "loss": 0.2501, "grad_norm": 0.18624109029769897, "learning_rate": 0.00017849056603773587, "epoch": 0.11375 }, { "step": 783, "timestamp": "2025-12-28T09:28:35.480491", "elapsed_time": 1829.7655820846558, "loss": 0.1427, "grad_norm": 0.10521696507930756, "learning_rate": 0.00017836477987421385, "epoch": 0.114375 }, { "step": 784, "timestamp": "2025-12-28T09:28:46.488517", "elapsed_time": 1840.7736072540283, "loss": 0.153, "grad_norm": 0.14191673696041107, "learning_rate": 0.00017823899371069183, "epoch": 0.115 }, { "step": 785, "timestamp": "2025-12-28T09:28:57.330380", "elapsed_time": 1851.615470647812, "loss": 0.1657, "grad_norm": 0.11947259306907654, "learning_rate": 0.00017811320754716983, "epoch": 0.115625 }, { "step": 786, "timestamp": "2025-12-28T09:29:02.999607", "elapsed_time": 1857.2846972942352, "loss": 0.3162, "grad_norm": 0.20225848257541656, "learning_rate": 0.0001779874213836478, "epoch": 0.11625 }, { "step": 787, "timestamp": "2025-12-28T09:29:14.010140", "elapsed_time": 1868.29523062706, "loss": 0.199, "grad_norm": 0.20261730253696442, "learning_rate": 0.0001778616352201258, "epoch": 0.116875 }, { "step": 788, "timestamp": "2025-12-28T09:29:27.747132", "elapsed_time": 1882.0322229862213, "loss": 0.1879, "grad_norm": 0.1322057545185089, "learning_rate": 0.0001777358490566038, "epoch": 0.1175 }, { "step": 789, "timestamp": "2025-12-28T09:29:32.947439", "elapsed_time": 1887.2325296401978, "loss": 0.2144, "grad_norm": 0.1755441427230835, "learning_rate": 0.00017761006289308178, "epoch": 0.118125 }, { "step": 790, "timestamp": "2025-12-28T09:29:43.515994", "elapsed_time": 1897.8010845184326, "loss": 0.1669, "grad_norm": 0.12597765028476715, "learning_rate": 0.00017748427672955976, "epoch": 0.11875 }, { "step": 791, "timestamp": "2025-12-28T09:29:49.003240", "elapsed_time": 1903.288330078125, "loss": 0.3016, "grad_norm": 0.17289644479751587, "learning_rate": 0.00017735849056603776, "epoch": 0.119375 }, { "step": 792, "timestamp": "2025-12-28T09:29:57.868876", "elapsed_time": 1912.153966665268, "loss": 0.2346, "grad_norm": 0.13611753284931183, "learning_rate": 0.00017723270440251574, "epoch": 0.12 }, { "step": 793, "timestamp": "2025-12-28T09:30:09.439964", "elapsed_time": 1923.7250542640686, "loss": 0.1734, "grad_norm": 0.1182570829987526, "learning_rate": 0.00017710691823899372, "epoch": 0.120625 }, { "step": 794, "timestamp": "2025-12-28T09:30:21.281246", "elapsed_time": 1935.5663363933563, "loss": 0.2104, "grad_norm": 0.16117580235004425, "learning_rate": 0.0001769811320754717, "epoch": 0.12125 }, { "step": 795, "timestamp": "2025-12-28T09:30:35.176733", "elapsed_time": 1949.4618237018585, "loss": 0.2394, "grad_norm": 0.12246831506490707, "learning_rate": 0.0001768553459119497, "epoch": 0.121875 }, { "step": 796, "timestamp": "2025-12-28T09:30:45.890549", "elapsed_time": 1960.1756389141083, "loss": 0.222, "grad_norm": 0.14199329912662506, "learning_rate": 0.00017672955974842768, "epoch": 0.1225 }, { "step": 797, "timestamp": "2025-12-28T09:30:56.848050", "elapsed_time": 1971.1331400871277, "loss": 0.1856, "grad_norm": 0.15696978569030762, "learning_rate": 0.00017660377358490566, "epoch": 0.123125 }, { "step": 798, "timestamp": "2025-12-28T09:31:05.711194", "elapsed_time": 1979.9962842464447, "loss": 0.1754, "grad_norm": 0.12661588191986084, "learning_rate": 0.00017647798742138367, "epoch": 0.12375 }, { "step": 799, "timestamp": "2025-12-28T09:31:14.996719", "elapsed_time": 1989.2818095684052, "loss": 0.202, "grad_norm": 0.1417084038257599, "learning_rate": 0.00017635220125786165, "epoch": 0.124375 }, { "step": 800, "timestamp": "2025-12-28T09:31:23.000694", "elapsed_time": 1997.285789012909, "loss": 0.2166, "grad_norm": 0.13480763137340546, "learning_rate": 0.00017622641509433963, "epoch": 0.125 }, { "step": 801, "timestamp": "2025-12-28T09:31:35.762309", "elapsed_time": 2010.0473990440369, "loss": 0.1369, "grad_norm": 0.45576173067092896, "learning_rate": 0.00017610062893081763, "epoch": 0.125625 }, { "step": 802, "timestamp": "2025-12-28T09:31:43.012873", "elapsed_time": 2017.2979636192322, "loss": 0.2186, "grad_norm": 0.1421194076538086, "learning_rate": 0.0001759748427672956, "epoch": 0.12625 }, { "step": 803, "timestamp": "2025-12-28T09:32:02.199762", "elapsed_time": 2036.484852552414, "loss": 0.1557, "grad_norm": 0.08416479825973511, "learning_rate": 0.0001758490566037736, "epoch": 0.126875 }, { "step": 804, "timestamp": "2025-12-28T09:32:07.151235", "elapsed_time": 2041.4363248348236, "loss": 0.2531, "grad_norm": 0.17905178666114807, "learning_rate": 0.0001757232704402516, "epoch": 0.1275 }, { "step": 805, "timestamp": "2025-12-28T09:32:15.632157", "elapsed_time": 2049.9172480106354, "loss": 0.1335, "grad_norm": 0.1256638765335083, "learning_rate": 0.00017559748427672958, "epoch": 0.128125 }, { "step": 806, "timestamp": "2025-12-28T09:32:31.298185", "elapsed_time": 2065.583275079727, "loss": 0.1427, "grad_norm": 0.0954441949725151, "learning_rate": 0.00017547169811320756, "epoch": 0.12875 }, { "step": 807, "timestamp": "2025-12-28T09:32:39.447526", "elapsed_time": 2073.7326169013977, "loss": 0.1878, "grad_norm": 0.13530845940113068, "learning_rate": 0.00017534591194968554, "epoch": 0.129375 }, { "step": 808, "timestamp": "2025-12-28T09:32:50.466826", "elapsed_time": 2084.751916408539, "loss": 0.1094, "grad_norm": 0.1460101455450058, "learning_rate": 0.00017522012578616354, "epoch": 0.13 }, { "step": 809, "timestamp": "2025-12-28T09:33:11.356705", "elapsed_time": 2105.6417951583862, "loss": 0.1233, "grad_norm": 0.09432196617126465, "learning_rate": 0.00017509433962264152, "epoch": 0.130625 }, { "step": 810, "timestamp": "2025-12-28T09:33:18.523039", "elapsed_time": 2112.808129787445, "loss": 0.2119, "grad_norm": 0.15918022394180298, "learning_rate": 0.0001749685534591195, "epoch": 0.13125 }, { "step": 811, "timestamp": "2025-12-28T09:33:26.829092", "elapsed_time": 2121.114182472229, "loss": 0.1979, "grad_norm": 0.14457164704799652, "learning_rate": 0.0001748427672955975, "epoch": 0.131875 }, { "step": 812, "timestamp": "2025-12-28T09:33:32.394758", "elapsed_time": 2126.6798486709595, "loss": 0.3052, "grad_norm": 0.19710645079612732, "learning_rate": 0.00017471698113207549, "epoch": 0.1325 }, { "step": 813, "timestamp": "2025-12-28T09:33:45.262163", "elapsed_time": 2139.5472536087036, "loss": 0.1832, "grad_norm": 0.17278793454170227, "learning_rate": 0.00017459119496855346, "epoch": 0.133125 }, { "step": 814, "timestamp": "2025-12-28T09:33:55.968938", "elapsed_time": 2150.254028081894, "loss": 0.1271, "grad_norm": 0.10092293471097946, "learning_rate": 0.00017446540880503147, "epoch": 0.13375 }, { "step": 815, "timestamp": "2025-12-28T09:34:03.945511", "elapsed_time": 2158.23060131073, "loss": 0.1929, "grad_norm": 0.1362391710281372, "learning_rate": 0.00017433962264150945, "epoch": 0.134375 }, { "step": 816, "timestamp": "2025-12-28T09:34:17.206545", "elapsed_time": 2171.4916355609894, "loss": 0.1908, "grad_norm": 0.10968425869941711, "learning_rate": 0.00017421383647798743, "epoch": 0.135 }, { "step": 817, "timestamp": "2025-12-28T09:34:23.275224", "elapsed_time": 2177.5603144168854, "loss": 0.2587, "grad_norm": 0.17565354704856873, "learning_rate": 0.00017408805031446543, "epoch": 0.135625 }, { "step": 818, "timestamp": "2025-12-28T09:34:30.707329", "elapsed_time": 2184.9924190044403, "loss": 0.4115, "grad_norm": 0.21891839802265167, "learning_rate": 0.00017396226415094341, "epoch": 0.13625 }, { "step": 819, "timestamp": "2025-12-28T09:34:40.426629", "elapsed_time": 2194.7117190361023, "loss": 0.1647, "grad_norm": 0.1077812984585762, "learning_rate": 0.0001738364779874214, "epoch": 0.136875 }, { "step": 820, "timestamp": "2025-12-28T09:34:51.747535", "elapsed_time": 2206.032625436783, "loss": 0.1475, "grad_norm": 0.14761823415756226, "learning_rate": 0.00017371069182389937, "epoch": 0.1375 }, { "step": 821, "timestamp": "2025-12-28T09:34:55.071621", "elapsed_time": 2209.356710910797, "loss": 0.3967, "grad_norm": 0.24036414921283722, "learning_rate": 0.00017358490566037738, "epoch": 0.138125 }, { "step": 822, "timestamp": "2025-12-28T09:35:15.970127", "elapsed_time": 2230.255217552185, "loss": 0.0833, "grad_norm": 0.09146321564912796, "learning_rate": 0.00017345911949685536, "epoch": 0.13875 }, { "step": 823, "timestamp": "2025-12-28T09:35:26.728440", "elapsed_time": 2241.0135345458984, "loss": 0.2568, "grad_norm": 0.1330772191286087, "learning_rate": 0.00017333333333333334, "epoch": 0.139375 }, { "step": 824, "timestamp": "2025-12-28T09:35:35.721884", "elapsed_time": 2250.006974697113, "loss": 0.3728, "grad_norm": 0.1559283286333084, "learning_rate": 0.00017320754716981134, "epoch": 0.14 }, { "step": 825, "timestamp": "2025-12-28T09:35:44.080694", "elapsed_time": 2258.3657846450806, "loss": 0.1528, "grad_norm": 0.12386928498744965, "learning_rate": 0.00017308176100628932, "epoch": 0.140625 }, { "step": 826, "timestamp": "2025-12-28T09:35:57.707994", "elapsed_time": 2271.9930849075317, "loss": 0.1711, "grad_norm": 0.10447162389755249, "learning_rate": 0.0001729559748427673, "epoch": 0.14125 }, { "step": 827, "timestamp": "2025-12-28T09:36:13.573665", "elapsed_time": 2287.8587548732758, "loss": 0.1396, "grad_norm": 0.09571991115808487, "learning_rate": 0.0001728301886792453, "epoch": 0.141875 }, { "step": 828, "timestamp": "2025-12-28T09:36:20.525116", "elapsed_time": 2294.810206890106, "loss": 0.192, "grad_norm": 0.1702854335308075, "learning_rate": 0.00017270440251572329, "epoch": 0.1425 }, { "step": 829, "timestamp": "2025-12-28T09:36:31.229422", "elapsed_time": 2305.514511823654, "loss": 0.1648, "grad_norm": 0.11315345764160156, "learning_rate": 0.00017257861635220126, "epoch": 0.143125 }, { "step": 830, "timestamp": "2025-12-28T09:36:42.076098", "elapsed_time": 2316.3611884117126, "loss": 0.176, "grad_norm": 0.12335264682769775, "learning_rate": 0.00017245283018867927, "epoch": 0.14375 }, { "step": 831, "timestamp": "2025-12-28T09:36:51.299215", "elapsed_time": 2325.5843057632446, "loss": 0.2268, "grad_norm": 0.2178533375263214, "learning_rate": 0.00017232704402515725, "epoch": 0.144375 }, { "step": 832, "timestamp": "2025-12-28T09:37:04.287014", "elapsed_time": 2338.5721044540405, "loss": 0.1242, "grad_norm": 0.11771131306886673, "learning_rate": 0.00017220125786163523, "epoch": 0.145 }, { "step": 833, "timestamp": "2025-12-28T09:37:12.634424", "elapsed_time": 2346.919515132904, "loss": 0.1969, "grad_norm": 0.13342277705669403, "learning_rate": 0.00017207547169811324, "epoch": 0.145625 }, { "step": 834, "timestamp": "2025-12-28T09:37:24.771648", "elapsed_time": 2359.0567383766174, "loss": 0.1777, "grad_norm": 0.10993700474500656, "learning_rate": 0.00017194968553459121, "epoch": 0.14625 }, { "step": 835, "timestamp": "2025-12-28T09:37:32.205060", "elapsed_time": 2366.49015045166, "loss": 0.2091, "grad_norm": 0.14296327531337738, "learning_rate": 0.0001718238993710692, "epoch": 0.146875 }, { "step": 836, "timestamp": "2025-12-28T09:37:40.267010", "elapsed_time": 2374.552100419998, "loss": 0.2618, "grad_norm": 0.18104122579097748, "learning_rate": 0.00017169811320754717, "epoch": 0.1475 }, { "step": 837, "timestamp": "2025-12-28T09:37:56.881841", "elapsed_time": 2391.166932106018, "loss": 0.1163, "grad_norm": 0.11203698068857193, "learning_rate": 0.00017157232704402518, "epoch": 0.148125 }, { "step": 838, "timestamp": "2025-12-28T09:38:15.789255", "elapsed_time": 2410.0743453502655, "loss": 0.1306, "grad_norm": 0.12151342630386353, "learning_rate": 0.00017144654088050316, "epoch": 0.14875 }, { "step": 839, "timestamp": "2025-12-28T09:38:22.694989", "elapsed_time": 2416.9800794124603, "loss": 0.1991, "grad_norm": 0.14583951234817505, "learning_rate": 0.00017132075471698114, "epoch": 0.149375 }, { "step": 840, "timestamp": "2025-12-28T09:38:32.405677", "elapsed_time": 2426.6907675266266, "loss": 0.5209, "grad_norm": 0.1858104020357132, "learning_rate": 0.00017119496855345914, "epoch": 0.15 }, { "step": 841, "timestamp": "2025-12-28T09:38:41.693108", "elapsed_time": 2435.97819852829, "loss": 0.2115, "grad_norm": 0.12431403249502182, "learning_rate": 0.00017106918238993712, "epoch": 0.150625 }, { "step": 842, "timestamp": "2025-12-28T09:38:49.200296", "elapsed_time": 2443.4853858947754, "loss": 0.2078, "grad_norm": 0.1477956473827362, "learning_rate": 0.0001709433962264151, "epoch": 0.15125 }, { "step": 843, "timestamp": "2025-12-28T09:38:55.712143", "elapsed_time": 2449.9972331523895, "loss": 0.3907, "grad_norm": 0.184353306889534, "learning_rate": 0.0001708176100628931, "epoch": 0.151875 }, { "step": 844, "timestamp": "2025-12-28T09:38:58.933553", "elapsed_time": 2453.218643426895, "loss": 0.2667, "grad_norm": 0.262251079082489, "learning_rate": 0.0001706918238993711, "epoch": 0.1525 }, { "step": 845, "timestamp": "2025-12-28T09:39:05.378036", "elapsed_time": 2459.663126707077, "loss": 0.1593, "grad_norm": 0.12699125707149506, "learning_rate": 0.00017056603773584907, "epoch": 0.153125 }, { "step": 846, "timestamp": "2025-12-28T09:39:15.452772", "elapsed_time": 2469.737862586975, "loss": 0.2266, "grad_norm": 0.134856179356575, "learning_rate": 0.00017044025157232707, "epoch": 0.15375 }, { "step": 847, "timestamp": "2025-12-28T09:39:29.349192", "elapsed_time": 2483.63428235054, "loss": 0.156, "grad_norm": 0.09759867191314697, "learning_rate": 0.00017031446540880505, "epoch": 0.154375 }, { "step": 848, "timestamp": "2025-12-28T09:39:35.325679", "elapsed_time": 2489.6107692718506, "loss": 0.1556, "grad_norm": 0.13298504054546356, "learning_rate": 0.00017018867924528303, "epoch": 0.155 }, { "step": 849, "timestamp": "2025-12-28T09:39:44.060500", "elapsed_time": 2498.345590353012, "loss": 0.1904, "grad_norm": 0.11852674186229706, "learning_rate": 0.000170062893081761, "epoch": 0.155625 }, { "step": 850, "timestamp": "2025-12-28T09:39:54.425345", "elapsed_time": 2508.710435152054, "loss": 0.1842, "grad_norm": 0.22243794798851013, "learning_rate": 0.00016993710691823902, "epoch": 0.15625 }, { "step": 851, "timestamp": "2025-12-28T09:40:05.019054", "elapsed_time": 2519.3041446208954, "loss": 0.1675, "grad_norm": 0.12315836548805237, "learning_rate": 0.000169811320754717, "epoch": 0.156875 }, { "step": 852, "timestamp": "2025-12-28T09:40:18.805001", "elapsed_time": 2533.0900909900665, "loss": 0.186, "grad_norm": 0.1145598366856575, "learning_rate": 0.00016968553459119497, "epoch": 0.1575 }, { "step": 853, "timestamp": "2025-12-28T09:40:26.649585", "elapsed_time": 2540.9346754550934, "loss": 0.2299, "grad_norm": 0.15518611669540405, "learning_rate": 0.00016955974842767298, "epoch": 0.158125 }, { "step": 854, "timestamp": "2025-12-28T09:40:34.119488", "elapsed_time": 2548.4045779705048, "loss": 0.2426, "grad_norm": 0.14574050903320312, "learning_rate": 0.00016943396226415096, "epoch": 0.15875 }, { "step": 855, "timestamp": "2025-12-28T09:40:40.131033", "elapsed_time": 2554.416123867035, "loss": 0.2763, "grad_norm": 0.1946178823709488, "learning_rate": 0.00016930817610062894, "epoch": 0.159375 }, { "step": 856, "timestamp": "2025-12-28T09:40:48.080229", "elapsed_time": 2562.3653190135956, "loss": 0.4301, "grad_norm": 0.16829311847686768, "learning_rate": 0.00016918238993710694, "epoch": 0.16 }, { "step": 857, "timestamp": "2025-12-28T09:41:03.320074", "elapsed_time": 2577.6051642894745, "loss": 0.1631, "grad_norm": 0.19184663891792297, "learning_rate": 0.00016905660377358492, "epoch": 0.160625 }, { "step": 858, "timestamp": "2025-12-28T09:41:12.493085", "elapsed_time": 2586.7781751155853, "loss": 0.1824, "grad_norm": 0.125960111618042, "learning_rate": 0.0001689308176100629, "epoch": 0.16125 }, { "step": 859, "timestamp": "2025-12-28T09:41:21.589208", "elapsed_time": 2595.874298810959, "loss": 0.1654, "grad_norm": 0.12101560086011887, "learning_rate": 0.0001688050314465409, "epoch": 0.161875 }, { "step": 860, "timestamp": "2025-12-28T09:41:30.728410", "elapsed_time": 2605.0134999752045, "loss": 0.3724, "grad_norm": 0.1491203010082245, "learning_rate": 0.0001686792452830189, "epoch": 0.1625 }, { "step": 861, "timestamp": "2025-12-28T09:41:40.445296", "elapsed_time": 2614.7303869724274, "loss": 0.1735, "grad_norm": 0.12737800180912018, "learning_rate": 0.00016855345911949687, "epoch": 0.163125 }, { "step": 862, "timestamp": "2025-12-28T09:41:55.948429", "elapsed_time": 2630.2335200309753, "loss": 0.126, "grad_norm": 0.08794135600328445, "learning_rate": 0.00016842767295597485, "epoch": 0.16375 }, { "step": 863, "timestamp": "2025-12-28T09:41:59.637376", "elapsed_time": 2633.9224672317505, "loss": 0.302, "grad_norm": 0.2143140286207199, "learning_rate": 0.00016830188679245285, "epoch": 0.164375 }, { "step": 864, "timestamp": "2025-12-28T09:42:06.917332", "elapsed_time": 2641.202422618866, "loss": 0.2941, "grad_norm": 0.15600278973579407, "learning_rate": 0.00016817610062893083, "epoch": 0.165 }, { "step": 865, "timestamp": "2025-12-28T09:42:18.849851", "elapsed_time": 2653.134941339493, "loss": 0.1691, "grad_norm": 0.11934607475996017, "learning_rate": 0.0001680503144654088, "epoch": 0.165625 }, { "step": 866, "timestamp": "2025-12-28T09:42:28.174345", "elapsed_time": 2662.459435939789, "loss": 0.1558, "grad_norm": 0.11288256198167801, "learning_rate": 0.00016792452830188682, "epoch": 0.16625 }, { "step": 867, "timestamp": "2025-12-28T09:42:39.834999", "elapsed_time": 2674.1200897693634, "loss": 0.1402, "grad_norm": 0.11293160915374756, "learning_rate": 0.0001677987421383648, "epoch": 0.166875 }, { "step": 868, "timestamp": "2025-12-28T09:42:45.101829", "elapsed_time": 2679.3869194984436, "loss": 0.3336, "grad_norm": 0.21127207577228546, "learning_rate": 0.00016767295597484277, "epoch": 0.1675 }, { "step": 869, "timestamp": "2025-12-28T09:42:53.397932", "elapsed_time": 2687.683022260666, "loss": 0.2122, "grad_norm": 0.13150571286678314, "learning_rate": 0.00016754716981132078, "epoch": 0.168125 }, { "step": 870, "timestamp": "2025-12-28T09:43:02.604043", "elapsed_time": 2696.889132976532, "loss": 0.2443, "grad_norm": 0.16025897860527039, "learning_rate": 0.00016742138364779876, "epoch": 0.16875 }, { "step": 871, "timestamp": "2025-12-28T09:43:16.218580", "elapsed_time": 2710.503670692444, "loss": 0.1361, "grad_norm": 0.1163235530257225, "learning_rate": 0.00016729559748427674, "epoch": 0.169375 }, { "step": 872, "timestamp": "2025-12-28T09:43:23.734614", "elapsed_time": 2718.01970911026, "loss": 0.2797, "grad_norm": 0.3218298852443695, "learning_rate": 0.00016716981132075474, "epoch": 0.17 }, { "step": 873, "timestamp": "2025-12-28T09:43:40.199422", "elapsed_time": 2734.4845123291016, "loss": 0.1531, "grad_norm": 0.09483516216278076, "learning_rate": 0.00016704402515723272, "epoch": 0.170625 }, { "step": 874, "timestamp": "2025-12-28T09:43:46.722337", "elapsed_time": 2741.007427930832, "loss": 0.1846, "grad_norm": 0.13699615001678467, "learning_rate": 0.0001669182389937107, "epoch": 0.17125 }, { "step": 875, "timestamp": "2025-12-28T09:44:01.014473", "elapsed_time": 2755.2995631694794, "loss": 0.2334, "grad_norm": 0.14246909320354462, "learning_rate": 0.00016679245283018868, "epoch": 0.171875 }, { "step": 876, "timestamp": "2025-12-28T09:44:13.506506", "elapsed_time": 2767.7915959358215, "loss": 0.1595, "grad_norm": 0.11375102400779724, "learning_rate": 0.0001666666666666667, "epoch": 0.1725 }, { "step": 877, "timestamp": "2025-12-28T09:44:19.673893", "elapsed_time": 2773.9589836597443, "loss": 0.2133, "grad_norm": 0.1539752185344696, "learning_rate": 0.00016654088050314467, "epoch": 0.173125 }, { "step": 878, "timestamp": "2025-12-28T09:44:31.607564", "elapsed_time": 2785.8926544189453, "loss": 0.3027, "grad_norm": 0.13873888552188873, "learning_rate": 0.00016641509433962265, "epoch": 0.17375 }, { "step": 879, "timestamp": "2025-12-28T09:44:36.488223", "elapsed_time": 2790.773313522339, "loss": 0.2748, "grad_norm": 0.17139685153961182, "learning_rate": 0.00016628930817610065, "epoch": 0.174375 }, { "step": 880, "timestamp": "2025-12-28T09:44:44.545212", "elapsed_time": 2798.83030295372, "loss": 0.192, "grad_norm": 0.14361582696437836, "learning_rate": 0.00016616352201257863, "epoch": 0.175 }, { "step": 881, "timestamp": "2025-12-28T09:44:52.364196", "elapsed_time": 2806.6492867469788, "loss": 0.2773, "grad_norm": 0.18555279076099396, "learning_rate": 0.0001660377358490566, "epoch": 0.175625 }, { "step": 882, "timestamp": "2025-12-28T09:44:58.645574", "elapsed_time": 2812.9306647777557, "loss": 0.3019, "grad_norm": 0.1758921891450882, "learning_rate": 0.00016591194968553462, "epoch": 0.17625 }, { "step": 883, "timestamp": "2025-12-28T09:45:09.067925", "elapsed_time": 2823.353015899658, "loss": 0.3553, "grad_norm": 0.15257853269577026, "learning_rate": 0.0001657861635220126, "epoch": 0.176875 }, { "step": 884, "timestamp": "2025-12-28T09:45:13.656005", "elapsed_time": 2827.94109582901, "loss": 0.2704, "grad_norm": 0.17811451852321625, "learning_rate": 0.00016566037735849058, "epoch": 0.1775 }, { "step": 885, "timestamp": "2025-12-28T09:45:19.215450", "elapsed_time": 2833.500540494919, "loss": 0.1815, "grad_norm": 0.1703587919473648, "learning_rate": 0.00016553459119496858, "epoch": 0.178125 }, { "step": 886, "timestamp": "2025-12-28T09:45:24.676685", "elapsed_time": 2838.9617760181427, "loss": 0.4395, "grad_norm": 0.2093503326177597, "learning_rate": 0.00016540880503144656, "epoch": 0.17875 }, { "step": 887, "timestamp": "2025-12-28T09:45:29.125709", "elapsed_time": 2843.410799264908, "loss": 0.2659, "grad_norm": 0.1815217286348343, "learning_rate": 0.00016528301886792454, "epoch": 0.179375 }, { "step": 888, "timestamp": "2025-12-28T09:45:35.113254", "elapsed_time": 2849.3983447551727, "loss": 0.2541, "grad_norm": 0.18482376635074615, "learning_rate": 0.00016515723270440252, "epoch": 0.18 }, { "step": 889, "timestamp": "2025-12-28T09:45:41.738048", "elapsed_time": 2856.023138523102, "loss": 0.1837, "grad_norm": 0.15669426321983337, "learning_rate": 0.00016503144654088052, "epoch": 0.180625 }, { "step": 890, "timestamp": "2025-12-28T09:45:50.170533", "elapsed_time": 2864.455623626709, "loss": 0.2074, "grad_norm": 0.16513699293136597, "learning_rate": 0.0001649056603773585, "epoch": 0.18125 }, { "step": 891, "timestamp": "2025-12-28T09:45:58.910483", "elapsed_time": 2873.1955733299255, "loss": 0.1612, "grad_norm": 0.1310487985610962, "learning_rate": 0.00016477987421383648, "epoch": 0.181875 }, { "step": 892, "timestamp": "2025-12-28T09:46:08.665921", "elapsed_time": 2882.9510111808777, "loss": 0.1793, "grad_norm": 0.1385423094034195, "learning_rate": 0.0001646540880503145, "epoch": 0.1825 }, { "step": 893, "timestamp": "2025-12-28T09:46:28.595204", "elapsed_time": 2902.8802947998047, "loss": 0.1056, "grad_norm": 0.09160617738962173, "learning_rate": 0.00016452830188679247, "epoch": 0.183125 }, { "step": 894, "timestamp": "2025-12-28T09:46:37.456263", "elapsed_time": 2911.7413532733917, "loss": 0.2347, "grad_norm": 0.1339615136384964, "learning_rate": 0.00016440251572327045, "epoch": 0.18375 }, { "step": 895, "timestamp": "2025-12-28T09:46:43.090133", "elapsed_time": 2917.3752233982086, "loss": 0.2097, "grad_norm": 0.1673922836780548, "learning_rate": 0.00016427672955974845, "epoch": 0.184375 }, { "step": 896, "timestamp": "2025-12-28T09:46:48.742520", "elapsed_time": 2923.0276103019714, "loss": 0.2041, "grad_norm": 0.17525748908519745, "learning_rate": 0.00016415094339622643, "epoch": 0.185 }, { "step": 897, "timestamp": "2025-12-28T09:46:56.108721", "elapsed_time": 2930.3938117027283, "loss": 0.1659, "grad_norm": 0.15093988180160522, "learning_rate": 0.0001640251572327044, "epoch": 0.185625 }, { "step": 898, "timestamp": "2025-12-28T09:47:09.846290", "elapsed_time": 2944.1313841342926, "loss": 0.1298, "grad_norm": 0.09404166042804718, "learning_rate": 0.00016389937106918242, "epoch": 0.18625 }, { "step": 899, "timestamp": "2025-12-28T09:47:21.835574", "elapsed_time": 2956.120668411255, "loss": 0.1425, "grad_norm": 0.10061768442392349, "learning_rate": 0.0001637735849056604, "epoch": 0.186875 }, { "step": 900, "timestamp": "2025-12-28T09:47:32.792316", "elapsed_time": 2967.0774064064026, "loss": 0.124, "grad_norm": 0.09524894505739212, "learning_rate": 0.00016364779874213838, "epoch": 0.1875 }, { "step": 901, "timestamp": "2025-12-28T09:47:44.729124", "elapsed_time": 2979.0142147541046, "loss": 0.1498, "grad_norm": 2.4628686904907227, "learning_rate": 0.00016352201257861635, "epoch": 0.188125 }, { "step": 902, "timestamp": "2025-12-28T09:47:52.982666", "elapsed_time": 2987.267756462097, "loss": 0.2408, "grad_norm": 0.13900604844093323, "learning_rate": 0.00016339622641509436, "epoch": 0.18875 }, { "step": 903, "timestamp": "2025-12-28T09:48:00.120207", "elapsed_time": 2994.4052975177765, "loss": 0.2353, "grad_norm": 0.15295051038265228, "learning_rate": 0.00016327044025157234, "epoch": 0.189375 }, { "step": 904, "timestamp": "2025-12-28T09:48:08.263388", "elapsed_time": 3002.5484788417816, "loss": 0.2088, "grad_norm": 0.14227791130542755, "learning_rate": 0.00016314465408805032, "epoch": 0.19 }, { "step": 905, "timestamp": "2025-12-28T09:48:21.400428", "elapsed_time": 3015.6855177879333, "loss": 0.1279, "grad_norm": 0.13115034997463226, "learning_rate": 0.00016301886792452833, "epoch": 0.190625 }, { "step": 906, "timestamp": "2025-12-28T09:48:30.333067", "elapsed_time": 3024.618157148361, "loss": 0.1772, "grad_norm": 0.13131241500377655, "learning_rate": 0.0001628930817610063, "epoch": 0.19125 }, { "step": 907, "timestamp": "2025-12-28T09:48:39.473331", "elapsed_time": 3033.7584216594696, "loss": 0.1563, "grad_norm": 0.11939848214387894, "learning_rate": 0.00016276729559748428, "epoch": 0.191875 }, { "step": 908, "timestamp": "2025-12-28T09:48:55.692279", "elapsed_time": 3049.9773693084717, "loss": 0.1491, "grad_norm": 0.14415167272090912, "learning_rate": 0.0001626415094339623, "epoch": 0.1925 }, { "step": 909, "timestamp": "2025-12-28T09:49:04.121120", "elapsed_time": 3058.406210422516, "loss": 0.1965, "grad_norm": 0.17701658606529236, "learning_rate": 0.00016251572327044027, "epoch": 0.193125 }, { "step": 910, "timestamp": "2025-12-28T09:49:15.530418", "elapsed_time": 3069.815508365631, "loss": 0.2012, "grad_norm": 0.12482966482639313, "learning_rate": 0.00016238993710691825, "epoch": 0.19375 }, { "step": 911, "timestamp": "2025-12-28T09:49:21.168387", "elapsed_time": 3075.45347738266, "loss": 0.3984, "grad_norm": 0.17656728625297546, "learning_rate": 0.00016226415094339625, "epoch": 0.194375 }, { "step": 912, "timestamp": "2025-12-28T09:49:33.856071", "elapsed_time": 3088.141161441803, "loss": 0.131, "grad_norm": 0.0940021201968193, "learning_rate": 0.00016213836477987423, "epoch": 0.195 }, { "step": 913, "timestamp": "2025-12-28T09:49:45.752863", "elapsed_time": 3100.0379536151886, "loss": 0.165, "grad_norm": 0.19799897074699402, "learning_rate": 0.0001620125786163522, "epoch": 0.195625 }, { "step": 914, "timestamp": "2025-12-28T09:49:51.223696", "elapsed_time": 3105.50878572464, "loss": 0.2392, "grad_norm": 0.39207378029823303, "learning_rate": 0.0001618867924528302, "epoch": 0.19625 }, { "step": 915, "timestamp": "2025-12-28T09:50:04.960806", "elapsed_time": 3119.245896577835, "loss": 0.1131, "grad_norm": 0.09941917657852173, "learning_rate": 0.0001617610062893082, "epoch": 0.196875 }, { "step": 916, "timestamp": "2025-12-28T09:50:12.135355", "elapsed_time": 3126.4204454421997, "loss": 0.1949, "grad_norm": 0.16558235883712769, "learning_rate": 0.00016163522012578618, "epoch": 0.1975 }, { "step": 917, "timestamp": "2025-12-28T09:50:24.159964", "elapsed_time": 3138.4450545310974, "loss": 0.1532, "grad_norm": 0.1185847669839859, "learning_rate": 0.00016150943396226416, "epoch": 0.198125 }, { "step": 918, "timestamp": "2025-12-28T09:50:36.336876", "elapsed_time": 3150.6219668388367, "loss": 0.1423, "grad_norm": 0.11089828610420227, "learning_rate": 0.00016138364779874216, "epoch": 0.19875 }, { "step": 919, "timestamp": "2025-12-28T09:50:42.776184", "elapsed_time": 3157.0612740516663, "loss": 0.3044, "grad_norm": 0.19416838884353638, "learning_rate": 0.00016125786163522014, "epoch": 0.199375 }, { "step": 920, "timestamp": "2025-12-28T09:50:50.019743", "elapsed_time": 3164.304833650589, "loss": 0.4452, "grad_norm": 0.3824036419391632, "learning_rate": 0.00016113207547169812, "epoch": 0.2 }, { "step": 921, "timestamp": "2025-12-28T09:50:57.804129", "elapsed_time": 3172.0892198085785, "loss": 0.3216, "grad_norm": 0.16429851949214935, "learning_rate": 0.00016100628930817613, "epoch": 0.200625 }, { "step": 922, "timestamp": "2025-12-28T09:51:13.052604", "elapsed_time": 3187.337694168091, "loss": 0.1217, "grad_norm": 0.13496516644954681, "learning_rate": 0.0001608805031446541, "epoch": 0.20125 }, { "step": 923, "timestamp": "2025-12-28T09:51:24.570532", "elapsed_time": 3198.855621814728, "loss": 0.1566, "grad_norm": 0.12458086013793945, "learning_rate": 0.00016075471698113208, "epoch": 0.201875 }, { "step": 924, "timestamp": "2025-12-28T09:51:31.848741", "elapsed_time": 3206.133831501007, "loss": 0.2315, "grad_norm": 0.14897367358207703, "learning_rate": 0.0001606289308176101, "epoch": 0.2025 }, { "step": 925, "timestamp": "2025-12-28T09:51:45.734245", "elapsed_time": 3220.019335269928, "loss": 0.111, "grad_norm": 0.09125252813100815, "learning_rate": 0.00016050314465408807, "epoch": 0.203125 }, { "step": 926, "timestamp": "2025-12-28T09:51:58.601974", "elapsed_time": 3232.8870646953583, "loss": 0.1471, "grad_norm": 0.11020820587873459, "learning_rate": 0.00016037735849056605, "epoch": 0.20375 }, { "step": 927, "timestamp": "2025-12-28T09:52:06.148283", "elapsed_time": 3240.43337392807, "loss": 0.1875, "grad_norm": 0.12961116433143616, "learning_rate": 0.00016025157232704405, "epoch": 0.204375 }, { "step": 928, "timestamp": "2025-12-28T09:52:13.614144", "elapsed_time": 3247.899234056473, "loss": 0.1729, "grad_norm": 0.14182396233081818, "learning_rate": 0.00016012578616352203, "epoch": 0.205 }, { "step": 929, "timestamp": "2025-12-28T09:52:18.583980", "elapsed_time": 3252.8690705299377, "loss": 0.3456, "grad_norm": 0.20903103053569794, "learning_rate": 0.00016, "epoch": 0.205625 }, { "step": 930, "timestamp": "2025-12-28T09:52:25.638732", "elapsed_time": 3259.9238221645355, "loss": 0.1962, "grad_norm": 0.13587729632854462, "learning_rate": 0.000159874213836478, "epoch": 0.20625 }, { "step": 931, "timestamp": "2025-12-28T09:52:34.937513", "elapsed_time": 3269.222603082657, "loss": 0.2363, "grad_norm": 0.1760166883468628, "learning_rate": 0.000159748427672956, "epoch": 0.206875 }, { "step": 932, "timestamp": "2025-12-28T09:52:43.090395", "elapsed_time": 3277.3754856586456, "loss": 0.2063, "grad_norm": 0.13720373809337616, "learning_rate": 0.00015962264150943398, "epoch": 0.2075 }, { "step": 933, "timestamp": "2025-12-28T09:52:49.474895", "elapsed_time": 3283.7599856853485, "loss": 0.229, "grad_norm": 0.14343424141407013, "learning_rate": 0.00015949685534591196, "epoch": 0.208125 }, { "step": 934, "timestamp": "2025-12-28T09:53:03.465281", "elapsed_time": 3297.7503714561462, "loss": 0.1478, "grad_norm": 0.15561024844646454, "learning_rate": 0.00015937106918238996, "epoch": 0.20875 }, { "step": 935, "timestamp": "2025-12-28T09:53:10.342764", "elapsed_time": 3304.627854824066, "loss": 0.2767, "grad_norm": 0.15607592463493347, "learning_rate": 0.00015924528301886794, "epoch": 0.209375 }, { "step": 936, "timestamp": "2025-12-28T09:53:15.162542", "elapsed_time": 3309.447632074356, "loss": 0.2736, "grad_norm": 0.18114545941352844, "learning_rate": 0.00015911949685534592, "epoch": 0.21 }, { "step": 937, "timestamp": "2025-12-28T09:53:26.422739", "elapsed_time": 3320.7078297138214, "loss": 0.1715, "grad_norm": 0.11876345425844193, "learning_rate": 0.00015899371069182393, "epoch": 0.210625 }, { "step": 938, "timestamp": "2025-12-28T09:53:32.911148", "elapsed_time": 3327.1962430477142, "loss": 0.1795, "grad_norm": 0.1508261114358902, "learning_rate": 0.0001588679245283019, "epoch": 0.21125 }, { "step": 939, "timestamp": "2025-12-28T09:53:38.919335", "elapsed_time": 3333.2044246196747, "loss": 0.1735, "grad_norm": 0.14189061522483826, "learning_rate": 0.00015874213836477989, "epoch": 0.211875 }, { "step": 940, "timestamp": "2025-12-28T09:53:52.060423", "elapsed_time": 3346.345513343811, "loss": 0.1236, "grad_norm": 0.09674876928329468, "learning_rate": 0.0001586163522012579, "epoch": 0.2125 }, { "step": 941, "timestamp": "2025-12-28T09:54:12.941873", "elapsed_time": 3367.22696352005, "loss": 0.1143, "grad_norm": 0.10512091219425201, "learning_rate": 0.00015849056603773587, "epoch": 0.213125 }, { "step": 942, "timestamp": "2025-12-28T09:54:23.471024", "elapsed_time": 3377.756114244461, "loss": 0.1029, "grad_norm": 0.21176813542842865, "learning_rate": 0.00015836477987421385, "epoch": 0.21375 }, { "step": 943, "timestamp": "2025-12-28T09:54:33.191585", "elapsed_time": 3387.4766755104065, "loss": 0.1839, "grad_norm": 0.12542878091335297, "learning_rate": 0.00015823899371069183, "epoch": 0.214375 }, { "step": 944, "timestamp": "2025-12-28T09:54:53.926827", "elapsed_time": 3408.2119178771973, "loss": 0.1474, "grad_norm": 0.12502753734588623, "learning_rate": 0.00015811320754716983, "epoch": 0.215 }, { "step": 945, "timestamp": "2025-12-28T09:55:05.593186", "elapsed_time": 3419.8782770633698, "loss": 0.1523, "grad_norm": 0.11267740279436111, "learning_rate": 0.00015798742138364781, "epoch": 0.215625 }, { "step": 946, "timestamp": "2025-12-28T09:55:14.970581", "elapsed_time": 3429.2556715011597, "loss": 0.1716, "grad_norm": 0.14452391862869263, "learning_rate": 0.0001578616352201258, "epoch": 0.21625 }, { "step": 947, "timestamp": "2025-12-28T09:55:23.786812", "elapsed_time": 3438.0719022750854, "loss": 0.2105, "grad_norm": 0.14315542578697205, "learning_rate": 0.0001577358490566038, "epoch": 0.216875 }, { "step": 948, "timestamp": "2025-12-28T09:55:27.890524", "elapsed_time": 3442.175614595413, "loss": 0.3344, "grad_norm": 0.2096068263053894, "learning_rate": 0.00015761006289308178, "epoch": 0.2175 }, { "step": 949, "timestamp": "2025-12-28T09:55:37.602478", "elapsed_time": 3451.887568950653, "loss": 0.1969, "grad_norm": 0.12928517162799835, "learning_rate": 0.00015748427672955976, "epoch": 0.218125 }, { "step": 950, "timestamp": "2025-12-28T09:55:42.477278", "elapsed_time": 3456.7623686790466, "loss": 0.2521, "grad_norm": 0.17746160924434662, "learning_rate": 0.00015735849056603776, "epoch": 0.21875 }, { "step": 951, "timestamp": "2025-12-28T09:55:48.417273", "elapsed_time": 3462.702367544174, "loss": 0.1949, "grad_norm": 0.14048974215984344, "learning_rate": 0.00015723270440251574, "epoch": 0.219375 }, { "step": 952, "timestamp": "2025-12-28T09:55:56.843770", "elapsed_time": 3471.1288611888885, "loss": 0.3451, "grad_norm": 0.14857220649719238, "learning_rate": 0.00015710691823899372, "epoch": 0.22 }, { "step": 953, "timestamp": "2025-12-28T09:56:03.831991", "elapsed_time": 3478.117082118988, "loss": 0.2241, "grad_norm": 0.16286444664001465, "learning_rate": 0.00015698113207547173, "epoch": 0.220625 }, { "step": 954, "timestamp": "2025-12-28T09:56:09.773840", "elapsed_time": 3484.0589311122894, "loss": 0.4378, "grad_norm": 0.25359323620796204, "learning_rate": 0.0001568553459119497, "epoch": 0.22125 }, { "step": 955, "timestamp": "2025-12-28T09:56:18.031656", "elapsed_time": 3492.316746234894, "loss": 0.3404, "grad_norm": 0.16270765662193298, "learning_rate": 0.00015672955974842769, "epoch": 0.221875 }, { "step": 956, "timestamp": "2025-12-28T09:56:31.017715", "elapsed_time": 3505.302805662155, "loss": 0.1701, "grad_norm": 0.11110091954469681, "learning_rate": 0.00015660377358490567, "epoch": 0.2225 }, { "step": 957, "timestamp": "2025-12-28T09:56:35.074678", "elapsed_time": 3509.359768629074, "loss": 0.3194, "grad_norm": 0.20245419442653656, "learning_rate": 0.00015647798742138367, "epoch": 0.223125 }, { "step": 958, "timestamp": "2025-12-28T09:56:53.824765", "elapsed_time": 3528.109855890274, "loss": 0.1034, "grad_norm": 0.07926256209611893, "learning_rate": 0.00015635220125786165, "epoch": 0.22375 }, { "step": 959, "timestamp": "2025-12-28T09:57:00.356123", "elapsed_time": 3534.6412131786346, "loss": 0.2092, "grad_norm": 0.17986778914928436, "learning_rate": 0.00015622641509433963, "epoch": 0.224375 }, { "step": 960, "timestamp": "2025-12-28T09:57:08.990782", "elapsed_time": 3543.2758724689484, "loss": 0.1359, "grad_norm": 0.13381750881671906, "learning_rate": 0.00015610062893081764, "epoch": 0.225 }, { "step": 961, "timestamp": "2025-12-28T09:57:26.985511", "elapsed_time": 3561.2706019878387, "loss": 0.0902, "grad_norm": 0.07347288727760315, "learning_rate": 0.00015597484276729561, "epoch": 0.225625 }, { "step": 962, "timestamp": "2025-12-28T09:57:31.711118", "elapsed_time": 3565.996208190918, "loss": 0.2244, "grad_norm": 0.17614322900772095, "learning_rate": 0.0001558490566037736, "epoch": 0.22625 }, { "step": 963, "timestamp": "2025-12-28T09:57:40.681045", "elapsed_time": 3574.9661359786987, "loss": 0.2077, "grad_norm": 0.1338813602924347, "learning_rate": 0.0001557232704402516, "epoch": 0.226875 }, { "step": 964, "timestamp": "2025-12-28T09:57:44.579572", "elapsed_time": 3578.8646624088287, "loss": 0.2926, "grad_norm": 0.22020283341407776, "learning_rate": 0.00015559748427672958, "epoch": 0.2275 }, { "step": 965, "timestamp": "2025-12-28T09:57:52.917233", "elapsed_time": 3587.2023231983185, "loss": 0.1769, "grad_norm": 0.13740260899066925, "learning_rate": 0.00015547169811320756, "epoch": 0.228125 }, { "step": 966, "timestamp": "2025-12-28T09:58:01.752612", "elapsed_time": 3596.0377027988434, "loss": 0.1681, "grad_norm": 0.12490309774875641, "learning_rate": 0.00015534591194968556, "epoch": 0.22875 }, { "step": 967, "timestamp": "2025-12-28T09:58:08.641082", "elapsed_time": 3602.9261722564697, "loss": 0.1626, "grad_norm": 0.12916311621665955, "learning_rate": 0.00015522012578616354, "epoch": 0.229375 }, { "step": 968, "timestamp": "2025-12-28T09:58:21.326631", "elapsed_time": 3615.611721277237, "loss": 0.1157, "grad_norm": 0.10117348283529282, "learning_rate": 0.00015509433962264152, "epoch": 0.23 }, { "step": 969, "timestamp": "2025-12-28T09:58:32.168538", "elapsed_time": 3626.4536283016205, "loss": 0.2099, "grad_norm": 0.21025630831718445, "learning_rate": 0.0001549685534591195, "epoch": 0.230625 }, { "step": 970, "timestamp": "2025-12-28T09:58:40.157526", "elapsed_time": 3634.4426164627075, "loss": 0.2095, "grad_norm": 0.13846275210380554, "learning_rate": 0.0001548427672955975, "epoch": 0.23125 }, { "step": 971, "timestamp": "2025-12-28T09:58:58.528736", "elapsed_time": 3652.8138258457184, "loss": 0.6305, "grad_norm": 0.14148321747779846, "learning_rate": 0.0001547169811320755, "epoch": 0.231875 }, { "step": 972, "timestamp": "2025-12-28T09:59:09.985061", "elapsed_time": 3664.270151615143, "loss": 0.1818, "grad_norm": 0.4846095144748688, "learning_rate": 0.00015459119496855347, "epoch": 0.2325 }, { "step": 973, "timestamp": "2025-12-28T09:59:15.643684", "elapsed_time": 3669.9287745952606, "loss": 0.3468, "grad_norm": 0.18205124139785767, "learning_rate": 0.00015446540880503147, "epoch": 0.233125 }, { "step": 974, "timestamp": "2025-12-28T09:59:24.573388", "elapsed_time": 3678.858478307724, "loss": 0.2059, "grad_norm": 0.13004031777381897, "learning_rate": 0.00015433962264150945, "epoch": 0.23375 }, { "step": 975, "timestamp": "2025-12-28T09:59:32.752142", "elapsed_time": 3687.037232398987, "loss": 0.2575, "grad_norm": 0.1793992817401886, "learning_rate": 0.00015421383647798743, "epoch": 0.234375 }, { "step": 976, "timestamp": "2025-12-28T09:59:42.141949", "elapsed_time": 3696.4270396232605, "loss": 0.1674, "grad_norm": 0.11684457212686539, "learning_rate": 0.00015408805031446544, "epoch": 0.235 }, { "step": 977, "timestamp": "2025-12-28T09:59:49.029099", "elapsed_time": 3703.3141901493073, "loss": 0.1885, "grad_norm": 0.1317356675863266, "learning_rate": 0.00015396226415094342, "epoch": 0.235625 }, { "step": 978, "timestamp": "2025-12-28T09:59:56.964854", "elapsed_time": 3711.2499442100525, "loss": 0.2512, "grad_norm": 0.1530027985572815, "learning_rate": 0.0001538364779874214, "epoch": 0.23625 }, { "step": 979, "timestamp": "2025-12-28T10:00:05.695516", "elapsed_time": 3719.9806060791016, "loss": 0.2406, "grad_norm": 0.13735264539718628, "learning_rate": 0.0001537106918238994, "epoch": 0.236875 }, { "step": 980, "timestamp": "2025-12-28T10:00:13.159866", "elapsed_time": 3727.444956302643, "loss": 0.265, "grad_norm": 0.14935627579689026, "learning_rate": 0.00015358490566037738, "epoch": 0.2375 }, { "step": 981, "timestamp": "2025-12-28T10:00:20.073453", "elapsed_time": 3734.3585438728333, "loss": 0.2331, "grad_norm": 0.15793128311634064, "learning_rate": 0.00015345911949685536, "epoch": 0.238125 }, { "step": 982, "timestamp": "2025-12-28T10:00:28.700639", "elapsed_time": 3742.9857289791107, "loss": 0.3142, "grad_norm": 0.14131158590316772, "learning_rate": 0.00015333333333333334, "epoch": 0.23875 }, { "step": 983, "timestamp": "2025-12-28T10:00:37.338005", "elapsed_time": 3751.623096227646, "loss": 0.1871, "grad_norm": 0.12910977005958557, "learning_rate": 0.00015320754716981134, "epoch": 0.239375 }, { "step": 984, "timestamp": "2025-12-28T10:00:51.515626", "elapsed_time": 3765.8007164001465, "loss": 0.2466, "grad_norm": 0.15984784066677094, "learning_rate": 0.00015308176100628932, "epoch": 0.24 }, { "step": 985, "timestamp": "2025-12-28T10:00:58.996365", "elapsed_time": 3773.2814559936523, "loss": 0.1202, "grad_norm": 0.11847283691167831, "learning_rate": 0.0001529559748427673, "epoch": 0.240625 }, { "step": 986, "timestamp": "2025-12-28T10:01:09.966903", "elapsed_time": 3784.25199341774, "loss": 0.1645, "grad_norm": 0.15656305849552155, "learning_rate": 0.0001528301886792453, "epoch": 0.24125 }, { "step": 987, "timestamp": "2025-12-28T10:01:23.517576", "elapsed_time": 3797.802666902542, "loss": 0.1248, "grad_norm": 0.0928470715880394, "learning_rate": 0.0001527044025157233, "epoch": 0.241875 }, { "step": 988, "timestamp": "2025-12-28T10:01:34.217983", "elapsed_time": 3808.503073453903, "loss": 0.1338, "grad_norm": 0.11139731854200363, "learning_rate": 0.00015257861635220127, "epoch": 0.2425 }, { "step": 989, "timestamp": "2025-12-28T10:01:41.199518", "elapsed_time": 3815.484607934952, "loss": 0.2724, "grad_norm": 0.15163938701152802, "learning_rate": 0.00015245283018867927, "epoch": 0.243125 }, { "step": 990, "timestamp": "2025-12-28T10:01:48.450348", "elapsed_time": 3822.7354385852814, "loss": 0.2476, "grad_norm": 0.17267391085624695, "learning_rate": 0.00015232704402515725, "epoch": 0.24375 }, { "step": 991, "timestamp": "2025-12-28T10:01:56.433279", "elapsed_time": 3830.718369960785, "loss": 0.1465, "grad_norm": 0.11863212287425995, "learning_rate": 0.00015220125786163523, "epoch": 0.244375 }, { "step": 992, "timestamp": "2025-12-28T10:02:09.167701", "elapsed_time": 3843.4527916908264, "loss": 0.1302, "grad_norm": 0.10036662966012955, "learning_rate": 0.00015207547169811324, "epoch": 0.245 }, { "step": 993, "timestamp": "2025-12-28T10:02:14.112829", "elapsed_time": 3848.3979198932648, "loss": 0.3477, "grad_norm": 0.19196945428848267, "learning_rate": 0.00015194968553459122, "epoch": 0.245625 }, { "step": 994, "timestamp": "2025-12-28T10:02:24.995027", "elapsed_time": 3859.2801179885864, "loss": 0.1563, "grad_norm": 0.11495956778526306, "learning_rate": 0.0001518238993710692, "epoch": 0.24625 }, { "step": 995, "timestamp": "2025-12-28T10:02:33.036433", "elapsed_time": 3867.3215239048004, "loss": 0.1802, "grad_norm": 0.11796751618385315, "learning_rate": 0.00015169811320754717, "epoch": 0.246875 }, { "step": 996, "timestamp": "2025-12-28T10:02:50.626582", "elapsed_time": 3884.911673307419, "loss": 0.1721, "grad_norm": 0.10495149344205856, "learning_rate": 0.00015157232704402518, "epoch": 0.2475 }, { "step": 997, "timestamp": "2025-12-28T10:02:56.189074", "elapsed_time": 3890.474164247513, "loss": 0.2895, "grad_norm": 0.18693897128105164, "learning_rate": 0.00015144654088050316, "epoch": 0.248125 }, { "step": 998, "timestamp": "2025-12-28T10:03:03.931060", "elapsed_time": 3898.216150522232, "loss": 0.2034, "grad_norm": 0.1423393189907074, "learning_rate": 0.00015132075471698114, "epoch": 0.24875 }, { "step": 999, "timestamp": "2025-12-28T10:03:21.397855", "elapsed_time": 3915.6829454898834, "loss": 0.1197, "grad_norm": 0.0859493762254715, "learning_rate": 0.00015119496855345914, "epoch": 0.249375 }, { "step": 1000, "timestamp": "2025-12-28T10:03:29.241271", "elapsed_time": 3923.5263612270355, "loss": 0.2176, "grad_norm": 0.13979171216487885, "learning_rate": 0.00015106918238993712, "epoch": 0.25 }, { "step": 1001, "timestamp": "2025-12-28T10:03:35.258000", "elapsed_time": 3929.543091058731, "loss": 0.2195, "grad_norm": 0.15846221148967743, "learning_rate": 0.0001509433962264151, "epoch": 0.250625 }, { "step": 1002, "timestamp": "2025-12-28T10:03:40.199295", "elapsed_time": 3934.484385251999, "loss": 0.345, "grad_norm": 0.18337437510490417, "learning_rate": 0.0001508176100628931, "epoch": 0.25125 }, { "step": 1003, "timestamp": "2025-12-28T10:03:49.416164", "elapsed_time": 3943.701254606247, "loss": 0.2019, "grad_norm": 0.11637191474437714, "learning_rate": 0.0001506918238993711, "epoch": 0.251875 }, { "step": 1004, "timestamp": "2025-12-28T10:04:02.545538", "elapsed_time": 3956.830629825592, "loss": 0.1278, "grad_norm": 0.1031939908862114, "learning_rate": 0.00015056603773584907, "epoch": 0.2525 }, { "step": 1005, "timestamp": "2025-12-28T10:04:08.991273", "elapsed_time": 3963.2763633728027, "loss": 0.2647, "grad_norm": 0.16900090873241425, "learning_rate": 0.00015044025157232707, "epoch": 0.253125 }, { "step": 1006, "timestamp": "2025-12-28T10:04:15.995166", "elapsed_time": 3970.2802562713623, "loss": 0.2239, "grad_norm": 0.1869828701019287, "learning_rate": 0.00015031446540880505, "epoch": 0.25375 }, { "step": 1007, "timestamp": "2025-12-28T10:04:26.260626", "elapsed_time": 3980.5457170009613, "loss": 0.1438, "grad_norm": 0.1253536194562912, "learning_rate": 0.00015018867924528303, "epoch": 0.254375 }, { "step": 1008, "timestamp": "2025-12-28T10:04:33.781669", "elapsed_time": 3988.0667593479156, "loss": 0.2621, "grad_norm": 0.17986273765563965, "learning_rate": 0.00015006289308176104, "epoch": 0.255 }, { "step": 1009, "timestamp": "2025-12-28T10:04:40.413811", "elapsed_time": 3994.698902130127, "loss": 0.2008, "grad_norm": 0.14311149716377258, "learning_rate": 0.000149937106918239, "epoch": 0.255625 }, { "step": 1010, "timestamp": "2025-12-28T10:04:59.959869", "elapsed_time": 4014.2449600696564, "loss": 0.1501, "grad_norm": 0.08852102607488632, "learning_rate": 0.00014981132075471697, "epoch": 0.25625 }, { "step": 1011, "timestamp": "2025-12-28T10:05:07.239700", "elapsed_time": 4021.5247910022736, "loss": 0.2402, "grad_norm": 0.19574478268623352, "learning_rate": 0.00014968553459119498, "epoch": 0.256875 }, { "step": 1012, "timestamp": "2025-12-28T10:05:19.732988", "elapsed_time": 4034.0180780887604, "loss": 0.1703, "grad_norm": 0.10853290557861328, "learning_rate": 0.00014955974842767295, "epoch": 0.2575 }, { "step": 1013, "timestamp": "2025-12-28T10:05:28.914807", "elapsed_time": 4043.1998975276947, "loss": 0.2656, "grad_norm": 0.16859759390354156, "learning_rate": 0.00014943396226415093, "epoch": 0.258125 }, { "step": 1014, "timestamp": "2025-12-28T10:05:40.579825", "elapsed_time": 4054.8649151325226, "loss": 0.1861, "grad_norm": 0.12188176810741425, "learning_rate": 0.00014930817610062894, "epoch": 0.25875 }, { "step": 1015, "timestamp": "2025-12-28T10:05:52.247391", "elapsed_time": 4066.5324816703796, "loss": 0.172, "grad_norm": 0.11021149158477783, "learning_rate": 0.00014918238993710692, "epoch": 0.259375 }, { "step": 1016, "timestamp": "2025-12-28T10:06:02.667177", "elapsed_time": 4076.9522676467896, "loss": 0.1577, "grad_norm": 0.11924094706773758, "learning_rate": 0.0001490566037735849, "epoch": 0.26 }, { "step": 1017, "timestamp": "2025-12-28T10:06:10.516680", "elapsed_time": 4084.8017704486847, "loss": 0.1642, "grad_norm": 0.12346573173999786, "learning_rate": 0.00014893081761006288, "epoch": 0.260625 }, { "step": 1018, "timestamp": "2025-12-28T10:06:15.781551", "elapsed_time": 4090.066641330719, "loss": 0.4633, "grad_norm": 0.2274809330701828, "learning_rate": 0.00014880503144654088, "epoch": 0.26125 }, { "step": 1019, "timestamp": "2025-12-28T10:06:23.024176", "elapsed_time": 4097.30926656723, "loss": 0.2659, "grad_norm": 0.1639455407857895, "learning_rate": 0.00014867924528301886, "epoch": 0.261875 }, { "step": 1020, "timestamp": "2025-12-28T10:06:29.002861", "elapsed_time": 4103.28795170784, "loss": 0.1842, "grad_norm": 0.12965330481529236, "learning_rate": 0.00014855345911949684, "epoch": 0.2625 }, { "step": 1021, "timestamp": "2025-12-28T10:06:41.542093", "elapsed_time": 4115.827183961868, "loss": 0.159, "grad_norm": 0.10215835273265839, "learning_rate": 0.00014842767295597485, "epoch": 0.263125 }, { "step": 1022, "timestamp": "2025-12-28T10:06:47.805495", "elapsed_time": 4122.090589284897, "loss": 0.2087, "grad_norm": 0.19713939726352692, "learning_rate": 0.00014830188679245283, "epoch": 0.26375 }, { "step": 1023, "timestamp": "2025-12-28T10:06:53.583302", "elapsed_time": 4127.8683931827545, "loss": 0.2987, "grad_norm": 0.18019935488700867, "learning_rate": 0.0001481761006289308, "epoch": 0.264375 }, { "step": 1024, "timestamp": "2025-12-28T10:07:00.180550", "elapsed_time": 4134.465640544891, "loss": 0.2478, "grad_norm": 0.1467757374048233, "learning_rate": 0.0001480503144654088, "epoch": 0.265 }, { "step": 1025, "timestamp": "2025-12-28T10:07:10.795758", "elapsed_time": 4145.0808482170105, "loss": 0.1668, "grad_norm": 0.10882839560508728, "learning_rate": 0.0001479245283018868, "epoch": 0.265625 }, { "step": 1026, "timestamp": "2025-12-28T10:07:17.133433", "elapsed_time": 4151.418524265289, "loss": 0.2731, "grad_norm": 0.15907247364521027, "learning_rate": 0.00014779874213836477, "epoch": 0.26625 }, { "step": 1027, "timestamp": "2025-12-28T10:07:24.120391", "elapsed_time": 4158.405481100082, "loss": 0.2377, "grad_norm": 0.13879650831222534, "learning_rate": 0.00014767295597484278, "epoch": 0.266875 }, { "step": 1028, "timestamp": "2025-12-28T10:07:32.464337", "elapsed_time": 4166.749427556992, "loss": 0.1956, "grad_norm": 0.13655611872673035, "learning_rate": 0.00014754716981132076, "epoch": 0.2675 }, { "step": 1029, "timestamp": "2025-12-28T10:07:46.125666", "elapsed_time": 4180.410757303238, "loss": 0.119, "grad_norm": 0.2348964810371399, "learning_rate": 0.00014742138364779873, "epoch": 0.268125 }, { "step": 1030, "timestamp": "2025-12-28T10:07:50.593098", "elapsed_time": 4184.878188371658, "loss": 0.2267, "grad_norm": 0.16901762783527374, "learning_rate": 0.0001472955974842767, "epoch": 0.26875 }, { "step": 1031, "timestamp": "2025-12-28T10:08:03.007589", "elapsed_time": 4197.292679548264, "loss": 0.1174, "grad_norm": 0.08787506818771362, "learning_rate": 0.00014716981132075472, "epoch": 0.269375 }, { "step": 1032, "timestamp": "2025-12-28T10:08:09.168697", "elapsed_time": 4203.453787326813, "loss": 0.3684, "grad_norm": 0.18056993186473846, "learning_rate": 0.0001470440251572327, "epoch": 0.27 }, { "step": 1033, "timestamp": "2025-12-28T10:08:20.157096", "elapsed_time": 4214.442186117172, "loss": 0.1727, "grad_norm": 0.11187569051980972, "learning_rate": 0.00014691823899371068, "epoch": 0.270625 }, { "step": 1034, "timestamp": "2025-12-28T10:08:27.489866", "elapsed_time": 4221.774956703186, "loss": 0.1853, "grad_norm": 0.132780984044075, "learning_rate": 0.00014679245283018868, "epoch": 0.27125 }, { "step": 1035, "timestamp": "2025-12-28T10:08:32.101038", "elapsed_time": 4226.38612818718, "loss": 0.2988, "grad_norm": 0.21782910823822021, "learning_rate": 0.00014666666666666666, "epoch": 0.271875 }, { "step": 1036, "timestamp": "2025-12-28T10:08:38.180903", "elapsed_time": 4232.465993881226, "loss": 0.173, "grad_norm": 0.1362728774547577, "learning_rate": 0.00014654088050314464, "epoch": 0.2725 }, { "step": 1037, "timestamp": "2025-12-28T10:08:47.148440", "elapsed_time": 4241.433530807495, "loss": 0.1494, "grad_norm": 0.11238773167133331, "learning_rate": 0.00014641509433962265, "epoch": 0.273125 }, { "step": 1038, "timestamp": "2025-12-28T10:08:53.691649", "elapsed_time": 4247.976739406586, "loss": 0.2181, "grad_norm": 0.15294326841831207, "learning_rate": 0.00014628930817610063, "epoch": 0.27375 }, { "step": 1039, "timestamp": "2025-12-28T10:08:59.801991", "elapsed_time": 4254.0870814323425, "loss": 0.2035, "grad_norm": 0.14438562095165253, "learning_rate": 0.0001461635220125786, "epoch": 0.274375 }, { "step": 1040, "timestamp": "2025-12-28T10:09:04.922147", "elapsed_time": 4259.2072377204895, "loss": 0.1839, "grad_norm": 0.163107767701149, "learning_rate": 0.0001460377358490566, "epoch": 0.275 }, { "step": 1041, "timestamp": "2025-12-28T10:09:11.363913", "elapsed_time": 4265.649003267288, "loss": 0.2156, "grad_norm": 0.14970509707927704, "learning_rate": 0.0001459119496855346, "epoch": 0.275625 }, { "step": 1042, "timestamp": "2025-12-28T10:09:21.257050", "elapsed_time": 4275.542140722275, "loss": 0.1636, "grad_norm": 0.12284188717603683, "learning_rate": 0.00014578616352201257, "epoch": 0.27625 }, { "step": 1043, "timestamp": "2025-12-28T10:09:28.778477", "elapsed_time": 4283.063568115234, "loss": 0.4273, "grad_norm": 0.18576599657535553, "learning_rate": 0.00014566037735849055, "epoch": 0.276875 }, { "step": 1044, "timestamp": "2025-12-28T10:09:36.716585", "elapsed_time": 4291.001675605774, "loss": 0.187, "grad_norm": 0.13979823887348175, "learning_rate": 0.00014553459119496856, "epoch": 0.2775 }, { "step": 1045, "timestamp": "2025-12-28T10:09:47.537065", "elapsed_time": 4301.822155952454, "loss": 0.1229, "grad_norm": 0.11963897943496704, "learning_rate": 0.00014540880503144653, "epoch": 0.278125 }, { "step": 1046, "timestamp": "2025-12-28T10:09:53.307725", "elapsed_time": 4307.5928156375885, "loss": 0.2356, "grad_norm": 0.18368114531040192, "learning_rate": 0.00014528301886792451, "epoch": 0.27875 }, { "step": 1047, "timestamp": "2025-12-28T10:09:59.635756", "elapsed_time": 4313.920850515366, "loss": 0.2149, "grad_norm": 0.2329953908920288, "learning_rate": 0.00014515723270440252, "epoch": 0.279375 }, { "step": 1048, "timestamp": "2025-12-28T10:10:04.571410", "elapsed_time": 4318.856500864029, "loss": 0.2881, "grad_norm": 0.24254077672958374, "learning_rate": 0.0001450314465408805, "epoch": 0.28 }, { "step": 1049, "timestamp": "2025-12-28T10:10:09.018520", "elapsed_time": 4323.303614139557, "loss": 0.3995, "grad_norm": 0.20608584582805634, "learning_rate": 0.00014490566037735848, "epoch": 0.280625 }, { "step": 1050, "timestamp": "2025-12-28T10:10:19.733959", "elapsed_time": 4334.019049882889, "loss": 0.1759, "grad_norm": 0.12463296949863434, "learning_rate": 0.00014477987421383648, "epoch": 0.28125 }, { "step": 1051, "timestamp": "2025-12-28T10:10:27.504878", "elapsed_time": 4341.789968252182, "loss": 0.1602, "grad_norm": 0.13327348232269287, "learning_rate": 0.00014465408805031446, "epoch": 0.281875 }, { "step": 1052, "timestamp": "2025-12-28T10:10:40.425643", "elapsed_time": 4354.710733413696, "loss": 0.1314, "grad_norm": 0.09431233257055283, "learning_rate": 0.00014452830188679244, "epoch": 0.2825 }, { "step": 1053, "timestamp": "2025-12-28T10:10:58.475011", "elapsed_time": 4372.760101318359, "loss": 0.1314, "grad_norm": 0.08956651389598846, "learning_rate": 0.00014440251572327045, "epoch": 0.283125 }, { "step": 1054, "timestamp": "2025-12-28T10:11:05.726827", "elapsed_time": 4380.011917591095, "loss": 0.2106, "grad_norm": 0.14600154757499695, "learning_rate": 0.00014427672955974843, "epoch": 0.28375 }, { "step": 1055, "timestamp": "2025-12-28T10:11:19.021552", "elapsed_time": 4393.30664229393, "loss": 0.176, "grad_norm": 0.1056523472070694, "learning_rate": 0.0001441509433962264, "epoch": 0.284375 }, { "step": 1056, "timestamp": "2025-12-28T10:11:29.958671", "elapsed_time": 4404.243761062622, "loss": 0.1629, "grad_norm": 0.10517208278179169, "learning_rate": 0.00014402515723270439, "epoch": 0.285 }, { "step": 1057, "timestamp": "2025-12-28T10:11:36.541158", "elapsed_time": 4410.826248168945, "loss": 0.3887, "grad_norm": 0.16218851506710052, "learning_rate": 0.0001438993710691824, "epoch": 0.285625 }, { "step": 1058, "timestamp": "2025-12-28T10:11:45.563207", "elapsed_time": 4419.848297119141, "loss": 0.1683, "grad_norm": 0.1224561482667923, "learning_rate": 0.00014377358490566037, "epoch": 0.28625 }, { "step": 1059, "timestamp": "2025-12-28T10:11:58.056565", "elapsed_time": 4432.34165596962, "loss": 0.125, "grad_norm": 0.08662772178649902, "learning_rate": 0.00014364779874213835, "epoch": 0.286875 }, { "step": 1060, "timestamp": "2025-12-28T10:12:05.305261", "elapsed_time": 4439.590351343155, "loss": 0.2103, "grad_norm": 0.14773836731910706, "learning_rate": 0.00014352201257861636, "epoch": 0.2875 }, { "step": 1061, "timestamp": "2025-12-28T10:12:17.136857", "elapsed_time": 4451.4219472408295, "loss": 0.2254, "grad_norm": 0.128709077835083, "learning_rate": 0.00014339622641509434, "epoch": 0.288125 }, { "step": 1062, "timestamp": "2025-12-28T10:12:26.358416", "elapsed_time": 4460.643507003784, "loss": 0.1654, "grad_norm": 0.12667128443717957, "learning_rate": 0.00014327044025157231, "epoch": 0.28875 }, { "step": 1063, "timestamp": "2025-12-28T10:12:35.697244", "elapsed_time": 4469.982335090637, "loss": 0.1426, "grad_norm": 0.10940206795930862, "learning_rate": 0.00014314465408805032, "epoch": 0.289375 }, { "step": 1064, "timestamp": "2025-12-28T10:12:41.676751", "elapsed_time": 4475.961841106415, "loss": 0.1529, "grad_norm": 0.1327073872089386, "learning_rate": 0.0001430188679245283, "epoch": 0.29 }, { "step": 1065, "timestamp": "2025-12-28T10:12:54.040253", "elapsed_time": 4488.325343132019, "loss": 0.4812, "grad_norm": 0.15044333040714264, "learning_rate": 0.00014289308176100628, "epoch": 0.290625 }, { "step": 1066, "timestamp": "2025-12-28T10:13:01.898646", "elapsed_time": 4496.183736562729, "loss": 0.19, "grad_norm": 0.12461165338754654, "learning_rate": 0.00014276729559748429, "epoch": 0.29125 }, { "step": 1067, "timestamp": "2025-12-28T10:13:11.281195", "elapsed_time": 4505.566284894943, "loss": 0.375, "grad_norm": 0.2037774622440338, "learning_rate": 0.00014264150943396226, "epoch": 0.291875 }, { "step": 1068, "timestamp": "2025-12-28T10:13:20.374473", "elapsed_time": 4514.659563064575, "loss": 0.1759, "grad_norm": 0.11888349056243896, "learning_rate": 0.00014251572327044024, "epoch": 0.2925 }, { "step": 1069, "timestamp": "2025-12-28T10:13:33.299520", "elapsed_time": 4527.58461022377, "loss": 0.1318, "grad_norm": 0.09523271024227142, "learning_rate": 0.00014238993710691825, "epoch": 0.293125 }, { "step": 1070, "timestamp": "2025-12-28T10:13:41.047563", "elapsed_time": 4535.332653284073, "loss": 0.2335, "grad_norm": 0.13759227097034454, "learning_rate": 0.00014226415094339623, "epoch": 0.29375 }, { "step": 1071, "timestamp": "2025-12-28T10:13:56.504224", "elapsed_time": 4550.789314746857, "loss": 0.1236, "grad_norm": 0.08639674633741379, "learning_rate": 0.0001421383647798742, "epoch": 0.294375 }, { "step": 1072, "timestamp": "2025-12-28T10:14:06.307720", "elapsed_time": 4560.59281039238, "loss": 0.1721, "grad_norm": 0.1233757734298706, "learning_rate": 0.0001420125786163522, "epoch": 0.295 }, { "step": 1073, "timestamp": "2025-12-28T10:14:11.665841", "elapsed_time": 4565.950931310654, "loss": 0.2988, "grad_norm": 0.1682165414094925, "learning_rate": 0.0001418867924528302, "epoch": 0.295625 }, { "step": 1074, "timestamp": "2025-12-28T10:14:21.737669", "elapsed_time": 4576.022758960724, "loss": 0.1494, "grad_norm": 0.10447243601083755, "learning_rate": 0.00014176100628930817, "epoch": 0.29625 }, { "step": 1075, "timestamp": "2025-12-28T10:14:40.552859", "elapsed_time": 4594.837973356247, "loss": 0.1524, "grad_norm": 0.0825522392988205, "learning_rate": 0.00014163522012578615, "epoch": 0.296875 }, { "step": 1076, "timestamp": "2025-12-28T10:14:49.284505", "elapsed_time": 4603.569595575333, "loss": 0.2119, "grad_norm": 0.12849776446819305, "learning_rate": 0.00014150943396226416, "epoch": 0.2975 }, { "step": 1077, "timestamp": "2025-12-28T10:15:02.082705", "elapsed_time": 4616.367795705795, "loss": 0.1306, "grad_norm": 0.09817694872617722, "learning_rate": 0.00014138364779874214, "epoch": 0.298125 }, { "step": 1078, "timestamp": "2025-12-28T10:15:12.968494", "elapsed_time": 4627.253584623337, "loss": 0.125, "grad_norm": 0.10283590853214264, "learning_rate": 0.00014125786163522012, "epoch": 0.29875 }, { "step": 1079, "timestamp": "2025-12-28T10:15:20.706362", "elapsed_time": 4634.991452932358, "loss": 0.3718, "grad_norm": 0.16993916034698486, "learning_rate": 0.00014113207547169812, "epoch": 0.299375 }, { "step": 1080, "timestamp": "2025-12-28T10:15:24.915200", "elapsed_time": 4639.200289726257, "loss": 0.1865, "grad_norm": 0.16090618073940277, "learning_rate": 0.0001410062893081761, "epoch": 0.3 }, { "step": 1081, "timestamp": "2025-12-28T10:15:33.659663", "elapsed_time": 4647.944753646851, "loss": 0.1715, "grad_norm": 0.1435355693101883, "learning_rate": 0.00014088050314465408, "epoch": 0.300625 }, { "step": 1082, "timestamp": "2025-12-28T10:15:50.278302", "elapsed_time": 4664.563392162323, "loss": 0.1291, "grad_norm": 0.08483153581619263, "learning_rate": 0.00014075471698113209, "epoch": 0.30125 }, { "step": 1083, "timestamp": "2025-12-28T10:15:56.389680", "elapsed_time": 4670.674770593643, "loss": 0.2924, "grad_norm": 0.17434731125831604, "learning_rate": 0.00014062893081761007, "epoch": 0.301875 }, { "step": 1084, "timestamp": "2025-12-28T10:16:06.200612", "elapsed_time": 4680.48570227623, "loss": 0.1588, "grad_norm": 0.10816198587417603, "learning_rate": 0.00014050314465408804, "epoch": 0.3025 }, { "step": 1085, "timestamp": "2025-12-28T10:16:12.114394", "elapsed_time": 4686.399484395981, "loss": 0.2117, "grad_norm": 0.15836821496486664, "learning_rate": 0.00014037735849056602, "epoch": 0.303125 }, { "step": 1086, "timestamp": "2025-12-28T10:16:20.183071", "elapsed_time": 4694.468160867691, "loss": 0.2609, "grad_norm": 0.16809400916099548, "learning_rate": 0.00014025157232704403, "epoch": 0.30375 }, { "step": 1087, "timestamp": "2025-12-28T10:16:28.614605", "elapsed_time": 4702.8996958732605, "loss": 0.497, "grad_norm": 0.17253176867961884, "learning_rate": 0.000140125786163522, "epoch": 0.304375 }, { "step": 1088, "timestamp": "2025-12-28T10:16:34.729314", "elapsed_time": 4709.014405012131, "loss": 0.2396, "grad_norm": 0.1900101751089096, "learning_rate": 0.00014, "epoch": 0.305 }, { "step": 1089, "timestamp": "2025-12-28T10:16:47.236814", "elapsed_time": 4721.521904230118, "loss": 0.2017, "grad_norm": 0.1198599562048912, "learning_rate": 0.000139874213836478, "epoch": 0.305625 }, { "step": 1090, "timestamp": "2025-12-28T10:16:56.587644", "elapsed_time": 4730.872734546661, "loss": 0.1482, "grad_norm": 0.10646126419305801, "learning_rate": 0.00013974842767295597, "epoch": 0.30625 }, { "step": 1091, "timestamp": "2025-12-28T10:17:09.729699", "elapsed_time": 4744.014789104462, "loss": 0.1493, "grad_norm": 0.1059768870472908, "learning_rate": 0.00013962264150943395, "epoch": 0.306875 }, { "step": 1092, "timestamp": "2025-12-28T10:17:15.846589", "elapsed_time": 4750.131683349609, "loss": 0.2455, "grad_norm": 0.15631641447544098, "learning_rate": 0.00013949685534591196, "epoch": 0.3075 }, { "step": 1093, "timestamp": "2025-12-28T10:17:26.226547", "elapsed_time": 4760.511637687683, "loss": 0.158, "grad_norm": 0.11631731688976288, "learning_rate": 0.00013937106918238994, "epoch": 0.308125 }, { "step": 1094, "timestamp": "2025-12-28T10:17:35.012584", "elapsed_time": 4769.297674417496, "loss": 0.1526, "grad_norm": 0.1259499490261078, "learning_rate": 0.00013924528301886792, "epoch": 0.30875 }, { "step": 1095, "timestamp": "2025-12-28T10:17:45.261282", "elapsed_time": 4779.546371936798, "loss": 0.241, "grad_norm": 0.15224315226078033, "learning_rate": 0.00013911949685534592, "epoch": 0.309375 }, { "step": 1096, "timestamp": "2025-12-28T10:17:57.287033", "elapsed_time": 4791.572123765945, "loss": 0.139, "grad_norm": 0.1223248764872551, "learning_rate": 0.0001389937106918239, "epoch": 0.31 }, { "step": 1097, "timestamp": "2025-12-28T10:18:09.370759", "elapsed_time": 4803.655849456787, "loss": 0.2395, "grad_norm": 0.13052494823932648, "learning_rate": 0.00013886792452830188, "epoch": 0.310625 }, { "step": 1098, "timestamp": "2025-12-28T10:18:22.367902", "elapsed_time": 4816.652992963791, "loss": 0.1401, "grad_norm": 0.11577942967414856, "learning_rate": 0.00013874213836477986, "epoch": 0.31125 }, { "step": 1099, "timestamp": "2025-12-28T10:18:37.241574", "elapsed_time": 4831.526664972305, "loss": 0.139, "grad_norm": 0.09500917792320251, "learning_rate": 0.00013861635220125787, "epoch": 0.311875 }, { "step": 1100, "timestamp": "2025-12-28T10:18:45.367638", "elapsed_time": 4839.6527326107025, "loss": 0.1616, "grad_norm": 0.12062890082597733, "learning_rate": 0.00013849056603773585, "epoch": 0.3125 }, { "step": 1101, "timestamp": "2025-12-28T10:18:50.501032", "elapsed_time": 4844.786121845245, "loss": 0.5666, "grad_norm": 0.21491988003253937, "learning_rate": 0.00013836477987421382, "epoch": 0.313125 }, { "step": 1102, "timestamp": "2025-12-28T10:18:57.091853", "elapsed_time": 4851.37694311142, "loss": 0.1821, "grad_norm": 0.14800046384334564, "learning_rate": 0.00013823899371069183, "epoch": 0.31375 }, { "step": 1103, "timestamp": "2025-12-28T10:19:06.683644", "elapsed_time": 4860.968734264374, "loss": 0.1477, "grad_norm": 0.11978691071271896, "learning_rate": 0.0001381132075471698, "epoch": 0.314375 }, { "step": 1104, "timestamp": "2025-12-28T10:19:21.833042", "elapsed_time": 4876.118132352829, "loss": 0.1662, "grad_norm": 0.09653400629758835, "learning_rate": 0.0001379874213836478, "epoch": 0.315 }, { "step": 1105, "timestamp": "2025-12-28T10:19:26.190650", "elapsed_time": 4880.475740432739, "loss": 0.2282, "grad_norm": 0.20169983804225922, "learning_rate": 0.0001378616352201258, "epoch": 0.315625 }, { "step": 1106, "timestamp": "2025-12-28T10:19:38.616678", "elapsed_time": 4892.901768684387, "loss": 0.1446, "grad_norm": 0.10674963146448135, "learning_rate": 0.00013773584905660377, "epoch": 0.31625 }, { "step": 1107, "timestamp": "2025-12-28T10:19:59.515344", "elapsed_time": 4913.800434112549, "loss": 0.1037, "grad_norm": 0.06956765800714493, "learning_rate": 0.00013761006289308175, "epoch": 0.316875 }, { "step": 1108, "timestamp": "2025-12-28T10:20:06.485473", "elapsed_time": 4920.77056312561, "loss": 0.182, "grad_norm": 0.23156726360321045, "learning_rate": 0.00013748427672955976, "epoch": 0.3175 }, { "step": 1109, "timestamp": "2025-12-28T10:20:23.683657", "elapsed_time": 4937.968747854233, "loss": 0.1634, "grad_norm": 0.1053549200296402, "learning_rate": 0.00013735849056603774, "epoch": 0.318125 }, { "step": 1110, "timestamp": "2025-12-28T10:20:30.071074", "elapsed_time": 4944.356164216995, "loss": 0.1761, "grad_norm": 0.1309899240732193, "learning_rate": 0.00013723270440251572, "epoch": 0.31875 }, { "step": 1111, "timestamp": "2025-12-28T10:20:48.259766", "elapsed_time": 4962.5448570251465, "loss": 0.0991, "grad_norm": 0.07965648174285889, "learning_rate": 0.0001371069182389937, "epoch": 0.319375 }, { "step": 1112, "timestamp": "2025-12-28T10:20:54.336910", "elapsed_time": 4968.622000217438, "loss": 0.1765, "grad_norm": 0.16077613830566406, "learning_rate": 0.0001369811320754717, "epoch": 0.32 }, { "step": 1113, "timestamp": "2025-12-28T10:21:01.708820", "elapsed_time": 4975.993910551071, "loss": 0.2535, "grad_norm": 0.19687579572200775, "learning_rate": 0.00013685534591194968, "epoch": 0.320625 }, { "step": 1114, "timestamp": "2025-12-28T10:21:08.888514", "elapsed_time": 4983.173604011536, "loss": 0.1733, "grad_norm": 0.1458161175251007, "learning_rate": 0.00013672955974842766, "epoch": 0.32125 }, { "step": 1115, "timestamp": "2025-12-28T10:21:21.027050", "elapsed_time": 4995.312140703201, "loss": 0.1462, "grad_norm": 0.10044913738965988, "learning_rate": 0.00013660377358490567, "epoch": 0.321875 }, { "step": 1116, "timestamp": "2025-12-28T10:21:32.969714", "elapsed_time": 5007.2548043727875, "loss": 0.1104, "grad_norm": 0.08253983408212662, "learning_rate": 0.00013647798742138365, "epoch": 0.3225 }, { "step": 1117, "timestamp": "2025-12-28T10:21:38.181094", "elapsed_time": 5012.466184139252, "loss": 0.2278, "grad_norm": 0.1712544858455658, "learning_rate": 0.00013635220125786162, "epoch": 0.323125 }, { "step": 1118, "timestamp": "2025-12-28T10:21:48.132268", "elapsed_time": 5022.4173583984375, "loss": 0.1665, "grad_norm": 0.1129027009010315, "learning_rate": 0.00013622641509433963, "epoch": 0.32375 }, { "step": 1119, "timestamp": "2025-12-28T10:21:58.890656", "elapsed_time": 5033.175746202469, "loss": 0.1526, "grad_norm": 0.12182003259658813, "learning_rate": 0.0001361006289308176, "epoch": 0.324375 }, { "step": 1120, "timestamp": "2025-12-28T10:22:16.895465", "elapsed_time": 5051.1805555820465, "loss": 0.1335, "grad_norm": 0.0835099071264267, "learning_rate": 0.0001359748427672956, "epoch": 0.325 }, { "step": 1121, "timestamp": "2025-12-28T10:22:23.340365", "elapsed_time": 5057.62545633316, "loss": 0.2159, "grad_norm": 0.15998020768165588, "learning_rate": 0.0001358490566037736, "epoch": 0.325625 }, { "step": 1122, "timestamp": "2025-12-28T10:22:33.758550", "elapsed_time": 5068.04364490509, "loss": 0.1589, "grad_norm": 0.115847647190094, "learning_rate": 0.00013572327044025157, "epoch": 0.32625 }, { "step": 1123, "timestamp": "2025-12-28T10:22:44.939902", "elapsed_time": 5079.224991798401, "loss": 0.1524, "grad_norm": 0.10123994946479797, "learning_rate": 0.00013559748427672955, "epoch": 0.326875 }, { "step": 1124, "timestamp": "2025-12-28T10:22:50.271896", "elapsed_time": 5084.556986808777, "loss": 0.2074, "grad_norm": 0.1583547741174698, "learning_rate": 0.00013547169811320753, "epoch": 0.3275 }, { "step": 1125, "timestamp": "2025-12-28T10:22:58.007020", "elapsed_time": 5092.292110919952, "loss": 0.2013, "grad_norm": 0.13498209416866302, "learning_rate": 0.00013534591194968554, "epoch": 0.328125 }, { "step": 1126, "timestamp": "2025-12-28T10:23:06.472685", "elapsed_time": 5100.75777554512, "loss": 0.1787, "grad_norm": 0.12321489304304123, "learning_rate": 0.00013522012578616352, "epoch": 0.32875 }, { "step": 1127, "timestamp": "2025-12-28T10:23:16.415604", "elapsed_time": 5110.700694322586, "loss": 0.2123, "grad_norm": 0.13657422363758087, "learning_rate": 0.0001350943396226415, "epoch": 0.329375 }, { "step": 1128, "timestamp": "2025-12-28T10:23:21.550030", "elapsed_time": 5115.835124254227, "loss": 0.3319, "grad_norm": 0.19260597229003906, "learning_rate": 0.0001349685534591195, "epoch": 0.33 }, { "step": 1129, "timestamp": "2025-12-28T10:23:37.207751", "elapsed_time": 5131.492841243744, "loss": 0.1309, "grad_norm": 0.08664542436599731, "learning_rate": 0.00013484276729559748, "epoch": 0.330625 }, { "step": 1130, "timestamp": "2025-12-28T10:23:43.286231", "elapsed_time": 5137.57132101059, "loss": 0.2417, "grad_norm": 0.21089886128902435, "learning_rate": 0.00013471698113207546, "epoch": 0.33125 }, { "step": 1131, "timestamp": "2025-12-28T10:23:51.484950", "elapsed_time": 5145.7700407505035, "loss": 0.1542, "grad_norm": 0.12316355109214783, "learning_rate": 0.00013459119496855347, "epoch": 0.331875 }, { "step": 1132, "timestamp": "2025-12-28T10:23:58.046371", "elapsed_time": 5152.331461429596, "loss": 0.1719, "grad_norm": 0.1278318166732788, "learning_rate": 0.00013446540880503145, "epoch": 0.3325 }, { "step": 1133, "timestamp": "2025-12-28T10:24:04.389165", "elapsed_time": 5158.674255847931, "loss": 0.2467, "grad_norm": 0.16408215463161469, "learning_rate": 0.00013433962264150943, "epoch": 0.333125 }, { "step": 1134, "timestamp": "2025-12-28T10:24:14.662588", "elapsed_time": 5168.947679042816, "loss": 0.136, "grad_norm": 0.1054520383477211, "learning_rate": 0.00013421383647798743, "epoch": 0.33375 }, { "step": 1135, "timestamp": "2025-12-28T10:24:24.914867", "elapsed_time": 5179.199957847595, "loss": 0.1625, "grad_norm": 0.11112856864929199, "learning_rate": 0.0001340880503144654, "epoch": 0.334375 }, { "step": 1136, "timestamp": "2025-12-28T10:24:33.704996", "elapsed_time": 5187.990086078644, "loss": 0.2022, "grad_norm": 0.14535382390022278, "learning_rate": 0.0001339622641509434, "epoch": 0.335 }, { "step": 1137, "timestamp": "2025-12-28T10:24:54.598309", "elapsed_time": 5208.883399009705, "loss": 0.1032, "grad_norm": 0.07190191745758057, "learning_rate": 0.00013383647798742137, "epoch": 0.335625 }, { "step": 1138, "timestamp": "2025-12-28T10:24:59.727226", "elapsed_time": 5214.0123155117035, "loss": 0.2176, "grad_norm": 0.1817207783460617, "learning_rate": 0.00013371069182389938, "epoch": 0.33625 }, { "step": 1139, "timestamp": "2025-12-28T10:25:05.999597", "elapsed_time": 5220.284687042236, "loss": 0.1639, "grad_norm": 0.1274854838848114, "learning_rate": 0.00013358490566037735, "epoch": 0.336875 }, { "step": 1140, "timestamp": "2025-12-28T10:25:13.369606", "elapsed_time": 5227.654696941376, "loss": 0.2805, "grad_norm": 0.15669360756874084, "learning_rate": 0.00013345911949685533, "epoch": 0.3375 }, { "step": 1141, "timestamp": "2025-12-28T10:25:26.173526", "elapsed_time": 5240.458616495132, "loss": 0.2244, "grad_norm": 0.10920794308185577, "learning_rate": 0.00013333333333333334, "epoch": 0.338125 }, { "step": 1142, "timestamp": "2025-12-28T10:25:34.387144", "elapsed_time": 5248.672234773636, "loss": 0.1525, "grad_norm": 0.12012416124343872, "learning_rate": 0.00013320754716981132, "epoch": 0.33875 }, { "step": 1143, "timestamp": "2025-12-28T10:25:40.867120", "elapsed_time": 5255.15221118927, "loss": 0.2315, "grad_norm": 0.17467765510082245, "learning_rate": 0.0001330817610062893, "epoch": 0.339375 }, { "step": 1144, "timestamp": "2025-12-28T10:25:49.536049", "elapsed_time": 5263.821138858795, "loss": 0.1585, "grad_norm": 0.12994243204593658, "learning_rate": 0.0001329559748427673, "epoch": 0.34 }, { "step": 1145, "timestamp": "2025-12-28T10:25:56.165948", "elapsed_time": 5270.451038122177, "loss": 0.3481, "grad_norm": 0.17362722754478455, "learning_rate": 0.00013283018867924528, "epoch": 0.340625 }, { "step": 1146, "timestamp": "2025-12-28T10:26:07.059344", "elapsed_time": 5281.344434261322, "loss": 0.1996, "grad_norm": 0.11975681781768799, "learning_rate": 0.00013270440251572326, "epoch": 0.34125 }, { "step": 1147, "timestamp": "2025-12-28T10:26:27.952012", "elapsed_time": 5302.237102270126, "loss": 0.1627, "grad_norm": 0.08679784089326859, "learning_rate": 0.00013257861635220127, "epoch": 0.341875 }, { "step": 1148, "timestamp": "2025-12-28T10:26:36.021702", "elapsed_time": 5310.306792974472, "loss": 0.1613, "grad_norm": 0.14012403786182404, "learning_rate": 0.00013245283018867925, "epoch": 0.3425 }, { "step": 1149, "timestamp": "2025-12-28T10:26:53.963469", "elapsed_time": 5328.248558998108, "loss": 0.1234, "grad_norm": 0.08561190962791443, "learning_rate": 0.00013232704402515723, "epoch": 0.343125 }, { "step": 1150, "timestamp": "2025-12-28T10:27:05.423036", "elapsed_time": 5339.708126306534, "loss": 0.1275, "grad_norm": 0.10564465820789337, "learning_rate": 0.00013220125786163523, "epoch": 0.34375 }, { "step": 1151, "timestamp": "2025-12-28T10:27:13.370347", "elapsed_time": 5347.655437231064, "loss": 0.171, "grad_norm": 0.13151898980140686, "learning_rate": 0.0001320754716981132, "epoch": 0.344375 }, { "step": 1152, "timestamp": "2025-12-28T10:27:26.513582", "elapsed_time": 5360.798672437668, "loss": 0.1066, "grad_norm": 0.08146792650222778, "learning_rate": 0.0001319496855345912, "epoch": 0.345 }, { "step": 1153, "timestamp": "2025-12-28T10:27:37.983479", "elapsed_time": 5372.268569231033, "loss": 0.14, "grad_norm": 0.11052241921424866, "learning_rate": 0.00013182389937106917, "epoch": 0.345625 }, { "step": 1154, "timestamp": "2025-12-28T10:27:49.248781", "elapsed_time": 5383.533871412277, "loss": 0.2192, "grad_norm": 0.11441831290721893, "learning_rate": 0.00013169811320754718, "epoch": 0.34625 }, { "step": 1155, "timestamp": "2025-12-28T10:27:58.548010", "elapsed_time": 5392.833100557327, "loss": 0.1996, "grad_norm": 0.13789567351341248, "learning_rate": 0.00013157232704402516, "epoch": 0.346875 }, { "step": 1156, "timestamp": "2025-12-28T10:28:05.648486", "elapsed_time": 5399.933576583862, "loss": 0.1288, "grad_norm": 0.12561623752117157, "learning_rate": 0.00013144654088050313, "epoch": 0.3475 }, { "step": 1157, "timestamp": "2025-12-28T10:28:17.278511", "elapsed_time": 5411.563601255417, "loss": 0.1758, "grad_norm": 0.17324663698673248, "learning_rate": 0.00013132075471698114, "epoch": 0.348125 }, { "step": 1158, "timestamp": "2025-12-28T10:28:29.227502", "elapsed_time": 5423.512593269348, "loss": 0.2519, "grad_norm": 0.13509972393512726, "learning_rate": 0.00013119496855345912, "epoch": 0.34875 }, { "step": 1159, "timestamp": "2025-12-28T10:28:41.252121", "elapsed_time": 5435.537210941315, "loss": 0.1828, "grad_norm": 0.11435085535049438, "learning_rate": 0.0001310691823899371, "epoch": 0.349375 }, { "step": 1160, "timestamp": "2025-12-28T10:28:49.279299", "elapsed_time": 5443.564389467239, "loss": 0.2038, "grad_norm": 0.13359728455543518, "learning_rate": 0.0001309433962264151, "epoch": 0.35 }, { "step": 1161, "timestamp": "2025-12-28T10:29:01.419851", "elapsed_time": 5455.704941749573, "loss": 0.1188, "grad_norm": 0.09035109728574753, "learning_rate": 0.00013081761006289308, "epoch": 0.350625 }, { "step": 1162, "timestamp": "2025-12-28T10:29:07.506551", "elapsed_time": 5461.791641712189, "loss": 0.2134, "grad_norm": 0.16012340784072876, "learning_rate": 0.00013069182389937106, "epoch": 0.35125 }, { "step": 1163, "timestamp": "2025-12-28T10:29:19.642475", "elapsed_time": 5473.927566051483, "loss": 0.1431, "grad_norm": 0.10316343605518341, "learning_rate": 0.00013056603773584907, "epoch": 0.351875 }, { "step": 1164, "timestamp": "2025-12-28T10:29:26.703778", "elapsed_time": 5480.988868236542, "loss": 0.1874, "grad_norm": 0.13070017099380493, "learning_rate": 0.00013044025157232705, "epoch": 0.3525 }, { "step": 1165, "timestamp": "2025-12-28T10:29:42.102895", "elapsed_time": 5496.387985467911, "loss": 0.1539, "grad_norm": 0.09859879314899445, "learning_rate": 0.00013031446540880503, "epoch": 0.353125 }, { "step": 1166, "timestamp": "2025-12-28T10:29:58.650069", "elapsed_time": 5512.935160160065, "loss": 0.1449, "grad_norm": 0.09668051451444626, "learning_rate": 0.000130188679245283, "epoch": 0.35375 }, { "step": 1167, "timestamp": "2025-12-28T10:30:17.397139", "elapsed_time": 5531.682229757309, "loss": 0.1166, "grad_norm": 0.07741539925336838, "learning_rate": 0.000130062893081761, "epoch": 0.354375 }, { "step": 1168, "timestamp": "2025-12-28T10:30:25.144695", "elapsed_time": 5539.429785490036, "loss": 0.1554, "grad_norm": 0.11451406031847, "learning_rate": 0.000129937106918239, "epoch": 0.355 }, { "step": 1169, "timestamp": "2025-12-28T10:30:37.830348", "elapsed_time": 5552.115438699722, "loss": 0.1541, "grad_norm": 0.1345360428094864, "learning_rate": 0.00012981132075471697, "epoch": 0.355625 }, { "step": 1170, "timestamp": "2025-12-28T10:30:45.645731", "elapsed_time": 5559.930821657181, "loss": 0.2273, "grad_norm": 0.15822488069534302, "learning_rate": 0.00012968553459119498, "epoch": 0.35625 }, { "step": 1171, "timestamp": "2025-12-28T10:30:53.952954", "elapsed_time": 5568.238044023514, "loss": 0.1536, "grad_norm": 0.2116500437259674, "learning_rate": 0.00012955974842767296, "epoch": 0.356875 }, { "step": 1172, "timestamp": "2025-12-28T10:31:03.335531", "elapsed_time": 5577.620621681213, "loss": 0.1236, "grad_norm": 0.11660629510879517, "learning_rate": 0.00012943396226415094, "epoch": 0.3575 }, { "step": 1173, "timestamp": "2025-12-28T10:31:10.218223", "elapsed_time": 5584.503312826157, "loss": 0.2065, "grad_norm": 0.135480597615242, "learning_rate": 0.00012930817610062894, "epoch": 0.358125 }, { "step": 1174, "timestamp": "2025-12-28T10:31:18.995819", "elapsed_time": 5593.2809092998505, "loss": 0.1724, "grad_norm": 0.23346908390522003, "learning_rate": 0.00012918238993710692, "epoch": 0.35875 }, { "step": 1175, "timestamp": "2025-12-28T10:31:26.934005", "elapsed_time": 5601.2190997600555, "loss": 0.493, "grad_norm": 0.18534159660339355, "learning_rate": 0.0001290566037735849, "epoch": 0.359375 }, { "step": 1176, "timestamp": "2025-12-28T10:31:35.566298", "elapsed_time": 5609.851388692856, "loss": 0.18, "grad_norm": 0.1343761831521988, "learning_rate": 0.0001289308176100629, "epoch": 0.36 }, { "step": 1177, "timestamp": "2025-12-28T10:31:43.785332", "elapsed_time": 5618.070422172546, "loss": 0.1392, "grad_norm": 0.12244327366352081, "learning_rate": 0.00012880503144654088, "epoch": 0.360625 }, { "step": 1178, "timestamp": "2025-12-28T10:31:49.452840", "elapsed_time": 5623.737930774689, "loss": 0.2027, "grad_norm": 0.15840069949626923, "learning_rate": 0.00012867924528301886, "epoch": 0.36125 }, { "step": 1179, "timestamp": "2025-12-28T10:31:58.279487", "elapsed_time": 5632.564577817917, "loss": 0.24, "grad_norm": 0.17348845303058624, "learning_rate": 0.00012855345911949684, "epoch": 0.361875 }, { "step": 1180, "timestamp": "2025-12-28T10:32:06.750060", "elapsed_time": 5641.035150527954, "loss": 0.1493, "grad_norm": 0.11318610608577728, "learning_rate": 0.00012842767295597485, "epoch": 0.3625 }, { "step": 1181, "timestamp": "2025-12-28T10:32:18.316743", "elapsed_time": 5652.601833343506, "loss": 0.1519, "grad_norm": 0.1193518340587616, "learning_rate": 0.00012830188679245283, "epoch": 0.363125 }, { "step": 1182, "timestamp": "2025-12-28T10:32:24.831408", "elapsed_time": 5659.116497993469, "loss": 0.1801, "grad_norm": 0.14156877994537354, "learning_rate": 0.0001281761006289308, "epoch": 0.36375 }, { "step": 1183, "timestamp": "2025-12-28T10:32:31.901184", "elapsed_time": 5666.186274528503, "loss": 0.1667, "grad_norm": 0.13352788984775543, "learning_rate": 0.0001280503144654088, "epoch": 0.364375 }, { "step": 1184, "timestamp": "2025-12-28T10:32:46.767013", "elapsed_time": 5681.052104473114, "loss": 0.1224, "grad_norm": 0.09199526906013489, "learning_rate": 0.0001279245283018868, "epoch": 0.365 }, { "step": 1185, "timestamp": "2025-12-28T10:32:54.235718", "elapsed_time": 5688.52080821991, "loss": 0.1942, "grad_norm": 0.13854575157165527, "learning_rate": 0.00012779874213836477, "epoch": 0.365625 }, { "step": 1186, "timestamp": "2025-12-28T10:33:07.216737", "elapsed_time": 5701.501827001572, "loss": 0.1412, "grad_norm": 0.10089116543531418, "learning_rate": 0.00012767295597484278, "epoch": 0.36625 }, { "step": 1187, "timestamp": "2025-12-28T10:33:18.169177", "elapsed_time": 5712.454270601273, "loss": 0.1496, "grad_norm": 0.11052452027797699, "learning_rate": 0.00012754716981132076, "epoch": 0.366875 }, { "step": 1188, "timestamp": "2025-12-28T10:33:24.179712", "elapsed_time": 5718.464802980423, "loss": 0.1801, "grad_norm": 0.17774660885334015, "learning_rate": 0.00012742138364779874, "epoch": 0.3675 }, { "step": 1189, "timestamp": "2025-12-28T10:33:30.449798", "elapsed_time": 5724.734888315201, "loss": 0.1707, "grad_norm": 0.13715173304080963, "learning_rate": 0.00012729559748427674, "epoch": 0.368125 }, { "step": 1190, "timestamp": "2025-12-28T10:33:40.351650", "elapsed_time": 5734.636740446091, "loss": 0.1585, "grad_norm": 0.13463056087493896, "learning_rate": 0.00012716981132075472, "epoch": 0.36875 }, { "step": 1191, "timestamp": "2025-12-28T10:33:46.518785", "elapsed_time": 5740.803875684738, "loss": 0.2042, "grad_norm": 0.14503754675388336, "learning_rate": 0.0001270440251572327, "epoch": 0.369375 }, { "step": 1192, "timestamp": "2025-12-28T10:33:58.192196", "elapsed_time": 5752.477287054062, "loss": 0.1911, "grad_norm": 0.11271210759878159, "learning_rate": 0.00012691823899371068, "epoch": 0.37 }, { "step": 1193, "timestamp": "2025-12-28T10:34:05.329085", "elapsed_time": 5759.614175319672, "loss": 0.2337, "grad_norm": 0.14175988733768463, "learning_rate": 0.00012679245283018869, "epoch": 0.370625 }, { "step": 1194, "timestamp": "2025-12-28T10:34:17.155544", "elapsed_time": 5771.440634012222, "loss": 0.1992, "grad_norm": 0.10490947216749191, "learning_rate": 0.00012666666666666666, "epoch": 0.37125 }, { "step": 1195, "timestamp": "2025-12-28T10:34:32.490446", "elapsed_time": 5786.77553653717, "loss": 0.1565, "grad_norm": 0.11939844489097595, "learning_rate": 0.00012654088050314464, "epoch": 0.371875 }, { "step": 1196, "timestamp": "2025-12-28T10:34:41.906197", "elapsed_time": 5796.191288232803, "loss": 0.1493, "grad_norm": 0.11298428475856781, "learning_rate": 0.00012641509433962265, "epoch": 0.3725 }, { "step": 1197, "timestamp": "2025-12-28T10:34:49.971838", "elapsed_time": 5804.256928920746, "loss": 0.2001, "grad_norm": 0.13944107294082642, "learning_rate": 0.00012628930817610063, "epoch": 0.373125 }, { "step": 1198, "timestamp": "2025-12-28T10:34:57.032417", "elapsed_time": 5811.317508220673, "loss": 0.2897, "grad_norm": 0.1779128611087799, "learning_rate": 0.0001261635220125786, "epoch": 0.37375 }, { "step": 1199, "timestamp": "2025-12-28T10:35:08.870827", "elapsed_time": 5823.1559183597565, "loss": 0.1722, "grad_norm": 0.11074298620223999, "learning_rate": 0.00012603773584905661, "epoch": 0.374375 }, { "step": 1200, "timestamp": "2025-12-28T10:35:14.846064", "elapsed_time": 5829.131155014038, "loss": 0.1623, "grad_norm": 0.12961973249912262, "learning_rate": 0.0001259119496855346, "epoch": 0.375 }, { "step": 1201, "timestamp": "2025-12-28T10:35:25.422373", "elapsed_time": 5839.707467794418, "loss": 0.1785, "grad_norm": 0.14644969999790192, "learning_rate": 0.00012578616352201257, "epoch": 0.375625 }, { "step": 1202, "timestamp": "2025-12-28T10:35:41.289838", "elapsed_time": 5855.574928283691, "loss": 0.118, "grad_norm": 0.0863247960805893, "learning_rate": 0.00012566037735849058, "epoch": 0.37625 }, { "step": 1203, "timestamp": "2025-12-28T10:35:54.961911", "elapsed_time": 5869.247001171112, "loss": 0.1602, "grad_norm": 0.09607759863138199, "learning_rate": 0.00012553459119496856, "epoch": 0.376875 }, { "step": 1204, "timestamp": "2025-12-28T10:36:02.141105", "elapsed_time": 5876.426195144653, "loss": 0.1269, "grad_norm": 0.12356162816286087, "learning_rate": 0.00012540880503144654, "epoch": 0.3775 }, { "step": 1205, "timestamp": "2025-12-28T10:36:15.945952", "elapsed_time": 5890.23104262352, "loss": 0.1182, "grad_norm": 0.09206572920084, "learning_rate": 0.00012528301886792452, "epoch": 0.378125 }, { "step": 1206, "timestamp": "2025-12-28T10:36:23.429110", "elapsed_time": 5897.714200735092, "loss": 0.2265, "grad_norm": 0.14600634574890137, "learning_rate": 0.00012515723270440252, "epoch": 0.37875 }, { "step": 1207, "timestamp": "2025-12-28T10:36:32.859599", "elapsed_time": 5907.1446895599365, "loss": 0.1366, "grad_norm": 0.10805106908082962, "learning_rate": 0.0001250314465408805, "epoch": 0.379375 }, { "step": 1208, "timestamp": "2025-12-28T10:36:41.692070", "elapsed_time": 5915.9771609306335, "loss": 0.1532, "grad_norm": 0.12475503236055374, "learning_rate": 0.00012490566037735848, "epoch": 0.38 }, { "step": 1209, "timestamp": "2025-12-28T10:36:52.893121", "elapsed_time": 5927.178210735321, "loss": 0.1372, "grad_norm": 0.10797689855098724, "learning_rate": 0.00012477987421383649, "epoch": 0.380625 }, { "step": 1210, "timestamp": "2025-12-28T10:37:05.316752", "elapsed_time": 5939.601842880249, "loss": 0.1379, "grad_norm": 0.09704222530126572, "learning_rate": 0.00012465408805031447, "epoch": 0.38125 }, { "step": 1211, "timestamp": "2025-12-28T10:37:10.877159", "elapsed_time": 5945.1622495651245, "loss": 0.2735, "grad_norm": 0.16764011979103088, "learning_rate": 0.00012452830188679244, "epoch": 0.381875 }, { "step": 1212, "timestamp": "2025-12-28T10:37:19.026917", "elapsed_time": 5953.312007665634, "loss": 0.2679, "grad_norm": 0.1391536146402359, "learning_rate": 0.00012440251572327045, "epoch": 0.3825 }, { "step": 1213, "timestamp": "2025-12-28T10:37:28.216063", "elapsed_time": 5962.501153230667, "loss": 0.1346, "grad_norm": 0.1069680005311966, "learning_rate": 0.00012427672955974843, "epoch": 0.383125 }, { "step": 1214, "timestamp": "2025-12-28T10:37:42.338346", "elapsed_time": 5976.623436450958, "loss": 0.1306, "grad_norm": 0.0998351201415062, "learning_rate": 0.0001241509433962264, "epoch": 0.38375 }, { "step": 1215, "timestamp": "2025-12-28T10:37:49.904441", "elapsed_time": 5984.189530849457, "loss": 0.1815, "grad_norm": 0.13117118179798126, "learning_rate": 0.00012402515723270442, "epoch": 0.384375 }, { "step": 1216, "timestamp": "2025-12-28T10:38:03.854769", "elapsed_time": 5998.1398594379425, "loss": 0.3325, "grad_norm": 0.13191911578178406, "learning_rate": 0.0001238993710691824, "epoch": 0.385 }, { "step": 1217, "timestamp": "2025-12-28T10:38:08.710052", "elapsed_time": 6002.995143175125, "loss": 0.7795, "grad_norm": 0.23979060351848602, "learning_rate": 0.00012377358490566037, "epoch": 0.385625 }, { "step": 1218, "timestamp": "2025-12-28T10:38:22.340339", "elapsed_time": 6016.625429391861, "loss": 0.1357, "grad_norm": 0.0968586727976799, "learning_rate": 0.00012364779874213835, "epoch": 0.38625 }, { "step": 1219, "timestamp": "2025-12-28T10:38:42.480374", "elapsed_time": 6036.7654638290405, "loss": 0.1035, "grad_norm": 0.08069409430027008, "learning_rate": 0.00012352201257861636, "epoch": 0.386875 }, { "step": 1220, "timestamp": "2025-12-28T10:38:50.725844", "elapsed_time": 6045.010933876038, "loss": 0.1788, "grad_norm": 0.12745045125484467, "learning_rate": 0.00012339622641509434, "epoch": 0.3875 }, { "step": 1221, "timestamp": "2025-12-28T10:38:56.392405", "elapsed_time": 6050.677495241165, "loss": 0.2797, "grad_norm": 0.29041117429733276, "learning_rate": 0.00012327044025157232, "epoch": 0.388125 }, { "step": 1222, "timestamp": "2025-12-28T10:39:05.731705", "elapsed_time": 6060.016795396805, "loss": 0.2435, "grad_norm": 0.12010473012924194, "learning_rate": 0.00012314465408805032, "epoch": 0.38875 }, { "step": 1223, "timestamp": "2025-12-28T10:39:13.957976", "elapsed_time": 6068.243066072464, "loss": 0.1408, "grad_norm": 0.10792548954486847, "learning_rate": 0.0001230188679245283, "epoch": 0.389375 }, { "step": 1224, "timestamp": "2025-12-28T10:39:25.907397", "elapsed_time": 6080.192487239838, "loss": 0.15, "grad_norm": 0.10715028643608093, "learning_rate": 0.00012289308176100628, "epoch": 0.39 }, { "step": 1225, "timestamp": "2025-12-28T10:39:31.137562", "elapsed_time": 6085.422652006149, "loss": 0.1641, "grad_norm": 0.13656176626682281, "learning_rate": 0.0001227672955974843, "epoch": 0.390625 }, { "step": 1226, "timestamp": "2025-12-28T10:39:34.990548", "elapsed_time": 6089.275638103485, "loss": 0.2354, "grad_norm": 0.1691398322582245, "learning_rate": 0.00012264150943396227, "epoch": 0.39125 }, { "step": 1227, "timestamp": "2025-12-28T10:39:47.535666", "elapsed_time": 6101.820756912231, "loss": 0.1597, "grad_norm": 0.10032006353139877, "learning_rate": 0.00012251572327044025, "epoch": 0.391875 }, { "step": 1228, "timestamp": "2025-12-28T10:39:54.871871", "elapsed_time": 6109.156960964203, "loss": 0.2194, "grad_norm": 0.1799350529909134, "learning_rate": 0.00012238993710691825, "epoch": 0.3925 }, { "step": 1229, "timestamp": "2025-12-28T10:40:05.715326", "elapsed_time": 6120.000416755676, "loss": 0.1546, "grad_norm": 0.10520683974027634, "learning_rate": 0.00012226415094339623, "epoch": 0.393125 }, { "step": 1230, "timestamp": "2025-12-28T10:40:15.438855", "elapsed_time": 6129.723945379257, "loss": 0.182, "grad_norm": 0.13104449212551117, "learning_rate": 0.0001221383647798742, "epoch": 0.39375 }, { "step": 1231, "timestamp": "2025-12-28T10:40:26.329143", "elapsed_time": 6140.614233493805, "loss": 0.1419, "grad_norm": 0.09966589510440826, "learning_rate": 0.0001220125786163522, "epoch": 0.394375 }, { "step": 1232, "timestamp": "2025-12-28T10:40:31.971296", "elapsed_time": 6146.256386756897, "loss": 0.1358, "grad_norm": 0.12772534787654877, "learning_rate": 0.0001218867924528302, "epoch": 0.395 }, { "step": 1233, "timestamp": "2025-12-28T10:40:39.345570", "elapsed_time": 6153.630660057068, "loss": 0.161, "grad_norm": 0.11391877382993698, "learning_rate": 0.00012176100628930817, "epoch": 0.395625 }, { "step": 1234, "timestamp": "2025-12-28T10:40:52.265078", "elapsed_time": 6166.550168275833, "loss": 0.1591, "grad_norm": 0.1011412963271141, "learning_rate": 0.00012163522012578617, "epoch": 0.39625 }, { "step": 1235, "timestamp": "2025-12-28T10:41:00.285752", "elapsed_time": 6174.57084274292, "loss": 0.1064, "grad_norm": 0.10651546716690063, "learning_rate": 0.00012150943396226415, "epoch": 0.396875 }, { "step": 1236, "timestamp": "2025-12-28T10:41:08.312804", "elapsed_time": 6182.597895145416, "loss": 0.1547, "grad_norm": 0.13378091156482697, "learning_rate": 0.00012138364779874214, "epoch": 0.3975 }, { "step": 1237, "timestamp": "2025-12-28T10:41:17.897328", "elapsed_time": 6192.182418823242, "loss": 0.1642, "grad_norm": 0.1135389655828476, "learning_rate": 0.00012125786163522013, "epoch": 0.398125 }, { "step": 1238, "timestamp": "2025-12-28T10:41:27.960873", "elapsed_time": 6202.245964050293, "loss": 0.3449, "grad_norm": 0.1613943874835968, "learning_rate": 0.00012113207547169811, "epoch": 0.39875 }, { "step": 1239, "timestamp": "2025-12-28T10:41:39.376307", "elapsed_time": 6213.661397695541, "loss": 0.1686, "grad_norm": 0.10929451882839203, "learning_rate": 0.0001210062893081761, "epoch": 0.399375 }, { "step": 1240, "timestamp": "2025-12-28T10:41:49.452823", "elapsed_time": 6223.737913370132, "loss": 0.2091, "grad_norm": 0.1252572238445282, "learning_rate": 0.0001208805031446541, "epoch": 0.4 }, { "step": 1241, "timestamp": "2025-12-28T10:41:55.431783", "elapsed_time": 6229.716873407364, "loss": 0.2126, "grad_norm": 0.15598464012145996, "learning_rate": 0.00012075471698113207, "epoch": 0.400625 }, { "step": 1242, "timestamp": "2025-12-28T10:42:04.291713", "elapsed_time": 6238.5768032073975, "loss": 0.4607, "grad_norm": 0.16947001218795776, "learning_rate": 0.00012062893081761007, "epoch": 0.40125 }, { "step": 1243, "timestamp": "2025-12-28T10:42:16.315554", "elapsed_time": 6250.600644350052, "loss": 0.1142, "grad_norm": 0.09022119641304016, "learning_rate": 0.00012050314465408805, "epoch": 0.401875 }, { "step": 1244, "timestamp": "2025-12-28T10:42:24.823459", "elapsed_time": 6259.108549118042, "loss": 0.3034, "grad_norm": 0.14146797358989716, "learning_rate": 0.00012037735849056604, "epoch": 0.4025 }, { "step": 1245, "timestamp": "2025-12-28T10:42:37.853137", "elapsed_time": 6272.138226747513, "loss": 0.1124, "grad_norm": 0.11186513304710388, "learning_rate": 0.00012025157232704403, "epoch": 0.403125 }, { "step": 1246, "timestamp": "2025-12-28T10:42:45.407348", "elapsed_time": 6279.6924386024475, "loss": 0.1426, "grad_norm": 0.11227832734584808, "learning_rate": 0.00012012578616352201, "epoch": 0.40375 }, { "step": 1247, "timestamp": "2025-12-28T10:42:53.466547", "elapsed_time": 6287.751641750336, "loss": 0.1833, "grad_norm": 0.12571921944618225, "learning_rate": 0.00012, "epoch": 0.404375 }, { "step": 1248, "timestamp": "2025-12-28T10:43:02.403855", "elapsed_time": 6296.688944816589, "loss": 0.2413, "grad_norm": 0.26408830285072327, "learning_rate": 0.00011987421383647798, "epoch": 0.405 }, { "step": 1249, "timestamp": "2025-12-28T10:43:11.586719", "elapsed_time": 6305.871813774109, "loss": 0.1202, "grad_norm": 0.11835648864507675, "learning_rate": 0.00011974842767295597, "epoch": 0.405625 }, { "step": 1250, "timestamp": "2025-12-28T10:43:17.695080", "elapsed_time": 6311.9801704883575, "loss": 0.2076, "grad_norm": 0.24628107249736786, "learning_rate": 0.00011962264150943397, "epoch": 0.40625 }, { "step": 1251, "timestamp": "2025-12-28T10:43:26.526357", "elapsed_time": 6320.811446428299, "loss": 0.3627, "grad_norm": 0.14971527457237244, "learning_rate": 0.00011949685534591195, "epoch": 0.406875 }, { "step": 1252, "timestamp": "2025-12-28T10:43:32.439133", "elapsed_time": 6326.724223136902, "loss": 0.2245, "grad_norm": 0.16869717836380005, "learning_rate": 0.00011937106918238994, "epoch": 0.4075 }, { "step": 1253, "timestamp": "2025-12-28T10:43:40.414515", "elapsed_time": 6334.699605703354, "loss": 0.1473, "grad_norm": 0.12115878611803055, "learning_rate": 0.00011924528301886793, "epoch": 0.408125 }, { "step": 1254, "timestamp": "2025-12-28T10:43:47.555112", "elapsed_time": 6341.840202093124, "loss": 0.1415, "grad_norm": 0.3590112328529358, "learning_rate": 0.00011911949685534591, "epoch": 0.40875 }, { "step": 1255, "timestamp": "2025-12-28T10:43:59.076934", "elapsed_time": 6353.36202454567, "loss": 0.166, "grad_norm": 0.13714636862277985, "learning_rate": 0.0001189937106918239, "epoch": 0.409375 }, { "step": 1256, "timestamp": "2025-12-28T10:44:07.141027", "elapsed_time": 6361.426117897034, "loss": 0.1308, "grad_norm": 0.1114254966378212, "learning_rate": 0.00011886792452830188, "epoch": 0.41 }, { "step": 1257, "timestamp": "2025-12-28T10:44:14.960681", "elapsed_time": 6369.2457716465, "loss": 0.1211, "grad_norm": 0.10369876772165298, "learning_rate": 0.00011874213836477988, "epoch": 0.410625 }, { "step": 1258, "timestamp": "2025-12-28T10:44:21.307788", "elapsed_time": 6375.592882871628, "loss": 0.1538, "grad_norm": 0.11906347423791885, "learning_rate": 0.00011861635220125787, "epoch": 0.41125 }, { "step": 1259, "timestamp": "2025-12-28T10:44:34.306623", "elapsed_time": 6388.5917139053345, "loss": 0.1692, "grad_norm": 0.1116495355963707, "learning_rate": 0.00011849056603773585, "epoch": 0.411875 }, { "step": 1260, "timestamp": "2025-12-28T10:44:46.440601", "elapsed_time": 6400.725692033768, "loss": 0.1485, "grad_norm": 0.1158275455236435, "learning_rate": 0.00011836477987421384, "epoch": 0.4125 }, { "step": 1261, "timestamp": "2025-12-28T10:44:51.167156", "elapsed_time": 6405.452246665955, "loss": 0.1906, "grad_norm": 0.17199426889419556, "learning_rate": 0.00011823899371069183, "epoch": 0.413125 }, { "step": 1262, "timestamp": "2025-12-28T10:44:55.766324", "elapsed_time": 6410.051414012909, "loss": 0.5447, "grad_norm": 0.2059682309627533, "learning_rate": 0.00011811320754716981, "epoch": 0.41375 }, { "step": 1263, "timestamp": "2025-12-28T10:44:59.626300", "elapsed_time": 6413.911394357681, "loss": 0.2163, "grad_norm": 0.16556908190250397, "learning_rate": 0.0001179874213836478, "epoch": 0.414375 }, { "step": 1264, "timestamp": "2025-12-28T10:45:10.040506", "elapsed_time": 6424.32559633255, "loss": 0.1534, "grad_norm": 0.10824116319417953, "learning_rate": 0.00011786163522012578, "epoch": 0.415 }, { "step": 1265, "timestamp": "2025-12-28T10:45:18.021995", "elapsed_time": 6432.307085752487, "loss": 0.2078, "grad_norm": 0.1333610564470291, "learning_rate": 0.00011773584905660378, "epoch": 0.415625 }, { "step": 1266, "timestamp": "2025-12-28T10:45:27.112975", "elapsed_time": 6441.398065567017, "loss": 0.1441, "grad_norm": 0.12403620034456253, "learning_rate": 0.00011761006289308177, "epoch": 0.41625 }, { "step": 1267, "timestamp": "2025-12-28T10:45:33.196456", "elapsed_time": 6447.4815464019775, "loss": 0.3569, "grad_norm": 0.16776348650455475, "learning_rate": 0.00011748427672955975, "epoch": 0.416875 }, { "step": 1268, "timestamp": "2025-12-28T10:45:44.984909", "elapsed_time": 6459.269999265671, "loss": 0.1378, "grad_norm": 0.10034506022930145, "learning_rate": 0.00011735849056603774, "epoch": 0.4175 }, { "step": 1269, "timestamp": "2025-12-28T10:45:53.809026", "elapsed_time": 6468.0941162109375, "loss": 0.2125, "grad_norm": 0.1351146399974823, "learning_rate": 0.00011723270440251572, "epoch": 0.418125 }, { "step": 1270, "timestamp": "2025-12-28T10:45:58.659300", "elapsed_time": 6472.944390773773, "loss": 0.1865, "grad_norm": 0.14262175559997559, "learning_rate": 0.00011710691823899371, "epoch": 0.41875 }, { "step": 1271, "timestamp": "2025-12-28T10:46:04.637730", "elapsed_time": 6478.922821044922, "loss": 0.194, "grad_norm": 0.14949339628219604, "learning_rate": 0.0001169811320754717, "epoch": 0.419375 }, { "step": 1272, "timestamp": "2025-12-28T10:46:12.419593", "elapsed_time": 6486.704683542252, "loss": 0.169, "grad_norm": 0.12632359564304352, "learning_rate": 0.00011685534591194968, "epoch": 0.42 }, { "step": 1273, "timestamp": "2025-12-28T10:46:28.716300", "elapsed_time": 6503.0013909339905, "loss": 0.115, "grad_norm": 0.08015146851539612, "learning_rate": 0.00011672955974842768, "epoch": 0.420625 }, { "step": 1274, "timestamp": "2025-12-28T10:46:32.620770", "elapsed_time": 6506.905859708786, "loss": 0.1909, "grad_norm": 0.16493003070354462, "learning_rate": 0.00011660377358490567, "epoch": 0.42125 }, { "step": 1275, "timestamp": "2025-12-28T10:46:43.229055", "elapsed_time": 6517.5141451358795, "loss": 0.166, "grad_norm": 0.10563270002603531, "learning_rate": 0.00011647798742138365, "epoch": 0.421875 }, { "step": 1276, "timestamp": "2025-12-28T10:46:50.110258", "elapsed_time": 6524.395348310471, "loss": 0.2099, "grad_norm": 0.1380920559167862, "learning_rate": 0.00011635220125786164, "epoch": 0.4225 }, { "step": 1277, "timestamp": "2025-12-28T10:46:58.322692", "elapsed_time": 6532.607782125473, "loss": 0.1598, "grad_norm": 0.11996757984161377, "learning_rate": 0.00011622641509433962, "epoch": 0.423125 }, { "step": 1278, "timestamp": "2025-12-28T10:47:03.207181", "elapsed_time": 6537.49227142334, "loss": 0.1789, "grad_norm": 0.14649128913879395, "learning_rate": 0.00011610062893081761, "epoch": 0.42375 }, { "step": 1279, "timestamp": "2025-12-28T10:47:09.522712", "elapsed_time": 6543.807803153992, "loss": 0.1889, "grad_norm": 0.1377793848514557, "learning_rate": 0.0001159748427672956, "epoch": 0.424375 }, { "step": 1280, "timestamp": "2025-12-28T10:47:18.362641", "elapsed_time": 6552.6477308273315, "loss": 0.1573, "grad_norm": 0.11395595967769623, "learning_rate": 0.00011584905660377358, "epoch": 0.425 }, { "step": 1281, "timestamp": "2025-12-28T10:47:24.849601", "elapsed_time": 6559.134691953659, "loss": 0.3999, "grad_norm": 0.17579996585845947, "learning_rate": 0.00011572327044025158, "epoch": 0.425625 }, { "step": 1282, "timestamp": "2025-12-28T10:47:37.488885", "elapsed_time": 6571.773975133896, "loss": 0.1319, "grad_norm": 0.09526728838682175, "learning_rate": 0.00011559748427672956, "epoch": 0.42625 }, { "step": 1283, "timestamp": "2025-12-28T10:47:45.629523", "elapsed_time": 6579.914613485336, "loss": 0.1803, "grad_norm": 0.12247025221586227, "learning_rate": 0.00011547169811320755, "epoch": 0.426875 }, { "step": 1284, "timestamp": "2025-12-28T10:47:50.156209", "elapsed_time": 6584.441299676895, "loss": 0.2226, "grad_norm": 0.15616583824157715, "learning_rate": 0.00011534591194968554, "epoch": 0.4275 }, { "step": 1285, "timestamp": "2025-12-28T10:47:58.291658", "elapsed_time": 6592.576747894287, "loss": 0.1435, "grad_norm": 0.11908165365457535, "learning_rate": 0.00011522012578616352, "epoch": 0.428125 }, { "step": 1286, "timestamp": "2025-12-28T10:48:03.589513", "elapsed_time": 6597.874603271484, "loss": 0.253, "grad_norm": 0.15733444690704346, "learning_rate": 0.00011509433962264151, "epoch": 0.42875 }, { "step": 1287, "timestamp": "2025-12-28T10:48:15.662909", "elapsed_time": 6609.947999715805, "loss": 0.1346, "grad_norm": 0.08608326315879822, "learning_rate": 0.0001149685534591195, "epoch": 0.429375 }, { "step": 1288, "timestamp": "2025-12-28T10:48:26.180269", "elapsed_time": 6620.465359449387, "loss": 0.159, "grad_norm": 0.10130292922258377, "learning_rate": 0.00011484276729559748, "epoch": 0.43 }, { "step": 1289, "timestamp": "2025-12-28T10:48:31.670014", "elapsed_time": 6625.955104589462, "loss": 0.1755, "grad_norm": 0.1298661231994629, "learning_rate": 0.00011471698113207548, "epoch": 0.430625 }, { "step": 1290, "timestamp": "2025-12-28T10:48:38.047130", "elapsed_time": 6632.332220554352, "loss": 0.1663, "grad_norm": 0.12876641750335693, "learning_rate": 0.00011459119496855346, "epoch": 0.43125 }, { "step": 1291, "timestamp": "2025-12-28T10:48:49.938884", "elapsed_time": 6644.223974466324, "loss": 0.1259, "grad_norm": 0.08893437683582306, "learning_rate": 0.00011446540880503145, "epoch": 0.431875 }, { "step": 1292, "timestamp": "2025-12-28T10:48:56.281897", "elapsed_time": 6650.566987276077, "loss": 0.2474, "grad_norm": 0.1418265700340271, "learning_rate": 0.00011433962264150944, "epoch": 0.4325 }, { "step": 1293, "timestamp": "2025-12-28T10:49:04.023046", "elapsed_time": 6658.308136463165, "loss": 0.1673, "grad_norm": 0.12449798732995987, "learning_rate": 0.00011421383647798742, "epoch": 0.433125 }, { "step": 1294, "timestamp": "2025-12-28T10:49:18.892772", "elapsed_time": 6673.177862167358, "loss": 0.143, "grad_norm": 0.10782869905233383, "learning_rate": 0.00011408805031446541, "epoch": 0.43375 }, { "step": 1295, "timestamp": "2025-12-28T10:49:29.168117", "elapsed_time": 6683.45320725441, "loss": 0.1301, "grad_norm": 0.09740663319826126, "learning_rate": 0.00011396226415094339, "epoch": 0.434375 }, { "step": 1296, "timestamp": "2025-12-28T10:49:48.401036", "elapsed_time": 6702.686126708984, "loss": 0.1528, "grad_norm": 0.08156444132328033, "learning_rate": 0.00011383647798742138, "epoch": 0.435 }, { "step": 1297, "timestamp": "2025-12-28T10:50:01.198414", "elapsed_time": 6715.483504772186, "loss": 0.1212, "grad_norm": 0.09743256121873856, "learning_rate": 0.00011371069182389938, "epoch": 0.435625 }, { "step": 1298, "timestamp": "2025-12-28T10:50:09.499924", "elapsed_time": 6723.785014867783, "loss": 0.1612, "grad_norm": 0.12712618708610535, "learning_rate": 0.00011358490566037736, "epoch": 0.43625 }, { "step": 1299, "timestamp": "2025-12-28T10:50:17.483321", "elapsed_time": 6731.768411159515, "loss": 0.1511, "grad_norm": 0.13302955031394958, "learning_rate": 0.00011345911949685535, "epoch": 0.436875 }, { "step": 1300, "timestamp": "2025-12-28T10:50:27.725225", "elapsed_time": 6742.010316371918, "loss": 0.1615, "grad_norm": 0.10727046430110931, "learning_rate": 0.00011333333333333334, "epoch": 0.4375 }, { "step": 1301, "timestamp": "2025-12-28T10:50:36.587116", "elapsed_time": 6750.872206449509, "loss": 0.2059, "grad_norm": 0.15867577493190765, "learning_rate": 0.00011320754716981132, "epoch": 0.438125 }, { "step": 1302, "timestamp": "2025-12-28T10:50:41.926696", "elapsed_time": 6756.21178650856, "loss": 0.2088, "grad_norm": 0.15406003594398499, "learning_rate": 0.00011308176100628931, "epoch": 0.43875 }, { "step": 1303, "timestamp": "2025-12-28T10:50:49.475802", "elapsed_time": 6763.760892391205, "loss": 0.2062, "grad_norm": 0.13464148342609406, "learning_rate": 0.00011295597484276729, "epoch": 0.439375 }, { "step": 1304, "timestamp": "2025-12-28T10:50:53.491533", "elapsed_time": 6767.776623249054, "loss": 0.2728, "grad_norm": 0.2125401347875595, "learning_rate": 0.00011283018867924528, "epoch": 0.44 }, { "step": 1305, "timestamp": "2025-12-28T10:51:02.825718", "elapsed_time": 6777.110809087753, "loss": 0.1534, "grad_norm": 0.10923696309328079, "learning_rate": 0.00011270440251572328, "epoch": 0.440625 }, { "step": 1306, "timestamp": "2025-12-28T10:51:12.781667", "elapsed_time": 6787.066757440567, "loss": 0.4045, "grad_norm": 0.1884467899799347, "learning_rate": 0.00011257861635220126, "epoch": 0.44125 }, { "step": 1307, "timestamp": "2025-12-28T10:51:21.728953", "elapsed_time": 6796.014044284821, "loss": 0.1939, "grad_norm": 0.12376397848129272, "learning_rate": 0.00011245283018867925, "epoch": 0.441875 }, { "step": 1308, "timestamp": "2025-12-28T10:51:33.630122", "elapsed_time": 6807.915212869644, "loss": 0.1559, "grad_norm": 0.09741462022066116, "learning_rate": 0.00011232704402515724, "epoch": 0.4425 }, { "step": 1309, "timestamp": "2025-12-28T10:51:47.370883", "elapsed_time": 6821.65597319603, "loss": 0.1371, "grad_norm": 0.08514436334371567, "learning_rate": 0.00011220125786163522, "epoch": 0.443125 }, { "step": 1310, "timestamp": "2025-12-28T10:52:02.892235", "elapsed_time": 6837.177325725555, "loss": 0.109, "grad_norm": 0.09866739064455032, "learning_rate": 0.00011207547169811321, "epoch": 0.44375 }, { "step": 1311, "timestamp": "2025-12-28T10:52:13.507601", "elapsed_time": 6847.792692184448, "loss": 0.1884, "grad_norm": 0.10366571694612503, "learning_rate": 0.00011194968553459119, "epoch": 0.444375 }, { "step": 1312, "timestamp": "2025-12-28T10:52:23.266819", "elapsed_time": 6857.551909446716, "loss": 0.2008, "grad_norm": 0.11828729510307312, "learning_rate": 0.00011182389937106919, "epoch": 0.445 }, { "step": 1313, "timestamp": "2025-12-28T10:52:33.470306", "elapsed_time": 6867.755395889282, "loss": 0.1863, "grad_norm": 0.11549749970436096, "learning_rate": 0.00011169811320754718, "epoch": 0.445625 }, { "step": 1314, "timestamp": "2025-12-28T10:52:42.647558", "elapsed_time": 6876.932648897171, "loss": 0.1582, "grad_norm": 0.11198070645332336, "learning_rate": 0.00011157232704402516, "epoch": 0.44625 }, { "step": 1315, "timestamp": "2025-12-28T10:52:52.551145", "elapsed_time": 6886.8362357616425, "loss": 0.1355, "grad_norm": 0.10204551368951797, "learning_rate": 0.00011144654088050315, "epoch": 0.446875 }, { "step": 1316, "timestamp": "2025-12-28T10:53:04.736390", "elapsed_time": 6899.021480321884, "loss": 0.1699, "grad_norm": 0.10276511311531067, "learning_rate": 0.00011132075471698113, "epoch": 0.4475 }, { "step": 1317, "timestamp": "2025-12-28T10:53:10.665741", "elapsed_time": 6904.950830936432, "loss": 0.2933, "grad_norm": 0.16935305297374725, "learning_rate": 0.00011119496855345912, "epoch": 0.448125 }, { "step": 1318, "timestamp": "2025-12-28T10:53:15.139070", "elapsed_time": 6909.424160718918, "loss": 0.2412, "grad_norm": 0.18222883343696594, "learning_rate": 0.00011106918238993711, "epoch": 0.44875 }, { "step": 1319, "timestamp": "2025-12-28T10:53:23.782033", "elapsed_time": 6918.067123413086, "loss": 0.128, "grad_norm": 0.11177527904510498, "learning_rate": 0.00011094339622641509, "epoch": 0.449375 }, { "step": 1320, "timestamp": "2025-12-28T10:53:36.143000", "elapsed_time": 6930.428090333939, "loss": 0.149, "grad_norm": 0.12012763321399689, "learning_rate": 0.00011081761006289309, "epoch": 0.45 }, { "step": 1321, "timestamp": "2025-12-28T10:53:44.449019", "elapsed_time": 6938.734109163284, "loss": 0.1378, "grad_norm": 0.11260993033647537, "learning_rate": 0.00011069182389937108, "epoch": 0.450625 }, { "step": 1322, "timestamp": "2025-12-28T10:53:53.834895", "elapsed_time": 6948.119985580444, "loss": 0.2031, "grad_norm": 0.13370606303215027, "learning_rate": 0.00011056603773584906, "epoch": 0.45125 }, { "step": 1323, "timestamp": "2025-12-28T10:54:03.643538", "elapsed_time": 6957.928628444672, "loss": 0.1721, "grad_norm": 0.12418955564498901, "learning_rate": 0.00011044025157232705, "epoch": 0.451875 }, { "step": 1324, "timestamp": "2025-12-28T10:54:10.718971", "elapsed_time": 6965.004061698914, "loss": 0.217, "grad_norm": 0.15134598314762115, "learning_rate": 0.00011031446540880503, "epoch": 0.4525 }, { "step": 1325, "timestamp": "2025-12-28T10:54:27.565308", "elapsed_time": 6981.850398540497, "loss": 0.1303, "grad_norm": 0.08781035244464874, "learning_rate": 0.00011018867924528302, "epoch": 0.453125 }, { "step": 1326, "timestamp": "2025-12-28T10:54:47.648331", "elapsed_time": 7001.933421134949, "loss": 0.1125, "grad_norm": 0.083819180727005, "learning_rate": 0.00011006289308176101, "epoch": 0.45375 }, { "step": 1327, "timestamp": "2025-12-28T10:55:08.535086", "elapsed_time": 7022.820177078247, "loss": 0.0996, "grad_norm": 0.06516632437705994, "learning_rate": 0.000109937106918239, "epoch": 0.454375 }, { "step": 1328, "timestamp": "2025-12-28T10:55:16.596328", "elapsed_time": 7030.881418466568, "loss": 0.2083, "grad_norm": 0.2553313672542572, "learning_rate": 0.00010981132075471699, "epoch": 0.455 }, { "step": 1329, "timestamp": "2025-12-28T10:55:22.137094", "elapsed_time": 7036.4221839904785, "loss": 0.2141, "grad_norm": 0.16487905383110046, "learning_rate": 0.00010968553459119497, "epoch": 0.455625 }, { "step": 1330, "timestamp": "2025-12-28T10:55:27.891844", "elapsed_time": 7042.176934480667, "loss": 0.1958, "grad_norm": 0.2868243157863617, "learning_rate": 0.00010955974842767296, "epoch": 0.45625 }, { "step": 1331, "timestamp": "2025-12-28T10:55:40.387369", "elapsed_time": 7054.672459602356, "loss": 0.1071, "grad_norm": 0.08470939844846725, "learning_rate": 0.00010943396226415095, "epoch": 0.456875 }, { "step": 1332, "timestamp": "2025-12-28T10:55:46.589006", "elapsed_time": 7060.8740956783295, "loss": 0.2562, "grad_norm": 0.16096508502960205, "learning_rate": 0.00010930817610062893, "epoch": 0.4575 }, { "step": 1333, "timestamp": "2025-12-28T10:55:59.879933", "elapsed_time": 7074.165023565292, "loss": 0.1492, "grad_norm": 0.10115651786327362, "learning_rate": 0.00010918238993710692, "epoch": 0.458125 }, { "step": 1334, "timestamp": "2025-12-28T10:56:06.938716", "elapsed_time": 7081.223806381226, "loss": 0.1331, "grad_norm": 0.12268812209367752, "learning_rate": 0.00010905660377358491, "epoch": 0.45875 }, { "step": 1335, "timestamp": "2025-12-28T10:56:17.194994", "elapsed_time": 7091.4800844192505, "loss": 0.1838, "grad_norm": 0.1212567612528801, "learning_rate": 0.0001089308176100629, "epoch": 0.459375 }, { "step": 1336, "timestamp": "2025-12-28T10:56:25.631434", "elapsed_time": 7099.916523933411, "loss": 0.1439, "grad_norm": 0.11041584610939026, "learning_rate": 0.00010880503144654089, "epoch": 0.46 }, { "step": 1337, "timestamp": "2025-12-28T10:56:32.228488", "elapsed_time": 7106.513578653336, "loss": 0.1623, "grad_norm": 0.14172060787677765, "learning_rate": 0.00010867924528301887, "epoch": 0.460625 }, { "step": 1338, "timestamp": "2025-12-28T10:56:39.404079", "elapsed_time": 7113.689169168472, "loss": 0.1669, "grad_norm": 0.1437670886516571, "learning_rate": 0.00010855345911949686, "epoch": 0.46125 }, { "step": 1339, "timestamp": "2025-12-28T10:56:52.538722", "elapsed_time": 7126.823812246323, "loss": 0.122, "grad_norm": 0.08426967263221741, "learning_rate": 0.00010842767295597485, "epoch": 0.461875 }, { "step": 1340, "timestamp": "2025-12-28T10:56:58.657468", "elapsed_time": 7132.942558288574, "loss": 0.1871, "grad_norm": 0.13586315512657166, "learning_rate": 0.00010830188679245283, "epoch": 0.4625 }, { "step": 1341, "timestamp": "2025-12-28T10:57:14.320116", "elapsed_time": 7148.605206251144, "loss": 0.1435, "grad_norm": 0.09016376733779907, "learning_rate": 0.00010817610062893082, "epoch": 0.463125 }, { "step": 1342, "timestamp": "2025-12-28T10:57:21.801068", "elapsed_time": 7156.086158514023, "loss": 0.235, "grad_norm": 0.1631210297346115, "learning_rate": 0.00010805031446540882, "epoch": 0.46375 }, { "step": 1343, "timestamp": "2025-12-28T10:57:42.686920", "elapsed_time": 7176.972010374069, "loss": 0.1637, "grad_norm": 0.08319737762212753, "learning_rate": 0.0001079245283018868, "epoch": 0.464375 }, { "step": 1344, "timestamp": "2025-12-28T10:57:57.552601", "elapsed_time": 7191.837691068649, "loss": 0.1667, "grad_norm": 0.10576473921537399, "learning_rate": 0.00010779874213836479, "epoch": 0.465 }, { "step": 1345, "timestamp": "2025-12-28T10:58:07.616516", "elapsed_time": 7201.901606798172, "loss": 0.2561, "grad_norm": 0.142886221408844, "learning_rate": 0.00010767295597484277, "epoch": 0.465625 }, { "step": 1346, "timestamp": "2025-12-28T10:58:15.468138", "elapsed_time": 7209.753228902817, "loss": 0.1848, "grad_norm": 0.14011366665363312, "learning_rate": 0.00010754716981132076, "epoch": 0.46625 }, { "step": 1347, "timestamp": "2025-12-28T10:58:20.930984", "elapsed_time": 7215.216074466705, "loss": 0.2569, "grad_norm": 0.16688081622123718, "learning_rate": 0.00010742138364779875, "epoch": 0.466875 }, { "step": 1348, "timestamp": "2025-12-28T10:58:29.272767", "elapsed_time": 7223.557857751846, "loss": 0.1619, "grad_norm": 0.12044026702642441, "learning_rate": 0.00010729559748427673, "epoch": 0.4675 }, { "step": 1349, "timestamp": "2025-12-28T10:58:35.655956", "elapsed_time": 7229.941046714783, "loss": 0.1515, "grad_norm": 0.13383439183235168, "learning_rate": 0.00010716981132075472, "epoch": 0.468125 }, { "step": 1350, "timestamp": "2025-12-28T10:58:51.652984", "elapsed_time": 7245.938074588776, "loss": 0.1298, "grad_norm": 0.08610431104898453, "learning_rate": 0.0001070440251572327, "epoch": 0.46875 }, { "step": 1351, "timestamp": "2025-12-28T10:59:01.082143", "elapsed_time": 7255.367233753204, "loss": 0.1903, "grad_norm": 0.11925891786813736, "learning_rate": 0.0001069182389937107, "epoch": 0.469375 }, { "step": 1352, "timestamp": "2025-12-28T10:59:07.425183", "elapsed_time": 7261.710273742676, "loss": 0.1339, "grad_norm": 0.12832607328891754, "learning_rate": 0.00010679245283018869, "epoch": 0.47 }, { "step": 1353, "timestamp": "2025-12-28T10:59:14.908303", "elapsed_time": 7269.193393468857, "loss": 0.1308, "grad_norm": 0.10823425650596619, "learning_rate": 0.00010666666666666667, "epoch": 0.470625 }, { "step": 1354, "timestamp": "2025-12-28T10:59:22.938044", "elapsed_time": 7277.223134994507, "loss": 0.256, "grad_norm": 0.16312338411808014, "learning_rate": 0.00010654088050314466, "epoch": 0.47125 }, { "step": 1355, "timestamp": "2025-12-28T10:59:32.232549", "elapsed_time": 7286.517638921738, "loss": 0.125, "grad_norm": 0.10023007541894913, "learning_rate": 0.00010641509433962265, "epoch": 0.471875 }, { "step": 1356, "timestamp": "2025-12-28T10:59:43.645190", "elapsed_time": 7297.930280685425, "loss": 0.3132, "grad_norm": 0.14320826530456543, "learning_rate": 0.00010628930817610063, "epoch": 0.4725 }, { "step": 1357, "timestamp": "2025-12-28T10:59:49.851471", "elapsed_time": 7304.136561393738, "loss": 0.229, "grad_norm": 0.18624381721019745, "learning_rate": 0.00010616352201257862, "epoch": 0.473125 }, { "step": 1358, "timestamp": "2025-12-28T10:59:58.793059", "elapsed_time": 7313.0781490802765, "loss": 0.1375, "grad_norm": 0.10548731684684753, "learning_rate": 0.0001060377358490566, "epoch": 0.47375 }, { "step": 1359, "timestamp": "2025-12-28T11:00:10.967893", "elapsed_time": 7325.252983093262, "loss": 0.1417, "grad_norm": 0.10804691165685654, "learning_rate": 0.0001059119496855346, "epoch": 0.474375 }, { "step": 1360, "timestamp": "2025-12-28T11:00:16.238424", "elapsed_time": 7330.523514509201, "loss": 0.2253, "grad_norm": 0.1500820368528366, "learning_rate": 0.00010578616352201259, "epoch": 0.475 }, { "step": 1361, "timestamp": "2025-12-28T11:00:24.712628", "elapsed_time": 7338.997718095779, "loss": 0.1785, "grad_norm": 0.12143810838460922, "learning_rate": 0.00010566037735849057, "epoch": 0.475625 }, { "step": 1362, "timestamp": "2025-12-28T11:00:33.573643", "elapsed_time": 7347.858733415604, "loss": 0.176, "grad_norm": 0.13725930452346802, "learning_rate": 0.00010553459119496856, "epoch": 0.47625 }, { "step": 1363, "timestamp": "2025-12-28T11:00:41.508324", "elapsed_time": 7355.7934148311615, "loss": 0.1825, "grad_norm": 0.13526056706905365, "learning_rate": 0.00010540880503144654, "epoch": 0.476875 }, { "step": 1364, "timestamp": "2025-12-28T11:00:47.043992", "elapsed_time": 7361.329082250595, "loss": 0.2259, "grad_norm": 0.16327698528766632, "learning_rate": 0.00010528301886792453, "epoch": 0.4775 }, { "step": 1365, "timestamp": "2025-12-28T11:00:57.065386", "elapsed_time": 7371.350476741791, "loss": 0.182, "grad_norm": 0.12609660625457764, "learning_rate": 0.00010515723270440252, "epoch": 0.478125 }, { "step": 1366, "timestamp": "2025-12-28T11:01:06.163415", "elapsed_time": 7380.4485058784485, "loss": 0.1756, "grad_norm": 0.1293572634458542, "learning_rate": 0.0001050314465408805, "epoch": 0.47875 }, { "step": 1367, "timestamp": "2025-12-28T11:01:15.379986", "elapsed_time": 7389.665076971054, "loss": 0.1195, "grad_norm": 0.0980340987443924, "learning_rate": 0.0001049056603773585, "epoch": 0.479375 }, { "step": 1368, "timestamp": "2025-12-28T11:01:23.731574", "elapsed_time": 7398.016664505005, "loss": 0.2087, "grad_norm": 0.1566362828016281, "learning_rate": 0.00010477987421383649, "epoch": 0.48 }, { "step": 1369, "timestamp": "2025-12-28T11:01:38.713040", "elapsed_time": 7412.99813079834, "loss": 0.1185, "grad_norm": 0.09030961245298386, "learning_rate": 0.00010465408805031447, "epoch": 0.480625 }, { "step": 1370, "timestamp": "2025-12-28T11:01:47.053864", "elapsed_time": 7421.3389544487, "loss": 0.1549, "grad_norm": 0.10715126991271973, "learning_rate": 0.00010452830188679246, "epoch": 0.48125 }, { "step": 1371, "timestamp": "2025-12-28T11:01:52.967016", "elapsed_time": 7427.252106428146, "loss": 0.253, "grad_norm": 0.16500554978847504, "learning_rate": 0.00010440251572327044, "epoch": 0.481875 }, { "step": 1372, "timestamp": "2025-12-28T11:02:04.814134", "elapsed_time": 7439.0992250442505, "loss": 0.2054, "grad_norm": 0.10960862785577774, "learning_rate": 0.00010427672955974843, "epoch": 0.4825 }, { "step": 1373, "timestamp": "2025-12-28T11:02:20.468162", "elapsed_time": 7454.7532522678375, "loss": 0.1554, "grad_norm": 0.12754300236701965, "learning_rate": 0.00010415094339622642, "epoch": 0.483125 }, { "step": 1374, "timestamp": "2025-12-28T11:02:28.644614", "elapsed_time": 7462.929704427719, "loss": 0.1667, "grad_norm": 0.12952545285224915, "learning_rate": 0.0001040251572327044, "epoch": 0.48375 }, { "step": 1375, "timestamp": "2025-12-28T11:02:33.987063", "elapsed_time": 7468.2721536159515, "loss": 0.221, "grad_norm": 0.1964443176984787, "learning_rate": 0.0001038993710691824, "epoch": 0.484375 }, { "step": 1376, "timestamp": "2025-12-28T11:02:45.242830", "elapsed_time": 7479.527920484543, "loss": 0.1549, "grad_norm": 0.10678889602422714, "learning_rate": 0.00010377358490566037, "epoch": 0.485 }, { "step": 1377, "timestamp": "2025-12-28T11:02:57.881441", "elapsed_time": 7492.166531324387, "loss": 0.1292, "grad_norm": 0.1204121932387352, "learning_rate": 0.00010364779874213837, "epoch": 0.485625 }, { "step": 1378, "timestamp": "2025-12-28T11:03:05.127176", "elapsed_time": 7499.412266731262, "loss": 0.2362, "grad_norm": 0.14369583129882812, "learning_rate": 0.00010352201257861636, "epoch": 0.48625 }, { "step": 1379, "timestamp": "2025-12-28T11:03:08.953753", "elapsed_time": 7503.238843917847, "loss": 0.3803, "grad_norm": 0.21724280714988708, "learning_rate": 0.00010339622641509434, "epoch": 0.486875 }, { "step": 1380, "timestamp": "2025-12-28T11:03:18.974982", "elapsed_time": 7513.26007270813, "loss": 0.1546, "grad_norm": 0.1561720371246338, "learning_rate": 0.00010327044025157233, "epoch": 0.4875 }, { "step": 1381, "timestamp": "2025-12-28T11:03:23.926806", "elapsed_time": 7518.211896419525, "loss": 0.2732, "grad_norm": 0.17296800017356873, "learning_rate": 0.00010314465408805032, "epoch": 0.488125 }, { "step": 1382, "timestamp": "2025-12-28T11:03:42.230374", "elapsed_time": 7536.5154638290405, "loss": 0.122, "grad_norm": 0.08095800131559372, "learning_rate": 0.0001030188679245283, "epoch": 0.48875 }, { "step": 1383, "timestamp": "2025-12-28T11:03:46.737340", "elapsed_time": 7541.022430181503, "loss": 0.2151, "grad_norm": 0.17441503703594208, "learning_rate": 0.0001028930817610063, "epoch": 0.489375 }, { "step": 1384, "timestamp": "2025-12-28T11:03:58.400597", "elapsed_time": 7552.685687303543, "loss": 0.1857, "grad_norm": 0.12047084420919418, "learning_rate": 0.00010276729559748428, "epoch": 0.49 }, { "step": 1385, "timestamp": "2025-12-28T11:04:04.778751", "elapsed_time": 7559.063841819763, "loss": 0.2076, "grad_norm": 0.1551089733839035, "learning_rate": 0.00010264150943396227, "epoch": 0.490625 }, { "step": 1386, "timestamp": "2025-12-28T11:04:13.091378", "elapsed_time": 7567.37646818161, "loss": 0.1264, "grad_norm": 0.10384256392717361, "learning_rate": 0.00010251572327044026, "epoch": 0.49125 }, { "step": 1387, "timestamp": "2025-12-28T11:04:21.309367", "elapsed_time": 7575.594457626343, "loss": 0.3628, "grad_norm": 0.22657428681850433, "learning_rate": 0.00010238993710691824, "epoch": 0.491875 }, { "step": 1388, "timestamp": "2025-12-28T11:04:29.487914", "elapsed_time": 7583.773004293442, "loss": 0.2039, "grad_norm": 0.13718171417713165, "learning_rate": 0.00010226415094339623, "epoch": 0.4925 }, { "step": 1389, "timestamp": "2025-12-28T11:04:42.300103", "elapsed_time": 7596.585193157196, "loss": 0.1706, "grad_norm": 0.10597704350948334, "learning_rate": 0.00010213836477987422, "epoch": 0.493125 }, { "step": 1390, "timestamp": "2025-12-28T11:05:03.024157", "elapsed_time": 7617.309247255325, "loss": 0.1205, "grad_norm": 0.07673165202140808, "learning_rate": 0.0001020125786163522, "epoch": 0.49375 }, { "step": 1391, "timestamp": "2025-12-28T11:05:10.494896", "elapsed_time": 7624.779986381531, "loss": 0.1777, "grad_norm": 0.12185313552618027, "learning_rate": 0.0001018867924528302, "epoch": 0.494375 }, { "step": 1392, "timestamp": "2025-12-28T11:05:16.240771", "elapsed_time": 7630.525861024857, "loss": 0.1678, "grad_norm": 0.1360418051481247, "learning_rate": 0.00010176100628930818, "epoch": 0.495 }, { "step": 1393, "timestamp": "2025-12-28T11:05:25.834105", "elapsed_time": 7640.119195222855, "loss": 0.3532, "grad_norm": 0.14602701365947723, "learning_rate": 0.00010163522012578617, "epoch": 0.495625 }, { "step": 1394, "timestamp": "2025-12-28T11:05:41.365149", "elapsed_time": 7655.650239467621, "loss": 0.1404, "grad_norm": 0.08702640235424042, "learning_rate": 0.00010150943396226416, "epoch": 0.49625 }, { "step": 1395, "timestamp": "2025-12-28T11:05:48.620666", "elapsed_time": 7662.905756235123, "loss": 0.222, "grad_norm": 0.15947473049163818, "learning_rate": 0.00010138364779874214, "epoch": 0.496875 }, { "step": 1396, "timestamp": "2025-12-28T11:05:58.248188", "elapsed_time": 7672.533278942108, "loss": 0.3532, "grad_norm": 0.15767693519592285, "learning_rate": 0.00010125786163522013, "epoch": 0.4975 }, { "step": 1397, "timestamp": "2025-12-28T11:06:04.588958", "elapsed_time": 7678.874048471451, "loss": 0.2166, "grad_norm": 0.14681337773799896, "learning_rate": 0.00010113207547169811, "epoch": 0.498125 }, { "step": 1398, "timestamp": "2025-12-28T11:06:08.944856", "elapsed_time": 7683.22994685173, "loss": 0.2352, "grad_norm": 0.19252383708953857, "learning_rate": 0.0001010062893081761, "epoch": 0.49875 }, { "step": 1399, "timestamp": "2025-12-28T11:06:18.169645", "elapsed_time": 7692.454735279083, "loss": 0.2846, "grad_norm": 0.14431574940681458, "learning_rate": 0.0001008805031446541, "epoch": 0.499375 }, { "step": 1400, "timestamp": "2025-12-28T11:06:23.381124", "elapsed_time": 7697.666215181351, "loss": 0.2615, "grad_norm": 0.18521860241889954, "learning_rate": 0.00010075471698113208, "epoch": 0.5 }, { "step": 1401, "timestamp": "2025-12-28T11:06:31.966311", "elapsed_time": 7706.2514016628265, "loss": 0.2161, "grad_norm": 0.12801428139209747, "learning_rate": 0.00010062893081761007, "epoch": 0.500625 }, { "step": 1402, "timestamp": "2025-12-28T11:06:36.665411", "elapsed_time": 7710.950501441956, "loss": 0.2801, "grad_norm": 0.18982771039009094, "learning_rate": 0.00010050314465408806, "epoch": 0.50125 }, { "step": 1403, "timestamp": "2025-12-28T11:06:40.414131", "elapsed_time": 7714.699221611023, "loss": 0.2644, "grad_norm": 0.2565673589706421, "learning_rate": 0.00010037735849056604, "epoch": 0.501875 }, { "step": 1404, "timestamp": "2025-12-28T11:06:51.418661", "elapsed_time": 7725.70375084877, "loss": 0.1786, "grad_norm": 0.10981456935405731, "learning_rate": 0.00010025157232704403, "epoch": 0.5025 }, { "step": 1405, "timestamp": "2025-12-28T11:06:57.172405", "elapsed_time": 7731.4574954509735, "loss": 0.4434, "grad_norm": 0.20104235410690308, "learning_rate": 0.00010012578616352201, "epoch": 0.503125 }, { "step": 1406, "timestamp": "2025-12-28T11:07:05.960076", "elapsed_time": 7740.245167016983, "loss": 0.1505, "grad_norm": 0.11026592552661896, "learning_rate": 0.0001, "epoch": 0.50375 }, { "step": 1407, "timestamp": "2025-12-28T11:07:17.486947", "elapsed_time": 7751.7720375061035, "loss": 0.1486, "grad_norm": 0.09958239644765854, "learning_rate": 9.9874213836478e-05, "epoch": 0.504375 }, { "step": 1408, "timestamp": "2025-12-28T11:07:26.472895", "elapsed_time": 7760.757986068726, "loss": 0.1752, "grad_norm": 0.11857740581035614, "learning_rate": 9.974842767295598e-05, "epoch": 0.505 }, { "step": 1409, "timestamp": "2025-12-28T11:07:33.007795", "elapsed_time": 7767.292885303497, "loss": 0.2939, "grad_norm": 0.15093620121479034, "learning_rate": 9.962264150943397e-05, "epoch": 0.505625 }, { "step": 1410, "timestamp": "2025-12-28T11:07:41.150713", "elapsed_time": 7775.43580365181, "loss": 0.1838, "grad_norm": 0.1252412497997284, "learning_rate": 9.949685534591195e-05, "epoch": 0.50625 }, { "step": 1411, "timestamp": "2025-12-28T11:07:53.703346", "elapsed_time": 7787.988436460495, "loss": 0.1411, "grad_norm": 0.0989488959312439, "learning_rate": 9.937106918238994e-05, "epoch": 0.506875 }, { "step": 1412, "timestamp": "2025-12-28T11:08:02.343988", "elapsed_time": 7796.629077911377, "loss": 0.1611, "grad_norm": 0.11869240552186966, "learning_rate": 9.924528301886793e-05, "epoch": 0.5075 }, { "step": 1413, "timestamp": "2025-12-28T11:08:12.956190", "elapsed_time": 7807.2412803173065, "loss": 0.1394, "grad_norm": 0.10440147668123245, "learning_rate": 9.911949685534591e-05, "epoch": 0.508125 }, { "step": 1414, "timestamp": "2025-12-28T11:08:18.967906", "elapsed_time": 7813.252996206284, "loss": 0.2214, "grad_norm": 0.16440127789974213, "learning_rate": 9.89937106918239e-05, "epoch": 0.50875 }, { "step": 1415, "timestamp": "2025-12-28T11:08:27.441443", "elapsed_time": 7821.726533174515, "loss": 0.1799, "grad_norm": 0.16303229331970215, "learning_rate": 9.88679245283019e-05, "epoch": 0.509375 }, { "step": 1416, "timestamp": "2025-12-28T11:08:38.770337", "elapsed_time": 7833.055427074432, "loss": 0.1423, "grad_norm": 0.12514905631542206, "learning_rate": 9.874213836477988e-05, "epoch": 0.51 }, { "step": 1417, "timestamp": "2025-12-28T11:08:49.045039", "elapsed_time": 7843.330129623413, "loss": 0.128, "grad_norm": 0.41818127036094666, "learning_rate": 9.861635220125787e-05, "epoch": 0.510625 }, { "step": 1418, "timestamp": "2025-12-28T11:08:56.563729", "elapsed_time": 7850.848820209503, "loss": 0.384, "grad_norm": 0.1607155203819275, "learning_rate": 9.849056603773585e-05, "epoch": 0.51125 }, { "step": 1419, "timestamp": "2025-12-28T11:09:04.998828", "elapsed_time": 7859.2839179039, "loss": 0.1517, "grad_norm": 0.11149600148200989, "learning_rate": 9.836477987421384e-05, "epoch": 0.511875 }, { "step": 1420, "timestamp": "2025-12-28T11:09:15.250027", "elapsed_time": 7869.53511762619, "loss": 0.16, "grad_norm": 0.10607406497001648, "learning_rate": 9.823899371069183e-05, "epoch": 0.5125 }, { "step": 1421, "timestamp": "2025-12-28T11:09:24.383519", "elapsed_time": 7878.668609380722, "loss": 0.206, "grad_norm": 0.12708930671215057, "learning_rate": 9.811320754716981e-05, "epoch": 0.513125 }, { "step": 1422, "timestamp": "2025-12-28T11:09:39.780418", "elapsed_time": 7894.06550860405, "loss": 0.3202, "grad_norm": 0.12644296884536743, "learning_rate": 9.79874213836478e-05, "epoch": 0.51375 }, { "step": 1423, "timestamp": "2025-12-28T11:09:52.468194", "elapsed_time": 7906.753284931183, "loss": 0.1625, "grad_norm": 0.1026865616440773, "learning_rate": 9.786163522012578e-05, "epoch": 0.514375 }, { "step": 1424, "timestamp": "2025-12-28T11:10:07.620418", "elapsed_time": 7921.905508518219, "loss": 0.1569, "grad_norm": 0.08125265687704086, "learning_rate": 9.773584905660378e-05, "epoch": 0.515 }, { "step": 1425, "timestamp": "2025-12-28T11:10:19.799493", "elapsed_time": 7934.084583282471, "loss": 0.1262, "grad_norm": 0.09111649543046951, "learning_rate": 9.761006289308177e-05, "epoch": 0.515625 }, { "step": 1426, "timestamp": "2025-12-28T11:10:25.818900", "elapsed_time": 7940.1039955616, "loss": 0.1587, "grad_norm": 0.1336372345685959, "learning_rate": 9.748427672955975e-05, "epoch": 0.51625 }, { "step": 1427, "timestamp": "2025-12-28T11:10:35.455757", "elapsed_time": 7949.740847349167, "loss": 0.1667, "grad_norm": 0.11319567263126373, "learning_rate": 9.735849056603774e-05, "epoch": 0.516875 }, { "step": 1428, "timestamp": "2025-12-28T11:10:40.995568", "elapsed_time": 7955.280659198761, "loss": 0.1724, "grad_norm": 0.13806995749473572, "learning_rate": 9.723270440251573e-05, "epoch": 0.5175 }, { "step": 1429, "timestamp": "2025-12-28T11:10:51.368410", "elapsed_time": 7965.653500318527, "loss": 0.2761, "grad_norm": 0.12941741943359375, "learning_rate": 9.710691823899371e-05, "epoch": 0.518125 }, { "step": 1430, "timestamp": "2025-12-28T11:10:57.451776", "elapsed_time": 7971.736865758896, "loss": 0.1955, "grad_norm": 0.13250701129436493, "learning_rate": 9.69811320754717e-05, "epoch": 0.51875 }, { "step": 1431, "timestamp": "2025-12-28T11:11:06.229749", "elapsed_time": 7980.514839172363, "loss": 0.1667, "grad_norm": 0.11635927855968475, "learning_rate": 9.685534591194969e-05, "epoch": 0.519375 }, { "step": 1432, "timestamp": "2025-12-28T11:11:15.100261", "elapsed_time": 7989.385351657867, "loss": 0.1818, "grad_norm": 0.1113656610250473, "learning_rate": 9.672955974842768e-05, "epoch": 0.52 }, { "step": 1433, "timestamp": "2025-12-28T11:11:35.992872", "elapsed_time": 8010.277962684631, "loss": 0.1138, "grad_norm": 0.06887345761060715, "learning_rate": 9.660377358490567e-05, "epoch": 0.520625 }, { "step": 1434, "timestamp": "2025-12-28T11:11:46.465445", "elapsed_time": 8020.75053524971, "loss": 0.1542, "grad_norm": 0.1157265156507492, "learning_rate": 9.647798742138365e-05, "epoch": 0.52125 }, { "step": 1435, "timestamp": "2025-12-28T11:11:56.976986", "elapsed_time": 8031.26207613945, "loss": 0.1418, "grad_norm": 0.10130833089351654, "learning_rate": 9.635220125786164e-05, "epoch": 0.521875 }, { "step": 1436, "timestamp": "2025-12-28T11:12:01.707156", "elapsed_time": 8035.9922461509705, "loss": 0.3286, "grad_norm": 0.18273454904556274, "learning_rate": 9.622641509433963e-05, "epoch": 0.5225 }, { "step": 1437, "timestamp": "2025-12-28T11:12:07.714931", "elapsed_time": 8042.000021696091, "loss": 0.4137, "grad_norm": 0.19872736930847168, "learning_rate": 9.610062893081761e-05, "epoch": 0.523125 }, { "step": 1438, "timestamp": "2025-12-28T11:12:19.337217", "elapsed_time": 8053.622307062149, "loss": 0.1493, "grad_norm": 0.1055750697851181, "learning_rate": 9.59748427672956e-05, "epoch": 0.52375 }, { "step": 1439, "timestamp": "2025-12-28T11:12:26.407821", "elapsed_time": 8060.692911624908, "loss": 0.1795, "grad_norm": 0.25722602009773254, "learning_rate": 9.584905660377359e-05, "epoch": 0.524375 }, { "step": 1440, "timestamp": "2025-12-28T11:12:38.538641", "elapsed_time": 8072.823730945587, "loss": 0.1517, "grad_norm": 0.10837842524051666, "learning_rate": 9.572327044025158e-05, "epoch": 0.525 }, { "step": 1441, "timestamp": "2025-12-28T11:12:53.364199", "elapsed_time": 8087.649289131165, "loss": 0.0904, "grad_norm": 0.10741348564624786, "learning_rate": 9.559748427672957e-05, "epoch": 0.525625 }, { "step": 1442, "timestamp": "2025-12-28T11:13:14.254540", "elapsed_time": 8108.539630651474, "loss": 0.0746, "grad_norm": 0.07680816948413849, "learning_rate": 9.547169811320755e-05, "epoch": 0.52625 }, { "step": 1443, "timestamp": "2025-12-28T11:13:29.340440", "elapsed_time": 8123.625530004501, "loss": 0.1088, "grad_norm": 0.07666955888271332, "learning_rate": 9.534591194968554e-05, "epoch": 0.526875 }, { "step": 1444, "timestamp": "2025-12-28T11:13:39.098916", "elapsed_time": 8133.384006500244, "loss": 0.2508, "grad_norm": 0.1188189759850502, "learning_rate": 9.522012578616352e-05, "epoch": 0.5275 }, { "step": 1445, "timestamp": "2025-12-28T11:13:44.861179", "elapsed_time": 8139.146269083023, "loss": 0.2474, "grad_norm": 0.15990658104419708, "learning_rate": 9.509433962264151e-05, "epoch": 0.528125 }, { "step": 1446, "timestamp": "2025-12-28T11:13:52.613787", "elapsed_time": 8146.898877620697, "loss": 0.32, "grad_norm": 0.15008442103862762, "learning_rate": 9.496855345911951e-05, "epoch": 0.52875 }, { "step": 1447, "timestamp": "2025-12-28T11:14:05.480355", "elapsed_time": 8159.765445947647, "loss": 0.1453, "grad_norm": 0.0898258164525032, "learning_rate": 9.484276729559749e-05, "epoch": 0.529375 }, { "step": 1448, "timestamp": "2025-12-28T11:14:15.110555", "elapsed_time": 8169.395644903183, "loss": 0.187, "grad_norm": 0.1340561956167221, "learning_rate": 9.471698113207548e-05, "epoch": 0.53 }, { "step": 1449, "timestamp": "2025-12-28T11:14:22.202495", "elapsed_time": 8176.487585544586, "loss": 0.176, "grad_norm": 0.16083924472332, "learning_rate": 9.459119496855347e-05, "epoch": 0.530625 }, { "step": 1450, "timestamp": "2025-12-28T11:14:28.754429", "elapsed_time": 8183.039519309998, "loss": 0.2644, "grad_norm": 0.15739032626152039, "learning_rate": 9.446540880503145e-05, "epoch": 0.53125 }, { "step": 1451, "timestamp": "2025-12-28T11:14:38.517478", "elapsed_time": 8192.802568435669, "loss": 0.1586, "grad_norm": 0.11289030313491821, "learning_rate": 9.433962264150944e-05, "epoch": 0.531875 }, { "step": 1452, "timestamp": "2025-12-28T11:14:44.085834", "elapsed_time": 8198.370924711227, "loss": 0.1873, "grad_norm": 0.1545426845550537, "learning_rate": 9.421383647798742e-05, "epoch": 0.5325 }, { "step": 1453, "timestamp": "2025-12-28T11:14:49.388857", "elapsed_time": 8203.673947095871, "loss": 0.4183, "grad_norm": 0.2863948941230774, "learning_rate": 9.408805031446541e-05, "epoch": 0.533125 }, { "step": 1454, "timestamp": "2025-12-28T11:14:59.422428", "elapsed_time": 8213.707518577576, "loss": 0.1046, "grad_norm": 0.0934741273522377, "learning_rate": 9.396226415094341e-05, "epoch": 0.53375 }, { "step": 1455, "timestamp": "2025-12-28T11:15:05.324002", "elapsed_time": 8219.6090965271, "loss": 0.2274, "grad_norm": 0.150177463889122, "learning_rate": 9.383647798742139e-05, "epoch": 0.534375 }, { "step": 1456, "timestamp": "2025-12-28T11:15:09.916095", "elapsed_time": 8224.201185464859, "loss": 0.3107, "grad_norm": 0.19902926683425903, "learning_rate": 9.371069182389938e-05, "epoch": 0.535 }, { "step": 1457, "timestamp": "2025-12-28T11:15:17.157963", "elapsed_time": 8231.443053722382, "loss": 0.2108, "grad_norm": 0.14318348467350006, "learning_rate": 9.358490566037736e-05, "epoch": 0.535625 }, { "step": 1458, "timestamp": "2025-12-28T11:15:23.456852", "elapsed_time": 8237.741942882538, "loss": 0.1734, "grad_norm": 0.1470271497964859, "learning_rate": 9.345911949685535e-05, "epoch": 0.53625 }, { "step": 1459, "timestamp": "2025-12-28T11:15:34.335466", "elapsed_time": 8248.620555877686, "loss": 0.1041, "grad_norm": 0.08832447230815887, "learning_rate": 9.333333333333334e-05, "epoch": 0.536875 }, { "step": 1460, "timestamp": "2025-12-28T11:15:42.518003", "elapsed_time": 8256.803106307983, "loss": 0.333, "grad_norm": 0.17225605249404907, "learning_rate": 9.320754716981132e-05, "epoch": 0.5375 }, { "step": 1461, "timestamp": "2025-12-28T11:15:52.109549", "elapsed_time": 8266.394639015198, "loss": 0.1827, "grad_norm": 0.11171706020832062, "learning_rate": 9.308176100628931e-05, "epoch": 0.538125 }, { "step": 1462, "timestamp": "2025-12-28T11:15:59.068953", "elapsed_time": 8273.354043006897, "loss": 0.1926, "grad_norm": 0.13213799893856049, "learning_rate": 9.295597484276731e-05, "epoch": 0.53875 }, { "step": 1463, "timestamp": "2025-12-28T11:16:12.804617", "elapsed_time": 8287.089706897736, "loss": 0.1089, "grad_norm": 0.08467783033847809, "learning_rate": 9.283018867924529e-05, "epoch": 0.539375 }, { "step": 1464, "timestamp": "2025-12-28T11:16:18.706671", "elapsed_time": 8292.991761922836, "loss": 0.4806, "grad_norm": 0.20453688502311707, "learning_rate": 9.270440251572328e-05, "epoch": 0.54 }, { "step": 1465, "timestamp": "2025-12-28T11:16:28.734388", "elapsed_time": 8303.01947760582, "loss": 0.1556, "grad_norm": 0.11547412723302841, "learning_rate": 9.257861635220126e-05, "epoch": 0.540625 }, { "step": 1466, "timestamp": "2025-12-28T11:16:38.462518", "elapsed_time": 8312.74760890007, "loss": 0.1464, "grad_norm": 0.10674792528152466, "learning_rate": 9.245283018867925e-05, "epoch": 0.54125 }, { "step": 1467, "timestamp": "2025-12-28T11:16:49.934266", "elapsed_time": 8324.219356536865, "loss": 0.1205, "grad_norm": 0.10482378304004669, "learning_rate": 9.232704402515724e-05, "epoch": 0.541875 }, { "step": 1468, "timestamp": "2025-12-28T11:16:59.067362", "elapsed_time": 8333.3524518013, "loss": 0.1575, "grad_norm": 0.18603090941905975, "learning_rate": 9.220125786163522e-05, "epoch": 0.5425 }, { "step": 1469, "timestamp": "2025-12-28T11:17:06.628602", "elapsed_time": 8340.913692951202, "loss": 0.2441, "grad_norm": 0.16684553027153015, "learning_rate": 9.207547169811322e-05, "epoch": 0.543125 }, { "step": 1470, "timestamp": "2025-12-28T11:17:16.426551", "elapsed_time": 8350.711641550064, "loss": 0.3306, "grad_norm": 0.14771923422813416, "learning_rate": 9.19496855345912e-05, "epoch": 0.54375 }, { "step": 1471, "timestamp": "2025-12-28T11:17:29.569642", "elapsed_time": 8363.85473227501, "loss": 0.1172, "grad_norm": 0.08360818773508072, "learning_rate": 9.182389937106919e-05, "epoch": 0.544375 }, { "step": 1472, "timestamp": "2025-12-28T11:17:35.910254", "elapsed_time": 8370.195344924927, "loss": 0.2702, "grad_norm": 0.1648559272289276, "learning_rate": 9.169811320754718e-05, "epoch": 0.545 }, { "step": 1473, "timestamp": "2025-12-28T11:17:43.053273", "elapsed_time": 8377.338364124298, "loss": 0.207, "grad_norm": 0.1394606977701187, "learning_rate": 9.157232704402516e-05, "epoch": 0.545625 }, { "step": 1474, "timestamp": "2025-12-28T11:17:49.068090", "elapsed_time": 8383.353180408478, "loss": 0.148, "grad_norm": 0.1300075501203537, "learning_rate": 9.144654088050315e-05, "epoch": 0.54625 }, { "step": 1475, "timestamp": "2025-12-28T11:17:56.519082", "elapsed_time": 8390.804172039032, "loss": 0.1676, "grad_norm": 0.12073966860771179, "learning_rate": 9.132075471698114e-05, "epoch": 0.546875 }, { "step": 1476, "timestamp": "2025-12-28T11:18:08.644421", "elapsed_time": 8402.92951130867, "loss": 0.1639, "grad_norm": 0.10669399052858353, "learning_rate": 9.119496855345912e-05, "epoch": 0.5475 }, { "step": 1477, "timestamp": "2025-12-28T11:18:17.423433", "elapsed_time": 8411.708523750305, "loss": 0.1742, "grad_norm": 0.12219968438148499, "learning_rate": 9.106918238993712e-05, "epoch": 0.548125 }, { "step": 1478, "timestamp": "2025-12-28T11:18:36.177009", "elapsed_time": 8430.462099313736, "loss": 0.1255, "grad_norm": 0.08146153390407562, "learning_rate": 9.09433962264151e-05, "epoch": 0.54875 }, { "step": 1479, "timestamp": "2025-12-28T11:18:46.115780", "elapsed_time": 8440.400870800018, "loss": 0.1592, "grad_norm": 0.1143009215593338, "learning_rate": 9.081761006289309e-05, "epoch": 0.549375 }, { "step": 1480, "timestamp": "2025-12-28T11:18:51.646790", "elapsed_time": 8445.931880950928, "loss": 0.1368, "grad_norm": 0.1322580873966217, "learning_rate": 9.069182389937108e-05, "epoch": 0.55 }, { "step": 1481, "timestamp": "2025-12-28T11:19:03.435806", "elapsed_time": 8457.72089600563, "loss": 0.184, "grad_norm": 0.11221913248300552, "learning_rate": 9.056603773584906e-05, "epoch": 0.550625 }, { "step": 1482, "timestamp": "2025-12-28T11:19:11.463661", "elapsed_time": 8465.74875164032, "loss": 0.2432, "grad_norm": 0.13333527743816376, "learning_rate": 9.044025157232705e-05, "epoch": 0.55125 }, { "step": 1483, "timestamp": "2025-12-28T11:19:20.808185", "elapsed_time": 8475.093275308609, "loss": 0.1595, "grad_norm": 0.11253681033849716, "learning_rate": 9.031446540880504e-05, "epoch": 0.551875 }, { "step": 1484, "timestamp": "2025-12-28T11:19:26.372578", "elapsed_time": 8480.657668352127, "loss": 0.2285, "grad_norm": 0.163836270570755, "learning_rate": 9.018867924528302e-05, "epoch": 0.5525 }, { "step": 1485, "timestamp": "2025-12-28T11:19:37.048053", "elapsed_time": 8491.333143234253, "loss": 0.1388, "grad_norm": 0.1059766411781311, "learning_rate": 9.006289308176102e-05, "epoch": 0.553125 }, { "step": 1486, "timestamp": "2025-12-28T11:19:46.279936", "elapsed_time": 8500.565026283264, "loss": 0.1649, "grad_norm": 0.11681363731622696, "learning_rate": 8.9937106918239e-05, "epoch": 0.55375 }, { "step": 1487, "timestamp": "2025-12-28T11:19:53.538665", "elapsed_time": 8507.823755979538, "loss": 0.2236, "grad_norm": 0.1437806636095047, "learning_rate": 8.981132075471699e-05, "epoch": 0.554375 }, { "step": 1488, "timestamp": "2025-12-28T11:20:00.062395", "elapsed_time": 8514.347485303879, "loss": 0.1624, "grad_norm": 0.13652074337005615, "learning_rate": 8.968553459119498e-05, "epoch": 0.555 }, { "step": 1489, "timestamp": "2025-12-28T11:20:12.801806", "elapsed_time": 8527.08689570427, "loss": 0.1095, "grad_norm": 0.08968634903430939, "learning_rate": 8.955974842767296e-05, "epoch": 0.555625 }, { "step": 1490, "timestamp": "2025-12-28T11:20:19.013698", "elapsed_time": 8533.298788785934, "loss": 0.1724, "grad_norm": 0.16458983719348907, "learning_rate": 8.943396226415095e-05, "epoch": 0.55625 }, { "step": 1491, "timestamp": "2025-12-28T11:20:30.809395", "elapsed_time": 8545.094485759735, "loss": 0.1667, "grad_norm": 0.10606341063976288, "learning_rate": 8.930817610062893e-05, "epoch": 0.556875 }, { "step": 1492, "timestamp": "2025-12-28T11:20:44.762591", "elapsed_time": 8559.047681331635, "loss": 0.1019, "grad_norm": 0.07538938522338867, "learning_rate": 8.918238993710692e-05, "epoch": 0.5575 }, { "step": 1493, "timestamp": "2025-12-28T11:21:01.644690", "elapsed_time": 8575.92978143692, "loss": 0.1099, "grad_norm": 0.08097642660140991, "learning_rate": 8.905660377358492e-05, "epoch": 0.558125 }, { "step": 1494, "timestamp": "2025-12-28T11:21:10.267153", "elapsed_time": 8584.55224275589, "loss": 0.1285, "grad_norm": 0.11686515063047409, "learning_rate": 8.89308176100629e-05, "epoch": 0.55875 }, { "step": 1495, "timestamp": "2025-12-28T11:21:16.174120", "elapsed_time": 8590.459214687347, "loss": 0.143, "grad_norm": 0.1381654292345047, "learning_rate": 8.880503144654089e-05, "epoch": 0.559375 }, { "step": 1496, "timestamp": "2025-12-28T11:21:26.924445", "elapsed_time": 8601.209535360336, "loss": 0.1798, "grad_norm": 0.12096145004034042, "learning_rate": 8.867924528301888e-05, "epoch": 0.56 }, { "step": 1497, "timestamp": "2025-12-28T11:21:34.078375", "elapsed_time": 8608.363465070724, "loss": 0.1584, "grad_norm": 0.13140904903411865, "learning_rate": 8.855345911949686e-05, "epoch": 0.560625 }, { "step": 1498, "timestamp": "2025-12-28T11:21:44.697298", "elapsed_time": 8618.982388973236, "loss": 0.2015, "grad_norm": 0.13744622468948364, "learning_rate": 8.842767295597485e-05, "epoch": 0.56125 }, { "step": 1499, "timestamp": "2025-12-28T11:21:53.321142", "elapsed_time": 8627.60623216629, "loss": 0.1702, "grad_norm": 0.10851988196372986, "learning_rate": 8.830188679245283e-05, "epoch": 0.561875 }, { "step": 1500, "timestamp": "2025-12-28T11:22:01.457244", "elapsed_time": 8635.7423350811, "loss": 0.1723, "grad_norm": 0.123291976749897, "learning_rate": 8.817610062893082e-05, "epoch": 0.5625 }, { "step": 1501, "timestamp": "2025-12-28T11:22:14.727687", "elapsed_time": 8649.012777328491, "loss": 0.1281, "grad_norm": 0.09212938696146011, "learning_rate": 8.805031446540882e-05, "epoch": 0.563125 }, { "step": 1502, "timestamp": "2025-12-28T11:22:31.623445", "elapsed_time": 8665.90853524208, "loss": 0.1026, "grad_norm": 0.0903933122754097, "learning_rate": 8.79245283018868e-05, "epoch": 0.56375 }, { "step": 1503, "timestamp": "2025-12-28T11:22:44.799315", "elapsed_time": 8679.084406137466, "loss": 0.1257, "grad_norm": 0.10182217508554459, "learning_rate": 8.779874213836479e-05, "epoch": 0.564375 }, { "step": 1504, "timestamp": "2025-12-28T11:22:49.619397", "elapsed_time": 8683.904487848282, "loss": 0.3112, "grad_norm": 0.19652864336967468, "learning_rate": 8.767295597484277e-05, "epoch": 0.565 }, { "step": 1505, "timestamp": "2025-12-28T11:22:55.523465", "elapsed_time": 8689.808555364609, "loss": 0.1635, "grad_norm": 0.12541894614696503, "learning_rate": 8.754716981132076e-05, "epoch": 0.565625 }, { "step": 1506, "timestamp": "2025-12-28T11:23:00.308337", "elapsed_time": 8694.593427419662, "loss": 0.2257, "grad_norm": 0.18083442747592926, "learning_rate": 8.742138364779875e-05, "epoch": 0.56625 }, { "step": 1507, "timestamp": "2025-12-28T11:23:05.013413", "elapsed_time": 8699.298503875732, "loss": 0.2438, "grad_norm": 0.19925068318843842, "learning_rate": 8.729559748427673e-05, "epoch": 0.566875 }, { "step": 1508, "timestamp": "2025-12-28T11:23:15.771520", "elapsed_time": 8710.056610822678, "loss": 0.1307, "grad_norm": 0.0917578861117363, "learning_rate": 8.716981132075472e-05, "epoch": 0.5675 }, { "step": 1509, "timestamp": "2025-12-28T11:23:35.259930", "elapsed_time": 8729.545020341873, "loss": 0.1047, "grad_norm": 0.0767156183719635, "learning_rate": 8.704402515723272e-05, "epoch": 0.568125 }, { "step": 1510, "timestamp": "2025-12-28T11:23:53.128246", "elapsed_time": 8747.413336277008, "loss": 0.1, "grad_norm": 0.08321381360292435, "learning_rate": 8.69182389937107e-05, "epoch": 0.56875 }, { "step": 1511, "timestamp": "2025-12-28T11:24:06.430650", "elapsed_time": 8760.715740919113, "loss": 0.1469, "grad_norm": 0.09777943044900894, "learning_rate": 8.679245283018869e-05, "epoch": 0.569375 }, { "step": 1512, "timestamp": "2025-12-28T11:24:17.892332", "elapsed_time": 8772.17742228508, "loss": 0.1326, "grad_norm": 0.10049686580896378, "learning_rate": 8.666666666666667e-05, "epoch": 0.57 }, { "step": 1513, "timestamp": "2025-12-28T11:24:25.872236", "elapsed_time": 8780.157325983047, "loss": 0.1542, "grad_norm": 0.11362040787935257, "learning_rate": 8.654088050314466e-05, "epoch": 0.570625 }, { "step": 1514, "timestamp": "2025-12-28T11:24:30.494130", "elapsed_time": 8784.779220819473, "loss": 0.2623, "grad_norm": 0.1871953308582306, "learning_rate": 8.641509433962265e-05, "epoch": 0.57125 }, { "step": 1515, "timestamp": "2025-12-28T11:24:36.507144", "elapsed_time": 8790.792234420776, "loss": 0.1441, "grad_norm": 0.1314094513654709, "learning_rate": 8.628930817610063e-05, "epoch": 0.571875 }, { "step": 1516, "timestamp": "2025-12-28T11:24:49.493922", "elapsed_time": 8803.779012680054, "loss": 0.1849, "grad_norm": 0.10825859010219574, "learning_rate": 8.616352201257863e-05, "epoch": 0.5725 }, { "step": 1517, "timestamp": "2025-12-28T11:24:56.586838", "elapsed_time": 8810.871928453445, "loss": 0.223, "grad_norm": 0.13489221036434174, "learning_rate": 8.603773584905662e-05, "epoch": 0.573125 }, { "step": 1518, "timestamp": "2025-12-28T11:25:07.311983", "elapsed_time": 8821.597073554993, "loss": 0.1414, "grad_norm": 0.1013222485780716, "learning_rate": 8.59119496855346e-05, "epoch": 0.57375 }, { "step": 1519, "timestamp": "2025-12-28T11:25:19.598691", "elapsed_time": 8833.883781433105, "loss": 0.1759, "grad_norm": 0.10948710888624191, "learning_rate": 8.578616352201259e-05, "epoch": 0.574375 }, { "step": 1520, "timestamp": "2025-12-28T11:25:24.829791", "elapsed_time": 8839.114881277084, "loss": 0.2123, "grad_norm": 0.1585264950990677, "learning_rate": 8.566037735849057e-05, "epoch": 0.575 }, { "step": 1521, "timestamp": "2025-12-28T11:25:31.895164", "elapsed_time": 8846.180253505707, "loss": 0.2303, "grad_norm": 0.15961548686027527, "learning_rate": 8.553459119496856e-05, "epoch": 0.575625 }, { "step": 1522, "timestamp": "2025-12-28T11:25:41.238568", "elapsed_time": 8855.523658752441, "loss": 0.2164, "grad_norm": 0.12455689162015915, "learning_rate": 8.540880503144655e-05, "epoch": 0.57625 }, { "step": 1523, "timestamp": "2025-12-28T11:26:00.476163", "elapsed_time": 8874.76125407219, "loss": 0.0883, "grad_norm": 0.06583801656961441, "learning_rate": 8.528301886792453e-05, "epoch": 0.576875 }, { "step": 1524, "timestamp": "2025-12-28T11:26:06.460351", "elapsed_time": 8880.745441198349, "loss": 0.196, "grad_norm": 0.14068111777305603, "learning_rate": 8.515723270440253e-05, "epoch": 0.5775 }, { "step": 1525, "timestamp": "2025-12-28T11:26:20.362662", "elapsed_time": 8894.647752046585, "loss": 0.125, "grad_norm": 0.08721217513084412, "learning_rate": 8.50314465408805e-05, "epoch": 0.578125 }, { "step": 1526, "timestamp": "2025-12-28T11:26:32.991352", "elapsed_time": 8907.276442289352, "loss": 0.1126, "grad_norm": 0.08798250555992126, "learning_rate": 8.49056603773585e-05, "epoch": 0.57875 }, { "step": 1527, "timestamp": "2025-12-28T11:26:40.116742", "elapsed_time": 8914.401833057404, "loss": 0.1558, "grad_norm": 0.12924781441688538, "learning_rate": 8.477987421383649e-05, "epoch": 0.579375 }, { "step": 1528, "timestamp": "2025-12-28T11:26:53.096204", "elapsed_time": 8927.381293773651, "loss": 0.1997, "grad_norm": 0.11501215398311615, "learning_rate": 8.465408805031447e-05, "epoch": 0.58 }, { "step": 1529, "timestamp": "2025-12-28T11:27:01.938294", "elapsed_time": 8936.223383903503, "loss": 0.1264, "grad_norm": 0.1118590384721756, "learning_rate": 8.452830188679246e-05, "epoch": 0.580625 }, { "step": 1530, "timestamp": "2025-12-28T11:27:13.567330", "elapsed_time": 8947.852420091629, "loss": 0.1195, "grad_norm": 0.08885247260332108, "learning_rate": 8.440251572327045e-05, "epoch": 0.58125 }, { "step": 1531, "timestamp": "2025-12-28T11:27:28.081330", "elapsed_time": 8962.366420507431, "loss": 0.1682, "grad_norm": 0.10244069248437881, "learning_rate": 8.427672955974843e-05, "epoch": 0.581875 }, { "step": 1532, "timestamp": "2025-12-28T11:27:35.182126", "elapsed_time": 8969.46721625328, "loss": 0.2695, "grad_norm": 0.14979572594165802, "learning_rate": 8.415094339622643e-05, "epoch": 0.5825 }, { "step": 1533, "timestamp": "2025-12-28T11:27:43.626751", "elapsed_time": 8977.911841392517, "loss": 0.3527, "grad_norm": 0.2613980174064636, "learning_rate": 8.40251572327044e-05, "epoch": 0.583125 }, { "step": 1534, "timestamp": "2025-12-28T11:28:04.513663", "elapsed_time": 8998.798753499985, "loss": 0.1256, "grad_norm": 0.09623493254184723, "learning_rate": 8.38993710691824e-05, "epoch": 0.58375 }, { "step": 1535, "timestamp": "2025-12-28T11:28:10.502220", "elapsed_time": 9004.787314653397, "loss": 0.1629, "grad_norm": 0.19102250039577484, "learning_rate": 8.377358490566039e-05, "epoch": 0.584375 }, { "step": 1536, "timestamp": "2025-12-28T11:28:22.123196", "elapsed_time": 9016.40828704834, "loss": 0.1249, "grad_norm": 0.2857312262058258, "learning_rate": 8.364779874213837e-05, "epoch": 0.585 }, { "step": 1537, "timestamp": "2025-12-28T11:28:33.969444", "elapsed_time": 9028.254534721375, "loss": 0.1436, "grad_norm": 0.0983673706650734, "learning_rate": 8.352201257861636e-05, "epoch": 0.585625 }, { "step": 1538, "timestamp": "2025-12-28T11:28:40.851432", "elapsed_time": 9035.136521816254, "loss": 0.2335, "grad_norm": 0.15554088354110718, "learning_rate": 8.339622641509434e-05, "epoch": 0.58625 }, { "step": 1539, "timestamp": "2025-12-28T11:28:49.945462", "elapsed_time": 9044.230551958084, "loss": 0.1401, "grad_norm": 0.11018933355808258, "learning_rate": 8.327044025157233e-05, "epoch": 0.586875 }, { "step": 1540, "timestamp": "2025-12-28T11:28:58.168763", "elapsed_time": 9052.45385313034, "loss": 0.1303, "grad_norm": 0.1073099821805954, "learning_rate": 8.314465408805033e-05, "epoch": 0.5875 }, { "step": 1541, "timestamp": "2025-12-28T11:29:12.414620", "elapsed_time": 9066.699710607529, "loss": 0.1614, "grad_norm": 0.10494954884052277, "learning_rate": 8.30188679245283e-05, "epoch": 0.588125 }, { "step": 1542, "timestamp": "2025-12-28T11:29:20.721392", "elapsed_time": 9075.006482601166, "loss": 0.1503, "grad_norm": 0.11659257858991623, "learning_rate": 8.28930817610063e-05, "epoch": 0.58875 }, { "step": 1543, "timestamp": "2025-12-28T11:29:33.420507", "elapsed_time": 9087.705597639084, "loss": 0.1244, "grad_norm": 0.09270057827234268, "learning_rate": 8.276729559748429e-05, "epoch": 0.589375 }, { "step": 1544, "timestamp": "2025-12-28T11:29:42.359077", "elapsed_time": 9096.64416718483, "loss": 0.1409, "grad_norm": 0.11551465094089508, "learning_rate": 8.264150943396227e-05, "epoch": 0.59 }, { "step": 1545, "timestamp": "2025-12-28T11:29:58.755264", "elapsed_time": 9113.040354728699, "loss": 0.1204, "grad_norm": 0.09296566992998123, "learning_rate": 8.251572327044026e-05, "epoch": 0.590625 }, { "step": 1546, "timestamp": "2025-12-28T11:30:04.541006", "elapsed_time": 9118.826096773148, "loss": 0.1537, "grad_norm": 0.33984237909317017, "learning_rate": 8.238993710691824e-05, "epoch": 0.59125 }, { "step": 1547, "timestamp": "2025-12-28T11:30:15.055486", "elapsed_time": 9129.340576410294, "loss": 0.1195, "grad_norm": 0.11354348808526993, "learning_rate": 8.226415094339623e-05, "epoch": 0.591875 }, { "step": 1548, "timestamp": "2025-12-28T11:30:20.580712", "elapsed_time": 9134.865801811218, "loss": 0.2554, "grad_norm": 0.17498965561389923, "learning_rate": 8.213836477987423e-05, "epoch": 0.5925 }, { "step": 1549, "timestamp": "2025-12-28T11:30:25.852317", "elapsed_time": 9140.137407064438, "loss": 0.2915, "grad_norm": 0.17898206412792206, "learning_rate": 8.20125786163522e-05, "epoch": 0.593125 }, { "step": 1550, "timestamp": "2025-12-28T11:30:37.662920", "elapsed_time": 9151.948010444641, "loss": 0.2806, "grad_norm": 0.14019358158111572, "learning_rate": 8.18867924528302e-05, "epoch": 0.59375 }, { "step": 1551, "timestamp": "2025-12-28T11:30:45.929046", "elapsed_time": 9160.214137077332, "loss": 0.171, "grad_norm": 0.12632305920124054, "learning_rate": 8.176100628930818e-05, "epoch": 0.594375 }, { "step": 1552, "timestamp": "2025-12-28T11:30:56.181321", "elapsed_time": 9170.466411352158, "loss": 0.1839, "grad_norm": 0.11256668716669083, "learning_rate": 8.163522012578617e-05, "epoch": 0.595 }, { "step": 1553, "timestamp": "2025-12-28T11:31:07.963905", "elapsed_time": 9182.248995542526, "loss": 0.325, "grad_norm": 0.1411469578742981, "learning_rate": 8.150943396226416e-05, "epoch": 0.595625 }, { "step": 1554, "timestamp": "2025-12-28T11:31:15.818950", "elapsed_time": 9190.104040145874, "loss": 0.2134, "grad_norm": 0.157893106341362, "learning_rate": 8.138364779874214e-05, "epoch": 0.59625 }, { "step": 1555, "timestamp": "2025-12-28T11:31:22.342648", "elapsed_time": 9196.627738714218, "loss": 0.1857, "grad_norm": 0.12139203399419785, "learning_rate": 8.125786163522013e-05, "epoch": 0.596875 }, { "step": 1556, "timestamp": "2025-12-28T11:31:32.092816", "elapsed_time": 9206.37790632248, "loss": 0.1616, "grad_norm": 0.10829068720340729, "learning_rate": 8.113207547169813e-05, "epoch": 0.5975 }, { "step": 1557, "timestamp": "2025-12-28T11:31:38.204155", "elapsed_time": 9212.489244699478, "loss": 0.2388, "grad_norm": 0.15463301539421082, "learning_rate": 8.10062893081761e-05, "epoch": 0.598125 }, { "step": 1558, "timestamp": "2025-12-28T11:31:46.978385", "elapsed_time": 9221.263475179672, "loss": 0.1431, "grad_norm": 0.1963123232126236, "learning_rate": 8.08805031446541e-05, "epoch": 0.59875 }, { "step": 1559, "timestamp": "2025-12-28T11:31:51.937096", "elapsed_time": 9226.222190618515, "loss": 0.2025, "grad_norm": 0.15821681916713715, "learning_rate": 8.075471698113208e-05, "epoch": 0.599375 }, { "step": 1560, "timestamp": "2025-12-28T11:32:04.000430", "elapsed_time": 9238.285520553589, "loss": 0.3795, "grad_norm": 0.1718837171792984, "learning_rate": 8.062893081761007e-05, "epoch": 0.6 }, { "step": 1561, "timestamp": "2025-12-28T11:32:08.448180", "elapsed_time": 9242.73327088356, "loss": 0.2619, "grad_norm": 0.19040848314762115, "learning_rate": 8.050314465408806e-05, "epoch": 0.600625 }, { "step": 1562, "timestamp": "2025-12-28T11:32:15.075715", "elapsed_time": 9249.360805511475, "loss": 0.1528, "grad_norm": 0.14181895554065704, "learning_rate": 8.037735849056604e-05, "epoch": 0.60125 }, { "step": 1563, "timestamp": "2025-12-28T11:32:24.698558", "elapsed_time": 9258.983648777008, "loss": 0.1461, "grad_norm": 0.11014454066753387, "learning_rate": 8.025157232704403e-05, "epoch": 0.601875 }, { "step": 1564, "timestamp": "2025-12-28T11:32:29.643873", "elapsed_time": 9263.928963661194, "loss": 0.3451, "grad_norm": 0.19026674330234528, "learning_rate": 8.012578616352203e-05, "epoch": 0.6025 }, { "step": 1565, "timestamp": "2025-12-28T11:32:36.118502", "elapsed_time": 9270.403592348099, "loss": 0.2698, "grad_norm": 0.15855702757835388, "learning_rate": 8e-05, "epoch": 0.603125 }, { "step": 1566, "timestamp": "2025-12-28T11:32:47.584312", "elapsed_time": 9281.869402885437, "loss": 0.1317, "grad_norm": 0.10985270142555237, "learning_rate": 7.9874213836478e-05, "epoch": 0.60375 }, { "step": 1567, "timestamp": "2025-12-28T11:32:57.951733", "elapsed_time": 9292.236823320389, "loss": 0.1344, "grad_norm": 0.09886456280946732, "learning_rate": 7.974842767295598e-05, "epoch": 0.604375 }, { "step": 1568, "timestamp": "2025-12-28T11:33:08.832307", "elapsed_time": 9303.117396831512, "loss": 0.3551, "grad_norm": 0.13379919528961182, "learning_rate": 7.962264150943397e-05, "epoch": 0.605 }, { "step": 1569, "timestamp": "2025-12-28T11:33:15.380680", "elapsed_time": 9309.665770292282, "loss": 0.4387, "grad_norm": 0.20473594963550568, "learning_rate": 7.949685534591196e-05, "epoch": 0.605625 }, { "step": 1570, "timestamp": "2025-12-28T11:33:28.420603", "elapsed_time": 9322.705693244934, "loss": 0.1534, "grad_norm": 0.09838785231113434, "learning_rate": 7.937106918238994e-05, "epoch": 0.60625 }, { "step": 1571, "timestamp": "2025-12-28T11:33:36.492152", "elapsed_time": 9330.777242422104, "loss": 0.4969, "grad_norm": 0.19386343657970428, "learning_rate": 7.924528301886794e-05, "epoch": 0.606875 }, { "step": 1572, "timestamp": "2025-12-28T11:33:44.673340", "elapsed_time": 9338.958430051804, "loss": 0.1638, "grad_norm": 0.1306898593902588, "learning_rate": 7.911949685534591e-05, "epoch": 0.6075 }, { "step": 1573, "timestamp": "2025-12-28T11:33:49.969537", "elapsed_time": 9344.254627227783, "loss": 0.1831, "grad_norm": 0.1826959103345871, "learning_rate": 7.899371069182391e-05, "epoch": 0.608125 }, { "step": 1574, "timestamp": "2025-12-28T11:33:57.344461", "elapsed_time": 9351.629550933838, "loss": 0.1919, "grad_norm": 0.1459428369998932, "learning_rate": 7.88679245283019e-05, "epoch": 0.60875 }, { "step": 1575, "timestamp": "2025-12-28T11:34:07.601020", "elapsed_time": 9361.886110544205, "loss": 0.1914, "grad_norm": 0.11683019995689392, "learning_rate": 7.874213836477988e-05, "epoch": 0.609375 }, { "step": 1576, "timestamp": "2025-12-28T11:34:12.734692", "elapsed_time": 9367.019781827927, "loss": 0.162, "grad_norm": 0.14056500792503357, "learning_rate": 7.861635220125787e-05, "epoch": 0.61 }, { "step": 1577, "timestamp": "2025-12-28T11:34:30.402188", "elapsed_time": 9384.687278032303, "loss": 0.1201, "grad_norm": 0.08820690959692001, "learning_rate": 7.849056603773586e-05, "epoch": 0.610625 }, { "step": 1578, "timestamp": "2025-12-28T11:34:39.188114", "elapsed_time": 9393.473204135895, "loss": 0.2578, "grad_norm": 0.14672920107841492, "learning_rate": 7.836477987421384e-05, "epoch": 0.61125 }, { "step": 1579, "timestamp": "2025-12-28T11:34:50.013659", "elapsed_time": 9404.298749446869, "loss": 0.1506, "grad_norm": 0.11653705686330795, "learning_rate": 7.823899371069184e-05, "epoch": 0.611875 }, { "step": 1580, "timestamp": "2025-12-28T11:34:54.799485", "elapsed_time": 9409.084575176239, "loss": 0.1957, "grad_norm": 0.1690455675125122, "learning_rate": 7.811320754716981e-05, "epoch": 0.6125 }, { "step": 1581, "timestamp": "2025-12-28T11:34:58.274219", "elapsed_time": 9412.559309482574, "loss": 0.3539, "grad_norm": 0.24397805333137512, "learning_rate": 7.798742138364781e-05, "epoch": 0.613125 }, { "step": 1582, "timestamp": "2025-12-28T11:35:05.235925", "elapsed_time": 9419.521015405655, "loss": 0.3798, "grad_norm": 0.22178561985492706, "learning_rate": 7.78616352201258e-05, "epoch": 0.61375 }, { "step": 1583, "timestamp": "2025-12-28T11:35:13.675308", "elapsed_time": 9427.960398435593, "loss": 0.2782, "grad_norm": 0.14682307839393616, "learning_rate": 7.773584905660378e-05, "epoch": 0.614375 }, { "step": 1584, "timestamp": "2025-12-28T11:35:27.622590", "elapsed_time": 9441.907680273056, "loss": 0.1162, "grad_norm": 0.08615633100271225, "learning_rate": 7.761006289308177e-05, "epoch": 0.615 }, { "step": 1585, "timestamp": "2025-12-28T11:35:44.742170", "elapsed_time": 9459.02725982666, "loss": 0.0976, "grad_norm": 0.07224409282207489, "learning_rate": 7.748427672955975e-05, "epoch": 0.615625 }, { "step": 1586, "timestamp": "2025-12-28T11:35:56.023997", "elapsed_time": 9470.30908703804, "loss": 0.1516, "grad_norm": 0.1125619113445282, "learning_rate": 7.735849056603774e-05, "epoch": 0.61625 }, { "step": 1587, "timestamp": "2025-12-28T11:36:07.646063", "elapsed_time": 9481.931153059006, "loss": 0.1575, "grad_norm": 0.12028831243515015, "learning_rate": 7.723270440251574e-05, "epoch": 0.616875 }, { "step": 1588, "timestamp": "2025-12-28T11:36:25.636674", "elapsed_time": 9499.92176437378, "loss": 0.1307, "grad_norm": 0.08736097812652588, "learning_rate": 7.710691823899372e-05, "epoch": 0.6175 }, { "step": 1589, "timestamp": "2025-12-28T11:36:35.410727", "elapsed_time": 9509.69581747055, "loss": 0.2209, "grad_norm": 0.21880212426185608, "learning_rate": 7.698113207547171e-05, "epoch": 0.618125 }, { "step": 1590, "timestamp": "2025-12-28T11:36:47.358344", "elapsed_time": 9521.643434047699, "loss": 0.3212, "grad_norm": 0.13422144949436188, "learning_rate": 7.68553459119497e-05, "epoch": 0.61875 }, { "step": 1591, "timestamp": "2025-12-28T11:36:56.658149", "elapsed_time": 9530.943238973618, "loss": 0.3681, "grad_norm": 0.2638307511806488, "learning_rate": 7.672955974842768e-05, "epoch": 0.619375 }, { "step": 1592, "timestamp": "2025-12-28T11:37:13.127742", "elapsed_time": 9547.41283249855, "loss": 0.0943, "grad_norm": 0.08068544417619705, "learning_rate": 7.660377358490567e-05, "epoch": 0.62 }, { "step": 1593, "timestamp": "2025-12-28T11:37:21.962911", "elapsed_time": 9556.248000860214, "loss": 0.183, "grad_norm": 0.1172947883605957, "learning_rate": 7.647798742138365e-05, "epoch": 0.620625 }, { "step": 1594, "timestamp": "2025-12-28T11:37:32.441106", "elapsed_time": 9566.726195812225, "loss": 0.2204, "grad_norm": 0.09986839443445206, "learning_rate": 7.635220125786164e-05, "epoch": 0.62125 }, { "step": 1595, "timestamp": "2025-12-28T11:37:38.220326", "elapsed_time": 9572.505415916443, "loss": 0.2831, "grad_norm": 0.1995793581008911, "learning_rate": 7.622641509433964e-05, "epoch": 0.621875 }, { "step": 1596, "timestamp": "2025-12-28T11:37:48.926748", "elapsed_time": 9583.21183848381, "loss": 0.1441, "grad_norm": 0.09849265217781067, "learning_rate": 7.610062893081762e-05, "epoch": 0.6225 }, { "step": 1597, "timestamp": "2025-12-28T11:37:59.937984", "elapsed_time": 9594.223074674606, "loss": 0.1331, "grad_norm": 0.11908537149429321, "learning_rate": 7.597484276729561e-05, "epoch": 0.623125 }, { "step": 1598, "timestamp": "2025-12-28T11:38:08.883234", "elapsed_time": 9603.168324947357, "loss": 0.1916, "grad_norm": 0.12535853683948517, "learning_rate": 7.584905660377359e-05, "epoch": 0.62375 }, { "step": 1599, "timestamp": "2025-12-28T11:38:17.822928", "elapsed_time": 9612.108018875122, "loss": 0.4302, "grad_norm": 0.21333205699920654, "learning_rate": 7.572327044025158e-05, "epoch": 0.624375 }, { "step": 1600, "timestamp": "2025-12-28T11:38:28.592143", "elapsed_time": 9622.877233028412, "loss": 0.1339, "grad_norm": 0.10051511973142624, "learning_rate": 7.559748427672957e-05, "epoch": 0.625 }, { "step": 1601, "timestamp": "2025-12-28T11:38:39.199999", "elapsed_time": 9633.485090017319, "loss": 0.1352, "grad_norm": 0.11261122673749924, "learning_rate": 7.547169811320755e-05, "epoch": 0.625625 }, { "step": 1602, "timestamp": "2025-12-28T11:38:46.300157", "elapsed_time": 9640.58524775505, "loss": 0.1886, "grad_norm": 0.12923336029052734, "learning_rate": 7.534591194968554e-05, "epoch": 0.62625 }, { "step": 1603, "timestamp": "2025-12-28T11:38:55.288169", "elapsed_time": 9649.5732588768, "loss": 0.1616, "grad_norm": 0.11907650530338287, "learning_rate": 7.522012578616354e-05, "epoch": 0.626875 }, { "step": 1604, "timestamp": "2025-12-28T11:39:04.668887", "elapsed_time": 9658.953977823257, "loss": 0.1443, "grad_norm": 0.11075244098901749, "learning_rate": 7.509433962264152e-05, "epoch": 0.6275 }, { "step": 1605, "timestamp": "2025-12-28T11:39:10.754759", "elapsed_time": 9665.039849996567, "loss": 0.2587, "grad_norm": 0.2336777150630951, "learning_rate": 7.49685534591195e-05, "epoch": 0.628125 }, { "step": 1606, "timestamp": "2025-12-28T11:39:18.045720", "elapsed_time": 9672.330810546875, "loss": 0.2099, "grad_norm": 0.14048390090465546, "learning_rate": 7.484276729559749e-05, "epoch": 0.62875 }, { "step": 1607, "timestamp": "2025-12-28T11:39:26.880911", "elapsed_time": 9681.166000843048, "loss": 0.1913, "grad_norm": 0.12840668857097626, "learning_rate": 7.471698113207547e-05, "epoch": 0.629375 }, { "step": 1608, "timestamp": "2025-12-28T11:39:38.404229", "elapsed_time": 9692.689319133759, "loss": 0.1631, "grad_norm": 0.17724475264549255, "learning_rate": 7.459119496855346e-05, "epoch": 0.63 }, { "step": 1609, "timestamp": "2025-12-28T11:39:46.668636", "elapsed_time": 9700.95373082161, "loss": 0.1678, "grad_norm": 0.12339440733194351, "learning_rate": 7.446540880503144e-05, "epoch": 0.630625 }, { "step": 1610, "timestamp": "2025-12-28T11:39:55.140506", "elapsed_time": 9709.425596952438, "loss": 0.3796, "grad_norm": 0.15686391294002533, "learning_rate": 7.433962264150943e-05, "epoch": 0.63125 }, { "step": 1611, "timestamp": "2025-12-28T11:40:03.202851", "elapsed_time": 9717.487941265106, "loss": 0.1415, "grad_norm": 0.1224021390080452, "learning_rate": 7.421383647798742e-05, "epoch": 0.631875 }, { "step": 1612, "timestamp": "2025-12-28T11:40:10.163533", "elapsed_time": 9724.44862318039, "loss": 0.1966, "grad_norm": 0.17393508553504944, "learning_rate": 7.40880503144654e-05, "epoch": 0.6325 }, { "step": 1613, "timestamp": "2025-12-28T11:40:24.152425", "elapsed_time": 9738.437515258789, "loss": 0.1186, "grad_norm": 0.08472223579883575, "learning_rate": 7.39622641509434e-05, "epoch": 0.633125 }, { "step": 1614, "timestamp": "2025-12-28T11:40:45.041966", "elapsed_time": 9759.327056407928, "loss": 0.1331, "grad_norm": 0.08091577142477036, "learning_rate": 7.383647798742139e-05, "epoch": 0.63375 }, { "step": 1615, "timestamp": "2025-12-28T11:40:52.221602", "elapsed_time": 9766.506692886353, "loss": 0.1571, "grad_norm": 0.12333718687295914, "learning_rate": 7.371069182389937e-05, "epoch": 0.634375 }, { "step": 1616, "timestamp": "2025-12-28T11:41:02.877788", "elapsed_time": 9777.162879228592, "loss": 0.1733, "grad_norm": 0.12430380284786224, "learning_rate": 7.358490566037736e-05, "epoch": 0.635 }, { "step": 1617, "timestamp": "2025-12-28T11:41:11.012938", "elapsed_time": 9785.298028707504, "loss": 0.1672, "grad_norm": 0.12017210572957993, "learning_rate": 7.345911949685534e-05, "epoch": 0.635625 }, { "step": 1618, "timestamp": "2025-12-28T11:41:19.447939", "elapsed_time": 9793.7330327034, "loss": 0.1954, "grad_norm": 0.14666104316711426, "learning_rate": 7.333333333333333e-05, "epoch": 0.63625 }, { "step": 1619, "timestamp": "2025-12-28T11:41:30.321982", "elapsed_time": 9804.607072114944, "loss": 0.1235, "grad_norm": 0.0968768298625946, "learning_rate": 7.320754716981132e-05, "epoch": 0.636875 }, { "step": 1620, "timestamp": "2025-12-28T11:41:43.576219", "elapsed_time": 9817.86130952835, "loss": 0.1528, "grad_norm": 0.11526031047105789, "learning_rate": 7.30817610062893e-05, "epoch": 0.6375 }, { "step": 1621, "timestamp": "2025-12-28T11:41:51.883719", "elapsed_time": 9826.16880941391, "loss": 0.2519, "grad_norm": 0.13182447850704193, "learning_rate": 7.29559748427673e-05, "epoch": 0.638125 }, { "step": 1622, "timestamp": "2025-12-28T11:42:01.783503", "elapsed_time": 9836.068593025208, "loss": 0.18, "grad_norm": 0.11508210748434067, "learning_rate": 7.283018867924527e-05, "epoch": 0.63875 }, { "step": 1623, "timestamp": "2025-12-28T11:42:08.222492", "elapsed_time": 9842.507582426071, "loss": 0.1862, "grad_norm": 0.14239539206027985, "learning_rate": 7.270440251572327e-05, "epoch": 0.639375 }, { "step": 1624, "timestamp": "2025-12-28T11:42:16.155646", "elapsed_time": 9850.440736532211, "loss": 0.3164, "grad_norm": 0.16672523319721222, "learning_rate": 7.257861635220126e-05, "epoch": 0.64 }, { "step": 1625, "timestamp": "2025-12-28T11:42:22.465387", "elapsed_time": 9856.750477075577, "loss": 0.1875, "grad_norm": 0.13851501047611237, "learning_rate": 7.245283018867924e-05, "epoch": 0.640625 }, { "step": 1626, "timestamp": "2025-12-28T11:42:43.343959", "elapsed_time": 9877.629049301147, "loss": 0.1101, "grad_norm": 0.07382085174322128, "learning_rate": 7.232704402515723e-05, "epoch": 0.64125 }, { "step": 1627, "timestamp": "2025-12-28T11:42:48.836245", "elapsed_time": 9883.121335983276, "loss": 0.3851, "grad_norm": 0.19532591104507446, "learning_rate": 7.220125786163522e-05, "epoch": 0.641875 }, { "step": 1628, "timestamp": "2025-12-28T11:42:54.575703", "elapsed_time": 9888.860793590546, "loss": 0.226, "grad_norm": 0.1721881628036499, "learning_rate": 7.20754716981132e-05, "epoch": 0.6425 }, { "step": 1629, "timestamp": "2025-12-28T11:43:02.309918", "elapsed_time": 9896.595008134842, "loss": 0.1805, "grad_norm": 0.12694233655929565, "learning_rate": 7.19496855345912e-05, "epoch": 0.643125 }, { "step": 1630, "timestamp": "2025-12-28T11:43:10.240464", "elapsed_time": 9904.52555346489, "loss": 0.1923, "grad_norm": 0.137771874666214, "learning_rate": 7.182389937106918e-05, "epoch": 0.64375 }, { "step": 1631, "timestamp": "2025-12-28T11:43:14.942263", "elapsed_time": 9909.227353334427, "loss": 0.3536, "grad_norm": 0.19589699804782867, "learning_rate": 7.169811320754717e-05, "epoch": 0.644375 }, { "step": 1632, "timestamp": "2025-12-28T11:43:20.961561", "elapsed_time": 9915.2466506958, "loss": 0.1823, "grad_norm": 0.14660844206809998, "learning_rate": 7.157232704402516e-05, "epoch": 0.645 }, { "step": 1633, "timestamp": "2025-12-28T11:43:25.350571", "elapsed_time": 9919.635661840439, "loss": 0.22, "grad_norm": 0.18052536249160767, "learning_rate": 7.144654088050314e-05, "epoch": 0.645625 }, { "step": 1634, "timestamp": "2025-12-28T11:43:34.930070", "elapsed_time": 9929.215160131454, "loss": 0.1316, "grad_norm": 0.10462430864572525, "learning_rate": 7.132075471698113e-05, "epoch": 0.64625 }, { "step": 1635, "timestamp": "2025-12-28T11:43:43.559647", "elapsed_time": 9937.844737291336, "loss": 0.1756, "grad_norm": 0.126973494887352, "learning_rate": 7.119496855345912e-05, "epoch": 0.646875 }, { "step": 1636, "timestamp": "2025-12-28T11:43:51.704679", "elapsed_time": 9945.989769220352, "loss": 0.1654, "grad_norm": 0.14970456063747406, "learning_rate": 7.10691823899371e-05, "epoch": 0.6475 }, { "step": 1637, "timestamp": "2025-12-28T11:44:10.864475", "elapsed_time": 9965.149565935135, "loss": 0.1098, "grad_norm": 0.08534051477909088, "learning_rate": 7.09433962264151e-05, "epoch": 0.648125 }, { "step": 1638, "timestamp": "2025-12-28T11:44:24.042231", "elapsed_time": 9978.327321767807, "loss": 0.1609, "grad_norm": 0.10151031613349915, "learning_rate": 7.081761006289308e-05, "epoch": 0.64875 }, { "step": 1639, "timestamp": "2025-12-28T11:44:33.412608", "elapsed_time": 9987.697702884674, "loss": 0.1008, "grad_norm": 0.09468650072813034, "learning_rate": 7.069182389937107e-05, "epoch": 0.649375 }, { "step": 1640, "timestamp": "2025-12-28T11:44:42.835568", "elapsed_time": 9997.120658397675, "loss": 0.1202, "grad_norm": 0.1058143749833107, "learning_rate": 7.056603773584906e-05, "epoch": 0.65 }, { "step": 1641, "timestamp": "2025-12-28T11:44:51.467264", "elapsed_time": 10005.752353906631, "loss": 0.1627, "grad_norm": 0.12414304912090302, "learning_rate": 7.044025157232704e-05, "epoch": 0.650625 }, { "step": 1642, "timestamp": "2025-12-28T11:45:00.450823", "elapsed_time": 10014.735912799835, "loss": 0.1949, "grad_norm": 0.13407202064990997, "learning_rate": 7.031446540880503e-05, "epoch": 0.65125 }, { "step": 1643, "timestamp": "2025-12-28T11:45:09.542076", "elapsed_time": 10023.827166318893, "loss": 0.1587, "grad_norm": 0.19232605397701263, "learning_rate": 7.018867924528301e-05, "epoch": 0.651875 }, { "step": 1644, "timestamp": "2025-12-28T11:45:17.892732", "elapsed_time": 10032.177821874619, "loss": 0.1543, "grad_norm": 0.12963691353797913, "learning_rate": 7.0062893081761e-05, "epoch": 0.6525 }, { "step": 1645, "timestamp": "2025-12-28T11:45:29.559165", "elapsed_time": 10043.844255447388, "loss": 0.1364, "grad_norm": 0.09772875905036926, "learning_rate": 6.9937106918239e-05, "epoch": 0.653125 }, { "step": 1646, "timestamp": "2025-12-28T11:45:34.381174", "elapsed_time": 10048.66626906395, "loss": 0.2831, "grad_norm": 0.20503878593444824, "learning_rate": 6.981132075471698e-05, "epoch": 0.65375 }, { "step": 1647, "timestamp": "2025-12-28T11:45:38.491344", "elapsed_time": 10052.77643442154, "loss": 0.4374, "grad_norm": 0.20627865195274353, "learning_rate": 6.968553459119497e-05, "epoch": 0.654375 }, { "step": 1648, "timestamp": "2025-12-28T11:45:44.518475", "elapsed_time": 10058.803565263748, "loss": 0.1858, "grad_norm": 0.16573020815849304, "learning_rate": 6.955974842767296e-05, "epoch": 0.655 }, { "step": 1649, "timestamp": "2025-12-28T11:46:05.413855", "elapsed_time": 10079.698945760727, "loss": 0.0942, "grad_norm": 0.07246656715869904, "learning_rate": 6.943396226415094e-05, "epoch": 0.655625 }, { "step": 1650, "timestamp": "2025-12-28T11:46:24.467732", "elapsed_time": 10098.752822637558, "loss": 0.1091, "grad_norm": 0.09009367972612381, "learning_rate": 6.930817610062893e-05, "epoch": 0.65625 }, { "step": 1651, "timestamp": "2025-12-28T11:46:33.886132", "elapsed_time": 10108.17122220993, "loss": 0.2347, "grad_norm": 0.12859852612018585, "learning_rate": 6.918238993710691e-05, "epoch": 0.656875 }, { "step": 1652, "timestamp": "2025-12-28T11:46:42.823278", "elapsed_time": 10117.108368635178, "loss": 0.1737, "grad_norm": 0.11088800430297852, "learning_rate": 6.90566037735849e-05, "epoch": 0.6575 }, { "step": 1653, "timestamp": "2025-12-28T11:46:54.658395", "elapsed_time": 10128.943485021591, "loss": 0.1427, "grad_norm": 0.10402551293373108, "learning_rate": 6.89308176100629e-05, "epoch": 0.658125 }, { "step": 1654, "timestamp": "2025-12-28T11:47:03.630914", "elapsed_time": 10137.916004419327, "loss": 0.1378, "grad_norm": 0.11617520451545715, "learning_rate": 6.880503144654088e-05, "epoch": 0.65875 }, { "step": 1655, "timestamp": "2025-12-28T11:47:13.389780", "elapsed_time": 10147.67487025261, "loss": 0.2234, "grad_norm": 0.11303507536649704, "learning_rate": 6.867924528301887e-05, "epoch": 0.659375 }, { "step": 1656, "timestamp": "2025-12-28T11:47:26.010568", "elapsed_time": 10160.29565834999, "loss": 0.2006, "grad_norm": 0.11115586757659912, "learning_rate": 6.855345911949685e-05, "epoch": 0.66 }, { "step": 1657, "timestamp": "2025-12-28T11:47:33.149623", "elapsed_time": 10167.434713602066, "loss": 0.4581, "grad_norm": 0.19294337928295135, "learning_rate": 6.842767295597484e-05, "epoch": 0.660625 }, { "step": 1658, "timestamp": "2025-12-28T11:47:43.773293", "elapsed_time": 10178.058383464813, "loss": 0.197, "grad_norm": 0.1200195699930191, "learning_rate": 6.830188679245283e-05, "epoch": 0.66125 }, { "step": 1659, "timestamp": "2025-12-28T11:47:58.867153", "elapsed_time": 10193.152243375778, "loss": 0.1369, "grad_norm": 0.09150442481040955, "learning_rate": 6.817610062893081e-05, "epoch": 0.661875 }, { "step": 1660, "timestamp": "2025-12-28T11:48:11.905738", "elapsed_time": 10206.190828084946, "loss": 0.1244, "grad_norm": 0.09038378298282623, "learning_rate": 6.80503144654088e-05, "epoch": 0.6625 }, { "step": 1661, "timestamp": "2025-12-28T11:48:31.524146", "elapsed_time": 10225.80923986435, "loss": 0.1345, "grad_norm": 0.08619663864374161, "learning_rate": 6.79245283018868e-05, "epoch": 0.663125 }, { "step": 1662, "timestamp": "2025-12-28T11:48:46.868927", "elapsed_time": 10241.154017448425, "loss": 0.1112, "grad_norm": 0.08081649988889694, "learning_rate": 6.779874213836478e-05, "epoch": 0.66375 }, { "step": 1663, "timestamp": "2025-12-28T11:48:56.461538", "elapsed_time": 10250.746627807617, "loss": 0.1567, "grad_norm": 0.12073066085577011, "learning_rate": 6.767295597484277e-05, "epoch": 0.664375 }, { "step": 1664, "timestamp": "2025-12-28T11:49:01.469922", "elapsed_time": 10255.755012512207, "loss": 0.2881, "grad_norm": 0.17294885218143463, "learning_rate": 6.754716981132075e-05, "epoch": 0.665 }, { "step": 1665, "timestamp": "2025-12-28T11:49:14.018002", "elapsed_time": 10268.303092956543, "loss": 0.1356, "grad_norm": 0.09254828095436096, "learning_rate": 6.742138364779874e-05, "epoch": 0.665625 }, { "step": 1666, "timestamp": "2025-12-28T11:49:34.915088", "elapsed_time": 10289.20017838478, "loss": 0.0919, "grad_norm": 0.06759099662303925, "learning_rate": 6.729559748427673e-05, "epoch": 0.66625 }, { "step": 1667, "timestamp": "2025-12-28T11:49:43.859625", "elapsed_time": 10298.144714593887, "loss": 0.1992, "grad_norm": 0.11738283932209015, "learning_rate": 6.716981132075471e-05, "epoch": 0.666875 }, { "step": 1668, "timestamp": "2025-12-28T11:49:54.744691", "elapsed_time": 10309.029781341553, "loss": 0.3147, "grad_norm": 0.15712329745292664, "learning_rate": 6.70440251572327e-05, "epoch": 0.6675 }, { "step": 1669, "timestamp": "2025-12-28T11:50:06.585145", "elapsed_time": 10320.870235919952, "loss": 0.1362, "grad_norm": 0.09576017409563065, "learning_rate": 6.691823899371068e-05, "epoch": 0.668125 }, { "step": 1670, "timestamp": "2025-12-28T11:50:12.699422", "elapsed_time": 10326.984511852264, "loss": 0.3545, "grad_norm": 0.17530383169651031, "learning_rate": 6.679245283018868e-05, "epoch": 0.66875 }, { "step": 1671, "timestamp": "2025-12-28T11:50:20.519400", "elapsed_time": 10334.804490327835, "loss": 0.1705, "grad_norm": 0.17078684270381927, "learning_rate": 6.666666666666667e-05, "epoch": 0.669375 }, { "step": 1672, "timestamp": "2025-12-28T11:50:27.666519", "elapsed_time": 10341.951608896255, "loss": 0.1515, "grad_norm": 0.13205629587173462, "learning_rate": 6.654088050314465e-05, "epoch": 0.67 }, { "step": 1673, "timestamp": "2025-12-28T11:50:36.092046", "elapsed_time": 10350.37713599205, "loss": 0.1247, "grad_norm": 0.1089896485209465, "learning_rate": 6.641509433962264e-05, "epoch": 0.670625 }, { "step": 1674, "timestamp": "2025-12-28T11:50:44.269347", "elapsed_time": 10358.55443739891, "loss": 0.2136, "grad_norm": 0.2696709930896759, "learning_rate": 6.628930817610063e-05, "epoch": 0.67125 }, { "step": 1675, "timestamp": "2025-12-28T11:50:54.025458", "elapsed_time": 10368.310548067093, "loss": 0.0958, "grad_norm": 0.09596393257379532, "learning_rate": 6.616352201257861e-05, "epoch": 0.671875 }, { "step": 1676, "timestamp": "2025-12-28T11:51:02.324646", "elapsed_time": 10376.609736204147, "loss": 0.3834, "grad_norm": 0.1570800244808197, "learning_rate": 6.60377358490566e-05, "epoch": 0.6725 }, { "step": 1677, "timestamp": "2025-12-28T11:51:08.949728", "elapsed_time": 10383.234818220139, "loss": 0.2527, "grad_norm": 0.17659151554107666, "learning_rate": 6.591194968553458e-05, "epoch": 0.673125 }, { "step": 1678, "timestamp": "2025-12-28T11:51:16.808478", "elapsed_time": 10391.093568086624, "loss": 0.1488, "grad_norm": 0.13421952724456787, "learning_rate": 6.578616352201258e-05, "epoch": 0.67375 }, { "step": 1679, "timestamp": "2025-12-28T11:51:23.951873", "elapsed_time": 10398.236963272095, "loss": 0.1324, "grad_norm": 0.10407258570194244, "learning_rate": 6.566037735849057e-05, "epoch": 0.674375 }, { "step": 1680, "timestamp": "2025-12-28T11:51:44.838057", "elapsed_time": 10419.123147726059, "loss": 0.111, "grad_norm": 0.08704216778278351, "learning_rate": 6.553459119496855e-05, "epoch": 0.675 }, { "step": 1681, "timestamp": "2025-12-28T11:51:54.548567", "elapsed_time": 10428.83365702629, "loss": 0.149, "grad_norm": 0.11608418822288513, "learning_rate": 6.540880503144654e-05, "epoch": 0.675625 }, { "step": 1682, "timestamp": "2025-12-28T11:52:03.641934", "elapsed_time": 10437.927023887634, "loss": 0.2173, "grad_norm": 0.1506926417350769, "learning_rate": 6.528301886792453e-05, "epoch": 0.67625 }, { "step": 1683, "timestamp": "2025-12-28T11:52:12.314341", "elapsed_time": 10446.599431276321, "loss": 0.192, "grad_norm": 0.17497746646404266, "learning_rate": 6.515723270440251e-05, "epoch": 0.676875 }, { "step": 1684, "timestamp": "2025-12-28T11:52:24.992727", "elapsed_time": 10459.277817249298, "loss": 0.1565, "grad_norm": 0.09996242821216583, "learning_rate": 6.50314465408805e-05, "epoch": 0.6775 }, { "step": 1685, "timestamp": "2025-12-28T11:52:35.596215", "elapsed_time": 10469.881305456161, "loss": 0.1227, "grad_norm": 0.10592522472143173, "learning_rate": 6.490566037735849e-05, "epoch": 0.678125 }, { "step": 1686, "timestamp": "2025-12-28T11:52:43.075463", "elapsed_time": 10477.360553264618, "loss": 0.1698, "grad_norm": 0.1294756829738617, "learning_rate": 6.477987421383648e-05, "epoch": 0.67875 }, { "step": 1687, "timestamp": "2025-12-28T11:52:52.975805", "elapsed_time": 10487.260895729065, "loss": 0.155, "grad_norm": 0.12220288068056107, "learning_rate": 6.465408805031447e-05, "epoch": 0.679375 }, { "step": 1688, "timestamp": "2025-12-28T11:53:00.065512", "elapsed_time": 10494.350602388382, "loss": 0.3612, "grad_norm": 0.18684455752372742, "learning_rate": 6.452830188679245e-05, "epoch": 0.68 }, { "step": 1689, "timestamp": "2025-12-28T11:53:07.317682", "elapsed_time": 10501.602772951126, "loss": 0.1816, "grad_norm": 0.13791623711585999, "learning_rate": 6.440251572327044e-05, "epoch": 0.680625 }, { "step": 1690, "timestamp": "2025-12-28T11:53:17.843498", "elapsed_time": 10512.128588676453, "loss": 0.1549, "grad_norm": 0.11356908082962036, "learning_rate": 6.427672955974842e-05, "epoch": 0.68125 }, { "step": 1691, "timestamp": "2025-12-28T11:53:38.715052", "elapsed_time": 10533.00014257431, "loss": 0.0906, "grad_norm": 0.07176671177148819, "learning_rate": 6.415094339622641e-05, "epoch": 0.681875 }, { "step": 1692, "timestamp": "2025-12-28T11:53:46.531119", "elapsed_time": 10540.816209077835, "loss": 0.1907, "grad_norm": 0.141318216919899, "learning_rate": 6.40251572327044e-05, "epoch": 0.6825 }, { "step": 1693, "timestamp": "2025-12-28T11:54:00.485420", "elapsed_time": 10554.770510435104, "loss": 0.1189, "grad_norm": 0.08776037395000458, "learning_rate": 6.389937106918239e-05, "epoch": 0.683125 }, { "step": 1694, "timestamp": "2025-12-28T11:54:17.313372", "elapsed_time": 10571.598462820053, "loss": 0.1334, "grad_norm": 0.09080228954553604, "learning_rate": 6.377358490566038e-05, "epoch": 0.68375 }, { "step": 1695, "timestamp": "2025-12-28T11:54:35.244713", "elapsed_time": 10589.529803276062, "loss": 0.117, "grad_norm": 0.09272570163011551, "learning_rate": 6.364779874213837e-05, "epoch": 0.684375 }, { "step": 1696, "timestamp": "2025-12-28T11:54:42.409774", "elapsed_time": 10596.69486451149, "loss": 0.1424, "grad_norm": 0.21328891813755035, "learning_rate": 6.352201257861635e-05, "epoch": 0.685 }, { "step": 1697, "timestamp": "2025-12-28T11:54:47.771967", "elapsed_time": 10602.057057142258, "loss": 0.1881, "grad_norm": 0.14298084378242493, "learning_rate": 6.339622641509434e-05, "epoch": 0.685625 }, { "step": 1698, "timestamp": "2025-12-28T11:54:54.653860", "elapsed_time": 10608.938950777054, "loss": 0.2189, "grad_norm": 0.1579611450433731, "learning_rate": 6.327044025157232e-05, "epoch": 0.68625 }, { "step": 1699, "timestamp": "2025-12-28T11:55:08.547726", "elapsed_time": 10622.832817077637, "loss": 0.1117, "grad_norm": 0.13192813098430634, "learning_rate": 6.314465408805031e-05, "epoch": 0.686875 }, { "step": 1700, "timestamp": "2025-12-28T11:55:14.487705", "elapsed_time": 10628.77279496193, "loss": 0.1748, "grad_norm": 0.13700881600379944, "learning_rate": 6.301886792452831e-05, "epoch": 0.6875 }, { "step": 1701, "timestamp": "2025-12-28T11:55:22.517878", "elapsed_time": 10636.802968740463, "loss": 0.1818, "grad_norm": 0.13351568579673767, "learning_rate": 6.289308176100629e-05, "epoch": 0.688125 }, { "step": 1702, "timestamp": "2025-12-28T11:55:32.797402", "elapsed_time": 10647.082492351532, "loss": 0.1418, "grad_norm": 0.11062666773796082, "learning_rate": 6.276729559748428e-05, "epoch": 0.68875 }, { "step": 1703, "timestamp": "2025-12-28T11:55:41.627529", "elapsed_time": 10655.912618637085, "loss": 0.138, "grad_norm": 0.12265567481517792, "learning_rate": 6.264150943396226e-05, "epoch": 0.689375 }, { "step": 1704, "timestamp": "2025-12-28T11:55:48.617990", "elapsed_time": 10662.903080940247, "loss": 0.231, "grad_norm": 0.14810243248939514, "learning_rate": 6.251572327044025e-05, "epoch": 0.69 }, { "step": 1705, "timestamp": "2025-12-28T11:55:56.917497", "elapsed_time": 10671.202587604523, "loss": 0.1909, "grad_norm": 0.1277126967906952, "learning_rate": 6.238993710691824e-05, "epoch": 0.690625 }, { "step": 1706, "timestamp": "2025-12-28T11:56:11.042753", "elapsed_time": 10685.327842950821, "loss": 0.1558, "grad_norm": 0.10240671783685684, "learning_rate": 6.226415094339622e-05, "epoch": 0.69125 }, { "step": 1707, "timestamp": "2025-12-28T11:56:19.175090", "elapsed_time": 10693.460180997849, "loss": 0.1653, "grad_norm": 0.13144554197788239, "learning_rate": 6.213836477987421e-05, "epoch": 0.691875 }, { "step": 1708, "timestamp": "2025-12-28T11:56:28.590060", "elapsed_time": 10702.875150203705, "loss": 0.4414, "grad_norm": 0.15143659710884094, "learning_rate": 6.201257861635221e-05, "epoch": 0.6925 }, { "step": 1709, "timestamp": "2025-12-28T11:56:35.593767", "elapsed_time": 10709.87885761261, "loss": 0.1623, "grad_norm": 0.1259068101644516, "learning_rate": 6.188679245283019e-05, "epoch": 0.693125 }, { "step": 1710, "timestamp": "2025-12-28T11:56:48.621945", "elapsed_time": 10722.907035589218, "loss": 0.1455, "grad_norm": 0.09466191381216049, "learning_rate": 6.176100628930818e-05, "epoch": 0.69375 }, { "step": 1711, "timestamp": "2025-12-28T11:57:02.125457", "elapsed_time": 10736.41054725647, "loss": 0.1517, "grad_norm": 0.11332697421312332, "learning_rate": 6.163522012578616e-05, "epoch": 0.694375 }, { "step": 1712, "timestamp": "2025-12-28T11:57:17.576247", "elapsed_time": 10751.861337184906, "loss": 0.1169, "grad_norm": 0.08637837320566177, "learning_rate": 6.150943396226415e-05, "epoch": 0.695 }, { "step": 1713, "timestamp": "2025-12-28T11:57:27.606626", "elapsed_time": 10761.891716957092, "loss": 0.2205, "grad_norm": 0.220438614487648, "learning_rate": 6.138364779874214e-05, "epoch": 0.695625 }, { "step": 1714, "timestamp": "2025-12-28T11:57:44.152720", "elapsed_time": 10778.437810659409, "loss": 0.107, "grad_norm": 0.07576917856931686, "learning_rate": 6.125786163522012e-05, "epoch": 0.69625 }, { "step": 1715, "timestamp": "2025-12-28T11:58:02.072391", "elapsed_time": 10796.357481956482, "loss": 0.1151, "grad_norm": 0.08082325756549835, "learning_rate": 6.113207547169812e-05, "epoch": 0.696875 }, { "step": 1716, "timestamp": "2025-12-28T11:58:06.544039", "elapsed_time": 10800.829129457474, "loss": 0.253, "grad_norm": 0.17654751241207123, "learning_rate": 6.10062893081761e-05, "epoch": 0.6975 }, { "step": 1717, "timestamp": "2025-12-28T11:58:12.629577", "elapsed_time": 10806.914667367935, "loss": 0.225, "grad_norm": 0.15153177082538605, "learning_rate": 6.088050314465409e-05, "epoch": 0.698125 }, { "step": 1718, "timestamp": "2025-12-28T11:58:20.697858", "elapsed_time": 10814.982948064804, "loss": 0.1918, "grad_norm": 0.14196348190307617, "learning_rate": 6.075471698113207e-05, "epoch": 0.69875 }, { "step": 1719, "timestamp": "2025-12-28T11:58:35.776435", "elapsed_time": 10830.061525821686, "loss": 0.1034, "grad_norm": 0.08372589945793152, "learning_rate": 6.0628930817610065e-05, "epoch": 0.699375 }, { "step": 1720, "timestamp": "2025-12-28T11:58:41.937607", "elapsed_time": 10836.222697019577, "loss": 0.2077, "grad_norm": 0.15489476919174194, "learning_rate": 6.050314465408805e-05, "epoch": 0.7 }, { "step": 1721, "timestamp": "2025-12-28T11:58:49.416543", "elapsed_time": 10843.70163345337, "loss": 0.1679, "grad_norm": 0.13079725205898285, "learning_rate": 6.037735849056604e-05, "epoch": 0.700625 }, { "step": 1722, "timestamp": "2025-12-28T11:58:57.300619", "elapsed_time": 10851.58570933342, "loss": 0.232, "grad_norm": 0.1388118714094162, "learning_rate": 6.025157232704402e-05, "epoch": 0.70125 }, { "step": 1723, "timestamp": "2025-12-28T11:59:01.515497", "elapsed_time": 10855.800586938858, "loss": 0.3026, "grad_norm": 0.21141380071640015, "learning_rate": 6.0125786163522016e-05, "epoch": 0.701875 }, { "step": 1724, "timestamp": "2025-12-28T11:59:09.331565", "elapsed_time": 10863.61665558815, "loss": 0.2376, "grad_norm": 0.14707203209400177, "learning_rate": 6e-05, "epoch": 0.7025 }, { "step": 1725, "timestamp": "2025-12-28T11:59:15.412136", "elapsed_time": 10869.697226524353, "loss": 0.2456, "grad_norm": 0.16581028699874878, "learning_rate": 5.987421383647799e-05, "epoch": 0.703125 }, { "step": 1726, "timestamp": "2025-12-28T11:59:22.041531", "elapsed_time": 10876.326621294022, "loss": 0.2671, "grad_norm": 0.15748703479766846, "learning_rate": 5.974842767295597e-05, "epoch": 0.70375 }, { "step": 1727, "timestamp": "2025-12-28T11:59:36.715440", "elapsed_time": 10891.00053024292, "loss": 0.0921, "grad_norm": 0.083674855530262, "learning_rate": 5.9622641509433966e-05, "epoch": 0.704375 }, { "step": 1728, "timestamp": "2025-12-28T11:59:51.882192", "elapsed_time": 10906.167282104492, "loss": 0.126, "grad_norm": 0.0893712192773819, "learning_rate": 5.949685534591195e-05, "epoch": 0.705 }, { "step": 1729, "timestamp": "2025-12-28T11:59:59.217349", "elapsed_time": 10913.502439022064, "loss": 0.1477, "grad_norm": 0.15378941595554352, "learning_rate": 5.937106918238994e-05, "epoch": 0.705625 }, { "step": 1730, "timestamp": "2025-12-28T12:00:07.029866", "elapsed_time": 10921.31495642662, "loss": 0.1585, "grad_norm": 0.5633273124694824, "learning_rate": 5.9245283018867923e-05, "epoch": 0.70625 }, { "step": 1731, "timestamp": "2025-12-28T12:00:15.817213", "elapsed_time": 10930.102303981781, "loss": 0.141, "grad_norm": 0.11738086491823196, "learning_rate": 5.9119496855345916e-05, "epoch": 0.706875 }, { "step": 1732, "timestamp": "2025-12-28T12:00:31.339033", "elapsed_time": 10945.62412405014, "loss": 0.1483, "grad_norm": 0.09447266906499863, "learning_rate": 5.89937106918239e-05, "epoch": 0.7075 }, { "step": 1733, "timestamp": "2025-12-28T12:00:45.283079", "elapsed_time": 10959.568170070648, "loss": 0.1092, "grad_norm": 0.09043192118406296, "learning_rate": 5.886792452830189e-05, "epoch": 0.708125 }, { "step": 1734, "timestamp": "2025-12-28T12:00:54.651343", "elapsed_time": 10968.936433792114, "loss": 0.2226, "grad_norm": 0.1368798464536667, "learning_rate": 5.8742138364779874e-05, "epoch": 0.70875 }, { "step": 1735, "timestamp": "2025-12-28T12:00:59.476546", "elapsed_time": 10973.76163649559, "loss": 0.23, "grad_norm": 0.18571457266807556, "learning_rate": 5.861635220125786e-05, "epoch": 0.709375 }, { "step": 1736, "timestamp": "2025-12-28T12:01:18.837352", "elapsed_time": 10993.12244272232, "loss": 0.132, "grad_norm": 0.07873135060071945, "learning_rate": 5.849056603773585e-05, "epoch": 0.71 }, { "step": 1737, "timestamp": "2025-12-28T12:01:36.420927", "elapsed_time": 11010.70601773262, "loss": 0.0795, "grad_norm": 0.06837231665849686, "learning_rate": 5.836477987421384e-05, "epoch": 0.710625 }, { "step": 1738, "timestamp": "2025-12-28T12:01:44.490699", "elapsed_time": 11018.77578997612, "loss": 0.1778, "grad_norm": 0.13413724303245544, "learning_rate": 5.8238993710691824e-05, "epoch": 0.71125 }, { "step": 1739, "timestamp": "2025-12-28T12:02:00.532185", "elapsed_time": 11034.81727552414, "loss": 0.1339, "grad_norm": 0.092499740421772, "learning_rate": 5.811320754716981e-05, "epoch": 0.711875 }, { "step": 1740, "timestamp": "2025-12-28T12:02:08.705472", "elapsed_time": 11042.990562677383, "loss": 0.1989, "grad_norm": 0.14828531444072723, "learning_rate": 5.79874213836478e-05, "epoch": 0.7125 }, { "step": 1741, "timestamp": "2025-12-28T12:02:21.495006", "elapsed_time": 11055.780095815659, "loss": 0.1429, "grad_norm": 0.10832472145557404, "learning_rate": 5.786163522012579e-05, "epoch": 0.713125 }, { "step": 1742, "timestamp": "2025-12-28T12:02:27.508870", "elapsed_time": 11061.793960094452, "loss": 0.1582, "grad_norm": 0.1331390142440796, "learning_rate": 5.7735849056603774e-05, "epoch": 0.71375 }, { "step": 1743, "timestamp": "2025-12-28T12:02:32.397381", "elapsed_time": 11066.682476043701, "loss": 0.3914, "grad_norm": 0.20057356357574463, "learning_rate": 5.761006289308176e-05, "epoch": 0.714375 }, { "step": 1744, "timestamp": "2025-12-28T12:02:41.828268", "elapsed_time": 11076.113358259201, "loss": 0.1488, "grad_norm": 0.1079302653670311, "learning_rate": 5.748427672955975e-05, "epoch": 0.715 }, { "step": 1745, "timestamp": "2025-12-28T12:02:48.746835", "elapsed_time": 11083.03192615509, "loss": 0.2745, "grad_norm": 0.15143828094005585, "learning_rate": 5.735849056603774e-05, "epoch": 0.715625 }, { "step": 1746, "timestamp": "2025-12-28T12:02:54.122330", "elapsed_time": 11088.407420396805, "loss": 0.432, "grad_norm": 0.1899183988571167, "learning_rate": 5.7232704402515724e-05, "epoch": 0.71625 }, { "step": 1747, "timestamp": "2025-12-28T12:03:04.601026", "elapsed_time": 11098.88611626625, "loss": 0.1694, "grad_norm": 0.1249026283621788, "learning_rate": 5.710691823899371e-05, "epoch": 0.716875 }, { "step": 1748, "timestamp": "2025-12-28T12:03:24.883654", "elapsed_time": 11119.168744325638, "loss": 0.1206, "grad_norm": 0.07808022201061249, "learning_rate": 5.6981132075471696e-05, "epoch": 0.7175 }, { "step": 1749, "timestamp": "2025-12-28T12:03:34.253119", "elapsed_time": 11128.538208961487, "loss": 0.1954, "grad_norm": 0.12799711525440216, "learning_rate": 5.685534591194969e-05, "epoch": 0.718125 }, { "step": 1750, "timestamp": "2025-12-28T12:03:41.136923", "elapsed_time": 11135.422013521194, "loss": 0.1752, "grad_norm": 0.15330050885677338, "learning_rate": 5.6729559748427674e-05, "epoch": 0.71875 }, { "step": 1751, "timestamp": "2025-12-28T12:03:58.075443", "elapsed_time": 11152.360533714294, "loss": 0.1457, "grad_norm": 0.11428968608379364, "learning_rate": 5.660377358490566e-05, "epoch": 0.719375 }, { "step": 1752, "timestamp": "2025-12-28T12:04:06.337992", "elapsed_time": 11160.623083114624, "loss": 0.14, "grad_norm": 0.1099119707942009, "learning_rate": 5.6477987421383646e-05, "epoch": 0.72 }, { "step": 1753, "timestamp": "2025-12-28T12:04:27.219359", "elapsed_time": 11181.504449605942, "loss": 0.192, "grad_norm": 0.08993933349847794, "learning_rate": 5.635220125786164e-05, "epoch": 0.720625 }, { "step": 1754, "timestamp": "2025-12-28T12:04:32.971069", "elapsed_time": 11187.256159305573, "loss": 0.2048, "grad_norm": 0.1937875747680664, "learning_rate": 5.6226415094339625e-05, "epoch": 0.72125 }, { "step": 1755, "timestamp": "2025-12-28T12:04:50.508927", "elapsed_time": 11204.79401755333, "loss": 0.1391, "grad_norm": 0.10435943305492401, "learning_rate": 5.610062893081761e-05, "epoch": 0.721875 }, { "step": 1756, "timestamp": "2025-12-28T12:05:03.301495", "elapsed_time": 11217.586585998535, "loss": 0.1312, "grad_norm": 0.10173836350440979, "learning_rate": 5.5974842767295596e-05, "epoch": 0.7225 }, { "step": 1757, "timestamp": "2025-12-28T12:05:09.379646", "elapsed_time": 11223.66473698616, "loss": 0.1685, "grad_norm": 0.2740840017795563, "learning_rate": 5.584905660377359e-05, "epoch": 0.723125 }, { "step": 1758, "timestamp": "2025-12-28T12:05:16.331503", "elapsed_time": 11230.616593122482, "loss": 0.1821, "grad_norm": 0.13564808666706085, "learning_rate": 5.5723270440251575e-05, "epoch": 0.72375 }, { "step": 1759, "timestamp": "2025-12-28T12:05:25.908391", "elapsed_time": 11240.193481445312, "loss": 0.1539, "grad_norm": 0.11152060329914093, "learning_rate": 5.559748427672956e-05, "epoch": 0.724375 }, { "step": 1760, "timestamp": "2025-12-28T12:05:32.102932", "elapsed_time": 11246.388021945953, "loss": 0.2534, "grad_norm": 0.35456421971321106, "learning_rate": 5.5471698113207547e-05, "epoch": 0.725 }, { "step": 1761, "timestamp": "2025-12-28T12:05:38.035109", "elapsed_time": 11252.320199251175, "loss": 0.2746, "grad_norm": 0.20745328068733215, "learning_rate": 5.534591194968554e-05, "epoch": 0.725625 }, { "step": 1762, "timestamp": "2025-12-28T12:05:45.470001", "elapsed_time": 11259.755092144012, "loss": 0.1958, "grad_norm": 0.1335449367761612, "learning_rate": 5.5220125786163525e-05, "epoch": 0.72625 }, { "step": 1763, "timestamp": "2025-12-28T12:05:51.035573", "elapsed_time": 11265.320663452148, "loss": 0.2487, "grad_norm": 0.17504768073558807, "learning_rate": 5.509433962264151e-05, "epoch": 0.726875 }, { "step": 1764, "timestamp": "2025-12-28T12:06:07.328667", "elapsed_time": 11281.613757371902, "loss": 0.1174, "grad_norm": 0.15383565425872803, "learning_rate": 5.49685534591195e-05, "epoch": 0.7275 }, { "step": 1765, "timestamp": "2025-12-28T12:06:13.760643", "elapsed_time": 11288.045733213425, "loss": 0.1973, "grad_norm": 0.31960293650627136, "learning_rate": 5.484276729559748e-05, "epoch": 0.728125 }, { "step": 1766, "timestamp": "2025-12-28T12:06:20.069410", "elapsed_time": 11294.354505062103, "loss": 0.2468, "grad_norm": 0.16368219256401062, "learning_rate": 5.4716981132075475e-05, "epoch": 0.72875 }, { "step": 1767, "timestamp": "2025-12-28T12:06:32.245904", "elapsed_time": 11306.530994415283, "loss": 0.0958, "grad_norm": 0.08179421722888947, "learning_rate": 5.459119496855346e-05, "epoch": 0.729375 }, { "step": 1768, "timestamp": "2025-12-28T12:06:42.619417", "elapsed_time": 11316.904507875443, "loss": 0.1315, "grad_norm": 0.10045388340950012, "learning_rate": 5.446540880503145e-05, "epoch": 0.73 }, { "step": 1769, "timestamp": "2025-12-28T12:06:51.947947", "elapsed_time": 11326.233037233353, "loss": 0.211, "grad_norm": 0.13236618041992188, "learning_rate": 5.433962264150943e-05, "epoch": 0.730625 }, { "step": 1770, "timestamp": "2025-12-28T12:07:03.166704", "elapsed_time": 11337.451794624329, "loss": 0.1187, "grad_norm": 0.08745797723531723, "learning_rate": 5.4213836477987425e-05, "epoch": 0.73125 }, { "step": 1771, "timestamp": "2025-12-28T12:07:08.268446", "elapsed_time": 11342.553536176682, "loss": 0.261, "grad_norm": 0.1820874959230423, "learning_rate": 5.408805031446541e-05, "epoch": 0.731875 }, { "step": 1772, "timestamp": "2025-12-28T12:07:18.742417", "elapsed_time": 11353.027507543564, "loss": 0.1424, "grad_norm": 0.10937105119228363, "learning_rate": 5.39622641509434e-05, "epoch": 0.7325 }, { "step": 1773, "timestamp": "2025-12-28T12:07:35.988111", "elapsed_time": 11370.273201227188, "loss": 0.131, "grad_norm": 0.09035047143697739, "learning_rate": 5.383647798742138e-05, "epoch": 0.733125 }, { "step": 1774, "timestamp": "2025-12-28T12:07:42.368541", "elapsed_time": 11376.653632164001, "loss": 0.2366, "grad_norm": 0.1836661696434021, "learning_rate": 5.3710691823899376e-05, "epoch": 0.73375 }, { "step": 1775, "timestamp": "2025-12-28T12:07:51.102768", "elapsed_time": 11385.387857913971, "loss": 0.1211, "grad_norm": 0.10112607479095459, "learning_rate": 5.358490566037736e-05, "epoch": 0.734375 }, { "step": 1776, "timestamp": "2025-12-28T12:08:07.843256", "elapsed_time": 11402.128346443176, "loss": 0.1377, "grad_norm": 0.08925390243530273, "learning_rate": 5.345911949685535e-05, "epoch": 0.735 }, { "step": 1777, "timestamp": "2025-12-28T12:08:15.286123", "elapsed_time": 11409.571213960648, "loss": 0.1673, "grad_norm": 0.13005472719669342, "learning_rate": 5.333333333333333e-05, "epoch": 0.735625 }, { "step": 1778, "timestamp": "2025-12-28T12:08:26.521285", "elapsed_time": 11420.806375265121, "loss": 0.2347, "grad_norm": 0.11882436275482178, "learning_rate": 5.3207547169811326e-05, "epoch": 0.73625 }, { "step": 1779, "timestamp": "2025-12-28T12:08:33.112874", "elapsed_time": 11427.397964477539, "loss": 0.1772, "grad_norm": 0.14445851743221283, "learning_rate": 5.308176100628931e-05, "epoch": 0.736875 }, { "step": 1780, "timestamp": "2025-12-28T12:08:43.367927", "elapsed_time": 11437.653017282486, "loss": 0.1138, "grad_norm": 0.10857395082712173, "learning_rate": 5.29559748427673e-05, "epoch": 0.7375 }, { "step": 1781, "timestamp": "2025-12-28T12:08:50.894299", "elapsed_time": 11445.179389238358, "loss": 0.1847, "grad_norm": 0.13043537735939026, "learning_rate": 5.283018867924528e-05, "epoch": 0.738125 }, { "step": 1782, "timestamp": "2025-12-28T12:08:59.518528", "elapsed_time": 11453.803642749786, "loss": 0.1546, "grad_norm": 0.16742828488349915, "learning_rate": 5.270440251572327e-05, "epoch": 0.73875 }, { "step": 1783, "timestamp": "2025-12-28T12:09:07.688718", "elapsed_time": 11461.973808765411, "loss": 0.137, "grad_norm": 0.1259349137544632, "learning_rate": 5.257861635220126e-05, "epoch": 0.739375 }, { "step": 1784, "timestamp": "2025-12-28T12:09:14.061669", "elapsed_time": 11468.346759557724, "loss": 0.2945, "grad_norm": 0.1729360669851303, "learning_rate": 5.245283018867925e-05, "epoch": 0.74 }, { "step": 1785, "timestamp": "2025-12-28T12:09:24.669063", "elapsed_time": 11478.95415353775, "loss": 0.143, "grad_norm": 0.10868491232395172, "learning_rate": 5.2327044025157234e-05, "epoch": 0.740625 }, { "step": 1786, "timestamp": "2025-12-28T12:09:29.943561", "elapsed_time": 11484.228651285172, "loss": 0.2229, "grad_norm": 0.1582539975643158, "learning_rate": 5.220125786163522e-05, "epoch": 0.74125 }, { "step": 1787, "timestamp": "2025-12-28T12:09:44.863706", "elapsed_time": 11499.14879655838, "loss": 0.1276, "grad_norm": 0.09422967582941055, "learning_rate": 5.207547169811321e-05, "epoch": 0.741875 }, { "step": 1788, "timestamp": "2025-12-28T12:09:52.629311", "elapsed_time": 11506.91440153122, "loss": 0.3331, "grad_norm": 0.15312416851520538, "learning_rate": 5.19496855345912e-05, "epoch": 0.7425 }, { "step": 1789, "timestamp": "2025-12-28T12:10:00.979750", "elapsed_time": 11515.264840602875, "loss": 0.1471, "grad_norm": 0.14243608713150024, "learning_rate": 5.1823899371069184e-05, "epoch": 0.743125 }, { "step": 1790, "timestamp": "2025-12-28T12:10:21.855558", "elapsed_time": 11536.140648126602, "loss": 0.0733, "grad_norm": 0.06610149145126343, "learning_rate": 5.169811320754717e-05, "epoch": 0.74375 }, { "step": 1791, "timestamp": "2025-12-28T12:10:37.078731", "elapsed_time": 11551.363821744919, "loss": 0.0946, "grad_norm": 0.07790885865688324, "learning_rate": 5.157232704402516e-05, "epoch": 0.744375 }, { "step": 1792, "timestamp": "2025-12-28T12:10:42.615191", "elapsed_time": 11556.90028142929, "loss": 0.2814, "grad_norm": 0.21737056970596313, "learning_rate": 5.144654088050315e-05, "epoch": 0.745 }, { "step": 1793, "timestamp": "2025-12-28T12:10:51.083715", "elapsed_time": 11565.36880493164, "loss": 0.1001, "grad_norm": 0.10370247066020966, "learning_rate": 5.1320754716981134e-05, "epoch": 0.745625 }, { "step": 1794, "timestamp": "2025-12-28T12:11:01.500012", "elapsed_time": 11575.785102844238, "loss": 0.158, "grad_norm": 0.10645274817943573, "learning_rate": 5.119496855345912e-05, "epoch": 0.74625 }, { "step": 1795, "timestamp": "2025-12-28T12:11:15.700384", "elapsed_time": 11589.985474824905, "loss": 0.1245, "grad_norm": 0.09372659027576447, "learning_rate": 5.106918238993711e-05, "epoch": 0.746875 }, { "step": 1796, "timestamp": "2025-12-28T12:11:23.832555", "elapsed_time": 11598.117645263672, "loss": 0.1891, "grad_norm": 0.2279994636774063, "learning_rate": 5.09433962264151e-05, "epoch": 0.7475 }, { "step": 1797, "timestamp": "2025-12-28T12:11:28.965281", "elapsed_time": 11603.250371217728, "loss": 0.242, "grad_norm": 0.17063166201114655, "learning_rate": 5.0817610062893084e-05, "epoch": 0.748125 }, { "step": 1798, "timestamp": "2025-12-28T12:11:49.020815", "elapsed_time": 11623.30590581894, "loss": 0.1164, "grad_norm": 0.07605966180562973, "learning_rate": 5.069182389937107e-05, "epoch": 0.74875 }, { "step": 1799, "timestamp": "2025-12-28T12:11:57.236498", "elapsed_time": 11631.521588563919, "loss": 0.145, "grad_norm": 0.11617890745401382, "learning_rate": 5.0566037735849056e-05, "epoch": 0.749375 }, { "step": 1800, "timestamp": "2025-12-28T12:12:05.263297", "elapsed_time": 11639.548387527466, "loss": 0.1271, "grad_norm": 0.12031132727861404, "learning_rate": 5.044025157232705e-05, "epoch": 0.75 }, { "step": 1801, "timestamp": "2025-12-28T12:12:13.459492", "elapsed_time": 11647.744582653046, "loss": 0.1386, "grad_norm": 0.12550540268421173, "learning_rate": 5.0314465408805034e-05, "epoch": 0.750625 }, { "step": 1802, "timestamp": "2025-12-28T12:12:20.087338", "elapsed_time": 11654.372427940369, "loss": 0.2102, "grad_norm": 0.14700697362422943, "learning_rate": 5.018867924528302e-05, "epoch": 0.75125 }, { "step": 1803, "timestamp": "2025-12-28T12:12:27.258226", "elapsed_time": 11661.543317079544, "loss": 0.1896, "grad_norm": 0.1357898861169815, "learning_rate": 5.0062893081761006e-05, "epoch": 0.751875 }, { "step": 1804, "timestamp": "2025-12-28T12:12:36.128775", "elapsed_time": 11670.413865566254, "loss": 0.2121, "grad_norm": 0.170795738697052, "learning_rate": 4.9937106918239e-05, "epoch": 0.7525 }, { "step": 1805, "timestamp": "2025-12-28T12:12:47.640738", "elapsed_time": 11681.925828456879, "loss": 0.1038, "grad_norm": 0.08710363507270813, "learning_rate": 4.9811320754716985e-05, "epoch": 0.753125 }, { "step": 1806, "timestamp": "2025-12-28T12:12:59.703031", "elapsed_time": 11693.988121509552, "loss": 0.1155, "grad_norm": 0.09086853265762329, "learning_rate": 4.968553459119497e-05, "epoch": 0.75375 }, { "step": 1807, "timestamp": "2025-12-28T12:13:07.973124", "elapsed_time": 11702.258214712143, "loss": 0.1217, "grad_norm": 0.09986895322799683, "learning_rate": 4.9559748427672956e-05, "epoch": 0.754375 }, { "step": 1808, "timestamp": "2025-12-28T12:13:15.956985", "elapsed_time": 11710.242075920105, "loss": 0.2275, "grad_norm": 0.2224908024072647, "learning_rate": 4.943396226415095e-05, "epoch": 0.755 }, { "step": 1809, "timestamp": "2025-12-28T12:13:26.842250", "elapsed_time": 11721.127341032028, "loss": 0.1621, "grad_norm": 0.13384607434272766, "learning_rate": 4.9308176100628935e-05, "epoch": 0.755625 }, { "step": 1810, "timestamp": "2025-12-28T12:13:33.836759", "elapsed_time": 11728.121849298477, "loss": 0.1827, "grad_norm": 0.12781156599521637, "learning_rate": 4.918238993710692e-05, "epoch": 0.75625 }, { "step": 1811, "timestamp": "2025-12-28T12:13:45.020366", "elapsed_time": 11739.305456399918, "loss": 0.1141, "grad_norm": 0.08918755501508713, "learning_rate": 4.9056603773584906e-05, "epoch": 0.756875 }, { "step": 1812, "timestamp": "2025-12-28T12:13:53.188101", "elapsed_time": 11747.473192214966, "loss": 0.2376, "grad_norm": 0.1457865685224533, "learning_rate": 4.893081761006289e-05, "epoch": 0.7575 }, { "step": 1813, "timestamp": "2025-12-28T12:14:02.152041", "elapsed_time": 11756.437131166458, "loss": 0.1533, "grad_norm": 0.18399164080619812, "learning_rate": 4.8805031446540885e-05, "epoch": 0.758125 }, { "step": 1814, "timestamp": "2025-12-28T12:14:09.888429", "elapsed_time": 11764.173519134521, "loss": 0.2152, "grad_norm": 0.14015917479991913, "learning_rate": 4.867924528301887e-05, "epoch": 0.75875 }, { "step": 1815, "timestamp": "2025-12-28T12:14:15.353774", "elapsed_time": 11769.638864517212, "loss": 0.1697, "grad_norm": 0.14405082166194916, "learning_rate": 4.855345911949686e-05, "epoch": 0.759375 }, { "step": 1816, "timestamp": "2025-12-28T12:14:20.680001", "elapsed_time": 11774.965090990067, "loss": 0.3188, "grad_norm": 0.23127539455890656, "learning_rate": 4.842767295597484e-05, "epoch": 0.76 }, { "step": 1817, "timestamp": "2025-12-28T12:14:26.352096", "elapsed_time": 11780.637186527252, "loss": 0.2125, "grad_norm": 0.16115504503250122, "learning_rate": 4.8301886792452835e-05, "epoch": 0.760625 }, { "step": 1818, "timestamp": "2025-12-28T12:14:37.336389", "elapsed_time": 11791.621479988098, "loss": 0.3284, "grad_norm": 0.14183737337589264, "learning_rate": 4.817610062893082e-05, "epoch": 0.76125 }, { "step": 1819, "timestamp": "2025-12-28T12:14:44.254425", "elapsed_time": 11798.539515256882, "loss": 0.2209, "grad_norm": 0.14606201648712158, "learning_rate": 4.805031446540881e-05, "epoch": 0.761875 }, { "step": 1820, "timestamp": "2025-12-28T12:14:53.530877", "elapsed_time": 11807.815967798233, "loss": 0.1198, "grad_norm": 0.18284346163272858, "learning_rate": 4.792452830188679e-05, "epoch": 0.7625 }, { "step": 1821, "timestamp": "2025-12-28T12:15:02.736188", "elapsed_time": 11817.021278858185, "loss": 0.1288, "grad_norm": 0.1111404076218605, "learning_rate": 4.7798742138364785e-05, "epoch": 0.763125 }, { "step": 1822, "timestamp": "2025-12-28T12:15:10.672655", "elapsed_time": 11824.957744836807, "loss": 0.3206, "grad_norm": 0.1543583869934082, "learning_rate": 4.767295597484277e-05, "epoch": 0.76375 }, { "step": 1823, "timestamp": "2025-12-28T12:15:24.962444", "elapsed_time": 11839.247534513474, "loss": 0.1522, "grad_norm": 0.11686399579048157, "learning_rate": 4.754716981132076e-05, "epoch": 0.764375 }, { "step": 1824, "timestamp": "2025-12-28T12:15:32.700314", "elapsed_time": 11846.985404253006, "loss": 0.2664, "grad_norm": 0.1384664922952652, "learning_rate": 4.742138364779874e-05, "epoch": 0.765 }, { "step": 1825, "timestamp": "2025-12-28T12:15:51.808174", "elapsed_time": 11866.093264579773, "loss": 0.2736, "grad_norm": 0.11269756406545639, "learning_rate": 4.7295597484276736e-05, "epoch": 0.765625 }, { "step": 1826, "timestamp": "2025-12-28T12:15:57.370013", "elapsed_time": 11871.655103206635, "loss": 0.1664, "grad_norm": 0.1405743509531021, "learning_rate": 4.716981132075472e-05, "epoch": 0.76625 }, { "step": 1827, "timestamp": "2025-12-28T12:16:10.153089", "elapsed_time": 11884.43817949295, "loss": 0.1049, "grad_norm": 0.08988852053880692, "learning_rate": 4.704402515723271e-05, "epoch": 0.766875 }, { "step": 1828, "timestamp": "2025-12-28T12:16:18.898682", "elapsed_time": 11893.183772325516, "loss": 0.1607, "grad_norm": 0.22954370081424713, "learning_rate": 4.691823899371069e-05, "epoch": 0.7675 }, { "step": 1829, "timestamp": "2025-12-28T12:16:26.036369", "elapsed_time": 11900.321459770203, "loss": 0.2229, "grad_norm": 0.15082332491874695, "learning_rate": 4.679245283018868e-05, "epoch": 0.768125 }, { "step": 1830, "timestamp": "2025-12-28T12:16:35.324020", "elapsed_time": 11909.609110355377, "loss": 0.0898, "grad_norm": 0.08692368119955063, "learning_rate": 4.666666666666667e-05, "epoch": 0.76875 }, { "step": 1831, "timestamp": "2025-12-28T12:16:41.705976", "elapsed_time": 11915.991065740585, "loss": 0.1725, "grad_norm": 0.13606931269168854, "learning_rate": 4.654088050314466e-05, "epoch": 0.769375 }, { "step": 1832, "timestamp": "2025-12-28T12:16:53.044860", "elapsed_time": 11927.32995057106, "loss": 0.2069, "grad_norm": 0.44243958592414856, "learning_rate": 4.641509433962264e-05, "epoch": 0.77 }, { "step": 1833, "timestamp": "2025-12-28T12:17:10.459005", "elapsed_time": 11944.744095563889, "loss": 0.1064, "grad_norm": 0.07558456808328629, "learning_rate": 4.628930817610063e-05, "epoch": 0.770625 }, { "step": 1834, "timestamp": "2025-12-28T12:17:22.401241", "elapsed_time": 11956.686330795288, "loss": 0.0909, "grad_norm": 0.09237058460712433, "learning_rate": 4.616352201257862e-05, "epoch": 0.77125 }, { "step": 1835, "timestamp": "2025-12-28T12:17:36.526898", "elapsed_time": 11970.811988592148, "loss": 0.1281, "grad_norm": 0.09217442572116852, "learning_rate": 4.603773584905661e-05, "epoch": 0.771875 }, { "step": 1836, "timestamp": "2025-12-28T12:17:48.704766", "elapsed_time": 11982.989856481552, "loss": 0.1197, "grad_norm": 0.09364216774702072, "learning_rate": 4.5911949685534594e-05, "epoch": 0.7725 }, { "step": 1837, "timestamp": "2025-12-28T12:17:54.175712", "elapsed_time": 11988.460802555084, "loss": 0.1996, "grad_norm": 0.15755294263362885, "learning_rate": 4.578616352201258e-05, "epoch": 0.773125 }, { "step": 1838, "timestamp": "2025-12-28T12:18:03.161139", "elapsed_time": 11997.446228981018, "loss": 0.3362, "grad_norm": 0.1641501635313034, "learning_rate": 4.566037735849057e-05, "epoch": 0.77375 }, { "step": 1839, "timestamp": "2025-12-28T12:18:15.104307", "elapsed_time": 12009.389397144318, "loss": 0.1722, "grad_norm": 0.12171639502048492, "learning_rate": 4.553459119496856e-05, "epoch": 0.774375 }, { "step": 1840, "timestamp": "2025-12-28T12:18:19.637046", "elapsed_time": 12013.922136545181, "loss": 0.2978, "grad_norm": 0.19682510197162628, "learning_rate": 4.5408805031446544e-05, "epoch": 0.775 }, { "step": 1841, "timestamp": "2025-12-28T12:18:31.203708", "elapsed_time": 12025.48879814148, "loss": 0.1497, "grad_norm": 0.10714510083198547, "learning_rate": 4.528301886792453e-05, "epoch": 0.775625 }, { "step": 1842, "timestamp": "2025-12-28T12:18:41.667974", "elapsed_time": 12035.95306444168, "loss": 0.3087, "grad_norm": 0.1449776291847229, "learning_rate": 4.515723270440252e-05, "epoch": 0.77625 }, { "step": 1843, "timestamp": "2025-12-28T12:18:48.114191", "elapsed_time": 12042.399285078049, "loss": 0.1493, "grad_norm": 0.13305744528770447, "learning_rate": 4.503144654088051e-05, "epoch": 0.776875 }, { "step": 1844, "timestamp": "2025-12-28T12:18:55.887349", "elapsed_time": 12050.172439575195, "loss": 0.3034, "grad_norm": 0.15352778136730194, "learning_rate": 4.4905660377358494e-05, "epoch": 0.7775 }, { "step": 1845, "timestamp": "2025-12-28T12:19:08.313052", "elapsed_time": 12062.598142147064, "loss": 0.1209, "grad_norm": 0.08751270920038223, "learning_rate": 4.477987421383648e-05, "epoch": 0.778125 }, { "step": 1846, "timestamp": "2025-12-28T12:19:27.969503", "elapsed_time": 12082.254593133926, "loss": 0.1342, "grad_norm": 0.07397376745939255, "learning_rate": 4.4654088050314466e-05, "epoch": 0.77875 }, { "step": 1847, "timestamp": "2025-12-28T12:19:34.562787", "elapsed_time": 12088.847877264023, "loss": 0.2321, "grad_norm": 0.15114973485469818, "learning_rate": 4.452830188679246e-05, "epoch": 0.779375 }, { "step": 1848, "timestamp": "2025-12-28T12:19:40.318451", "elapsed_time": 12094.603546380997, "loss": 0.1495, "grad_norm": 0.13792107999324799, "learning_rate": 4.4402515723270444e-05, "epoch": 0.78 }, { "step": 1849, "timestamp": "2025-12-28T12:19:50.399687", "elapsed_time": 12104.68477678299, "loss": 0.1054, "grad_norm": 0.10309217870235443, "learning_rate": 4.427672955974843e-05, "epoch": 0.780625 }, { "step": 1850, "timestamp": "2025-12-28T12:20:05.387318", "elapsed_time": 12119.672408342361, "loss": 0.1211, "grad_norm": 0.08788999915122986, "learning_rate": 4.4150943396226416e-05, "epoch": 0.78125 }, { "step": 1851, "timestamp": "2025-12-28T12:20:15.522711", "elapsed_time": 12129.807801246643, "loss": 0.1387, "grad_norm": 0.11078450828790665, "learning_rate": 4.402515723270441e-05, "epoch": 0.781875 }, { "step": 1852, "timestamp": "2025-12-28T12:20:19.994887", "elapsed_time": 12134.279977321625, "loss": 0.3038, "grad_norm": 0.19242431223392487, "learning_rate": 4.3899371069182394e-05, "epoch": 0.7825 }, { "step": 1853, "timestamp": "2025-12-28T12:20:37.161220", "elapsed_time": 12151.446311235428, "loss": 0.0975, "grad_norm": 0.0753704383969307, "learning_rate": 4.377358490566038e-05, "epoch": 0.783125 }, { "step": 1854, "timestamp": "2025-12-28T12:20:45.622228", "elapsed_time": 12159.907318115234, "loss": 0.1389, "grad_norm": 0.11638470739126205, "learning_rate": 4.3647798742138366e-05, "epoch": 0.78375 }, { "step": 1855, "timestamp": "2025-12-28T12:20:51.601755", "elapsed_time": 12165.886845588684, "loss": 0.1841, "grad_norm": 0.14550043642520905, "learning_rate": 4.352201257861636e-05, "epoch": 0.784375 }, { "step": 1856, "timestamp": "2025-12-28T12:21:02.010335", "elapsed_time": 12176.295424938202, "loss": 0.1331, "grad_norm": 0.10679540038108826, "learning_rate": 4.3396226415094345e-05, "epoch": 0.785 }, { "step": 1857, "timestamp": "2025-12-28T12:21:07.989370", "elapsed_time": 12182.274460554123, "loss": 0.147, "grad_norm": 0.136283740401268, "learning_rate": 4.327044025157233e-05, "epoch": 0.785625 }, { "step": 1858, "timestamp": "2025-12-28T12:21:13.524178", "elapsed_time": 12187.809268712997, "loss": 0.2294, "grad_norm": 0.16854384541511536, "learning_rate": 4.3144654088050316e-05, "epoch": 0.78625 }, { "step": 1859, "timestamp": "2025-12-28T12:21:22.939585", "elapsed_time": 12197.22467494011, "loss": 0.1837, "grad_norm": 0.1425347775220871, "learning_rate": 4.301886792452831e-05, "epoch": 0.786875 }, { "step": 1860, "timestamp": "2025-12-28T12:21:29.373097", "elapsed_time": 12203.658187150955, "loss": 0.3369, "grad_norm": 0.1832302361726761, "learning_rate": 4.2893081761006295e-05, "epoch": 0.7875 }, { "step": 1861, "timestamp": "2025-12-28T12:21:33.991433", "elapsed_time": 12208.27652335167, "loss": 0.1674, "grad_norm": 0.16159717738628387, "learning_rate": 4.276729559748428e-05, "epoch": 0.788125 }, { "step": 1862, "timestamp": "2025-12-28T12:21:41.508287", "elapsed_time": 12215.793377637863, "loss": 0.182, "grad_norm": 0.13854670524597168, "learning_rate": 4.2641509433962266e-05, "epoch": 0.78875 }, { "step": 1863, "timestamp": "2025-12-28T12:21:47.953231", "elapsed_time": 12222.238321304321, "loss": 0.2442, "grad_norm": 0.13430240750312805, "learning_rate": 4.251572327044025e-05, "epoch": 0.789375 }, { "step": 1864, "timestamp": "2025-12-28T12:22:00.040777", "elapsed_time": 12234.325867414474, "loss": 0.1518, "grad_norm": 0.10589438676834106, "learning_rate": 4.2389937106918245e-05, "epoch": 0.79 }, { "step": 1865, "timestamp": "2025-12-28T12:22:05.336070", "elapsed_time": 12239.621160030365, "loss": 0.2583, "grad_norm": 0.1759846806526184, "learning_rate": 4.226415094339623e-05, "epoch": 0.790625 }, { "step": 1866, "timestamp": "2025-12-28T12:22:13.399786", "elapsed_time": 12247.684876441956, "loss": 0.1389, "grad_norm": 0.12064716964960098, "learning_rate": 4.213836477987422e-05, "epoch": 0.79125 }, { "step": 1867, "timestamp": "2025-12-28T12:22:34.272638", "elapsed_time": 12268.557728767395, "loss": 0.0743, "grad_norm": 0.06062662601470947, "learning_rate": 4.20125786163522e-05, "epoch": 0.791875 }, { "step": 1868, "timestamp": "2025-12-28T12:22:42.738728", "elapsed_time": 12277.02381849289, "loss": 0.1641, "grad_norm": 0.12611140310764313, "learning_rate": 4.1886792452830195e-05, "epoch": 0.7925 }, { "step": 1869, "timestamp": "2025-12-28T12:22:49.875677", "elapsed_time": 12284.160766839981, "loss": 0.1267, "grad_norm": 0.10889780521392822, "learning_rate": 4.176100628930818e-05, "epoch": 0.793125 }, { "step": 1870, "timestamp": "2025-12-28T12:22:58.497093", "elapsed_time": 12292.782183408737, "loss": 0.4324, "grad_norm": 0.1732441633939743, "learning_rate": 4.163522012578617e-05, "epoch": 0.79375 }, { "step": 1871, "timestamp": "2025-12-28T12:23:04.803488", "elapsed_time": 12299.08857870102, "loss": 0.4409, "grad_norm": 0.19805917143821716, "learning_rate": 4.150943396226415e-05, "epoch": 0.794375 }, { "step": 1872, "timestamp": "2025-12-28T12:23:10.710661", "elapsed_time": 12304.995751619339, "loss": 0.4087, "grad_norm": 0.1881469041109085, "learning_rate": 4.1383647798742145e-05, "epoch": 0.795 }, { "step": 1873, "timestamp": "2025-12-28T12:23:18.265312", "elapsed_time": 12312.55040216446, "loss": 0.1252, "grad_norm": 0.11210913211107254, "learning_rate": 4.125786163522013e-05, "epoch": 0.795625 }, { "step": 1874, "timestamp": "2025-12-28T12:23:34.658542", "elapsed_time": 12328.943633079529, "loss": 0.116, "grad_norm": 0.10739357024431229, "learning_rate": 4.113207547169812e-05, "epoch": 0.79625 }, { "step": 1875, "timestamp": "2025-12-28T12:23:40.821995", "elapsed_time": 12335.107085466385, "loss": 0.1966, "grad_norm": 0.14850841462612152, "learning_rate": 4.10062893081761e-05, "epoch": 0.796875 }, { "step": 1876, "timestamp": "2025-12-28T12:23:47.096816", "elapsed_time": 12341.381906032562, "loss": 0.1838, "grad_norm": 0.1477261185646057, "learning_rate": 4.088050314465409e-05, "epoch": 0.7975 }, { "step": 1877, "timestamp": "2025-12-28T12:24:07.963691", "elapsed_time": 12362.248781204224, "loss": 0.0783, "grad_norm": 0.06663120537996292, "learning_rate": 4.075471698113208e-05, "epoch": 0.798125 }, { "step": 1878, "timestamp": "2025-12-28T12:24:25.074044", "elapsed_time": 12379.35913491249, "loss": 0.1794, "grad_norm": 0.10132871568202972, "learning_rate": 4.062893081761007e-05, "epoch": 0.79875 }, { "step": 1879, "timestamp": "2025-12-28T12:24:35.826343", "elapsed_time": 12390.111433267593, "loss": 0.1471, "grad_norm": 0.11134599149227142, "learning_rate": 4.050314465408805e-05, "epoch": 0.799375 }, { "step": 1880, "timestamp": "2025-12-28T12:24:42.929007", "elapsed_time": 12397.21409702301, "loss": 0.2019, "grad_norm": 0.1353609710931778, "learning_rate": 4.037735849056604e-05, "epoch": 0.8 }, { "step": 1881, "timestamp": "2025-12-28T12:24:55.553077", "elapsed_time": 12409.83816742897, "loss": 0.1102, "grad_norm": 0.09974975883960724, "learning_rate": 4.025157232704403e-05, "epoch": 0.800625 }, { "step": 1882, "timestamp": "2025-12-28T12:25:09.292309", "elapsed_time": 12423.577399730682, "loss": 0.1936, "grad_norm": 0.10728470981121063, "learning_rate": 4.012578616352202e-05, "epoch": 0.80125 }, { "step": 1883, "timestamp": "2025-12-28T12:25:29.788806", "elapsed_time": 12444.073896169662, "loss": 0.1056, "grad_norm": 0.08063426613807678, "learning_rate": 4e-05, "epoch": 0.801875 }, { "step": 1884, "timestamp": "2025-12-28T12:25:37.645916", "elapsed_time": 12451.931006908417, "loss": 0.1324, "grad_norm": 0.1093749925494194, "learning_rate": 3.987421383647799e-05, "epoch": 0.8025 }, { "step": 1885, "timestamp": "2025-12-28T12:25:50.396263", "elapsed_time": 12464.681353330612, "loss": 0.11, "grad_norm": 0.0938941091299057, "learning_rate": 3.974842767295598e-05, "epoch": 0.803125 }, { "step": 1886, "timestamp": "2025-12-28T12:26:07.508041", "elapsed_time": 12481.793131351471, "loss": 0.1111, "grad_norm": 0.08618809282779694, "learning_rate": 3.962264150943397e-05, "epoch": 0.80375 }, { "step": 1887, "timestamp": "2025-12-28T12:26:12.131185", "elapsed_time": 12486.416275262833, "loss": 0.1774, "grad_norm": 0.16178780794143677, "learning_rate": 3.9496855345911953e-05, "epoch": 0.804375 }, { "step": 1888, "timestamp": "2025-12-28T12:26:22.840063", "elapsed_time": 12497.125153303146, "loss": 0.1668, "grad_norm": 0.11834202706813812, "learning_rate": 3.937106918238994e-05, "epoch": 0.805 }, { "step": 1889, "timestamp": "2025-12-28T12:26:33.452731", "elapsed_time": 12507.737821102142, "loss": 0.1647, "grad_norm": 0.10877163708209991, "learning_rate": 3.924528301886793e-05, "epoch": 0.805625 }, { "step": 1890, "timestamp": "2025-12-28T12:26:43.963908", "elapsed_time": 12518.248998880386, "loss": 0.3662, "grad_norm": 0.34827324748039246, "learning_rate": 3.911949685534592e-05, "epoch": 0.80625 }, { "step": 1891, "timestamp": "2025-12-28T12:26:51.062721", "elapsed_time": 12525.347811460495, "loss": 0.1494, "grad_norm": 0.15114177763462067, "learning_rate": 3.8993710691823904e-05, "epoch": 0.806875 }, { "step": 1892, "timestamp": "2025-12-28T12:27:00.778145", "elapsed_time": 12535.063235282898, "loss": 0.2763, "grad_norm": 0.14429102838039398, "learning_rate": 3.886792452830189e-05, "epoch": 0.8075 }, { "step": 1893, "timestamp": "2025-12-28T12:27:07.766029", "elapsed_time": 12542.051119327545, "loss": 0.1599, "grad_norm": 0.14175190031528473, "learning_rate": 3.8742138364779875e-05, "epoch": 0.808125 }, { "step": 1894, "timestamp": "2025-12-28T12:27:18.126378", "elapsed_time": 12552.411469221115, "loss": 0.1061, "grad_norm": 0.08977079391479492, "learning_rate": 3.861635220125787e-05, "epoch": 0.80875 }, { "step": 1895, "timestamp": "2025-12-28T12:27:24.648082", "elapsed_time": 12558.93317270279, "loss": 0.2233, "grad_norm": 0.15675166249275208, "learning_rate": 3.8490566037735854e-05, "epoch": 0.809375 }, { "step": 1896, "timestamp": "2025-12-28T12:27:38.818514", "elapsed_time": 12573.10360455513, "loss": 0.1142, "grad_norm": 0.08880900591611862, "learning_rate": 3.836477987421384e-05, "epoch": 0.81 }, { "step": 1897, "timestamp": "2025-12-28T12:27:45.255238", "elapsed_time": 12579.540328502655, "loss": 0.1721, "grad_norm": 0.14262078702449799, "learning_rate": 3.8238993710691826e-05, "epoch": 0.810625 }, { "step": 1898, "timestamp": "2025-12-28T12:27:57.156162", "elapsed_time": 12591.441256284714, "loss": 0.1236, "grad_norm": 0.09527979791164398, "learning_rate": 3.811320754716982e-05, "epoch": 0.81125 }, { "step": 1899, "timestamp": "2025-12-28T12:28:18.026839", "elapsed_time": 12612.31192946434, "loss": 0.1045, "grad_norm": 0.08572285622358322, "learning_rate": 3.7987421383647804e-05, "epoch": 0.811875 }, { "step": 1900, "timestamp": "2025-12-28T12:28:32.684984", "elapsed_time": 12626.970074653625, "loss": 0.1285, "grad_norm": 0.09535647183656693, "learning_rate": 3.786163522012579e-05, "epoch": 0.8125 }, { "step": 1901, "timestamp": "2025-12-28T12:28:47.846051", "elapsed_time": 12642.13114118576, "loss": 0.1224, "grad_norm": 0.09724906086921692, "learning_rate": 3.7735849056603776e-05, "epoch": 0.813125 }, { "step": 1902, "timestamp": "2025-12-28T12:28:56.318374", "elapsed_time": 12650.603464603424, "loss": 0.25, "grad_norm": 0.1405816376209259, "learning_rate": 3.761006289308177e-05, "epoch": 0.81375 }, { "step": 1903, "timestamp": "2025-12-28T12:29:07.320674", "elapsed_time": 12661.605763912201, "loss": 0.1659, "grad_norm": 0.1113525927066803, "learning_rate": 3.748427672955975e-05, "epoch": 0.814375 }, { "step": 1904, "timestamp": "2025-12-28T12:29:16.620685", "elapsed_time": 12670.905775785446, "loss": 0.1158, "grad_norm": 0.11509151756763458, "learning_rate": 3.735849056603773e-05, "epoch": 0.815 }, { "step": 1905, "timestamp": "2025-12-28T12:29:34.296146", "elapsed_time": 12688.581236124039, "loss": 0.1305, "grad_norm": 0.10004525631666183, "learning_rate": 3.723270440251572e-05, "epoch": 0.815625 }, { "step": 1906, "timestamp": "2025-12-28T12:29:44.763574", "elapsed_time": 12699.048664331436, "loss": 0.1535, "grad_norm": 0.10531044751405716, "learning_rate": 3.710691823899371e-05, "epoch": 0.81625 }, { "step": 1907, "timestamp": "2025-12-28T12:29:52.278280", "elapsed_time": 12706.56337094307, "loss": 0.1443, "grad_norm": 0.13411474227905273, "learning_rate": 3.69811320754717e-05, "epoch": 0.816875 }, { "step": 1908, "timestamp": "2025-12-28T12:30:05.896343", "elapsed_time": 12720.181433439255, "loss": 0.1701, "grad_norm": 0.11000078171491623, "learning_rate": 3.6855345911949684e-05, "epoch": 0.8175 }, { "step": 1909, "timestamp": "2025-12-28T12:30:14.677199", "elapsed_time": 12728.962289571762, "loss": 0.3532, "grad_norm": 0.1655169278383255, "learning_rate": 3.672955974842767e-05, "epoch": 0.818125 }, { "step": 1910, "timestamp": "2025-12-28T12:30:25.150670", "elapsed_time": 12739.435760736465, "loss": 0.1252, "grad_norm": 0.1451990157365799, "learning_rate": 3.660377358490566e-05, "epoch": 0.81875 }, { "step": 1911, "timestamp": "2025-12-28T12:30:33.004276", "elapsed_time": 12747.289366722107, "loss": 0.2257, "grad_norm": 0.15634727478027344, "learning_rate": 3.647798742138365e-05, "epoch": 0.819375 }, { "step": 1912, "timestamp": "2025-12-28T12:30:40.480022", "elapsed_time": 12754.765112161636, "loss": 0.3694, "grad_norm": 0.17593269050121307, "learning_rate": 3.6352201257861634e-05, "epoch": 0.82 }, { "step": 1913, "timestamp": "2025-12-28T12:30:48.029511", "elapsed_time": 12762.314601421356, "loss": 0.2491, "grad_norm": 0.14427348971366882, "learning_rate": 3.622641509433962e-05, "epoch": 0.820625 }, { "step": 1914, "timestamp": "2025-12-28T12:30:54.103421", "elapsed_time": 12768.388511657715, "loss": 0.2834, "grad_norm": 0.21829214692115784, "learning_rate": 3.610062893081761e-05, "epoch": 0.82125 }, { "step": 1915, "timestamp": "2025-12-28T12:31:01.382574", "elapsed_time": 12775.667664289474, "loss": 0.168, "grad_norm": 0.15699121356010437, "learning_rate": 3.59748427672956e-05, "epoch": 0.821875 }, { "step": 1916, "timestamp": "2025-12-28T12:31:09.694802", "elapsed_time": 12783.979892253876, "loss": 0.2832, "grad_norm": 0.16049239039421082, "learning_rate": 3.5849056603773584e-05, "epoch": 0.8225 }, { "step": 1917, "timestamp": "2025-12-28T12:31:19.119223", "elapsed_time": 12793.404314041138, "loss": 0.196, "grad_norm": 0.12521395087242126, "learning_rate": 3.572327044025157e-05, "epoch": 0.823125 }, { "step": 1918, "timestamp": "2025-12-28T12:31:25.059477", "elapsed_time": 12799.344567537308, "loss": 0.144, "grad_norm": 0.12810324132442474, "learning_rate": 3.559748427672956e-05, "epoch": 0.82375 }, { "step": 1919, "timestamp": "2025-12-28T12:31:31.502266", "elapsed_time": 12805.787356376648, "loss": 0.2127, "grad_norm": 0.14652138948440552, "learning_rate": 3.547169811320755e-05, "epoch": 0.824375 }, { "step": 1920, "timestamp": "2025-12-28T12:31:39.355931", "elapsed_time": 12813.641021966934, "loss": 0.2024, "grad_norm": 0.13335974514484406, "learning_rate": 3.5345911949685534e-05, "epoch": 0.825 }, { "step": 1921, "timestamp": "2025-12-28T12:31:45.989318", "elapsed_time": 12820.274408578873, "loss": 0.1712, "grad_norm": 0.13532021641731262, "learning_rate": 3.522012578616352e-05, "epoch": 0.825625 }, { "step": 1922, "timestamp": "2025-12-28T12:31:55.415620", "elapsed_time": 12829.70071029663, "loss": 0.1493, "grad_norm": 0.11901037395000458, "learning_rate": 3.5094339622641506e-05, "epoch": 0.82625 }, { "step": 1923, "timestamp": "2025-12-28T12:32:06.926802", "elapsed_time": 12841.211893081665, "loss": 0.1154, "grad_norm": 0.08895418792963028, "learning_rate": 3.49685534591195e-05, "epoch": 0.826875 }, { "step": 1924, "timestamp": "2025-12-28T12:32:13.518398", "elapsed_time": 12847.8034927845, "loss": 0.1603, "grad_norm": 0.133051335811615, "learning_rate": 3.4842767295597484e-05, "epoch": 0.8275 }, { "step": 1925, "timestamp": "2025-12-28T12:32:24.513980", "elapsed_time": 12858.799070358276, "loss": 0.1377, "grad_norm": 0.10832136869430542, "learning_rate": 3.471698113207547e-05, "epoch": 0.828125 }, { "step": 1926, "timestamp": "2025-12-28T12:32:30.547900", "elapsed_time": 12864.83299088478, "loss": 0.393, "grad_norm": 0.16918057203292847, "learning_rate": 3.4591194968553456e-05, "epoch": 0.82875 }, { "step": 1927, "timestamp": "2025-12-28T12:32:37.613818", "elapsed_time": 12871.89890909195, "loss": 0.1779, "grad_norm": 0.13324803113937378, "learning_rate": 3.446540880503145e-05, "epoch": 0.829375 }, { "step": 1928, "timestamp": "2025-12-28T12:32:44.130396", "elapsed_time": 12878.415486097336, "loss": 0.1886, "grad_norm": 0.22868554294109344, "learning_rate": 3.4339622641509435e-05, "epoch": 0.83 }, { "step": 1929, "timestamp": "2025-12-28T12:32:50.719516", "elapsed_time": 12885.00460600853, "loss": 0.3735, "grad_norm": 0.16795550286769867, "learning_rate": 3.421383647798742e-05, "epoch": 0.830625 }, { "step": 1930, "timestamp": "2025-12-28T12:33:00.614246", "elapsed_time": 12894.899336338043, "loss": 0.1517, "grad_norm": 0.1097177118062973, "learning_rate": 3.4088050314465406e-05, "epoch": 0.83125 }, { "step": 1931, "timestamp": "2025-12-28T12:33:11.076050", "elapsed_time": 12905.361140727997, "loss": 0.1361, "grad_norm": 0.11156441271305084, "learning_rate": 3.39622641509434e-05, "epoch": 0.831875 }, { "step": 1932, "timestamp": "2025-12-28T12:33:22.857950", "elapsed_time": 12917.143040180206, "loss": 0.1602, "grad_norm": 0.11632666736841202, "learning_rate": 3.3836477987421385e-05, "epoch": 0.8325 }, { "step": 1933, "timestamp": "2025-12-28T12:33:43.115463", "elapsed_time": 12937.400553941727, "loss": 0.1039, "grad_norm": 0.07940562069416046, "learning_rate": 3.371069182389937e-05, "epoch": 0.833125 }, { "step": 1934, "timestamp": "2025-12-28T12:33:56.721853", "elapsed_time": 12951.006942987442, "loss": 0.0978, "grad_norm": 0.27172061800956726, "learning_rate": 3.3584905660377356e-05, "epoch": 0.83375 }, { "step": 1935, "timestamp": "2025-12-28T12:34:11.804848", "elapsed_time": 12966.089939117432, "loss": 0.1315, "grad_norm": 0.09464891999959946, "learning_rate": 3.345911949685534e-05, "epoch": 0.834375 }, { "step": 1936, "timestamp": "2025-12-28T12:34:17.303351", "elapsed_time": 12971.58844089508, "loss": 0.1868, "grad_norm": 0.15971067547798157, "learning_rate": 3.3333333333333335e-05, "epoch": 0.835 }, { "step": 1937, "timestamp": "2025-12-28T12:34:38.344428", "elapsed_time": 12992.629518508911, "loss": 0.0888, "grad_norm": 0.07462675869464874, "learning_rate": 3.320754716981132e-05, "epoch": 0.835625 }, { "step": 1938, "timestamp": "2025-12-28T12:34:44.610775", "elapsed_time": 12998.89586520195, "loss": 0.1803, "grad_norm": 0.18845033645629883, "learning_rate": 3.308176100628931e-05, "epoch": 0.83625 }, { "step": 1939, "timestamp": "2025-12-28T12:34:50.913733", "elapsed_time": 13005.198822975159, "loss": 0.2887, "grad_norm": 0.17362096905708313, "learning_rate": 3.295597484276729e-05, "epoch": 0.836875 }, { "step": 1940, "timestamp": "2025-12-28T12:34:57.357092", "elapsed_time": 13011.642182588577, "loss": 0.1972, "grad_norm": 0.1999269723892212, "learning_rate": 3.2830188679245285e-05, "epoch": 0.8375 }, { "step": 1941, "timestamp": "2025-12-28T12:35:02.217109", "elapsed_time": 13016.502199172974, "loss": 0.215, "grad_norm": 0.16465437412261963, "learning_rate": 3.270440251572327e-05, "epoch": 0.838125 }, { "step": 1942, "timestamp": "2025-12-28T12:35:10.992022", "elapsed_time": 13025.27711224556, "loss": 0.1497, "grad_norm": 0.12079225480556488, "learning_rate": 3.257861635220126e-05, "epoch": 0.83875 }, { "step": 1943, "timestamp": "2025-12-28T12:35:14.991163", "elapsed_time": 13029.276253938675, "loss": 0.2926, "grad_norm": 0.23038972914218903, "learning_rate": 3.245283018867924e-05, "epoch": 0.839375 }, { "step": 1944, "timestamp": "2025-12-28T12:35:24.077690", "elapsed_time": 13038.362780094147, "loss": 0.5596, "grad_norm": 0.17396830022335052, "learning_rate": 3.2327044025157235e-05, "epoch": 0.84 }, { "step": 1945, "timestamp": "2025-12-28T12:35:31.821806", "elapsed_time": 13046.106900453568, "loss": 0.1473, "grad_norm": 0.12451894581317902, "learning_rate": 3.220125786163522e-05, "epoch": 0.840625 }, { "step": 1946, "timestamp": "2025-12-28T12:35:41.040337", "elapsed_time": 13055.325427770615, "loss": 0.0972, "grad_norm": 0.09150367975234985, "learning_rate": 3.207547169811321e-05, "epoch": 0.84125 }, { "step": 1947, "timestamp": "2025-12-28T12:35:54.543075", "elapsed_time": 13068.828165531158, "loss": 0.1249, "grad_norm": 0.09334319829940796, "learning_rate": 3.194968553459119e-05, "epoch": 0.841875 }, { "step": 1948, "timestamp": "2025-12-28T12:36:05.426685", "elapsed_time": 13079.711775302887, "loss": 0.1417, "grad_norm": 0.10683770477771759, "learning_rate": 3.1823899371069186e-05, "epoch": 0.8425 }, { "step": 1949, "timestamp": "2025-12-28T12:36:18.168592", "elapsed_time": 13092.453682422638, "loss": 0.1475, "grad_norm": 0.0970892384648323, "learning_rate": 3.169811320754717e-05, "epoch": 0.843125 }, { "step": 1950, "timestamp": "2025-12-28T12:36:34.081832", "elapsed_time": 13108.366922616959, "loss": 0.1373, "grad_norm": 0.09424092620611191, "learning_rate": 3.157232704402516e-05, "epoch": 0.84375 }, { "step": 1951, "timestamp": "2025-12-28T12:36:44.441007", "elapsed_time": 13118.72609782219, "loss": 0.1229, "grad_norm": 0.11297397315502167, "learning_rate": 3.144654088050314e-05, "epoch": 0.844375 }, { "step": 1952, "timestamp": "2025-12-28T12:36:54.066744", "elapsed_time": 13128.351834058762, "loss": 0.1486, "grad_norm": 0.11981320381164551, "learning_rate": 3.132075471698113e-05, "epoch": 0.845 }, { "step": 1953, "timestamp": "2025-12-28T12:37:04.580382", "elapsed_time": 13138.86547279358, "loss": 0.1323, "grad_norm": 0.10277368128299713, "learning_rate": 3.119496855345912e-05, "epoch": 0.845625 }, { "step": 1954, "timestamp": "2025-12-28T12:37:14.942990", "elapsed_time": 13149.228080272675, "loss": 0.2132, "grad_norm": 0.13679690659046173, "learning_rate": 3.106918238993711e-05, "epoch": 0.84625 }, { "step": 1955, "timestamp": "2025-12-28T12:37:21.894712", "elapsed_time": 13156.179806470871, "loss": 0.1818, "grad_norm": 0.14240722358226776, "learning_rate": 3.094339622641509e-05, "epoch": 0.846875 }, { "step": 1956, "timestamp": "2025-12-28T12:37:26.431086", "elapsed_time": 13160.716176271439, "loss": 0.2607, "grad_norm": 0.1875351518392563, "learning_rate": 3.081761006289308e-05, "epoch": 0.8475 }, { "step": 1957, "timestamp": "2025-12-28T12:37:39.981919", "elapsed_time": 13174.267012834549, "loss": 0.181, "grad_norm": 0.12086289376020432, "learning_rate": 3.069182389937107e-05, "epoch": 0.848125 }, { "step": 1958, "timestamp": "2025-12-28T12:37:50.822168", "elapsed_time": 13185.107258558273, "loss": 0.1471, "grad_norm": 0.11906228959560394, "learning_rate": 3.056603773584906e-05, "epoch": 0.84875 }, { "step": 1959, "timestamp": "2025-12-28T12:38:08.057968", "elapsed_time": 13202.34305858612, "loss": 0.1174, "grad_norm": 0.07859740406274796, "learning_rate": 3.0440251572327043e-05, "epoch": 0.849375 }, { "step": 1960, "timestamp": "2025-12-28T12:38:15.115247", "elapsed_time": 13209.400336742401, "loss": 0.1584, "grad_norm": 0.22734029591083527, "learning_rate": 3.0314465408805033e-05, "epoch": 0.85 }, { "step": 1961, "timestamp": "2025-12-28T12:38:19.840730", "elapsed_time": 13214.12582039833, "loss": 0.2167, "grad_norm": 0.18263642489910126, "learning_rate": 3.018867924528302e-05, "epoch": 0.850625 }, { "step": 1962, "timestamp": "2025-12-28T12:38:28.595786", "elapsed_time": 13222.880876779556, "loss": 0.1756, "grad_norm": 0.12433876842260361, "learning_rate": 3.0062893081761008e-05, "epoch": 0.85125 }, { "step": 1963, "timestamp": "2025-12-28T12:38:40.961363", "elapsed_time": 13235.246453523636, "loss": 0.1387, "grad_norm": 0.10110918432474136, "learning_rate": 2.9937106918238994e-05, "epoch": 0.851875 }, { "step": 1964, "timestamp": "2025-12-28T12:38:52.419176", "elapsed_time": 13246.704270601273, "loss": 0.1096, "grad_norm": 0.0898265689611435, "learning_rate": 2.9811320754716983e-05, "epoch": 0.8525 }, { "step": 1965, "timestamp": "2025-12-28T12:38:58.899414", "elapsed_time": 13253.18450474739, "loss": 0.194, "grad_norm": 0.15123769640922546, "learning_rate": 2.968553459119497e-05, "epoch": 0.853125 }, { "step": 1966, "timestamp": "2025-12-28T12:39:06.139908", "elapsed_time": 13260.425002336502, "loss": 0.147, "grad_norm": 0.12028482556343079, "learning_rate": 2.9559748427672958e-05, "epoch": 0.85375 }, { "step": 1967, "timestamp": "2025-12-28T12:39:14.272168", "elapsed_time": 13268.55725812912, "loss": 0.1709, "grad_norm": 0.13067594170570374, "learning_rate": 2.9433962264150944e-05, "epoch": 0.854375 }, { "step": 1968, "timestamp": "2025-12-28T12:39:20.178108", "elapsed_time": 13274.463198184967, "loss": 0.1636, "grad_norm": 0.15121272206306458, "learning_rate": 2.930817610062893e-05, "epoch": 0.855 }, { "step": 1969, "timestamp": "2025-12-28T12:39:27.555839", "elapsed_time": 13281.84092926979, "loss": 0.1912, "grad_norm": 0.18086270987987518, "learning_rate": 2.918238993710692e-05, "epoch": 0.855625 }, { "step": 1970, "timestamp": "2025-12-28T12:39:37.363912", "elapsed_time": 13291.649002075195, "loss": 0.1739, "grad_norm": 0.12668268382549286, "learning_rate": 2.9056603773584905e-05, "epoch": 0.85625 }, { "step": 1971, "timestamp": "2025-12-28T12:39:45.421058", "elapsed_time": 13299.706147909164, "loss": 0.1524, "grad_norm": 0.12265957146883011, "learning_rate": 2.8930817610062894e-05, "epoch": 0.856875 }, { "step": 1972, "timestamp": "2025-12-28T12:40:02.948650", "elapsed_time": 13317.233740568161, "loss": 0.1257, "grad_norm": 0.08345521986484528, "learning_rate": 2.880503144654088e-05, "epoch": 0.8575 }, { "step": 1973, "timestamp": "2025-12-28T12:40:12.078552", "elapsed_time": 13326.363642930984, "loss": 0.1756, "grad_norm": 0.11556252837181091, "learning_rate": 2.867924528301887e-05, "epoch": 0.858125 }, { "step": 1974, "timestamp": "2025-12-28T12:40:25.168584", "elapsed_time": 13339.453674077988, "loss": 0.1235, "grad_norm": 0.09028176963329315, "learning_rate": 2.8553459119496855e-05, "epoch": 0.85875 }, { "step": 1975, "timestamp": "2025-12-28T12:40:33.147283", "elapsed_time": 13347.432373523712, "loss": 0.1337, "grad_norm": 0.12279459834098816, "learning_rate": 2.8427672955974844e-05, "epoch": 0.859375 }, { "step": 1976, "timestamp": "2025-12-28T12:40:44.760775", "elapsed_time": 13359.045865058899, "loss": 0.1334, "grad_norm": 0.09873386472463608, "learning_rate": 2.830188679245283e-05, "epoch": 0.86 }, { "step": 1977, "timestamp": "2025-12-28T12:40:55.871028", "elapsed_time": 13370.156118869781, "loss": 0.122, "grad_norm": 0.10663529485464096, "learning_rate": 2.817610062893082e-05, "epoch": 0.860625 }, { "step": 1978, "timestamp": "2025-12-28T12:41:03.235317", "elapsed_time": 13377.520410776138, "loss": 0.1669, "grad_norm": 0.12646640837192535, "learning_rate": 2.8050314465408805e-05, "epoch": 0.86125 }, { "step": 1979, "timestamp": "2025-12-28T12:41:11.043014", "elapsed_time": 13385.32810497284, "loss": 0.1255, "grad_norm": 0.12962931394577026, "learning_rate": 2.7924528301886794e-05, "epoch": 0.861875 }, { "step": 1980, "timestamp": "2025-12-28T12:41:16.966780", "elapsed_time": 13391.251874685287, "loss": 0.1866, "grad_norm": 0.14886367321014404, "learning_rate": 2.779874213836478e-05, "epoch": 0.8625 }, { "step": 1981, "timestamp": "2025-12-28T12:41:28.910216", "elapsed_time": 13403.195305585861, "loss": 0.1053, "grad_norm": 0.08952232450246811, "learning_rate": 2.767295597484277e-05, "epoch": 0.863125 }, { "step": 1982, "timestamp": "2025-12-28T12:41:37.341955", "elapsed_time": 13411.627045869827, "loss": 0.2185, "grad_norm": 0.1354040801525116, "learning_rate": 2.7547169811320755e-05, "epoch": 0.86375 }, { "step": 1983, "timestamp": "2025-12-28T12:41:43.511317", "elapsed_time": 13417.796411514282, "loss": 0.1658, "grad_norm": 0.1327390819787979, "learning_rate": 2.742138364779874e-05, "epoch": 0.864375 }, { "step": 1984, "timestamp": "2025-12-28T12:41:54.923595", "elapsed_time": 13429.208688497543, "loss": 0.1531, "grad_norm": 0.13857761025428772, "learning_rate": 2.729559748427673e-05, "epoch": 0.865 }, { "step": 1985, "timestamp": "2025-12-28T12:42:07.782059", "elapsed_time": 13442.06714963913, "loss": 0.1701, "grad_norm": 0.1057012677192688, "learning_rate": 2.7169811320754716e-05, "epoch": 0.865625 }, { "step": 1986, "timestamp": "2025-12-28T12:42:13.794560", "elapsed_time": 13448.07965040207, "loss": 0.2544, "grad_norm": 0.16066089272499084, "learning_rate": 2.7044025157232706e-05, "epoch": 0.86625 }, { "step": 1987, "timestamp": "2025-12-28T12:42:24.972624", "elapsed_time": 13459.257714748383, "loss": 0.1063, "grad_norm": 0.10727685689926147, "learning_rate": 2.691823899371069e-05, "epoch": 0.866875 }, { "step": 1988, "timestamp": "2025-12-28T12:42:35.333485", "elapsed_time": 13469.618575811386, "loss": 0.1744, "grad_norm": 0.10637471824884415, "learning_rate": 2.679245283018868e-05, "epoch": 0.8675 }, { "step": 1989, "timestamp": "2025-12-28T12:42:44.268707", "elapsed_time": 13478.553797006607, "loss": 0.3446, "grad_norm": 0.18917757272720337, "learning_rate": 2.6666666666666667e-05, "epoch": 0.868125 }, { "step": 1990, "timestamp": "2025-12-28T12:42:53.119626", "elapsed_time": 13487.404715776443, "loss": 0.1694, "grad_norm": 0.16625277698040009, "learning_rate": 2.6540880503144656e-05, "epoch": 0.86875 }, { "step": 1991, "timestamp": "2025-12-28T12:42:56.594548", "elapsed_time": 13490.879638910294, "loss": 0.3027, "grad_norm": 0.21782198548316956, "learning_rate": 2.641509433962264e-05, "epoch": 0.869375 }, { "step": 1992, "timestamp": "2025-12-28T12:43:05.768847", "elapsed_time": 13500.05393743515, "loss": 0.1083, "grad_norm": 0.09708056598901749, "learning_rate": 2.628930817610063e-05, "epoch": 0.87 }, { "step": 1993, "timestamp": "2025-12-28T12:43:15.670772", "elapsed_time": 13509.955861568451, "loss": 0.13, "grad_norm": 0.12269952893257141, "learning_rate": 2.6163522012578617e-05, "epoch": 0.870625 }, { "step": 1994, "timestamp": "2025-12-28T12:43:21.608461", "elapsed_time": 13515.893551588058, "loss": 0.2237, "grad_norm": 0.17270702123641968, "learning_rate": 2.6037735849056606e-05, "epoch": 0.87125 }, { "step": 1995, "timestamp": "2025-12-28T12:43:28.693932", "elapsed_time": 13522.9790225029, "loss": 0.3098, "grad_norm": 0.17161986231803894, "learning_rate": 2.5911949685534592e-05, "epoch": 0.871875 }, { "step": 1996, "timestamp": "2025-12-28T12:43:35.216183", "elapsed_time": 13529.501273870468, "loss": 0.4953, "grad_norm": 0.18305501341819763, "learning_rate": 2.578616352201258e-05, "epoch": 0.8725 }, { "step": 1997, "timestamp": "2025-12-28T12:43:40.964345", "elapsed_time": 13535.249434709549, "loss": 0.3349, "grad_norm": 0.1995151937007904, "learning_rate": 2.5660377358490567e-05, "epoch": 0.873125 }, { "step": 1998, "timestamp": "2025-12-28T12:43:50.213890", "elapsed_time": 13544.498980283737, "loss": 0.1256, "grad_norm": 0.11612726747989655, "learning_rate": 2.5534591194968556e-05, "epoch": 0.87375 }, { "step": 1999, "timestamp": "2025-12-28T12:44:01.831854", "elapsed_time": 13556.11694407463, "loss": 0.1089, "grad_norm": 0.09675383567810059, "learning_rate": 2.5408805031446542e-05, "epoch": 0.874375 }, { "step": 2000, "timestamp": "2025-12-28T12:44:08.303416", "elapsed_time": 13562.588506937027, "loss": 0.4029, "grad_norm": 0.19399595260620117, "learning_rate": 2.5283018867924528e-05, "epoch": 0.875 }, { "step": 2001, "timestamp": "2025-12-28T12:44:16.958567", "elapsed_time": 13571.243657827377, "loss": 0.1523, "grad_norm": 0.1444273293018341, "learning_rate": 2.5157232704402517e-05, "epoch": 0.875625 }, { "step": 2002, "timestamp": "2025-12-28T12:44:24.980330", "elapsed_time": 13579.26542043686, "loss": 0.4692, "grad_norm": 0.18777810037136078, "learning_rate": 2.5031446540880503e-05, "epoch": 0.87625 }, { "step": 2003, "timestamp": "2025-12-28T12:44:32.077927", "elapsed_time": 13586.36301779747, "loss": 0.1718, "grad_norm": 0.1408010870218277, "learning_rate": 2.4905660377358492e-05, "epoch": 0.876875 }, { "step": 2004, "timestamp": "2025-12-28T12:44:37.283086", "elapsed_time": 13591.568180322647, "loss": 0.1881, "grad_norm": 0.1654125154018402, "learning_rate": 2.4779874213836478e-05, "epoch": 0.8775 }, { "step": 2005, "timestamp": "2025-12-28T12:44:44.801095", "elapsed_time": 13599.08618569374, "loss": 0.1881, "grad_norm": 0.15793928503990173, "learning_rate": 2.4654088050314467e-05, "epoch": 0.878125 }, { "step": 2006, "timestamp": "2025-12-28T12:44:50.482390", "elapsed_time": 13604.767484664917, "loss": 0.185, "grad_norm": 0.1794230043888092, "learning_rate": 2.4528301886792453e-05, "epoch": 0.87875 }, { "step": 2007, "timestamp": "2025-12-28T12:45:06.649005", "elapsed_time": 13620.934096097946, "loss": 0.0961, "grad_norm": 0.07328634709119797, "learning_rate": 2.4402515723270442e-05, "epoch": 0.879375 }, { "step": 2008, "timestamp": "2025-12-28T12:45:10.656329", "elapsed_time": 13624.941419363022, "loss": 0.2643, "grad_norm": 0.2130497395992279, "learning_rate": 2.427672955974843e-05, "epoch": 0.88 }, { "step": 2009, "timestamp": "2025-12-28T12:45:26.129250", "elapsed_time": 13640.414340496063, "loss": 0.146, "grad_norm": 0.09115441888570786, "learning_rate": 2.4150943396226418e-05, "epoch": 0.880625 }, { "step": 2010, "timestamp": "2025-12-28T12:45:35.552234", "elapsed_time": 13649.837324380875, "loss": 0.1519, "grad_norm": 0.11384664475917816, "learning_rate": 2.4025157232704403e-05, "epoch": 0.88125 }, { "step": 2011, "timestamp": "2025-12-28T12:45:42.432010", "elapsed_time": 13656.71710062027, "loss": 0.1838, "grad_norm": 0.13385522365570068, "learning_rate": 2.3899371069182393e-05, "epoch": 0.881875 }, { "step": 2012, "timestamp": "2025-12-28T12:45:48.338311", "elapsed_time": 13662.623405456543, "loss": 0.3394, "grad_norm": 0.18050479888916016, "learning_rate": 2.377358490566038e-05, "epoch": 0.8825 }, { "step": 2013, "timestamp": "2025-12-28T12:45:57.763912", "elapsed_time": 13672.049002170563, "loss": 0.1075, "grad_norm": 0.094784677028656, "learning_rate": 2.3647798742138368e-05, "epoch": 0.883125 }, { "step": 2014, "timestamp": "2025-12-28T12:46:03.098688", "elapsed_time": 13677.383778333664, "loss": 0.2244, "grad_norm": 0.19391919672489166, "learning_rate": 2.3522012578616354e-05, "epoch": 0.88375 }, { "step": 2015, "timestamp": "2025-12-28T12:46:09.212719", "elapsed_time": 13683.49780869484, "loss": 0.1827, "grad_norm": 0.1965276449918747, "learning_rate": 2.339622641509434e-05, "epoch": 0.884375 }, { "step": 2016, "timestamp": "2025-12-28T12:46:19.627349", "elapsed_time": 13693.912438869476, "loss": 0.2289, "grad_norm": 0.2740084230899811, "learning_rate": 2.327044025157233e-05, "epoch": 0.885 }, { "step": 2017, "timestamp": "2025-12-28T12:46:31.416162", "elapsed_time": 13705.70125246048, "loss": 0.2902, "grad_norm": 0.12946578860282898, "learning_rate": 2.3144654088050315e-05, "epoch": 0.885625 }, { "step": 2018, "timestamp": "2025-12-28T12:46:41.666259", "elapsed_time": 13715.951349258423, "loss": 0.1345, "grad_norm": 0.11686120927333832, "learning_rate": 2.3018867924528304e-05, "epoch": 0.88625 }, { "step": 2019, "timestamp": "2025-12-28T12:46:57.304742", "elapsed_time": 13731.589831829071, "loss": 0.1551, "grad_norm": 0.10054396092891693, "learning_rate": 2.289308176100629e-05, "epoch": 0.886875 }, { "step": 2020, "timestamp": "2025-12-28T12:47:12.450934", "elapsed_time": 13746.736023902893, "loss": 0.1416, "grad_norm": 0.11809642612934113, "learning_rate": 2.276729559748428e-05, "epoch": 0.8875 }, { "step": 2021, "timestamp": "2025-12-28T12:47:22.163353", "elapsed_time": 13756.44844341278, "loss": 0.1301, "grad_norm": 0.1024162694811821, "learning_rate": 2.2641509433962265e-05, "epoch": 0.888125 }, { "step": 2022, "timestamp": "2025-12-28T12:47:33.984884", "elapsed_time": 13768.269978284836, "loss": 0.1243, "grad_norm": 0.09722079336643219, "learning_rate": 2.2515723270440254e-05, "epoch": 0.88875 }, { "step": 2023, "timestamp": "2025-12-28T12:47:44.991859", "elapsed_time": 13779.276949644089, "loss": 0.1849, "grad_norm": 0.11634030938148499, "learning_rate": 2.238993710691824e-05, "epoch": 0.889375 }, { "step": 2024, "timestamp": "2025-12-28T12:47:52.178131", "elapsed_time": 13786.463220596313, "loss": 0.3668, "grad_norm": 0.2070102095603943, "learning_rate": 2.226415094339623e-05, "epoch": 0.89 }, { "step": 2025, "timestamp": "2025-12-28T12:47:58.522744", "elapsed_time": 13792.807834386826, "loss": 0.1465, "grad_norm": 0.11943688988685608, "learning_rate": 2.2138364779874215e-05, "epoch": 0.890625 }, { "step": 2026, "timestamp": "2025-12-28T12:48:06.267378", "elapsed_time": 13800.5524725914, "loss": 0.3664, "grad_norm": 0.18064796924591064, "learning_rate": 2.2012578616352204e-05, "epoch": 0.89125 }, { "step": 2027, "timestamp": "2025-12-28T12:48:16.684863", "elapsed_time": 13810.969953536987, "loss": 0.1507, "grad_norm": 0.10812822729349136, "learning_rate": 2.188679245283019e-05, "epoch": 0.891875 }, { "step": 2028, "timestamp": "2025-12-28T12:48:26.633930", "elapsed_time": 13820.919019937515, "loss": 0.1116, "grad_norm": 0.0921606570482254, "learning_rate": 2.176100628930818e-05, "epoch": 0.8925 }, { "step": 2029, "timestamp": "2025-12-28T12:48:34.803507", "elapsed_time": 13829.08859705925, "loss": 0.2362, "grad_norm": 0.14584888517856598, "learning_rate": 2.1635220125786165e-05, "epoch": 0.893125 }, { "step": 2030, "timestamp": "2025-12-28T12:48:41.721190", "elapsed_time": 13836.006280899048, "loss": 0.3648, "grad_norm": 0.1669163852930069, "learning_rate": 2.1509433962264154e-05, "epoch": 0.89375 }, { "step": 2031, "timestamp": "2025-12-28T12:48:51.532480", "elapsed_time": 13845.817570209503, "loss": 0.1496, "grad_norm": 0.11350235342979431, "learning_rate": 2.138364779874214e-05, "epoch": 0.894375 }, { "step": 2032, "timestamp": "2025-12-28T12:49:01.120902", "elapsed_time": 13855.405995368958, "loss": 0.1385, "grad_norm": 0.10483425110578537, "learning_rate": 2.1257861635220126e-05, "epoch": 0.895 }, { "step": 2033, "timestamp": "2025-12-28T12:49:10.447301", "elapsed_time": 13864.732391834259, "loss": 0.1473, "grad_norm": 0.11778236925601959, "learning_rate": 2.1132075471698115e-05, "epoch": 0.895625 }, { "step": 2034, "timestamp": "2025-12-28T12:49:24.060029", "elapsed_time": 13878.3451192379, "loss": 0.1106, "grad_norm": 0.09687533229589462, "learning_rate": 2.10062893081761e-05, "epoch": 0.89625 }, { "step": 2035, "timestamp": "2025-12-28T12:49:29.037036", "elapsed_time": 13883.322126865387, "loss": 0.4332, "grad_norm": 0.2040807008743286, "learning_rate": 2.088050314465409e-05, "epoch": 0.896875 }, { "step": 2036, "timestamp": "2025-12-28T12:49:45.341778", "elapsed_time": 13899.626868963242, "loss": 0.2711, "grad_norm": 0.12027235329151154, "learning_rate": 2.0754716981132076e-05, "epoch": 0.8975 }, { "step": 2037, "timestamp": "2025-12-28T12:49:55.608164", "elapsed_time": 13909.893257856369, "loss": 0.1105, "grad_norm": 0.10322773456573486, "learning_rate": 2.0628930817610066e-05, "epoch": 0.898125 }, { "step": 2038, "timestamp": "2025-12-28T12:50:02.286113", "elapsed_time": 13916.571203231812, "loss": 0.1892, "grad_norm": 0.15024706721305847, "learning_rate": 2.050314465408805e-05, "epoch": 0.89875 }, { "step": 2039, "timestamp": "2025-12-28T12:50:22.839318", "elapsed_time": 13937.124408245087, "loss": 0.1057, "grad_norm": 0.07259613275527954, "learning_rate": 2.037735849056604e-05, "epoch": 0.899375 }, { "step": 2040, "timestamp": "2025-12-28T12:50:30.370013", "elapsed_time": 13944.655103683472, "loss": 0.1901, "grad_norm": 0.14556270837783813, "learning_rate": 2.0251572327044027e-05, "epoch": 0.9 }, { "step": 2041, "timestamp": "2025-12-28T12:50:40.280173", "elapsed_time": 13954.565263748169, "loss": 0.1534, "grad_norm": 0.2072232961654663, "learning_rate": 2.0125786163522016e-05, "epoch": 0.900625 }, { "step": 2042, "timestamp": "2025-12-28T12:50:48.582155", "elapsed_time": 13962.867245435715, "loss": 0.3279, "grad_norm": 0.2846614122390747, "learning_rate": 2e-05, "epoch": 0.90125 }, { "step": 2043, "timestamp": "2025-12-28T12:50:55.464925", "elapsed_time": 13969.750015258789, "loss": 0.2373, "grad_norm": 0.14941410720348358, "learning_rate": 1.987421383647799e-05, "epoch": 0.901875 }, { "step": 2044, "timestamp": "2025-12-28T12:51:03.935715", "elapsed_time": 13978.220804929733, "loss": 0.2142, "grad_norm": 0.13745370507240295, "learning_rate": 1.9748427672955977e-05, "epoch": 0.9025 }, { "step": 2045, "timestamp": "2025-12-28T12:51:14.650669", "elapsed_time": 13988.935759544373, "loss": 0.1392, "grad_norm": 0.100920669734478, "learning_rate": 1.9622641509433966e-05, "epoch": 0.903125 }, { "step": 2046, "timestamp": "2025-12-28T12:51:22.402615", "elapsed_time": 13996.687705039978, "loss": 0.1752, "grad_norm": 0.13003851473331451, "learning_rate": 1.9496855345911952e-05, "epoch": 0.90375 }, { "step": 2047, "timestamp": "2025-12-28T12:51:41.001338", "elapsed_time": 14015.286432504654, "loss": 0.2414, "grad_norm": 0.12125887721776962, "learning_rate": 1.9371069182389938e-05, "epoch": 0.904375 }, { "step": 2048, "timestamp": "2025-12-28T12:51:50.001884", "elapsed_time": 14024.286974191666, "loss": 0.1899, "grad_norm": 0.1845715045928955, "learning_rate": 1.9245283018867927e-05, "epoch": 0.905 }, { "step": 2049, "timestamp": "2025-12-28T12:52:10.709887", "elapsed_time": 14044.994977474213, "loss": 0.1702, "grad_norm": 0.09026765078306198, "learning_rate": 1.9119496855345913e-05, "epoch": 0.905625 }, { "step": 2050, "timestamp": "2025-12-28T12:52:17.185077", "elapsed_time": 14051.470167636871, "loss": 0.1794, "grad_norm": 0.1455916464328766, "learning_rate": 1.8993710691823902e-05, "epoch": 0.90625 }, { "step": 2051, "timestamp": "2025-12-28T12:52:28.177130", "elapsed_time": 14062.462219953537, "loss": 0.1146, "grad_norm": 0.11119363456964493, "learning_rate": 1.8867924528301888e-05, "epoch": 0.906875 }, { "step": 2052, "timestamp": "2025-12-28T12:52:36.238218", "elapsed_time": 14070.523307800293, "loss": 0.2618, "grad_norm": 0.17123495042324066, "learning_rate": 1.8742138364779874e-05, "epoch": 0.9075 }, { "step": 2053, "timestamp": "2025-12-28T12:52:46.104010", "elapsed_time": 14080.389100313187, "loss": 0.2253, "grad_norm": 0.1355234980583191, "learning_rate": 1.861635220125786e-05, "epoch": 0.908125 }, { "step": 2054, "timestamp": "2025-12-28T12:52:54.928314", "elapsed_time": 14089.213404417038, "loss": 0.1234, "grad_norm": 0.10494759678840637, "learning_rate": 1.849056603773585e-05, "epoch": 0.90875 }, { "step": 2055, "timestamp": "2025-12-28T12:53:04.303552", "elapsed_time": 14098.588641881943, "loss": 0.1178, "grad_norm": 0.11895407736301422, "learning_rate": 1.8364779874213835e-05, "epoch": 0.909375 }, { "step": 2056, "timestamp": "2025-12-28T12:53:10.684555", "elapsed_time": 14104.969645500183, "loss": 0.3704, "grad_norm": 0.20144037902355194, "learning_rate": 1.8238993710691824e-05, "epoch": 0.91 }, { "step": 2057, "timestamp": "2025-12-28T12:53:24.571026", "elapsed_time": 14118.856116056442, "loss": 0.1448, "grad_norm": 0.10675830394029617, "learning_rate": 1.811320754716981e-05, "epoch": 0.910625 }, { "step": 2058, "timestamp": "2025-12-28T12:53:35.228851", "elapsed_time": 14129.513941764832, "loss": 0.1201, "grad_norm": 0.09952869266271591, "learning_rate": 1.79874213836478e-05, "epoch": 0.91125 }, { "step": 2059, "timestamp": "2025-12-28T12:53:44.722006", "elapsed_time": 14139.007096767426, "loss": 0.2014, "grad_norm": 0.15313848853111267, "learning_rate": 1.7861635220125785e-05, "epoch": 0.911875 }, { "step": 2060, "timestamp": "2025-12-28T12:53:55.235495", "elapsed_time": 14149.520585536957, "loss": 0.156, "grad_norm": 0.16458632051944733, "learning_rate": 1.7735849056603774e-05, "epoch": 0.9125 }, { "step": 2061, "timestamp": "2025-12-28T12:54:04.167911", "elapsed_time": 14158.453001022339, "loss": 0.1428, "grad_norm": 0.11248766630887985, "learning_rate": 1.761006289308176e-05, "epoch": 0.913125 }, { "step": 2062, "timestamp": "2025-12-28T12:54:14.179512", "elapsed_time": 14168.464602470398, "loss": 0.1162, "grad_norm": 0.10351016372442245, "learning_rate": 1.748427672955975e-05, "epoch": 0.91375 }, { "step": 2063, "timestamp": "2025-12-28T12:54:19.161273", "elapsed_time": 14173.446363210678, "loss": 0.2882, "grad_norm": 0.19298550486564636, "learning_rate": 1.7358490566037735e-05, "epoch": 0.914375 }, { "step": 2064, "timestamp": "2025-12-28T12:54:30.239615", "elapsed_time": 14184.524705171585, "loss": 0.1304, "grad_norm": 0.10329542309045792, "learning_rate": 1.7232704402515724e-05, "epoch": 0.915 }, { "step": 2065, "timestamp": "2025-12-28T12:54:36.398946", "elapsed_time": 14190.684037208557, "loss": 0.2199, "grad_norm": 0.15773223340511322, "learning_rate": 1.710691823899371e-05, "epoch": 0.915625 }, { "step": 2066, "timestamp": "2025-12-28T12:54:42.958616", "elapsed_time": 14197.243706703186, "loss": 0.2395, "grad_norm": 0.176762193441391, "learning_rate": 1.69811320754717e-05, "epoch": 0.91625 }, { "step": 2067, "timestamp": "2025-12-28T12:54:54.130810", "elapsed_time": 14208.41589999199, "loss": 0.0974, "grad_norm": 0.08783324062824249, "learning_rate": 1.6855345911949685e-05, "epoch": 0.916875 }, { "step": 2068, "timestamp": "2025-12-28T12:55:01.944477", "elapsed_time": 14216.22956776619, "loss": 0.196, "grad_norm": 0.14528080821037292, "learning_rate": 1.672955974842767e-05, "epoch": 0.9175 }, { "step": 2069, "timestamp": "2025-12-28T12:55:13.899335", "elapsed_time": 14228.184425115585, "loss": 0.205, "grad_norm": 0.12957097589969635, "learning_rate": 1.660377358490566e-05, "epoch": 0.918125 }, { "step": 2070, "timestamp": "2025-12-28T12:55:23.618615", "elapsed_time": 14237.903705358505, "loss": 0.1474, "grad_norm": 0.11285874992609024, "learning_rate": 1.6477987421383646e-05, "epoch": 0.91875 }, { "step": 2071, "timestamp": "2025-12-28T12:55:37.163652", "elapsed_time": 14251.44874215126, "loss": 0.1786, "grad_norm": 0.09955456852912903, "learning_rate": 1.6352201257861635e-05, "epoch": 0.919375 }, { "step": 2072, "timestamp": "2025-12-28T12:55:44.637996", "elapsed_time": 14258.923086166382, "loss": 0.1564, "grad_norm": 0.14337117969989777, "learning_rate": 1.622641509433962e-05, "epoch": 0.92 }, { "step": 2073, "timestamp": "2025-12-28T12:55:53.757282", "elapsed_time": 14268.042372465134, "loss": 0.2802, "grad_norm": 0.1456514596939087, "learning_rate": 1.610062893081761e-05, "epoch": 0.920625 }, { "step": 2074, "timestamp": "2025-12-28T12:56:00.273054", "elapsed_time": 14274.558144569397, "loss": 0.4306, "grad_norm": 0.18445518612861633, "learning_rate": 1.5974842767295596e-05, "epoch": 0.92125 }, { "step": 2075, "timestamp": "2025-12-28T12:56:07.521121", "elapsed_time": 14281.806211948395, "loss": 0.1628, "grad_norm": 0.1335839480161667, "learning_rate": 1.5849056603773586e-05, "epoch": 0.921875 }, { "step": 2076, "timestamp": "2025-12-28T12:56:15.072953", "elapsed_time": 14289.358043909073, "loss": 0.1945, "grad_norm": 0.13724136352539062, "learning_rate": 1.572327044025157e-05, "epoch": 0.9225 }, { "step": 2077, "timestamp": "2025-12-28T12:56:26.969569", "elapsed_time": 14301.254658937454, "loss": 0.1434, "grad_norm": 0.10332413017749786, "learning_rate": 1.559748427672956e-05, "epoch": 0.923125 }, { "step": 2078, "timestamp": "2025-12-28T12:56:40.909120", "elapsed_time": 14315.194210767746, "loss": 0.1229, "grad_norm": 0.0949297845363617, "learning_rate": 1.5471698113207547e-05, "epoch": 0.92375 }, { "step": 2079, "timestamp": "2025-12-28T12:56:46.573585", "elapsed_time": 14320.858675479889, "loss": 0.1417, "grad_norm": 0.1971912533044815, "learning_rate": 1.5345911949685536e-05, "epoch": 0.924375 }, { "step": 2080, "timestamp": "2025-12-28T12:56:55.214575", "elapsed_time": 14329.499665021896, "loss": 0.1695, "grad_norm": 0.13136085867881775, "learning_rate": 1.5220125786163522e-05, "epoch": 0.925 }, { "step": 2081, "timestamp": "2025-12-28T12:57:16.087282", "elapsed_time": 14350.372372865677, "loss": 0.0893, "grad_norm": 0.07915375381708145, "learning_rate": 1.509433962264151e-05, "epoch": 0.925625 }, { "step": 2082, "timestamp": "2025-12-28T12:57:33.493431", "elapsed_time": 14367.778520822525, "loss": 0.1024, "grad_norm": 0.10471498221158981, "learning_rate": 1.4968553459119497e-05, "epoch": 0.92625 }, { "step": 2083, "timestamp": "2025-12-28T12:57:42.614440", "elapsed_time": 14376.899530172348, "loss": 0.1551, "grad_norm": 0.11656136065721512, "learning_rate": 1.4842767295597484e-05, "epoch": 0.926875 }, { "step": 2084, "timestamp": "2025-12-28T12:57:47.407488", "elapsed_time": 14381.692578554153, "loss": 0.1693, "grad_norm": 0.1702698916196823, "learning_rate": 1.4716981132075472e-05, "epoch": 0.9275 }, { "step": 2085, "timestamp": "2025-12-28T12:58:02.399763", "elapsed_time": 14396.684853076935, "loss": 0.1079, "grad_norm": 0.09625794738531113, "learning_rate": 1.459119496855346e-05, "epoch": 0.928125 }, { "step": 2086, "timestamp": "2025-12-28T12:58:15.434870", "elapsed_time": 14409.719960689545, "loss": 0.1103, "grad_norm": 0.09237074851989746, "learning_rate": 1.4465408805031447e-05, "epoch": 0.92875 }, { "step": 2087, "timestamp": "2025-12-28T12:58:25.288608", "elapsed_time": 14419.573698043823, "loss": 0.1244, "grad_norm": 0.1188262552022934, "learning_rate": 1.4339622641509435e-05, "epoch": 0.929375 }, { "step": 2088, "timestamp": "2025-12-28T12:58:32.277959", "elapsed_time": 14426.563049316406, "loss": 0.2027, "grad_norm": 0.2063203752040863, "learning_rate": 1.4213836477987422e-05, "epoch": 0.93 }, { "step": 2089, "timestamp": "2025-12-28T12:58:43.511867", "elapsed_time": 14437.796962022781, "loss": 0.1312, "grad_norm": 0.10580072551965714, "learning_rate": 1.408805031446541e-05, "epoch": 0.930625 }, { "step": 2090, "timestamp": "2025-12-28T12:58:56.223461", "elapsed_time": 14450.508551120758, "loss": 0.1399, "grad_norm": 0.10505926609039307, "learning_rate": 1.3962264150943397e-05, "epoch": 0.93125 }, { "step": 2091, "timestamp": "2025-12-28T12:59:05.563465", "elapsed_time": 14459.848555326462, "loss": 0.2831, "grad_norm": 0.15598870813846588, "learning_rate": 1.3836477987421385e-05, "epoch": 0.931875 }, { "step": 2092, "timestamp": "2025-12-28T12:59:15.934233", "elapsed_time": 14470.219323635101, "loss": 0.127, "grad_norm": 0.10266047716140747, "learning_rate": 1.371069182389937e-05, "epoch": 0.9325 }, { "step": 2093, "timestamp": "2025-12-28T12:59:28.065574", "elapsed_time": 14482.350664615631, "loss": 0.1606, "grad_norm": 0.1118762195110321, "learning_rate": 1.3584905660377358e-05, "epoch": 0.933125 }, { "step": 2094, "timestamp": "2025-12-28T12:59:41.619545", "elapsed_time": 14495.90463590622, "loss": 0.1627, "grad_norm": 0.1268414705991745, "learning_rate": 1.3459119496855346e-05, "epoch": 0.93375 }, { "step": 2095, "timestamp": "2025-12-28T12:59:57.135947", "elapsed_time": 14511.421037197113, "loss": 0.1509, "grad_norm": 0.09595310688018799, "learning_rate": 1.3333333333333333e-05, "epoch": 0.934375 }, { "step": 2096, "timestamp": "2025-12-28T13:00:07.077230", "elapsed_time": 14521.362320899963, "loss": 0.1116, "grad_norm": 0.09903395920991898, "learning_rate": 1.320754716981132e-05, "epoch": 0.935 }, { "step": 2097, "timestamp": "2025-12-28T13:00:17.748745", "elapsed_time": 14532.033835411072, "loss": 0.1372, "grad_norm": 0.10799795389175415, "learning_rate": 1.3081761006289308e-05, "epoch": 0.935625 }, { "step": 2098, "timestamp": "2025-12-28T13:00:34.507777", "elapsed_time": 14548.792867422104, "loss": 0.0991, "grad_norm": 0.0850529670715332, "learning_rate": 1.2955974842767296e-05, "epoch": 0.93625 }, { "step": 2099, "timestamp": "2025-12-28T13:00:49.895456", "elapsed_time": 14564.180546045303, "loss": 0.1757, "grad_norm": 0.09990602731704712, "learning_rate": 1.2830188679245283e-05, "epoch": 0.936875 }, { "step": 2100, "timestamp": "2025-12-28T13:00:57.176345", "elapsed_time": 14571.46143579483, "loss": 0.2252, "grad_norm": 0.1569076031446457, "learning_rate": 1.2704402515723271e-05, "epoch": 0.9375 }, { "step": 2101, "timestamp": "2025-12-28T13:01:00.897935", "elapsed_time": 14575.183025836945, "loss": 0.2693, "grad_norm": 0.2882073223590851, "learning_rate": 1.2578616352201259e-05, "epoch": 0.938125 }, { "step": 2102, "timestamp": "2025-12-28T13:01:09.059749", "elapsed_time": 14583.344839334488, "loss": 0.076, "grad_norm": 0.09046997874975204, "learning_rate": 1.2452830188679246e-05, "epoch": 0.93875 }, { "step": 2103, "timestamp": "2025-12-28T13:01:13.882199", "elapsed_time": 14588.167289018631, "loss": 0.1864, "grad_norm": 0.18029439449310303, "learning_rate": 1.2327044025157234e-05, "epoch": 0.939375 }, { "step": 2104, "timestamp": "2025-12-28T13:01:26.361765", "elapsed_time": 14600.646855354309, "loss": 0.1504, "grad_norm": 0.09953170269727707, "learning_rate": 1.2201257861635221e-05, "epoch": 0.94 }, { "step": 2105, "timestamp": "2025-12-28T13:01:33.533065", "elapsed_time": 14607.818155527115, "loss": 0.1704, "grad_norm": 0.14062711596488953, "learning_rate": 1.2075471698113209e-05, "epoch": 0.940625 }, { "step": 2106, "timestamp": "2025-12-28T13:01:40.874776", "elapsed_time": 14615.159866333008, "loss": 0.3254, "grad_norm": 0.19320929050445557, "learning_rate": 1.1949685534591196e-05, "epoch": 0.94125 }, { "step": 2107, "timestamp": "2025-12-28T13:01:45.352972", "elapsed_time": 14619.63806271553, "loss": 0.3774, "grad_norm": 0.19888830184936523, "learning_rate": 1.1823899371069184e-05, "epoch": 0.941875 }, { "step": 2108, "timestamp": "2025-12-28T13:01:54.645732", "elapsed_time": 14628.930822610855, "loss": 0.1445, "grad_norm": 0.11683964729309082, "learning_rate": 1.169811320754717e-05, "epoch": 0.9425 }, { "step": 2109, "timestamp": "2025-12-28T13:02:12.993090", "elapsed_time": 14647.278180122375, "loss": 0.1005, "grad_norm": 0.08152023702859879, "learning_rate": 1.1572327044025157e-05, "epoch": 0.943125 }, { "step": 2110, "timestamp": "2025-12-28T13:02:20.465761", "elapsed_time": 14654.750851392746, "loss": 0.1488, "grad_norm": 0.1277417093515396, "learning_rate": 1.1446540880503145e-05, "epoch": 0.94375 }, { "step": 2111, "timestamp": "2025-12-28T13:02:26.990394", "elapsed_time": 14661.27548456192, "loss": 0.2444, "grad_norm": 0.49488744139671326, "learning_rate": 1.1320754716981132e-05, "epoch": 0.944375 }, { "step": 2112, "timestamp": "2025-12-28T13:02:36.374549", "elapsed_time": 14670.659639120102, "loss": 0.1365, "grad_norm": 0.11476011574268341, "learning_rate": 1.119496855345912e-05, "epoch": 0.945 }, { "step": 2113, "timestamp": "2025-12-28T13:02:45.111270", "elapsed_time": 14679.396360874176, "loss": 0.1309, "grad_norm": 0.11950299888849258, "learning_rate": 1.1069182389937107e-05, "epoch": 0.945625 }, { "step": 2114, "timestamp": "2025-12-28T13:02:51.591249", "elapsed_time": 14685.876343011856, "loss": 0.2217, "grad_norm": 0.16578605771064758, "learning_rate": 1.0943396226415095e-05, "epoch": 0.94625 }, { "step": 2115, "timestamp": "2025-12-28T13:03:00.576464", "elapsed_time": 14694.861553907394, "loss": 0.3675, "grad_norm": 0.16877128183841705, "learning_rate": 1.0817610062893083e-05, "epoch": 0.946875 }, { "step": 2116, "timestamp": "2025-12-28T13:03:18.311130", "elapsed_time": 14712.596220493317, "loss": 0.1257, "grad_norm": 0.08800987899303436, "learning_rate": 1.069182389937107e-05, "epoch": 0.9475 }, { "step": 2117, "timestamp": "2025-12-28T13:03:27.242185", "elapsed_time": 14721.527275800705, "loss": 0.1195, "grad_norm": 0.11122479289770126, "learning_rate": 1.0566037735849058e-05, "epoch": 0.948125 }, { "step": 2118, "timestamp": "2025-12-28T13:03:37.954344", "elapsed_time": 14732.239434480667, "loss": 0.1142, "grad_norm": 0.0907776802778244, "learning_rate": 1.0440251572327045e-05, "epoch": 0.94875 }, { "step": 2119, "timestamp": "2025-12-28T13:03:55.267284", "elapsed_time": 14749.552374362946, "loss": 0.0934, "grad_norm": 0.07522504776716232, "learning_rate": 1.0314465408805033e-05, "epoch": 0.949375 }, { "step": 2120, "timestamp": "2025-12-28T13:04:02.452456", "elapsed_time": 14756.73754644394, "loss": 0.1678, "grad_norm": 0.1397898644208908, "learning_rate": 1.018867924528302e-05, "epoch": 0.95 }, { "step": 2121, "timestamp": "2025-12-28T13:04:13.404991", "elapsed_time": 14767.690082073212, "loss": 0.1685, "grad_norm": 0.1206582635641098, "learning_rate": 1.0062893081761008e-05, "epoch": 0.950625 }, { "step": 2122, "timestamp": "2025-12-28T13:04:21.537710", "elapsed_time": 14775.822799921036, "loss": 0.4038, "grad_norm": 0.17276889085769653, "learning_rate": 9.937106918238995e-06, "epoch": 0.95125 }, { "step": 2123, "timestamp": "2025-12-28T13:04:29.361970", "elapsed_time": 14783.647060155869, "loss": 0.1303, "grad_norm": 0.12048753350973129, "learning_rate": 9.811320754716983e-06, "epoch": 0.951875 }, { "step": 2124, "timestamp": "2025-12-28T13:04:36.912129", "elapsed_time": 14791.197219133377, "loss": 0.1316, "grad_norm": 0.1113729476928711, "learning_rate": 9.685534591194969e-06, "epoch": 0.9525 }, { "step": 2125, "timestamp": "2025-12-28T13:04:42.123245", "elapsed_time": 14796.408335924149, "loss": 0.2985, "grad_norm": 0.1864505261182785, "learning_rate": 9.559748427672956e-06, "epoch": 0.953125 }, { "step": 2126, "timestamp": "2025-12-28T13:04:59.705245", "elapsed_time": 14813.990335941315, "loss": 0.1241, "grad_norm": 0.08124972134828568, "learning_rate": 9.433962264150944e-06, "epoch": 0.95375 }, { "step": 2127, "timestamp": "2025-12-28T13:05:06.621313", "elapsed_time": 14820.906403064728, "loss": 0.261, "grad_norm": 0.19667741656303406, "learning_rate": 9.30817610062893e-06, "epoch": 0.954375 }, { "step": 2128, "timestamp": "2025-12-28T13:05:18.124603", "elapsed_time": 14832.409697771072, "loss": 0.2653, "grad_norm": 0.12561364471912384, "learning_rate": 9.182389937106917e-06, "epoch": 0.955 }, { "step": 2129, "timestamp": "2025-12-28T13:05:22.177655", "elapsed_time": 14836.46274471283, "loss": 0.2647, "grad_norm": 0.21830357611179352, "learning_rate": 9.056603773584905e-06, "epoch": 0.955625 }, { "step": 2130, "timestamp": "2025-12-28T13:05:31.039317", "elapsed_time": 14845.324407577515, "loss": 0.3741, "grad_norm": 0.14887583255767822, "learning_rate": 8.930817610062892e-06, "epoch": 0.95625 }, { "step": 2131, "timestamp": "2025-12-28T13:05:42.990517", "elapsed_time": 14857.275606870651, "loss": 0.1244, "grad_norm": 0.11540309339761734, "learning_rate": 8.80503144654088e-06, "epoch": 0.956875 }, { "step": 2132, "timestamp": "2025-12-28T13:05:48.518364", "elapsed_time": 14862.803453683853, "loss": 0.2524, "grad_norm": 0.17561720311641693, "learning_rate": 8.679245283018868e-06, "epoch": 0.9575 }, { "step": 2133, "timestamp": "2025-12-28T13:05:54.626380", "elapsed_time": 14868.911471128464, "loss": 0.205, "grad_norm": 0.14403176307678223, "learning_rate": 8.553459119496855e-06, "epoch": 0.958125 }, { "step": 2134, "timestamp": "2025-12-28T13:06:03.304009", "elapsed_time": 14877.589098930359, "loss": 0.1677, "grad_norm": 0.1289132833480835, "learning_rate": 8.427672955974843e-06, "epoch": 0.95875 }, { "step": 2135, "timestamp": "2025-12-28T13:06:19.305306", "elapsed_time": 14893.590396642685, "loss": 0.1706, "grad_norm": 0.11218154430389404, "learning_rate": 8.30188679245283e-06, "epoch": 0.959375 }, { "step": 2136, "timestamp": "2025-12-28T13:06:34.028175", "elapsed_time": 14908.313265562057, "loss": 0.1002, "grad_norm": 0.09118315577507019, "learning_rate": 8.176100628930818e-06, "epoch": 0.96 }, { "step": 2137, "timestamp": "2025-12-28T13:06:44.744421", "elapsed_time": 14919.029515028, "loss": 0.1318, "grad_norm": 0.11322548240423203, "learning_rate": 8.050314465408805e-06, "epoch": 0.960625 }, { "step": 2138, "timestamp": "2025-12-28T13:07:05.626666", "elapsed_time": 14939.911756277084, "loss": 0.1157, "grad_norm": 0.07905202358961105, "learning_rate": 7.924528301886793e-06, "epoch": 0.96125 }, { "step": 2139, "timestamp": "2025-12-28T13:07:12.917071", "elapsed_time": 14947.202164649963, "loss": 0.193, "grad_norm": 0.14137513935565948, "learning_rate": 7.79874213836478e-06, "epoch": 0.961875 }, { "step": 2140, "timestamp": "2025-12-28T13:07:20.296717", "elapsed_time": 14954.581811666489, "loss": 0.1775, "grad_norm": 0.13806195557117462, "learning_rate": 7.672955974842768e-06, "epoch": 0.9625 }, { "step": 2141, "timestamp": "2025-12-28T13:07:33.172898", "elapsed_time": 14967.457988500595, "loss": 0.1039, "grad_norm": 0.09371381998062134, "learning_rate": 7.547169811320755e-06, "epoch": 0.963125 }, { "step": 2142, "timestamp": "2025-12-28T13:07:49.768333", "elapsed_time": 14984.053423166275, "loss": 0.1143, "grad_norm": 0.09071200340986252, "learning_rate": 7.421383647798742e-06, "epoch": 0.96375 }, { "step": 2143, "timestamp": "2025-12-28T13:08:00.029288", "elapsed_time": 14994.314378499985, "loss": 0.113, "grad_norm": 0.0975324884057045, "learning_rate": 7.29559748427673e-06, "epoch": 0.964375 }, { "step": 2144, "timestamp": "2025-12-28T13:08:07.277118", "elapsed_time": 15001.56220817566, "loss": 0.1588, "grad_norm": 0.1530754715204239, "learning_rate": 7.169811320754717e-06, "epoch": 0.965 }, { "step": 2145, "timestamp": "2025-12-28T13:08:11.788620", "elapsed_time": 15006.07371020317, "loss": 0.2424, "grad_norm": 0.17115284502506256, "learning_rate": 7.044025157232705e-06, "epoch": 0.965625 }, { "step": 2146, "timestamp": "2025-12-28T13:08:24.581324", "elapsed_time": 15018.866414546967, "loss": 0.0938, "grad_norm": 0.08514872193336487, "learning_rate": 6.918238993710692e-06, "epoch": 0.96625 }, { "step": 2147, "timestamp": "2025-12-28T13:08:35.277645", "elapsed_time": 15029.562735319138, "loss": 0.16, "grad_norm": 0.11149827390909195, "learning_rate": 6.792452830188679e-06, "epoch": 0.966875 }, { "step": 2148, "timestamp": "2025-12-28T13:08:44.659727", "elapsed_time": 15038.944817781448, "loss": 0.1451, "grad_norm": 0.1102396696805954, "learning_rate": 6.666666666666667e-06, "epoch": 0.9675 }, { "step": 2149, "timestamp": "2025-12-28T13:08:56.168291", "elapsed_time": 15050.453381538391, "loss": 0.1014, "grad_norm": 0.11420729011297226, "learning_rate": 6.540880503144654e-06, "epoch": 0.968125 }, { "step": 2150, "timestamp": "2025-12-28T13:09:05.395277", "elapsed_time": 15059.68036699295, "loss": 0.1707, "grad_norm": 0.13086120784282684, "learning_rate": 6.415094339622642e-06, "epoch": 0.96875 }, { "step": 2151, "timestamp": "2025-12-28T13:09:13.567665", "elapsed_time": 15067.852755784988, "loss": 0.1435, "grad_norm": 0.12494708597660065, "learning_rate": 6.289308176100629e-06, "epoch": 0.969375 }, { "step": 2152, "timestamp": "2025-12-28T13:09:22.026165", "elapsed_time": 15076.311259746552, "loss": 0.2231, "grad_norm": 0.15418364107608795, "learning_rate": 6.163522012578617e-06, "epoch": 0.97 }, { "step": 2153, "timestamp": "2025-12-28T13:09:28.234531", "elapsed_time": 15082.519621133804, "loss": 0.1911, "grad_norm": 0.15970326960086823, "learning_rate": 6.037735849056604e-06, "epoch": 0.970625 }, { "step": 2154, "timestamp": "2025-12-28T13:09:41.850657", "elapsed_time": 15096.135746717453, "loss": 0.1166, "grad_norm": 0.08712539076805115, "learning_rate": 5.911949685534592e-06, "epoch": 0.97125 }, { "step": 2155, "timestamp": "2025-12-28T13:09:49.022333", "elapsed_time": 15103.307422876358, "loss": 0.2125, "grad_norm": 0.15600165724754333, "learning_rate": 5.786163522012579e-06, "epoch": 0.971875 }, { "step": 2156, "timestamp": "2025-12-28T13:09:55.267040", "elapsed_time": 15109.552130699158, "loss": 0.1704, "grad_norm": 0.1566295623779297, "learning_rate": 5.660377358490566e-06, "epoch": 0.9725 }, { "step": 2157, "timestamp": "2025-12-28T13:10:07.050404", "elapsed_time": 15121.33549451828, "loss": 0.122, "grad_norm": 0.09626911580562592, "learning_rate": 5.534591194968554e-06, "epoch": 0.973125 }, { "step": 2158, "timestamp": "2025-12-28T13:10:13.971436", "elapsed_time": 15128.256526470184, "loss": 0.2565, "grad_norm": 0.14205630123615265, "learning_rate": 5.408805031446541e-06, "epoch": 0.97375 }, { "step": 2159, "timestamp": "2025-12-28T13:10:23.606334", "elapsed_time": 15137.891424417496, "loss": 0.1477, "grad_norm": 0.12025802582502365, "learning_rate": 5.283018867924529e-06, "epoch": 0.974375 }, { "step": 2160, "timestamp": "2025-12-28T13:10:38.587449", "elapsed_time": 15152.872539758682, "loss": 0.122, "grad_norm": 0.0904950201511383, "learning_rate": 5.157232704402516e-06, "epoch": 0.975 }, { "step": 2161, "timestamp": "2025-12-28T13:10:48.211282", "elapsed_time": 15162.496372699738, "loss": 0.1241, "grad_norm": 0.1013362929224968, "learning_rate": 5.031446540880504e-06, "epoch": 0.975625 }, { "step": 2162, "timestamp": "2025-12-28T13:11:00.090939", "elapsed_time": 15174.376032590866, "loss": 0.3714, "grad_norm": 0.14272800087928772, "learning_rate": 4.9056603773584915e-06, "epoch": 0.97625 }, { "step": 2163, "timestamp": "2025-12-28T13:11:03.768510", "elapsed_time": 15178.05360007286, "loss": 0.3031, "grad_norm": 0.21242351830005646, "learning_rate": 4.779874213836478e-06, "epoch": 0.976875 }, { "step": 2164, "timestamp": "2025-12-28T13:11:15.372898", "elapsed_time": 15189.65798830986, "loss": 0.1319, "grad_norm": 0.10468777269124985, "learning_rate": 4.654088050314465e-06, "epoch": 0.9775 }, { "step": 2165, "timestamp": "2025-12-28T13:11:22.360971", "elapsed_time": 15196.64606142044, "loss": 0.2542, "grad_norm": 0.1459261178970337, "learning_rate": 4.5283018867924524e-06, "epoch": 0.978125 }, { "step": 2166, "timestamp": "2025-12-28T13:11:27.698671", "elapsed_time": 15201.983761548996, "loss": 0.1664, "grad_norm": 0.16032001376152039, "learning_rate": 4.40251572327044e-06, "epoch": 0.97875 }, { "step": 2167, "timestamp": "2025-12-28T13:11:35.312824", "elapsed_time": 15209.597914457321, "loss": 0.14, "grad_norm": 0.12672804296016693, "learning_rate": 4.2767295597484275e-06, "epoch": 0.979375 }, { "step": 2168, "timestamp": "2025-12-28T13:11:42.449067", "elapsed_time": 15216.734157085419, "loss": 0.1409, "grad_norm": 0.21911774575710297, "learning_rate": 4.150943396226415e-06, "epoch": 0.98 }, { "step": 2169, "timestamp": "2025-12-28T13:11:52.968471", "elapsed_time": 15227.253565788269, "loss": 0.1652, "grad_norm": 0.10740287601947784, "learning_rate": 4.025157232704403e-06, "epoch": 0.980625 }, { "step": 2170, "timestamp": "2025-12-28T13:12:14.028363", "elapsed_time": 15248.31345319748, "loss": 0.0946, "grad_norm": 0.107594795525074, "learning_rate": 3.89937106918239e-06, "epoch": 0.98125 }, { "step": 2171, "timestamp": "2025-12-28T13:12:26.958057", "elapsed_time": 15261.243147850037, "loss": 0.1402, "grad_norm": 0.09502803534269333, "learning_rate": 3.7735849056603773e-06, "epoch": 0.981875 }, { "step": 2172, "timestamp": "2025-12-28T13:12:41.825213", "elapsed_time": 15276.110303640366, "loss": 0.1656, "grad_norm": 0.10404454171657562, "learning_rate": 3.647798742138365e-06, "epoch": 0.9825 }, { "step": 2173, "timestamp": "2025-12-28T13:12:54.237611", "elapsed_time": 15288.522701501846, "loss": 0.1088, "grad_norm": 0.09469737857580185, "learning_rate": 3.5220125786163524e-06, "epoch": 0.983125 }, { "step": 2174, "timestamp": "2025-12-28T13:13:14.320573", "elapsed_time": 15308.60566353798, "loss": 0.1108, "grad_norm": 0.08538592606782913, "learning_rate": 3.3962264150943395e-06, "epoch": 0.98375 }, { "step": 2175, "timestamp": "2025-12-28T13:13:31.983656", "elapsed_time": 15326.2687458992, "loss": 0.113, "grad_norm": 0.07896923273801804, "learning_rate": 3.270440251572327e-06, "epoch": 0.984375 }, { "step": 2176, "timestamp": "2025-12-28T13:13:36.117474", "elapsed_time": 15330.402564287186, "loss": 0.2856, "grad_norm": 0.20000196993350983, "learning_rate": 3.1446540880503146e-06, "epoch": 0.985 }, { "step": 2177, "timestamp": "2025-12-28T13:13:40.749900", "elapsed_time": 15335.03499007225, "loss": 0.2019, "grad_norm": 0.17783957719802856, "learning_rate": 3.018867924528302e-06, "epoch": 0.985625 }, { "step": 2178, "timestamp": "2025-12-28T13:13:50.081194", "elapsed_time": 15344.366284370422, "loss": 0.124, "grad_norm": 0.10862841457128525, "learning_rate": 2.8930817610062893e-06, "epoch": 0.98625 }, { "step": 2179, "timestamp": "2025-12-28T13:13:57.182561", "elapsed_time": 15351.467651605606, "loss": 0.3846, "grad_norm": 0.1680181324481964, "learning_rate": 2.767295597484277e-06, "epoch": 0.986875 }, { "step": 2180, "timestamp": "2025-12-28T13:14:18.069949", "elapsed_time": 15372.355040073395, "loss": 0.1295, "grad_norm": 0.10930982232093811, "learning_rate": 2.6415094339622644e-06, "epoch": 0.9875 }, { "step": 2181, "timestamp": "2025-12-28T13:14:25.840440", "elapsed_time": 15380.125534534454, "loss": 0.1287, "grad_norm": 0.11625991761684418, "learning_rate": 2.515723270440252e-06, "epoch": 0.988125 }, { "step": 2182, "timestamp": "2025-12-28T13:14:32.900696", "elapsed_time": 15387.185786247253, "loss": 0.2032, "grad_norm": 0.13967153429985046, "learning_rate": 2.389937106918239e-06, "epoch": 0.98875 }, { "step": 2183, "timestamp": "2025-12-28T13:14:38.813341", "elapsed_time": 15393.098430871964, "loss": 0.1665, "grad_norm": 0.16941803693771362, "learning_rate": 2.2641509433962262e-06, "epoch": 0.989375 }, { "step": 2184, "timestamp": "2025-12-28T13:14:48.082713", "elapsed_time": 15402.36780333519, "loss": 0.1239, "grad_norm": 0.1080615222454071, "learning_rate": 2.1383647798742138e-06, "epoch": 0.99 }, { "step": 2185, "timestamp": "2025-12-28T13:14:59.031848", "elapsed_time": 15413.316938877106, "loss": 0.1657, "grad_norm": 0.1154465302824974, "learning_rate": 2.0125786163522013e-06, "epoch": 0.990625 }, { "step": 2186, "timestamp": "2025-12-28T13:15:09.288405", "elapsed_time": 15423.573495388031, "loss": 0.1329, "grad_norm": 0.10635515302419662, "learning_rate": 1.8867924528301887e-06, "epoch": 0.99125 }, { "step": 2187, "timestamp": "2025-12-28T13:15:19.358530", "elapsed_time": 15433.64362025261, "loss": 0.1496, "grad_norm": 0.10568325221538544, "learning_rate": 1.7610062893081762e-06, "epoch": 0.991875 }, { "step": 2188, "timestamp": "2025-12-28T13:15:27.696443", "elapsed_time": 15441.981533288956, "loss": 0.1668, "grad_norm": 0.11821103096008301, "learning_rate": 1.6352201257861635e-06, "epoch": 0.9925 }, { "step": 2189, "timestamp": "2025-12-28T13:15:32.319517", "elapsed_time": 15446.60460782051, "loss": 0.2601, "grad_norm": 0.18787573277950287, "learning_rate": 1.509433962264151e-06, "epoch": 0.993125 }, { "step": 2190, "timestamp": "2025-12-28T13:15:42.782764", "elapsed_time": 15457.067858695984, "loss": 0.1287, "grad_norm": 0.11284953355789185, "learning_rate": 1.3836477987421384e-06, "epoch": 0.99375 }, { "step": 2191, "timestamp": "2025-12-28T13:15:48.893842", "elapsed_time": 15463.178936243057, "loss": 0.1804, "grad_norm": 0.15460120141506195, "learning_rate": 1.257861635220126e-06, "epoch": 0.994375 }, { "step": 2192, "timestamp": "2025-12-28T13:15:56.849546", "elapsed_time": 15471.134636163712, "loss": 0.166, "grad_norm": 0.13522419333457947, "learning_rate": 1.1320754716981131e-06, "epoch": 0.995 }, { "step": 2193, "timestamp": "2025-12-28T13:16:09.481063", "elapsed_time": 15483.766153335571, "loss": 0.1317, "grad_norm": 0.11077344417572021, "learning_rate": 1.0062893081761007e-06, "epoch": 0.995625 }, { "step": 2194, "timestamp": "2025-12-28T13:16:17.328514", "elapsed_time": 15491.613604068756, "loss": 0.4237, "grad_norm": 0.1749534010887146, "learning_rate": 8.805031446540881e-07, "epoch": 0.99625 }, { "step": 2195, "timestamp": "2025-12-28T13:16:23.345606", "elapsed_time": 15497.63069653511, "loss": 0.1748, "grad_norm": 0.1672588586807251, "learning_rate": 7.547169811320755e-07, "epoch": 0.996875 }, { "step": 2196, "timestamp": "2025-12-28T13:16:31.823739", "elapsed_time": 15506.108829021454, "loss": 0.2156, "grad_norm": 0.1420753449201584, "learning_rate": 6.28930817610063e-07, "epoch": 0.9975 }, { "step": 2197, "timestamp": "2025-12-28T13:16:40.998256", "elapsed_time": 15515.283346652985, "loss": 0.1277, "grad_norm": 0.10503512620925903, "learning_rate": 5.031446540880503e-07, "epoch": 0.998125 }, { "step": 2198, "timestamp": "2025-12-28T13:16:49.844562", "elapsed_time": 15524.12965297699, "loss": 0.1375, "grad_norm": 0.1124568060040474, "learning_rate": 3.773584905660378e-07, "epoch": 0.99875 }, { "step": 2199, "timestamp": "2025-12-28T13:16:58.714665", "elapsed_time": 15532.9997549057, "loss": 0.1504, "grad_norm": 0.11710984259843826, "learning_rate": 2.5157232704402517e-07, "epoch": 0.999375 }, { "step": 2200, "timestamp": "2025-12-28T13:17:05.804184", "elapsed_time": 15540.089273929596, "loss": 0.1503, "grad_norm": 0.12923410534858704, "learning_rate": 1.2578616352201258e-07, "epoch": 1.0 }, { "step": 2200, "timestamp": "2025-12-28T13:17:06.653219", "elapsed_time": 15540.938311100006, "train_runtime": 15528.6205, "train_samples_per_second": 0.103, "train_steps_per_second": 0.103, "total_flos": 5.392201841787095e+17, "train_loss": 0.19463951266836374, "epoch": 1.0 } ], "loss_summary": { "min": 0.0733, "max": 0.7795, "final": 0.1503, "average": 0.1946379375 }, "grad_norm_summary": { "min": 0.06062662601470947, "max": 2.4628686904907227, "final": 0.12923410534858704, "average": 0.14187599109020085 } }