{ "best_global_step": 800, "best_metric": 0.2455482929944992, "best_model_checkpoint": "./checkpoints/qwen253-lora-leduc_random_l_s3/checkpoint-800", "epoch": 1.0, "eval_steps": 200, "global_step": 826, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.012106537530266344, "grad_norm": 10.681763648986816, "learning_rate": 8.999999999999999e-06, "loss": 0.828, "mean_token_accuracy": 0.8087970525026321, "num_tokens": 158075.0, "step": 10 }, { "epoch": 0.024213075060532687, "grad_norm": 0.8575407266616821, "learning_rate": 1.8999999999999998e-05, "loss": 0.3278, "mean_token_accuracy": 0.8451847195625305, "num_tokens": 314832.0, "step": 20 }, { "epoch": 0.03631961259079903, "grad_norm": 0.38625678420066833, "learning_rate": 2.9e-05, "loss": 0.2427, "mean_token_accuracy": 0.8527996808290481, "num_tokens": 473401.0, "step": 30 }, { "epoch": 0.048426150121065374, "grad_norm": 0.25771161913871765, "learning_rate": 3.4997791661317485e-05, "loss": 0.2284, "mean_token_accuracy": 0.8509193986654282, "num_tokens": 630871.0, "step": 40 }, { "epoch": 0.06053268765133172, "grad_norm": 0.21924710273742676, "learning_rate": 3.497295425144213e-05, "loss": 0.2428, "mean_token_accuracy": 0.8498442590236663, "num_tokens": 782593.0, "step": 50 }, { "epoch": 0.07263922518159806, "grad_norm": 0.172270268201828, "learning_rate": 3.4920558312793984e-05, "loss": 0.2396, "mean_token_accuracy": 0.844213005900383, "num_tokens": 938469.0, "step": 60 }, { "epoch": 0.0847457627118644, "grad_norm": 0.23005615174770355, "learning_rate": 3.4840686484803226e-05, "loss": 0.2336, "mean_token_accuracy": 0.8490692973136902, "num_tokens": 1093154.0, "step": 70 }, { "epoch": 0.09685230024213075, "grad_norm": 0.2150808423757553, "learning_rate": 3.473346474216413e-05, "loss": 0.2308, "mean_token_accuracy": 0.8609368681907654, "num_tokens": 1249444.0, "step": 80 }, { "epoch": 0.1089588377723971, "grad_norm": 0.15730148553848267, "learning_rate": 3.459906219614643e-05, "loss": 0.2424, "mean_token_accuracy": 0.8448190927505493, "num_tokens": 1405823.0, "step": 90 }, { "epoch": 0.12106537530266344, "grad_norm": 0.4564799964427948, "learning_rate": 3.4437690827871256e-05, "loss": 0.2422, "mean_token_accuracy": 0.8474129974842072, "num_tokens": 1559563.0, "step": 100 }, { "epoch": 0.13317191283292978, "grad_norm": 0.1691020429134369, "learning_rate": 3.424960515397224e-05, "loss": 0.2426, "mean_token_accuracy": 0.8417032897472382, "num_tokens": 1714717.0, "step": 110 }, { "epoch": 0.14527845036319612, "grad_norm": 0.13242730498313904, "learning_rate": 3.403510182516918e-05, "loss": 0.2401, "mean_token_accuracy": 0.8526080518960952, "num_tokens": 1869540.0, "step": 120 }, { "epoch": 0.15738498789346247, "grad_norm": 0.16052637994289398, "learning_rate": 3.379451915838742e-05, "loss": 0.2442, "mean_token_accuracy": 0.8375077575445176, "num_tokens": 2024145.0, "step": 130 }, { "epoch": 0.1694915254237288, "grad_norm": 0.1499997228384018, "learning_rate": 3.352823660316074e-05, "loss": 0.2346, "mean_token_accuracy": 0.845609164237976, "num_tokens": 2180735.0, "step": 140 }, { "epoch": 0.18159806295399517, "grad_norm": 0.10429004579782486, "learning_rate": 3.323667414315959e-05, "loss": 0.2419, "mean_token_accuracy": 0.8553953766822815, "num_tokens": 2335148.0, "step": 150 }, { "epoch": 0.1937046004842615, "grad_norm": 0.08566914498806, "learning_rate": 3.292029163378833e-05, "loss": 0.2357, "mean_token_accuracy": 0.8465773612260818, "num_tokens": 2492016.0, "step": 160 }, { "epoch": 0.20581113801452786, "grad_norm": 0.14463454484939575, "learning_rate": 3.2579588076896486e-05, "loss": 0.2314, "mean_token_accuracy": 0.8503528028726578, "num_tokens": 2647377.0, "step": 170 }, { "epoch": 0.2179176755447942, "grad_norm": 0.12798088788986206, "learning_rate": 3.221510083374765e-05, "loss": 0.2333, "mean_token_accuracy": 0.8497181862592698, "num_tokens": 2803931.0, "step": 180 }, { "epoch": 0.23002421307506055, "grad_norm": 0.17899736762046814, "learning_rate": 3.182740477748768e-05, "loss": 0.2358, "mean_token_accuracy": 0.8438972860574723, "num_tokens": 2956080.0, "step": 190 }, { "epoch": 0.24213075060532688, "grad_norm": 0.15043821930885315, "learning_rate": 3.1417111386448595e-05, "loss": 0.2366, "mean_token_accuracy": 0.8498786896467209, "num_tokens": 3111180.0, "step": 200 }, { "epoch": 0.24213075060532688, "eval_loss": 0.24580417573451996, "eval_num_tokens": 3111180.0, "eval_runtime": 27.1445, "eval_samples_per_second": 9.836, "eval_steps_per_second": 9.836, "step": 200 }, { "epoch": 0.2542372881355932, "grad_norm": 0.12326110154390335, "learning_rate": 3.098486777971855e-05, "loss": 0.2277, "mean_token_accuracy": 0.8524766951799393, "num_tokens": 3269249.0, "step": 210 }, { "epoch": 0.26634382566585957, "grad_norm": 0.0829845741391182, "learning_rate": 3.053135569649868e-05, "loss": 0.2419, "mean_token_accuracy": 0.843473681807518, "num_tokens": 3424471.0, "step": 220 }, { "epoch": 0.2784503631961259, "grad_norm": 0.13634343445301056, "learning_rate": 3.005729042085683e-05, "loss": 0.2383, "mean_token_accuracy": 0.8487411588430405, "num_tokens": 3579004.0, "step": 230 }, { "epoch": 0.29055690072639223, "grad_norm": 0.09264083206653595, "learning_rate": 2.956341965357393e-05, "loss": 0.236, "mean_token_accuracy": 0.8531801581382752, "num_tokens": 3734168.0, "step": 240 }, { "epoch": 0.3026634382566586, "grad_norm": 0.09290221333503723, "learning_rate": 2.9050522332862385e-05, "loss": 0.2369, "mean_token_accuracy": 0.8493932217359543, "num_tokens": 3888227.0, "step": 250 }, { "epoch": 0.31476997578692495, "grad_norm": 0.08292774111032486, "learning_rate": 2.8519407405816493e-05, "loss": 0.2313, "mean_token_accuracy": 0.851080346107483, "num_tokens": 4046278.0, "step": 260 }, { "epoch": 0.3268765133171913, "grad_norm": 0.1620582491159439, "learning_rate": 2.797091255253247e-05, "loss": 0.2379, "mean_token_accuracy": 0.8395844340324402, "num_tokens": 4200203.0, "step": 270 }, { "epoch": 0.3389830508474576, "grad_norm": 0.12099113315343857, "learning_rate": 2.7405902864910543e-05, "loss": 0.2364, "mean_token_accuracy": 0.8551326721906662, "num_tokens": 4355292.0, "step": 280 }, { "epoch": 0.35108958837772397, "grad_norm": 0.12489405274391174, "learning_rate": 2.6825269482222827e-05, "loss": 0.2354, "mean_token_accuracy": 0.8442697525024414, "num_tokens": 4510258.0, "step": 290 }, { "epoch": 0.36319612590799033, "grad_norm": 0.10075319558382034, "learning_rate": 2.6229928185598994e-05, "loss": 0.2333, "mean_token_accuracy": 0.8536905407905578, "num_tokens": 4664788.0, "step": 300 }, { "epoch": 0.37530266343825663, "grad_norm": 0.11994941532611847, "learning_rate": 2.5620817953646596e-05, "loss": 0.2323, "mean_token_accuracy": 0.8539896428585052, "num_tokens": 4821986.0, "step": 310 }, { "epoch": 0.387409200968523, "grad_norm": 0.12077498435974121, "learning_rate": 2.4998899481484006e-05, "loss": 0.2399, "mean_token_accuracy": 0.8509245574474334, "num_tokens": 4978102.0, "step": 320 }, { "epoch": 0.39951573849878935, "grad_norm": 0.1378944218158722, "learning_rate": 2.4365153665521915e-05, "loss": 0.233, "mean_token_accuracy": 0.8478419154882431, "num_tokens": 5134005.0, "step": 330 }, { "epoch": 0.4116222760290557, "grad_norm": 0.15924955904483795, "learning_rate": 2.3720580056383107e-05, "loss": 0.2244, "mean_token_accuracy": 0.8621924012899399, "num_tokens": 5290764.0, "step": 340 }, { "epoch": 0.423728813559322, "grad_norm": 0.1484508514404297, "learning_rate": 2.30661952824006e-05, "loss": 0.2266, "mean_token_accuracy": 0.8586694985628128, "num_tokens": 5447775.0, "step": 350 }, { "epoch": 0.4358353510895884, "grad_norm": 0.18554432690143585, "learning_rate": 2.2403031446180677e-05, "loss": 0.2269, "mean_token_accuracy": 0.8663704991340637, "num_tokens": 5605311.0, "step": 360 }, { "epoch": 0.44794188861985473, "grad_norm": 0.2061612904071808, "learning_rate": 2.1732134496759685e-05, "loss": 0.2293, "mean_token_accuracy": 0.8527790486812592, "num_tokens": 5763991.0, "step": 370 }, { "epoch": 0.4600484261501211, "grad_norm": 0.21663211286067963, "learning_rate": 2.1054562579922147e-05, "loss": 0.2384, "mean_token_accuracy": 0.8578897565603256, "num_tokens": 5918372.0, "step": 380 }, { "epoch": 0.4721549636803874, "grad_norm": 0.16164663434028625, "learning_rate": 2.0371384369281973e-05, "loss": 0.2321, "mean_token_accuracy": 0.8527203172445297, "num_tokens": 6076662.0, "step": 390 }, { "epoch": 0.48426150121065376, "grad_norm": 0.14720699191093445, "learning_rate": 1.968367738075915e-05, "loss": 0.223, "mean_token_accuracy": 0.8647637069225311, "num_tokens": 6233988.0, "step": 400 }, { "epoch": 0.48426150121065376, "eval_loss": 0.24974025785923004, "eval_num_tokens": 6233988.0, "eval_runtime": 26.7143, "eval_samples_per_second": 9.995, "eval_steps_per_second": 9.995, "step": 400 }, { "epoch": 0.4963680387409201, "grad_norm": 0.11780782788991928, "learning_rate": 1.899252627311015e-05, "loss": 0.2288, "mean_token_accuracy": 0.853996068239212, "num_tokens": 6391251.0, "step": 410 }, { "epoch": 0.5084745762711864, "grad_norm": 0.14394888281822205, "learning_rate": 1.8299021137192683e-05, "loss": 0.237, "mean_token_accuracy": 0.8532957583665848, "num_tokens": 6544551.0, "step": 420 }, { "epoch": 0.5205811138014528, "grad_norm": 0.4107162356376648, "learning_rate": 1.760425577666279e-05, "loss": 0.2294, "mean_token_accuracy": 0.8468001574277878, "num_tokens": 6702345.0, "step": 430 }, { "epoch": 0.5326876513317191, "grad_norm": 0.12140627950429916, "learning_rate": 1.6909325982816146e-05, "loss": 0.2268, "mean_token_accuracy": 0.8571277797222138, "num_tokens": 6857256.0, "step": 440 }, { "epoch": 0.5447941888619855, "grad_norm": 0.13262014091014862, "learning_rate": 1.6215327806294417e-05, "loss": 0.228, "mean_token_accuracy": 0.8527298241853714, "num_tokens": 7013503.0, "step": 450 }, { "epoch": 0.5569007263922519, "grad_norm": 0.18789087235927582, "learning_rate": 1.552335582838251e-05, "loss": 0.2317, "mean_token_accuracy": 0.85929856300354, "num_tokens": 7167382.0, "step": 460 }, { "epoch": 0.5690072639225182, "grad_norm": 0.1549673080444336, "learning_rate": 1.4834501434623413e-05, "loss": 0.2386, "mean_token_accuracy": 0.8493269443511963, "num_tokens": 7322223.0, "step": 470 }, { "epoch": 0.5811138014527845, "grad_norm": 0.28990328311920166, "learning_rate": 1.4149851093473319e-05, "loss": 0.2261, "mean_token_accuracy": 0.8549934804439545, "num_tokens": 7477291.0, "step": 480 }, { "epoch": 0.5932203389830508, "grad_norm": 0.34836694598197937, "learning_rate": 1.3470484642712053e-05, "loss": 0.2391, "mean_token_accuracy": 0.8534150063991547, "num_tokens": 7631181.0, "step": 490 }, { "epoch": 0.6053268765133172, "grad_norm": 0.08868297189474106, "learning_rate": 1.2797473586311476e-05, "loss": 0.235, "mean_token_accuracy": 0.8497831732034683, "num_tokens": 7786171.0, "step": 500 }, { "epoch": 0.6174334140435835, "grad_norm": 0.11514752358198166, "learning_rate": 1.2131879404448057e-05, "loss": 0.2331, "mean_token_accuracy": 0.8469379067420959, "num_tokens": 7941159.0, "step": 510 }, { "epoch": 0.6295399515738499, "grad_norm": 0.22596512734889984, "learning_rate": 1.1474751879325075e-05, "loss": 0.2374, "mean_token_accuracy": 0.8513785660266876, "num_tokens": 8095202.0, "step": 520 }, { "epoch": 0.6416464891041163, "grad_norm": 0.19439919292926788, "learning_rate": 1.0827127439444991e-05, "loss": 0.2318, "mean_token_accuracy": 0.8583654165267944, "num_tokens": 8250634.0, "step": 530 }, { "epoch": 0.6537530266343826, "grad_norm": 0.16050127148628235, "learning_rate": 1.0190027524943444e-05, "loss": 0.2247, "mean_token_accuracy": 0.8635302782058716, "num_tokens": 8408664.0, "step": 540 }, { "epoch": 0.6658595641646489, "grad_norm": 0.19398577511310577, "learning_rate": 9.564456976562993e-06, "loss": 0.2359, "mean_token_accuracy": 0.8510926723480224, "num_tokens": 8563164.0, "step": 550 }, { "epoch": 0.6779661016949152, "grad_norm": 0.16695190966129303, "learning_rate": 8.951402450807686e-06, "loss": 0.2256, "mean_token_accuracy": 0.8567656666040421, "num_tokens": 8717818.0, "step": 560 }, { "epoch": 0.6900726392251816, "grad_norm": 0.14462164044380188, "learning_rate": 8.35183086377792e-06, "loss": 0.2302, "mean_token_accuracy": 0.8581649184226989, "num_tokens": 8872048.0, "step": 570 }, { "epoch": 0.7021791767554479, "grad_norm": 0.20691347122192383, "learning_rate": 7.766687866140133e-06, "loss": 0.234, "mean_token_accuracy": 0.856579378247261, "num_tokens": 9027749.0, "step": 580 }, { "epoch": 0.7142857142857143, "grad_norm": 0.23428326845169067, "learning_rate": 7.196896351636536e-06, "loss": 0.2305, "mean_token_accuracy": 0.8628283053636551, "num_tokens": 9186566.0, "step": 590 }, { "epoch": 0.7263922518159807, "grad_norm": 0.16487430036067963, "learning_rate": 6.643355001487321e-06, "loss": 0.2298, "mean_token_accuracy": 0.8545309662818908, "num_tokens": 9343040.0, "step": 600 }, { "epoch": 0.7263922518159807, "eval_loss": 0.24567341804504395, "eval_num_tokens": 9343040.0, "eval_runtime": 26.5992, "eval_samples_per_second": 10.038, "eval_steps_per_second": 10.038, "step": 600 }, { "epoch": 0.738498789346247, "grad_norm": 0.1282430738210678, "learning_rate": 6.106936866981081e-06, "loss": 0.2249, "mean_token_accuracy": 0.8606575727462769, "num_tokens": 9499657.0, "step": 610 }, { "epoch": 0.7506053268765133, "grad_norm": 0.13524822890758514, "learning_rate": 5.588487992489113e-06, "loss": 0.2259, "mean_token_accuracy": 0.865332567691803, "num_tokens": 9654173.0, "step": 620 }, { "epoch": 0.7627118644067796, "grad_norm": 0.1036379262804985, "learning_rate": 5.088826081075191e-06, "loss": 0.2296, "mean_token_accuracy": 0.8487064689397812, "num_tokens": 9809053.0, "step": 630 }, { "epoch": 0.774818401937046, "grad_norm": 0.20920903980731964, "learning_rate": 4.6087392048056934e-06, "loss": 0.2363, "mean_token_accuracy": 0.8600066721439361, "num_tokens": 9965376.0, "step": 640 }, { "epoch": 0.7869249394673123, "grad_norm": 0.2146104872226715, "learning_rate": 4.148984561793913e-06, "loss": 0.2303, "mean_token_accuracy": 0.8529395699501038, "num_tokens": 10119911.0, "step": 650 }, { "epoch": 0.7990314769975787, "grad_norm": 0.15396490693092346, "learning_rate": 3.7102872819392174e-06, "loss": 0.2298, "mean_token_accuracy": 0.8552716702222825, "num_tokens": 10277701.0, "step": 660 }, { "epoch": 0.8111380145278451, "grad_norm": 0.15627269446849823, "learning_rate": 3.2933392832444513e-06, "loss": 0.2277, "mean_token_accuracy": 0.8558280795812607, "num_tokens": 10434155.0, "step": 670 }, { "epoch": 0.8232445520581114, "grad_norm": 0.1806221306324005, "learning_rate": 2.898798180515523e-06, "loss": 0.2316, "mean_token_accuracy": 0.8524704337120056, "num_tokens": 10592288.0, "step": 680 }, { "epoch": 0.8353510895883777, "grad_norm": 0.15548691153526306, "learning_rate": 2.527286248164371e-06, "loss": 0.2343, "mean_token_accuracy": 0.8490294456481934, "num_tokens": 10748051.0, "step": 690 }, { "epoch": 0.847457627118644, "grad_norm": 0.21228361129760742, "learning_rate": 2.179389438751151e-06, "loss": 0.2274, "mean_token_accuracy": 0.8586687803268432, "num_tokens": 10903879.0, "step": 700 }, { "epoch": 0.8595641646489104, "grad_norm": 0.1210569515824318, "learning_rate": 1.8556564588136477e-06, "loss": 0.2272, "mean_token_accuracy": 0.8536923497915268, "num_tokens": 11058830.0, "step": 710 }, { "epoch": 0.8716707021791767, "grad_norm": 0.12513045966625214, "learning_rate": 1.556597903441502e-06, "loss": 0.2322, "mean_token_accuracy": 0.8645375669002533, "num_tokens": 11214742.0, "step": 720 }, { "epoch": 0.8837772397094431, "grad_norm": 0.12597453594207764, "learning_rate": 1.2826854509602204e-06, "loss": 0.2257, "mean_token_accuracy": 0.8663272529840469, "num_tokens": 11371761.0, "step": 730 }, { "epoch": 0.8958837772397095, "grad_norm": 0.13700132071971893, "learning_rate": 1.0343511189951156e-06, "loss": 0.2226, "mean_token_accuracy": 0.862814399600029, "num_tokens": 11526770.0, "step": 740 }, { "epoch": 0.9079903147699758, "grad_norm": 0.14355961978435516, "learning_rate": 8.119865830885323e-07, "loss": 0.2285, "mean_token_accuracy": 0.8575877249240875, "num_tokens": 11683565.0, "step": 750 }, { "epoch": 0.9200968523002422, "grad_norm": 0.17410188913345337, "learning_rate": 6.159425589450137e-07, "loss": 0.2331, "mean_token_accuracy": 0.8473503857851028, "num_tokens": 11840016.0, "step": 760 }, { "epoch": 0.9322033898305084, "grad_norm": 0.10479779541492462, "learning_rate": 4.4652824927878805e-07, "loss": 0.2323, "mean_token_accuracy": 0.8583548158407212, "num_tokens": 11993687.0, "step": 770 }, { "epoch": 0.9443099273607748, "grad_norm": 0.25644639134407043, "learning_rate": 3.040108561359608e-07, "loss": 0.2262, "mean_token_accuracy": 0.8624204069375991, "num_tokens": 12149690.0, "step": 780 }, { "epoch": 0.9564164648910412, "grad_norm": 0.10093547403812408, "learning_rate": 1.8861515946060807e-07, "loss": 0.2327, "mean_token_accuracy": 0.8606836467981338, "num_tokens": 12305420.0, "step": 790 }, { "epoch": 0.9685230024213075, "grad_norm": 0.21478745341300964, "learning_rate": 1.0052316256947606e-07, "loss": 0.2312, "mean_token_accuracy": 0.8556828409433365, "num_tokens": 12461595.0, "step": 800 }, { "epoch": 0.9685230024213075, "eval_loss": 0.2455482929944992, "eval_num_tokens": 12461595.0, "eval_runtime": 26.6485, "eval_samples_per_second": 10.019, "eval_steps_per_second": 10.019, "step": 800 }, { "epoch": 0.9806295399515739, "grad_norm": 0.13691502809524536, "learning_rate": 3.987380509441307e-08, "loss": 0.2252, "mean_token_accuracy": 0.852449357509613, "num_tokens": 12617715.0, "step": 810 }, { "epoch": 0.9927360774818402, "grad_norm": 0.12197011709213257, "learning_rate": 6.76274384530412e-09, "loss": 0.2263, "mean_token_accuracy": 0.8597381263971329, "num_tokens": 12775223.0, "step": 820 } ], "logging_steps": 10, "max_steps": 826, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 200, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2.1877783380167885e+17, "train_batch_size": 16, "trial_name": null, "trial_params": null }