| { |
| "best_global_step": 2250, |
| "best_metric": 0.18876151740550995, |
| "best_model_checkpoint": "/kaggle/working/obsidian_critic_qwen35_t4x2_unsloth/runs/obsidian_critic_full_epoch/checkpoint-2250", |
| "epoch": 1.0, |
| "eval_steps": 125, |
| "global_step": 2256, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.0004434343994235353, |
| "grad_norm": 0.21817488968372345, |
| "last_batch_tokens": 257, |
| "learning_rate": 0.0, |
| "loss": 2.5221590995788574, |
| "lr": 2e-05, |
| "step": 1, |
| "tokens_per_second": 27.955696254559246, |
| "tokens_per_step": 1560.0, |
| "total_tokens_seen": 1560 |
| }, |
| { |
| "epoch": 0.022171719971176763, |
| "grad_norm": 0.49327757954597473, |
| "last_batch_tokens": 229, |
| "learning_rate": 9.990575514806563e-05, |
| "loss": 1.981216119260204, |
| "lr": 9.990142403513012e-05, |
| "step": 50, |
| "tokens_per_second": 76.9470637679645, |
| "tokens_per_step": 1551.7, |
| "total_tokens_seen": 77585 |
| }, |
| { |
| "epoch": 0.04434343994235353, |
| "grad_norm": 0.8501848578453064, |
| "last_batch_tokens": 193, |
| "learning_rate": 9.957034339013742e-05, |
| "loss": 1.1713996887207032, |
| "lr": 9.956116660116155e-05, |
| "step": 100, |
| "tokens_per_second": 86.53475088010235, |
| "tokens_per_step": 1572.7, |
| "total_tokens_seen": 157270 |
| }, |
| { |
| "epoch": 0.05542929992794191, |
| "eval_loss": 0.9524237513542175, |
| "eval_runtime": 104.4026, |
| "eval_samples_per_second": 3.477, |
| "eval_steps_per_second": 1.743, |
| "last_batch_tokens": 172, |
| "lr": 9.930042238269485e-05, |
| "step": 125, |
| "tokens_per_second": 133.98724044386626, |
| "tokens_per_step": 1853.72, |
| "total_tokens_seen": 231715 |
| }, |
| { |
| "epoch": 0.0665151599135303, |
| "grad_norm": 0.892642617225647, |
| "last_batch_tokens": 59, |
| "learning_rate": 9.899364434012273e-05, |
| "loss": 0.8618771362304688, |
| "lr": 9.897966654380171e-05, |
| "step": 150, |
| "tokens_per_second": 78.95926927075263, |
| "tokens_per_step": 1780.0866666666666, |
| "total_tokens_seen": 267013 |
| }, |
| { |
| "epoch": 0.08868687988470705, |
| "grad_norm": 0.8177722692489624, |
| "last_batch_tokens": 275, |
| "learning_rate": 9.817846512306061e-05, |
| "loss": 0.69920166015625, |
| "lr": 9.815975435734603e-05, |
| "step": 200, |
| "tokens_per_second": 83.13681099990983, |
| "tokens_per_step": 1700.74, |
| "total_tokens_seen": 340148 |
| }, |
| { |
| "epoch": 0.11085859985588382, |
| "grad_norm": 1.009503960609436, |
| "last_batch_tokens": 181, |
| "learning_rate": 9.712877368374224e-05, |
| "loss": 0.6449888610839843, |
| "lr": 9.710542102466229e-05, |
| "step": 250, |
| "tokens_per_second": 82.25185579838005, |
| "tokens_per_step": 1656.984, |
| "total_tokens_seen": 414246 |
| }, |
| { |
| "epoch": 0.11085859985588382, |
| "eval_loss": 0.6187728047370911, |
| "eval_runtime": 88.3548, |
| "eval_samples_per_second": 4.108, |
| "eval_steps_per_second": 2.06, |
| "last_batch_tokens": 172, |
| "lr": 9.710542102466229e-05, |
| "step": 250, |
| "tokens_per_second": 363.7051920895653, |
| "tokens_per_step": 1785.536, |
| "total_tokens_seen": 446384 |
| }, |
| { |
| "epoch": 0.1330303198270606, |
| "grad_norm": 0.5571497082710266, |
| "last_batch_tokens": 329, |
| "learning_rate": 9.584967947244769e-05, |
| "loss": 0.5449295806884765, |
| "lr": 9.582179859078793e-05, |
| "step": 300, |
| "tokens_per_second": 81.02823836816424, |
| "tokens_per_step": 1724.5733333333333, |
| "total_tokens_seen": 517372 |
| }, |
| { |
| "epoch": 0.15520203979823735, |
| "grad_norm": 0.7961392998695374, |
| "last_batch_tokens": 165, |
| "learning_rate": 9.434740857432105e-05, |
| "loss": 0.46938041687011717, |
| "lr": 9.431513518232342e-05, |
| "step": 350, |
| "tokens_per_second": 89.52354398651325, |
| "tokens_per_step": 1704.1371428571429, |
| "total_tokens_seen": 596448 |
| }, |
| { |
| "epoch": 0.16628789978382574, |
| "eval_loss": 0.4863806366920471, |
| "eval_runtime": 87.0251, |
| "eval_samples_per_second": 4.171, |
| "eval_steps_per_second": 2.091, |
| "last_batch_tokens": 172, |
| "lr": 9.348041345533653e-05, |
| "step": 375, |
| "tokens_per_second": 135.38091364115044, |
| "tokens_per_step": 1784.712, |
| "total_tokens_seen": 669267 |
| }, |
| { |
| "epoch": 0.1773737597694141, |
| "grad_norm": 0.7586395144462585, |
| "last_batch_tokens": 351, |
| "learning_rate": 9.262927340344295e-05, |
| "loss": 0.4675440216064453, |
| "lr": 9.259276459421655e-05, |
| "step": 400, |
| "tokens_per_second": 81.3096016563381, |
| "tokens_per_step": 1764.9875, |
| "total_tokens_seen": 705995 |
| }, |
| { |
| "epoch": 0.19954547974059086, |
| "grad_norm": 0.7313582897186279, |
| "last_batch_tokens": 369, |
| "learning_rate": 9.070363710911735e-05, |
| "loss": 0.3964078140258789, |
| "lr": 9.066307059197612e-05, |
| "step": 450, |
| "tokens_per_second": 87.86278133239196, |
| "tokens_per_step": 1744.9444444444443, |
| "total_tokens_seen": 785225 |
| }, |
| { |
| "epoch": 0.22171719971176765, |
| "grad_norm": 0.5969849228858948, |
| "last_batch_tokens": 193, |
| "learning_rate": 8.857987286762718e-05, |
| "loss": 0.3672472381591797, |
| "lr": 8.853544610307675e-05, |
| "step": 500, |
| "tokens_per_second": 87.74574317837812, |
| "tokens_per_step": 1729.026, |
| "total_tokens_seen": 864513 |
| }, |
| { |
| "epoch": 0.22171719971176765, |
| "eval_loss": 0.40328726172447205, |
| "eval_runtime": 87.1124, |
| "eval_samples_per_second": 4.167, |
| "eval_steps_per_second": 2.089, |
| "last_batch_tokens": 172, |
| "lr": 8.853544610307675e-05, |
| "step": 500, |
| "tokens_per_second": 368.8907701487212, |
| "tokens_per_step": 1793.302, |
| "total_tokens_seen": 896651 |
| }, |
| { |
| "epoch": 0.2438889196829444, |
| "grad_norm": 0.7751753330230713, |
| "last_batch_tokens": 273, |
| "learning_rate": 8.626831825760946e-05, |
| "loss": 0.3414393615722656, |
| "lr": 8.622024749619364e-05, |
| "step": 550, |
| "tokens_per_second": 82.92877874873523, |
| "tokens_per_step": 1766.3690909090908, |
| "total_tokens_seen": 971503 |
| }, |
| { |
| "epoch": 0.2660606396541212, |
| "grad_norm": 0.7136653065681458, |
| "last_batch_tokens": 305, |
| "learning_rate": 8.378022494113098e-05, |
| "loss": 0.3377827072143555, |
| "lr": 8.372874417081631e-05, |
| "step": 600, |
| "tokens_per_second": 90.40251231127895, |
| "tokens_per_step": 1748.685, |
| "total_tokens_seen": 1049211 |
| }, |
| { |
| "epoch": 0.27714649963970955, |
| "eval_loss": 0.35334891080856323, |
| "eval_runtime": 87.0325, |
| "eval_samples_per_second": 4.171, |
| "eval_steps_per_second": 2.091, |
| "last_batch_tokens": 172, |
| "lr": 8.24206361704162e-05, |
| "step": 625, |
| "tokens_per_second": 135.75737480096265, |
| "tokens_per_step": 1791.824, |
| "total_tokens_seen": 1119890 |
| }, |
| { |
| "epoch": 0.2882323596252979, |
| "grad_norm": 0.7202998399734497, |
| "last_batch_tokens": 211, |
| "learning_rate": 8.112770389539574e-05, |
| "loss": 0.3233934020996094, |
| "lr": 8.107306370261785e-05, |
| "step": 650, |
| "tokens_per_second": 84.5144051400581, |
| "tokens_per_step": 1779.3815384615384, |
| "total_tokens_seen": 1156598 |
| }, |
| { |
| "epoch": 0.3104040795964747, |
| "grad_norm": 0.7681185007095337, |
| "last_batch_tokens": 236, |
| "learning_rate": 7.832366646167268e-05, |
| "loss": 0.3125551414489746, |
| "lr": 7.826613281158841e-05, |
| "step": 700, |
| "tokens_per_second": 84.37944807859942, |
| "tokens_per_step": 1759.6771428571428, |
| "total_tokens_seen": 1231774 |
| }, |
| { |
| "epoch": 0.3325757995676515, |
| "grad_norm": 0.659271776676178, |
| "last_batch_tokens": 939, |
| "learning_rate": 7.538176149839243e-05, |
| "loss": 0.28798053741455076, |
| "lr": 7.532161444027488e-05, |
| "step": 750, |
| "tokens_per_second": 87.73140620694117, |
| "tokens_per_step": 1745.06, |
| "total_tokens_seen": 1308795 |
| }, |
| { |
| "epoch": 0.3325757995676515, |
| "eval_loss": 0.3200623393058777, |
| "eval_runtime": 87.2377, |
| "eval_samples_per_second": 4.161, |
| "eval_steps_per_second": 2.086, |
| "last_batch_tokens": 172, |
| "lr": 7.532161444027488e-05, |
| "step": 750, |
| "tokens_per_second": 368.35941630029333, |
| "tokens_per_step": 1787.9106666666667, |
| "total_tokens_seen": 1340933 |
| }, |
| { |
| "epoch": 0.3547475195388282, |
| "grad_norm": 0.5721789598464966, |
| "last_batch_tokens": 124, |
| "learning_rate": 7.231630894432527e-05, |
| "loss": 0.29953609466552733, |
| "lr": 7.22538412484033e-05, |
| "step": 800, |
| "tokens_per_second": 65.97096831279634, |
| "tokens_per_step": 98.35625, |
| "total_tokens_seen": 78685 |
| }, |
| { |
| "epoch": 0.376919239510005, |
| "grad_norm": 0.4275953471660614, |
| "last_batch_tokens": 266, |
| "learning_rate": 6.914223011522581e-05, |
| "loss": 0.27611801147460935, |
| "lr": 6.907774584760349e-05, |
| "step": 850, |
| "tokens_per_second": 76.59339331072898, |
| "tokens_per_step": 183.97411764705882, |
| "total_tokens_seen": 156378 |
| }, |
| { |
| "epoch": 0.38800509949559336, |
| "eval_loss": 0.28222641348838806, |
| "eval_runtime": 113.424, |
| "eval_samples_per_second": 3.2, |
| "eval_steps_per_second": 1.605, |
| "last_batch_tokens": 172, |
| "lr": 6.745388997609773e-05, |
| "step": 875, |
| "tokens_per_second": 114.49594753151979, |
| "tokens_per_step": 258.8742857142857, |
| "total_tokens_seen": 226515 |
| }, |
| { |
| "epoch": 0.39909095948118173, |
| "grad_norm": 0.5093332529067993, |
| "last_batch_tokens": 209, |
| "learning_rate": 6.587497507323132e-05, |
| "loss": 0.26179553985595705, |
| "lr": 6.580878811582379e-05, |
| "step": 900, |
| "tokens_per_second": 82.29563477689274, |
| "tokens_per_step": 298.55555555555554, |
| "total_tokens_seen": 268700 |
| }, |
| { |
| "epoch": 0.4212626794523585, |
| "grad_norm": 0.3912750482559204, |
| "last_batch_tokens": 103, |
| "learning_rate": 6.253044742254792e-05, |
| "loss": 0.25117488861083986, |
| "lr": 6.246287994523805e-05, |
| "step": 950, |
| "tokens_per_second": 79.79549481684828, |
| "tokens_per_step": 366.02947368421053, |
| "total_tokens_seen": 347728 |
| }, |
| { |
| "epoch": 0.4434343994235353, |
| "grad_norm": 0.4664643406867981, |
| "last_batch_tokens": 203, |
| "learning_rate": 5.9124926897487534e-05, |
| "loss": 0.25925636291503906, |
| "lr": 5.9056307789940357e-05, |
| "step": 1000, |
| "tokens_per_second": 76.53280228762407, |
| "tokens_per_step": 422.387, |
| "total_tokens_seen": 422387 |
| }, |
| { |
| "epoch": 0.4434343994235353, |
| "eval_loss": 0.26276224851608276, |
| "eval_runtime": 95.1275, |
| "eval_samples_per_second": 3.816, |
| "eval_steps_per_second": 1.913, |
| "last_batch_tokens": 172, |
| "lr": 5.9056307789940357e-05, |
| "step": 1000, |
| "tokens_per_second": 337.8095566509732, |
| "tokens_per_step": 454.525, |
| "total_tokens_seen": 454525 |
| }, |
| { |
| "epoch": 0.465606119394712, |
| "grad_norm": 0.7413877248764038, |
| "last_batch_tokens": 252, |
| "learning_rate": 5.56749901196638e-05, |
| "loss": 0.2307398223876953, |
| "lr": 5.5605653390431875e-05, |
| "step": 1050, |
| "tokens_per_second": 85.43713054173512, |
| "tokens_per_step": 512.1695238095238, |
| "total_tokens_seen": 537778 |
| }, |
| { |
| "epoch": 0.4877778393658888, |
| "grad_norm": 0.43335428833961487, |
| "last_batch_tokens": 142, |
| "learning_rate": 5.219742991006728e-05, |
| "loss": 0.24115974426269532, |
| "lr": 5.21277130607795e-05, |
| "step": 1100, |
| "tokens_per_second": 75.7193860694182, |
| "tokens_per_step": 556.2981818181818, |
| "total_tokens_seen": 611928 |
| }, |
| { |
| "epoch": 0.4988636993514772, |
| "eval_loss": 0.24811844527721405, |
| "eval_runtime": 94.9042, |
| "eval_samples_per_second": 3.825, |
| "eval_steps_per_second": 1.918, |
| "last_batch_tokens": 172, |
| "lr": 5.038379808781369e-05, |
| "step": 1125, |
| "tokens_per_second": 123.01878328450903, |
| "tokens_per_step": 607.1377777777777, |
| "total_tokens_seen": 683030 |
| }, |
| { |
| "epoch": 0.5099495593370655, |
| "grad_norm": 0.6529182195663452, |
| "last_batch_tokens": 102, |
| "learning_rate": 4.870917354877421e-05, |
| "loss": 0.22134504318237305, |
| "lr": 4.8639415931321794e-05, |
| "step": 1150, |
| "tokens_per_second": 83.41761800246071, |
| "tokens_per_step": 630.3573913043479, |
| "total_tokens_seen": 724911 |
| }, |
| { |
| "epoch": 0.5321212793082424, |
| "grad_norm": 0.4320646822452545, |
| "last_batch_tokens": 175, |
| "learning_rate": 4.522720038016592e-05, |
| "loss": 0.2152995491027832, |
| "lr": 4.515774154488211e-05, |
| "step": 1200, |
| "tokens_per_second": 82.13691539662977, |
| "tokens_per_step": 672.07, |
| "total_tokens_seen": 806484 |
| }, |
| { |
| "epoch": 0.5542929992794191, |
| "grad_norm": 0.6192132234573364, |
| "last_batch_tokens": 267, |
| "learning_rate": 4.1768459164721196e-05, |
| "loss": 0.20546873092651366, |
| "lr": 4.1699637207595034e-05, |
| "step": 1250, |
| "tokens_per_second": 83.92327455847254, |
| "tokens_per_step": 710.0544, |
| "total_tokens_seen": 887568 |
| }, |
| { |
| "epoch": 0.5542929992794191, |
| "eval_loss": 0.23204679787158966, |
| "eval_runtime": 94.3616, |
| "eval_samples_per_second": 3.847, |
| "eval_steps_per_second": 1.929, |
| "last_batch_tokens": 172, |
| "lr": 4.1699637207595034e-05, |
| "step": 1250, |
| "tokens_per_second": 340.54961477393465, |
| "tokens_per_step": 735.7648, |
| "total_tokens_seen": 919706 |
| }, |
| { |
| "epoch": 0.5764647192505958, |
| "grad_norm": 0.3487900495529175, |
| "last_batch_tokens": 134, |
| "learning_rate": 3.8349785579678194e-05, |
| "loss": 0.21177234649658203, |
| "lr": 3.828193549664752e-05, |
| "step": 1300, |
| "tokens_per_second": 79.01101943117263, |
| "tokens_per_step": 766.2323076923077, |
| "total_tokens_seen": 996102 |
| }, |
| { |
| "epoch": 0.5986364392217727, |
| "grad_norm": 0.42593374848365784, |
| "last_batch_tokens": 942, |
| "learning_rate": 3.498782027013742e-05, |
| "loss": 0.2180424690246582, |
| "lr": 3.492127232647139e-05, |
| "step": 1350, |
| "tokens_per_second": 80.48352941836103, |
| "tokens_per_step": 795.4074074074074, |
| "total_tokens_seen": 1073800 |
| }, |
| { |
| "epoch": 0.609722299207361, |
| "eval_loss": 0.2193347066640854, |
| "eval_runtime": 94.4814, |
| "eval_samples_per_second": 3.842, |
| "eval_steps_per_second": 1.926, |
| "last_batch_tokens": 172, |
| "lr": 3.326745518863976e-05, |
| "step": 1375, |
| "tokens_per_second": 124.66627567382365, |
| "tokens_per_step": 832.9498181818182, |
| "total_tokens_seen": 1145306 |
| }, |
| { |
| "epoch": 0.6208081591929494, |
| "grad_norm": 0.3440966010093689, |
| "last_batch_tokens": 176, |
| "learning_rate": 3.169892784949768e-05, |
| "loss": 0.22419458389282226, |
| "lr": 3.163400597220633e-05, |
| "step": 1400, |
| "tokens_per_second": 84.21467446062582, |
| "tokens_per_step": 847.435, |
| "total_tokens_seen": 1186409 |
| }, |
| { |
| "epoch": 0.6429798791641261, |
| "grad_norm": 0.48472294211387634, |
| "last_batch_tokens": 99, |
| "learning_rate": 2.8499117243496988e-05, |
| "loss": 0.20303966522216796, |
| "lr": 2.843613744459269e-05, |
| "step": 1450, |
| "tokens_per_second": 84.12853596803436, |
| "tokens_per_step": 874.0124137931034, |
| "total_tokens_seen": 1267318 |
| }, |
| { |
| "epoch": 0.665151599135303, |
| "grad_norm": 0.48055633902549744, |
| "last_batch_tokens": 92, |
| "learning_rate": 2.5403963765589118e-05, |
| "loss": 0.18697463989257812, |
| "lr": 2.5343232603874866e-05, |
| "step": 1500, |
| "tokens_per_second": 83.84733093235428, |
| "tokens_per_step": 900.2046666666666, |
| "total_tokens_seen": 1350307 |
| }, |
| { |
| "epoch": 0.665151599135303, |
| "eval_loss": 0.20863106846809387, |
| "eval_runtime": 94.5131, |
| "eval_samples_per_second": 3.841, |
| "eval_steps_per_second": 1.926, |
| "last_batch_tokens": 172, |
| "lr": 2.5343232603874866e-05, |
| "step": 1500, |
| "tokens_per_second": 340.00444794736484, |
| "tokens_per_step": 921.63, |
| "total_tokens_seen": 1382445 |
| }, |
| { |
| "epoch": 0.6873233191064797, |
| "grad_norm": 0.41916459798812866, |
| "last_batch_tokens": 426, |
| "learning_rate": 2.2428533302959837e-05, |
| "loss": 0.201729736328125, |
| "lr": 2.2370346391831737e-05, |
| "step": 1550, |
| "tokens_per_second": 80.49134617228279, |
| "tokens_per_step": 942.8058064516129, |
| "total_tokens_seen": 1461349 |
| }, |
| { |
| "epoch": 0.7094950390776564, |
| "grad_norm": 0.38731154799461365, |
| "last_batch_tokens": 312, |
| "learning_rate": 1.9587308982213076e-05, |
| "loss": 0.18205615997314453, |
| "lr": 1.953194955074038e-05, |
| "step": 1600, |
| "tokens_per_second": 79.4246014683713, |
| "tokens_per_step": 961.505, |
| "total_tokens_seen": 1538408 |
| }, |
| { |
| "epoch": 0.7205808990632449, |
| "eval_loss": 0.20174801349639893, |
| "eval_runtime": 94.2485, |
| "eval_samples_per_second": 3.852, |
| "eval_steps_per_second": 1.931, |
| "last_batch_tokens": 172, |
| "lr": 1.816752961112065e-05, |
| "step": 1625, |
| "tokens_per_second": 120.14447834109774, |
| "tokens_per_step": 988.5981538461539, |
| "total_tokens_seen": 1606472 |
| }, |
| { |
| "epoch": 0.7316667590488332, |
| "grad_norm": 0.42647936940193176, |
| "last_batch_tokens": 168, |
| "learning_rate": 1.6894120671686986e-05, |
| "loss": 0.1889303970336914, |
| "lr": 1.6841858185973775e-05, |
| "step": 1650, |
| "tokens_per_second": 75.64734832954207, |
| "tokens_per_step": 995.8060606060606, |
| "total_tokens_seen": 1643080 |
| }, |
| { |
| "epoch": 0.75383847902001, |
| "grad_norm": 0.41556963324546814, |
| "last_batch_tokens": 169, |
| "learning_rate": 1.4362077663552753e-05, |
| "loss": 0.1900373077392578, |
| "lr": 1.4313166515091864e-05, |
| "step": 1700, |
| "tokens_per_second": 76.28403702273542, |
| "tokens_per_step": 1009.9758823529412, |
| "total_tokens_seen": 1716959 |
| }, |
| { |
| "epoch": 0.7760101989911867, |
| "grad_norm": 0.4044085443019867, |
| "last_batch_tokens": 140, |
| "learning_rate": 1.2003504863370746e-05, |
| "loss": 0.1899305534362793, |
| "lr": 1.1958183130774469e-05, |
| "step": 1750, |
| "tokens_per_second": 84.05214560453553, |
| "tokens_per_step": 1027.8245714285715, |
| "total_tokens_seen": 1798693 |
| }, |
| { |
| "epoch": 0.7760101989911867, |
| "eval_loss": 0.19616812467575073, |
| "eval_runtime": 94.684, |
| "eval_samples_per_second": 3.834, |
| "eval_steps_per_second": 1.922, |
| "last_batch_tokens": 172, |
| "lr": 1.1958183130774469e-05, |
| "step": 1750, |
| "tokens_per_second": 339.39215730410245, |
| "tokens_per_step": 1046.1891428571428, |
| "total_tokens_seen": 1830831 |
| }, |
| { |
| "epoch": 0.7981819189623635, |
| "grad_norm": 0.5659682154655457, |
| "last_batch_tokens": 103, |
| "learning_rate": 9.829882797706336e-06, |
| "loss": 0.1962204933166504, |
| "lr": 9.788371087841237e-06, |
| "step": 1800, |
| "tokens_per_second": 83.61138183187425, |
| "tokens_per_step": 1063.1733333333334, |
| "total_tokens_seen": 1913712 |
| }, |
| { |
| "epoch": 0.8203536389335403, |
| "grad_norm": 0.3827808201313019, |
| "last_batch_tokens": 211, |
| "learning_rate": 7.85179173182246e-06, |
| "loss": 0.17033554077148438, |
| "lr": 7.814292105989308e-06, |
| "step": 1850, |
| "tokens_per_second": 82.77525601918174, |
| "tokens_per_step": 1078.207027027027, |
| "total_tokens_seen": 1994683 |
| }, |
| { |
| "epoch": 0.8314394989191286, |
| "eval_loss": 0.19150112569332123, |
| "eval_runtime": 94.6106, |
| "eval_samples_per_second": 3.837, |
| "eval_steps_per_second": 1.924, |
| "last_batch_tokens": 172, |
| "lr": 6.9036938458111764e-06, |
| "step": 1875, |
| "tokens_per_second": 129.4537055546321, |
| "tokens_per_step": 1103.9941333333334, |
| "total_tokens_seen": 2069989 |
| }, |
| { |
| "epoch": 0.842525358904717, |
| "grad_norm": 0.4506838917732239, |
| "last_batch_tokens": 132, |
| "learning_rate": 6.078860169460415e-06, |
| "loss": 0.18061737060546876, |
| "lr": 6.045555159845828e-06, |
| "step": 1900, |
| "tokens_per_second": 84.8577689646082, |
| "tokens_per_step": 1111.4515789473685, |
| "total_tokens_seen": 2111758 |
| }, |
| { |
| "epoch": 0.8646970788758938, |
| "grad_norm": 0.42664435505867004, |
| "last_batch_tokens": 123, |
| "learning_rate": 4.519717985389665e-06, |
| "loss": 0.18581958770751952, |
| "lr": 4.490769706577352e-06, |
| "step": 1950, |
| "tokens_per_second": 81.16470478657682, |
| "tokens_per_step": 1123.2635897435898, |
| "total_tokens_seen": 2190364 |
| }, |
| { |
| "epoch": 0.8868687988470706, |
| "grad_norm": 0.3934974670410156, |
| "last_batch_tokens": 291, |
| "learning_rate": 3.18195441885778e-06, |
| "loss": 0.17605453491210937, |
| "lr": 3.157503778723847e-06, |
| "step": 2000, |
| "tokens_per_second": 78.4029933600584, |
| "tokens_per_step": 1133.7585, |
| "total_tokens_seen": 2267517 |
| }, |
| { |
| "epoch": 0.8868687988470706, |
| "eval_loss": 0.1900114119052887, |
| "eval_runtime": 94.4943, |
| "eval_samples_per_second": 3.842, |
| "eval_steps_per_second": 1.926, |
| "last_batch_tokens": 172, |
| "lr": 3.157503778723847e-06, |
| "step": 2000, |
| "tokens_per_second": 340.07002840114217, |
| "tokens_per_step": 1149.8275, |
| "total_tokens_seen": 2299655 |
| }, |
| { |
| "epoch": 0.9090405188182473, |
| "grad_norm": 0.44616127014160156, |
| "last_batch_tokens": 151, |
| "learning_rate": 2.072081132410253e-06, |
| "loss": 0.1782122802734375, |
| "lr": 2.0522471462437796e-06, |
| "step": 2050, |
| "tokens_per_second": 81.0157351221381, |
| "tokens_per_step": 1160.878536585366, |
| "total_tokens_seen": 2379801 |
| }, |
| { |
| "epoch": 0.931212238789424, |
| "grad_norm": 0.4230777621269226, |
| "last_batch_tokens": 188, |
| "learning_rate": 1.195500515894149e-06, |
| "loss": 0.17306018829345704, |
| "lr": 1.1803797270814765e-06, |
| "step": 2100, |
| "tokens_per_second": 80.14939686559167, |
| "tokens_per_step": 1170.3680952380953, |
| "total_tokens_seen": 2457773 |
| }, |
| { |
| "epoch": 0.9422980987750125, |
| "eval_loss": 0.18897105753421783, |
| "eval_runtime": 95.0115, |
| "eval_samples_per_second": 3.821, |
| "eval_steps_per_second": 1.916, |
| "last_batch_tokens": 172, |
| "lr": 8.333381642750881e-07, |
| "step": 2125, |
| "tokens_per_second": 120.10909547338339, |
| "tokens_per_step": 1188.5943529411766, |
| "total_tokens_seen": 2525763 |
| }, |
| { |
| "epoch": 0.9533839587606009, |
| "grad_norm": 0.2957008183002472, |
| "last_batch_tokens": 305, |
| "learning_rate": 5.564793899281884e-07, |
| "loss": 0.1782497787475586, |
| "lr": 5.461454000209198e-07, |
| "step": 2150, |
| "tokens_per_second": 83.24645935651418, |
| "tokens_per_step": 1193.8697674418604, |
| "total_tokens_seen": 2566820 |
| }, |
| { |
| "epoch": 0.9755556787317776, |
| "grad_norm": 0.49967435002326965, |
| "last_batch_tokens": 156, |
| "learning_rate": 1.5812823683962197e-07, |
| "loss": 0.19703115463256837, |
| "lr": 1.5263134729363583e-07, |
| "step": 2200, |
| "tokens_per_second": 75.00656410059429, |
| "tokens_per_step": 1199.9336363636364, |
| "total_tokens_seen": 2639854 |
| }, |
| { |
| "epoch": 0.9977273987029543, |
| "grad_norm": 0.26038259267807007, |
| "last_batch_tokens": 322, |
| "learning_rate": 2.386060162717918e-09, |
| "loss": 0.17010717391967772, |
| "lr": 1.7530274921462308e-09, |
| "step": 2250, |
| "tokens_per_second": 78.96076733362268, |
| "tokens_per_step": 1208.1137777777778, |
| "total_tokens_seen": 2718256 |
| }, |
| { |
| "epoch": 0.9977273987029543, |
| "eval_loss": 0.18876151740550995, |
| "eval_runtime": 95.314, |
| "eval_samples_per_second": 3.808, |
| "eval_steps_per_second": 1.909, |
| "last_batch_tokens": 172, |
| "lr": 1.7530274921462308e-09, |
| "step": 2250, |
| "tokens_per_second": 337.1431434660513, |
| "tokens_per_step": 1222.3973333333333, |
| "total_tokens_seen": 2750394 |
| } |
| ], |
| "logging_steps": 50, |
| "max_steps": 2256, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 1, |
| "save_steps": 250, |
| "stateful_callbacks": { |
| "EarlyStoppingCallback": { |
| "args": { |
| "early_stopping_patience": 3, |
| "early_stopping_threshold": 0.0 |
| }, |
| "attributes": { |
| "early_stopping_patience_counter": 0 |
| } |
| }, |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": true |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 1.666058653049815e+17, |
| "train_batch_size": 1, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|