| { |
| "best_global_step": 600, |
| "best_metric": 0.23669058084487915, |
| "best_model_checkpoint": "./checkpoints/qwen253-lora-leduc_random_l_s35/checkpoint-600", |
| "epoch": 1.0, |
| "eval_steps": 200, |
| "global_step": 661, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.015128593040847202, |
| "grad_norm": 7.96875, |
| "learning_rate": 8.999999999999999e-06, |
| "loss": 0.5547, |
| "mean_token_accuracy": 0.834239986538887, |
| "num_tokens": 158163.0, |
| "step": 10 |
| }, |
| { |
| "epoch": 0.030257186081694403, |
| "grad_norm": 3.1875, |
| "learning_rate": 1.8999999999999998e-05, |
| "loss": 0.2377, |
| "mean_token_accuracy": 0.8513324618339538, |
| "num_tokens": 316049.0, |
| "step": 20 |
| }, |
| { |
| "epoch": 0.0453857791225416, |
| "grad_norm": 2.96875, |
| "learning_rate": 2.9e-05, |
| "loss": 0.2314, |
| "mean_token_accuracy": 0.8561549067497254, |
| "num_tokens": 472484.0, |
| "step": 30 |
| }, |
| { |
| "epoch": 0.060514372163388806, |
| "grad_norm": 1.0625, |
| "learning_rate": 3.499647414432928e-05, |
| "loss": 0.2276, |
| "mean_token_accuracy": 0.8648945838212967, |
| "num_tokens": 631477.0, |
| "step": 40 |
| }, |
| { |
| "epoch": 0.07564296520423601, |
| "grad_norm": 0.6875, |
| "learning_rate": 3.4956824582777116e-05, |
| "loss": 0.2311, |
| "mean_token_accuracy": 0.853455251455307, |
| "num_tokens": 790829.0, |
| "step": 50 |
| }, |
| { |
| "epoch": 0.0907715582450832, |
| "grad_norm": 2.5, |
| "learning_rate": 3.4873218311644976e-05, |
| "loss": 0.2364, |
| "mean_token_accuracy": 0.8518135726451874, |
| "num_tokens": 949726.0, |
| "step": 60 |
| }, |
| { |
| "epoch": 0.1059001512859304, |
| "grad_norm": 3.78125, |
| "learning_rate": 3.474586585356039e-05, |
| "loss": 0.2301, |
| "mean_token_accuracy": 0.8524481028318405, |
| "num_tokens": 1107604.0, |
| "step": 70 |
| }, |
| { |
| "epoch": 0.12102874432677761, |
| "grad_norm": 3.125, |
| "learning_rate": 3.457508788511535e-05, |
| "loss": 0.231, |
| "mean_token_accuracy": 0.8513583898544311, |
| "num_tokens": 1265888.0, |
| "step": 80 |
| }, |
| { |
| "epoch": 0.1361573373676248, |
| "grad_norm": 1.640625, |
| "learning_rate": 3.436131442939487e-05, |
| "loss": 0.2236, |
| "mean_token_accuracy": 0.8561012089252472, |
| "num_tokens": 1425168.0, |
| "step": 90 |
| }, |
| { |
| "epoch": 0.15128593040847202, |
| "grad_norm": 0.75390625, |
| "learning_rate": 3.4105083773168374e-05, |
| "loss": 0.2259, |
| "mean_token_accuracy": 0.8565482378005982, |
| "num_tokens": 1582934.0, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.1664145234493192, |
| "grad_norm": 1.0390625, |
| "learning_rate": 3.380704111147049e-05, |
| "loss": 0.231, |
| "mean_token_accuracy": 0.855805104970932, |
| "num_tokens": 1741024.0, |
| "step": 110 |
| }, |
| { |
| "epoch": 0.1815431164901664, |
| "grad_norm": 2.859375, |
| "learning_rate": 3.3467936922984234e-05, |
| "loss": 0.2247, |
| "mean_token_accuracy": 0.8517077833414077, |
| "num_tokens": 1898125.0, |
| "step": 120 |
| }, |
| { |
| "epoch": 0.19667170953101362, |
| "grad_norm": 2.828125, |
| "learning_rate": 3.308862508031743e-05, |
| "loss": 0.2315, |
| "mean_token_accuracy": 0.8443083852529526, |
| "num_tokens": 2055835.0, |
| "step": 130 |
| }, |
| { |
| "epoch": 0.2118003025718608, |
| "grad_norm": 0.6171875, |
| "learning_rate": 3.267006069993065e-05, |
| "loss": 0.2323, |
| "mean_token_accuracy": 0.854484823346138, |
| "num_tokens": 2213654.0, |
| "step": 140 |
| }, |
| { |
| "epoch": 0.22692889561270801, |
| "grad_norm": 2.40625, |
| "learning_rate": 3.221329773713071e-05, |
| "loss": 0.2263, |
| "mean_token_accuracy": 0.861380758881569, |
| "num_tokens": 2373366.0, |
| "step": 150 |
| }, |
| { |
| "epoch": 0.24205748865355523, |
| "grad_norm": 1.40625, |
| "learning_rate": 3.1719486332185534e-05, |
| "loss": 0.2313, |
| "mean_token_accuracy": 0.8440588176250458, |
| "num_tokens": 2532146.0, |
| "step": 160 |
| }, |
| { |
| "epoch": 0.25718608169440244, |
| "grad_norm": 2.421875, |
| "learning_rate": 3.118986991424293e-05, |
| "loss": 0.23, |
| "mean_token_accuracy": 0.8541617065668106, |
| "num_tokens": 2691424.0, |
| "step": 170 |
| }, |
| { |
| "epoch": 0.2723146747352496, |
| "grad_norm": 0.71875, |
| "learning_rate": 3.0625782070345705e-05, |
| "loss": 0.2279, |
| "mean_token_accuracy": 0.855641770362854, |
| "num_tokens": 2849973.0, |
| "step": 180 |
| }, |
| { |
| "epoch": 0.2874432677760968, |
| "grad_norm": 4.5, |
| "learning_rate": 3.002864318742703e-05, |
| "loss": 0.2218, |
| "mean_token_accuracy": 0.8619469672441482, |
| "num_tokens": 3007503.0, |
| "step": 190 |
| }, |
| { |
| "epoch": 0.30257186081694404, |
| "grad_norm": 2.203125, |
| "learning_rate": 2.9399956875741492e-05, |
| "loss": 0.2254, |
| "mean_token_accuracy": 0.8585571944713593, |
| "num_tokens": 3168059.0, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.30257186081694404, |
| "eval_loss": 0.2432168573141098, |
| "eval_num_tokens": 3168059.0, |
| "eval_runtime": 10.3614, |
| "eval_samples_per_second": 20.654, |
| "eval_steps_per_second": 20.654, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.3177004538577912, |
| "grad_norm": 2.40625, |
| "learning_rate": 2.8741306182737877e-05, |
| "loss": 0.2257, |
| "mean_token_accuracy": 0.8514153599739075, |
| "num_tokens": 3325430.0, |
| "step": 210 |
| }, |
| { |
| "epoch": 0.3328290468986384, |
| "grad_norm": 2.125, |
| "learning_rate": 2.805434960690712e-05, |
| "loss": 0.2266, |
| "mean_token_accuracy": 0.8573758780956269, |
| "num_tokens": 3484988.0, |
| "step": 220 |
| }, |
| { |
| "epoch": 0.34795763993948564, |
| "grad_norm": 1.6875, |
| "learning_rate": 2.73408169216427e-05, |
| "loss": 0.2257, |
| "mean_token_accuracy": 0.8511970967054368, |
| "num_tokens": 3644592.0, |
| "step": 230 |
| }, |
| { |
| "epoch": 0.3630862329803328, |
| "grad_norm": 0.65234375, |
| "learning_rate": 2.6602504819629076e-05, |
| "loss": 0.2204, |
| "mean_token_accuracy": 0.8690169095993042, |
| "num_tokens": 3805280.0, |
| "step": 240 |
| }, |
| { |
| "epoch": 0.37821482602118, |
| "grad_norm": 3.53125, |
| "learning_rate": 2.5841272388725777e-05, |
| "loss": 0.2157, |
| "mean_token_accuracy": 0.8685499548912048, |
| "num_tokens": 3965413.0, |
| "step": 250 |
| }, |
| { |
| "epoch": 0.39334341906202724, |
| "grad_norm": 0.99609375, |
| "learning_rate": 2.5059036430738846e-05, |
| "loss": 0.223, |
| "mean_token_accuracy": 0.8623712241649628, |
| "num_tokens": 4122572.0, |
| "step": 260 |
| }, |
| { |
| "epoch": 0.4084720121028744, |
| "grad_norm": 2.734375, |
| "learning_rate": 2.4257766634867203e-05, |
| "loss": 0.2281, |
| "mean_token_accuracy": 0.8473478049039841, |
| "num_tokens": 4280652.0, |
| "step": 270 |
| }, |
| { |
| "epoch": 0.4236006051437216, |
| "grad_norm": 1.90625, |
| "learning_rate": 2.3439480617977275e-05, |
| "loss": 0.2195, |
| "mean_token_accuracy": 0.8656352519989013, |
| "num_tokens": 4440066.0, |
| "step": 280 |
| }, |
| { |
| "epoch": 0.43872919818456885, |
| "grad_norm": 2.0625, |
| "learning_rate": 2.2606238844194544e-05, |
| "loss": 0.2267, |
| "mean_token_accuracy": 0.8563310325145721, |
| "num_tokens": 4598248.0, |
| "step": 290 |
| }, |
| { |
| "epoch": 0.45385779122541603, |
| "grad_norm": 2.390625, |
| "learning_rate": 2.1760139436604713e-05, |
| "loss": 0.2268, |
| "mean_token_accuracy": 0.8453394055366517, |
| "num_tokens": 4755809.0, |
| "step": 300 |
| }, |
| { |
| "epoch": 0.4689863842662632, |
| "grad_norm": 2.0625, |
| "learning_rate": 2.0903312894128633e-05, |
| "loss": 0.2285, |
| "mean_token_accuracy": 0.8545234054327011, |
| "num_tokens": 4913328.0, |
| "step": 310 |
| }, |
| { |
| "epoch": 0.48411497730711045, |
| "grad_norm": 1.109375, |
| "learning_rate": 2.0037916726874145e-05, |
| "loss": 0.2271, |
| "mean_token_accuracy": 0.8579858303070068, |
| "num_tokens": 5072616.0, |
| "step": 320 |
| }, |
| { |
| "epoch": 0.49924357034795763, |
| "grad_norm": 2.28125, |
| "learning_rate": 1.9166130023473036e-05, |
| "loss": 0.2222, |
| "mean_token_accuracy": 0.8509624302387238, |
| "num_tokens": 5231113.0, |
| "step": 330 |
| }, |
| { |
| "epoch": 0.5143721633888049, |
| "grad_norm": 1.3125, |
| "learning_rate": 1.829014796408282e-05, |
| "loss": 0.2247, |
| "mean_token_accuracy": 0.8627366036176681, |
| "num_tokens": 5389029.0, |
| "step": 340 |
| }, |
| { |
| "epoch": 0.529500756429652, |
| "grad_norm": 0.92578125, |
| "learning_rate": 1.7412176292869573e-05, |
| "loss": 0.2193, |
| "mean_token_accuracy": 0.8546810537576676, |
| "num_tokens": 5547970.0, |
| "step": 350 |
| }, |
| { |
| "epoch": 0.5446293494704992, |
| "grad_norm": 0.67578125, |
| "learning_rate": 1.653442576389043e-05, |
| "loss": 0.221, |
| "mean_token_accuracy": 0.8571889936923981, |
| "num_tokens": 5707525.0, |
| "step": 360 |
| }, |
| { |
| "epoch": 0.5597579425113465, |
| "grad_norm": 0.90234375, |
| "learning_rate": 1.5659106574360977e-05, |
| "loss": 0.2273, |
| "mean_token_accuracy": 0.8585471630096435, |
| "num_tokens": 5865689.0, |
| "step": 370 |
| }, |
| { |
| "epoch": 0.5748865355521936, |
| "grad_norm": 1.2109375, |
| "learning_rate": 1.4788422799324862e-05, |
| "loss": 0.2317, |
| "mean_token_accuracy": 0.8584190517663955, |
| "num_tokens": 6021932.0, |
| "step": 380 |
| }, |
| { |
| "epoch": 0.5900151285930408, |
| "grad_norm": 0.9453125, |
| "learning_rate": 1.3924566841739079e-05, |
| "loss": 0.2238, |
| "mean_token_accuracy": 0.8559250921010971, |
| "num_tokens": 6179562.0, |
| "step": 390 |
| }, |
| { |
| "epoch": 0.6051437216338881, |
| "grad_norm": 0.828125, |
| "learning_rate": 1.3069713911949962e-05, |
| "loss": 0.2169, |
| "mean_token_accuracy": 0.8574993282556533, |
| "num_tokens": 6337908.0, |
| "step": 400 |
| }, |
| { |
| "epoch": 0.6051437216338881, |
| "eval_loss": 0.24050775170326233, |
| "eval_num_tokens": 6337908.0, |
| "eval_runtime": 10.2898, |
| "eval_samples_per_second": 20.797, |
| "eval_steps_per_second": 20.797, |
| "step": 400 |
| }, |
| { |
| "epoch": 0.6202723146747352, |
| "grad_norm": 1.3515625, |
| "learning_rate": 1.222601655046052e-05, |
| "loss": 0.2295, |
| "mean_token_accuracy": 0.8591887027025222, |
| "num_tokens": 6495268.0, |
| "step": 410 |
| }, |
| { |
| "epoch": 0.6354009077155824, |
| "grad_norm": 1.0625, |
| "learning_rate": 1.1395599207781006e-05, |
| "loss": 0.2286, |
| "mean_token_accuracy": 0.8543924212455749, |
| "num_tokens": 6651839.0, |
| "step": 420 |
| }, |
| { |
| "epoch": 0.6505295007564297, |
| "grad_norm": 0.98046875, |
| "learning_rate": 1.0580552895010796e-05, |
| "loss": 0.224, |
| "mean_token_accuracy": 0.8684775650501251, |
| "num_tokens": 6809804.0, |
| "step": 430 |
| }, |
| { |
| "epoch": 0.6656580937972768, |
| "grad_norm": 1.3359375, |
| "learning_rate": 9.782929918621475e-06, |
| "loss": 0.2245, |
| "mean_token_accuracy": 0.8595554202795028, |
| "num_tokens": 6967079.0, |
| "step": 440 |
| }, |
| { |
| "epoch": 0.680786686838124, |
| "grad_norm": 0.59765625, |
| "learning_rate": 9.004738712699157e-06, |
| "loss": 0.2204, |
| "mean_token_accuracy": 0.863404393196106, |
| "num_tokens": 7126399.0, |
| "step": 450 |
| }, |
| { |
| "epoch": 0.6959152798789713, |
| "grad_norm": 1.234375, |
| "learning_rate": 8.247938781658551e-06, |
| "loss": 0.2206, |
| "mean_token_accuracy": 0.8627041339874267, |
| "num_tokens": 7285948.0, |
| "step": 460 |
| }, |
| { |
| "epoch": 0.7110438729198184, |
| "grad_norm": 1.5546875, |
| "learning_rate": 7.514435766163046e-06, |
| "loss": 0.2279, |
| "mean_token_accuracy": 0.8660434067249299, |
| "num_tokens": 7443250.0, |
| "step": 470 |
| }, |
| { |
| "epoch": 0.7261724659606656, |
| "grad_norm": 1.6015625, |
| "learning_rate": 6.806076644675154e-06, |
| "loss": 0.2233, |
| "mean_token_accuracy": 0.8606104016304016, |
| "num_tokens": 7601533.0, |
| "step": 480 |
| }, |
| { |
| "epoch": 0.7413010590015129, |
| "grad_norm": 3.28125, |
| "learning_rate": 6.124645082719727e-06, |
| "loss": 0.2214, |
| "mean_token_accuracy": 0.8641792595386505, |
| "num_tokens": 7759173.0, |
| "step": 490 |
| }, |
| { |
| "epoch": 0.75642965204236, |
| "grad_norm": 1.734375, |
| "learning_rate": 5.471856941570691e-06, |
| "loss": 0.2266, |
| "mean_token_accuracy": 0.8583661437034606, |
| "num_tokens": 7915271.0, |
| "step": 500 |
| }, |
| { |
| "epoch": 0.7715582450832073, |
| "grad_norm": 2.171875, |
| "learning_rate": 4.84935595767059e-06, |
| "loss": 0.2239, |
| "mean_token_accuracy": 0.863108116388321, |
| "num_tokens": 8072725.0, |
| "step": 510 |
| }, |
| { |
| "epoch": 0.7866868381240545, |
| "grad_norm": 2.65625, |
| "learning_rate": 4.2587096036621585e-06, |
| "loss": 0.219, |
| "mean_token_accuracy": 0.8645495653152466, |
| "num_tokens": 8232048.0, |
| "step": 520 |
| }, |
| { |
| "epoch": 0.8018154311649016, |
| "grad_norm": 2.65625, |
| "learning_rate": 3.70140514145403e-06, |
| "loss": 0.2203, |
| "mean_token_accuracy": 0.8692421615123749, |
| "num_tokens": 8389234.0, |
| "step": 530 |
| }, |
| { |
| "epoch": 0.8169440242057489, |
| "grad_norm": 0.89453125, |
| "learning_rate": 3.1788458772590123e-06, |
| "loss": 0.2153, |
| "mean_token_accuracy": 0.857841071486473, |
| "num_tokens": 8547295.0, |
| "step": 540 |
| }, |
| { |
| "epoch": 0.8320726172465961, |
| "grad_norm": 3.0, |
| "learning_rate": 2.6923476280348592e-06, |
| "loss": 0.2211, |
| "mean_token_accuracy": 0.8649828612804413, |
| "num_tokens": 8706082.0, |
| "step": 550 |
| }, |
| { |
| "epoch": 0.8472012102874432, |
| "grad_norm": 1.125, |
| "learning_rate": 2.2431354082251086e-06, |
| "loss": 0.2206, |
| "mean_token_accuracy": 0.8666522175073623, |
| "num_tokens": 8864131.0, |
| "step": 560 |
| }, |
| { |
| "epoch": 0.8623298033282905, |
| "grad_norm": 1.3671875, |
| "learning_rate": 1.8323403451428861e-06, |
| "loss": 0.2223, |
| "mean_token_accuracy": 0.8635704159736634, |
| "num_tokens": 9022578.0, |
| "step": 570 |
| }, |
| { |
| "epoch": 0.8774583963691377, |
| "grad_norm": 1.5625, |
| "learning_rate": 1.4609968307647638e-06, |
| "loss": 0.2143, |
| "mean_token_accuracy": 0.8704730212688446, |
| "num_tokens": 9181933.0, |
| "step": 580 |
| }, |
| { |
| "epoch": 0.8925869894099848, |
| "grad_norm": 1.328125, |
| "learning_rate": 1.1300399171065517e-06, |
| "loss": 0.2153, |
| "mean_token_accuracy": 0.8723822474479676, |
| "num_tokens": 9341042.0, |
| "step": 590 |
| }, |
| { |
| "epoch": 0.9077155824508321, |
| "grad_norm": 0.70703125, |
| "learning_rate": 8.403029617395654e-07, |
| "loss": 0.2257, |
| "mean_token_accuracy": 0.8509276181459426, |
| "num_tokens": 9497699.0, |
| "step": 600 |
| }, |
| { |
| "epoch": 0.9077155824508321, |
| "eval_loss": 0.23669058084487915, |
| "eval_num_tokens": 9497699.0, |
| "eval_runtime": 10.2029, |
| "eval_samples_per_second": 20.974, |
| "eval_steps_per_second": 20.974, |
| "step": 600 |
| }, |
| { |
| "epoch": 0.9228441754916793, |
| "grad_norm": 2.640625, |
| "learning_rate": 5.925155293759559e-07, |
| "loss": 0.2201, |
| "mean_token_accuracy": 0.864446359872818, |
| "num_tokens": 9655514.0, |
| "step": 610 |
| }, |
| { |
| "epoch": 0.9379727685325264, |
| "grad_norm": 1.015625, |
| "learning_rate": 3.8730155480696634e-07, |
| "loss": 0.2278, |
| "mean_token_accuracy": 0.848170417547226, |
| "num_tokens": 9812955.0, |
| "step": 620 |
| }, |
| { |
| "epoch": 0.9531013615733737, |
| "grad_norm": 0.8984375, |
| "learning_rate": 2.2517777181995822e-07, |
| "loss": 0.214, |
| "mean_token_accuracy": 0.8704831153154373, |
| "num_tokens": 9972847.0, |
| "step": 630 |
| }, |
| { |
| "epoch": 0.9682299546142209, |
| "grad_norm": 1.15625, |
| "learning_rate": 1.0655241205012516e-07, |
| "loss": 0.2211, |
| "mean_token_accuracy": 0.8742094576358795, |
| "num_tokens": 10130896.0, |
| "step": 640 |
| }, |
| { |
| "epoch": 0.983358547655068, |
| "grad_norm": 1.078125, |
| "learning_rate": 3.172417704330077e-08, |
| "loss": 0.2212, |
| "mean_token_accuracy": 0.8658175647258759, |
| "num_tokens": 10290834.0, |
| "step": 650 |
| }, |
| { |
| "epoch": 0.9984871406959153, |
| "grad_norm": 0.56640625, |
| "learning_rate": 8.814861181871691e-10, |
| "loss": 0.2272, |
| "mean_token_accuracy": 0.8598562389612198, |
| "num_tokens": 10447704.0, |
| "step": 660 |
| } |
| ], |
| "logging_steps": 10, |
| "max_steps": 661, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 1, |
| "save_steps": 200, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": true |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 1.7413486193799168e+17, |
| "train_batch_size": 16, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|