rrf / trainer_state.json
haihp02's picture
Upload checkpoint
aa2141c verified
{
"best_global_step": 600,
"best_metric": 0.23669058084487915,
"best_model_checkpoint": "./checkpoints/qwen253-lora-leduc_random_l_s35/checkpoint-600",
"epoch": 1.0,
"eval_steps": 200,
"global_step": 661,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.015128593040847202,
"grad_norm": 7.96875,
"learning_rate": 8.999999999999999e-06,
"loss": 0.5547,
"mean_token_accuracy": 0.834239986538887,
"num_tokens": 158163.0,
"step": 10
},
{
"epoch": 0.030257186081694403,
"grad_norm": 3.1875,
"learning_rate": 1.8999999999999998e-05,
"loss": 0.2377,
"mean_token_accuracy": 0.8513324618339538,
"num_tokens": 316049.0,
"step": 20
},
{
"epoch": 0.0453857791225416,
"grad_norm": 2.96875,
"learning_rate": 2.9e-05,
"loss": 0.2314,
"mean_token_accuracy": 0.8561549067497254,
"num_tokens": 472484.0,
"step": 30
},
{
"epoch": 0.060514372163388806,
"grad_norm": 1.0625,
"learning_rate": 3.499647414432928e-05,
"loss": 0.2276,
"mean_token_accuracy": 0.8648945838212967,
"num_tokens": 631477.0,
"step": 40
},
{
"epoch": 0.07564296520423601,
"grad_norm": 0.6875,
"learning_rate": 3.4956824582777116e-05,
"loss": 0.2311,
"mean_token_accuracy": 0.853455251455307,
"num_tokens": 790829.0,
"step": 50
},
{
"epoch": 0.0907715582450832,
"grad_norm": 2.5,
"learning_rate": 3.4873218311644976e-05,
"loss": 0.2364,
"mean_token_accuracy": 0.8518135726451874,
"num_tokens": 949726.0,
"step": 60
},
{
"epoch": 0.1059001512859304,
"grad_norm": 3.78125,
"learning_rate": 3.474586585356039e-05,
"loss": 0.2301,
"mean_token_accuracy": 0.8524481028318405,
"num_tokens": 1107604.0,
"step": 70
},
{
"epoch": 0.12102874432677761,
"grad_norm": 3.125,
"learning_rate": 3.457508788511535e-05,
"loss": 0.231,
"mean_token_accuracy": 0.8513583898544311,
"num_tokens": 1265888.0,
"step": 80
},
{
"epoch": 0.1361573373676248,
"grad_norm": 1.640625,
"learning_rate": 3.436131442939487e-05,
"loss": 0.2236,
"mean_token_accuracy": 0.8561012089252472,
"num_tokens": 1425168.0,
"step": 90
},
{
"epoch": 0.15128593040847202,
"grad_norm": 0.75390625,
"learning_rate": 3.4105083773168374e-05,
"loss": 0.2259,
"mean_token_accuracy": 0.8565482378005982,
"num_tokens": 1582934.0,
"step": 100
},
{
"epoch": 0.1664145234493192,
"grad_norm": 1.0390625,
"learning_rate": 3.380704111147049e-05,
"loss": 0.231,
"mean_token_accuracy": 0.855805104970932,
"num_tokens": 1741024.0,
"step": 110
},
{
"epoch": 0.1815431164901664,
"grad_norm": 2.859375,
"learning_rate": 3.3467936922984234e-05,
"loss": 0.2247,
"mean_token_accuracy": 0.8517077833414077,
"num_tokens": 1898125.0,
"step": 120
},
{
"epoch": 0.19667170953101362,
"grad_norm": 2.828125,
"learning_rate": 3.308862508031743e-05,
"loss": 0.2315,
"mean_token_accuracy": 0.8443083852529526,
"num_tokens": 2055835.0,
"step": 130
},
{
"epoch": 0.2118003025718608,
"grad_norm": 0.6171875,
"learning_rate": 3.267006069993065e-05,
"loss": 0.2323,
"mean_token_accuracy": 0.854484823346138,
"num_tokens": 2213654.0,
"step": 140
},
{
"epoch": 0.22692889561270801,
"grad_norm": 2.40625,
"learning_rate": 3.221329773713071e-05,
"loss": 0.2263,
"mean_token_accuracy": 0.861380758881569,
"num_tokens": 2373366.0,
"step": 150
},
{
"epoch": 0.24205748865355523,
"grad_norm": 1.40625,
"learning_rate": 3.1719486332185534e-05,
"loss": 0.2313,
"mean_token_accuracy": 0.8440588176250458,
"num_tokens": 2532146.0,
"step": 160
},
{
"epoch": 0.25718608169440244,
"grad_norm": 2.421875,
"learning_rate": 3.118986991424293e-05,
"loss": 0.23,
"mean_token_accuracy": 0.8541617065668106,
"num_tokens": 2691424.0,
"step": 170
},
{
"epoch": 0.2723146747352496,
"grad_norm": 0.71875,
"learning_rate": 3.0625782070345705e-05,
"loss": 0.2279,
"mean_token_accuracy": 0.855641770362854,
"num_tokens": 2849973.0,
"step": 180
},
{
"epoch": 0.2874432677760968,
"grad_norm": 4.5,
"learning_rate": 3.002864318742703e-05,
"loss": 0.2218,
"mean_token_accuracy": 0.8619469672441482,
"num_tokens": 3007503.0,
"step": 190
},
{
"epoch": 0.30257186081694404,
"grad_norm": 2.203125,
"learning_rate": 2.9399956875741492e-05,
"loss": 0.2254,
"mean_token_accuracy": 0.8585571944713593,
"num_tokens": 3168059.0,
"step": 200
},
{
"epoch": 0.30257186081694404,
"eval_loss": 0.2432168573141098,
"eval_num_tokens": 3168059.0,
"eval_runtime": 10.3614,
"eval_samples_per_second": 20.654,
"eval_steps_per_second": 20.654,
"step": 200
},
{
"epoch": 0.3177004538577912,
"grad_norm": 2.40625,
"learning_rate": 2.8741306182737877e-05,
"loss": 0.2257,
"mean_token_accuracy": 0.8514153599739075,
"num_tokens": 3325430.0,
"step": 210
},
{
"epoch": 0.3328290468986384,
"grad_norm": 2.125,
"learning_rate": 2.805434960690712e-05,
"loss": 0.2266,
"mean_token_accuracy": 0.8573758780956269,
"num_tokens": 3484988.0,
"step": 220
},
{
"epoch": 0.34795763993948564,
"grad_norm": 1.6875,
"learning_rate": 2.73408169216427e-05,
"loss": 0.2257,
"mean_token_accuracy": 0.8511970967054368,
"num_tokens": 3644592.0,
"step": 230
},
{
"epoch": 0.3630862329803328,
"grad_norm": 0.65234375,
"learning_rate": 2.6602504819629076e-05,
"loss": 0.2204,
"mean_token_accuracy": 0.8690169095993042,
"num_tokens": 3805280.0,
"step": 240
},
{
"epoch": 0.37821482602118,
"grad_norm": 3.53125,
"learning_rate": 2.5841272388725777e-05,
"loss": 0.2157,
"mean_token_accuracy": 0.8685499548912048,
"num_tokens": 3965413.0,
"step": 250
},
{
"epoch": 0.39334341906202724,
"grad_norm": 0.99609375,
"learning_rate": 2.5059036430738846e-05,
"loss": 0.223,
"mean_token_accuracy": 0.8623712241649628,
"num_tokens": 4122572.0,
"step": 260
},
{
"epoch": 0.4084720121028744,
"grad_norm": 2.734375,
"learning_rate": 2.4257766634867203e-05,
"loss": 0.2281,
"mean_token_accuracy": 0.8473478049039841,
"num_tokens": 4280652.0,
"step": 270
},
{
"epoch": 0.4236006051437216,
"grad_norm": 1.90625,
"learning_rate": 2.3439480617977275e-05,
"loss": 0.2195,
"mean_token_accuracy": 0.8656352519989013,
"num_tokens": 4440066.0,
"step": 280
},
{
"epoch": 0.43872919818456885,
"grad_norm": 2.0625,
"learning_rate": 2.2606238844194544e-05,
"loss": 0.2267,
"mean_token_accuracy": 0.8563310325145721,
"num_tokens": 4598248.0,
"step": 290
},
{
"epoch": 0.45385779122541603,
"grad_norm": 2.390625,
"learning_rate": 2.1760139436604713e-05,
"loss": 0.2268,
"mean_token_accuracy": 0.8453394055366517,
"num_tokens": 4755809.0,
"step": 300
},
{
"epoch": 0.4689863842662632,
"grad_norm": 2.0625,
"learning_rate": 2.0903312894128633e-05,
"loss": 0.2285,
"mean_token_accuracy": 0.8545234054327011,
"num_tokens": 4913328.0,
"step": 310
},
{
"epoch": 0.48411497730711045,
"grad_norm": 1.109375,
"learning_rate": 2.0037916726874145e-05,
"loss": 0.2271,
"mean_token_accuracy": 0.8579858303070068,
"num_tokens": 5072616.0,
"step": 320
},
{
"epoch": 0.49924357034795763,
"grad_norm": 2.28125,
"learning_rate": 1.9166130023473036e-05,
"loss": 0.2222,
"mean_token_accuracy": 0.8509624302387238,
"num_tokens": 5231113.0,
"step": 330
},
{
"epoch": 0.5143721633888049,
"grad_norm": 1.3125,
"learning_rate": 1.829014796408282e-05,
"loss": 0.2247,
"mean_token_accuracy": 0.8627366036176681,
"num_tokens": 5389029.0,
"step": 340
},
{
"epoch": 0.529500756429652,
"grad_norm": 0.92578125,
"learning_rate": 1.7412176292869573e-05,
"loss": 0.2193,
"mean_token_accuracy": 0.8546810537576676,
"num_tokens": 5547970.0,
"step": 350
},
{
"epoch": 0.5446293494704992,
"grad_norm": 0.67578125,
"learning_rate": 1.653442576389043e-05,
"loss": 0.221,
"mean_token_accuracy": 0.8571889936923981,
"num_tokens": 5707525.0,
"step": 360
},
{
"epoch": 0.5597579425113465,
"grad_norm": 0.90234375,
"learning_rate": 1.5659106574360977e-05,
"loss": 0.2273,
"mean_token_accuracy": 0.8585471630096435,
"num_tokens": 5865689.0,
"step": 370
},
{
"epoch": 0.5748865355521936,
"grad_norm": 1.2109375,
"learning_rate": 1.4788422799324862e-05,
"loss": 0.2317,
"mean_token_accuracy": 0.8584190517663955,
"num_tokens": 6021932.0,
"step": 380
},
{
"epoch": 0.5900151285930408,
"grad_norm": 0.9453125,
"learning_rate": 1.3924566841739079e-05,
"loss": 0.2238,
"mean_token_accuracy": 0.8559250921010971,
"num_tokens": 6179562.0,
"step": 390
},
{
"epoch": 0.6051437216338881,
"grad_norm": 0.828125,
"learning_rate": 1.3069713911949962e-05,
"loss": 0.2169,
"mean_token_accuracy": 0.8574993282556533,
"num_tokens": 6337908.0,
"step": 400
},
{
"epoch": 0.6051437216338881,
"eval_loss": 0.24050775170326233,
"eval_num_tokens": 6337908.0,
"eval_runtime": 10.2898,
"eval_samples_per_second": 20.797,
"eval_steps_per_second": 20.797,
"step": 400
},
{
"epoch": 0.6202723146747352,
"grad_norm": 1.3515625,
"learning_rate": 1.222601655046052e-05,
"loss": 0.2295,
"mean_token_accuracy": 0.8591887027025222,
"num_tokens": 6495268.0,
"step": 410
},
{
"epoch": 0.6354009077155824,
"grad_norm": 1.0625,
"learning_rate": 1.1395599207781006e-05,
"loss": 0.2286,
"mean_token_accuracy": 0.8543924212455749,
"num_tokens": 6651839.0,
"step": 420
},
{
"epoch": 0.6505295007564297,
"grad_norm": 0.98046875,
"learning_rate": 1.0580552895010796e-05,
"loss": 0.224,
"mean_token_accuracy": 0.8684775650501251,
"num_tokens": 6809804.0,
"step": 430
},
{
"epoch": 0.6656580937972768,
"grad_norm": 1.3359375,
"learning_rate": 9.782929918621475e-06,
"loss": 0.2245,
"mean_token_accuracy": 0.8595554202795028,
"num_tokens": 6967079.0,
"step": 440
},
{
"epoch": 0.680786686838124,
"grad_norm": 0.59765625,
"learning_rate": 9.004738712699157e-06,
"loss": 0.2204,
"mean_token_accuracy": 0.863404393196106,
"num_tokens": 7126399.0,
"step": 450
},
{
"epoch": 0.6959152798789713,
"grad_norm": 1.234375,
"learning_rate": 8.247938781658551e-06,
"loss": 0.2206,
"mean_token_accuracy": 0.8627041339874267,
"num_tokens": 7285948.0,
"step": 460
},
{
"epoch": 0.7110438729198184,
"grad_norm": 1.5546875,
"learning_rate": 7.514435766163046e-06,
"loss": 0.2279,
"mean_token_accuracy": 0.8660434067249299,
"num_tokens": 7443250.0,
"step": 470
},
{
"epoch": 0.7261724659606656,
"grad_norm": 1.6015625,
"learning_rate": 6.806076644675154e-06,
"loss": 0.2233,
"mean_token_accuracy": 0.8606104016304016,
"num_tokens": 7601533.0,
"step": 480
},
{
"epoch": 0.7413010590015129,
"grad_norm": 3.28125,
"learning_rate": 6.124645082719727e-06,
"loss": 0.2214,
"mean_token_accuracy": 0.8641792595386505,
"num_tokens": 7759173.0,
"step": 490
},
{
"epoch": 0.75642965204236,
"grad_norm": 1.734375,
"learning_rate": 5.471856941570691e-06,
"loss": 0.2266,
"mean_token_accuracy": 0.8583661437034606,
"num_tokens": 7915271.0,
"step": 500
},
{
"epoch": 0.7715582450832073,
"grad_norm": 2.171875,
"learning_rate": 4.84935595767059e-06,
"loss": 0.2239,
"mean_token_accuracy": 0.863108116388321,
"num_tokens": 8072725.0,
"step": 510
},
{
"epoch": 0.7866868381240545,
"grad_norm": 2.65625,
"learning_rate": 4.2587096036621585e-06,
"loss": 0.219,
"mean_token_accuracy": 0.8645495653152466,
"num_tokens": 8232048.0,
"step": 520
},
{
"epoch": 0.8018154311649016,
"grad_norm": 2.65625,
"learning_rate": 3.70140514145403e-06,
"loss": 0.2203,
"mean_token_accuracy": 0.8692421615123749,
"num_tokens": 8389234.0,
"step": 530
},
{
"epoch": 0.8169440242057489,
"grad_norm": 0.89453125,
"learning_rate": 3.1788458772590123e-06,
"loss": 0.2153,
"mean_token_accuracy": 0.857841071486473,
"num_tokens": 8547295.0,
"step": 540
},
{
"epoch": 0.8320726172465961,
"grad_norm": 3.0,
"learning_rate": 2.6923476280348592e-06,
"loss": 0.2211,
"mean_token_accuracy": 0.8649828612804413,
"num_tokens": 8706082.0,
"step": 550
},
{
"epoch": 0.8472012102874432,
"grad_norm": 1.125,
"learning_rate": 2.2431354082251086e-06,
"loss": 0.2206,
"mean_token_accuracy": 0.8666522175073623,
"num_tokens": 8864131.0,
"step": 560
},
{
"epoch": 0.8623298033282905,
"grad_norm": 1.3671875,
"learning_rate": 1.8323403451428861e-06,
"loss": 0.2223,
"mean_token_accuracy": 0.8635704159736634,
"num_tokens": 9022578.0,
"step": 570
},
{
"epoch": 0.8774583963691377,
"grad_norm": 1.5625,
"learning_rate": 1.4609968307647638e-06,
"loss": 0.2143,
"mean_token_accuracy": 0.8704730212688446,
"num_tokens": 9181933.0,
"step": 580
},
{
"epoch": 0.8925869894099848,
"grad_norm": 1.328125,
"learning_rate": 1.1300399171065517e-06,
"loss": 0.2153,
"mean_token_accuracy": 0.8723822474479676,
"num_tokens": 9341042.0,
"step": 590
},
{
"epoch": 0.9077155824508321,
"grad_norm": 0.70703125,
"learning_rate": 8.403029617395654e-07,
"loss": 0.2257,
"mean_token_accuracy": 0.8509276181459426,
"num_tokens": 9497699.0,
"step": 600
},
{
"epoch": 0.9077155824508321,
"eval_loss": 0.23669058084487915,
"eval_num_tokens": 9497699.0,
"eval_runtime": 10.2029,
"eval_samples_per_second": 20.974,
"eval_steps_per_second": 20.974,
"step": 600
},
{
"epoch": 0.9228441754916793,
"grad_norm": 2.640625,
"learning_rate": 5.925155293759559e-07,
"loss": 0.2201,
"mean_token_accuracy": 0.864446359872818,
"num_tokens": 9655514.0,
"step": 610
},
{
"epoch": 0.9379727685325264,
"grad_norm": 1.015625,
"learning_rate": 3.8730155480696634e-07,
"loss": 0.2278,
"mean_token_accuracy": 0.848170417547226,
"num_tokens": 9812955.0,
"step": 620
},
{
"epoch": 0.9531013615733737,
"grad_norm": 0.8984375,
"learning_rate": 2.2517777181995822e-07,
"loss": 0.214,
"mean_token_accuracy": 0.8704831153154373,
"num_tokens": 9972847.0,
"step": 630
},
{
"epoch": 0.9682299546142209,
"grad_norm": 1.15625,
"learning_rate": 1.0655241205012516e-07,
"loss": 0.2211,
"mean_token_accuracy": 0.8742094576358795,
"num_tokens": 10130896.0,
"step": 640
},
{
"epoch": 0.983358547655068,
"grad_norm": 1.078125,
"learning_rate": 3.172417704330077e-08,
"loss": 0.2212,
"mean_token_accuracy": 0.8658175647258759,
"num_tokens": 10290834.0,
"step": 650
},
{
"epoch": 0.9984871406959153,
"grad_norm": 0.56640625,
"learning_rate": 8.814861181871691e-10,
"loss": 0.2272,
"mean_token_accuracy": 0.8598562389612198,
"num_tokens": 10447704.0,
"step": 660
}
],
"logging_steps": 10,
"max_steps": 661,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 200,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1.7413486193799168e+17,
"train_batch_size": 16,
"trial_name": null,
"trial_params": null
}