{ "best_metric": 93.7, "best_model_checkpoint": "../results/phrase_retrieval/PR-pass/qa/mpsDistillbert/finetuned/checkpoint-5000", "epoch": 2.0, "eval_steps": 100, "global_step": 5066, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.03947887879984208, "grad_norm": 36.08970642089844, "learning_rate": 2.9407816818002367e-05, "loss": 2.4898, "step": 100 }, { "epoch": 0.03947887879984208, "eval_exact_match": 79.53333333333333, "eval_f1": 85.07605280322392, "step": 100 }, { "epoch": 0.07895775759968417, "grad_norm": 15.822399139404297, "learning_rate": 2.8815633636004737e-05, "loss": 0.7877, "step": 200 }, { "epoch": 0.07895775759968417, "eval_exact_match": 82.16666666666667, "eval_f1": 86.96412271126349, "step": 200 }, { "epoch": 0.11843663639952626, "grad_norm": 53.009056091308594, "learning_rate": 2.8223450454007107e-05, "loss": 0.6493, "step": 300 }, { "epoch": 0.11843663639952626, "eval_exact_match": 86.96666666666667, "eval_f1": 90.26251606941636, "step": 300 }, { "epoch": 0.15791551519936833, "grad_norm": 33.941368103027344, "learning_rate": 2.7631267272009477e-05, "loss": 0.6859, "step": 400 }, { "epoch": 0.15791551519936833, "eval_exact_match": 87.7, "eval_f1": 90.70352596039436, "step": 400 }, { "epoch": 0.1973943939992104, "grad_norm": 33.127262115478516, "learning_rate": 2.7039084090011843e-05, "loss": 0.6207, "step": 500 }, { "epoch": 0.1973943939992104, "eval_exact_match": 88.76666666666667, "eval_f1": 91.3789169814247, "step": 500 }, { "epoch": 0.23687327279905251, "grad_norm": 73.58187103271484, "learning_rate": 2.6446900908014213e-05, "loss": 0.4846, "step": 600 }, { "epoch": 0.23687327279905251, "eval_exact_match": 89.6, "eval_f1": 92.12643429555197, "step": 600 }, { "epoch": 0.2763521515988946, "grad_norm": 38.15833282470703, "learning_rate": 2.585471772601658e-05, "loss": 0.4907, "step": 700 }, { "epoch": 0.2763521515988946, "eval_exact_match": 89.5, "eval_f1": 92.18318144579082, "step": 700 }, { "epoch": 0.31583103039873667, "grad_norm": 48.2036018371582, "learning_rate": 2.5262534544018952e-05, "loss": 0.4449, "step": 800 }, { "epoch": 0.31583103039873667, "eval_exact_match": 89.36666666666666, "eval_f1": 92.03679853056012, "step": 800 }, { "epoch": 0.35530990919857874, "grad_norm": 48.83040237426758, "learning_rate": 2.467035136202132e-05, "loss": 0.4882, "step": 900 }, { "epoch": 0.35530990919857874, "eval_exact_match": 91.0, "eval_f1": 93.24476194351007, "step": 900 }, { "epoch": 0.3947887879984208, "grad_norm": 33.257354736328125, "learning_rate": 2.407816818002369e-05, "loss": 0.4357, "step": 1000 }, { "epoch": 0.3947887879984208, "eval_exact_match": 91.16666666666667, "eval_f1": 93.38561670865543, "step": 1000 }, { "epoch": 0.43426766679826295, "grad_norm": 36.499168395996094, "learning_rate": 2.3485984998026055e-05, "loss": 0.499, "step": 1100 }, { "epoch": 0.43426766679826295, "eval_exact_match": 90.96666666666667, "eval_f1": 93.31853006155642, "step": 1100 }, { "epoch": 0.47374654559810503, "grad_norm": 9.117573738098145, "learning_rate": 2.2893801816028428e-05, "loss": 0.4294, "step": 1200 }, { "epoch": 0.47374654559810503, "eval_exact_match": 90.83333333333333, "eval_f1": 93.1301494526572, "step": 1200 }, { "epoch": 0.513225424397947, "grad_norm": 47.918365478515625, "learning_rate": 2.2301618634030794e-05, "loss": 0.4528, "step": 1300 }, { "epoch": 0.513225424397947, "eval_exact_match": 91.8, "eval_f1": 93.93781839408155, "step": 1300 }, { "epoch": 0.5527043031977892, "grad_norm": 2.020205020904541, "learning_rate": 2.1709435452033164e-05, "loss": 0.3965, "step": 1400 }, { "epoch": 0.5527043031977892, "eval_exact_match": 91.46666666666667, "eval_f1": 93.66177518095938, "step": 1400 }, { "epoch": 0.5921831819976313, "grad_norm": 16.072200775146484, "learning_rate": 2.111725227003553e-05, "loss": 0.3609, "step": 1500 }, { "epoch": 0.5921831819976313, "eval_exact_match": 91.7, "eval_f1": 93.75143838703438, "step": 1500 }, { "epoch": 0.6316620607974733, "grad_norm": 48.690391540527344, "learning_rate": 2.05250690880379e-05, "loss": 0.4318, "step": 1600 }, { "epoch": 0.6316620607974733, "eval_exact_match": 91.46666666666667, "eval_f1": 93.55887712871922, "step": 1600 }, { "epoch": 0.6711409395973155, "grad_norm": 3.189640998840332, "learning_rate": 1.993288590604027e-05, "loss": 0.3602, "step": 1700 }, { "epoch": 0.6711409395973155, "eval_exact_match": 92.43333333333334, "eval_f1": 94.2249514830154, "step": 1700 }, { "epoch": 0.7106198183971575, "grad_norm": 0.2732882499694824, "learning_rate": 1.9340702724042637e-05, "loss": 0.3656, "step": 1800 }, { "epoch": 0.7106198183971575, "eval_exact_match": 92.3, "eval_f1": 94.17740849673203, "step": 1800 }, { "epoch": 0.7500986971969996, "grad_norm": 25.344755172729492, "learning_rate": 1.8748519542045006e-05, "loss": 0.3825, "step": 1900 }, { "epoch": 0.7500986971969996, "eval_exact_match": 92.16666666666667, "eval_f1": 94.0608726088726, "step": 1900 }, { "epoch": 0.7895775759968416, "grad_norm": 48.43545913696289, "learning_rate": 1.8156336360047373e-05, "loss": 0.3447, "step": 2000 }, { "epoch": 0.7895775759968416, "eval_exact_match": 92.53333333333333, "eval_f1": 94.25784554225729, "step": 2000 }, { "epoch": 0.8290564547966838, "grad_norm": 38.915374755859375, "learning_rate": 1.7564153178049743e-05, "loss": 0.3033, "step": 2100 }, { "epoch": 0.8290564547966838, "eval_exact_match": 92.0, "eval_f1": 94.01127170868344, "step": 2100 }, { "epoch": 0.8685353335965259, "grad_norm": 3.7626020908355713, "learning_rate": 1.6971969996052112e-05, "loss": 0.3126, "step": 2200 }, { "epoch": 0.8685353335965259, "eval_exact_match": 92.4, "eval_f1": 94.20074145705928, "step": 2200 }, { "epoch": 0.9080142123963679, "grad_norm": 11.15784740447998, "learning_rate": 1.6379786814054482e-05, "loss": 0.368, "step": 2300 }, { "epoch": 0.9080142123963679, "eval_exact_match": 92.46666666666667, "eval_f1": 94.47528310578315, "step": 2300 }, { "epoch": 0.9474930911962101, "grad_norm": 55.31396484375, "learning_rate": 1.578760363205685e-05, "loss": 0.3398, "step": 2400 }, { "epoch": 0.9474930911962101, "eval_exact_match": 92.46666666666667, "eval_f1": 94.40168950153159, "step": 2400 }, { "epoch": 0.9869719699960521, "grad_norm": 32.73415756225586, "learning_rate": 1.5195420450059218e-05, "loss": 0.3177, "step": 2500 }, { "epoch": 0.9869719699960521, "eval_exact_match": 92.46666666666667, "eval_f1": 94.35837233137235, "step": 2500 }, { "epoch": 1.026450848795894, "grad_norm": 24.14990234375, "learning_rate": 1.4603237268061586e-05, "loss": 0.2105, "step": 2600 }, { "epoch": 1.026450848795894, "eval_exact_match": 92.7, "eval_f1": 94.4984410208528, "step": 2600 }, { "epoch": 1.0659297275957362, "grad_norm": 0.0023591353092342615, "learning_rate": 1.4011054086063956e-05, "loss": 0.1006, "step": 2700 }, { "epoch": 1.0659297275957362, "eval_exact_match": 92.36666666666666, "eval_f1": 94.37342015392016, "step": 2700 }, { "epoch": 1.1054086063955784, "grad_norm": 20.859468460083008, "learning_rate": 1.3418870904066324e-05, "loss": 0.1476, "step": 2800 }, { "epoch": 1.1054086063955784, "eval_exact_match": 92.63333333333334, "eval_f1": 94.58407021341235, "step": 2800 }, { "epoch": 1.1448874851954205, "grad_norm": 0.3791426420211792, "learning_rate": 1.2826687722068692e-05, "loss": 0.1295, "step": 2900 }, { "epoch": 1.1448874851954205, "eval_exact_match": 93.13333333333334, "eval_f1": 94.95533549783549, "step": 2900 }, { "epoch": 1.1843663639952626, "grad_norm": 4.289775371551514, "learning_rate": 1.2234504540071062e-05, "loss": 0.1727, "step": 3000 }, { "epoch": 1.1843663639952626, "eval_exact_match": 92.66666666666667, "eval_f1": 94.65173544973547, "step": 3000 }, { "epoch": 1.2238452427951045, "grad_norm": 0.0014754978474229574, "learning_rate": 1.164232135807343e-05, "loss": 0.1345, "step": 3100 }, { "epoch": 1.2238452427951045, "eval_exact_match": 92.96666666666667, "eval_f1": 94.9088371686793, "step": 3100 }, { "epoch": 1.2633241215949467, "grad_norm": 43.49968338012695, "learning_rate": 1.10501381760758e-05, "loss": 0.202, "step": 3200 }, { "epoch": 1.2633241215949467, "eval_exact_match": 92.66666666666667, "eval_f1": 94.57915070481636, "step": 3200 }, { "epoch": 1.3028030003947888, "grad_norm": 13.5677490234375, "learning_rate": 1.0457954994078168e-05, "loss": 0.1898, "step": 3300 }, { "epoch": 1.3028030003947888, "eval_exact_match": 92.86666666666666, "eval_f1": 94.8515283790284, "step": 3300 }, { "epoch": 1.342281879194631, "grad_norm": 76.90211486816406, "learning_rate": 9.865771812080538e-06, "loss": 0.1433, "step": 3400 }, { "epoch": 1.342281879194631, "eval_exact_match": 93.46666666666667, "eval_f1": 95.2914963359081, "step": 3400 }, { "epoch": 1.3817607579944728, "grad_norm": 7.809657096862793, "learning_rate": 9.273588630082906e-06, "loss": 0.1693, "step": 3500 }, { "epoch": 1.3817607579944728, "eval_exact_match": 93.0, "eval_f1": 94.87400432900435, "step": 3500 }, { "epoch": 1.421239636794315, "grad_norm": 4.223247051239014, "learning_rate": 8.681405448085274e-06, "loss": 0.1827, "step": 3600 }, { "epoch": 1.421239636794315, "eval_exact_match": 93.33333333333333, "eval_f1": 95.12902597402599, "step": 3600 }, { "epoch": 1.460718515594157, "grad_norm": 27.452770233154297, "learning_rate": 8.089222266087644e-06, "loss": 0.1843, "step": 3700 }, { "epoch": 1.460718515594157, "eval_exact_match": 93.36666666666666, "eval_f1": 95.09415584415585, "step": 3700 }, { "epoch": 1.5001973943939992, "grad_norm": 20.997011184692383, "learning_rate": 7.497039084090013e-06, "loss": 0.1656, "step": 3800 }, { "epoch": 1.5001973943939992, "eval_exact_match": 93.7, "eval_f1": 95.35056277056279, "step": 3800 }, { "epoch": 1.5396762731938414, "grad_norm": 0.005699894856661558, "learning_rate": 6.90485590209238e-06, "loss": 0.1013, "step": 3900 }, { "epoch": 1.5396762731938414, "eval_exact_match": 93.4, "eval_f1": 95.10358730158731, "step": 3900 }, { "epoch": 1.5791551519936835, "grad_norm": 4.07551383972168, "learning_rate": 6.312672720094749e-06, "loss": 0.1632, "step": 4000 }, { "epoch": 1.5791551519936835, "eval_exact_match": 93.43333333333334, "eval_f1": 95.18676190476195, "step": 4000 }, { "epoch": 1.6186340307935254, "grad_norm": 0.002155046910047531, "learning_rate": 5.720489538097118e-06, "loss": 0.1702, "step": 4100 }, { "epoch": 1.6186340307935254, "eval_exact_match": 93.36666666666666, "eval_f1": 95.09804473304477, "step": 4100 }, { "epoch": 1.6581129095933675, "grad_norm": 0.017265846952795982, "learning_rate": 5.128306356099487e-06, "loss": 0.1308, "step": 4200 }, { "epoch": 1.6581129095933675, "eval_exact_match": 93.23333333333333, "eval_f1": 94.99049422799425, "step": 4200 }, { "epoch": 1.6975917883932097, "grad_norm": 0.009932265616953373, "learning_rate": 4.536123174101856e-06, "loss": 0.1608, "step": 4300 }, { "epoch": 1.6975917883932097, "eval_exact_match": 93.63333333333334, "eval_f1": 95.32610028860032, "step": 4300 }, { "epoch": 1.7370706671930516, "grad_norm": 56.21049880981445, "learning_rate": 3.943939992104225e-06, "loss": 0.1296, "step": 4400 }, { "epoch": 1.7370706671930516, "eval_exact_match": 93.43333333333334, "eval_f1": 95.17288023088024, "step": 4400 }, { "epoch": 1.7765495459928937, "grad_norm": 78.09048461914062, "learning_rate": 3.3517568101065932e-06, "loss": 0.1752, "step": 4500 }, { "epoch": 1.7765495459928937, "eval_exact_match": 93.66666666666667, "eval_f1": 95.27616883116887, "step": 4500 }, { "epoch": 1.8160284247927359, "grad_norm": 2.959677219390869, "learning_rate": 2.7595736281089617e-06, "loss": 0.1195, "step": 4600 }, { "epoch": 1.8160284247927359, "eval_exact_match": 93.43333333333334, "eval_f1": 95.0532770562771, "step": 4600 }, { "epoch": 1.855507303592578, "grad_norm": 69.6814193725586, "learning_rate": 2.1673904461113303e-06, "loss": 0.1849, "step": 4700 }, { "epoch": 1.855507303592578, "eval_exact_match": 93.5, "eval_f1": 95.10591774891775, "step": 4700 }, { "epoch": 1.8949861823924201, "grad_norm": 0.19182445108890533, "learning_rate": 1.5752072641136992e-06, "loss": 0.09, "step": 4800 }, { "epoch": 1.8949861823924201, "eval_exact_match": 93.46666666666667, "eval_f1": 95.12782467532469, "step": 4800 }, { "epoch": 1.9344650611922622, "grad_norm": 65.0757827758789, "learning_rate": 9.83024082116068e-07, "loss": 0.1393, "step": 4900 }, { "epoch": 1.9344650611922622, "eval_exact_match": 93.7, "eval_f1": 95.2371066252588, "step": 4900 }, { "epoch": 1.9739439399921044, "grad_norm": 96.50936889648438, "learning_rate": 3.9084090011843665e-07, "loss": 0.149, "step": 5000 }, { "epoch": 1.9739439399921044, "eval_exact_match": 93.7, "eval_f1": 95.25543995859213, "step": 5000 }, { "epoch": 2.0, "step": 5066, "total_flos": 3.763306473501082e+16, "train_loss": 0.339278704556278, "train_runtime": 7807.973, "train_samples_per_second": 5.19, "train_steps_per_second": 0.649 } ], "logging_steps": 100, "max_steps": 5066, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 3.763306473501082e+16, "train_batch_size": 8, "trial_name": null, "trial_params": null }