{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 5.619834710743802,
  "eval_steps": 10,
  "global_step": 340,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.1652892561983471,
      "grad_norm": 2.5004961490631104,
      "learning_rate": 3.998781654038192e-05,
      "loss": 1.3959,
      "step": 10
    },
    {
      "epoch": 0.1652892561983471,
      "eval_loss": 0.9773627519607544,
      "eval_runtime": 1.0911,
      "eval_samples_per_second": 10.998,
      "eval_steps_per_second": 1.833,
      "step": 10
    },
    {
      "epoch": 0.3305785123966942,
      "grad_norm": 3.3259832859039307,
      "learning_rate": 3.9951281005196486e-05,
      "loss": 1.195,
      "step": 20
    },
    {
      "epoch": 0.3305785123966942,
      "eval_loss": 0.783854603767395,
      "eval_runtime": 1.1063,
      "eval_samples_per_second": 10.847,
      "eval_steps_per_second": 1.808,
      "step": 20
    },
    {
      "epoch": 0.49586776859504134,
      "grad_norm": 2.37343168258667,
      "learning_rate": 3.989043790736547e-05,
      "loss": 1.035,
      "step": 30
    },
    {
      "epoch": 0.49586776859504134,
      "eval_loss": 0.6894669532775879,
      "eval_runtime": 1.4555,
      "eval_samples_per_second": 8.245,
      "eval_steps_per_second": 1.374,
      "step": 30
    },
    {
      "epoch": 0.6611570247933884,
      "grad_norm": 3.1549763679504395,
      "learning_rate": 3.980536137483141e-05,
      "loss": 0.9097,
      "step": 40
    },
    {
      "epoch": 0.6611570247933884,
      "eval_loss": 0.6716279983520508,
      "eval_runtime": 1.1151,
      "eval_samples_per_second": 10.761,
      "eval_steps_per_second": 1.794,
      "step": 40
    },
    {
      "epoch": 0.8264462809917356,
      "grad_norm": 2.8315067291259766,
      "learning_rate": 3.9696155060244166e-05,
      "loss": 0.8614,
      "step": 50
    },
    {
      "epoch": 0.8264462809917356,
      "eval_loss": 0.6319628357887268,
      "eval_runtime": 1.1073,
      "eval_samples_per_second": 10.837,
      "eval_steps_per_second": 1.806,
      "step": 50
    },
    {
      "epoch": 0.9917355371900827,
      "grad_norm": 3.369000196456909,
      "learning_rate": 3.9562952014676116e-05,
      "loss": 0.9259,
      "step": 60
    },
    {
      "epoch": 0.9917355371900827,
      "eval_loss": 0.5961965918540955,
      "eval_runtime": 1.3743,
      "eval_samples_per_second": 8.732,
      "eval_steps_per_second": 1.455,
      "step": 60
    },
    {
      "epoch": 1.1570247933884297,
      "grad_norm": 2.863161087036133,
      "learning_rate": 3.940591452551993e-05,
      "loss": 0.7896,
      "step": 70
    },
    {
      "epoch": 1.1570247933884297,
      "eval_loss": 0.5863456726074219,
      "eval_runtime": 1.0994,
      "eval_samples_per_second": 10.915,
      "eval_steps_per_second": 1.819,
      "step": 70
    },
    {
      "epoch": 1.322314049586777,
      "grad_norm": 2.5636465549468994,
      "learning_rate": 3.922523391876638e-05,
      "loss": 0.8601,
      "step": 80
    },
    {
      "epoch": 1.322314049586777,
      "eval_loss": 0.5487304925918579,
      "eval_runtime": 1.0935,
      "eval_samples_per_second": 10.974,
      "eval_steps_per_second": 1.829,
      "step": 80
    },
    {
      "epoch": 1.487603305785124,
      "grad_norm": 2.7444987297058105,
      "learning_rate": 3.9021130325903076e-05,
      "loss": 0.7592,
      "step": 90
    },
    {
      "epoch": 1.487603305785124,
      "eval_loss": 0.543647050857544,
      "eval_runtime": 1.1357,
      "eval_samples_per_second": 10.566,
      "eval_steps_per_second": 1.761,
      "step": 90
    },
    {
      "epoch": 1.6528925619834711,
      "grad_norm": 4.245235919952393,
      "learning_rate": 3.879385241571817e-05,
      "loss": 0.7439,
      "step": 100
    },
    {
      "epoch": 1.6528925619834711,
      "eval_loss": 0.5209221839904785,
      "eval_runtime": 1.1049,
      "eval_samples_per_second": 10.861,
      "eval_steps_per_second": 1.81,
      "step": 100
    },
    {
      "epoch": 1.8181818181818183,
      "grad_norm": 3.720158815383911,
      "learning_rate": 3.854367709133575e-05,
      "loss": 0.7675,
      "step": 110
    },
    {
      "epoch": 1.8181818181818183,
      "eval_loss": 0.502068281173706,
      "eval_runtime": 1.416,
      "eval_samples_per_second": 8.474,
      "eval_steps_per_second": 1.412,
      "step": 110
    },
    {
      "epoch": 1.9834710743801653,
      "grad_norm": 3.368499279022217,
      "learning_rate": 3.827090915285202e-05,
      "loss": 0.7396,
      "step": 120
    },
    {
      "epoch": 1.9834710743801653,
      "eval_loss": 0.4688035845756531,
      "eval_runtime": 1.1068,
      "eval_samples_per_second": 10.842,
      "eval_steps_per_second": 1.807,
      "step": 120
    },
    {
      "epoch": 2.1487603305785123,
      "grad_norm": 3.335733652114868,
      "learning_rate": 3.7975880925983345e-05,
      "loss": 0.5938,
      "step": 130
    },
    {
      "epoch": 2.1487603305785123,
      "eval_loss": 0.45896804332733154,
      "eval_runtime": 1.0901,
      "eval_samples_per_second": 11.008,
      "eval_steps_per_second": 1.835,
      "step": 130
    },
    {
      "epoch": 2.3140495867768593,
      "grad_norm": 4.038167953491211,
      "learning_rate": 3.7658951857178544e-05,
      "loss": 0.5712,
      "step": 140
    },
    {
      "epoch": 2.3140495867768593,
      "eval_loss": 0.4418785274028778,
      "eval_runtime": 1.3332,
      "eval_samples_per_second": 9.001,
      "eval_steps_per_second": 1.5,
      "step": 140
    },
    {
      "epoch": 2.479338842975207,
      "grad_norm": 4.076087951660156,
      "learning_rate": 3.732050807568878e-05,
      "loss": 0.6814,
      "step": 150
    },
    {
      "epoch": 2.479338842975207,
      "eval_loss": 0.4231901168823242,
      "eval_runtime": 1.0886,
      "eval_samples_per_second": 11.023,
      "eval_steps_per_second": 1.837,
      "step": 150
    },
    {
      "epoch": 2.644628099173554,
      "grad_norm": 4.039748668670654,
      "learning_rate": 3.696096192312852e-05,
      "loss": 0.6155,
      "step": 160
    },
    {
      "epoch": 2.644628099173554,
      "eval_loss": 0.4101436138153076,
      "eval_runtime": 1.1018,
      "eval_samples_per_second": 10.891,
      "eval_steps_per_second": 1.815,
      "step": 160
    },
    {
      "epoch": 2.809917355371901,
      "grad_norm": 4.491166114807129,
      "learning_rate": 3.658075145110083e-05,
      "loss": 0.5797,
      "step": 170
    },
    {
      "epoch": 2.809917355371901,
      "eval_loss": 0.3929840326309204,
      "eval_runtime": 1.3687,
      "eval_samples_per_second": 8.768,
      "eval_steps_per_second": 1.461,
      "step": 170
    },
    {
      "epoch": 2.975206611570248,
      "grad_norm": 4.6367106437683105,
      "learning_rate": 3.6180339887498953e-05,
      "loss": 0.6346,
      "step": 180
    },
    {
      "epoch": 2.975206611570248,
      "eval_loss": 0.3631528615951538,
      "eval_runtime": 1.1071,
      "eval_samples_per_second": 10.839,
      "eval_steps_per_second": 1.806,
      "step": 180
    },
    {
      "epoch": 3.1404958677685952,
      "grad_norm": 3.62372088432312,
      "learning_rate": 3.576021507213444e-05,
      "loss": 0.4537,
      "step": 190
    },
    {
      "epoch": 3.1404958677685952,
      "eval_loss": 0.35405832529067993,
      "eval_runtime": 1.2103,
      "eval_samples_per_second": 9.915,
      "eval_steps_per_second": 1.652,
      "step": 190
    },
    {
      "epoch": 3.3057851239669422,
      "grad_norm": 3.1658225059509277,
      "learning_rate": 3.532088886237956e-05,
      "loss": 0.4568,
      "step": 200
    },
    {
      "epoch": 3.3057851239669422,
      "eval_loss": 0.3364166021347046,
      "eval_runtime": 1.1013,
      "eval_samples_per_second": 10.896,
      "eval_steps_per_second": 1.816,
      "step": 200
    },
    {
      "epoch": 3.4710743801652892,
      "grad_norm": 5.640577793121338,
      "learning_rate": 3.4862896509547886e-05,
      "loss": 0.4796,
      "step": 210
    },
    {
      "epoch": 3.4710743801652892,
      "eval_loss": 0.31573551893234253,
      "eval_runtime": 1.0982,
      "eval_samples_per_second": 10.927,
      "eval_steps_per_second": 1.821,
      "step": 210
    },
    {
      "epoch": 3.6363636363636362,
      "grad_norm": 5.163906097412109,
      "learning_rate": 3.438679600677303e-05,
      "loss": 0.4309,
      "step": 220
    },
    {
      "epoch": 3.6363636363636362,
      "eval_loss": 0.29981058835983276,
      "eval_runtime": 1.4641,
      "eval_samples_per_second": 8.196,
      "eval_steps_per_second": 1.366,
      "step": 220
    },
    {
      "epoch": 3.8016528925619832,
      "grad_norm": 10.097945213317871,
      "learning_rate": 3.3893167409179945e-05,
      "loss": 0.5423,
      "step": 230
    },
    {
      "epoch": 3.8016528925619832,
      "eval_loss": 0.30080342292785645,
      "eval_runtime": 1.0952,
      "eval_samples_per_second": 10.957,
      "eval_steps_per_second": 1.826,
      "step": 230
    },
    {
      "epoch": 3.9669421487603307,
      "grad_norm": 5.08881950378418,
      "learning_rate": 3.3382612127177166e-05,
      "loss": 0.51,
      "step": 240
    },
    {
      "epoch": 3.9669421487603307,
      "eval_loss": 0.30431026220321655,
      "eval_runtime": 1.1103,
      "eval_samples_per_second": 10.807,
      "eval_steps_per_second": 1.801,
      "step": 240
    },
    {
      "epoch": 4.132231404958677,
      "grad_norm": 3.142008066177368,
      "learning_rate": 3.285575219373079e-05,
      "loss": 0.4218,
      "step": 250
    },
    {
      "epoch": 4.132231404958677,
      "eval_loss": 0.29047852754592896,
      "eval_runtime": 1.2712,
      "eval_samples_per_second": 9.44,
      "eval_steps_per_second": 1.573,
      "step": 250
    },
    {
      "epoch": 4.297520661157025,
      "grad_norm": 4.357353687286377,
      "learning_rate": 3.2313229506513167e-05,
      "loss": 0.3503,
      "step": 260
    },
    {
      "epoch": 4.297520661157025,
      "eval_loss": 0.29693418741226196,
      "eval_runtime": 1.0964,
      "eval_samples_per_second": 10.945,
      "eval_steps_per_second": 1.824,
      "step": 260
    },
    {
      "epoch": 4.462809917355372,
      "grad_norm": 5.129684925079346,
      "learning_rate": 3.1755705045849465e-05,
      "loss": 0.3235,
      "step": 270
    },
    {
      "epoch": 4.462809917355372,
      "eval_loss": 0.27224814891815186,
      "eval_runtime": 1.0905,
      "eval_samples_per_second": 11.004,
      "eval_steps_per_second": 1.834,
      "step": 270
    },
    {
      "epoch": 4.628099173553719,
      "grad_norm": 5.414126396179199,
      "learning_rate": 3.1183858069414936e-05,
      "loss": 0.3671,
      "step": 280
    },
    {
      "epoch": 4.628099173553719,
      "eval_loss": 0.2563490867614746,
      "eval_runtime": 1.0963,
      "eval_samples_per_second": 10.946,
      "eval_steps_per_second": 1.824,
      "step": 280
    },
    {
      "epoch": 4.793388429752066,
      "grad_norm": 4.964554309844971,
      "learning_rate": 3.05983852846641e-05,
      "loss": 0.3399,
      "step": 290
    },
    {
      "epoch": 4.793388429752066,
      "eval_loss": 0.25828373432159424,
      "eval_runtime": 1.1181,
      "eval_samples_per_second": 10.732,
      "eval_steps_per_second": 1.789,
      "step": 290
    },
    {
      "epoch": 4.958677685950414,
      "grad_norm": 4.63615083694458,
      "learning_rate": 3.0000000000000004e-05,
      "loss": 0.3685,
      "step": 300
    },
    {
      "epoch": 4.958677685950414,
      "eval_loss": 0.24292020499706268,
      "eval_runtime": 1.445,
      "eval_samples_per_second": 8.305,
      "eval_steps_per_second": 1.384,
      "step": 300
    },
    {
      "epoch": 5.12396694214876,
      "grad_norm": 4.34980583190918,
      "learning_rate": 2.938943125571782e-05,
      "loss": 0.2943,
      "step": 310
    },
    {
      "epoch": 5.12396694214876,
      "eval_loss": 0.20842806994915009,
      "eval_runtime": 1.1081,
      "eval_samples_per_second": 10.829,
      "eval_steps_per_second": 1.805,
      "step": 310
    },
    {
      "epoch": 5.289256198347108,
      "grad_norm": 6.3195881843566895,
      "learning_rate": 2.876742293578155e-05,
      "loss": 0.2589,
      "step": 320
    },
    {
      "epoch": 5.289256198347108,
      "eval_loss": 0.217214435338974,
      "eval_runtime": 1.0894,
      "eval_samples_per_second": 11.015,
      "eval_steps_per_second": 1.836,
      "step": 320
    },
    {
      "epoch": 5.454545454545454,
      "grad_norm": 6.084163665771484,
      "learning_rate": 2.813473286151601e-05,
      "loss": 0.2143,
      "step": 330
    },
    {
      "epoch": 5.454545454545454,
      "eval_loss": 0.22991180419921875,
      "eval_runtime": 1.2554,
      "eval_samples_per_second": 9.559,
      "eval_steps_per_second": 1.593,
      "step": 330
    },
    {
      "epoch": 5.619834710743802,
      "grad_norm": 5.4227728843688965,
      "learning_rate": 2.7492131868318247e-05,
      "loss": 0.3084,
      "step": 340
    },
    {
      "epoch": 5.619834710743802,
      "eval_loss": 0.21594808995723724,
      "eval_runtime": 1.1006,
      "eval_samples_per_second": 10.903,
      "eval_steps_per_second": 1.817,
      "step": 340
    }
  ],
  "logging_steps": 10,
  "max_steps": 900,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 15,
  "save_steps": 10,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 545316196581376.0,
  "train_batch_size": 2,
  "trial_name": null,
  "trial_params": null
}