{ "best_metric": 1.659671425819397, "best_model_checkpoint": "/data/user_data/gonilude/python_and_text_pythia_1b/checkpoint-150", "epoch": 3.0, "eval_steps": 50, "global_step": 237, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0, "eval_accuracy": 0.19718309859154928, "eval_loss": 1.9329111576080322, "eval_runtime": 0.9209, "eval_samples_per_second": 77.098, "eval_steps_per_second": 9.773, "num_input_tokens_seen": 0, "step": 0 }, { "epoch": 0.012658227848101266, "grad_norm": null, "learning_rate": 0.0, "loss": 1.8643, "num_input_tokens_seen": 8192, "step": 1 }, { "epoch": 0.06329113924050633, "grad_norm": 98.8646469116211, "learning_rate": 2.5e-06, "loss": 2.0292, "num_input_tokens_seen": 40960, "step": 5 }, { "epoch": 0.12658227848101267, "grad_norm": 78.7982406616211, "learning_rate": 1.5000000000000002e-05, "loss": 2.6274, "num_input_tokens_seen": 81920, "step": 10 }, { "epoch": 0.189873417721519, "grad_norm": 44.39988327026367, "learning_rate": 1.999153201672344e-05, "loss": 1.9365, "num_input_tokens_seen": 122880, "step": 15 }, { "epoch": 0.25316455696202533, "grad_norm": 56.09316635131836, "learning_rate": 1.9953925451903757e-05, "loss": 2.8005, "num_input_tokens_seen": 163840, "step": 20 }, { "epoch": 0.31645569620253167, "grad_norm": 23.778945922851562, "learning_rate": 1.986479890027153e-05, "loss": 1.9282, "num_input_tokens_seen": 204800, "step": 25 }, { "epoch": 0.379746835443038, "grad_norm": 22.14593505859375, "learning_rate": 1.9729275686705832e-05, "loss": 1.8437, "num_input_tokens_seen": 245760, "step": 30 }, { "epoch": 0.4430379746835443, "grad_norm": 21.635009765625, "learning_rate": 1.9547993211399753e-05, "loss": 2.1747, "num_input_tokens_seen": 286720, "step": 35 }, { "epoch": 0.5063291139240507, "grad_norm": 36.09115982055664, "learning_rate": 1.932180409200991e-05, "loss": 1.8426, "num_input_tokens_seen": 327680, "step": 40 }, { "epoch": 0.569620253164557, "grad_norm": 
21.926368713378906, "learning_rate": 1.905177215357839e-05, "loss": 2.0141, "num_input_tokens_seen": 368640, "step": 45 }, { "epoch": 0.6329113924050633, "grad_norm": 24.96376609802246, "learning_rate": 1.8739167425092644e-05, "loss": 1.7534, "num_input_tokens_seen": 409600, "step": 50 }, { "epoch": 0.6329113924050633, "eval_accuracy": 0.1267605633802817, "eval_loss": 2.1001596450805664, "eval_runtime": 0.8338, "eval_samples_per_second": 85.15, "eval_steps_per_second": 10.794, "num_input_tokens_seen": 409600, "step": 50 }, { "epoch": 0.6962025316455697, "grad_norm": 25.672304153442383, "learning_rate": 1.838546016621564e-05, "loss": 1.8794, "num_input_tokens_seen": 450560, "step": 55 }, { "epoch": 0.759493670886076, "grad_norm": 17.974288940429688, "learning_rate": 1.7992313952280175e-05, "loss": 1.6198, "num_input_tokens_seen": 491520, "step": 60 }, { "epoch": 0.8227848101265823, "grad_norm": 17.891626358032227, "learning_rate": 1.7561577850070355e-05, "loss": 1.8402, "num_input_tokens_seen": 532480, "step": 65 }, { "epoch": 0.8860759493670886, "grad_norm": 26.64494514465332, "learning_rate": 1.709527772118953e-05, "loss": 1.707, "num_input_tokens_seen": 573440, "step": 70 }, { "epoch": 0.9493670886075949, "grad_norm": 8.569857597351074, "learning_rate": 1.659560669391714e-05, "loss": 1.717, "num_input_tokens_seen": 614400, "step": 75 }, { "epoch": 1.0126582278481013, "grad_norm": 31.739055633544922, "learning_rate": 1.6064914848367818e-05, "loss": 1.687, "num_input_tokens_seen": 655360, "step": 80 }, { "epoch": 1.0759493670886076, "grad_norm": 28.70093536376953, "learning_rate": 1.5505698163465986e-05, "loss": 1.9082, "num_input_tokens_seen": 696320, "step": 85 }, { "epoch": 1.139240506329114, "grad_norm": 28.82413673400879, "learning_rate": 1.4920586777721231e-05, "loss": 1.7321, "num_input_tokens_seen": 737280, "step": 90 }, { "epoch": 1.2025316455696202, "grad_norm": 20.39600372314453, "learning_rate": 1.4312332619016964e-05, "loss": 1.675, 
"num_input_tokens_seen": 778240, "step": 95 }, { "epoch": 1.2658227848101267, "grad_norm": 14.24821949005127, "learning_rate": 1.3683796461592604e-05, "loss": 1.737, "num_input_tokens_seen": 819200, "step": 100 }, { "epoch": 1.2658227848101267, "eval_accuracy": 0.2112676056338028, "eval_loss": 1.70565927028656, "eval_runtime": 0.8499, "eval_samples_per_second": 83.543, "eval_steps_per_second": 10.59, "num_input_tokens_seen": 819200, "step": 100 }, { "epoch": 1.3291139240506329, "grad_norm": 17.15838050842285, "learning_rate": 1.3037934471093683e-05, "loss": 1.6419, "num_input_tokens_seen": 860160, "step": 105 }, { "epoch": 1.3924050632911391, "grad_norm": 20.325576782226562, "learning_rate": 1.2377784300971807e-05, "loss": 1.6242, "num_input_tokens_seen": 901120, "step": 110 }, { "epoch": 1.4556962025316456, "grad_norm": 10.852635383605957, "learning_rate": 1.1706450805626762e-05, "loss": 1.676, "num_input_tokens_seen": 942080, "step": 115 }, { "epoch": 1.518987341772152, "grad_norm": 13.203107833862305, "learning_rate": 1.1027091437485404e-05, "loss": 1.6001, "num_input_tokens_seen": 983040, "step": 120 }, { "epoch": 1.5822784810126582, "grad_norm": 13.230643272399902, "learning_rate": 1.0342901396698658e-05, "loss": 1.5378, "num_input_tokens_seen": 1024000, "step": 125 }, { "epoch": 1.6455696202531644, "grad_norm": 14.586160659790039, "learning_rate": 9.657098603301347e-06, "loss": 1.7631, "num_input_tokens_seen": 1064960, "step": 130 }, { "epoch": 1.7088607594936709, "grad_norm": 9.582100868225098, "learning_rate": 8.9729085625146e-06, "loss": 1.615, "num_input_tokens_seen": 1105920, "step": 135 }, { "epoch": 1.7721518987341773, "grad_norm": 13.78858470916748, "learning_rate": 8.293549194373243e-06, "loss": 1.5787, "num_input_tokens_seen": 1146880, "step": 140 }, { "epoch": 1.8354430379746836, "grad_norm": 15.993226051330566, "learning_rate": 7.622215699028196e-06, "loss": 1.6013, "num_input_tokens_seen": 1187840, "step": 145 }, { "epoch": 1.8987341772151898, 
"grad_norm": 28.815187454223633, "learning_rate": 6.962065528906321e-06, "loss": 1.6248, "num_input_tokens_seen": 1228800, "step": 150 }, { "epoch": 1.8987341772151898, "eval_accuracy": 0.19718309859154928, "eval_loss": 1.659671425819397, "eval_runtime": 0.8131, "eval_samples_per_second": 87.32, "eval_steps_per_second": 11.069, "num_input_tokens_seen": 1228800, "step": 150 }, { "epoch": 1.9620253164556962, "grad_norm": 6.942416191101074, "learning_rate": 6.316203538407397e-06, "loss": 1.5599, "num_input_tokens_seen": 1269760, "step": 155 }, { "epoch": 2.0253164556962027, "grad_norm": 10.559165954589844, "learning_rate": 5.687667380983037e-06, "loss": 1.4922, "num_input_tokens_seen": 1310720, "step": 160 }, { "epoch": 2.088607594936709, "grad_norm": 15.816205024719238, "learning_rate": 5.07941322227877e-06, "loss": 1.5809, "num_input_tokens_seen": 1351680, "step": 165 }, { "epoch": 2.151898734177215, "grad_norm": 17.880002975463867, "learning_rate": 4.494301836534016e-06, "loss": 1.6087, "num_input_tokens_seen": 1392640, "step": 170 }, { "epoch": 2.2151898734177213, "grad_norm": 22.581768035888672, "learning_rate": 3.935085151632185e-06, "loss": 1.5043, "num_input_tokens_seen": 1433600, "step": 175 }, { "epoch": 2.278481012658228, "grad_norm": 24.53186798095703, "learning_rate": 3.4043933060828606e-06, "loss": 1.6017, "num_input_tokens_seen": 1474560, "step": 180 }, { "epoch": 2.3417721518987342, "grad_norm": 19.106027603149414, "learning_rate": 2.9047222788104712e-06, "loss": 1.5861, "num_input_tokens_seen": 1515520, "step": 185 }, { "epoch": 2.4050632911392404, "grad_norm": 10.13152027130127, "learning_rate": 2.4384221499296466e-06, "loss": 1.5453, "num_input_tokens_seen": 1556480, "step": 190 }, { "epoch": 2.4683544303797467, "grad_norm": 29.276445388793945, "learning_rate": 2.007686047719831e-06, "loss": 1.4934, "num_input_tokens_seen": 1597440, "step": 195 }, { "epoch": 2.5316455696202533, "grad_norm": 10.298810005187988, "learning_rate": 
1.6145398337843654e-06, "loss": 1.4481, "num_input_tokens_seen": 1638400, "step": 200 }, { "epoch": 2.5316455696202533, "eval_accuracy": 0.14084507042253522, "eval_loss": 1.7617015838623047, "eval_runtime": 0.8181, "eval_samples_per_second": 86.785, "eval_steps_per_second": 11.001, "num_input_tokens_seen": 1638400, "step": 200 }, { "epoch": 2.5949367088607596, "grad_norm": 16.260913848876953, "learning_rate": 1.2608325749073591e-06, "loss": 1.529, "num_input_tokens_seen": 1679360, "step": 205 }, { "epoch": 2.6582278481012658, "grad_norm": 12.57551383972168, "learning_rate": 9.482278464216121e-07, "loss": 1.552, "num_input_tokens_seen": 1720320, "step": 210 }, { "epoch": 2.721518987341772, "grad_norm": 13.254227638244629, "learning_rate": 6.781959079900958e-07, "loss": 1.4281, "num_input_tokens_seen": 1761280, "step": 215 }, { "epoch": 2.7848101265822782, "grad_norm": 10.004585266113281, "learning_rate": 4.520067886002488e-07, "loss": 1.439, "num_input_tokens_seen": 1802240, "step": 220 }, { "epoch": 2.848101265822785, "grad_norm": 19.180036544799805, "learning_rate": 2.707243132941717e-07, "loss": 1.5696, "num_input_tokens_seen": 1843200, "step": 225 }, { "epoch": 2.911392405063291, "grad_norm": 13.415371894836426, "learning_rate": 1.3520109972846918e-07, "loss": 1.502, "num_input_tokens_seen": 1884160, "step": 230 }, { "epoch": 2.9746835443037973, "grad_norm": 22.85205841064453, "learning_rate": 4.6074548096244346e-08, "loss": 1.5584, "num_input_tokens_seen": 1925120, "step": 235 }, { "epoch": 3.0, "num_input_tokens_seen": 1941504, "step": 237, "total_flos": 9386162506432512.0, "train_loss": 1.7149980520900292, "train_runtime": 273.4678, "train_samples_per_second": 6.922, "train_steps_per_second": 0.867 } ], "logging_steps": 5, "max_steps": 237, "num_input_tokens_seen": 1941504, "num_train_epochs": 3, "save_steps": 50, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": 
true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 9386162506432512.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }