{ "best_global_step": 44, "best_metric": 0.061493076384067535, "best_model_checkpoint": "rewrite_results/checkpoint-42", "epoch": 4.0, "eval_steps": 2, "global_step": 56, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.14814814814814814, "eval_accuracy": 0.9610916971862766, "eval_loss": 0.24771852791309357, "eval_runtime": 332.7057, "eval_samples_per_second": 0.481, "eval_steps_per_second": 0.06, "step": 2 }, { "epoch": 0.2222222222222222, "grad_norm": 0.10552783565151713, "learning_rate": 0.00027304247779439414, "loss": 0.2441, "step": 3 }, { "epoch": 0.2962962962962963, "eval_accuracy": 0.9648183236107929, "eval_loss": 0.1996598243713379, "eval_runtime": 327.4272, "eval_samples_per_second": 0.489, "eval_steps_per_second": 0.061, "step": 4 }, { "epoch": 0.4444444444444444, "grad_norm": 0.08840417088618377, "learning_rate": 0.0004, "loss": 0.2367, "step": 6 }, { "epoch": 0.4444444444444444, "eval_accuracy": 0.9687810581276459, "eval_loss": 0.15869076550006866, "eval_runtime": 326.6521, "eval_samples_per_second": 0.49, "eval_steps_per_second": 0.061, "step": 6 }, { "epoch": 0.5925925925925926, "eval_accuracy": 0.9710204154483826, "eval_loss": 0.14757852256298065, "eval_runtime": 330.7473, "eval_samples_per_second": 0.484, "eval_steps_per_second": 0.06, "step": 8 }, { "epoch": 0.6666666666666666, "grad_norm": 0.04166015387300161, "learning_rate": 0.00037647058823529414, "loss": 0.1895, "step": 9 }, { "epoch": 0.7407407407407407, "eval_accuracy": 0.9732184664675948, "eval_loss": 0.13183145225048065, "eval_runtime": 325.6161, "eval_samples_per_second": 0.491, "eval_steps_per_second": 0.061, "step": 10 }, { "epoch": 0.8888888888888888, "grad_norm": 0.026815989165312845, "learning_rate": 0.00035294117647058826, "loss": 0.1361, "step": 12 }, { "epoch": 0.8888888888888888, "eval_accuracy": 0.9759430638638582, "eval_loss": 0.11719267070293427, "eval_runtime": 323.9433, "eval_samples_per_second": 0.494, "eval_steps_per_second": 0.062, "step": 12 }, { "epoch": 1.0, "eval_accuracy": 0.9783447362046809, "eval_loss": 0.1052589863538742, "eval_runtime": 320.7346, "eval_samples_per_second": 0.499, "eval_steps_per_second": 0.062, "step": 14 }, { "epoch": 1.074074074074074, "grad_norm": 0.027111063904139955, "learning_rate": 0.0003294117647058824, "loss": 0.18, "step": 15 }, { "epoch": 1.1481481481481481, "eval_accuracy": 0.9792185262272917, "eval_loss": 0.09851272404193878, "eval_runtime": 329.3754, "eval_samples_per_second": 0.486, "eval_steps_per_second": 0.061, "step": 16 }, { "epoch": 1.2962962962962963, "grad_norm": 0.02728080298065729, "learning_rate": 0.00030588235294117644, "loss": 0.1193, "step": 18 }, { "epoch": 1.2962962962962963, "eval_accuracy": 0.9797502878582046, "eval_loss": 0.09319568425416946, "eval_runtime": 327.3319, "eval_samples_per_second": 0.489, "eval_steps_per_second": 0.061, "step": 18 }, { "epoch": 1.4444444444444444, "eval_accuracy": 0.9803937999427419, "eval_loss": 0.08752243220806122, "eval_runtime": 325.6299, "eval_samples_per_second": 0.491, "eval_steps_per_second": 0.061, "step": 20 }, { "epoch": 1.5185185185185186, "grad_norm": 0.024562359210379074, "learning_rate": 0.0002823529411764706, "loss": 0.0823, "step": 21 }, { "epoch": 1.5925925925925926, "eval_accuracy": 0.980617014762694, "eval_loss": 0.08400166034698486, "eval_runtime": 330.0542, "eval_samples_per_second": 0.485, "eval_steps_per_second": 0.061, "step": 22 }, { "epoch": 1.7407407407407407, "grad_norm": 0.04720330319206125, "learning_rate": 0.00025882352941176474, "loss": 0.1175, "step": 24 }, { "epoch": 1.7407407407407407, "eval_accuracy": 0.9813755814844891, "eval_loss": 0.07777883112430573, "eval_runtime": 330.5389, "eval_samples_per_second": 0.484, "eval_steps_per_second": 0.061, "step": 24 }, { "epoch": 1.8888888888888888, "eval_accuracy": 0.9826866606161857, "eval_loss": 0.07366437464952469, "eval_runtime": 330.0141, "eval_samples_per_second": 0.485, "eval_steps_per_second": 0.061, "step": 26 }, { "epoch": 1.9629629629629628, "grad_norm": 0.05091774794908078, "learning_rate": 0.00023529411764705883, "loss": 0.0898, "step": 27 }, { "epoch": 2.0, "eval_accuracy": 0.9836173653422126, "eval_loss": 0.07038821280002594, "eval_runtime": 330.8638, "eval_samples_per_second": 0.484, "eval_steps_per_second": 0.06, "step": 28 }, { "epoch": 2.148148148148148, "grad_norm": 0.02332212687636519, "learning_rate": 0.00021176470588235295, "loss": 0.0948, "step": 30 }, { "epoch": 2.148148148148148, "eval_accuracy": 0.9838456743046095, "eval_loss": 0.06887274235486984, "eval_runtime": 334.1801, "eval_samples_per_second": 0.479, "eval_steps_per_second": 0.06, "step": 30 }, { "epoch": 2.2962962962962963, "eval_accuracy": 0.9841358041079982, "eval_loss": 0.06697963178157806, "eval_runtime": 327.8238, "eval_samples_per_second": 0.488, "eval_steps_per_second": 0.061, "step": 32 }, { "epoch": 2.3703703703703702, "grad_norm": 0.023466685722830175, "learning_rate": 0.00018823529411764707, "loss": 0.0739, "step": 33 }, { "epoch": 2.4444444444444446, "eval_accuracy": 0.9841183559501172, "eval_loss": 0.06529785692691803, "eval_runtime": 324.9504, "eval_samples_per_second": 0.492, "eval_steps_per_second": 0.062, "step": 34 }, { "epoch": 2.5925925925925926, "grad_norm": 0.02499598734491485, "learning_rate": 0.0001647058823529412, "loss": 0.0526, "step": 36 }, { "epoch": 2.5925925925925926, "eval_accuracy": 0.984462930071049, "eval_loss": 0.06430461257696152, "eval_runtime": 328.33, "eval_samples_per_second": 0.487, "eval_steps_per_second": 0.061, "step": 36 }, { "epoch": 2.7407407407407405, "eval_accuracy": 0.9845490069370237, "eval_loss": 0.06337571144104004, "eval_runtime": 323.0777, "eval_samples_per_second": 0.495, "eval_steps_per_second": 0.062, "step": 38 }, { "epoch": 2.814814814814815, "grad_norm": 0.02235923851811797, "learning_rate": 0.0001411764705882353, "loss": 0.0564, "step": 39 }, { "epoch": 2.888888888888889, "eval_accuracy": 0.9847284644961529, "eval_loss": 0.062485575675964355, "eval_runtime": 328.1316, "eval_samples_per_second": 0.488, "eval_steps_per_second": 0.061, "step": 40 }, { "epoch": 3.0, "grad_norm": 0.03433345990626862, "learning_rate": 0.00011764705882352942, "loss": 0.0678, "step": 42 }, { "epoch": 3.0, "eval_accuracy": 0.9849732649347395, "eval_loss": 0.061602283269166946, "eval_runtime": 301.0464, "eval_samples_per_second": 0.531, "eval_steps_per_second": 0.066, "step": 42 }, { "epoch": 3.148148148148148, "eval_accuracy": 0.9852119445846472, "eval_loss": 0.061493076384067535, "eval_runtime": 331.7193, "eval_samples_per_second": 0.482, "eval_steps_per_second": 0.06, "step": 44 }, { "epoch": 3.2222222222222223, "grad_norm": 0.022600132657406128, "learning_rate": 9.411764705882353e-05, "loss": 0.0499, "step": 45 }, { "epoch": 3.2962962962962963, "eval_accuracy": 0.9853399724949231, "eval_loss": 0.06156101077795029, "eval_runtime": 327.3955, "eval_samples_per_second": 0.489, "eval_steps_per_second": 0.061, "step": 46 }, { "epoch": 3.4444444444444446, "grad_norm": 0.018635515413178816, "learning_rate": 7.058823529411765e-05, "loss": 0.0437, "step": 48 }, { "epoch": 3.4444444444444446, "eval_accuracy": 0.9852882434474587, "eval_loss": 0.06198165938258171, "eval_runtime": 335.6061, "eval_samples_per_second": 0.477, "eval_steps_per_second": 0.06, "step": 48 }, { "epoch": 3.5925925925925926, "eval_accuracy": 0.9850508846328356, "eval_loss": 0.06207697466015816, "eval_runtime": 324.7443, "eval_samples_per_second": 0.493, "eval_steps_per_second": 0.062, "step": 50 }, { "epoch": 3.6666666666666665, "grad_norm": 0.021001664364668726, "learning_rate": 4.705882352941177e-05, "loss": 0.0421, "step": 51 }, { "epoch": 3.7407407407407405, "eval_accuracy": 0.9851825126851306, "eval_loss": 0.06235222890973091, "eval_runtime": 330.039, "eval_samples_per_second": 0.485, "eval_steps_per_second": 0.061, "step": 52 }, { "epoch": 3.888888888888889, "grad_norm": 0.02399059372934013, "learning_rate": 2.3529411764705884e-05, "loss": 0.0557, "step": 54 }, { "epoch": 3.888888888888889, "eval_accuracy": 0.985193215099636, "eval_loss": 0.06244580075144768, "eval_runtime": 328.4431, "eval_samples_per_second": 0.487, "eval_steps_per_second": 0.061, "step": 54 }, { "epoch": 4.0, "eval_accuracy": 0.9850785367286026, "eval_loss": 0.062325846403837204, "eval_runtime": 334.0117, "eval_samples_per_second": 0.479, "eval_steps_per_second": 0.06, "step": 56 }, { "epoch": 4.0, "step": 56, "total_flos": 83324244590592.0, "train_loss": 0.10440204206055828, "train_runtime": 17139.0998, "train_samples_per_second": 0.101, "train_steps_per_second": 0.003 } ], "logging_steps": 3, "max_steps": 56, "num_input_tokens_seen": 0, "num_train_epochs": 4, "save_steps": 6, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 83324244590592.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }