{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.9411764705882355, "eval_steps": 20, "global_step": 111, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.05378151260504202, "grad_norm": 0.012224463745951653, "learning_rate": 0.001981818181818182, "loss": 2.4733, "step": 2 }, { "epoch": 0.10756302521008404, "grad_norm": 0.04710804298520088, "learning_rate": 0.0019454545454545456, "loss": 2.4443, "step": 4 }, { "epoch": 0.16134453781512606, "grad_norm": 0.09238269925117493, "learning_rate": 0.0019090909090909091, "loss": 2.3956, "step": 6 }, { "epoch": 0.21512605042016808, "grad_norm": 0.17334744334220886, "learning_rate": 0.0018727272727272729, "loss": 2.4043, "step": 8 }, { "epoch": 0.2689075630252101, "grad_norm": 0.11766365170478821, "learning_rate": 0.0018363636363636364, "loss": 2.385, "step": 10 }, { "epoch": 0.3226890756302521, "grad_norm": 0.1385774314403534, "learning_rate": 0.0018000000000000002, "loss": 2.3933, "step": 12 }, { "epoch": 0.3764705882352941, "grad_norm": 0.1210022047162056, "learning_rate": 0.0017636363636363637, "loss": 2.3733, "step": 14 }, { "epoch": 0.43025210084033616, "grad_norm": 0.11510306596755981, "learning_rate": 0.0017272727272727272, "loss": 2.3297, "step": 16 }, { "epoch": 0.48403361344537815, "grad_norm": 0.08369912207126617, "learning_rate": 0.001690909090909091, "loss": 2.351, "step": 18 }, { "epoch": 0.5378151260504201, "grad_norm": 0.09298688918352127, "learning_rate": 0.0016545454545454545, "loss": 2.3175, "step": 20 }, { "epoch": 0.5378151260504201, "eval_loss": 2.3251912593841553, "eval_runtime": 84.2914, "eval_samples_per_second": 14.118, "eval_steps_per_second": 1.768, "step": 20 }, { "epoch": 0.5915966386554622, "grad_norm": 0.10441266000270844, "learning_rate": 0.0016181818181818183, "loss": 2.3643, "step": 22 }, { "epoch": 0.6453781512605042, "grad_norm": 0.09343012422323227, "learning_rate": 0.0015818181818181818, "loss": 2.3391, "step": 24 }, { "epoch": 0.6991596638655462, "grad_norm": 0.09008985757827759, "learning_rate": 0.0015454545454545454, "loss": 2.2984, "step": 26 }, { "epoch": 0.7529411764705882, "grad_norm": 0.08069847524166107, "learning_rate": 0.0015090909090909091, "loss": 2.3202, "step": 28 }, { "epoch": 0.8067226890756303, "grad_norm": 0.08655106276273727, "learning_rate": 0.0014727272727272727, "loss": 2.3438, "step": 30 }, { "epoch": 0.8605042016806723, "grad_norm": 0.08203998953104019, "learning_rate": 0.0014363636363636362, "loss": 2.2862, "step": 32 }, { "epoch": 0.9142857142857143, "grad_norm": 0.10055471211671829, "learning_rate": 0.0014, "loss": 2.3448, "step": 34 }, { "epoch": 0.9680672268907563, "grad_norm": 0.08500978350639343, "learning_rate": 0.0013636363636363635, "loss": 2.3221, "step": 36 }, { "epoch": 1.0, "grad_norm": 0.16502036154270172, "learning_rate": 0.0013272727272727275, "loss": 2.3438, "step": 38 }, { "epoch": 1.053781512605042, "grad_norm": 0.08134379237890244, "learning_rate": 0.001290909090909091, "loss": 2.3075, "step": 40 }, { "epoch": 1.053781512605042, "eval_loss": 2.299827814102173, "eval_runtime": 84.1245, "eval_samples_per_second": 14.146, "eval_steps_per_second": 1.771, "step": 40 }, { "epoch": 1.107563025210084, "grad_norm": 0.09189953655004501, "learning_rate": 0.0012545454545454546, "loss": 2.2457, "step": 42 }, { "epoch": 1.1613445378151261, "grad_norm": 0.09041959792375565, "learning_rate": 0.0012181818181818183, "loss": 2.2977, "step": 44 }, { "epoch": 1.2151260504201682, "grad_norm": 0.08456366509199142, "learning_rate": 0.0011818181818181819, "loss": 2.2843, "step": 46 }, { "epoch": 1.26890756302521, "grad_norm": 0.08097781240940094, "learning_rate": 0.0011454545454545454, "loss": 2.2328, "step": 48 }, { "epoch": 1.322689075630252, "grad_norm": 0.10243827849626541, "learning_rate": 0.0011090909090909092, "loss": 2.254, "step": 50 }, { "epoch": 1.3764705882352941, "grad_norm": 0.09242815524339676, "learning_rate": 0.0010727272727272727, "loss": 2.3295, "step": 52 }, { "epoch": 1.4302521008403362, "grad_norm": 0.09403648227453232, "learning_rate": 0.0010363636363636365, "loss": 2.2749, "step": 54 }, { "epoch": 1.4840336134453782, "grad_norm": 0.09187959879636765, "learning_rate": 0.001, "loss": 2.2606, "step": 56 }, { "epoch": 1.53781512605042, "grad_norm": 0.09116198122501373, "learning_rate": 0.0009636363636363637, "loss": 2.2676, "step": 58 }, { "epoch": 1.5915966386554623, "grad_norm": 0.08270075172185898, "learning_rate": 0.0009272727272727273, "loss": 2.2741, "step": 60 }, { "epoch": 1.5915966386554623, "eval_loss": 2.3082737922668457, "eval_runtime": 84.2709, "eval_samples_per_second": 14.121, "eval_steps_per_second": 1.768, "step": 60 }, { "epoch": 1.6453781512605041, "grad_norm": 0.09275200217962265, "learning_rate": 0.0008909090909090909, "loss": 2.2582, "step": 62 }, { "epoch": 1.6991596638655462, "grad_norm": 0.09241969138383865, "learning_rate": 0.0008545454545454545, "loss": 2.2554, "step": 64 }, { "epoch": 1.7529411764705882, "grad_norm": 0.08338718116283417, "learning_rate": 0.0008181818181818183, "loss": 2.2244, "step": 66 }, { "epoch": 1.8067226890756303, "grad_norm": 0.09568168222904205, "learning_rate": 0.0007818181818181819, "loss": 2.2719, "step": 68 }, { "epoch": 1.8605042016806723, "grad_norm": 0.0905410498380661, "learning_rate": 0.0007454545454545455, "loss": 2.2505, "step": 70 }, { "epoch": 1.9142857142857141, "grad_norm": 0.08841802924871445, "learning_rate": 0.0007090909090909091, "loss": 2.3005, "step": 72 }, { "epoch": 1.9680672268907564, "grad_norm": 0.09013470262289047, "learning_rate": 0.0006727272727272728, "loss": 2.2682, "step": 74 }, { "epoch": 2.0, "grad_norm": 0.19737772643566132, "learning_rate": 0.0006363636363636364, "loss": 2.3476, "step": 76 }, { "epoch": 2.053781512605042, "grad_norm": 0.0839110016822815, "learning_rate": 0.0006, "loss": 2.2338, "step": 78 }, { "epoch": 2.107563025210084, "grad_norm": 0.10582801699638367, "learning_rate": 0.0005636363636363636, "loss": 2.2257, "step": 80 }, { "epoch": 2.107563025210084, "eval_loss": 2.304074764251709, "eval_runtime": 84.3347, "eval_samples_per_second": 14.11, "eval_steps_per_second": 1.767, "step": 80 }, { "epoch": 2.161344537815126, "grad_norm": 0.09145358949899673, "learning_rate": 0.0005272727272727272, "loss": 2.2488, "step": 82 }, { "epoch": 2.215126050420168, "grad_norm": 0.08459240943193436, "learning_rate": 0.0004909090909090909, "loss": 2.2518, "step": 84 }, { "epoch": 2.26890756302521, "grad_norm": 0.09590018540620804, "learning_rate": 0.00045454545454545455, "loss": 2.2324, "step": 86 }, { "epoch": 2.3226890756302523, "grad_norm": 0.10032965242862701, "learning_rate": 0.00041818181818181814, "loss": 2.2099, "step": 88 }, { "epoch": 2.376470588235294, "grad_norm": 0.09092257171869278, "learning_rate": 0.00038181818181818184, "loss": 2.2077, "step": 90 }, { "epoch": 2.4302521008403364, "grad_norm": 0.10066290944814682, "learning_rate": 0.00034545454545454544, "loss": 2.2629, "step": 92 }, { "epoch": 2.484033613445378, "grad_norm": 0.0973694771528244, "learning_rate": 0.0003090909090909091, "loss": 2.2292, "step": 94 }, { "epoch": 2.53781512605042, "grad_norm": 0.09254106879234314, "learning_rate": 0.00027272727272727274, "loss": 2.1923, "step": 96 }, { "epoch": 2.5915966386554623, "grad_norm": 0.10056042671203613, "learning_rate": 0.00023636363636363636, "loss": 2.2445, "step": 98 }, { "epoch": 2.645378151260504, "grad_norm": 0.09601625055074692, "learning_rate": 0.0002, "loss": 2.2605, "step": 100 }, { "epoch": 2.645378151260504, "eval_loss": 2.3115394115448, "eval_runtime": 84.2807, "eval_samples_per_second": 14.119, "eval_steps_per_second": 1.768, "step": 100 }, { "epoch": 2.6991596638655464, "grad_norm": 0.09498832374811172, "learning_rate": 0.00016363636363636363, "loss": 2.215, "step": 102 }, { "epoch": 2.7529411764705882, "grad_norm": 0.09191343188285828, "learning_rate": 0.00012727272727272725, "loss": 2.2116, "step": 104 }, { "epoch": 2.80672268907563, "grad_norm": 0.10717286169528961, "learning_rate": 9.090909090909092e-05, "loss": 2.2435, "step": 106 }, { "epoch": 2.8605042016806723, "grad_norm": 0.09715902805328369, "learning_rate": 5.4545454545454546e-05, "loss": 2.2196, "step": 108 }, { "epoch": 2.914285714285714, "grad_norm": 0.10500436276197433, "learning_rate": 1.8181818181818182e-05, "loss": 2.2351, "step": 110 }, { "epoch": 2.9411764705882355, "step": 111, "total_flos": 8.1776874848256e+17, "train_loss": 2.2894913076280474, "train_runtime": 2825.7191, "train_samples_per_second": 10.107, "train_steps_per_second": 0.039 }, { "epoch": 2.9411764705882355, "eval_loss": 2.311664342880249, "eval_runtime": 84.4111, "eval_samples_per_second": 14.098, "eval_steps_per_second": 1.765, "step": 111 }, { "epoch": 2.9411764705882355, "eval_loss": 2.3323311805725098, "eval_runtime": 84.1513, "eval_samples_per_second": 14.141, "eval_steps_per_second": 1.771, "step": 111 } ], "logging_steps": 2, "max_steps": 111, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 8.1776874848256e+17, "train_batch_size": 16, "trial_name": null, "trial_params": null }