{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 7.0, "eval_steps": 500, "global_step": 231, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.15151515151515152, "grad_norm": 7.071829850114283, "learning_rate": 6.666666666666667e-06, "loss": 0.4885, "loss_nan_ranks": 0, "loss_rank_avg": 0.15010309219360352, "step": 5, "valid_targets_mean": 8453.9, "valid_targets_min": 1970 }, { "epoch": 0.30303030303030304, "grad_norm": 1.9804373529490895, "learning_rate": 1.5000000000000002e-05, "loss": 0.4116, "loss_nan_ranks": 0, "loss_rank_avg": 0.11914314329624176, "step": 10, "valid_targets_mean": 8011.8, "valid_targets_min": 1960 }, { "epoch": 0.45454545454545453, "grad_norm": 0.6412351971777132, "learning_rate": 2.3333333333333336e-05, "loss": 0.3555, "loss_nan_ranks": 0, "loss_rank_avg": 0.10543923079967499, "step": 15, "valid_targets_mean": 8846.7, "valid_targets_min": 4611 }, { "epoch": 0.6060606060606061, "grad_norm": 0.5392085970382771, "learning_rate": 3.1666666666666666e-05, "loss": 0.3169, "loss_nan_ranks": 0, "loss_rank_avg": 0.10168177634477615, "step": 20, "valid_targets_mean": 9232.6, "valid_targets_min": 3741 }, { "epoch": 0.7575757575757576, "grad_norm": 0.3582208500451154, "learning_rate": 4e-05, "loss": 0.2743, "loss_nan_ranks": 0, "loss_rank_avg": 0.09165775775909424, "step": 25, "valid_targets_mean": 8923.8, "valid_targets_min": 3026 }, { "epoch": 0.9090909090909091, "grad_norm": 0.26631192084809374, "learning_rate": 3.994244399375679e-05, "loss": 0.2516, "loss_nan_ranks": 0, "loss_rank_avg": 0.08204009383916855, "step": 30, "valid_targets_mean": 9076.3, "valid_targets_min": 4099 }, { "epoch": 1.0606060606060606, "grad_norm": 0.22624905586056185, "learning_rate": 3.977010724441261e-05, "loss": 0.2285, "loss_nan_ranks": 0, "loss_rank_avg": 0.06856784224510193, "step": 35, "valid_targets_mean": 7689.8, "valid_targets_min": 3519 }, { "epoch": 1.2121212121212122, "grad_norm": 0.19597958600025933, "learning_rate": 3.9483981653469586e-05, "loss": 0.2195, "loss_nan_ranks": 0, "loss_rank_avg": 0.06643202900886536, "step": 40, "valid_targets_mean": 8430.5, "valid_targets_min": 2426 }, { "epoch": 1.3636363636363638, "grad_norm": 0.18547010915072298, "learning_rate": 3.908571404555758e-05, "loss": 0.2076, "loss_nan_ranks": 0, "loss_rank_avg": 0.06439946591854095, "step": 45, "valid_targets_mean": 9294.1, "valid_targets_min": 1312 }, { "epoch": 1.5151515151515151, "grad_norm": 0.17502242360460948, "learning_rate": 3.8577596689969346e-05, "loss": 0.194, "loss_nan_ranks": 0, "loss_rank_avg": 0.06616371870040894, "step": 50, "valid_targets_mean": 8604.2, "valid_targets_min": 2696 }, { "epoch": 1.6666666666666665, "grad_norm": 0.15763655671676294, "learning_rate": 3.7962554107273926e-05, "loss": 0.1975, "loss_nan_ranks": 0, "loss_rank_avg": 0.05964214354753494, "step": 55, "valid_targets_mean": 8201.2, "valid_targets_min": 2157 }, { "epoch": 1.8181818181818183, "grad_norm": 0.15305317018363324, "learning_rate": 3.724412623694427e-05, "loss": 0.1887, "loss_nan_ranks": 0, "loss_rank_avg": 0.05983173847198486, "step": 60, "valid_targets_mean": 8784.1, "valid_targets_min": 2568 }, { "epoch": 1.9696969696969697, "grad_norm": 0.15685613419616962, "learning_rate": 3.642644806287938e-05, "loss": 0.1842, "loss_nan_ranks": 0, "loss_rank_avg": 0.06401412934064865, "step": 65, "valid_targets_mean": 9732.0, "valid_targets_min": 3691 }, { "epoch": 2.121212121212121, "grad_norm": 0.1710095040672745, "learning_rate": 3.55142258140884e-05, "loss": 0.1774, "loss_nan_ranks": 0, "loss_rank_avg": 0.05440504476428032, "step": 70, "valid_targets_mean": 8249.6, "valid_targets_min": 3584 }, { "epoch": 2.2727272727272725, "grad_norm": 0.1775001328477045, "learning_rate": 3.451270987751598e-05, "loss": 0.1751, "loss_nan_ranks": 0, "loss_rank_avg": 0.05906001478433609, "step": 75, "valid_targets_mean": 8529.4, "valid_targets_min": 3214 }, { "epoch": 2.4242424242424243, "grad_norm": 0.15374353835337343, "learning_rate": 3.342766457891194e-05, "loss": 0.1732, "loss_nan_ranks": 0, "loss_rank_avg": 0.06226389855146408, "step": 80, "valid_targets_mean": 9319.1, "valid_targets_min": 3163 }, { "epoch": 2.5757575757575757, "grad_norm": 0.16344244686978898, "learning_rate": 3.226533500567433e-05, "loss": 0.1768, "loss_nan_ranks": 0, "loss_rank_avg": 0.06373937427997589, "step": 85, "valid_targets_mean": 8842.0, "valid_targets_min": 4024 }, { "epoch": 2.7272727272727275, "grad_norm": 0.1590870643550257, "learning_rate": 3.1032411062620544e-05, "loss": 0.1675, "loss_nan_ranks": 0, "loss_rank_avg": 0.05495002493262291, "step": 90, "valid_targets_mean": 9056.3, "valid_targets_min": 2998 }, { "epoch": 2.878787878787879, "grad_norm": 0.1572481613873198, "learning_rate": 2.973598896756697e-05, "loss": 0.1658, "loss_nan_ranks": 0, "loss_rank_avg": 0.05916336178779602, "step": 95, "valid_targets_mean": 9378.6, "valid_targets_min": 4128 }, { "epoch": 3.0303030303030303, "grad_norm": 0.16522830538286623, "learning_rate": 2.8383530408333285e-05, "loss": 0.1705, "loss_nan_ranks": 0, "loss_rank_avg": 0.06000653654336929, "step": 100, "valid_targets_mean": 9399.1, "valid_targets_min": 4429 }, { "epoch": 3.1818181818181817, "grad_norm": 0.15664808473003858, "learning_rate": 2.6982819596247373e-05, "loss": 0.1662, "loss_nan_ranks": 0, "loss_rank_avg": 0.051601044833660126, "step": 105, "valid_targets_mean": 8927.5, "valid_targets_min": 3117 }, { "epoch": 3.3333333333333335, "grad_norm": 0.15996355844466018, "learning_rate": 2.554191846333378e-05, "loss": 0.1626, "loss_nan_ranks": 0, "loss_rank_avg": 0.054302021861076355, "step": 110, "valid_targets_mean": 8787.9, "valid_targets_min": 3691 }, { "epoch": 3.484848484848485, "grad_norm": 0.15935216780397668, "learning_rate": 2.4069120261052682e-05, "loss": 0.1571, "loss_nan_ranks": 0, "loss_rank_avg": 0.047706879675388336, "step": 115, "valid_targets_mean": 7594.4, "valid_targets_min": 1960 }, { "epoch": 3.6363636363636362, "grad_norm": 0.18279188836268426, "learning_rate": 2.2572901827656626e-05, "loss": 0.1568, "loss_nan_ranks": 0, "loss_rank_avg": 0.05731729045510292, "step": 120, "valid_targets_mean": 8903.4, "valid_targets_min": 1670 }, { "epoch": 3.787878787878788, "grad_norm": 0.16897393812957626, "learning_rate": 2.1061874798894992e-05, "loss": 0.1595, "loss_nan_ranks": 0, "loss_rank_avg": 0.05671117454767227, "step": 125, "valid_targets_mean": 8410.8, "valid_targets_min": 3052 }, { "epoch": 3.9393939393939394, "grad_norm": 0.16560200225340949, "learning_rate": 1.9544736042877886e-05, "loss": 0.1608, "loss_nan_ranks": 0, "loss_rank_avg": 0.0500858873128891, "step": 130, "valid_targets_mean": 8883.9, "valid_targets_min": 2364 }, { "epoch": 4.090909090909091, "grad_norm": 0.17765486125420513, "learning_rate": 1.8030217604376628e-05, "loss": 0.1568, "loss_nan_ranks": 0, "loss_rank_avg": 0.05107717961072922, "step": 135, "valid_targets_mean": 7918.4, "valid_targets_min": 2880 }, { "epoch": 4.242424242424242, "grad_norm": 0.16121021939384844, "learning_rate": 1.6527036446661396e-05, "loss": 0.1572, "loss_nan_ranks": 0, "loss_rank_avg": 0.056909848004579544, "step": 140, "valid_targets_mean": 8202.9, "valid_targets_min": 2989 }, { "epoch": 4.393939393939394, "grad_norm": 0.15780609040060248, "learning_rate": 1.5043844280142005e-05, "loss": 0.1516, "loss_nan_ranks": 0, "loss_rank_avg": 0.05371875315904617, "step": 145, "valid_targets_mean": 9110.0, "valid_targets_min": 2450 }, { "epoch": 4.545454545454545, "grad_norm": 0.16910637578622967, "learning_rate": 1.358917776657806e-05, "loss": 0.1537, "loss_nan_ranks": 0, "loss_rank_avg": 0.0523504763841629, "step": 150, "valid_targets_mean": 8569.7, "valid_targets_min": 3473 }, { "epoch": 4.696969696969697, "grad_norm": 0.15315713301670586, "learning_rate": 1.2171409385463218e-05, "loss": 0.1539, "loss_nan_ranks": 0, "loss_rank_avg": 0.051369499415159225, "step": 155, "valid_targets_mean": 9342.3, "valid_targets_min": 4669 }, { "epoch": 4.848484848484849, "grad_norm": 0.15519091165573462, "learning_rate": 1.0798699245376959e-05, "loss": 0.1537, "loss_nan_ranks": 0, "loss_rank_avg": 0.05453380197286606, "step": 160, "valid_targets_mean": 8980.6, "valid_targets_min": 2311 }, { "epoch": 5.0, "grad_norm": 0.15338125297734576, "learning_rate": 9.478948117658577e-06, "loss": 0.152, "loss_nan_ranks": 0, "loss_rank_avg": 0.048132434487342834, "step": 165, "valid_targets_mean": 9222.9, "valid_targets_min": 3021 }, { "epoch": 5.151515151515151, "grad_norm": 0.15963497291261625, "learning_rate": 8.219751962722726e-06, "loss": 0.1513, "loss_nan_ranks": 0, "loss_rank_avg": 0.05217127129435539, "step": 170, "valid_targets_mean": 8096.7, "valid_targets_min": 1663 }, { "epoch": 5.303030303030303, "grad_norm": 0.1701013399219855, "learning_rate": 7.028358210744881e-06, "loss": 0.1483, "loss_nan_ranks": 0, "loss_rank_avg": 0.04974319785833359, "step": 175, "valid_targets_mean": 8601.6, "valid_targets_min": 3238 }, { "epoch": 5.454545454545454, "grad_norm": 0.15824877992240952, "learning_rate": 5.911624048347757e-06, "loss": 0.1515, "loss_nan_ranks": 0, "loss_rank_avg": 0.054281268268823624, "step": 180, "valid_targets_mean": 9436.0, "valid_targets_min": 3369 }, { "epoch": 5.606060606060606, "grad_norm": 0.16388221222616883, "learning_rate": 4.875976951373633e-06, "loss": 0.1498, "loss_nan_ranks": 0, "loss_rank_avg": 0.04890450835227966, "step": 185, "valid_targets_mean": 8094.2, "valid_targets_min": 2998 }, { "epoch": 5.757575757575758, "grad_norm": 0.15237739649000462, "learning_rate": 3.927377690900436e-06, "loss": 0.1468, "loss_nan_ranks": 0, "loss_rank_avg": 0.04883284121751785, "step": 190, "valid_targets_mean": 9055.8, "valid_targets_min": 4367 }, { "epoch": 5.909090909090909, "grad_norm": 0.1678247264659432, "learning_rate": 3.071286025423983e-06, "loss": 0.1513, "loss_nan_ranks": 0, "loss_rank_avg": 0.050182901322841644, "step": 195, "valid_targets_mean": 8657.7, "valid_targets_min": 2063 }, { "epoch": 6.0606060606060606, "grad_norm": 0.1550783342570967, "learning_rate": 2.312629276668554e-06, "loss": 0.1532, "loss_nan_ranks": 0, "loss_rank_avg": 0.047827623784542084, "step": 200, "valid_targets_mean": 8015.0, "valid_targets_min": 2224 }, { "epoch": 6.212121212121212, "grad_norm": 0.1551034370223493, "learning_rate": 1.6557739698909436e-06, "loss": 0.1483, "loss_nan_ranks": 0, "loss_rank_avg": 0.04756705462932587, "step": 205, "valid_targets_mean": 8738.5, "valid_targets_min": 973 }, { "epoch": 6.363636363636363, "grad_norm": 0.16985454441654288, "learning_rate": 1.1045007019049182e-06, "loss": 0.1478, "loss_nan_ranks": 0, "loss_rank_avg": 0.050542425364255905, "step": 210, "valid_targets_mean": 8397.1, "valid_targets_min": 3736 }, { "epoch": 6.515151515151516, "grad_norm": 0.17287398062167753, "learning_rate": 6.619823814758786e-07, "loss": 0.1519, "loss_nan_ranks": 0, "loss_rank_avg": 0.049958501011133194, "step": 215, "valid_targets_mean": 9084.4, "valid_targets_min": 1312 }, { "epoch": 6.666666666666667, "grad_norm": 0.15268537058766385, "learning_rate": 3.307659673251595e-07, "loss": 0.1489, "loss_nan_ranks": 0, "loss_rank_avg": 0.044708285480737686, "step": 220, "valid_targets_mean": 8133.6, "valid_targets_min": 1325 }, { "epoch": 6.818181818181818, "grad_norm": 0.17158051378559555, "learning_rate": 1.1275780885282806e-07, "loss": 0.145, "loss_nan_ranks": 0, "loss_rank_avg": 0.047260358929634094, "step": 225, "valid_targets_mean": 9015.6, "valid_targets_min": 3238 }, { "epoch": 6.96969696969697, "grad_norm": 0.1521451879785229, "learning_rate": 9.212673951897177e-09, "loss": 0.1523, "loss_nan_ranks": 0, "loss_rank_avg": 0.04904649779200554, "step": 230, "valid_targets_mean": 8038.4, "valid_targets_min": 1670 }, { "epoch": 7.0, "step": 231, "total_flos": 1.7248182061550797e+18, "train_loss": 0.0, "train_runtime": 6.073, "train_samples_per_second": 3642.329, "train_steps_per_second": 38.037 } ], "logging_steps": 5, "max_steps": 231, "num_input_tokens_seen": 0, "num_train_epochs": 7, "save_steps": 300, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.7248182061550797e+18, "train_batch_size": 1, "trial_name": null, "trial_params": null }