{ "best_global_step": 67000, "best_metric": 1.3910651206970215, "best_model_checkpoint": "/home/auguste/Desktop/eDNA/TeleoClassification/scripts/DNABert2/experiments/masking_training/outputs/masking_teleo/checkpoints/checkpoint-67000", "epoch": 108.06451612903226, "eval_steps": 1000, "global_step": 67000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0016129032258064516, "grad_norm": 107.56168365478516, "learning_rate": 2e-05, "loss": 7.9233, "step": 1 }, { "epoch": 1.6129032258064515, "grad_norm": 16.32627296447754, "learning_rate": 1.9785161290322584e-05, "loss": 3.0779, "step": 1000 }, { "epoch": 1.6129032258064515, "eval_loss": 2.738837718963623, "eval_model_preparation_time": 0.0012, "eval_runtime": 0.2955, "eval_samples_per_second": 1766.611, "eval_steps_per_second": 111.682, "step": 1000 }, { "epoch": 3.225806451612903, "grad_norm": 12.881124496459961, "learning_rate": 1.9570107526881724e-05, "loss": 2.506, "step": 2000 }, { "epoch": 3.225806451612903, "eval_loss": 2.4902684688568115, "eval_model_preparation_time": 0.0012, "eval_runtime": 0.2968, "eval_samples_per_second": 1758.892, "eval_steps_per_second": 111.194, "step": 2000 }, { "epoch": 4.838709677419355, "grad_norm": 12.914713859558105, "learning_rate": 1.935505376344086e-05, "loss": 2.734, "step": 3000 }, { "epoch": 4.838709677419355, "eval_loss": 2.305058479309082, "eval_model_preparation_time": 0.0012, "eval_runtime": 0.3006, "eval_samples_per_second": 1736.761, "eval_steps_per_second": 109.795, "step": 3000 }, { "epoch": 6.451612903225806, "grad_norm": 13.617836952209473, "learning_rate": 1.914e-05, "loss": 2.2267, "step": 4000 }, { "epoch": 6.451612903225806, "eval_loss": 2.3899621963500977, "eval_model_preparation_time": 0.0012, "eval_runtime": 0.3004, "eval_samples_per_second": 1737.628, "eval_steps_per_second": 109.85, "step": 4000 }, { "epoch": 8.064516129032258, "grad_norm": 11.493875503540039, "learning_rate": 1.892494623655914e-05, "loss": 2.1095, "step": 5000 }, { "epoch": 8.064516129032258, "eval_loss": 2.1791865825653076, "eval_model_preparation_time": 0.0012, "eval_runtime": 0.2998, "eval_samples_per_second": 1740.976, "eval_steps_per_second": 110.062, "step": 5000 }, { "epoch": 9.67741935483871, "grad_norm": 16.104379653930664, "learning_rate": 1.870989247311828e-05, "loss": 1.9622, "step": 6000 }, { "epoch": 9.67741935483871, "eval_loss": 2.0534751415252686, "eval_model_preparation_time": 0.0012, "eval_runtime": 0.3144, "eval_samples_per_second": 1660.314, "eval_steps_per_second": 104.962, "step": 6000 }, { "epoch": 11.290322580645162, "grad_norm": 15.933501243591309, "learning_rate": 1.8494838709677422e-05, "loss": 1.8713, "step": 7000 }, { "epoch": 11.290322580645162, "eval_loss": 2.1255111694335938, "eval_model_preparation_time": 0.0012, "eval_runtime": 0.31, "eval_samples_per_second": 1684.034, "eval_steps_per_second": 106.462, "step": 7000 }, { "epoch": 12.903225806451612, "grad_norm": 9.397466659545898, "learning_rate": 1.8279784946236562e-05, "loss": 1.7906, "step": 8000 }, { "epoch": 12.903225806451612, "eval_loss": 1.9397249221801758, "eval_model_preparation_time": 0.0012, "eval_runtime": 0.3026, "eval_samples_per_second": 1724.803, "eval_steps_per_second": 109.039, "step": 8000 }, { "epoch": 14.516129032258064, "grad_norm": 14.291478157043457, "learning_rate": 1.8064731182795702e-05, "loss": 1.7149, "step": 9000 }, { "epoch": 14.516129032258064, "eval_loss": 1.8910889625549316, "eval_model_preparation_time": 0.0012, "eval_runtime": 0.3013, "eval_samples_per_second": 1732.385, "eval_steps_per_second": 109.519, "step": 9000 }, { "epoch": 16.129032258064516, "grad_norm": 15.776030540466309, "learning_rate": 1.784967741935484e-05, "loss": 1.634, "step": 10000 }, { "epoch": 16.129032258064516, "eval_loss": 1.893878698348999, "eval_model_preparation_time": 0.0012, "eval_runtime": 0.3023, "eval_samples_per_second": 1726.506, "eval_steps_per_second": 109.147, "step": 10000 }, { "epoch": 17.741935483870968, "grad_norm": 12.53177547454834, "learning_rate": 1.763462365591398e-05, "loss": 1.5991, "step": 11000 }, { "epoch": 17.741935483870968, "eval_loss": 1.8701565265655518, "eval_model_preparation_time": 0.0012, "eval_runtime": 0.3035, "eval_samples_per_second": 1720.089, "eval_steps_per_second": 108.741, "step": 11000 }, { "epoch": 19.35483870967742, "grad_norm": 13.62909984588623, "learning_rate": 1.741956989247312e-05, "loss": 1.5008, "step": 12000 }, { "epoch": 19.35483870967742, "eval_loss": 1.7572582960128784, "eval_model_preparation_time": 0.0012, "eval_runtime": 0.3051, "eval_samples_per_second": 1710.701, "eval_steps_per_second": 108.148, "step": 12000 }, { "epoch": 20.967741935483872, "grad_norm": 13.886764526367188, "learning_rate": 1.720451612903226e-05, "loss": 1.4469, "step": 13000 }, { "epoch": 20.967741935483872, "eval_loss": 1.7456613779067993, "eval_model_preparation_time": 0.0012, "eval_runtime": 0.3048, "eval_samples_per_second": 1712.389, "eval_steps_per_second": 108.254, "step": 13000 }, { "epoch": 22.580645161290324, "grad_norm": 16.04749870300293, "learning_rate": 1.6989462365591397e-05, "loss": 1.404, "step": 14000 }, { "epoch": 22.580645161290324, "eval_loss": 1.7826714515686035, "eval_model_preparation_time": 0.0012, "eval_runtime": 0.3034, "eval_samples_per_second": 1720.509, "eval_steps_per_second": 108.768, "step": 14000 }, { "epoch": 24.193548387096776, "grad_norm": 14.932185173034668, "learning_rate": 1.6774408602150537e-05, "loss": 1.3552, "step": 15000 }, { "epoch": 24.193548387096776, "eval_loss": 1.7234201431274414, "eval_model_preparation_time": 0.0012, "eval_runtime": 0.3061, "eval_samples_per_second": 1705.173, "eval_steps_per_second": 107.798, "step": 15000 }, { "epoch": 25.806451612903224, "grad_norm": 8.178566932678223, "learning_rate": 1.6559354838709676e-05, "loss": 1.313, "step": 16000 }, { "epoch": 25.806451612903224, "eval_loss": 1.8201613426208496, "eval_model_preparation_time": 0.0012, "eval_runtime": 0.3194, "eval_samples_per_second": 1634.314, "eval_steps_per_second": 103.319, "step": 16000 }, { "epoch": 27.419354838709676, "grad_norm": 16.086894989013672, "learning_rate": 1.634430107526882e-05, "loss": 1.2751, "step": 17000 }, { "epoch": 27.419354838709676, "eval_loss": 1.6344752311706543, "eval_model_preparation_time": 0.0012, "eval_runtime": 0.3053, "eval_samples_per_second": 1709.792, "eval_steps_per_second": 108.09, "step": 17000 }, { "epoch": 29.032258064516128, "grad_norm": 9.854013442993164, "learning_rate": 1.612924731182796e-05, "loss": 1.2377, "step": 18000 }, { "epoch": 29.032258064516128, "eval_loss": 1.6381661891937256, "eval_model_preparation_time": 0.0012, "eval_runtime": 0.343, "eval_samples_per_second": 1521.702, "eval_steps_per_second": 96.2, "step": 18000 }, { "epoch": 30.64516129032258, "grad_norm": 13.270648956298828, "learning_rate": 1.59141935483871e-05, "loss": 1.1772, "step": 19000 }, { "epoch": 30.64516129032258, "eval_loss": 1.6601710319519043, "eval_model_preparation_time": 0.0012, "eval_runtime": 0.3037, "eval_samples_per_second": 1718.661, "eval_steps_per_second": 108.651, "step": 19000 }, { "epoch": 32.25806451612903, "grad_norm": 20.389537811279297, "learning_rate": 1.569913978494624e-05, "loss": 1.176, "step": 20000 }, { "epoch": 32.25806451612903, "eval_loss": 1.6632287502288818, "eval_model_preparation_time": 0.0012, "eval_runtime": 0.3045, "eval_samples_per_second": 1714.128, "eval_steps_per_second": 108.364, "step": 20000 }, { "epoch": 33.87096774193548, "grad_norm": 11.229137420654297, "learning_rate": 1.548408602150538e-05, "loss": 1.1184, "step": 21000 }, { "epoch": 33.87096774193548, "eval_loss": 1.7555991411209106, "eval_model_preparation_time": 0.0012, "eval_runtime": 0.3043, "eval_samples_per_second": 1715.561, "eval_steps_per_second": 108.455, "step": 21000 }, { "epoch": 35.483870967741936, "grad_norm": 10.823155403137207, "learning_rate": 1.5269032258064518e-05, "loss": 1.0793, "step": 22000 }, { "epoch": 35.483870967741936, "eval_loss": 1.6087384223937988, "eval_model_preparation_time": 0.0012, "eval_runtime": 0.3059, "eval_samples_per_second": 1706.414, "eval_steps_per_second": 107.877, "step": 22000 }, { "epoch": 37.096774193548384, "grad_norm": 6.54916524887085, "learning_rate": 1.5053978494623658e-05, "loss": 1.0632, "step": 23000 }, { "epoch": 37.096774193548384, "eval_loss": 1.6815119981765747, "eval_model_preparation_time": 0.0012, "eval_runtime": 0.3044, "eval_samples_per_second": 1714.683, "eval_steps_per_second": 108.399, "step": 23000 }, { "epoch": 38.70967741935484, "grad_norm": 14.550293922424316, "learning_rate": 1.4838924731182798e-05, "loss": 1.0185, "step": 24000 }, { "epoch": 38.70967741935484, "eval_loss": 1.6611889600753784, "eval_model_preparation_time": 0.0012, "eval_runtime": 0.3028, "eval_samples_per_second": 1724.12, "eval_steps_per_second": 108.996, "step": 24000 }, { "epoch": 40.32258064516129, "grad_norm": 14.825828552246094, "learning_rate": 1.4623870967741937e-05, "loss": 1.0148, "step": 25000 }, { "epoch": 40.32258064516129, "eval_loss": 1.5314302444458008, "eval_model_preparation_time": 0.0012, "eval_runtime": 0.3061, "eval_samples_per_second": 1705.416, "eval_steps_per_second": 107.814, "step": 25000 }, { "epoch": 41.935483870967744, "grad_norm": 15.808582305908203, "learning_rate": 1.4408817204301075e-05, "loss": 0.9492, "step": 26000 }, { "epoch": 41.935483870967744, "eval_loss": 1.717032790184021, "eval_model_preparation_time": 0.0012, "eval_runtime": 0.3131, "eval_samples_per_second": 1667.386, "eval_steps_per_second": 105.409, "step": 26000 }, { "epoch": 43.54838709677419, "grad_norm": 13.56778621673584, "learning_rate": 1.4193763440860215e-05, "loss": 0.9352, "step": 27000 }, { "epoch": 43.54838709677419, "eval_loss": 1.631635069847107, "eval_model_preparation_time": 0.0012, "eval_runtime": 0.3065, "eval_samples_per_second": 1703.186, "eval_steps_per_second": 107.673, "step": 27000 }, { "epoch": 45.16129032258065, "grad_norm": 14.375411987304688, "learning_rate": 1.3978709677419355e-05, "loss": 0.9287, "step": 28000 }, { "epoch": 45.16129032258065, "eval_loss": 1.643862247467041, "eval_model_preparation_time": 0.0012, "eval_runtime": 0.3075, "eval_samples_per_second": 1697.381, "eval_steps_per_second": 107.306, "step": 28000 }, { "epoch": 46.774193548387096, "grad_norm": 12.451338768005371, "learning_rate": 1.3763655913978495e-05, "loss": 0.9052, "step": 29000 }, { "epoch": 46.774193548387096, "eval_loss": 1.4976590871810913, "eval_model_preparation_time": 0.0012, "eval_runtime": 0.3151, "eval_samples_per_second": 1656.651, "eval_steps_per_second": 104.731, "step": 29000 }, { "epoch": 48.38709677419355, "grad_norm": 15.790621757507324, "learning_rate": 1.3548602150537636e-05, "loss": 0.8897, "step": 30000 }, { "epoch": 48.38709677419355, "eval_loss": 1.544758915901184, "eval_model_preparation_time": 0.0012, "eval_runtime": 0.3045, "eval_samples_per_second": 1714.222, "eval_steps_per_second": 108.37, "step": 30000 }, { "epoch": 50.0, "grad_norm": 15.337139129638672, "learning_rate": 1.3333548387096776e-05, "loss": 0.9353, "step": 31000 }, { "epoch": 50.0, "eval_loss": 1.7019206285476685, "eval_model_preparation_time": 0.0012, "eval_runtime": 0.3181, "eval_samples_per_second": 1640.743, "eval_steps_per_second": 103.725, "step": 31000 }, { "epoch": 51.61290322580645, "grad_norm": 17.48087501525879, "learning_rate": 1.3118494623655916e-05, "loss": 0.8976, "step": 32000 }, { "epoch": 51.61290322580645, "eval_loss": 1.6256884336471558, "eval_model_preparation_time": 0.0012, "eval_runtime": 0.3131, "eval_samples_per_second": 1667.367, "eval_steps_per_second": 105.408, "step": 32000 }, { "epoch": 53.225806451612904, "grad_norm": 15.387638092041016, "learning_rate": 1.2903440860215055e-05, "loss": 0.8414, "step": 33000 }, { "epoch": 53.225806451612904, "eval_loss": 1.5139249563217163, "eval_model_preparation_time": 0.0012, "eval_runtime": 0.3191, "eval_samples_per_second": 1635.972, "eval_steps_per_second": 103.424, "step": 33000 }, { "epoch": 54.83870967741935, "grad_norm": 15.2994384765625, "learning_rate": 1.2688387096774195e-05, "loss": 0.7897, "step": 34000 }, { "epoch": 54.83870967741935, "eval_loss": 1.7013849020004272, "eval_model_preparation_time": 0.0012, "eval_runtime": 0.3268, "eval_samples_per_second": 1597.145, "eval_steps_per_second": 100.969, "step": 34000 }, { "epoch": 56.45161290322581, "grad_norm": 14.40909481048584, "learning_rate": 1.2473333333333335e-05, "loss": 0.8627, "step": 35000 }, { "epoch": 56.45161290322581, "eval_loss": 1.7141073942184448, "eval_model_preparation_time": 0.0012, "eval_runtime": 0.3089, "eval_samples_per_second": 1689.899, "eval_steps_per_second": 106.833, "step": 35000 }, { "epoch": 58.064516129032256, "grad_norm": 19.243818283081055, "learning_rate": 1.2258279569892474e-05, "loss": 0.9135, "step": 36000 }, { "epoch": 58.064516129032256, "eval_loss": 1.678747296333313, "eval_model_preparation_time": 0.0012, "eval_runtime": 0.3278, "eval_samples_per_second": 1592.41, "eval_steps_per_second": 100.67, "step": 36000 }, { "epoch": 59.67741935483871, "grad_norm": 14.35431957244873, "learning_rate": 1.2043225806451614e-05, "loss": 0.9226, "step": 37000 }, { "epoch": 59.67741935483871, "eval_loss": 1.9941015243530273, "eval_model_preparation_time": 0.0012, "eval_runtime": 0.3066, "eval_samples_per_second": 1702.667, "eval_steps_per_second": 107.64, "step": 37000 }, { "epoch": 61.29032258064516, "grad_norm": 16.02369499206543, "learning_rate": 1.1828172043010752e-05, "loss": 0.8849, "step": 38000 }, { "epoch": 61.29032258064516, "eval_loss": 1.5911988019943237, "eval_model_preparation_time": 0.0012, "eval_runtime": 0.3059, "eval_samples_per_second": 1706.321, "eval_steps_per_second": 107.871, "step": 38000 }, { "epoch": 62.903225806451616, "grad_norm": 24.164094924926758, "learning_rate": 1.1613118279569892e-05, "loss": 0.7974, "step": 39000 }, { "epoch": 62.903225806451616, "eval_loss": 1.5700287818908691, "eval_model_preparation_time": 0.0012, "eval_runtime": 0.3059, "eval_samples_per_second": 1706.437, "eval_steps_per_second": 107.878, "step": 39000 }, { "epoch": 64.51612903225806, "grad_norm": 10.7676420211792, "learning_rate": 1.1398064516129033e-05, "loss": 0.7892, "step": 40000 }, { "epoch": 64.51612903225806, "eval_loss": 1.6208666563034058, "eval_model_preparation_time": 0.0012, "eval_runtime": 0.3135, "eval_samples_per_second": 1665.325, "eval_steps_per_second": 105.279, "step": 40000 }, { "epoch": 66.12903225806451, "grad_norm": 8.90040111541748, "learning_rate": 1.1183010752688173e-05, "loss": 0.7728, "step": 41000 }, { "epoch": 66.12903225806451, "eval_loss": 1.5275108814239502, "eval_model_preparation_time": 0.0012, "eval_runtime": 0.3064, "eval_samples_per_second": 1703.395, "eval_steps_per_second": 107.686, "step": 41000 }, { "epoch": 67.74193548387096, "grad_norm": 16.836742401123047, "learning_rate": 1.0967956989247313e-05, "loss": 0.7309, "step": 42000 }, { "epoch": 67.74193548387096, "eval_loss": 1.6568617820739746, "eval_model_preparation_time": 0.0012, "eval_runtime": 0.3052, "eval_samples_per_second": 1710.328, "eval_steps_per_second": 108.124, "step": 42000 }, { "epoch": 69.35483870967742, "grad_norm": 16.19956398010254, "learning_rate": 1.0752903225806453e-05, "loss": 0.6891, "step": 43000 }, { "epoch": 69.35483870967742, "eval_loss": 1.4376003742218018, "eval_model_preparation_time": 0.0012, "eval_runtime": 0.3272, "eval_samples_per_second": 1595.464, "eval_steps_per_second": 100.863, "step": 43000 }, { "epoch": 70.96774193548387, "grad_norm": 19.571664810180664, "learning_rate": 1.0537849462365592e-05, "loss": 0.6732, "step": 44000 }, { "epoch": 70.96774193548387, "eval_loss": 1.6094655990600586, "eval_model_preparation_time": 0.0012, "eval_runtime": 0.3144, "eval_samples_per_second": 1660.491, "eval_steps_per_second": 104.974, "step": 44000 }, { "epoch": 72.58064516129032, "grad_norm": 11.60450267791748, "learning_rate": 1.0322795698924732e-05, "loss": 0.6475, "step": 45000 }, { "epoch": 72.58064516129032, "eval_loss": 1.569161295890808, "eval_model_preparation_time": 0.0012, "eval_runtime": 0.3241, "eval_samples_per_second": 1610.77, "eval_steps_per_second": 101.83, "step": 45000 }, { "epoch": 74.19354838709677, "grad_norm": 14.973388671875, "learning_rate": 1.0107741935483872e-05, "loss": 0.674, "step": 46000 }, { "epoch": 74.19354838709677, "eval_loss": 1.4532381296157837, "eval_model_preparation_time": 0.0012, "eval_runtime": 0.3117, "eval_samples_per_second": 1674.469, "eval_steps_per_second": 105.857, "step": 46000 }, { "epoch": 75.80645161290323, "grad_norm": 19.416486740112305, "learning_rate": 9.892688172043012e-06, "loss": 0.6339, "step": 47000 }, { "epoch": 75.80645161290323, "eval_loss": 1.5601801872253418, "eval_model_preparation_time": 0.0012, "eval_runtime": 0.3055, "eval_samples_per_second": 1708.783, "eval_steps_per_second": 108.027, "step": 47000 }, { "epoch": 77.41935483870968, "grad_norm": 12.237533569335938, "learning_rate": 9.677634408602151e-06, "loss": 0.628, "step": 48000 }, { "epoch": 77.41935483870968, "eval_loss": 1.5352447032928467, "eval_model_preparation_time": 0.0012, "eval_runtime": 0.3243, "eval_samples_per_second": 1609.448, "eval_steps_per_second": 101.747, "step": 48000 }, { "epoch": 79.03225806451613, "grad_norm": 8.90131664276123, "learning_rate": 9.462580645161291e-06, "loss": 0.6123, "step": 49000 }, { "epoch": 79.03225806451613, "eval_loss": 1.6023005247116089, "eval_model_preparation_time": 0.0012, "eval_runtime": 0.3064, "eval_samples_per_second": 1703.925, "eval_steps_per_second": 107.719, "step": 49000 }, { "epoch": 80.64516129032258, "grad_norm": 19.542125701904297, "learning_rate": 9.24752688172043e-06, "loss": 0.5913, "step": 50000 }, { "epoch": 80.64516129032258, "eval_loss": 1.4985138177871704, "eval_model_preparation_time": 0.0012, "eval_runtime": 0.3143, "eval_samples_per_second": 1660.843, "eval_steps_per_second": 104.996, "step": 50000 }, { "epoch": 82.25806451612904, "grad_norm": 15.9403715133667, "learning_rate": 9.03247311827957e-06, "loss": 0.5919, "step": 51000 }, { "epoch": 82.25806451612904, "eval_loss": 1.557279109954834, "eval_model_preparation_time": 0.0012, "eval_runtime": 0.3138, "eval_samples_per_second": 1663.684, "eval_steps_per_second": 105.175, "step": 51000 }, { "epoch": 83.87096774193549, "grad_norm": 16.341463088989258, "learning_rate": 8.81741935483871e-06, "loss": 0.5849, "step": 52000 }, { "epoch": 83.87096774193549, "eval_loss": 1.744088888168335, "eval_model_preparation_time": 0.0012, "eval_runtime": 0.3074, "eval_samples_per_second": 1698.241, "eval_steps_per_second": 107.36, "step": 52000 }, { "epoch": 85.48387096774194, "grad_norm": 17.496572494506836, "learning_rate": 8.60236559139785e-06, "loss": 0.5798, "step": 53000 }, { "epoch": 85.48387096774194, "eval_loss": 1.5605759620666504, "eval_model_preparation_time": 0.0012, "eval_runtime": 0.3312, "eval_samples_per_second": 1576.078, "eval_steps_per_second": 99.637, "step": 53000 }, { "epoch": 87.09677419354838, "grad_norm": 22.154132843017578, "learning_rate": 8.38731182795699e-06, "loss": 0.5627, "step": 54000 }, { "epoch": 87.09677419354838, "eval_loss": 1.486401081085205, "eval_model_preparation_time": 0.0012, "eval_runtime": 0.3046, "eval_samples_per_second": 1713.463, "eval_steps_per_second": 108.322, "step": 54000 }, { "epoch": 88.70967741935483, "grad_norm": 12.007641792297363, "learning_rate": 8.17225806451613e-06, "loss": 0.5926, "step": 55000 }, { "epoch": 88.70967741935483, "eval_loss": 1.533622145652771, "eval_model_preparation_time": 0.0012, "eval_runtime": 0.3189, "eval_samples_per_second": 1636.753, "eval_steps_per_second": 103.473, "step": 55000 }, { "epoch": 90.3225806451613, "grad_norm": 16.921255111694336, "learning_rate": 7.957204301075269e-06, "loss": 0.5737, "step": 56000 }, { "epoch": 90.3225806451613, "eval_loss": 1.595588207244873, "eval_model_preparation_time": 0.0012, "eval_runtime": 0.3248, "eval_samples_per_second": 1607.001, "eval_steps_per_second": 101.592, "step": 56000 }, { "epoch": 91.93548387096774, "grad_norm": 14.567840576171875, "learning_rate": 7.74215053763441e-06, "loss": 0.5521, "step": 57000 }, { "epoch": 91.93548387096774, "eval_loss": 1.6286988258361816, "eval_model_preparation_time": 0.0012, "eval_runtime": 0.308, "eval_samples_per_second": 1694.658, "eval_steps_per_second": 107.134, "step": 57000 }, { "epoch": 93.54838709677419, "grad_norm": 7.83158016204834, "learning_rate": 7.5270967741935486e-06, "loss": 0.5672, "step": 58000 }, { "epoch": 93.54838709677419, "eval_loss": 1.6612709760665894, "eval_model_preparation_time": 0.0012, "eval_runtime": 0.3047, "eval_samples_per_second": 1713.282, "eval_steps_per_second": 108.311, "step": 58000 }, { "epoch": 95.16129032258064, "grad_norm": 20.766202926635742, "learning_rate": 7.312043010752688e-06, "loss": 0.5685, "step": 59000 }, { "epoch": 95.16129032258064, "eval_loss": 1.5319266319274902, "eval_model_preparation_time": 0.0012, "eval_runtime": 0.3061, "eval_samples_per_second": 1705.367, "eval_steps_per_second": 107.811, "step": 59000 }, { "epoch": 96.7741935483871, "grad_norm": 13.834534645080566, "learning_rate": 7.096989247311829e-06, "loss": 0.5394, "step": 60000 }, { "epoch": 96.7741935483871, "eval_loss": 1.5068557262420654, "eval_model_preparation_time": 0.0012, "eval_runtime": 0.3061, "eval_samples_per_second": 1705.255, "eval_steps_per_second": 107.803, "step": 60000 }, { "epoch": 98.38709677419355, "grad_norm": 9.130626678466797, "learning_rate": 6.881935483870969e-06, "loss": 0.5095, "step": 61000 }, { "epoch": 98.38709677419355, "eval_loss": 1.4926313161849976, "eval_model_preparation_time": 0.0012, "eval_runtime": 0.3074, "eval_samples_per_second": 1698.19, "eval_steps_per_second": 107.357, "step": 61000 }, { "epoch": 100.0, "grad_norm": 18.79903793334961, "learning_rate": 6.666881720430108e-06, "loss": 0.5327, "step": 62000 }, { "epoch": 100.0, "eval_loss": 1.4378135204315186, "eval_model_preparation_time": 0.0012, "eval_runtime": 0.3146, "eval_samples_per_second": 1659.02, "eval_steps_per_second": 104.881, "step": 62000 }, { "epoch": 101.61290322580645, "grad_norm": 17.528038024902344, "learning_rate": 6.451827956989248e-06, "loss": 0.5108, "step": 63000 }, { "epoch": 101.61290322580645, "eval_loss": 1.4716895818710327, "eval_model_preparation_time": 0.0012, "eval_runtime": 0.3118, "eval_samples_per_second": 1673.899, "eval_steps_per_second": 105.821, "step": 63000 }, { "epoch": 103.2258064516129, "grad_norm": 9.862174034118652, "learning_rate": 6.236774193548387e-06, "loss": 0.4874, "step": 64000 }, { "epoch": 103.2258064516129, "eval_loss": 1.519917368888855, "eval_model_preparation_time": 0.0012, "eval_runtime": 0.3105, "eval_samples_per_second": 1681.31, "eval_steps_per_second": 106.29, "step": 64000 }, { "epoch": 104.83870967741936, "grad_norm": 11.85350513458252, "learning_rate": 6.0217204301075275e-06, "loss": 0.4856, "step": 65000 }, { "epoch": 104.83870967741936, "eval_loss": 1.5175796747207642, "eval_model_preparation_time": 0.0012, "eval_runtime": 0.3143, "eval_samples_per_second": 1661.035, "eval_steps_per_second": 105.008, "step": 65000 }, { "epoch": 106.45161290322581, "grad_norm": 21.145742416381836, "learning_rate": 5.806666666666667e-06, "loss": 0.4665, "step": 66000 }, { "epoch": 106.45161290322581, "eval_loss": 1.5837030410766602, "eval_model_preparation_time": 0.0012, "eval_runtime": 0.3108, "eval_samples_per_second": 1679.611, "eval_steps_per_second": 106.182, "step": 66000 }, { "epoch": 108.06451612903226, "grad_norm": 8.358002662658691, "learning_rate": 5.591612903225807e-06, "loss": 0.4846, "step": 67000 }, { "epoch": 108.06451612903226, "eval_loss": 1.3910651206970215, "eval_model_preparation_time": 0.0012, "eval_runtime": 0.3115, "eval_samples_per_second": 1676.029, "eval_steps_per_second": 105.956, "step": 67000 } ], "logging_steps": 1000, "max_steps": 93000, "num_input_tokens_seen": 0, "num_train_epochs": 150, "save_steps": 1000, "stateful_callbacks": { "EarlyStoppingCallback": { "args": { "early_stopping_patience": 80, "early_stopping_threshold": 0.0 }, "attributes": { "early_stopping_patience_counter": 0 } }, "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.1978332035428352e+16, "train_batch_size": 16, "trial_name": null, "trial_params": null }