{ "best_metric": 0.6332046332046332, "best_model_checkpoint": "./Validated_cracks_22122025_outputs/checkpoint-1620", "epoch": 30.0, "eval_steps": 500, "global_step": 3240, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.46296296296296297, "grad_norm": 10.873927116394043, "learning_rate": 2.901234567901235e-06, "loss": 1.415, "step": 50 }, { "epoch": 0.9259259259259259, "grad_norm": 10.678427696228027, "learning_rate": 5.9876543209876546e-06, "loss": 1.3724, "step": 100 }, { "epoch": 1.0, "eval_accuracy": 0.3359073359073359, "eval_loss": 1.3528215885162354, "eval_runtime": 1.7714, "eval_samples_per_second": 146.211, "eval_steps_per_second": 18.629, "step": 108 }, { "epoch": 1.3888888888888888, "grad_norm": 15.727510452270508, "learning_rate": 9.074074074074075e-06, "loss": 1.3621, "step": 150 }, { "epoch": 1.8518518518518519, "grad_norm": 18.13157844543457, "learning_rate": 9.996809987196146e-06, "loss": 1.3528, "step": 200 }, { "epoch": 2.0, "eval_accuracy": 0.42084942084942084, "eval_loss": 1.2848700284957886, "eval_runtime": 1.764, "eval_samples_per_second": 146.821, "eval_steps_per_second": 18.707, "step": 216 }, { "epoch": 2.314814814814815, "grad_norm": 11.158724784851074, "learning_rate": 9.981195232868493e-06, "loss": 1.2858, "step": 250 }, { "epoch": 2.7777777777777777, "grad_norm": 27.68938446044922, "learning_rate": 9.952610423187516e-06, "loss": 1.2839, "step": 300 }, { "epoch": 3.0, "eval_accuracy": 0.46332046332046334, "eval_loss": 1.2019163370132446, "eval_runtime": 1.7428, "eval_samples_per_second": 148.61, "eval_steps_per_second": 18.935, "step": 324 }, { "epoch": 3.240740740740741, "grad_norm": 16.059484481811523, "learning_rate": 9.91208532978737e-06, "loss": 1.2499, "step": 350 }, { "epoch": 3.7037037037037037, "grad_norm": 10.013123512268066, "learning_rate": 9.85807175279907e-06, "loss": 1.2608, "step": 400 }, { "epoch": 4.0, "eval_accuracy": 0.5212355212355212, "eval_loss": 1.1559516191482544, "eval_runtime": 1.7474, "eval_samples_per_second": 148.218, "eval_steps_per_second": 18.885, "step": 432 }, { "epoch": 4.166666666666667, "grad_norm": 24.753084182739258, "learning_rate": 9.791408709849578e-06, "loss": 1.2172, "step": 450 }, { "epoch": 4.62962962962963, "grad_norm": 12.23658275604248, "learning_rate": 9.712269778427157e-06, "loss": 1.2178, "step": 500 }, { "epoch": 5.0, "eval_accuracy": 0.555984555984556, "eval_loss": 1.0907317399978638, "eval_runtime": 1.747, "eval_samples_per_second": 148.252, "eval_steps_per_second": 18.889, "step": 540 }, { "epoch": 5.092592592592593, "grad_norm": 15.365643501281738, "learning_rate": 9.620861020786583e-06, "loss": 1.2052, "step": 550 }, { "epoch": 5.555555555555555, "grad_norm": 12.787246704101562, "learning_rate": 9.517420447403444e-06, "loss": 1.1624, "step": 600 }, { "epoch": 6.0, "eval_accuracy": 0.528957528957529, "eval_loss": 1.0542538166046143, "eval_runtime": 1.7382, "eval_samples_per_second": 149.005, "eval_steps_per_second": 18.985, "step": 648 }, { "epoch": 6.018518518518518, "grad_norm": 13.462636947631836, "learning_rate": 9.404634840109069e-06, "loss": 1.166, "step": 650 }, { "epoch": 6.481481481481482, "grad_norm": 15.236122131347656, "learning_rate": 9.278195395448948e-06, "loss": 1.1335, "step": 700 }, { "epoch": 6.944444444444445, "grad_norm": 19.687719345092773, "learning_rate": 9.140616369122732e-06, "loss": 1.1108, "step": 750 }, { "epoch": 7.0, "eval_accuracy": 0.5598455598455598, "eval_loss": 1.0452436208724976, "eval_runtime": 1.7531, "eval_samples_per_second": 147.737, "eval_steps_per_second": 18.824, "step": 756 }, { "epoch": 7.407407407407407, "grad_norm": 16.47871971130371, "learning_rate": 8.992255989929632e-06, "loss": 1.0708, "step": 800 }, { "epoch": 7.87037037037037, "grad_norm": 19.686279296875, "learning_rate": 8.833500559197024e-06, "loss": 1.1028, "step": 850 }, { "epoch": 8.0, "eval_accuracy": 0.5598455598455598, "eval_loss": 1.0568522214889526, "eval_runtime": 1.7528, "eval_samples_per_second": 147.764, "eval_steps_per_second": 18.827, "step": 864 }, { "epoch": 8.333333333333334, "grad_norm": 37.79273986816406, "learning_rate": 8.664763444927562e-06, "loss": 1.0619, "step": 900 }, { "epoch": 8.796296296296296, "grad_norm": 16.420129776000977, "learning_rate": 8.486484005469977e-06, "loss": 1.1023, "step": 950 }, { "epoch": 9.0, "eval_accuracy": 0.5675675675675675, "eval_loss": 1.057991862297058, "eval_runtime": 1.7353, "eval_samples_per_second": 149.25, "eval_steps_per_second": 19.016, "step": 972 }, { "epoch": 9.25925925925926, "grad_norm": 25.430509567260742, "learning_rate": 8.299126445516126e-06, "loss": 1.1072, "step": 1000 }, { "epoch": 9.722222222222221, "grad_norm": 35.4229850769043, "learning_rate": 8.103178607403005e-06, "loss": 1.0572, "step": 1050 }, { "epoch": 10.0, "eval_accuracy": 0.6138996138996139, "eval_loss": 1.0031245946884155, "eval_runtime": 1.743, "eval_samples_per_second": 148.597, "eval_steps_per_second": 18.933, "step": 1080 }, { "epoch": 10.185185185185185, "grad_norm": 14.466697692871094, "learning_rate": 7.899150700867014e-06, "loss": 1.0489, "step": 1100 }, { "epoch": 10.648148148148149, "grad_norm": 21.01825714111328, "learning_rate": 7.687573974557857e-06, "loss": 0.9874, "step": 1150 }, { "epoch": 11.0, "eval_accuracy": 0.583011583011583, "eval_loss": 1.0340404510498047, "eval_runtime": 1.7486, "eval_samples_per_second": 148.117, "eval_steps_per_second": 18.872, "step": 1188 }, { "epoch": 11.11111111111111, "grad_norm": 16.48180389404297, "learning_rate": 7.4689993327712765e-06, "loss": 1.0212, "step": 1200 }, { "epoch": 11.574074074074074, "grad_norm": 18.663524627685547, "learning_rate": 7.243995901002312e-06, "loss": 1.0132, "step": 1250 }, { "epoch": 12.0, "eval_accuracy": 0.61003861003861, "eval_loss": 1.004916787147522, "eval_runtime": 1.7428, "eval_samples_per_second": 148.608, "eval_steps_per_second": 18.935, "step": 1296 }, { "epoch": 12.037037037037036, "grad_norm": 24.026588439941406, "learning_rate": 7.013149544054148e-06, "loss": 1.0111, "step": 1300 }, { "epoch": 12.5, "grad_norm": 20.475515365600586, "learning_rate": 6.777061340561082e-06, "loss": 1.005, "step": 1350 }, { "epoch": 12.962962962962964, "grad_norm": 49.801326751708984, "learning_rate": 6.5363460178976524e-06, "loss": 0.9898, "step": 1400 }, { "epoch": 13.0, "eval_accuracy": 0.6216216216216216, "eval_loss": 0.9874952435493469, "eval_runtime": 1.7492, "eval_samples_per_second": 148.065, "eval_steps_per_second": 18.865, "step": 1404 }, { "epoch": 13.425925925925926, "grad_norm": 38.209232330322266, "learning_rate": 6.291630351549136e-06, "loss": 0.9332, "step": 1450 }, { "epoch": 13.88888888888889, "grad_norm": 16.34885025024414, "learning_rate": 6.043551533111121e-06, "loss": 1.0182, "step": 1500 }, { "epoch": 14.0, "eval_accuracy": 0.61003861003861, "eval_loss": 0.9667864441871643, "eval_runtime": 1.7502, "eval_samples_per_second": 147.984, "eval_steps_per_second": 18.855, "step": 1512 }, { "epoch": 14.351851851851851, "grad_norm": 21.03826904296875, "learning_rate": 5.792755511167572e-06, "loss": 0.9279, "step": 1550 }, { "epoch": 14.814814814814815, "grad_norm": 34.984375, "learning_rate": 5.544968491028696e-06, "loss": 0.9889, "step": 1600 }, { "epoch": 15.0, "eval_accuracy": 0.6332046332046332, "eval_loss": 0.9692270755767822, "eval_runtime": 1.7309, "eval_samples_per_second": 149.63, "eval_steps_per_second": 19.065, "step": 1620 }, { "epoch": 15.277777777777779, "grad_norm": 16.456146240234375, "learning_rate": 5.290724144552379e-06, "loss": 0.9673, "step": 1650 }, { "epoch": 15.74074074074074, "grad_norm": 27.009721755981445, "learning_rate": 5.035722809427533e-06, "loss": 0.9446, "step": 1700 }, { "epoch": 16.0, "eval_accuracy": 0.6332046332046332, "eval_loss": 0.9777135848999023, "eval_runtime": 1.7383, "eval_samples_per_second": 148.995, "eval_steps_per_second": 18.984, "step": 1728 }, { "epoch": 16.203703703703702, "grad_norm": 12.319013595581055, "learning_rate": 4.780628459113764e-06, "loss": 0.9024, "step": 1750 }, { "epoch": 16.666666666666668, "grad_norm": 20.682802200317383, "learning_rate": 4.526105309263983e-06, "loss": 0.9519, "step": 1800 }, { "epoch": 17.0, "eval_accuracy": 0.5984555984555985, "eval_loss": 1.0030242204666138, "eval_runtime": 1.7421, "eval_samples_per_second": 148.671, "eval_steps_per_second": 18.943, "step": 1836 }, { "epoch": 17.12962962962963, "grad_norm": 24.291671752929688, "learning_rate": 4.272816088237135e-06, "loss": 1.003, "step": 1850 }, { "epoch": 17.59259259259259, "grad_norm": 13.384744644165039, "learning_rate": 4.021420311483538e-06, "loss": 0.9458, "step": 1900 }, { "epoch": 18.0, "eval_accuracy": 0.5984555984555985, "eval_loss": 0.97477126121521, "eval_runtime": 1.7465, "eval_samples_per_second": 148.299, "eval_steps_per_second": 18.895, "step": 1944 }, { "epoch": 18.055555555555557, "grad_norm": 15.239733695983887, "learning_rate": 3.7725725642960047e-06, "loss": 0.9798, "step": 1950 }, { "epoch": 18.51851851851852, "grad_norm": 24.09659767150879, "learning_rate": 3.526920797398148e-06, "loss": 0.9346, "step": 2000 }, { "epoch": 18.98148148148148, "grad_norm": 23.59765625, "learning_rate": 3.2851046398077705e-06, "loss": 0.9347, "step": 2050 }, { "epoch": 19.0, "eval_accuracy": 0.6177606177606177, "eval_loss": 0.9744410514831543, "eval_runtime": 1.7456, "eval_samples_per_second": 148.373, "eval_steps_per_second": 18.905, "step": 2052 }, { "epoch": 19.444444444444443, "grad_norm": 22.252914428710938, "learning_rate": 3.0477537333683815e-06, "loss": 0.8767, "step": 2100 }, { "epoch": 19.90740740740741, "grad_norm": 32.37730026245117, "learning_rate": 2.815486093285317e-06, "loss": 0.8863, "step": 2150 }, { "epoch": 20.0, "eval_accuracy": 0.6293436293436293, "eval_loss": 0.9656945466995239, "eval_runtime": 1.7376, "eval_samples_per_second": 149.058, "eval_steps_per_second": 18.992, "step": 2160 }, { "epoch": 20.37037037037037, "grad_norm": 18.95844841003418, "learning_rate": 2.5889064989353797e-06, "loss": 0.9081, "step": 2200 }, { "epoch": 20.833333333333332, "grad_norm": 84.69548034667969, "learning_rate": 2.3686049191399614e-06, "loss": 0.8507, "step": 2250 }, { "epoch": 21.0, "eval_accuracy": 0.6254826254826255, "eval_loss": 0.9784243702888489, "eval_runtime": 1.7492, "eval_samples_per_second": 148.065, "eval_steps_per_second": 18.865, "step": 2268 }, { "epoch": 21.296296296296298, "grad_norm": 34.8240852355957, "learning_rate": 2.155154976001948e-06, "loss": 0.9308, "step": 2300 }, { "epoch": 21.75925925925926, "grad_norm": 39.87207794189453, "learning_rate": 1.949112451306282e-06, "loss": 0.8712, "step": 2350 }, { "epoch": 22.0, "eval_accuracy": 0.6254826254826255, "eval_loss": 0.9790379405021667, "eval_runtime": 1.7458, "eval_samples_per_second": 148.355, "eval_steps_per_second": 18.902, "step": 2376 }, { "epoch": 22.22222222222222, "grad_norm": 15.800605773925781, "learning_rate": 1.7510138393732029e-06, "loss": 0.8973, "step": 2400 }, { "epoch": 22.685185185185187, "grad_norm": 13.362822532653809, "learning_rate": 1.5613749501322705e-06, "loss": 0.8857, "step": 2450 }, { "epoch": 23.0, "eval_accuracy": 0.6177606177606177, "eval_loss": 0.9682068824768066, "eval_runtime": 1.7485, "eval_samples_per_second": 148.127, "eval_steps_per_second": 18.873, "step": 2484 }, { "epoch": 23.14814814814815, "grad_norm": 26.928611755371094, "learning_rate": 1.3806895660544805e-06, "loss": 0.8446, "step": 2500 }, { "epoch": 23.61111111111111, "grad_norm": 52.092063903808594, "learning_rate": 1.2094281564395628e-06, "loss": 0.8848, "step": 2550 }, { "epoch": 24.0, "eval_accuracy": 0.6254826254826255, "eval_loss": 0.9722690582275391, "eval_runtime": 1.7534, "eval_samples_per_second": 147.714, "eval_steps_per_second": 18.821, "step": 2592 }, { "epoch": 24.074074074074073, "grad_norm": 22.29905128479004, "learning_rate": 1.0480366524062041e-06, "loss": 0.8778, "step": 2600 }, { "epoch": 24.537037037037038, "grad_norm": 23.949947357177734, "learning_rate": 8.969352857748842e-07, "loss": 0.8672, "step": 2650 }, { "epoch": 25.0, "grad_norm": 31.42759895324707, "learning_rate": 7.565174948666382e-07, "loss": 0.904, "step": 2700 }, { "epoch": 25.0, "eval_accuracy": 0.6177606177606177, "eval_loss": 0.975362241268158, "eval_runtime": 1.7452, "eval_samples_per_second": 148.407, "eval_steps_per_second": 18.909, "step": 2700 }, { "epoch": 25.462962962962962, "grad_norm": 20.711061477661133, "learning_rate": 6.271489000668418e-07, "loss": 0.7932, "step": 2750 }, { "epoch": 25.925925925925927, "grad_norm": 22.23466682434082, "learning_rate": 5.091663518214407e-07, "loss": 0.943, "step": 2800 }, { "epoch": 26.0, "eval_accuracy": 0.6216216216216216, "eval_loss": 0.9710479974746704, "eval_runtime": 1.7435, "eval_samples_per_second": 148.554, "eval_steps_per_second": 18.928, "step": 2808 }, { "epoch": 26.38888888888889, "grad_norm": 18.16496467590332, "learning_rate": 4.0287705354446147e-07, "loss": 0.9108, "step": 2850 }, { "epoch": 26.85185185185185, "grad_norm": 50.363128662109375, "learning_rate": 3.085577617205765e-07, "loss": 0.862, "step": 2900 }, { "epoch": 27.0, "eval_accuracy": 0.6177606177606177, "eval_loss": 0.9716824889183044, "eval_runtime": 1.7298, "eval_samples_per_second": 149.728, "eval_steps_per_second": 19.077, "step": 2916 }, { "epoch": 27.314814814814813, "grad_norm": 69.7729721069336, "learning_rate": 2.2645406528550407e-07, "loss": 0.8412, "step": 2950 }, { "epoch": 27.77777777777778, "grad_norm": 20.13194465637207, "learning_rate": 1.5677974616058856e-07, "loss": 0.864, "step": 3000 }, { "epoch": 28.0, "eval_accuracy": 0.6177606177606177, "eval_loss": 0.9705125689506531, "eval_runtime": 1.7418, "eval_samples_per_second": 148.699, "eval_steps_per_second": 18.946, "step": 3024 }, { "epoch": 28.24074074074074, "grad_norm": 29.41867446899414, "learning_rate": 9.971622260661007e-08, "loss": 0.8656, "step": 3050 }, { "epoch": 28.703703703703702, "grad_norm": 19.62738609313965, "learning_rate": 5.541207684621908e-08, "loss": 0.8879, "step": 3100 }, { "epoch": 29.0, "eval_accuracy": 0.6177606177606177, "eval_loss": 0.97031569480896, "eval_runtime": 1.7373, "eval_samples_per_second": 149.084, "eval_steps_per_second": 18.995, "step": 3132 }, { "epoch": 29.166666666666668, "grad_norm": 18.350698471069336, "learning_rate": 2.398266818496864e-08, "loss": 0.8594, "step": 3150 }, { "epoch": 29.62962962962963, "grad_norm": 28.054824829101562, "learning_rate": 5.509832638314061e-09, "loss": 0.8099, "step": 3200 }, { "epoch": 30.0, "eval_accuracy": 0.6177606177606177, "eval_loss": 0.97026127576828, "eval_runtime": 1.8767, "eval_samples_per_second": 138.009, "eval_steps_per_second": 17.584, "step": 3240 }, { "epoch": 30.0, "step": 3240, "total_flos": 2.0405460235862016e+18, "train_loss": 1.012899116233543, "train_runtime": 640.9857, "train_samples_per_second": 40.251, "train_steps_per_second": 5.055 } ], "logging_steps": 50, "max_steps": 3240, "num_input_tokens_seen": 0, "num_train_epochs": 30, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2.0405460235862016e+18, "train_batch_size": 8, "trial_name": null, "trial_params": null }