| { | |
| "best_global_step": 1296, | |
| "best_metric": 0.8998624190985544, | |
| "best_model_checkpoint": "models/modernbert_location_improved/checkpoint-1296", | |
| "epoch": 20.0, | |
| "eval_steps": 500, | |
| "global_step": 2160, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.46296296296296297, | |
| "grad_norm": 200.00001525878906, | |
| "learning_rate": 3.0246913580246917e-06, | |
| "loss": 1.1761, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.9259259259259259, | |
| "grad_norm": 31.97449493408203, | |
| "learning_rate": 6.111111111111112e-06, | |
| "loss": 0.7491, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 1.0, | |
| "eval_f1_macro": 0.1659703881552621, | |
| "eval_f1_micro": 0.3064516129032258, | |
| "eval_hamming_accuracy": 0.8130434782608695, | |
| "eval_loss": 0.7170208096504211, | |
| "eval_precision_micro": 0.30158730158730157, | |
| "eval_recall_micro": 0.3114754098360656, | |
| "eval_runtime": 0.8049, | |
| "eval_samples_per_second": 228.588, | |
| "eval_steps_per_second": 14.908, | |
| "step": 108 | |
| }, | |
| { | |
| "epoch": 1.3888888888888888, | |
| "grad_norm": 20.814834594726562, | |
| "learning_rate": 9.197530864197531e-06, | |
| "loss": 0.5513, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 1.8518518518518519, | |
| "grad_norm": 15.15282154083252, | |
| "learning_rate": 1.228395061728395e-05, | |
| "loss": 0.4094, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 2.0, | |
| "eval_f1_macro": 0.6753457587631049, | |
| "eval_f1_micro": 0.7657992565055762, | |
| "eval_hamming_accuracy": 0.9315217391304348, | |
| "eval_loss": 0.3277210593223572, | |
| "eval_precision_micro": 0.7006802721088435, | |
| "eval_recall_micro": 0.8442622950819673, | |
| "eval_runtime": 0.8136, | |
| "eval_samples_per_second": 226.16, | |
| "eval_steps_per_second": 14.75, | |
| "step": 216 | |
| }, | |
| { | |
| "epoch": 2.314814814814815, | |
| "grad_norm": 15.625266075134277, | |
| "learning_rate": 1.537037037037037e-05, | |
| "loss": 0.2508, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 2.7777777777777777, | |
| "grad_norm": 14.964295387268066, | |
| "learning_rate": 1.8456790123456792e-05, | |
| "loss": 0.364, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 3.0, | |
| "eval_f1_macro": 0.7302185347460938, | |
| "eval_f1_micro": 0.7072243346007605, | |
| "eval_hamming_accuracy": 0.9163043478260869, | |
| "eval_loss": 0.6868911981582642, | |
| "eval_precision_micro": 0.6595744680851063, | |
| "eval_recall_micro": 0.7622950819672131, | |
| "eval_runtime": 0.8049, | |
| "eval_samples_per_second": 228.614, | |
| "eval_steps_per_second": 14.91, | |
| "step": 324 | |
| }, | |
| { | |
| "epoch": 3.240740740740741, | |
| "grad_norm": 11.293182373046875, | |
| "learning_rate": 1.9727668845315906e-05, | |
| "loss": 0.2485, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 3.7037037037037037, | |
| "grad_norm": 50.22275161743164, | |
| "learning_rate": 1.9183006535947716e-05, | |
| "loss": 0.1665, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 4.0, | |
| "eval_f1_macro": 0.8837470449172576, | |
| "eval_f1_micro": 0.8755020080321285, | |
| "eval_hamming_accuracy": 0.966304347826087, | |
| "eval_loss": 0.3539002537727356, | |
| "eval_precision_micro": 0.8582677165354331, | |
| "eval_recall_micro": 0.8934426229508197, | |
| "eval_runtime": 0.7776, | |
| "eval_samples_per_second": 236.63, | |
| "eval_steps_per_second": 15.432, | |
| "step": 432 | |
| }, | |
| { | |
| "epoch": 4.166666666666667, | |
| "grad_norm": 0.16227640211582184, | |
| "learning_rate": 1.8638344226579522e-05, | |
| "loss": 0.1069, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 4.62962962962963, | |
| "grad_norm": 10.707866668701172, | |
| "learning_rate": 1.809368191721133e-05, | |
| "loss": 0.0876, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 5.0, | |
| "eval_f1_macro": 0.8056742217126731, | |
| "eval_f1_micro": 0.8093023255813954, | |
| "eval_hamming_accuracy": 0.9554347826086956, | |
| "eval_loss": 0.4993918538093567, | |
| "eval_precision_micro": 0.9354838709677419, | |
| "eval_recall_micro": 0.7131147540983607, | |
| "eval_runtime": 0.7714, | |
| "eval_samples_per_second": 238.515, | |
| "eval_steps_per_second": 15.555, | |
| "step": 540 | |
| }, | |
| { | |
| "epoch": 5.092592592592593, | |
| "grad_norm": 0.11287817358970642, | |
| "learning_rate": 1.7549019607843138e-05, | |
| "loss": 0.1097, | |
| "step": 550 | |
| }, | |
| { | |
| "epoch": 5.555555555555555, | |
| "grad_norm": 0.1228959709405899, | |
| "learning_rate": 1.7004357298474948e-05, | |
| "loss": 0.1002, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 6.0, | |
| "eval_f1_macro": 0.8003939860255092, | |
| "eval_f1_micro": 0.8523206751054853, | |
| "eval_hamming_accuracy": 0.9619565217391304, | |
| "eval_loss": 0.34277206659317017, | |
| "eval_precision_micro": 0.8782608695652174, | |
| "eval_recall_micro": 0.8278688524590164, | |
| "eval_runtime": 0.7799, | |
| "eval_samples_per_second": 235.933, | |
| "eval_steps_per_second": 15.387, | |
| "step": 648 | |
| }, | |
| { | |
| "epoch": 6.018518518518518, | |
| "grad_norm": 1.964501976966858, | |
| "learning_rate": 1.6459694989106757e-05, | |
| "loss": 0.1086, | |
| "step": 650 | |
| }, | |
| { | |
| "epoch": 6.481481481481482, | |
| "grad_norm": 5.079234600067139, | |
| "learning_rate": 1.5915032679738563e-05, | |
| "loss": 0.0317, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 6.944444444444445, | |
| "grad_norm": 0.015461280010640621, | |
| "learning_rate": 1.537037037037037e-05, | |
| "loss": 0.0337, | |
| "step": 750 | |
| }, | |
| { | |
| "epoch": 7.0, | |
| "eval_f1_macro": 0.8087484096381885, | |
| "eval_f1_micro": 0.8760330578512396, | |
| "eval_hamming_accuracy": 0.967391304347826, | |
| "eval_loss": 0.5436545014381409, | |
| "eval_precision_micro": 0.8833333333333333, | |
| "eval_recall_micro": 0.8688524590163934, | |
| "eval_runtime": 0.7919, | |
| "eval_samples_per_second": 232.353, | |
| "eval_steps_per_second": 15.153, | |
| "step": 756 | |
| }, | |
| { | |
| "epoch": 7.407407407407407, | |
| "grad_norm": 0.021498998627066612, | |
| "learning_rate": 1.4825708061002179e-05, | |
| "loss": 0.0692, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 7.87037037037037, | |
| "grad_norm": 0.15338653326034546, | |
| "learning_rate": 1.4281045751633989e-05, | |
| "loss": 0.0329, | |
| "step": 850 | |
| }, | |
| { | |
| "epoch": 8.0, | |
| "eval_f1_macro": 0.7915487427320095, | |
| "eval_f1_micro": 0.8594377510040161, | |
| "eval_hamming_accuracy": 0.9619565217391304, | |
| "eval_loss": 0.5873637795448303, | |
| "eval_precision_micro": 0.84251968503937, | |
| "eval_recall_micro": 0.8770491803278688, | |
| "eval_runtime": 0.774, | |
| "eval_samples_per_second": 237.727, | |
| "eval_steps_per_second": 15.504, | |
| "step": 864 | |
| }, | |
| { | |
| "epoch": 8.333333333333334, | |
| "grad_norm": 0.38751789927482605, | |
| "learning_rate": 1.3736383442265797e-05, | |
| "loss": 0.0084, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 8.796296296296296, | |
| "grad_norm": 0.0011671845568343997, | |
| "learning_rate": 1.3191721132897603e-05, | |
| "loss": 0.0076, | |
| "step": 950 | |
| }, | |
| { | |
| "epoch": 9.0, | |
| "eval_f1_macro": 0.8627879519336726, | |
| "eval_f1_micro": 0.8524590163934426, | |
| "eval_hamming_accuracy": 0.9608695652173913, | |
| "eval_loss": 0.47524958848953247, | |
| "eval_precision_micro": 0.8524590163934426, | |
| "eval_recall_micro": 0.8524590163934426, | |
| "eval_runtime": 0.7994, | |
| "eval_samples_per_second": 230.182, | |
| "eval_steps_per_second": 15.012, | |
| "step": 972 | |
| }, | |
| { | |
| "epoch": 9.25925925925926, | |
| "grad_norm": 0.002920223632827401, | |
| "learning_rate": 1.2647058823529412e-05, | |
| "loss": 0.0165, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 9.722222222222221, | |
| "grad_norm": 0.0009965004865080118, | |
| "learning_rate": 1.210239651416122e-05, | |
| "loss": 0.0026, | |
| "step": 1050 | |
| }, | |
| { | |
| "epoch": 10.0, | |
| "eval_f1_macro": 0.8722597314787984, | |
| "eval_f1_micro": 0.8688524590163934, | |
| "eval_hamming_accuracy": 0.9652173913043478, | |
| "eval_loss": 0.5498170256614685, | |
| "eval_precision_micro": 0.8688524590163934, | |
| "eval_recall_micro": 0.8688524590163934, | |
| "eval_runtime": 0.7755, | |
| "eval_samples_per_second": 237.267, | |
| "eval_steps_per_second": 15.474, | |
| "step": 1080 | |
| }, | |
| { | |
| "epoch": 10.185185185185185, | |
| "grad_norm": 0.0005692149861715734, | |
| "learning_rate": 1.155773420479303e-05, | |
| "loss": 0.0021, | |
| "step": 1100 | |
| }, | |
| { | |
| "epoch": 10.648148148148149, | |
| "grad_norm": 0.008419875986874104, | |
| "learning_rate": 1.1013071895424838e-05, | |
| "loss": 0.0161, | |
| "step": 1150 | |
| }, | |
| { | |
| "epoch": 11.0, | |
| "eval_f1_macro": 0.8881631826284233, | |
| "eval_f1_micro": 0.888, | |
| "eval_hamming_accuracy": 0.9695652173913043, | |
| "eval_loss": 0.4741128981113434, | |
| "eval_precision_micro": 0.8671875, | |
| "eval_recall_micro": 0.9098360655737705, | |
| "eval_runtime": 0.7789, | |
| "eval_samples_per_second": 236.228, | |
| "eval_steps_per_second": 15.406, | |
| "step": 1188 | |
| }, | |
| { | |
| "epoch": 11.11111111111111, | |
| "grad_norm": 0.0006177773466333747, | |
| "learning_rate": 1.0468409586056646e-05, | |
| "loss": 0.0001, | |
| "step": 1200 | |
| }, | |
| { | |
| "epoch": 11.574074074074074, | |
| "grad_norm": 9.912909507751465, | |
| "learning_rate": 9.923747276688453e-06, | |
| "loss": 0.0011, | |
| "step": 1250 | |
| }, | |
| { | |
| "epoch": 12.0, | |
| "eval_f1_macro": 0.8998624190985544, | |
| "eval_f1_micro": 0.888, | |
| "eval_hamming_accuracy": 0.9695652173913043, | |
| "eval_loss": 0.47868961095809937, | |
| "eval_precision_micro": 0.8671875, | |
| "eval_recall_micro": 0.9098360655737705, | |
| "eval_runtime": 0.8483, | |
| "eval_samples_per_second": 216.907, | |
| "eval_steps_per_second": 14.146, | |
| "step": 1296 | |
| }, | |
| { | |
| "epoch": 12.037037037037036, | |
| "grad_norm": 0.0006370625924319029, | |
| "learning_rate": 9.379084967320261e-06, | |
| "loss": 0.0002, | |
| "step": 1300 | |
| }, | |
| { | |
| "epoch": 12.5, | |
| "grad_norm": 0.0052326153963804245, | |
| "learning_rate": 8.834422657952071e-06, | |
| "loss": 0.0, | |
| "step": 1350 | |
| }, | |
| { | |
| "epoch": 12.962962962962964, | |
| "grad_norm": 0.002041223691776395, | |
| "learning_rate": 8.289760348583879e-06, | |
| "loss": 0.0029, | |
| "step": 1400 | |
| }, | |
| { | |
| "epoch": 13.0, | |
| "eval_f1_macro": 0.878162395977428, | |
| "eval_f1_micro": 0.8734693877551021, | |
| "eval_hamming_accuracy": 0.966304347826087, | |
| "eval_loss": 0.510221540927887, | |
| "eval_precision_micro": 0.8699186991869918, | |
| "eval_recall_micro": 0.8770491803278688, | |
| "eval_runtime": 0.801, | |
| "eval_samples_per_second": 229.721, | |
| "eval_steps_per_second": 14.982, | |
| "step": 1404 | |
| }, | |
| { | |
| "epoch": 13.425925925925926, | |
| "grad_norm": 0.0002072990027954802, | |
| "learning_rate": 7.745098039215687e-06, | |
| "loss": 0.0, | |
| "step": 1450 | |
| }, | |
| { | |
| "epoch": 13.88888888888889, | |
| "grad_norm": 0.004894016310572624, | |
| "learning_rate": 7.200435729847495e-06, | |
| "loss": 0.0, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 14.0, | |
| "eval_f1_macro": 0.878162395977428, | |
| "eval_f1_micro": 0.8734693877551021, | |
| "eval_hamming_accuracy": 0.966304347826087, | |
| "eval_loss": 0.5220283269882202, | |
| "eval_precision_micro": 0.8699186991869918, | |
| "eval_recall_micro": 0.8770491803278688, | |
| "eval_runtime": 0.7755, | |
| "eval_samples_per_second": 237.272, | |
| "eval_steps_per_second": 15.474, | |
| "step": 1512 | |
| }, | |
| { | |
| "epoch": 14.351851851851851, | |
| "grad_norm": 0.00022991809237282723, | |
| "learning_rate": 6.655773420479303e-06, | |
| "loss": 0.0, | |
| "step": 1550 | |
| }, | |
| { | |
| "epoch": 14.814814814814815, | |
| "grad_norm": 0.003730728989467025, | |
| "learning_rate": 6.111111111111112e-06, | |
| "loss": 0.0, | |
| "step": 1600 | |
| }, | |
| { | |
| "epoch": 15.0, | |
| "eval_f1_macro": 0.878162395977428, | |
| "eval_f1_micro": 0.8734693877551021, | |
| "eval_hamming_accuracy": 0.966304347826087, | |
| "eval_loss": 0.5267383456230164, | |
| "eval_precision_micro": 0.8699186991869918, | |
| "eval_recall_micro": 0.8770491803278688, | |
| "eval_runtime": 0.7738, | |
| "eval_samples_per_second": 237.782, | |
| "eval_steps_per_second": 15.507, | |
| "step": 1620 | |
| }, | |
| { | |
| "epoch": 15.277777777777779, | |
| "grad_norm": 0.0003092221450060606, | |
| "learning_rate": 5.56644880174292e-06, | |
| "loss": 0.0, | |
| "step": 1650 | |
| }, | |
| { | |
| "epoch": 15.74074074074074, | |
| "grad_norm": 0.0007665773155167699, | |
| "learning_rate": 5.021786492374729e-06, | |
| "loss": 0.0, | |
| "step": 1700 | |
| }, | |
| { | |
| "epoch": 16.0, | |
| "eval_f1_macro": 0.878162395977428, | |
| "eval_f1_micro": 0.8734693877551021, | |
| "eval_hamming_accuracy": 0.966304347826087, | |
| "eval_loss": 0.5296750068664551, | |
| "eval_precision_micro": 0.8699186991869918, | |
| "eval_recall_micro": 0.8770491803278688, | |
| "eval_runtime": 0.772, | |
| "eval_samples_per_second": 238.335, | |
| "eval_steps_per_second": 15.544, | |
| "step": 1728 | |
| }, | |
| { | |
| "epoch": 16.203703703703702, | |
| "grad_norm": 0.00021788894082419574, | |
| "learning_rate": 4.477124183006537e-06, | |
| "loss": 0.0, | |
| "step": 1750 | |
| }, | |
| { | |
| "epoch": 16.666666666666668, | |
| "grad_norm": 0.0010182112455368042, | |
| "learning_rate": 3.9324618736383445e-06, | |
| "loss": 0.0, | |
| "step": 1800 | |
| }, | |
| { | |
| "epoch": 17.0, | |
| "eval_f1_macro": 0.878162395977428, | |
| "eval_f1_micro": 0.8734693877551021, | |
| "eval_hamming_accuracy": 0.966304347826087, | |
| "eval_loss": 0.5321551561355591, | |
| "eval_precision_micro": 0.8699186991869918, | |
| "eval_recall_micro": 0.8770491803278688, | |
| "eval_runtime": 0.7878, | |
| "eval_samples_per_second": 233.558, | |
| "eval_steps_per_second": 15.232, | |
| "step": 1836 | |
| }, | |
| { | |
| "epoch": 17.12962962962963, | |
| "grad_norm": 0.00028640340315178037, | |
| "learning_rate": 3.387799564270153e-06, | |
| "loss": 0.0, | |
| "step": 1850 | |
| }, | |
| { | |
| "epoch": 17.59259259259259, | |
| "grad_norm": 3.951051621697843e-05, | |
| "learning_rate": 2.843137254901961e-06, | |
| "loss": 0.0, | |
| "step": 1900 | |
| }, | |
| { | |
| "epoch": 18.0, | |
| "eval_f1_macro": 0.878162395977428, | |
| "eval_f1_micro": 0.8734693877551021, | |
| "eval_hamming_accuracy": 0.966304347826087, | |
| "eval_loss": 0.533598780632019, | |
| "eval_precision_micro": 0.8699186991869918, | |
| "eval_recall_micro": 0.8770491803278688, | |
| "eval_runtime": 0.7821, | |
| "eval_samples_per_second": 235.258, | |
| "eval_steps_per_second": 15.343, | |
| "step": 1944 | |
| }, | |
| { | |
| "epoch": 18.055555555555557, | |
| "grad_norm": 0.00041217764373868704, | |
| "learning_rate": 2.2984749455337694e-06, | |
| "loss": 0.0, | |
| "step": 1950 | |
| }, | |
| { | |
| "epoch": 18.51851851851852, | |
| "grad_norm": 0.0014055465580895543, | |
| "learning_rate": 1.7538126361655775e-06, | |
| "loss": 0.0, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 18.98148148148148, | |
| "grad_norm": 0.00048187788343057036, | |
| "learning_rate": 1.2091503267973858e-06, | |
| "loss": 0.0, | |
| "step": 2050 | |
| }, | |
| { | |
| "epoch": 19.0, | |
| "eval_f1_macro": 0.878162395977428, | |
| "eval_f1_micro": 0.8734693877551021, | |
| "eval_hamming_accuracy": 0.966304347826087, | |
| "eval_loss": 0.5342282652854919, | |
| "eval_precision_micro": 0.8699186991869918, | |
| "eval_recall_micro": 0.8770491803278688, | |
| "eval_runtime": 0.7786, | |
| "eval_samples_per_second": 236.331, | |
| "eval_steps_per_second": 15.413, | |
| "step": 2052 | |
| }, | |
| { | |
| "epoch": 19.444444444444443, | |
| "grad_norm": 0.0001349859667243436, | |
| "learning_rate": 6.64488017429194e-07, | |
| "loss": 0.0, | |
| "step": 2100 | |
| }, | |
| { | |
| "epoch": 19.90740740740741, | |
| "grad_norm": 3.7313504435587674e-05, | |
| "learning_rate": 1.1982570806100218e-07, | |
| "loss": 0.0, | |
| "step": 2150 | |
| }, | |
| { | |
| "epoch": 20.0, | |
| "eval_f1_macro": 0.878162395977428, | |
| "eval_f1_micro": 0.8734693877551021, | |
| "eval_hamming_accuracy": 0.966304347826087, | |
| "eval_loss": 0.5344623327255249, | |
| "eval_precision_micro": 0.8699186991869918, | |
| "eval_recall_micro": 0.8770491803278688, | |
| "eval_runtime": 0.7905, | |
| "eval_samples_per_second": 232.759, | |
| "eval_steps_per_second": 15.18, | |
| "step": 2160 | |
| } | |
| ], | |
| "logging_steps": 50, | |
| "max_steps": 2160, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 20, | |
| "save_steps": 500, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 1.288650996406848e+16, | |
| "train_batch_size": 8, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |