{ "best_global_step": 1296, "best_metric": 0.8998624190985544, "best_model_checkpoint": "models/modernbert_location_improved/checkpoint-1296", "epoch": 20.0, "eval_steps": 500, "global_step": 2160, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.46296296296296297, "grad_norm": 200.00001525878906, "learning_rate": 3.0246913580246917e-06, "loss": 1.1761, "step": 50 }, { "epoch": 0.9259259259259259, "grad_norm": 31.97449493408203, "learning_rate": 6.111111111111112e-06, "loss": 0.7491, "step": 100 }, { "epoch": 1.0, "eval_f1_macro": 0.1659703881552621, "eval_f1_micro": 0.3064516129032258, "eval_hamming_accuracy": 0.8130434782608695, "eval_loss": 0.7170208096504211, "eval_precision_micro": 0.30158730158730157, "eval_recall_micro": 0.3114754098360656, "eval_runtime": 0.8049, "eval_samples_per_second": 228.588, "eval_steps_per_second": 14.908, "step": 108 }, { "epoch": 1.3888888888888888, "grad_norm": 20.814834594726562, "learning_rate": 9.197530864197531e-06, "loss": 0.5513, "step": 150 }, { "epoch": 1.8518518518518519, "grad_norm": 15.15282154083252, "learning_rate": 1.228395061728395e-05, "loss": 0.4094, "step": 200 }, { "epoch": 2.0, "eval_f1_macro": 0.6753457587631049, "eval_f1_micro": 0.7657992565055762, "eval_hamming_accuracy": 0.9315217391304348, "eval_loss": 0.3277210593223572, "eval_precision_micro": 0.7006802721088435, "eval_recall_micro": 0.8442622950819673, "eval_runtime": 0.8136, "eval_samples_per_second": 226.16, "eval_steps_per_second": 14.75, "step": 216 }, { "epoch": 2.314814814814815, "grad_norm": 15.625266075134277, "learning_rate": 1.537037037037037e-05, "loss": 0.2508, "step": 250 }, { "epoch": 2.7777777777777777, "grad_norm": 14.964295387268066, "learning_rate": 1.8456790123456792e-05, "loss": 0.364, "step": 300 }, { "epoch": 3.0, "eval_f1_macro": 0.7302185347460938, "eval_f1_micro": 0.7072243346007605, "eval_hamming_accuracy": 0.9163043478260869, "eval_loss": 0.6868911981582642, "eval_precision_micro": 0.6595744680851063, "eval_recall_micro": 0.7622950819672131, "eval_runtime": 0.8049, "eval_samples_per_second": 228.614, "eval_steps_per_second": 14.91, "step": 324 }, { "epoch": 3.240740740740741, "grad_norm": 11.293182373046875, "learning_rate": 1.9727668845315906e-05, "loss": 0.2485, "step": 350 }, { "epoch": 3.7037037037037037, "grad_norm": 50.22275161743164, "learning_rate": 1.9183006535947716e-05, "loss": 0.1665, "step": 400 }, { "epoch": 4.0, "eval_f1_macro": 0.8837470449172576, "eval_f1_micro": 0.8755020080321285, "eval_hamming_accuracy": 0.966304347826087, "eval_loss": 0.3539002537727356, "eval_precision_micro": 0.8582677165354331, "eval_recall_micro": 0.8934426229508197, "eval_runtime": 0.7776, "eval_samples_per_second": 236.63, "eval_steps_per_second": 15.432, "step": 432 }, { "epoch": 4.166666666666667, "grad_norm": 0.16227640211582184, "learning_rate": 1.8638344226579522e-05, "loss": 0.1069, "step": 450 }, { "epoch": 4.62962962962963, "grad_norm": 10.707866668701172, "learning_rate": 1.809368191721133e-05, "loss": 0.0876, "step": 500 }, { "epoch": 5.0, "eval_f1_macro": 0.8056742217126731, "eval_f1_micro": 0.8093023255813954, "eval_hamming_accuracy": 0.9554347826086956, "eval_loss": 0.4993918538093567, "eval_precision_micro": 0.9354838709677419, "eval_recall_micro": 0.7131147540983607, "eval_runtime": 0.7714, "eval_samples_per_second": 238.515, "eval_steps_per_second": 15.555, "step": 540 }, { "epoch": 5.092592592592593, "grad_norm": 0.11287817358970642, "learning_rate": 1.7549019607843138e-05, "loss": 0.1097, "step": 550 }, { "epoch": 5.555555555555555, "grad_norm": 0.1228959709405899, "learning_rate": 1.7004357298474948e-05, "loss": 0.1002, "step": 600 }, { "epoch": 6.0, "eval_f1_macro": 0.8003939860255092, "eval_f1_micro": 0.8523206751054853, "eval_hamming_accuracy": 0.9619565217391304, "eval_loss": 0.34277206659317017, "eval_precision_micro": 0.8782608695652174, "eval_recall_micro": 0.8278688524590164, "eval_runtime": 0.7799, "eval_samples_per_second": 235.933, "eval_steps_per_second": 15.387, "step": 648 }, { "epoch": 6.018518518518518, "grad_norm": 1.964501976966858, "learning_rate": 1.6459694989106757e-05, "loss": 0.1086, "step": 650 }, { "epoch": 6.481481481481482, "grad_norm": 5.079234600067139, "learning_rate": 1.5915032679738563e-05, "loss": 0.0317, "step": 700 }, { "epoch": 6.944444444444445, "grad_norm": 0.015461280010640621, "learning_rate": 1.537037037037037e-05, "loss": 0.0337, "step": 750 }, { "epoch": 7.0, "eval_f1_macro": 0.8087484096381885, "eval_f1_micro": 0.8760330578512396, "eval_hamming_accuracy": 0.967391304347826, "eval_loss": 0.5436545014381409, "eval_precision_micro": 0.8833333333333333, "eval_recall_micro": 0.8688524590163934, "eval_runtime": 0.7919, "eval_samples_per_second": 232.353, "eval_steps_per_second": 15.153, "step": 756 }, { "epoch": 7.407407407407407, "grad_norm": 0.021498998627066612, "learning_rate": 1.4825708061002179e-05, "loss": 0.0692, "step": 800 }, { "epoch": 7.87037037037037, "grad_norm": 0.15338653326034546, "learning_rate": 1.4281045751633989e-05, "loss": 0.0329, "step": 850 }, { "epoch": 8.0, "eval_f1_macro": 0.7915487427320095, "eval_f1_micro": 0.8594377510040161, "eval_hamming_accuracy": 0.9619565217391304, "eval_loss": 0.5873637795448303, "eval_precision_micro": 0.84251968503937, "eval_recall_micro": 0.8770491803278688, "eval_runtime": 0.774, "eval_samples_per_second": 237.727, "eval_steps_per_second": 15.504, "step": 864 }, { "epoch": 8.333333333333334, "grad_norm": 0.38751789927482605, "learning_rate": 1.3736383442265797e-05, "loss": 0.0084, "step": 900 }, { "epoch": 8.796296296296296, "grad_norm": 0.0011671845568343997, "learning_rate": 1.3191721132897603e-05, "loss": 0.0076, "step": 950 }, { "epoch": 9.0, "eval_f1_macro": 0.8627879519336726, "eval_f1_micro": 0.8524590163934426, "eval_hamming_accuracy": 0.9608695652173913, "eval_loss": 0.47524958848953247, "eval_precision_micro": 0.8524590163934426, "eval_recall_micro": 0.8524590163934426, "eval_runtime": 0.7994, "eval_samples_per_second": 230.182, "eval_steps_per_second": 15.012, "step": 972 }, { "epoch": 9.25925925925926, "grad_norm": 0.002920223632827401, "learning_rate": 1.2647058823529412e-05, "loss": 0.0165, "step": 1000 }, { "epoch": 9.722222222222221, "grad_norm": 0.0009965004865080118, "learning_rate": 1.210239651416122e-05, "loss": 0.0026, "step": 1050 }, { "epoch": 10.0, "eval_f1_macro": 0.8722597314787984, "eval_f1_micro": 0.8688524590163934, "eval_hamming_accuracy": 0.9652173913043478, "eval_loss": 0.5498170256614685, "eval_precision_micro": 0.8688524590163934, "eval_recall_micro": 0.8688524590163934, "eval_runtime": 0.7755, "eval_samples_per_second": 237.267, "eval_steps_per_second": 15.474, "step": 1080 }, { "epoch": 10.185185185185185, "grad_norm": 0.0005692149861715734, "learning_rate": 1.155773420479303e-05, "loss": 0.0021, "step": 1100 }, { "epoch": 10.648148148148149, "grad_norm": 0.008419875986874104, "learning_rate": 1.1013071895424838e-05, "loss": 0.0161, "step": 1150 }, { "epoch": 11.0, "eval_f1_macro": 0.8881631826284233, "eval_f1_micro": 0.888, "eval_hamming_accuracy": 0.9695652173913043, "eval_loss": 0.4741128981113434, "eval_precision_micro": 0.8671875, "eval_recall_micro": 0.9098360655737705, "eval_runtime": 0.7789, "eval_samples_per_second": 236.228, "eval_steps_per_second": 15.406, "step": 1188 }, { "epoch": 11.11111111111111, "grad_norm": 0.0006177773466333747, "learning_rate": 1.0468409586056646e-05, "loss": 0.0001, "step": 1200 }, { "epoch": 11.574074074074074, "grad_norm": 9.912909507751465, "learning_rate": 9.923747276688453e-06, "loss": 0.0011, "step": 1250 }, { "epoch": 12.0, "eval_f1_macro": 0.8998624190985544, "eval_f1_micro": 0.888, "eval_hamming_accuracy": 0.9695652173913043, "eval_loss": 0.47868961095809937, "eval_precision_micro": 0.8671875, "eval_recall_micro": 0.9098360655737705, "eval_runtime": 0.8483, "eval_samples_per_second": 216.907, "eval_steps_per_second": 14.146, "step": 1296 }, { "epoch": 12.037037037037036, "grad_norm": 0.0006370625924319029, "learning_rate": 9.379084967320261e-06, "loss": 0.0002, "step": 1300 }, { "epoch": 12.5, "grad_norm": 0.0052326153963804245, "learning_rate": 8.834422657952071e-06, "loss": 0.0, "step": 1350 }, { "epoch": 12.962962962962964, "grad_norm": 0.002041223691776395, "learning_rate": 8.289760348583879e-06, "loss": 0.0029, "step": 1400 }, { "epoch": 13.0, "eval_f1_macro": 0.878162395977428, "eval_f1_micro": 0.8734693877551021, "eval_hamming_accuracy": 0.966304347826087, "eval_loss": 0.510221540927887, "eval_precision_micro": 0.8699186991869918, "eval_recall_micro": 0.8770491803278688, "eval_runtime": 0.801, "eval_samples_per_second": 229.721, "eval_steps_per_second": 14.982, "step": 1404 }, { "epoch": 13.425925925925926, "grad_norm": 0.0002072990027954802, "learning_rate": 7.745098039215687e-06, "loss": 0.0, "step": 1450 }, { "epoch": 13.88888888888889, "grad_norm": 0.004894016310572624, "learning_rate": 7.200435729847495e-06, "loss": 0.0, "step": 1500 }, { "epoch": 14.0, "eval_f1_macro": 0.878162395977428, "eval_f1_micro": 0.8734693877551021, "eval_hamming_accuracy": 0.966304347826087, "eval_loss": 0.5220283269882202, "eval_precision_micro": 0.8699186991869918, "eval_recall_micro": 0.8770491803278688, "eval_runtime": 0.7755, "eval_samples_per_second": 237.272, "eval_steps_per_second": 15.474, "step": 1512 }, { "epoch": 14.351851851851851, "grad_norm": 0.00022991809237282723, "learning_rate": 6.655773420479303e-06, "loss": 0.0, "step": 1550 }, { "epoch": 14.814814814814815, "grad_norm": 0.003730728989467025, "learning_rate": 6.111111111111112e-06, "loss": 0.0, "step": 1600 }, { "epoch": 15.0, "eval_f1_macro": 0.878162395977428, "eval_f1_micro": 0.8734693877551021, "eval_hamming_accuracy": 0.966304347826087, "eval_loss": 0.5267383456230164, "eval_precision_micro": 0.8699186991869918, "eval_recall_micro": 0.8770491803278688, "eval_runtime": 0.7738, "eval_samples_per_second": 237.782, "eval_steps_per_second": 15.507, "step": 1620 }, { "epoch": 15.277777777777779, "grad_norm": 0.0003092221450060606, "learning_rate": 5.56644880174292e-06, "loss": 0.0, "step": 1650 }, { "epoch": 15.74074074074074, "grad_norm": 0.0007665773155167699, "learning_rate": 5.021786492374729e-06, "loss": 0.0, "step": 1700 }, { "epoch": 16.0, "eval_f1_macro": 0.878162395977428, "eval_f1_micro": 0.8734693877551021, "eval_hamming_accuracy": 0.966304347826087, "eval_loss": 0.5296750068664551, "eval_precision_micro": 0.8699186991869918, "eval_recall_micro": 0.8770491803278688, "eval_runtime": 0.772, "eval_samples_per_second": 238.335, "eval_steps_per_second": 15.544, "step": 1728 }, { "epoch": 16.203703703703702, "grad_norm": 0.00021788894082419574, "learning_rate": 4.477124183006537e-06, "loss": 0.0, "step": 1750 }, { "epoch": 16.666666666666668, "grad_norm": 0.0010182112455368042, "learning_rate": 3.9324618736383445e-06, "loss": 0.0, "step": 1800 }, { "epoch": 17.0, "eval_f1_macro": 0.878162395977428, "eval_f1_micro": 0.8734693877551021, "eval_hamming_accuracy": 0.966304347826087, "eval_loss": 0.5321551561355591, "eval_precision_micro": 0.8699186991869918, "eval_recall_micro": 0.8770491803278688, "eval_runtime": 0.7878, "eval_samples_per_second": 233.558, "eval_steps_per_second": 15.232, "step": 1836 }, { "epoch": 17.12962962962963, "grad_norm": 0.00028640340315178037, "learning_rate": 3.387799564270153e-06, "loss": 0.0, "step": 1850 }, { "epoch": 17.59259259259259, "grad_norm": 3.951051621697843e-05, "learning_rate": 2.843137254901961e-06, "loss": 0.0, "step": 1900 }, { "epoch": 18.0, "eval_f1_macro": 0.878162395977428, "eval_f1_micro": 0.8734693877551021, "eval_hamming_accuracy": 0.966304347826087, "eval_loss": 0.533598780632019, "eval_precision_micro": 0.8699186991869918, "eval_recall_micro": 0.8770491803278688, "eval_runtime": 0.7821, "eval_samples_per_second": 235.258, "eval_steps_per_second": 15.343, "step": 1944 }, { "epoch": 18.055555555555557, "grad_norm": 0.00041217764373868704, "learning_rate": 2.2984749455337694e-06, "loss": 0.0, "step": 1950 }, { "epoch": 18.51851851851852, "grad_norm": 0.0014055465580895543, "learning_rate": 1.7538126361655775e-06, "loss": 0.0, "step": 2000 }, { "epoch": 18.98148148148148, "grad_norm": 0.00048187788343057036, "learning_rate": 1.2091503267973858e-06, "loss": 0.0, "step": 2050 }, { "epoch": 19.0, "eval_f1_macro": 0.878162395977428, "eval_f1_micro": 0.8734693877551021, "eval_hamming_accuracy": 0.966304347826087, "eval_loss": 0.5342282652854919, "eval_precision_micro": 0.8699186991869918, "eval_recall_micro": 0.8770491803278688, "eval_runtime": 0.7786, "eval_samples_per_second": 236.331, "eval_steps_per_second": 15.413, "step": 2052 }, { "epoch": 19.444444444444443, "grad_norm": 0.0001349859667243436, "learning_rate": 6.64488017429194e-07, "loss": 0.0, "step": 2100 }, { "epoch": 19.90740740740741, "grad_norm": 3.7313504435587674e-05, "learning_rate": 1.1982570806100218e-07, "loss": 0.0, "step": 2150 }, { "epoch": 20.0, "eval_f1_macro": 0.878162395977428, "eval_f1_micro": 0.8734693877551021, "eval_hamming_accuracy": 0.966304347826087, "eval_loss": 0.5344623327255249, "eval_precision_micro": 0.8699186991869918, "eval_recall_micro": 0.8770491803278688, "eval_runtime": 0.7905, "eval_samples_per_second": 232.759, "eval_steps_per_second": 15.18, "step": 2160 } ], "logging_steps": 50, "max_steps": 2160, "num_input_tokens_seen": 0, "num_train_epochs": 20, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.288650996406848e+16, "train_batch_size": 8, "trial_name": null, "trial_params": null }