{ "best_global_step": 2000, "best_metric": 0.34400272369384766, "best_model_checkpoint": "Assignment4_Distilled_ModernBERT/run-2/checkpoint-2000", "epoch": 4.1928721174004195, "eval_steps": 100, "global_step": 2000, "is_hyper_param_search": true, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.20964360587002095, "grad_norm": 15.277114868164062, "learning_rate": 5.99213045445736e-05, "loss": 6.7857, "step": 100 }, { "epoch": 0.20964360587002095, "eval_accuracy": 0.7109677419354838, "eval_loss": 3.844492197036743, "eval_runtime": 24.4798, "eval_samples_per_second": 126.635, "eval_steps_per_second": 15.85, "step": 100 }, { "epoch": 0.4192872117400419, "grad_norm": 9.93507194519043, "learning_rate": 5.968245320843699e-05, "loss": 2.8413, "step": 200 }, { "epoch": 0.4192872117400419, "eval_accuracy": 0.8938709677419355, "eval_loss": 1.8548765182495117, "eval_runtime": 24.5612, "eval_samples_per_second": 126.215, "eval_steps_per_second": 15.797, "step": 200 }, { "epoch": 0.6289308176100629, "grad_norm": 3.672398805618286, "learning_rate": 5.928471649139723e-05, "loss": 1.4945, "step": 300 }, { "epoch": 0.6289308176100629, "eval_accuracy": 0.94, "eval_loss": 1.1517655849456787, "eval_runtime": 24.6593, "eval_samples_per_second": 125.713, "eval_steps_per_second": 15.734, "step": 300 }, { "epoch": 0.8385744234800838, "grad_norm": 9.015824317932129, "learning_rate": 5.87302234139009e-05, "loss": 1.0247, "step": 400 }, { "epoch": 0.8385744234800838, "eval_accuracy": 0.9532258064516129, "eval_loss": 0.9113911390304565, "eval_runtime": 24.4669, "eval_samples_per_second": 126.702, "eval_steps_per_second": 15.858, "step": 400 }, { "epoch": 1.0482180293501049, "grad_norm": 13.825676918029785, "learning_rate": 5.802194208788969e-05, "loss": 0.7584, "step": 500 }, { "epoch": 1.0482180293501049, "eval_accuracy": 0.9558064516129032, "eval_loss": 0.7309688329696655, "eval_runtime": 24.9176, "eval_samples_per_second": 124.41, "eval_steps_per_second": 15.571, "step": 500 }, { "epoch": 1.2578616352201257, "grad_norm": 2.16816782951355, "learning_rate": 5.716366382897622e-05, "loss": 0.5289, "step": 600 }, { "epoch": 1.2578616352201257, "eval_accuracy": 0.9616129032258065, "eval_loss": 0.6186437606811523, "eval_runtime": 24.8855, "eval_samples_per_second": 124.571, "eval_steps_per_second": 15.591, "step": 600 }, { "epoch": 1.4675052410901468, "grad_norm": 3.3350861072540283, "learning_rate": 5.6159982862143515e-05, "loss": 0.4857, "step": 700 }, { "epoch": 1.4675052410901468, "eval_accuracy": 0.96, "eval_loss": 0.5846336483955383, "eval_runtime": 24.8682, "eval_samples_per_second": 124.657, "eval_steps_per_second": 15.602, "step": 700 }, { "epoch": 1.6771488469601676, "grad_norm": 1.4833406209945679, "learning_rate": 5.5016271729600304e-05, "loss": 0.4169, "step": 800 }, { "epoch": 1.6771488469601676, "eval_accuracy": 0.9638709677419355, "eval_loss": 0.5170760750770569, "eval_runtime": 31.1132, "eval_samples_per_second": 99.636, "eval_steps_per_second": 12.471, "step": 800 }, { "epoch": 1.8867924528301887, "grad_norm": 8.999539375305176, "learning_rate": 5.3738652532429715e-05, "loss": 0.3953, "step": 900 }, { "epoch": 1.8867924528301887, "eval_accuracy": 0.9683870967741935, "eval_loss": 0.4950183629989624, "eval_runtime": 25.9065, "eval_samples_per_second": 119.661, "eval_steps_per_second": 14.977, "step": 900 }, { "epoch": 2.0964360587002098, "grad_norm": 1.5073931217193604, "learning_rate": 5.2333964159970384e-05, "loss": 0.3393, "step": 1000 }, { "epoch": 2.0964360587002098, "eval_accuracy": 0.9674193548387097, "eval_loss": 0.45995765924453735, "eval_runtime": 25.1127, "eval_samples_per_second": 123.444, "eval_steps_per_second": 15.45, "step": 1000 }, { "epoch": 2.3060796645702304, "grad_norm": 1.204892873764038, "learning_rate": 5.080972568234569e-05, "loss": 0.2747, "step": 1100 }, { "epoch": 2.3060796645702304, "eval_accuracy": 0.9661290322580646, "eval_loss": 0.44675561785697937, "eval_runtime": 24.863, "eval_samples_per_second": 124.683, "eval_steps_per_second": 15.606, "step": 1100 }, { "epoch": 2.5157232704402515, "grad_norm": 1.581334114074707, "learning_rate": 4.9174096102095087e-05, "loss": 0.2703, "step": 1200 }, { "epoch": 2.5157232704402515, "eval_accuracy": 0.9664516129032258, "eval_loss": 0.43446871638298035, "eval_runtime": 24.788, "eval_samples_per_second": 125.061, "eval_steps_per_second": 15.653, "step": 1200 }, { "epoch": 2.7253668763102725, "grad_norm": 1.4468449354171753, "learning_rate": 4.7435830680350456e-05, "loss": 0.2482, "step": 1300 }, { "epoch": 2.7253668763102725, "eval_accuracy": 0.967741935483871, "eval_loss": 0.4083913564682007, "eval_runtime": 30.9276, "eval_samples_per_second": 100.234, "eval_steps_per_second": 12.545, "step": 1300 }, { "epoch": 2.9350104821802936, "grad_norm": 1.0635446310043335, "learning_rate": 4.5604234071336463e-05, "loss": 0.2342, "step": 1400 }, { "epoch": 2.9350104821802936, "eval_accuracy": 0.9674193548387097, "eval_loss": 0.3973832428455353, "eval_runtime": 25.2744, "eval_samples_per_second": 122.654, "eval_steps_per_second": 15.351, "step": 1400 }, { "epoch": 3.1446540880503147, "grad_norm": 1.0511268377304077, "learning_rate": 4.368911051605842e-05, "loss": 0.2073, "step": 1500 }, { "epoch": 3.1446540880503147, "eval_accuracy": 0.9680645161290322, "eval_loss": 0.3778529465198517, "eval_runtime": 24.3903, "eval_samples_per_second": 127.1, "eval_steps_per_second": 15.908, "step": 1500 }, { "epoch": 3.3542976939203353, "grad_norm": 1.3159152269363403, "learning_rate": 4.1700711361782675e-05, "loss": 0.1973, "step": 1600 }, { "epoch": 3.3542976939203353, "eval_accuracy": 0.9696774193548388, "eval_loss": 0.3735784590244293, "eval_runtime": 24.7557, "eval_samples_per_second": 125.224, "eval_steps_per_second": 15.673, "step": 1600 }, { "epoch": 3.5639412997903563, "grad_norm": 0.8998211622238159, "learning_rate": 3.9649680188229416e-05, "loss": 0.1876, "step": 1700 }, { "epoch": 3.5639412997903563, "eval_accuracy": 0.9696774193548388, "eval_loss": 0.3719204366207123, "eval_runtime": 25.0523, "eval_samples_per_second": 123.741, "eval_steps_per_second": 15.488, "step": 1700 }, { "epoch": 3.7735849056603774, "grad_norm": 1.0484949350357056, "learning_rate": 3.754699583420843e-05, "loss": 0.1836, "step": 1800 }, { "epoch": 3.7735849056603774, "eval_accuracy": 0.9703225806451613, "eval_loss": 0.35943037271499634, "eval_runtime": 29.7226, "eval_samples_per_second": 104.298, "eval_steps_per_second": 13.054, "step": 1800 }, { "epoch": 3.9832285115303985, "grad_norm": 1.2159234285354614, "learning_rate": 3.5403913629667045e-05, "loss": 0.184, "step": 1900 }, { "epoch": 3.9832285115303985, "eval_accuracy": 0.9709677419354839, "eval_loss": 0.35340631008148193, "eval_runtime": 26.1611, "eval_samples_per_second": 118.496, "eval_steps_per_second": 14.831, "step": 1900 }, { "epoch": 4.1928721174004195, "grad_norm": 0.8908376097679138, "learning_rate": 3.323190514772574e-05, "loss": 0.1558, "step": 2000 }, { "epoch": 4.1928721174004195, "eval_accuracy": 0.97, "eval_loss": 0.34400272369384766, "eval_runtime": 24.8773, "eval_samples_per_second": 124.612, "eval_steps_per_second": 15.597, "step": 2000 } ], "logging_steps": 100, "max_steps": 4293, "num_input_tokens_seen": 0, "num_train_epochs": 9, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 896601881570736.0, "train_batch_size": 32, "trial_name": null, "trial_params": { "alpha": 0.497477897443408, "num_train_epochs": 9, "temperature": 7 } }