{ "best_global_step": 2000, "best_metric": 0.33800485730171204, "best_model_checkpoint": "Assignment4_Distilled_ModernBERT/run-4/checkpoint-2000", "epoch": 4.1928721174004195, "eval_steps": 100, "global_step": 2000, "is_hyper_param_search": true, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.20964360587002095, "grad_norm": 11.635396957397461, "learning_rate": 5.974527645836366e-05, "loss": 6.4293, "step": 100 }, { "epoch": 0.20964360587002095, "eval_accuracy": 0.7254838709677419, "eval_loss": 3.423161268234253, "eval_runtime": 24.7311, "eval_samples_per_second": 125.348, "eval_steps_per_second": 15.689, "step": 100 }, { "epoch": 0.4192872117400419, "grad_norm": 7.08547830581665, "learning_rate": 5.897521628664326e-05, "loss": 2.4649, "step": 200 }, { "epoch": 0.4192872117400419, "eval_accuracy": 0.9012903225806451, "eval_loss": 1.6392749547958374, "eval_runtime": 24.334, "eval_samples_per_second": 127.394, "eval_steps_per_second": 15.945, "step": 200 }, { "epoch": 0.6289308176100629, "grad_norm": 2.8314898014068604, "learning_rate": 5.7703135417326854e-05, "loss": 1.364, "step": 300 }, { "epoch": 0.6289308176100629, "eval_accuracy": 0.9361290322580645, "eval_loss": 1.0801855325698853, "eval_runtime": 24.4718, "eval_samples_per_second": 126.676, "eval_steps_per_second": 15.855, "step": 300 }, { "epoch": 0.8385744234800838, "grad_norm": 11.189908981323242, "learning_rate": 5.595107375234632e-05, "loss": 0.9328, "step": 400 }, { "epoch": 0.8385744234800838, "eval_accuracy": 0.9506451612903226, "eval_loss": 0.8573997616767883, "eval_runtime": 24.7134, "eval_samples_per_second": 125.438, "eval_steps_per_second": 15.7, "step": 400 }, { "epoch": 1.0482180293501049, "grad_norm": 9.433755874633789, "learning_rate": 5.3749387276298395e-05, "loss": 0.708, "step": 500 }, { "epoch": 1.0482180293501049, "eval_accuracy": 0.9564516129032258, "eval_loss": 0.6847298741340637, "eval_runtime": 24.9978, "eval_samples_per_second": 124.011, "eval_steps_per_second": 15.521, "step": 500 }, { "epoch": 1.2578616352201257, "grad_norm": 1.7648718357086182, "learning_rate": 5.113622211274119e-05, "loss": 0.4702, "step": 600 }, { "epoch": 1.2578616352201257, "eval_accuracy": 0.9616129032258065, "eval_loss": 0.5767695307731628, "eval_runtime": 25.0749, "eval_samples_per_second": 123.63, "eval_steps_per_second": 15.474, "step": 600 }, { "epoch": 1.4675052410901468, "grad_norm": 2.782994031906128, "learning_rate": 4.815685360959153e-05, "loss": 0.4294, "step": 700 }, { "epoch": 1.4675052410901468, "eval_accuracy": 0.9583870967741935, "eval_loss": 0.5501284599304199, "eval_runtime": 24.8053, "eval_samples_per_second": 124.973, "eval_steps_per_second": 15.642, "step": 700 }, { "epoch": 1.6771488469601676, "grad_norm": 1.1726200580596924, "learning_rate": 4.486290190454053e-05, "loss": 0.3644, "step": 800 }, { "epoch": 1.6771488469601676, "eval_accuracy": 0.964516129032258, "eval_loss": 0.48550063371658325, "eval_runtime": 25.3011, "eval_samples_per_second": 122.524, "eval_steps_per_second": 15.335, "step": 800 }, { "epoch": 1.8867924528301887, "grad_norm": 11.229223251342773, "learning_rate": 4.13114375614956e-05, "loss": 0.3547, "step": 900 }, { "epoch": 1.8867924528301887, "eval_accuracy": 0.9664516129032258, "eval_loss": 0.46239548921585083, "eval_runtime": 25.1571, "eval_samples_per_second": 123.226, "eval_steps_per_second": 15.423, "step": 900 }, { "epoch": 2.0964360587002098, "grad_norm": 1.082562804222107, "learning_rate": 3.756399277367126e-05, "loss": 0.2973, "step": 1000 }, { "epoch": 2.0964360587002098, "eval_accuracy": 0.967741935483871, "eval_loss": 0.4247872829437256, "eval_runtime": 31.4174, "eval_samples_per_second": 98.671, "eval_steps_per_second": 12.35, "step": 1000 }, { "epoch": 2.3060796645702304, "grad_norm": 1.0132005214691162, "learning_rate": 3.36854952650912e-05, "loss": 0.2349, "step": 1100 }, { "epoch": 2.3060796645702304, "eval_accuracy": 0.9703225806451613, "eval_loss": 0.3974890410900116, "eval_runtime": 29.8794, "eval_samples_per_second": 103.75, "eval_steps_per_second": 12.986, "step": 1100 }, { "epoch": 2.5157232704402515, "grad_norm": 1.2016338109970093, "learning_rate": 2.9743143361580517e-05, "loss": 0.2317, "step": 1200 }, { "epoch": 2.5157232704402515, "eval_accuracy": 0.9683870967741935, "eval_loss": 0.3919287621974945, "eval_runtime": 26.2852, "eval_samples_per_second": 117.937, "eval_steps_per_second": 14.761, "step": 1200 }, { "epoch": 2.7253668763102725, "grad_norm": 0.9690835475921631, "learning_rate": 2.5805241721616645e-05, "loss": 0.2045, "step": 1300 }, { "epoch": 2.7253668763102725, "eval_accuracy": 0.97, "eval_loss": 0.37409406900405884, "eval_runtime": 25.1471, "eval_samples_per_second": 123.275, "eval_steps_per_second": 15.429, "step": 1300 }, { "epoch": 2.9350104821802936, "grad_norm": 0.7975201606750488, "learning_rate": 2.1940017899009786e-05, "loss": 0.197, "step": 1400 }, { "epoch": 2.9350104821802936, "eval_accuracy": 0.9690322580645161, "eval_loss": 0.36775872111320496, "eval_runtime": 24.7763, "eval_samples_per_second": 125.12, "eval_steps_per_second": 15.66, "step": 1400 }, { "epoch": 3.1446540880503147, "grad_norm": 0.850993275642395, "learning_rate": 1.8214440241488357e-05, "loss": 0.1785, "step": 1500 }, { "epoch": 3.1446540880503147, "eval_accuracy": 0.9696774193548388, "eval_loss": 0.35433417558670044, "eval_runtime": 24.6341, "eval_samples_per_second": 125.842, "eval_steps_per_second": 15.751, "step": 1500 }, { "epoch": 3.3542976939203353, "grad_norm": 0.8356935977935791, "learning_rate": 1.4693057606119536e-05, "loss": 0.1645, "step": 1600 }, { "epoch": 3.3542976939203353, "eval_accuracy": 0.9703225806451613, "eval_loss": 0.35072386264801025, "eval_runtime": 24.97, "eval_samples_per_second": 124.149, "eval_steps_per_second": 15.539, "step": 1600 }, { "epoch": 3.5639412997903563, "grad_norm": 0.5763674974441528, "learning_rate": 1.143688099449935e-05, "loss": 0.1576, "step": 1700 }, { "epoch": 3.5639412997903563, "eval_accuracy": 0.9680645161290322, "eval_loss": 0.3471537232398987, "eval_runtime": 24.9021, "eval_samples_per_second": 124.488, "eval_steps_per_second": 15.581, "step": 1700 }, { "epoch": 3.7735849056603774, "grad_norm": 0.7285761833190918, "learning_rate": 8.502326484350608e-06, "loss": 0.1553, "step": 1800 }, { "epoch": 3.7735849056603774, "eval_accuracy": 0.9693548387096774, "eval_loss": 0.3421308100223541, "eval_runtime": 25.0889, "eval_samples_per_second": 123.561, "eval_steps_per_second": 15.465, "step": 1800 }, { "epoch": 3.9832285115303985, "grad_norm": 0.9058928489685059, "learning_rate": 5.940237772153215e-06, "loss": 0.155, "step": 1900 }, { "epoch": 3.9832285115303985, "eval_accuracy": 0.97, "eval_loss": 0.3395131826400757, "eval_runtime": 25.8266, "eval_samples_per_second": 120.031, "eval_steps_per_second": 15.023, "step": 1900 }, { "epoch": 4.1928721174004195, "grad_norm": 0.5501458048820496, "learning_rate": 3.795005262102136e-06, "loss": 0.1413, "step": 2000 }, { "epoch": 4.1928721174004195, "eval_accuracy": 0.9696774193548388, "eval_loss": 0.33800485730171204, "eval_runtime": 28.266, "eval_samples_per_second": 109.673, "eval_steps_per_second": 13.727, "step": 2000 } ], "logging_steps": 100, "max_steps": 2385, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 896601881570736.0, "train_batch_size": 32, "trial_name": null, "trial_params": { "alpha": 0.5611126477660928, "num_train_epochs": 5, "temperature": 7 } }