{ "best_metric": 0.23246362805366516, "best_model_checkpoint": "./fine-tuned/checkpoint-2000", "epoch": 3.99667497921862, "eval_steps": 100, "global_step": 2404, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0831255195344971, "grad_norm": 36186.3984375, "learning_rate": 2.937603993344426e-05, "loss": 1.1504, "step": 50 }, { "epoch": 0.1662510390689942, "grad_norm": 26857.228515625, "learning_rate": 2.8752079866888522e-05, "loss": 0.4993, "step": 100 }, { "epoch": 0.1662510390689942, "eval_loss": 0.35283052921295166, "eval_runtime": 38.0523, "eval_samples_per_second": 12.982, "eval_steps_per_second": 1.629, "step": 100 }, { "epoch": 0.24937655860349128, "grad_norm": 53959.0078125, "learning_rate": 2.812811980033278e-05, "loss": 0.4236, "step": 150 }, { "epoch": 0.3325020781379884, "grad_norm": 32498.453125, "learning_rate": 2.7504159733777037e-05, "loss": 0.376, "step": 200 }, { "epoch": 0.3325020781379884, "eval_loss": 0.30865946412086487, "eval_runtime": 38.2162, "eval_samples_per_second": 12.926, "eval_steps_per_second": 1.622, "step": 200 }, { "epoch": 0.41562759767248547, "grad_norm": 32577.630859375, "learning_rate": 2.6880199667221298e-05, "loss": 0.3822, "step": 250 }, { "epoch": 0.49875311720698257, "grad_norm": 28815.4609375, "learning_rate": 2.625623960066556e-05, "loss": 0.3569, "step": 300 }, { "epoch": 0.49875311720698257, "eval_loss": 0.2891499996185303, "eval_runtime": 38.1858, "eval_samples_per_second": 12.937, "eval_steps_per_second": 1.624, "step": 300 }, { "epoch": 0.5818786367414797, "grad_norm": 33334.5546875, "learning_rate": 2.563227953410982e-05, "loss": 0.3414, "step": 350 }, { "epoch": 0.6650041562759768, "grad_norm": 23824.673828125, "learning_rate": 2.5008319467554077e-05, "loss": 0.3671, "step": 400 }, { "epoch": 0.6650041562759768, "eval_loss": 0.2757515609264374, "eval_runtime": 38.2247, "eval_samples_per_second": 12.924, "eval_steps_per_second": 1.622, "step": 400 }, { "epoch": 0.7481296758104738, "grad_norm": 24386.048828125, "learning_rate": 2.4384359400998338e-05, "loss": 0.3234, "step": 450 }, { "epoch": 0.8312551953449709, "grad_norm": 19840.703125, "learning_rate": 2.3760399334442595e-05, "loss": 0.3299, "step": 500 }, { "epoch": 0.8312551953449709, "eval_loss": 0.2673098146915436, "eval_runtime": 38.0207, "eval_samples_per_second": 12.993, "eval_steps_per_second": 1.631, "step": 500 }, { "epoch": 0.914380714879468, "grad_norm": 37282.65625, "learning_rate": 2.3136439267886856e-05, "loss": 0.3215, "step": 550 }, { "epoch": 0.9975062344139651, "grad_norm": 21197.52734375, "learning_rate": 2.2512479201331116e-05, "loss": 0.3407, "step": 600 }, { "epoch": 0.9975062344139651, "eval_loss": 0.26070085167884827, "eval_runtime": 37.7694, "eval_samples_per_second": 13.079, "eval_steps_per_second": 1.642, "step": 600 }, { "epoch": 1.0806317539484622, "grad_norm": 21656.6328125, "learning_rate": 2.1888519134775374e-05, "loss": 0.3191, "step": 650 }, { "epoch": 1.1637572734829593, "grad_norm": 22402.8125, "learning_rate": 2.1264559068219635e-05, "loss": 0.299, "step": 700 }, { "epoch": 1.1637572734829593, "eval_loss": 0.256939560174942, "eval_runtime": 38.0385, "eval_samples_per_second": 12.987, "eval_steps_per_second": 1.63, "step": 700 }, { "epoch": 1.2468827930174564, "grad_norm": 49383.11328125, "learning_rate": 2.0640599001663895e-05, "loss": 0.3131, "step": 750 }, { "epoch": 1.3300083125519535, "grad_norm": 27295.173828125, "learning_rate": 2.0016638935108153e-05, "loss": 0.3149, "step": 800 }, { "epoch": 1.3300083125519535, "eval_loss": 0.2525966763496399, "eval_runtime": 38.1831, "eval_samples_per_second": 12.938, "eval_steps_per_second": 1.624, "step": 800 }, { "epoch": 1.4131338320864506, "grad_norm": 15751.75390625, "learning_rate": 1.9392678868552414e-05, "loss": 0.2845, "step": 850 }, { "epoch": 1.4962593516209477, "grad_norm": 25327.384765625, "learning_rate": 1.8768718801996674e-05, "loss": 0.2945, "step": 900 }, { "epoch": 1.4962593516209477, "eval_loss": 0.24994711577892303, "eval_runtime": 37.4678, "eval_samples_per_second": 13.185, "eval_steps_per_second": 1.655, "step": 900 }, { "epoch": 1.5793848711554448, "grad_norm": 25815.150390625, "learning_rate": 1.8144758735440932e-05, "loss": 0.3005, "step": 950 }, { "epoch": 1.6625103906899419, "grad_norm": 24868.796875, "learning_rate": 1.7520798668885192e-05, "loss": 0.3129, "step": 1000 }, { "epoch": 1.6625103906899419, "eval_loss": 0.246443971991539, "eval_runtime": 37.8324, "eval_samples_per_second": 13.058, "eval_steps_per_second": 1.639, "step": 1000 }, { "epoch": 1.745635910224439, "grad_norm": 32112.76171875, "learning_rate": 1.6896838602329453e-05, "loss": 0.2959, "step": 1050 }, { "epoch": 1.828761429758936, "grad_norm": 22870.244140625, "learning_rate": 1.627287853577371e-05, "loss": 0.2896, "step": 1100 }, { "epoch": 1.828761429758936, "eval_loss": 0.24409395456314087, "eval_runtime": 37.7821, "eval_samples_per_second": 13.075, "eval_steps_per_second": 1.641, "step": 1100 }, { "epoch": 1.9118869492934332, "grad_norm": 30326.173828125, "learning_rate": 1.564891846921797e-05, "loss": 0.287, "step": 1150 }, { "epoch": 1.9950124688279303, "grad_norm": 20373.96875, "learning_rate": 1.502495840266223e-05, "loss": 0.2847, "step": 1200 }, { "epoch": 1.9950124688279303, "eval_loss": 0.2411041557788849, "eval_runtime": 37.3119, "eval_samples_per_second": 13.24, "eval_steps_per_second": 1.662, "step": 1200 }, { "epoch": 2.0781379883624274, "grad_norm": 13517.228515625, "learning_rate": 1.440099833610649e-05, "loss": 0.2754, "step": 1250 }, { "epoch": 2.1612635078969245, "grad_norm": 37159.10546875, "learning_rate": 1.3777038269550749e-05, "loss": 0.2841, "step": 1300 }, { "epoch": 2.1612635078969245, "eval_loss": 0.23963774740695953, "eval_runtime": 37.7867, "eval_samples_per_second": 13.073, "eval_steps_per_second": 1.641, "step": 1300 }, { "epoch": 2.2443890274314215, "grad_norm": 13807.1201171875, "learning_rate": 1.315307820299501e-05, "loss": 0.2831, "step": 1350 }, { "epoch": 2.3275145469659186, "grad_norm": 70717.4296875, "learning_rate": 1.2529118136439268e-05, "loss": 0.2926, "step": 1400 }, { "epoch": 2.3275145469659186, "eval_loss": 0.23829442262649536, "eval_runtime": 37.7411, "eval_samples_per_second": 13.089, "eval_steps_per_second": 1.643, "step": 1400 }, { "epoch": 2.4106400665004157, "grad_norm": 20201.111328125, "learning_rate": 1.1905158069883528e-05, "loss": 0.285, "step": 1450 }, { "epoch": 2.493765586034913, "grad_norm": 28600.62109375, "learning_rate": 1.1281198003327787e-05, "loss": 0.2593, "step": 1500 }, { "epoch": 2.493765586034913, "eval_loss": 0.2369847148656845, "eval_runtime": 38.1397, "eval_samples_per_second": 12.952, "eval_steps_per_second": 1.626, "step": 1500 }, { "epoch": 2.57689110556941, "grad_norm": 22271.5, "learning_rate": 1.0657237936772047e-05, "loss": 0.2684, "step": 1550 }, { "epoch": 2.660016625103907, "grad_norm": 17982.9140625, "learning_rate": 1.0033277870216307e-05, "loss": 0.2753, "step": 1600 }, { "epoch": 2.660016625103907, "eval_loss": 0.23503336310386658, "eval_runtime": 37.7531, "eval_samples_per_second": 13.085, "eval_steps_per_second": 1.642, "step": 1600 }, { "epoch": 2.743142144638404, "grad_norm": 20275.65625, "learning_rate": 9.409317803660566e-06, "loss": 0.282, "step": 1650 }, { "epoch": 2.826267664172901, "grad_norm": 21899.2109375, "learning_rate": 8.785357737104826e-06, "loss": 0.2699, "step": 1700 }, { "epoch": 2.826267664172901, "eval_loss": 0.23422521352767944, "eval_runtime": 37.8729, "eval_samples_per_second": 13.044, "eval_steps_per_second": 1.637, "step": 1700 }, { "epoch": 2.9093931837073983, "grad_norm": 17812.615234375, "learning_rate": 8.161397670549084e-06, "loss": 0.277, "step": 1750 }, { "epoch": 2.9925187032418954, "grad_norm": 18110.896484375, "learning_rate": 7.5374376039933445e-06, "loss": 0.2673, "step": 1800 }, { "epoch": 2.9925187032418954, "eval_loss": 0.23330263793468475, "eval_runtime": 37.8678, "eval_samples_per_second": 13.045, "eval_steps_per_second": 1.637, "step": 1800 }, { "epoch": 3.0756442227763925, "grad_norm": 64865.62109375, "learning_rate": 6.913477537437604e-06, "loss": 0.2742, "step": 1850 }, { "epoch": 3.1587697423108896, "grad_norm": 22536.302734375, "learning_rate": 6.289517470881864e-06, "loss": 0.2723, "step": 1900 }, { "epoch": 3.1587697423108896, "eval_loss": 0.23302872478961945, "eval_runtime": 38.1436, "eval_samples_per_second": 12.951, "eval_steps_per_second": 1.625, "step": 1900 }, { "epoch": 3.2418952618453867, "grad_norm": 26661.78125, "learning_rate": 5.6655574043261234e-06, "loss": 0.273, "step": 1950 }, { "epoch": 3.3250207813798838, "grad_norm": 39719.59765625, "learning_rate": 5.0415973377703825e-06, "loss": 0.2746, "step": 2000 }, { "epoch": 3.3250207813798838, "eval_loss": 0.23246362805366516, "eval_runtime": 37.97, "eval_samples_per_second": 13.01, "eval_steps_per_second": 1.633, "step": 2000 }, { "epoch": 3.408146300914381, "grad_norm": 19064.091796875, "learning_rate": 4.4176372712146424e-06, "loss": 0.2531, "step": 2050 }, { "epoch": 3.491271820448878, "grad_norm": 24487.681640625, "learning_rate": 3.793677204658902e-06, "loss": 0.2763, "step": 2100 }, { "epoch": 3.491271820448878, "eval_loss": 0.23180559277534485, "eval_runtime": 37.9308, "eval_samples_per_second": 13.024, "eval_steps_per_second": 1.635, "step": 2100 }, { "epoch": 3.574397339983375, "grad_norm": 33160.66015625, "learning_rate": 3.1697171381031614e-06, "loss": 0.2706, "step": 2150 }, { "epoch": 3.657522859517872, "grad_norm": 20284.03515625, "learning_rate": 2.545757071547421e-06, "loss": 0.2521, "step": 2200 }, { "epoch": 3.657522859517872, "eval_loss": 0.23114623129367828, "eval_runtime": 38.1017, "eval_samples_per_second": 12.965, "eval_steps_per_second": 1.627, "step": 2200 }, { "epoch": 3.7406483790523692, "grad_norm": 55974.03125, "learning_rate": 1.9217970049916804e-06, "loss": 0.2542, "step": 2250 }, { "epoch": 3.8237738985868663, "grad_norm": 18724.478515625, "learning_rate": 1.2978369384359402e-06, "loss": 0.2684, "step": 2300 }, { "epoch": 3.8237738985868663, "eval_loss": 0.23083852231502533, "eval_runtime": 38.0527, "eval_samples_per_second": 12.982, "eval_steps_per_second": 1.629, "step": 2300 }, { "epoch": 3.9068994181213634, "grad_norm": 26152.619140625, "learning_rate": 6.738768718801997e-07, "loss": 0.2582, "step": 2350 }, { "epoch": 3.9900249376558605, "grad_norm": 20345.572265625, "learning_rate": 4.9916805324459236e-08, "loss": 0.2529, "step": 2400 }, { "epoch": 3.9900249376558605, "eval_loss": 0.23079748451709747, "eval_runtime": 37.7401, "eval_samples_per_second": 13.09, "eval_steps_per_second": 1.643, "step": 2400 } ], "logging_steps": 50, "max_steps": 2404, "num_input_tokens_seen": 0, "num_train_epochs": 4, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2.342112942882816e+16, "train_batch_size": 8, "trial_name": null, "trial_params": null }