{
  "best_metric": 0.23246362805366516,
  "best_model_checkpoint": "./fine-tuned/checkpoint-2000",
  "epoch": 3.99667497921862,
  "eval_steps": 100,
  "global_step": 2404,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.0831255195344971,
      "grad_norm": 36186.3984375,
      "learning_rate": 2.937603993344426e-05,
      "loss": 1.1504,
      "step": 50
    },
    {
      "epoch": 0.1662510390689942,
      "grad_norm": 26857.228515625,
      "learning_rate": 2.8752079866888522e-05,
      "loss": 0.4993,
      "step": 100
    },
    {
      "epoch": 0.1662510390689942,
      "eval_loss": 0.35283052921295166,
      "eval_runtime": 38.0523,
      "eval_samples_per_second": 12.982,
      "eval_steps_per_second": 1.629,
      "step": 100
    },
    {
      "epoch": 0.24937655860349128,
      "grad_norm": 53959.0078125,
      "learning_rate": 2.812811980033278e-05,
      "loss": 0.4236,
      "step": 150
    },
    {
      "epoch": 0.3325020781379884,
      "grad_norm": 32498.453125,
      "learning_rate": 2.7504159733777037e-05,
      "loss": 0.376,
      "step": 200
    },
    {
      "epoch": 0.3325020781379884,
      "eval_loss": 0.30865946412086487,
      "eval_runtime": 38.2162,
      "eval_samples_per_second": 12.926,
      "eval_steps_per_second": 1.622,
      "step": 200
    },
    {
      "epoch": 0.41562759767248547,
      "grad_norm": 32577.630859375,
      "learning_rate": 2.6880199667221298e-05,
      "loss": 0.3822,
      "step": 250
    },
    {
      "epoch": 0.49875311720698257,
      "grad_norm": 28815.4609375,
      "learning_rate": 2.625623960066556e-05,
      "loss": 0.3569,
      "step": 300
    },
    {
      "epoch": 0.49875311720698257,
      "eval_loss": 0.2891499996185303,
      "eval_runtime": 38.1858,
      "eval_samples_per_second": 12.937,
      "eval_steps_per_second": 1.624,
      "step": 300
    },
    {
      "epoch": 0.5818786367414797,
      "grad_norm": 33334.5546875,
      "learning_rate": 2.563227953410982e-05,
      "loss": 0.3414,
      "step": 350
    },
    {
      "epoch": 0.6650041562759768,
      "grad_norm": 23824.673828125,
      "learning_rate": 2.5008319467554077e-05,
      "loss": 0.3671,
      "step": 400
    },
    {
      "epoch": 0.6650041562759768,
      "eval_loss": 0.2757515609264374,
      "eval_runtime": 38.2247,
      "eval_samples_per_second": 12.924,
      "eval_steps_per_second": 1.622,
      "step": 400
    },
    {
      "epoch": 0.7481296758104738,
      "grad_norm": 24386.048828125,
      "learning_rate": 2.4384359400998338e-05,
      "loss": 0.3234,
      "step": 450
    },
    {
      "epoch": 0.8312551953449709,
      "grad_norm": 19840.703125,
      "learning_rate": 2.3760399334442595e-05,
      "loss": 0.3299,
      "step": 500
    },
    {
      "epoch": 0.8312551953449709,
      "eval_loss": 0.2673098146915436,
      "eval_runtime": 38.0207,
      "eval_samples_per_second": 12.993,
      "eval_steps_per_second": 1.631,
      "step": 500
    },
    {
      "epoch": 0.914380714879468,
      "grad_norm": 37282.65625,
      "learning_rate": 2.3136439267886856e-05,
      "loss": 0.3215,
      "step": 550
    },
    {
      "epoch": 0.9975062344139651,
      "grad_norm": 21197.52734375,
      "learning_rate": 2.2512479201331116e-05,
      "loss": 0.3407,
      "step": 600
    },
    {
      "epoch": 0.9975062344139651,
      "eval_loss": 0.26070085167884827,
      "eval_runtime": 37.7694,
      "eval_samples_per_second": 13.079,
      "eval_steps_per_second": 1.642,
      "step": 600
    },
    {
      "epoch": 1.0806317539484622,
      "grad_norm": 21656.6328125,
      "learning_rate": 2.1888519134775374e-05,
      "loss": 0.3191,
      "step": 650
    },
    {
      "epoch": 1.1637572734829593,
      "grad_norm": 22402.8125,
      "learning_rate": 2.1264559068219635e-05,
      "loss": 0.299,
      "step": 700
    },
    {
      "epoch": 1.1637572734829593,
      "eval_loss": 0.256939560174942,
      "eval_runtime": 38.0385,
      "eval_samples_per_second": 12.987,
      "eval_steps_per_second": 1.63,
      "step": 700
    },
    {
      "epoch": 1.2468827930174564,
      "grad_norm": 49383.11328125,
      "learning_rate": 2.0640599001663895e-05,
      "loss": 0.3131,
      "step": 750
    },
    {
      "epoch": 1.3300083125519535,
      "grad_norm": 27295.173828125,
      "learning_rate": 2.0016638935108153e-05,
      "loss": 0.3149,
      "step": 800
    },
    {
      "epoch": 1.3300083125519535,
      "eval_loss": 0.2525966763496399,
      "eval_runtime": 38.1831,
      "eval_samples_per_second": 12.938,
      "eval_steps_per_second": 1.624,
      "step": 800
    },
    {
      "epoch": 1.4131338320864506,
      "grad_norm": 15751.75390625,
      "learning_rate": 1.9392678868552414e-05,
      "loss": 0.2845,
      "step": 850
    },
    {
      "epoch": 1.4962593516209477,
      "grad_norm": 25327.384765625,
      "learning_rate": 1.8768718801996674e-05,
      "loss": 0.2945,
      "step": 900
    },
    {
      "epoch": 1.4962593516209477,
      "eval_loss": 0.24994711577892303,
      "eval_runtime": 37.4678,
      "eval_samples_per_second": 13.185,
      "eval_steps_per_second": 1.655,
      "step": 900
    },
    {
      "epoch": 1.5793848711554448,
      "grad_norm": 25815.150390625,
      "learning_rate": 1.8144758735440932e-05,
      "loss": 0.3005,
      "step": 950
    },
    {
      "epoch": 1.6625103906899419,
      "grad_norm": 24868.796875,
      "learning_rate": 1.7520798668885192e-05,
      "loss": 0.3129,
      "step": 1000
    },
    {
      "epoch": 1.6625103906899419,
      "eval_loss": 0.246443971991539,
      "eval_runtime": 37.8324,
      "eval_samples_per_second": 13.058,
      "eval_steps_per_second": 1.639,
      "step": 1000
    },
    {
      "epoch": 1.745635910224439,
      "grad_norm": 32112.76171875,
      "learning_rate": 1.6896838602329453e-05,
      "loss": 0.2959,
      "step": 1050
    },
    {
      "epoch": 1.828761429758936,
      "grad_norm": 22870.244140625,
      "learning_rate": 1.627287853577371e-05,
      "loss": 0.2896,
      "step": 1100
    },
    {
      "epoch": 1.828761429758936,
      "eval_loss": 0.24409395456314087,
      "eval_runtime": 37.7821,
      "eval_samples_per_second": 13.075,
      "eval_steps_per_second": 1.641,
      "step": 1100
    },
    {
      "epoch": 1.9118869492934332,
      "grad_norm": 30326.173828125,
      "learning_rate": 1.564891846921797e-05,
      "loss": 0.287,
      "step": 1150
    },
    {
      "epoch": 1.9950124688279303,
      "grad_norm": 20373.96875,
      "learning_rate": 1.502495840266223e-05,
      "loss": 0.2847,
      "step": 1200
    },
    {
      "epoch": 1.9950124688279303,
      "eval_loss": 0.2411041557788849,
      "eval_runtime": 37.3119,
      "eval_samples_per_second": 13.24,
      "eval_steps_per_second": 1.662,
      "step": 1200
    },
    {
      "epoch": 2.0781379883624274,
      "grad_norm": 13517.228515625,
      "learning_rate": 1.440099833610649e-05,
      "loss": 0.2754,
      "step": 1250
    },
    {
      "epoch": 2.1612635078969245,
      "grad_norm": 37159.10546875,
      "learning_rate": 1.3777038269550749e-05,
      "loss": 0.2841,
      "step": 1300
    },
    {
      "epoch": 2.1612635078969245,
      "eval_loss": 0.23963774740695953,
      "eval_runtime": 37.7867,
      "eval_samples_per_second": 13.073,
      "eval_steps_per_second": 1.641,
      "step": 1300
    },
    {
      "epoch": 2.2443890274314215,
      "grad_norm": 13807.1201171875,
      "learning_rate": 1.315307820299501e-05,
      "loss": 0.2831,
      "step": 1350
    },
    {
      "epoch": 2.3275145469659186,
      "grad_norm": 70717.4296875,
      "learning_rate": 1.2529118136439268e-05,
      "loss": 0.2926,
      "step": 1400
    },
    {
      "epoch": 2.3275145469659186,
      "eval_loss": 0.23829442262649536,
      "eval_runtime": 37.7411,
      "eval_samples_per_second": 13.089,
      "eval_steps_per_second": 1.643,
      "step": 1400
    },
    {
      "epoch": 2.4106400665004157,
      "grad_norm": 20201.111328125,
      "learning_rate": 1.1905158069883528e-05,
      "loss": 0.285,
      "step": 1450
    },
    {
      "epoch": 2.493765586034913,
      "grad_norm": 28600.62109375,
      "learning_rate": 1.1281198003327787e-05,
      "loss": 0.2593,
      "step": 1500
    },
    {
      "epoch": 2.493765586034913,
      "eval_loss": 0.2369847148656845,
      "eval_runtime": 38.1397,
      "eval_samples_per_second": 12.952,
      "eval_steps_per_second": 1.626,
      "step": 1500
    },
    {
      "epoch": 2.57689110556941,
      "grad_norm": 22271.5,
      "learning_rate": 1.0657237936772047e-05,
      "loss": 0.2684,
      "step": 1550
    },
    {
      "epoch": 2.660016625103907,
      "grad_norm": 17982.9140625,
      "learning_rate": 1.0033277870216307e-05,
      "loss": 0.2753,
      "step": 1600
    },
    {
      "epoch": 2.660016625103907,
      "eval_loss": 0.23503336310386658,
      "eval_runtime": 37.7531,
      "eval_samples_per_second": 13.085,
      "eval_steps_per_second": 1.642,
      "step": 1600
    },
    {
      "epoch": 2.743142144638404,
      "grad_norm": 20275.65625,
      "learning_rate": 9.409317803660566e-06,
      "loss": 0.282,
      "step": 1650
    },
    {
      "epoch": 2.826267664172901,
      "grad_norm": 21899.2109375,
      "learning_rate": 8.785357737104826e-06,
      "loss": 0.2699,
      "step": 1700
    },
    {
      "epoch": 2.826267664172901,
      "eval_loss": 0.23422521352767944,
      "eval_runtime": 37.8729,
      "eval_samples_per_second": 13.044,
      "eval_steps_per_second": 1.637,
      "step": 1700
    },
    {
      "epoch": 2.9093931837073983,
      "grad_norm": 17812.615234375,
      "learning_rate": 8.161397670549084e-06,
      "loss": 0.277,
      "step": 1750
    },
    {
      "epoch": 2.9925187032418954,
      "grad_norm": 18110.896484375,
      "learning_rate": 7.5374376039933445e-06,
      "loss": 0.2673,
      "step": 1800
    },
    {
      "epoch": 2.9925187032418954,
      "eval_loss": 0.23330263793468475,
      "eval_runtime": 37.8678,
      "eval_samples_per_second": 13.045,
      "eval_steps_per_second": 1.637,
      "step": 1800
    },
    {
      "epoch": 3.0756442227763925,
      "grad_norm": 64865.62109375,
      "learning_rate": 6.913477537437604e-06,
      "loss": 0.2742,
      "step": 1850
    },
    {
      "epoch": 3.1587697423108896,
      "grad_norm": 22536.302734375,
      "learning_rate": 6.289517470881864e-06,
      "loss": 0.2723,
      "step": 1900
    },
    {
      "epoch": 3.1587697423108896,
      "eval_loss": 0.23302872478961945,
      "eval_runtime": 38.1436,
      "eval_samples_per_second": 12.951,
      "eval_steps_per_second": 1.625,
      "step": 1900
    },
    {
      "epoch": 3.2418952618453867,
      "grad_norm": 26661.78125,
      "learning_rate": 5.6655574043261234e-06,
      "loss": 0.273,
      "step": 1950
    },
    {
      "epoch": 3.3250207813798838,
      "grad_norm": 39719.59765625,
      "learning_rate": 5.0415973377703825e-06,
      "loss": 0.2746,
      "step": 2000
    },
    {
      "epoch": 3.3250207813798838,
      "eval_loss": 0.23246362805366516,
      "eval_runtime": 37.97,
      "eval_samples_per_second": 13.01,
      "eval_steps_per_second": 1.633,
      "step": 2000
    },
    {
      "epoch": 3.408146300914381,
      "grad_norm": 19064.091796875,
      "learning_rate": 4.4176372712146424e-06,
      "loss": 0.2531,
      "step": 2050
    },
    {
      "epoch": 3.491271820448878,
      "grad_norm": 24487.681640625,
      "learning_rate": 3.793677204658902e-06,
      "loss": 0.2763,
      "step": 2100
    },
    {
      "epoch": 3.491271820448878,
      "eval_loss": 0.23180559277534485,
      "eval_runtime": 37.9308,
      "eval_samples_per_second": 13.024,
      "eval_steps_per_second": 1.635,
      "step": 2100
    },
    {
      "epoch": 3.574397339983375,
      "grad_norm": 33160.66015625,
      "learning_rate": 3.1697171381031614e-06,
      "loss": 0.2706,
      "step": 2150
    },
    {
      "epoch": 3.657522859517872,
      "grad_norm": 20284.03515625,
      "learning_rate": 2.545757071547421e-06,
      "loss": 0.2521,
      "step": 2200
    },
    {
      "epoch": 3.657522859517872,
      "eval_loss": 0.23114623129367828,
      "eval_runtime": 38.1017,
      "eval_samples_per_second": 12.965,
      "eval_steps_per_second": 1.627,
      "step": 2200
    },
    {
      "epoch": 3.7406483790523692,
      "grad_norm": 55974.03125,
      "learning_rate": 1.9217970049916804e-06,
      "loss": 0.2542,
      "step": 2250
    },
    {
      "epoch": 3.8237738985868663,
      "grad_norm": 18724.478515625,
      "learning_rate": 1.2978369384359402e-06,
      "loss": 0.2684,
      "step": 2300
    },
    {
      "epoch": 3.8237738985868663,
      "eval_loss": 0.23083852231502533,
      "eval_runtime": 38.0527,
      "eval_samples_per_second": 12.982,
      "eval_steps_per_second": 1.629,
      "step": 2300
    },
    {
      "epoch": 3.9068994181213634,
      "grad_norm": 26152.619140625,
      "learning_rate": 6.738768718801997e-07,
      "loss": 0.2582,
      "step": 2350
    },
    {
      "epoch": 3.9900249376558605,
      "grad_norm": 20345.572265625,
      "learning_rate": 4.9916805324459236e-08,
      "loss": 0.2529,
      "step": 2400
    },
    {
      "epoch": 3.9900249376558605,
      "eval_loss": 0.23079748451709747,
      "eval_runtime": 37.7401,
      "eval_samples_per_second": 13.09,
      "eval_steps_per_second": 1.643,
      "step": 2400
    }
  ],
  "logging_steps": 50,
  "max_steps": 2404,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 4,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 2.342112942882816e+16,
  "train_batch_size": 8,
  "trial_name": null,
  "trial_params": null
}