{ "best_metric": 0.5330188679245284, "best_model_checkpoint": "./Validated_Balanced_Raw_Data_model_boost3_outputs/checkpoint-640", "epoch": 30.0, "eval_steps": 500, "global_step": 2400, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.625, "grad_norm": 9.254714012145996, "learning_rate": 4.166666666666667e-06, "loss": 1.3971, "step": 50 }, { "epoch": 1.0, "eval_accuracy": 0.3584905660377358, "eval_loss": 1.33805251121521, "eval_runtime": 1.7498, "eval_samples_per_second": 121.156, "eval_steps_per_second": 15.43, "step": 80 }, { "epoch": 1.25, "grad_norm": 14.49019718170166, "learning_rate": 8.333333333333334e-06, "loss": 1.3703, "step": 100 }, { "epoch": 1.875, "grad_norm": 21.699073791503906, "learning_rate": 1.25e-05, "loss": 1.3299, "step": 150 }, { "epoch": 2.0, "eval_accuracy": 0.3632075471698113, "eval_loss": 1.2896820306777954, "eval_runtime": 1.7829, "eval_samples_per_second": 118.907, "eval_steps_per_second": 15.144, "step": 160 }, { "epoch": 2.5, "grad_norm": 16.081377029418945, "learning_rate": 1.6666666666666667e-05, "loss": 1.2782, "step": 200 }, { "epoch": 3.0, "eval_accuracy": 0.41509433962264153, "eval_loss": 1.242042064666748, "eval_runtime": 1.747, "eval_samples_per_second": 121.352, "eval_steps_per_second": 15.455, "step": 240 }, { "epoch": 3.125, "grad_norm": 15.81972885131836, "learning_rate": 1.9998942319271076e-05, "loss": 1.2726, "step": 250 }, { "epoch": 3.75, "grad_norm": 18.504892349243164, "learning_rate": 1.9961946980917457e-05, "loss": 1.2151, "step": 300 }, { "epoch": 4.0, "eval_accuracy": 0.4481132075471698, "eval_loss": 1.1927876472473145, "eval_runtime": 1.7636, "eval_samples_per_second": 120.21, "eval_steps_per_second": 15.31, "step": 320 }, { "epoch": 4.375, "grad_norm": 20.491024017333984, "learning_rate": 1.9872291131173743e-05, "loss": 1.2169, "step": 350 }, { "epoch": 5.0, "grad_norm": 34.251853942871094, "learning_rate": 1.973044870579824e-05, "loss": 1.1703, "step": 400 }, { "epoch": 5.0, "eval_accuracy": 0.49056603773584906, "eval_loss": 1.1870641708374023, "eval_runtime": 1.7599, "eval_samples_per_second": 120.459, "eval_steps_per_second": 15.341, "step": 400 }, { "epoch": 5.625, "grad_norm": 28.04253387451172, "learning_rate": 1.953716950748227e-05, "loss": 1.1426, "step": 450 }, { "epoch": 6.0, "eval_accuracy": 0.4811320754716981, "eval_loss": 1.1826516389846802, "eval_runtime": 1.7574, "eval_samples_per_second": 120.632, "eval_steps_per_second": 15.363, "step": 480 }, { "epoch": 6.25, "grad_norm": 16.116487503051758, "learning_rate": 1.9293475242268224e-05, "loss": 1.0804, "step": 500 }, { "epoch": 6.875, "grad_norm": 28.656999588012695, "learning_rate": 1.900065411864121e-05, "loss": 1.0837, "step": 550 }, { "epoch": 7.0, "eval_accuracy": 0.5094339622641509, "eval_loss": 1.1960099935531616, "eval_runtime": 1.7695, "eval_samples_per_second": 119.81, "eval_steps_per_second": 15.259, "step": 560 }, { "epoch": 7.5, "grad_norm": 22.728029251098633, "learning_rate": 1.866025403784439e-05, "loss": 1.0393, "step": 600 }, { "epoch": 8.0, "eval_accuracy": 0.5330188679245284, "eval_loss": 1.14811110496521, "eval_runtime": 1.7557, "eval_samples_per_second": 120.75, "eval_steps_per_second": 15.378, "step": 640 }, { "epoch": 8.125, "grad_norm": 19.9659423828125, "learning_rate": 1.8274074411415104e-05, "loss": 1.0651, "step": 650 }, { "epoch": 8.75, "grad_norm": 28.96779441833496, "learning_rate": 1.784415664919576e-05, "loss": 1.0316, "step": 700 }, { "epoch": 9.0, "eval_accuracy": 0.4858490566037736, "eval_loss": 1.193508267402649, "eval_runtime": 1.7697, "eval_samples_per_second": 119.791, "eval_steps_per_second": 15.256, "step": 720 }, { "epoch": 9.375, "grad_norm": 30.056156158447266, "learning_rate": 1.737277336810124e-05, "loss": 0.9686, "step": 750 }, { "epoch": 10.0, "grad_norm": 32.97503662109375, "learning_rate": 1.686241637868734e-05, "loss": 1.0134, "step": 800 }, { "epoch": 10.0, "eval_accuracy": 0.49528301886792453, "eval_loss": 1.1633896827697754, "eval_runtime": 1.7634, "eval_samples_per_second": 120.222, "eval_steps_per_second": 15.311, "step": 800 }, { "epoch": 10.625, "grad_norm": 28.390098571777344, "learning_rate": 1.6315783513024977e-05, "loss": 0.9324, "step": 850 }, { "epoch": 11.0, "eval_accuracy": 0.5094339622641509, "eval_loss": 1.186944603919983, "eval_runtime": 1.754, "eval_samples_per_second": 120.867, "eval_steps_per_second": 15.393, "step": 880 }, { "epoch": 11.25, "grad_norm": 23.563337326049805, "learning_rate": 1.573576436351046e-05, "loss": 0.9198, "step": 900 }, { "epoch": 11.875, "grad_norm": 24.938207626342773, "learning_rate": 1.5125425007998653e-05, "loss": 0.9005, "step": 950 }, { "epoch": 12.0, "eval_accuracy": 0.4858490566037736, "eval_loss": 1.1605207920074463, "eval_runtime": 1.7703, "eval_samples_per_second": 119.756, "eval_steps_per_second": 15.252, "step": 960 }, { "epoch": 12.5, "grad_norm": 32.487918853759766, "learning_rate": 1.4487991802004625e-05, "loss": 0.8917, "step": 1000 }, { "epoch": 13.0, "eval_accuracy": 0.4858490566037736, "eval_loss": 1.1817502975463867, "eval_runtime": 1.7512, "eval_samples_per_second": 121.057, "eval_steps_per_second": 15.418, "step": 1040 }, { "epoch": 13.125, "grad_norm": 36.24566650390625, "learning_rate": 1.3826834323650899e-05, "loss": 0.8511, "step": 1050 }, { "epoch": 13.75, "grad_norm": 26.883840560913086, "learning_rate": 1.3145447561516138e-05, "loss": 0.8299, "step": 1100 }, { "epoch": 14.0, "eval_accuracy": 0.49528301886792453, "eval_loss": 1.1758817434310913, "eval_runtime": 1.7567, "eval_samples_per_second": 120.677, "eval_steps_per_second": 15.369, "step": 1120 }, { "epoch": 14.375, "grad_norm": 14.808755874633789, "learning_rate": 1.2447433439543239e-05, "loss": 0.8226, "step": 1150 }, { "epoch": 15.0, "grad_norm": 36.614830017089844, "learning_rate": 1.1736481776669307e-05, "loss": 0.8314, "step": 1200 }, { "epoch": 15.0, "eval_accuracy": 0.49056603773584906, "eval_loss": 1.1998909711837769, "eval_runtime": 1.7606, "eval_samples_per_second": 120.41, "eval_steps_per_second": 15.335, "step": 1200 }, { "epoch": 15.625, "grad_norm": 23.366907119750977, "learning_rate": 1.101635078182802e-05, "loss": 0.7891, "step": 1250 }, { "epoch": 16.0, "eval_accuracy": 0.5, "eval_loss": 1.211138129234314, "eval_runtime": 1.7578, "eval_samples_per_second": 120.609, "eval_steps_per_second": 15.361, "step": 1280 }, { "epoch": 16.25, "grad_norm": 59.76485824584961, "learning_rate": 1.0290847187431115e-05, "loss": 0.7951, "step": 1300 }, { "epoch": 16.875, "grad_norm": 23.98710823059082, "learning_rate": 9.563806126346643e-06, "loss": 0.7702, "step": 1350 }, { "epoch": 17.0, "eval_accuracy": 0.47641509433962265, "eval_loss": 1.2256454229354858, "eval_runtime": 1.7428, "eval_samples_per_second": 121.644, "eval_steps_per_second": 15.492, "step": 1360 }, { "epoch": 17.5, "grad_norm": 25.670591354370117, "learning_rate": 8.839070858747697e-06, "loss": 0.7821, "step": 1400 }, { "epoch": 18.0, "eval_accuracy": 0.5141509433962265, "eval_loss": 1.2363587617874146, "eval_runtime": 1.7717, "eval_samples_per_second": 119.661, "eval_steps_per_second": 15.24, "step": 1440 }, { "epoch": 18.125, "grad_norm": 21.44415855407715, "learning_rate": 8.120472455998882e-06, "loss": 0.7652, "step": 1450 }, { "epoch": 18.75, "grad_norm": 23.542112350463867, "learning_rate": 7.411809548974792e-06, "loss": 0.7391, "step": 1500 }, { "epoch": 19.0, "eval_accuracy": 0.5047169811320755, "eval_loss": 1.2107821702957153, "eval_runtime": 1.7876, "eval_samples_per_second": 118.592, "eval_steps_per_second": 15.104, "step": 1520 }, { "epoch": 19.375, "grad_norm": 24.51466941833496, "learning_rate": 6.716828247864391e-06, "loss": 0.7376, "step": 1550 }, { "epoch": 20.0, "grad_norm": 6.434605598449707, "learning_rate": 6.039202339608432e-06, "loss": 0.7078, "step": 1600 }, { "epoch": 20.0, "eval_accuracy": 0.5, "eval_loss": 1.1987223625183105, "eval_runtime": 1.7661, "eval_samples_per_second": 120.04, "eval_steps_per_second": 15.288, "step": 1600 }, { "epoch": 20.625, "grad_norm": 32.26969528198242, "learning_rate": 5.382513867649663e-06, "loss": 0.7245, "step": 1650 }, { "epoch": 21.0, "eval_accuracy": 0.5283018867924528, "eval_loss": 1.1981287002563477, "eval_runtime": 1.764, "eval_samples_per_second": 120.182, "eval_steps_per_second": 15.306, "step": 1680 }, { "epoch": 21.25, "grad_norm": 9.658790588378906, "learning_rate": 4.7502341966544e-06, "loss": 0.665, "step": 1700 }, { "epoch": 21.875, "grad_norm": 34.42046356201172, "learning_rate": 4.1457056623005954e-06, "loss": 0.6822, "step": 1750 }, { "epoch": 22.0, "eval_accuracy": 0.5283018867924528, "eval_loss": 1.2109999656677246, "eval_runtime": 1.7742, "eval_samples_per_second": 119.493, "eval_steps_per_second": 15.218, "step": 1760 }, { "epoch": 22.5, "grad_norm": 43.55071258544922, "learning_rate": 3.5721239031346067e-06, "loss": 0.6646, "step": 1800 }, { "epoch": 23.0, "eval_accuracy": 0.5330188679245284, "eval_loss": 1.2094640731811523, "eval_runtime": 1.7778, "eval_samples_per_second": 119.246, "eval_steps_per_second": 15.187, "step": 1840 }, { "epoch": 23.125, "grad_norm": 29.20159149169922, "learning_rate": 3.032520967893453e-06, "loss": 0.7001, "step": 1850 }, { "epoch": 23.75, "grad_norm": 32.616455078125, "learning_rate": 2.529749287590042e-06, "loss": 0.7144, "step": 1900 }, { "epoch": 24.0, "eval_accuracy": 0.5235849056603774, "eval_loss": 1.207753300666809, "eval_runtime": 1.7715, "eval_samples_per_second": 119.672, "eval_steps_per_second": 15.241, "step": 1920 }, { "epoch": 24.375, "grad_norm": 33.372379302978516, "learning_rate": 2.0664665970876496e-06, "loss": 0.6557, "step": 1950 }, { "epoch": 25.0, "grad_norm": 58.919925689697266, "learning_rate": 1.6451218858706374e-06, "loss": 0.7271, "step": 2000 }, { "epoch": 25.0, "eval_accuracy": 0.5188679245283019, "eval_loss": 1.2087972164154053, "eval_runtime": 1.7765, "eval_samples_per_second": 119.336, "eval_steps_per_second": 15.198, "step": 2000 }, { "epoch": 25.625, "grad_norm": 15.74853515625, "learning_rate": 1.2679424522780426e-06, "loss": 0.6563, "step": 2050 }, { "epoch": 26.0, "eval_accuracy": 0.5094339622641509, "eval_loss": 1.213672399520874, "eval_runtime": 1.7824, "eval_samples_per_second": 118.944, "eval_steps_per_second": 15.148, "step": 2080 }, { "epoch": 26.25, "grad_norm": 21.280942916870117, "learning_rate": 9.369221296335007e-07, "loss": 0.681, "step": 2100 }, { "epoch": 26.875, "grad_norm": 5.9813551902771, "learning_rate": 6.538107465101162e-07, "loss": 0.6447, "step": 2150 }, { "epoch": 27.0, "eval_accuracy": 0.5235849056603774, "eval_loss": 1.2157169580459595, "eval_runtime": 1.7502, "eval_samples_per_second": 121.129, "eval_steps_per_second": 15.427, "step": 2160 }, { "epoch": 27.5, "grad_norm": 26.834735870361328, "learning_rate": 4.2010487684511105e-07, "loss": 0.6763, "step": 2200 }, { "epoch": 28.0, "eval_accuracy": 0.5188679245283019, "eval_loss": 1.213538408279419, "eval_runtime": 1.7574, "eval_samples_per_second": 120.634, "eval_steps_per_second": 15.364, "step": 2240 }, { "epoch": 28.125, "grad_norm": 11.102935791015625, "learning_rate": 2.370399288006664e-07, "loss": 0.6374, "step": 2250 }, { "epoch": 28.75, "grad_norm": 33.84370803833008, "learning_rate": 1.055836141905553e-07, "loss": 0.6434, "step": 2300 }, { "epoch": 29.0, "eval_accuracy": 0.5188679245283019, "eval_loss": 1.21367347240448, "eval_runtime": 1.7659, "eval_samples_per_second": 120.05, "eval_steps_per_second": 15.289, "step": 2320 }, { "epoch": 29.375, "grad_norm": 20.113740921020508, "learning_rate": 2.643083299427751e-08, "loss": 0.6788, "step": 2350 }, { "epoch": 30.0, "grad_norm": 13.546830177307129, "learning_rate": 0.0, "loss": 0.6727, "step": 2400 }, { "epoch": 30.0, "eval_accuracy": 0.5188679245283019, "eval_loss": 1.213610291481018, "eval_runtime": 3.0973, "eval_samples_per_second": 68.446, "eval_steps_per_second": 8.717, "step": 2400 }, { "epoch": 30.0, "step": 2400, "total_flos": 1.5090549662800282e+18, "train_loss": 0.886767615477244, "train_runtime": 578.1499, "train_samples_per_second": 33.002, "train_steps_per_second": 4.151 } ], "logging_steps": 50, "max_steps": 2400, "num_input_tokens_seen": 0, "num_train_epochs": 30, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.5090549662800282e+18, "train_batch_size": 8, "trial_name": null, "trial_params": null }