| { | |
| "best_metric": 2.009188652038574, | |
| "best_model_checkpoint": "ckpts/sft_gemma-2b/checkpoint-1680", | |
| "epoch": 8.865435356200528, | |
| "eval_steps": 20, | |
| "global_step": 1680, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.10554089709762533, | |
| "grad_norm": 6.84375, | |
| "learning_rate": 4.000000000000001e-06, | |
| "loss": 2.4965, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 0.10554089709762533, | |
| "eval_loss": 2.404269218444824, | |
| "eval_runtime": 8.1633, | |
| "eval_samples_per_second": 24.5, | |
| "eval_steps_per_second": 6.125, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 0.21108179419525067, | |
| "grad_norm": 3.0, | |
| "learning_rate": 8.000000000000001e-06, | |
| "loss": 2.2807, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 0.21108179419525067, | |
| "eval_loss": 2.1983509063720703, | |
| "eval_runtime": 7.9364, | |
| "eval_samples_per_second": 25.2, | |
| "eval_steps_per_second": 6.3, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 0.316622691292876, | |
| "grad_norm": 3.0, | |
| "learning_rate": 1e-05, | |
| "loss": 2.1723, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 0.316622691292876, | |
| "eval_loss": 2.157801628112793, | |
| "eval_runtime": 8.1488, | |
| "eval_samples_per_second": 24.543, | |
| "eval_steps_per_second": 6.136, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 0.42216358839050133, | |
| "grad_norm": 2.84375, | |
| "learning_rate": 1e-05, | |
| "loss": 2.0888, | |
| "step": 80 | |
| }, | |
| { | |
| "epoch": 0.42216358839050133, | |
| "eval_loss": 2.1461212635040283, | |
| "eval_runtime": 8.0327, | |
| "eval_samples_per_second": 24.898, | |
| "eval_steps_per_second": 6.225, | |
| "step": 80 | |
| }, | |
| { | |
| "epoch": 0.5277044854881267, | |
| "grad_norm": 2.890625, | |
| "learning_rate": 1e-05, | |
| "loss": 2.1187, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.5277044854881267, | |
| "eval_loss": 2.140232801437378, | |
| "eval_runtime": 8.0366, | |
| "eval_samples_per_second": 24.886, | |
| "eval_steps_per_second": 6.222, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.633245382585752, | |
| "grad_norm": 2.9375, | |
| "learning_rate": 1e-05, | |
| "loss": 2.1293, | |
| "step": 120 | |
| }, | |
| { | |
| "epoch": 0.633245382585752, | |
| "eval_loss": 2.135404109954834, | |
| "eval_runtime": 8.1159, | |
| "eval_samples_per_second": 24.643, | |
| "eval_steps_per_second": 6.161, | |
| "step": 120 | |
| }, | |
| { | |
| "epoch": 0.7387862796833773, | |
| "grad_norm": 2.640625, | |
| "learning_rate": 1e-05, | |
| "loss": 2.1351, | |
| "step": 140 | |
| }, | |
| { | |
| "epoch": 0.7387862796833773, | |
| "eval_loss": 2.1313905715942383, | |
| "eval_runtime": 7.9204, | |
| "eval_samples_per_second": 25.251, | |
| "eval_steps_per_second": 6.313, | |
| "step": 140 | |
| }, | |
| { | |
| "epoch": 0.8443271767810027, | |
| "grad_norm": 2.8125, | |
| "learning_rate": 1e-05, | |
| "loss": 2.1204, | |
| "step": 160 | |
| }, | |
| { | |
| "epoch": 0.8443271767810027, | |
| "eval_loss": 2.1264946460723877, | |
| "eval_runtime": 8.1968, | |
| "eval_samples_per_second": 24.4, | |
| "eval_steps_per_second": 6.1, | |
| "step": 160 | |
| }, | |
| { | |
| "epoch": 0.9498680738786279, | |
| "grad_norm": 2.953125, | |
| "learning_rate": 1e-05, | |
| "loss": 2.0984, | |
| "step": 180 | |
| }, | |
| { | |
| "epoch": 0.9498680738786279, | |
| "eval_loss": 2.123286247253418, | |
| "eval_runtime": 8.0328, | |
| "eval_samples_per_second": 24.898, | |
| "eval_steps_per_second": 6.225, | |
| "step": 180 | |
| }, | |
| { | |
| "epoch": 1.0554089709762533, | |
| "grad_norm": 3.0, | |
| "learning_rate": 1e-05, | |
| "loss": 2.1008, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 1.0554089709762533, | |
| "eval_loss": 2.1210150718688965, | |
| "eval_runtime": 7.8913, | |
| "eval_samples_per_second": 25.344, | |
| "eval_steps_per_second": 6.336, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 1.1609498680738786, | |
| "grad_norm": 2.9375, | |
| "learning_rate": 1e-05, | |
| "loss": 2.0771, | |
| "step": 220 | |
| }, | |
| { | |
| "epoch": 1.1609498680738786, | |
| "eval_loss": 2.118210792541504, | |
| "eval_runtime": 8.2706, | |
| "eval_samples_per_second": 24.182, | |
| "eval_steps_per_second": 6.045, | |
| "step": 220 | |
| }, | |
| { | |
| "epoch": 1.266490765171504, | |
| "grad_norm": 2.71875, | |
| "learning_rate": 1e-05, | |
| "loss": 2.0659, | |
| "step": 240 | |
| }, | |
| { | |
| "epoch": 1.266490765171504, | |
| "eval_loss": 2.1160335540771484, | |
| "eval_runtime": 8.1186, | |
| "eval_samples_per_second": 24.635, | |
| "eval_steps_per_second": 6.159, | |
| "step": 240 | |
| }, | |
| { | |
| "epoch": 1.3720316622691293, | |
| "grad_norm": 3.390625, | |
| "learning_rate": 1e-05, | |
| "loss": 2.0616, | |
| "step": 260 | |
| }, | |
| { | |
| "epoch": 1.3720316622691293, | |
| "eval_loss": 2.113948106765747, | |
| "eval_runtime": 8.3043, | |
| "eval_samples_per_second": 24.084, | |
| "eval_steps_per_second": 6.021, | |
| "step": 260 | |
| }, | |
| { | |
| "epoch": 1.4775725593667546, | |
| "grad_norm": 2.875, | |
| "learning_rate": 1e-05, | |
| "loss": 2.1086, | |
| "step": 280 | |
| }, | |
| { | |
| "epoch": 1.4775725593667546, | |
| "eval_loss": 2.1105477809906006, | |
| "eval_runtime": 8.3509, | |
| "eval_samples_per_second": 23.95, | |
| "eval_steps_per_second": 5.987, | |
| "step": 280 | |
| }, | |
| { | |
| "epoch": 1.58311345646438, | |
| "grad_norm": 2.625, | |
| "learning_rate": 1e-05, | |
| "loss": 2.0473, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 1.58311345646438, | |
| "eval_loss": 2.1075851917266846, | |
| "eval_runtime": 8.3203, | |
| "eval_samples_per_second": 24.037, | |
| "eval_steps_per_second": 6.009, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 1.6886543535620053, | |
| "grad_norm": 2.71875, | |
| "learning_rate": 1e-05, | |
| "loss": 2.0455, | |
| "step": 320 | |
| }, | |
| { | |
| "epoch": 1.6886543535620053, | |
| "eval_loss": 2.1052379608154297, | |
| "eval_runtime": 8.4112, | |
| "eval_samples_per_second": 23.778, | |
| "eval_steps_per_second": 5.944, | |
| "step": 320 | |
| }, | |
| { | |
| "epoch": 1.7941952506596306, | |
| "grad_norm": 3.125, | |
| "learning_rate": 1e-05, | |
| "loss": 2.0664, | |
| "step": 340 | |
| }, | |
| { | |
| "epoch": 1.7941952506596306, | |
| "eval_loss": 2.102696180343628, | |
| "eval_runtime": 8.1772, | |
| "eval_samples_per_second": 24.458, | |
| "eval_steps_per_second": 6.115, | |
| "step": 340 | |
| }, | |
| { | |
| "epoch": 1.899736147757256, | |
| "grad_norm": 2.75, | |
| "learning_rate": 1e-05, | |
| "loss": 2.0559, | |
| "step": 360 | |
| }, | |
| { | |
| "epoch": 1.899736147757256, | |
| "eval_loss": 2.100424289703369, | |
| "eval_runtime": 8.2651, | |
| "eval_samples_per_second": 24.198, | |
| "eval_steps_per_second": 6.05, | |
| "step": 360 | |
| }, | |
| { | |
| "epoch": 2.005277044854881, | |
| "grad_norm": 3.125, | |
| "learning_rate": 1e-05, | |
| "loss": 2.0638, | |
| "step": 380 | |
| }, | |
| { | |
| "epoch": 2.005277044854881, | |
| "eval_loss": 2.0989837646484375, | |
| "eval_runtime": 8.3491, | |
| "eval_samples_per_second": 23.955, | |
| "eval_steps_per_second": 5.989, | |
| "step": 380 | |
| }, | |
| { | |
| "epoch": 2.1108179419525066, | |
| "grad_norm": 2.890625, | |
| "learning_rate": 1e-05, | |
| "loss": 2.0455, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 2.1108179419525066, | |
| "eval_loss": 2.097106456756592, | |
| "eval_runtime": 8.261, | |
| "eval_samples_per_second": 24.21, | |
| "eval_steps_per_second": 6.053, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 2.216358839050132, | |
| "grad_norm": 2.8125, | |
| "learning_rate": 1e-05, | |
| "loss": 2.0114, | |
| "step": 420 | |
| }, | |
| { | |
| "epoch": 2.216358839050132, | |
| "eval_loss": 2.095724582672119, | |
| "eval_runtime": 8.1933, | |
| "eval_samples_per_second": 24.41, | |
| "eval_steps_per_second": 6.103, | |
| "step": 420 | |
| }, | |
| { | |
| "epoch": 2.321899736147757, | |
| "grad_norm": 2.828125, | |
| "learning_rate": 1e-05, | |
| "loss": 2.0263, | |
| "step": 440 | |
| }, | |
| { | |
| "epoch": 2.321899736147757, | |
| "eval_loss": 2.0944039821624756, | |
| "eval_runtime": 8.3678, | |
| "eval_samples_per_second": 23.901, | |
| "eval_steps_per_second": 5.975, | |
| "step": 440 | |
| }, | |
| { | |
| "epoch": 2.4274406332453826, | |
| "grad_norm": 2.9375, | |
| "learning_rate": 1e-05, | |
| "loss": 2.0127, | |
| "step": 460 | |
| }, | |
| { | |
| "epoch": 2.4274406332453826, | |
| "eval_loss": 2.0919580459594727, | |
| "eval_runtime": 8.3065, | |
| "eval_samples_per_second": 24.078, | |
| "eval_steps_per_second": 6.019, | |
| "step": 460 | |
| }, | |
| { | |
| "epoch": 2.532981530343008, | |
| "grad_norm": 2.484375, | |
| "learning_rate": 1e-05, | |
| "loss": 1.9744, | |
| "step": 480 | |
| }, | |
| { | |
| "epoch": 2.532981530343008, | |
| "eval_loss": 2.087993860244751, | |
| "eval_runtime": 8.3136, | |
| "eval_samples_per_second": 24.057, | |
| "eval_steps_per_second": 6.014, | |
| "step": 480 | |
| }, | |
| { | |
| "epoch": 2.638522427440633, | |
| "grad_norm": 2.984375, | |
| "learning_rate": 1e-05, | |
| "loss": 2.0236, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 2.638522427440633, | |
| "eval_loss": 2.086052656173706, | |
| "eval_runtime": 8.3652, | |
| "eval_samples_per_second": 23.908, | |
| "eval_steps_per_second": 5.977, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 2.7440633245382586, | |
| "grad_norm": 2.671875, | |
| "learning_rate": 1e-05, | |
| "loss": 2.0146, | |
| "step": 520 | |
| }, | |
| { | |
| "epoch": 2.7440633245382586, | |
| "eval_loss": 2.08500075340271, | |
| "eval_runtime": 8.387, | |
| "eval_samples_per_second": 23.846, | |
| "eval_steps_per_second": 5.962, | |
| "step": 520 | |
| }, | |
| { | |
| "epoch": 2.849604221635884, | |
| "grad_norm": 2.734375, | |
| "learning_rate": 1e-05, | |
| "loss": 2.0086, | |
| "step": 540 | |
| }, | |
| { | |
| "epoch": 2.849604221635884, | |
| "eval_loss": 2.0832457542419434, | |
| "eval_runtime": 8.2202, | |
| "eval_samples_per_second": 24.33, | |
| "eval_steps_per_second": 6.083, | |
| "step": 540 | |
| }, | |
| { | |
| "epoch": 2.955145118733509, | |
| "grad_norm": 3.21875, | |
| "learning_rate": 1e-05, | |
| "loss": 2.0381, | |
| "step": 560 | |
| }, | |
| { | |
| "epoch": 2.955145118733509, | |
| "eval_loss": 2.0816333293914795, | |
| "eval_runtime": 8.1766, | |
| "eval_samples_per_second": 24.46, | |
| "eval_steps_per_second": 6.115, | |
| "step": 560 | |
| }, | |
| { | |
| "epoch": 3.0606860158311346, | |
| "grad_norm": 2.8125, | |
| "learning_rate": 1e-05, | |
| "loss": 1.9999, | |
| "step": 580 | |
| }, | |
| { | |
| "epoch": 3.0606860158311346, | |
| "eval_loss": 2.0814907550811768, | |
| "eval_runtime": 8.1023, | |
| "eval_samples_per_second": 24.684, | |
| "eval_steps_per_second": 6.171, | |
| "step": 580 | |
| }, | |
| { | |
| "epoch": 3.16622691292876, | |
| "grad_norm": 2.953125, | |
| "learning_rate": 1e-05, | |
| "loss": 1.9754, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 3.16622691292876, | |
| "eval_loss": 2.0809905529022217, | |
| "eval_runtime": 8.2702, | |
| "eval_samples_per_second": 24.183, | |
| "eval_steps_per_second": 6.046, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 3.271767810026385, | |
| "grad_norm": 3.09375, | |
| "learning_rate": 1e-05, | |
| "loss": 1.9742, | |
| "step": 620 | |
| }, | |
| { | |
| "epoch": 3.271767810026385, | |
| "eval_loss": 2.0800254344940186, | |
| "eval_runtime": 8.3829, | |
| "eval_samples_per_second": 23.858, | |
| "eval_steps_per_second": 5.965, | |
| "step": 620 | |
| }, | |
| { | |
| "epoch": 3.3773087071240107, | |
| "grad_norm": 3.375, | |
| "learning_rate": 1e-05, | |
| "loss": 1.9646, | |
| "step": 640 | |
| }, | |
| { | |
| "epoch": 3.3773087071240107, | |
| "eval_loss": 2.078634738922119, | |
| "eval_runtime": 8.1957, | |
| "eval_samples_per_second": 24.403, | |
| "eval_steps_per_second": 6.101, | |
| "step": 640 | |
| }, | |
| { | |
| "epoch": 3.4828496042216357, | |
| "grad_norm": 2.765625, | |
| "learning_rate": 1e-05, | |
| "loss": 1.9785, | |
| "step": 660 | |
| }, | |
| { | |
| "epoch": 3.4828496042216357, | |
| "eval_loss": 2.075782537460327, | |
| "eval_runtime": 8.2605, | |
| "eval_samples_per_second": 24.211, | |
| "eval_steps_per_second": 6.053, | |
| "step": 660 | |
| }, | |
| { | |
| "epoch": 3.588390501319261, | |
| "grad_norm": 4.21875, | |
| "learning_rate": 1e-05, | |
| "loss": 1.9755, | |
| "step": 680 | |
| }, | |
| { | |
| "epoch": 3.588390501319261, | |
| "eval_loss": 2.0737786293029785, | |
| "eval_runtime": 8.3746, | |
| "eval_samples_per_second": 23.882, | |
| "eval_steps_per_second": 5.97, | |
| "step": 680 | |
| }, | |
| { | |
| "epoch": 3.6939313984168867, | |
| "grad_norm": 2.765625, | |
| "learning_rate": 1e-05, | |
| "loss": 1.9667, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 3.6939313984168867, | |
| "eval_loss": 2.0726418495178223, | |
| "eval_runtime": 8.2601, | |
| "eval_samples_per_second": 24.213, | |
| "eval_steps_per_second": 6.053, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 3.7994722955145117, | |
| "grad_norm": 2.828125, | |
| "learning_rate": 1e-05, | |
| "loss": 1.9623, | |
| "step": 720 | |
| }, | |
| { | |
| "epoch": 3.7994722955145117, | |
| "eval_loss": 2.070995330810547, | |
| "eval_runtime": 8.3051, | |
| "eval_samples_per_second": 24.081, | |
| "eval_steps_per_second": 6.02, | |
| "step": 720 | |
| }, | |
| { | |
| "epoch": 3.905013192612137, | |
| "grad_norm": 3.03125, | |
| "learning_rate": 1e-05, | |
| "loss": 1.9702, | |
| "step": 740 | |
| }, | |
| { | |
| "epoch": 3.905013192612137, | |
| "eval_loss": 2.068690776824951, | |
| "eval_runtime": 8.3707, | |
| "eval_samples_per_second": 23.893, | |
| "eval_steps_per_second": 5.973, | |
| "step": 740 | |
| }, | |
| { | |
| "epoch": 4.010554089709762, | |
| "grad_norm": 2.96875, | |
| "learning_rate": 1e-05, | |
| "loss": 1.9795, | |
| "step": 760 | |
| }, | |
| { | |
| "epoch": 4.010554089709762, | |
| "eval_loss": 2.0664331912994385, | |
| "eval_runtime": 9.1952, | |
| "eval_samples_per_second": 21.75, | |
| "eval_steps_per_second": 5.438, | |
| "step": 760 | |
| }, | |
| { | |
| "epoch": 4.116094986807388, | |
| "grad_norm": 3.0, | |
| "learning_rate": 1e-05, | |
| "loss": 1.9469, | |
| "step": 780 | |
| }, | |
| { | |
| "epoch": 4.116094986807388, | |
| "eval_loss": 2.0662195682525635, | |
| "eval_runtime": 8.2606, | |
| "eval_samples_per_second": 24.211, | |
| "eval_steps_per_second": 6.053, | |
| "step": 780 | |
| }, | |
| { | |
| "epoch": 4.221635883905013, | |
| "grad_norm": 2.984375, | |
| "learning_rate": 1e-05, | |
| "loss": 1.9415, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 4.221635883905013, | |
| "eval_loss": 2.0639867782592773, | |
| "eval_runtime": 8.227, | |
| "eval_samples_per_second": 24.31, | |
| "eval_steps_per_second": 6.078, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 4.327176781002638, | |
| "grad_norm": 2.78125, | |
| "learning_rate": 1e-05, | |
| "loss": 1.9574, | |
| "step": 820 | |
| }, | |
| { | |
| "epoch": 4.327176781002638, | |
| "eval_loss": 2.062091588973999, | |
| "eval_runtime": 8.2262, | |
| "eval_samples_per_second": 24.313, | |
| "eval_steps_per_second": 6.078, | |
| "step": 820 | |
| }, | |
| { | |
| "epoch": 4.432717678100264, | |
| "grad_norm": 3.203125, | |
| "learning_rate": 1e-05, | |
| "loss": 1.9202, | |
| "step": 840 | |
| }, | |
| { | |
| "epoch": 4.432717678100264, | |
| "eval_loss": 2.0608723163604736, | |
| "eval_runtime": 8.2161, | |
| "eval_samples_per_second": 24.343, | |
| "eval_steps_per_second": 6.086, | |
| "step": 840 | |
| }, | |
| { | |
| "epoch": 4.538258575197889, | |
| "grad_norm": 2.96875, | |
| "learning_rate": 1e-05, | |
| "loss": 1.9302, | |
| "step": 860 | |
| }, | |
| { | |
| "epoch": 4.538258575197889, | |
| "eval_loss": 2.058367967605591, | |
| "eval_runtime": 8.2827, | |
| "eval_samples_per_second": 24.147, | |
| "eval_steps_per_second": 6.037, | |
| "step": 860 | |
| }, | |
| { | |
| "epoch": 4.643799472295514, | |
| "grad_norm": 3.109375, | |
| "learning_rate": 1e-05, | |
| "loss": 1.9112, | |
| "step": 880 | |
| }, | |
| { | |
| "epoch": 4.643799472295514, | |
| "eval_loss": 2.058866500854492, | |
| "eval_runtime": 8.2412, | |
| "eval_samples_per_second": 24.268, | |
| "eval_steps_per_second": 6.067, | |
| "step": 880 | |
| }, | |
| { | |
| "epoch": 4.74934036939314, | |
| "grad_norm": 2.890625, | |
| "learning_rate": 1e-05, | |
| "loss": 1.9127, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 4.74934036939314, | |
| "eval_loss": 2.0560219287872314, | |
| "eval_runtime": 8.2213, | |
| "eval_samples_per_second": 24.327, | |
| "eval_steps_per_second": 6.082, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 4.854881266490765, | |
| "grad_norm": 2.953125, | |
| "learning_rate": 1e-05, | |
| "loss": 1.899, | |
| "step": 920 | |
| }, | |
| { | |
| "epoch": 4.854881266490765, | |
| "eval_loss": 2.054474353790283, | |
| "eval_runtime": 8.0902, | |
| "eval_samples_per_second": 24.721, | |
| "eval_steps_per_second": 6.18, | |
| "step": 920 | |
| }, | |
| { | |
| "epoch": 4.96042216358839, | |
| "grad_norm": 2.90625, | |
| "learning_rate": 1e-05, | |
| "loss": 1.9248, | |
| "step": 940 | |
| }, | |
| { | |
| "epoch": 4.96042216358839, | |
| "eval_loss": 2.052360773086548, | |
| "eval_runtime": 8.1899, | |
| "eval_samples_per_second": 24.42, | |
| "eval_steps_per_second": 6.105, | |
| "step": 940 | |
| }, | |
| { | |
| "epoch": 5.065963060686016, | |
| "grad_norm": 2.90625, | |
| "learning_rate": 1e-05, | |
| "loss": 1.8878, | |
| "step": 960 | |
| }, | |
| { | |
| "epoch": 5.065963060686016, | |
| "eval_loss": 2.0526058673858643, | |
| "eval_runtime": 8.0873, | |
| "eval_samples_per_second": 24.73, | |
| "eval_steps_per_second": 6.183, | |
| "step": 960 | |
| }, | |
| { | |
| "epoch": 5.171503957783641, | |
| "grad_norm": 3.234375, | |
| "learning_rate": 1e-05, | |
| "loss": 1.8789, | |
| "step": 980 | |
| }, | |
| { | |
| "epoch": 5.171503957783641, | |
| "eval_loss": 2.0522069931030273, | |
| "eval_runtime": 8.0535, | |
| "eval_samples_per_second": 24.834, | |
| "eval_steps_per_second": 6.208, | |
| "step": 980 | |
| }, | |
| { | |
| "epoch": 5.277044854881266, | |
| "grad_norm": 3.015625, | |
| "learning_rate": 1e-05, | |
| "loss": 1.8908, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 5.277044854881266, | |
| "eval_loss": 2.051866054534912, | |
| "eval_runtime": 7.9907, | |
| "eval_samples_per_second": 25.029, | |
| "eval_steps_per_second": 6.257, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 5.382585751978892, | |
| "grad_norm": 3.046875, | |
| "learning_rate": 1e-05, | |
| "loss": 1.8944, | |
| "step": 1020 | |
| }, | |
| { | |
| "epoch": 5.382585751978892, | |
| "eval_loss": 2.0503861904144287, | |
| "eval_runtime": 7.927, | |
| "eval_samples_per_second": 25.23, | |
| "eval_steps_per_second": 6.308, | |
| "step": 1020 | |
| }, | |
| { | |
| "epoch": 5.488126649076517, | |
| "grad_norm": 4.15625, | |
| "learning_rate": 1e-05, | |
| "loss": 1.8867, | |
| "step": 1040 | |
| }, | |
| { | |
| "epoch": 5.488126649076517, | |
| "eval_loss": 2.0466654300689697, | |
| "eval_runtime": 8.3776, | |
| "eval_samples_per_second": 23.873, | |
| "eval_steps_per_second": 5.968, | |
| "step": 1040 | |
| }, | |
| { | |
| "epoch": 5.593667546174142, | |
| "grad_norm": 3.0, | |
| "learning_rate": 1e-05, | |
| "loss": 1.8764, | |
| "step": 1060 | |
| }, | |
| { | |
| "epoch": 5.593667546174142, | |
| "eval_loss": 2.0448718070983887, | |
| "eval_runtime": 8.2526, | |
| "eval_samples_per_second": 24.235, | |
| "eval_steps_per_second": 6.059, | |
| "step": 1060 | |
| }, | |
| { | |
| "epoch": 5.699208443271768, | |
| "grad_norm": 3.140625, | |
| "learning_rate": 1e-05, | |
| "loss": 1.9082, | |
| "step": 1080 | |
| }, | |
| { | |
| "epoch": 5.699208443271768, | |
| "eval_loss": 2.0424439907073975, | |
| "eval_runtime": 8.0973, | |
| "eval_samples_per_second": 24.7, | |
| "eval_steps_per_second": 6.175, | |
| "step": 1080 | |
| }, | |
| { | |
| "epoch": 5.804749340369393, | |
| "grad_norm": 3.296875, | |
| "learning_rate": 1e-05, | |
| "loss": 1.8782, | |
| "step": 1100 | |
| }, | |
| { | |
| "epoch": 5.804749340369393, | |
| "eval_loss": 2.0422487258911133, | |
| "eval_runtime": 8.3197, | |
| "eval_samples_per_second": 24.039, | |
| "eval_steps_per_second": 6.01, | |
| "step": 1100 | |
| }, | |
| { | |
| "epoch": 5.910290237467018, | |
| "grad_norm": 2.984375, | |
| "learning_rate": 1e-05, | |
| "loss": 1.8394, | |
| "step": 1120 | |
| }, | |
| { | |
| "epoch": 5.910290237467018, | |
| "eval_loss": 2.0410642623901367, | |
| "eval_runtime": 8.2994, | |
| "eval_samples_per_second": 24.098, | |
| "eval_steps_per_second": 6.025, | |
| "step": 1120 | |
| }, | |
| { | |
| "epoch": 6.015831134564644, | |
| "grad_norm": 4.1875, | |
| "learning_rate": 1e-05, | |
| "loss": 1.864, | |
| "step": 1140 | |
| }, | |
| { | |
| "epoch": 6.015831134564644, | |
| "eval_loss": 2.039353370666504, | |
| "eval_runtime": 8.2994, | |
| "eval_samples_per_second": 24.098, | |
| "eval_steps_per_second": 6.025, | |
| "step": 1140 | |
| }, | |
| { | |
| "epoch": 6.121372031662269, | |
| "grad_norm": 3.265625, | |
| "learning_rate": 1e-05, | |
| "loss": 1.8246, | |
| "step": 1160 | |
| }, | |
| { | |
| "epoch": 6.121372031662269, | |
| "eval_loss": 2.042710304260254, | |
| "eval_runtime": 8.5324, | |
| "eval_samples_per_second": 23.44, | |
| "eval_steps_per_second": 5.86, | |
| "step": 1160 | |
| }, | |
| { | |
| "epoch": 6.226912928759894, | |
| "grad_norm": 3.3125, | |
| "learning_rate": 1e-05, | |
| "loss": 1.8343, | |
| "step": 1180 | |
| }, | |
| { | |
| "epoch": 6.226912928759894, | |
| "eval_loss": 2.0403542518615723, | |
| "eval_runtime": 8.2652, | |
| "eval_samples_per_second": 24.198, | |
| "eval_steps_per_second": 6.049, | |
| "step": 1180 | |
| }, | |
| { | |
| "epoch": 6.33245382585752, | |
| "grad_norm": 3.984375, | |
| "learning_rate": 1e-05, | |
| "loss": 1.8541, | |
| "step": 1200 | |
| }, | |
| { | |
| "epoch": 6.33245382585752, | |
| "eval_loss": 2.03813099861145, | |
| "eval_runtime": 8.4385, | |
| "eval_samples_per_second": 23.701, | |
| "eval_steps_per_second": 5.925, | |
| "step": 1200 | |
| }, | |
| { | |
| "epoch": 6.437994722955145, | |
| "grad_norm": 3.203125, | |
| "learning_rate": 1e-05, | |
| "loss": 1.8182, | |
| "step": 1220 | |
| }, | |
| { | |
| "epoch": 6.437994722955145, | |
| "eval_loss": 2.038771629333496, | |
| "eval_runtime": 8.3561, | |
| "eval_samples_per_second": 23.934, | |
| "eval_steps_per_second": 5.984, | |
| "step": 1220 | |
| }, | |
| { | |
| "epoch": 6.54353562005277, | |
| "grad_norm": 3.125, | |
| "learning_rate": 1e-05, | |
| "loss": 1.8427, | |
| "step": 1240 | |
| }, | |
| { | |
| "epoch": 6.54353562005277, | |
| "eval_loss": 2.0339856147766113, | |
| "eval_runtime": 8.3288, | |
| "eval_samples_per_second": 24.013, | |
| "eval_steps_per_second": 6.003, | |
| "step": 1240 | |
| }, | |
| { | |
| "epoch": 6.649076517150396, | |
| "grad_norm": 3.15625, | |
| "learning_rate": 1e-05, | |
| "loss": 1.8289, | |
| "step": 1260 | |
| }, | |
| { | |
| "epoch": 6.649076517150396, | |
| "eval_loss": 2.035248041152954, | |
| "eval_runtime": 8.281, | |
| "eval_samples_per_second": 24.152, | |
| "eval_steps_per_second": 6.038, | |
| "step": 1260 | |
| }, | |
| { | |
| "epoch": 6.754617414248021, | |
| "grad_norm": 3.4375, | |
| "learning_rate": 1e-05, | |
| "loss": 1.8415, | |
| "step": 1280 | |
| }, | |
| { | |
| "epoch": 6.754617414248021, | |
| "eval_loss": 2.031825304031372, | |
| "eval_runtime": 8.3052, | |
| "eval_samples_per_second": 24.081, | |
| "eval_steps_per_second": 6.02, | |
| "step": 1280 | |
| }, | |
| { | |
| "epoch": 6.860158311345646, | |
| "grad_norm": 3.453125, | |
| "learning_rate": 1e-05, | |
| "loss": 1.8357, | |
| "step": 1300 | |
| }, | |
| { | |
| "epoch": 6.860158311345646, | |
| "eval_loss": 2.028428316116333, | |
| "eval_runtime": 8.3001, | |
| "eval_samples_per_second": 24.096, | |
| "eval_steps_per_second": 6.024, | |
| "step": 1300 | |
| }, | |
| { | |
| "epoch": 6.965699208443271, | |
| "grad_norm": 3.140625, | |
| "learning_rate": 1e-05, | |
| "loss": 1.8324, | |
| "step": 1320 | |
| }, | |
| { | |
| "epoch": 6.965699208443271, | |
| "eval_loss": 2.0289885997772217, | |
| "eval_runtime": 8.1618, | |
| "eval_samples_per_second": 24.504, | |
| "eval_steps_per_second": 6.126, | |
| "step": 1320 | |
| }, | |
| { | |
| "epoch": 7.071240105540897, | |
| "grad_norm": 3.5625, | |
| "learning_rate": 1e-05, | |
| "loss": 1.8069, | |
| "step": 1340 | |
| }, | |
| { | |
| "epoch": 7.071240105540897, | |
| "eval_loss": 2.0348060131073, | |
| "eval_runtime": 8.1951, | |
| "eval_samples_per_second": 24.405, | |
| "eval_steps_per_second": 6.101, | |
| "step": 1340 | |
| }, | |
| { | |
| "epoch": 7.176781002638522, | |
| "grad_norm": 3.375, | |
| "learning_rate": 1e-05, | |
| "loss": 1.8152, | |
| "step": 1360 | |
| }, | |
| { | |
| "epoch": 7.176781002638522, | |
| "eval_loss": 2.0321884155273438, | |
| "eval_runtime": 8.2785, | |
| "eval_samples_per_second": 24.159, | |
| "eval_steps_per_second": 6.04, | |
| "step": 1360 | |
| }, | |
| { | |
| "epoch": 7.282321899736147, | |
| "grad_norm": 3.234375, | |
| "learning_rate": 1e-05, | |
| "loss": 1.7871, | |
| "step": 1380 | |
| }, | |
| { | |
| "epoch": 7.282321899736147, | |
| "eval_loss": 2.0307512283325195, | |
| "eval_runtime": 8.0505, | |
| "eval_samples_per_second": 24.843, | |
| "eval_steps_per_second": 6.211, | |
| "step": 1380 | |
| }, | |
| { | |
| "epoch": 7.387862796833773, | |
| "grad_norm": 3.59375, | |
| "learning_rate": 1e-05, | |
| "loss": 1.7871, | |
| "step": 1400 | |
| }, | |
| { | |
| "epoch": 7.387862796833773, | |
| "eval_loss": 2.0273208618164062, | |
| "eval_runtime": 8.1896, | |
| "eval_samples_per_second": 24.421, | |
| "eval_steps_per_second": 6.105, | |
| "step": 1400 | |
| }, | |
| { | |
| "epoch": 7.493403693931398, | |
| "grad_norm": 3.328125, | |
| "learning_rate": 1e-05, | |
| "loss": 1.8076, | |
| "step": 1420 | |
| }, | |
| { | |
| "epoch": 7.493403693931398, | |
| "eval_loss": 2.0257158279418945, | |
| "eval_runtime": 7.9266, | |
| "eval_samples_per_second": 25.232, | |
| "eval_steps_per_second": 6.308, | |
| "step": 1420 | |
| }, | |
| { | |
| "epoch": 7.598944591029023, | |
| "grad_norm": 3.28125, | |
| "learning_rate": 1e-05, | |
| "loss": 1.7753, | |
| "step": 1440 | |
| }, | |
| { | |
| "epoch": 7.598944591029023, | |
| "eval_loss": 2.026719570159912, | |
| "eval_runtime": 7.8566, | |
| "eval_samples_per_second": 25.456, | |
| "eval_steps_per_second": 6.364, | |
| "step": 1440 | |
| }, | |
| { | |
| "epoch": 7.704485488126649, | |
| "grad_norm": 3.453125, | |
| "learning_rate": 1e-05, | |
| "loss": 1.761, | |
| "step": 1460 | |
| }, | |
| { | |
| "epoch": 7.704485488126649, | |
| "eval_loss": 2.022343397140503, | |
| "eval_runtime": 8.1505, | |
| "eval_samples_per_second": 24.538, | |
| "eval_steps_per_second": 6.135, | |
| "step": 1460 | |
| }, | |
| { | |
| "epoch": 7.810026385224274, | |
| "grad_norm": 3.234375, | |
| "learning_rate": 1e-05, | |
| "loss": 1.7837, | |
| "step": 1480 | |
| }, | |
| { | |
| "epoch": 7.810026385224274, | |
| "eval_loss": 2.0227696895599365, | |
| "eval_runtime": 7.9021, | |
| "eval_samples_per_second": 25.31, | |
| "eval_steps_per_second": 6.327, | |
| "step": 1480 | |
| }, | |
| { | |
| "epoch": 7.915567282321899, | |
| "grad_norm": 3.5625, | |
| "learning_rate": 1e-05, | |
| "loss": 1.7809, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 7.915567282321899, | |
| "eval_loss": 2.0224175453186035, | |
| "eval_runtime": 8.146, | |
| "eval_samples_per_second": 24.552, | |
| "eval_steps_per_second": 6.138, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 8.021108179419524, | |
| "grad_norm": 3.59375, | |
| "learning_rate": 1e-05, | |
| "loss": 1.779, | |
| "step": 1520 | |
| }, | |
| { | |
| "epoch": 8.021108179419524, | |
| "eval_loss": 2.0209100246429443, | |
| "eval_runtime": 8.392, | |
| "eval_samples_per_second": 23.832, | |
| "eval_steps_per_second": 5.958, | |
| "step": 1520 | |
| }, | |
| { | |
| "epoch": 8.12664907651715, | |
| "grad_norm": 3.109375, | |
| "learning_rate": 1e-05, | |
| "loss": 1.7353, | |
| "step": 1540 | |
| }, | |
| { | |
| "epoch": 8.12664907651715, | |
| "eval_loss": 2.0220282077789307, | |
| "eval_runtime": 8.6161, | |
| "eval_samples_per_second": 23.212, | |
| "eval_steps_per_second": 5.803, | |
| "step": 1540 | |
| }, | |
| { | |
| "epoch": 8.232189973614776, | |
| "grad_norm": 3.5625, | |
| "learning_rate": 1e-05, | |
| "loss": 1.7363, | |
| "step": 1560 | |
| }, | |
| { | |
| "epoch": 8.232189973614776, | |
| "eval_loss": 2.0166220664978027, | |
| "eval_runtime": 8.2719, | |
| "eval_samples_per_second": 24.178, | |
| "eval_steps_per_second": 6.045, | |
| "step": 1560 | |
| }, | |
| { | |
| "epoch": 8.3377308707124, | |
| "grad_norm": 3.71875, | |
| "learning_rate": 1e-05, | |
| "loss": 1.7511, | |
| "step": 1580 | |
| }, | |
| { | |
| "epoch": 8.3377308707124, | |
| "eval_loss": 2.01631236076355, | |
| "eval_runtime": 8.2537, | |
| "eval_samples_per_second": 24.232, | |
| "eval_steps_per_second": 6.058, | |
| "step": 1580 | |
| }, | |
| { | |
| "epoch": 8.443271767810026, | |
| "grad_norm": 3.828125, | |
| "learning_rate": 1e-05, | |
| "loss": 1.767, | |
| "step": 1600 | |
| }, | |
| { | |
| "epoch": 8.443271767810026, | |
| "eval_loss": 2.016242265701294, | |
| "eval_runtime": 8.1762, | |
| "eval_samples_per_second": 24.461, | |
| "eval_steps_per_second": 6.115, | |
| "step": 1600 | |
| }, | |
| { | |
| "epoch": 8.548812664907652, | |
| "grad_norm": 4.4375, | |
| "learning_rate": 1e-05, | |
| "loss": 1.6945, | |
| "step": 1620 | |
| }, | |
| { | |
| "epoch": 8.548812664907652, | |
| "eval_loss": 2.019789218902588, | |
| "eval_runtime": 8.4823, | |
| "eval_samples_per_second": 23.579, | |
| "eval_steps_per_second": 5.895, | |
| "step": 1620 | |
| }, | |
| { | |
| "epoch": 8.654353562005277, | |
| "grad_norm": 4.5625, | |
| "learning_rate": 1e-05, | |
| "loss": 1.7087, | |
| "step": 1640 | |
| }, | |
| { | |
| "epoch": 8.654353562005277, | |
| "eval_loss": 2.0212345123291016, | |
| "eval_runtime": 8.1335, | |
| "eval_samples_per_second": 24.59, | |
| "eval_steps_per_second": 6.147, | |
| "step": 1640 | |
| }, | |
| { | |
| "epoch": 8.759894459102902, | |
| "grad_norm": 3.921875, | |
| "learning_rate": 1e-05, | |
| "loss": 1.7702, | |
| "step": 1660 | |
| }, | |
| { | |
| "epoch": 8.759894459102902, | |
| "eval_loss": 2.0104410648345947, | |
| "eval_runtime": 8.0047, | |
| "eval_samples_per_second": 24.985, | |
| "eval_steps_per_second": 6.246, | |
| "step": 1660 | |
| }, | |
| { | |
| "epoch": 8.865435356200528, | |
| "grad_norm": 4.0625, | |
| "learning_rate": 1e-05, | |
| "loss": 1.7563, | |
| "step": 1680 | |
| }, | |
| { | |
| "epoch": 8.865435356200528, | |
| "eval_loss": 2.009188652038574, | |
| "eval_runtime": 8.1504, | |
| "eval_samples_per_second": 24.539, | |
| "eval_steps_per_second": 6.135, | |
| "step": 1680 | |
| } | |
| ], | |
| "logging_steps": 20, | |
| "max_steps": 9450, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 50, | |
| "save_steps": 20, | |
| "stateful_callbacks": { | |
| "EarlyStoppingCallback": { | |
| "args": { | |
| "early_stopping_patience": 5, | |
| "early_stopping_threshold": 0.0 | |
| }, | |
| "attributes": { | |
| "early_stopping_patience_counter": 0 | |
| } | |
| }, | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": false | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 1.6272812961435648e+17, | |
| "train_batch_size": 4, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |