| { |
| "best_global_step": null, |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 1.0, |
| "eval_steps": 500, |
| "global_step": 1000, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.005, |
| "grad_norm": 6.723871231079102, |
| "learning_rate": 4.9800000000000004e-05, |
| "loss": 6.3518, |
| "step": 5 |
| }, |
| { |
| "epoch": 0.01, |
| "grad_norm": 3.806936502456665, |
| "learning_rate": 4.9550000000000005e-05, |
| "loss": 5.2146, |
| "step": 10 |
| }, |
| { |
| "epoch": 0.015, |
| "grad_norm": 5.464715003967285, |
| "learning_rate": 4.93e-05, |
| "loss": 5.3944, |
| "step": 15 |
| }, |
| { |
| "epoch": 0.02, |
| "grad_norm": 7.181273937225342, |
| "learning_rate": 4.905e-05, |
| "loss": 4.9653, |
| "step": 20 |
| }, |
| { |
| "epoch": 0.025, |
| "grad_norm": 3.466884136199951, |
| "learning_rate": 4.88e-05, |
| "loss": 4.3435, |
| "step": 25 |
| }, |
| { |
| "epoch": 0.03, |
| "grad_norm": 6.4920830726623535, |
| "learning_rate": 4.855e-05, |
| "loss": 5.0843, |
| "step": 30 |
| }, |
| { |
| "epoch": 0.035, |
| "grad_norm": 6.69465446472168, |
| "learning_rate": 4.83e-05, |
| "loss": 5.5838, |
| "step": 35 |
| }, |
| { |
| "epoch": 0.04, |
| "grad_norm": 13.27942943572998, |
| "learning_rate": 4.805e-05, |
| "loss": 5.4903, |
| "step": 40 |
| }, |
| { |
| "epoch": 0.045, |
| "grad_norm": 7.836321830749512, |
| "learning_rate": 4.78e-05, |
| "loss": 6.0388, |
| "step": 45 |
| }, |
| { |
| "epoch": 0.05, |
| "grad_norm": 6.168878555297852, |
| "learning_rate": 4.755e-05, |
| "loss": 5.1972, |
| "step": 50 |
| }, |
| { |
| "epoch": 0.055, |
| "grad_norm": 4.194369316101074, |
| "learning_rate": 4.73e-05, |
| "loss": 4.9553, |
| "step": 55 |
| }, |
| { |
| "epoch": 0.06, |
| "grad_norm": 14.01821517944336, |
| "learning_rate": 4.705e-05, |
| "loss": 5.059, |
| "step": 60 |
| }, |
| { |
| "epoch": 0.065, |
| "grad_norm": 5.50333309173584, |
| "learning_rate": 4.6800000000000006e-05, |
| "loss": 5.1333, |
| "step": 65 |
| }, |
| { |
| "epoch": 0.07, |
| "grad_norm": 8.491551399230957, |
| "learning_rate": 4.655000000000001e-05, |
| "loss": 5.6745, |
| "step": 70 |
| }, |
| { |
| "epoch": 0.075, |
| "grad_norm": 6.9947357177734375, |
| "learning_rate": 4.630000000000001e-05, |
| "loss": 5.9163, |
| "step": 75 |
| }, |
| { |
| "epoch": 0.08, |
| "grad_norm": 9.96507453918457, |
| "learning_rate": 4.605e-05, |
| "loss": 6.2083, |
| "step": 80 |
| }, |
| { |
| "epoch": 0.085, |
| "grad_norm": 4.116439342498779, |
| "learning_rate": 4.58e-05, |
| "loss": 4.9216, |
| "step": 85 |
| }, |
| { |
| "epoch": 0.09, |
| "grad_norm": 12.154153823852539, |
| "learning_rate": 4.555e-05, |
| "loss": 4.8928, |
| "step": 90 |
| }, |
| { |
| "epoch": 0.095, |
| "grad_norm": 5.3574395179748535, |
| "learning_rate": 4.53e-05, |
| "loss": 6.1012, |
| "step": 95 |
| }, |
| { |
| "epoch": 0.1, |
| "grad_norm": 4.469454288482666, |
| "learning_rate": 4.5050000000000004e-05, |
| "loss": 4.9137, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.105, |
| "grad_norm": 4.144026756286621, |
| "learning_rate": 4.4800000000000005e-05, |
| "loss": 4.9198, |
| "step": 105 |
| }, |
| { |
| "epoch": 0.11, |
| "grad_norm": 14.926289558410645, |
| "learning_rate": 4.4550000000000005e-05, |
| "loss": 5.0548, |
| "step": 110 |
| }, |
| { |
| "epoch": 0.115, |
| "grad_norm": 3.8435521125793457, |
| "learning_rate": 4.43e-05, |
| "loss": 5.5779, |
| "step": 115 |
| }, |
| { |
| "epoch": 0.12, |
| "grad_norm": 5.218202114105225, |
| "learning_rate": 4.405e-05, |
| "loss": 4.2187, |
| "step": 120 |
| }, |
| { |
| "epoch": 0.125, |
| "grad_norm": 3.6816904544830322, |
| "learning_rate": 4.38e-05, |
| "loss": 4.8792, |
| "step": 125 |
| }, |
| { |
| "epoch": 0.13, |
| "grad_norm": 4.585783004760742, |
| "learning_rate": 4.355e-05, |
| "loss": 4.9485, |
| "step": 130 |
| }, |
| { |
| "epoch": 0.135, |
| "grad_norm": 2.7602925300598145, |
| "learning_rate": 4.33e-05, |
| "loss": 4.9472, |
| "step": 135 |
| }, |
| { |
| "epoch": 0.14, |
| "grad_norm": 3.877700090408325, |
| "learning_rate": 4.305e-05, |
| "loss": 6.275, |
| "step": 140 |
| }, |
| { |
| "epoch": 0.145, |
| "grad_norm": 3.4372103214263916, |
| "learning_rate": 4.2800000000000004e-05, |
| "loss": 5.5237, |
| "step": 145 |
| }, |
| { |
| "epoch": 0.15, |
| "grad_norm": 7.72742223739624, |
| "learning_rate": 4.2550000000000004e-05, |
| "loss": 5.1829, |
| "step": 150 |
| }, |
| { |
| "epoch": 0.155, |
| "grad_norm": 3.0909502506256104, |
| "learning_rate": 4.23e-05, |
| "loss": 4.9871, |
| "step": 155 |
| }, |
| { |
| "epoch": 0.16, |
| "grad_norm": 6.721127033233643, |
| "learning_rate": 4.205e-05, |
| "loss": 4.5381, |
| "step": 160 |
| }, |
| { |
| "epoch": 0.165, |
| "grad_norm": 3.377737522125244, |
| "learning_rate": 4.18e-05, |
| "loss": 4.3582, |
| "step": 165 |
| }, |
| { |
| "epoch": 0.17, |
| "grad_norm": 5.443429470062256, |
| "learning_rate": 4.155e-05, |
| "loss": 4.8136, |
| "step": 170 |
| }, |
| { |
| "epoch": 0.175, |
| "grad_norm": 6.62492036819458, |
| "learning_rate": 4.13e-05, |
| "loss": 4.5059, |
| "step": 175 |
| }, |
| { |
| "epoch": 0.18, |
| "grad_norm": 3.996607780456543, |
| "learning_rate": 4.105e-05, |
| "loss": 5.043, |
| "step": 180 |
| }, |
| { |
| "epoch": 0.185, |
| "grad_norm": 8.51787281036377, |
| "learning_rate": 4.08e-05, |
| "loss": 3.9811, |
| "step": 185 |
| }, |
| { |
| "epoch": 0.19, |
| "grad_norm": 32.909263610839844, |
| "learning_rate": 4.055e-05, |
| "loss": 5.8957, |
| "step": 190 |
| }, |
| { |
| "epoch": 0.195, |
| "grad_norm": 14.843151092529297, |
| "learning_rate": 4.0300000000000004e-05, |
| "loss": 5.7668, |
| "step": 195 |
| }, |
| { |
| "epoch": 0.2, |
| "grad_norm": 16.71015167236328, |
| "learning_rate": 4.0050000000000004e-05, |
| "loss": 5.345, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.205, |
| "grad_norm": 8.61973762512207, |
| "learning_rate": 3.9800000000000005e-05, |
| "loss": 4.6346, |
| "step": 205 |
| }, |
| { |
| "epoch": 0.21, |
| "grad_norm": 9.57416820526123, |
| "learning_rate": 3.9550000000000006e-05, |
| "loss": 4.7641, |
| "step": 210 |
| }, |
| { |
| "epoch": 0.215, |
| "grad_norm": 3.333556890487671, |
| "learning_rate": 3.9300000000000007e-05, |
| "loss": 4.798, |
| "step": 215 |
| }, |
| { |
| "epoch": 0.22, |
| "grad_norm": 6.404730319976807, |
| "learning_rate": 3.905e-05, |
| "loss": 4.673, |
| "step": 220 |
| }, |
| { |
| "epoch": 0.225, |
| "grad_norm": 6.648563861846924, |
| "learning_rate": 3.88e-05, |
| "loss": 4.8667, |
| "step": 225 |
| }, |
| { |
| "epoch": 0.23, |
| "grad_norm": 10.567391395568848, |
| "learning_rate": 3.855e-05, |
| "loss": 4.3667, |
| "step": 230 |
| }, |
| { |
| "epoch": 0.235, |
| "grad_norm": 7.087283134460449, |
| "learning_rate": 3.83e-05, |
| "loss": 4.3814, |
| "step": 235 |
| }, |
| { |
| "epoch": 0.24, |
| "grad_norm": 5.590321063995361, |
| "learning_rate": 3.805e-05, |
| "loss": 5.2016, |
| "step": 240 |
| }, |
| { |
| "epoch": 0.245, |
| "grad_norm": 9.443580627441406, |
| "learning_rate": 3.7800000000000004e-05, |
| "loss": 5.268, |
| "step": 245 |
| }, |
| { |
| "epoch": 0.25, |
| "grad_norm": 8.740052223205566, |
| "learning_rate": 3.7550000000000005e-05, |
| "loss": 4.3214, |
| "step": 250 |
| }, |
| { |
| "epoch": 0.255, |
| "grad_norm": 3.701566696166992, |
| "learning_rate": 3.73e-05, |
| "loss": 4.5039, |
| "step": 255 |
| }, |
| { |
| "epoch": 0.26, |
| "grad_norm": 4.700649738311768, |
| "learning_rate": 3.705e-05, |
| "loss": 5.3121, |
| "step": 260 |
| }, |
| { |
| "epoch": 0.265, |
| "grad_norm": 10.507079124450684, |
| "learning_rate": 3.68e-05, |
| "loss": 4.8834, |
| "step": 265 |
| }, |
| { |
| "epoch": 0.27, |
| "grad_norm": 6.398303985595703, |
| "learning_rate": 3.655e-05, |
| "loss": 5.1738, |
| "step": 270 |
| }, |
| { |
| "epoch": 0.275, |
| "grad_norm": 5.041212558746338, |
| "learning_rate": 3.63e-05, |
| "loss": 4.9648, |
| "step": 275 |
| }, |
| { |
| "epoch": 0.28, |
| "grad_norm": 14.750165939331055, |
| "learning_rate": 3.605e-05, |
| "loss": 5.099, |
| "step": 280 |
| }, |
| { |
| "epoch": 0.285, |
| "grad_norm": 6.1462812423706055, |
| "learning_rate": 3.58e-05, |
| "loss": 4.1443, |
| "step": 285 |
| }, |
| { |
| "epoch": 0.29, |
| "grad_norm": 11.911263465881348, |
| "learning_rate": 3.555e-05, |
| "loss": 5.0924, |
| "step": 290 |
| }, |
| { |
| "epoch": 0.295, |
| "grad_norm": 6.538726806640625, |
| "learning_rate": 3.53e-05, |
| "loss": 4.4447, |
| "step": 295 |
| }, |
| { |
| "epoch": 0.3, |
| "grad_norm": 4.611289978027344, |
| "learning_rate": 3.505e-05, |
| "loss": 5.1157, |
| "step": 300 |
| }, |
| { |
| "epoch": 0.305, |
| "grad_norm": 10.949462890625, |
| "learning_rate": 3.48e-05, |
| "loss": 5.8432, |
| "step": 305 |
| }, |
| { |
| "epoch": 0.31, |
| "grad_norm": 30.82380485534668, |
| "learning_rate": 3.455e-05, |
| "loss": 5.3243, |
| "step": 310 |
| }, |
| { |
| "epoch": 0.315, |
| "grad_norm": 13.503152847290039, |
| "learning_rate": 3.430000000000001e-05, |
| "loss": 5.1542, |
| "step": 315 |
| }, |
| { |
| "epoch": 0.32, |
| "grad_norm": 5.6366868019104, |
| "learning_rate": 3.405e-05, |
| "loss": 3.954, |
| "step": 320 |
| }, |
| { |
| "epoch": 0.325, |
| "grad_norm": 10.930379867553711, |
| "learning_rate": 3.38e-05, |
| "loss": 4.8881, |
| "step": 325 |
| }, |
| { |
| "epoch": 0.33, |
| "grad_norm": 4.548320293426514, |
| "learning_rate": 3.355e-05, |
| "loss": 4.4602, |
| "step": 330 |
| }, |
| { |
| "epoch": 0.335, |
| "grad_norm": 7.964004993438721, |
| "learning_rate": 3.33e-05, |
| "loss": 4.9787, |
| "step": 335 |
| }, |
| { |
| "epoch": 0.34, |
| "grad_norm": 8.944062232971191, |
| "learning_rate": 3.3050000000000004e-05, |
| "loss": 4.1921, |
| "step": 340 |
| }, |
| { |
| "epoch": 0.345, |
| "grad_norm": 11.169071197509766, |
| "learning_rate": 3.2800000000000004e-05, |
| "loss": 4.9608, |
| "step": 345 |
| }, |
| { |
| "epoch": 0.35, |
| "grad_norm": 12.389691352844238, |
| "learning_rate": 3.2550000000000005e-05, |
| "loss": 4.9599, |
| "step": 350 |
| }, |
| { |
| "epoch": 0.355, |
| "grad_norm": 5.404703140258789, |
| "learning_rate": 3.2300000000000006e-05, |
| "loss": 4.4157, |
| "step": 355 |
| }, |
| { |
| "epoch": 0.36, |
| "grad_norm": 9.989474296569824, |
| "learning_rate": 3.205e-05, |
| "loss": 5.0414, |
| "step": 360 |
| }, |
| { |
| "epoch": 0.365, |
| "grad_norm": 9.875152587890625, |
| "learning_rate": 3.18e-05, |
| "loss": 5.5424, |
| "step": 365 |
| }, |
| { |
| "epoch": 0.37, |
| "grad_norm": 5.059978485107422, |
| "learning_rate": 3.155e-05, |
| "loss": 3.9109, |
| "step": 370 |
| }, |
| { |
| "epoch": 0.375, |
| "grad_norm": 9.487578392028809, |
| "learning_rate": 3.13e-05, |
| "loss": 4.8966, |
| "step": 375 |
| }, |
| { |
| "epoch": 0.38, |
| "grad_norm": 7.851027965545654, |
| "learning_rate": 3.105e-05, |
| "loss": 4.5922, |
| "step": 380 |
| }, |
| { |
| "epoch": 0.385, |
| "grad_norm": 6.921792030334473, |
| "learning_rate": 3.08e-05, |
| "loss": 4.1319, |
| "step": 385 |
| }, |
| { |
| "epoch": 0.39, |
| "grad_norm": 2.766820192337036, |
| "learning_rate": 3.0550000000000004e-05, |
| "loss": 4.8304, |
| "step": 390 |
| }, |
| { |
| "epoch": 0.395, |
| "grad_norm": 11.35091495513916, |
| "learning_rate": 3.03e-05, |
| "loss": 4.961, |
| "step": 395 |
| }, |
| { |
| "epoch": 0.4, |
| "grad_norm": 5.968329906463623, |
| "learning_rate": 3.0050000000000002e-05, |
| "loss": 4.7912, |
| "step": 400 |
| }, |
| { |
| "epoch": 0.405, |
| "grad_norm": 7.958022117614746, |
| "learning_rate": 2.98e-05, |
| "loss": 4.3972, |
| "step": 405 |
| }, |
| { |
| "epoch": 0.41, |
| "grad_norm": 14.175585746765137, |
| "learning_rate": 2.955e-05, |
| "loss": 4.899, |
| "step": 410 |
| }, |
| { |
| "epoch": 0.415, |
| "grad_norm": 7.102541923522949, |
| "learning_rate": 2.93e-05, |
| "loss": 4.0319, |
| "step": 415 |
| }, |
| { |
| "epoch": 0.42, |
| "grad_norm": 10.61821460723877, |
| "learning_rate": 2.9049999999999998e-05, |
| "loss": 4.3293, |
| "step": 420 |
| }, |
| { |
| "epoch": 0.425, |
| "grad_norm": 9.615349769592285, |
| "learning_rate": 2.88e-05, |
| "loss": 4.5338, |
| "step": 425 |
| }, |
| { |
| "epoch": 0.43, |
| "grad_norm": 6.818640232086182, |
| "learning_rate": 2.855e-05, |
| "loss": 4.7826, |
| "step": 430 |
| }, |
| { |
| "epoch": 0.435, |
| "grad_norm": 12.359649658203125, |
| "learning_rate": 2.83e-05, |
| "loss": 4.2227, |
| "step": 435 |
| }, |
| { |
| "epoch": 0.44, |
| "grad_norm": 24.38874053955078, |
| "learning_rate": 2.8050000000000004e-05, |
| "loss": 4.7491, |
| "step": 440 |
| }, |
| { |
| "epoch": 0.445, |
| "grad_norm": 12.400574684143066, |
| "learning_rate": 2.7800000000000005e-05, |
| "loss": 4.91, |
| "step": 445 |
| }, |
| { |
| "epoch": 0.45, |
| "grad_norm": 6.146411418914795, |
| "learning_rate": 2.7550000000000002e-05, |
| "loss": 4.5604, |
| "step": 450 |
| }, |
| { |
| "epoch": 0.455, |
| "grad_norm": 10.598999977111816, |
| "learning_rate": 2.7300000000000003e-05, |
| "loss": 4.5914, |
| "step": 455 |
| }, |
| { |
| "epoch": 0.46, |
| "grad_norm": 6.878369331359863, |
| "learning_rate": 2.7050000000000004e-05, |
| "loss": 4.5466, |
| "step": 460 |
| }, |
| { |
| "epoch": 0.465, |
| "grad_norm": 20.473426818847656, |
| "learning_rate": 2.6800000000000004e-05, |
| "loss": 5.1533, |
| "step": 465 |
| }, |
| { |
| "epoch": 0.47, |
| "grad_norm": 10.888360977172852, |
| "learning_rate": 2.655e-05, |
| "loss": 4.6344, |
| "step": 470 |
| }, |
| { |
| "epoch": 0.475, |
| "grad_norm": 9.319945335388184, |
| "learning_rate": 2.6300000000000002e-05, |
| "loss": 4.9767, |
| "step": 475 |
| }, |
| { |
| "epoch": 0.48, |
| "grad_norm": 8.176156997680664, |
| "learning_rate": 2.6050000000000003e-05, |
| "loss": 4.3192, |
| "step": 480 |
| }, |
| { |
| "epoch": 0.485, |
| "grad_norm": 21.128265380859375, |
| "learning_rate": 2.58e-05, |
| "loss": 4.7935, |
| "step": 485 |
| }, |
| { |
| "epoch": 0.49, |
| "grad_norm": 12.109440803527832, |
| "learning_rate": 2.555e-05, |
| "loss": 4.54, |
| "step": 490 |
| }, |
| { |
| "epoch": 0.495, |
| "grad_norm": 9.297119140625, |
| "learning_rate": 2.5300000000000002e-05, |
| "loss": 4.4552, |
| "step": 495 |
| }, |
| { |
| "epoch": 0.5, |
| "grad_norm": 7.4618635177612305, |
| "learning_rate": 2.5050000000000002e-05, |
| "loss": 4.1316, |
| "step": 500 |
| }, |
| { |
| "epoch": 0.505, |
| "grad_norm": 12.802547454833984, |
| "learning_rate": 2.48e-05, |
| "loss": 5.2916, |
| "step": 505 |
| }, |
| { |
| "epoch": 0.51, |
| "grad_norm": 14.957279205322266, |
| "learning_rate": 2.455e-05, |
| "loss": 4.887, |
| "step": 510 |
| }, |
| { |
| "epoch": 0.515, |
| "grad_norm": 20.8640079498291, |
| "learning_rate": 2.43e-05, |
| "loss": 5.0191, |
| "step": 515 |
| }, |
| { |
| "epoch": 0.52, |
| "grad_norm": 13.2233304977417, |
| "learning_rate": 2.4050000000000002e-05, |
| "loss": 5.7143, |
| "step": 520 |
| }, |
| { |
| "epoch": 0.525, |
| "grad_norm": 10.220726013183594, |
| "learning_rate": 2.38e-05, |
| "loss": 5.0084, |
| "step": 525 |
| }, |
| { |
| "epoch": 0.53, |
| "grad_norm": 10.89086627960205, |
| "learning_rate": 2.355e-05, |
| "loss": 4.6592, |
| "step": 530 |
| }, |
| { |
| "epoch": 0.535, |
| "grad_norm": 15.364632606506348, |
| "learning_rate": 2.3300000000000004e-05, |
| "loss": 4.6751, |
| "step": 535 |
| }, |
| { |
| "epoch": 0.54, |
| "grad_norm": 14.81102180480957, |
| "learning_rate": 2.305e-05, |
| "loss": 4.5612, |
| "step": 540 |
| }, |
| { |
| "epoch": 0.545, |
| "grad_norm": 16.8745174407959, |
| "learning_rate": 2.2800000000000002e-05, |
| "loss": 4.486, |
| "step": 545 |
| }, |
| { |
| "epoch": 0.55, |
| "grad_norm": 5.1208014488220215, |
| "learning_rate": 2.2550000000000003e-05, |
| "loss": 4.6282, |
| "step": 550 |
| }, |
| { |
| "epoch": 0.555, |
| "grad_norm": 3.00327467918396, |
| "learning_rate": 2.23e-05, |
| "loss": 4.5903, |
| "step": 555 |
| }, |
| { |
| "epoch": 0.56, |
| "grad_norm": 11.089088439941406, |
| "learning_rate": 2.205e-05, |
| "loss": 4.8291, |
| "step": 560 |
| }, |
| { |
| "epoch": 0.565, |
| "grad_norm": 5.438385963439941, |
| "learning_rate": 2.18e-05, |
| "loss": 5.6635, |
| "step": 565 |
| }, |
| { |
| "epoch": 0.57, |
| "grad_norm": 9.857734680175781, |
| "learning_rate": 2.1550000000000002e-05, |
| "loss": 4.2215, |
| "step": 570 |
| }, |
| { |
| "epoch": 0.575, |
| "grad_norm": 6.085028171539307, |
| "learning_rate": 2.13e-05, |
| "loss": 3.4748, |
| "step": 575 |
| }, |
| { |
| "epoch": 0.58, |
| "grad_norm": 8.20434856414795, |
| "learning_rate": 2.105e-05, |
| "loss": 3.9847, |
| "step": 580 |
| }, |
| { |
| "epoch": 0.585, |
| "grad_norm": 6.412227630615234, |
| "learning_rate": 2.08e-05, |
| "loss": 4.2073, |
| "step": 585 |
| }, |
| { |
| "epoch": 0.59, |
| "grad_norm": 11.259173393249512, |
| "learning_rate": 2.055e-05, |
| "loss": 4.5817, |
| "step": 590 |
| }, |
| { |
| "epoch": 0.595, |
| "grad_norm": 6.3808979988098145, |
| "learning_rate": 2.0300000000000002e-05, |
| "loss": 4.5585, |
| "step": 595 |
| }, |
| { |
| "epoch": 0.6, |
| "grad_norm": 14.231278419494629, |
| "learning_rate": 2.0050000000000003e-05, |
| "loss": 4.6529, |
| "step": 600 |
| }, |
| { |
| "epoch": 0.605, |
| "grad_norm": 3.1947903633117676, |
| "learning_rate": 1.9800000000000004e-05, |
| "loss": 4.3442, |
| "step": 605 |
| }, |
| { |
| "epoch": 0.61, |
| "grad_norm": 19.08649253845215, |
| "learning_rate": 1.955e-05, |
| "loss": 5.5732, |
| "step": 610 |
| }, |
| { |
| "epoch": 0.615, |
| "grad_norm": 15.618348121643066, |
| "learning_rate": 1.93e-05, |
| "loss": 5.3036, |
| "step": 615 |
| }, |
| { |
| "epoch": 0.62, |
| "grad_norm": 18.417282104492188, |
| "learning_rate": 1.9050000000000002e-05, |
| "loss": 4.6238, |
| "step": 620 |
| }, |
| { |
| "epoch": 0.625, |
| "grad_norm": 4.049787521362305, |
| "learning_rate": 1.88e-05, |
| "loss": 4.3357, |
| "step": 625 |
| }, |
| { |
| "epoch": 0.63, |
| "grad_norm": 8.666927337646484, |
| "learning_rate": 1.855e-05, |
| "loss": 4.0803, |
| "step": 630 |
| }, |
| { |
| "epoch": 0.635, |
| "grad_norm": 7.594979763031006, |
| "learning_rate": 1.83e-05, |
| "loss": 4.1425, |
| "step": 635 |
| }, |
| { |
| "epoch": 0.64, |
| "grad_norm": 9.953749656677246, |
| "learning_rate": 1.805e-05, |
| "loss": 4.0354, |
| "step": 640 |
| }, |
| { |
| "epoch": 0.645, |
| "grad_norm": 14.118829727172852, |
| "learning_rate": 1.78e-05, |
| "loss": 4.8014, |
| "step": 645 |
| }, |
| { |
| "epoch": 0.65, |
| "grad_norm": 3.095034599304199, |
| "learning_rate": 1.755e-05, |
| "loss": 3.581, |
| "step": 650 |
| }, |
| { |
| "epoch": 0.655, |
| "grad_norm": 8.881584167480469, |
| "learning_rate": 1.73e-05, |
| "loss": 4.6398, |
| "step": 655 |
| }, |
| { |
| "epoch": 0.66, |
| "grad_norm": 8.370692253112793, |
| "learning_rate": 1.705e-05, |
| "loss": 4.545, |
| "step": 660 |
| }, |
| { |
| "epoch": 0.665, |
| "grad_norm": 21.478994369506836, |
| "learning_rate": 1.6800000000000002e-05, |
| "loss": 4.8406, |
| "step": 665 |
| }, |
| { |
| "epoch": 0.67, |
| "grad_norm": 12.012208938598633, |
| "learning_rate": 1.6550000000000002e-05, |
| "loss": 4.5001, |
| "step": 670 |
| }, |
| { |
| "epoch": 0.675, |
| "grad_norm": 18.751039505004883, |
| "learning_rate": 1.63e-05, |
| "loss": 6.0012, |
| "step": 675 |
| }, |
| { |
| "epoch": 0.68, |
| "grad_norm": 12.919386863708496, |
| "learning_rate": 1.605e-05, |
| "loss": 4.7294, |
| "step": 680 |
| }, |
| { |
| "epoch": 0.685, |
| "grad_norm": 3.2335610389709473, |
| "learning_rate": 1.58e-05, |
| "loss": 4.6821, |
| "step": 685 |
| }, |
| { |
| "epoch": 0.69, |
| "grad_norm": 16.39644432067871, |
| "learning_rate": 1.5550000000000002e-05, |
| "loss": 5.0447, |
| "step": 690 |
| }, |
| { |
| "epoch": 0.695, |
| "grad_norm": 8.815535545349121, |
| "learning_rate": 1.53e-05, |
| "loss": 4.4124, |
| "step": 695 |
| }, |
| { |
| "epoch": 0.7, |
| "grad_norm": 6.829135894775391, |
| "learning_rate": 1.505e-05, |
| "loss": 4.3269, |
| "step": 700 |
| }, |
| { |
| "epoch": 0.705, |
| "grad_norm": 15.070808410644531, |
| "learning_rate": 1.48e-05, |
| "loss": 4.9741, |
| "step": 705 |
| }, |
| { |
| "epoch": 0.71, |
| "grad_norm": 6.726645469665527, |
| "learning_rate": 1.455e-05, |
| "loss": 4.3877, |
| "step": 710 |
| }, |
| { |
| "epoch": 0.715, |
| "grad_norm": 11.980883598327637, |
| "learning_rate": 1.43e-05, |
| "loss": 4.532, |
| "step": 715 |
| }, |
| { |
| "epoch": 0.72, |
| "grad_norm": 9.280970573425293, |
| "learning_rate": 1.4050000000000003e-05, |
| "loss": 4.6453, |
| "step": 720 |
| }, |
| { |
| "epoch": 0.725, |
| "grad_norm": 13.877763748168945, |
| "learning_rate": 1.3800000000000002e-05, |
| "loss": 3.898, |
| "step": 725 |
| }, |
| { |
| "epoch": 0.73, |
| "grad_norm": 17.35505485534668, |
| "learning_rate": 1.3550000000000002e-05, |
| "loss": 4.4691, |
| "step": 730 |
| }, |
| { |
| "epoch": 0.735, |
| "grad_norm": 6.942783355712891, |
| "learning_rate": 1.3300000000000001e-05, |
| "loss": 4.7933, |
| "step": 735 |
| }, |
| { |
| "epoch": 0.74, |
| "grad_norm": 3.886617660522461, |
| "learning_rate": 1.305e-05, |
| "loss": 5.0594, |
| "step": 740 |
| }, |
| { |
| "epoch": 0.745, |
| "grad_norm": 11.779265403747559, |
| "learning_rate": 1.2800000000000001e-05, |
| "loss": 3.881, |
| "step": 745 |
| }, |
| { |
| "epoch": 0.75, |
| "grad_norm": 10.424500465393066, |
| "learning_rate": 1.255e-05, |
| "loss": 5.0379, |
| "step": 750 |
| }, |
| { |
| "epoch": 0.755, |
| "grad_norm": 9.827173233032227, |
| "learning_rate": 1.23e-05, |
| "loss": 4.4965, |
| "step": 755 |
| }, |
| { |
| "epoch": 0.76, |
| "grad_norm": 12.023284912109375, |
| "learning_rate": 1.205e-05, |
| "loss": 3.7343, |
| "step": 760 |
| }, |
| { |
| "epoch": 0.765, |
| "grad_norm": 20.79689598083496, |
| "learning_rate": 1.18e-05, |
| "loss": 4.9825, |
| "step": 765 |
| }, |
| { |
| "epoch": 0.77, |
| "grad_norm": 14.315791130065918, |
| "learning_rate": 1.1550000000000001e-05, |
| "loss": 4.2083, |
| "step": 770 |
| }, |
| { |
| "epoch": 0.775, |
| "grad_norm": 41.02420425415039, |
| "learning_rate": 1.13e-05, |
| "loss": 4.4618, |
| "step": 775 |
| }, |
| { |
| "epoch": 0.78, |
| "grad_norm": 3.5200512409210205, |
| "learning_rate": 1.1050000000000001e-05, |
| "loss": 4.1416, |
| "step": 780 |
| }, |
| { |
| "epoch": 0.785, |
| "grad_norm": 12.9066162109375, |
| "learning_rate": 1.08e-05, |
| "loss": 4.8546, |
| "step": 785 |
| }, |
| { |
| "epoch": 0.79, |
| "grad_norm": 6.802006721496582, |
| "learning_rate": 1.055e-05, |
| "loss": 5.0877, |
| "step": 790 |
| }, |
| { |
| "epoch": 0.795, |
| "grad_norm": 3.493497610092163, |
| "learning_rate": 1.03e-05, |
| "loss": 4.5689, |
| "step": 795 |
| }, |
| { |
| "epoch": 0.8, |
| "grad_norm": 7.396031856536865, |
| "learning_rate": 1.005e-05, |
| "loss": 4.9949, |
| "step": 800 |
| }, |
| { |
| "epoch": 0.805, |
| "grad_norm": 6.319110870361328, |
| "learning_rate": 9.800000000000001e-06, |
| "loss": 4.0997, |
| "step": 805 |
| }, |
| { |
| "epoch": 0.81, |
| "grad_norm": 11.769895553588867, |
| "learning_rate": 9.55e-06, |
| "loss": 4.5879, |
| "step": 810 |
| }, |
| { |
| "epoch": 0.815, |
| "grad_norm": 8.29397201538086, |
| "learning_rate": 9.3e-06, |
| "loss": 4.5328, |
| "step": 815 |
| }, |
| { |
| "epoch": 0.82, |
| "grad_norm": 12.618614196777344, |
| "learning_rate": 9.05e-06, |
| "loss": 4.5797, |
| "step": 820 |
| }, |
| { |
| "epoch": 0.825, |
| "grad_norm": 9.412469863891602, |
| "learning_rate": 8.8e-06, |
| "loss": 5.1889, |
| "step": 825 |
| }, |
| { |
| "epoch": 0.83, |
| "grad_norm": 6.6949238777160645, |
| "learning_rate": 8.550000000000001e-06, |
| "loss": 4.3719, |
| "step": 830 |
| }, |
| { |
| "epoch": 0.835, |
| "grad_norm": 11.582685470581055, |
| "learning_rate": 8.3e-06, |
| "loss": 4.5434, |
| "step": 835 |
| }, |
| { |
| "epoch": 0.84, |
| "grad_norm": 15.929965019226074, |
| "learning_rate": 8.050000000000001e-06, |
| "loss": 4.6556, |
| "step": 840 |
| }, |
| { |
| "epoch": 0.845, |
| "grad_norm": 10.424652099609375, |
| "learning_rate": 7.8e-06, |
| "loss": 4.4395, |
| "step": 845 |
| }, |
| { |
| "epoch": 0.85, |
| "grad_norm": 16.287343978881836, |
| "learning_rate": 7.55e-06, |
| "loss": 4.1417, |
| "step": 850 |
| }, |
| { |
| "epoch": 0.855, |
| "grad_norm": 6.144033432006836, |
| "learning_rate": 7.2999999999999996e-06, |
| "loss": 5.6976, |
| "step": 855 |
| }, |
| { |
| "epoch": 0.86, |
| "grad_norm": 8.335988998413086, |
| "learning_rate": 7.049999999999999e-06, |
| "loss": 4.696, |
| "step": 860 |
| }, |
| { |
| "epoch": 0.865, |
| "grad_norm": 13.787694931030273, |
| "learning_rate": 6.800000000000001e-06, |
| "loss": 4.9442, |
| "step": 865 |
| }, |
| { |
| "epoch": 0.87, |
| "grad_norm": 6.451244831085205, |
| "learning_rate": 6.550000000000001e-06, |
| "loss": 4.4329, |
| "step": 870 |
| }, |
| { |
| "epoch": 0.875, |
| "grad_norm": 6.438804626464844, |
| "learning_rate": 6.300000000000001e-06, |
| "loss": 4.7078, |
| "step": 875 |
| }, |
| { |
| "epoch": 0.88, |
| "grad_norm": 16.51960563659668, |
| "learning_rate": 6.0500000000000005e-06, |
| "loss": 4.3228, |
| "step": 880 |
| }, |
| { |
| "epoch": 0.885, |
| "grad_norm": 11.381933212280273, |
| "learning_rate": 5.8e-06, |
| "loss": 4.7362, |
| "step": 885 |
| }, |
| { |
| "epoch": 0.89, |
| "grad_norm": 9.30029010772705, |
| "learning_rate": 5.55e-06, |
| "loss": 4.1752, |
| "step": 890 |
| }, |
| { |
| "epoch": 0.895, |
| "grad_norm": 9.520707130432129, |
| "learning_rate": 5.3e-06, |
| "loss": 4.4088, |
| "step": 895 |
| }, |
| { |
| "epoch": 0.9, |
| "grad_norm": 8.361503601074219, |
| "learning_rate": 5.050000000000001e-06, |
| "loss": 5.2401, |
| "step": 900 |
| }, |
| { |
| "epoch": 0.905, |
| "grad_norm": 15.447181701660156, |
| "learning_rate": 4.800000000000001e-06, |
| "loss": 4.5311, |
| "step": 905 |
| }, |
| { |
| "epoch": 0.91, |
| "grad_norm": 16.983518600463867, |
| "learning_rate": 4.5500000000000005e-06, |
| "loss": 4.8463, |
| "step": 910 |
| }, |
| { |
| "epoch": 0.915, |
| "grad_norm": 14.654956817626953, |
| "learning_rate": 4.2999999999999995e-06, |
| "loss": 4.3627, |
| "step": 915 |
| }, |
| { |
| "epoch": 0.92, |
| "grad_norm": 7.659664630889893, |
| "learning_rate": 4.05e-06, |
| "loss": 4.8368, |
| "step": 920 |
| }, |
| { |
| "epoch": 0.925, |
| "grad_norm": 5.867129802703857, |
| "learning_rate": 3.8e-06, |
| "loss": 4.3896, |
| "step": 925 |
| }, |
| { |
| "epoch": 0.93, |
| "grad_norm": 14.801767349243164, |
| "learning_rate": 3.55e-06, |
| "loss": 4.989, |
| "step": 930 |
| }, |
| { |
| "epoch": 0.935, |
| "grad_norm": 6.913944244384766, |
| "learning_rate": 3.3e-06, |
| "loss": 4.5205, |
| "step": 935 |
| }, |
| { |
| "epoch": 0.94, |
| "grad_norm": 5.782718181610107, |
| "learning_rate": 3.05e-06, |
| "loss": 4.7444, |
| "step": 940 |
| }, |
| { |
| "epoch": 0.945, |
| "grad_norm": 3.508906364440918, |
| "learning_rate": 2.8000000000000003e-06, |
| "loss": 4.2515, |
| "step": 945 |
| }, |
| { |
| "epoch": 0.95, |
| "grad_norm": 22.21052360534668, |
| "learning_rate": 2.55e-06, |
| "loss": 4.4148, |
| "step": 950 |
| }, |
| { |
| "epoch": 0.955, |
| "grad_norm": 3.518575429916382, |
| "learning_rate": 2.3e-06, |
| "loss": 4.9844, |
| "step": 955 |
| }, |
| { |
| "epoch": 0.96, |
| "grad_norm": 6.860954761505127, |
| "learning_rate": 2.0500000000000003e-06, |
| "loss": 4.9682, |
| "step": 960 |
| }, |
| { |
| "epoch": 0.965, |
| "grad_norm": 6.980912685394287, |
| "learning_rate": 1.8e-06, |
| "loss": 5.5068, |
| "step": 965 |
| }, |
| { |
| "epoch": 0.97, |
| "grad_norm": 16.69953727722168, |
| "learning_rate": 1.55e-06, |
| "loss": 4.232, |
| "step": 970 |
| }, |
| { |
| "epoch": 0.975, |
| "grad_norm": 4.831995487213135, |
| "learning_rate": 1.3e-06, |
| "loss": 5.1479, |
| "step": 975 |
| }, |
| { |
| "epoch": 0.98, |
| "grad_norm": 3.615955114364624, |
| "learning_rate": 1.0500000000000001e-06, |
| "loss": 4.1974, |
| "step": 980 |
| }, |
| { |
| "epoch": 0.985, |
| "grad_norm": 8.048654556274414, |
| "learning_rate": 8.000000000000001e-07, |
| "loss": 3.8064, |
| "step": 985 |
| }, |
| { |
| "epoch": 0.99, |
| "grad_norm": 14.857714653015137, |
| "learning_rate": 5.5e-07, |
| "loss": 4.3121, |
| "step": 990 |
| }, |
| { |
| "epoch": 0.995, |
| "grad_norm": 33.41465759277344, |
| "learning_rate": 3.0000000000000004e-07, |
| "loss": 5.6582, |
| "step": 995 |
| }, |
| { |
| "epoch": 1.0, |
| "grad_norm": 19.370040893554688, |
| "learning_rate": 5.0000000000000004e-08, |
| "loss": 4.2669, |
| "step": 1000 |
| } |
| ], |
| "logging_steps": 5, |
| "max_steps": 1000, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 1, |
| "save_steps": 100, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": true |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 0.0, |
| "train_batch_size": 1, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|