| { |
| "best_global_step": null, |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 5.887573964497041, |
| "eval_steps": 500, |
| "global_step": 500, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.05917159763313609, |
| "grad_norm": 1.1964364051818848, |
| "learning_rate": 8e-05, |
| "loss": 2.1543, |
| "step": 5 |
| }, |
| { |
| "epoch": 0.11834319526627218, |
| "grad_norm": 1.1647157669067383, |
| "learning_rate": 0.00018, |
| "loss": 1.6346, |
| "step": 10 |
| }, |
| { |
| "epoch": 0.17751479289940827, |
| "grad_norm": 0.6181473135948181, |
| "learning_rate": 0.00019904761904761907, |
| "loss": 0.9978, |
| "step": 15 |
| }, |
| { |
| "epoch": 0.23668639053254437, |
| "grad_norm": 0.5694869160652161, |
| "learning_rate": 0.00019785714285714288, |
| "loss": 0.9807, |
| "step": 20 |
| }, |
| { |
| "epoch": 0.2958579881656805, |
| "grad_norm": 0.6708640456199646, |
| "learning_rate": 0.00019666666666666666, |
| "loss": 1.2485, |
| "step": 25 |
| }, |
| { |
| "epoch": 0.35502958579881655, |
| "grad_norm": 0.7021521925926208, |
| "learning_rate": 0.00019547619047619047, |
| "loss": 1.1107, |
| "step": 30 |
| }, |
| { |
| "epoch": 0.41420118343195267, |
| "grad_norm": 0.5067740082740784, |
| "learning_rate": 0.0001942857142857143, |
| "loss": 1.1293, |
| "step": 35 |
| }, |
| { |
| "epoch": 0.47337278106508873, |
| "grad_norm": 0.5455656051635742, |
| "learning_rate": 0.0001930952380952381, |
| "loss": 1.1371, |
| "step": 40 |
| }, |
| { |
| "epoch": 0.5325443786982249, |
| "grad_norm": 0.6190764307975769, |
| "learning_rate": 0.00019190476190476192, |
| "loss": 1.0131, |
| "step": 45 |
| }, |
| { |
| "epoch": 0.591715976331361, |
| "grad_norm": 0.544291615486145, |
| "learning_rate": 0.00019071428571428573, |
| "loss": 0.969, |
| "step": 50 |
| }, |
| { |
| "epoch": 0.650887573964497, |
| "grad_norm": 0.600204348564148, |
| "learning_rate": 0.0001895238095238095, |
| "loss": 1.0509, |
| "step": 55 |
| }, |
| { |
| "epoch": 0.7100591715976331, |
| "grad_norm": 0.5397897958755493, |
| "learning_rate": 0.00018833333333333335, |
| "loss": 0.9839, |
| "step": 60 |
| }, |
| { |
| "epoch": 0.7692307692307693, |
| "grad_norm": 0.571107804775238, |
| "learning_rate": 0.00018714285714285716, |
| "loss": 0.9281, |
| "step": 65 |
| }, |
| { |
| "epoch": 0.8284023668639053, |
| "grad_norm": 0.5789744257926941, |
| "learning_rate": 0.00018595238095238097, |
| "loss": 0.9543, |
| "step": 70 |
| }, |
| { |
| "epoch": 0.8875739644970414, |
| "grad_norm": 0.6334110498428345, |
| "learning_rate": 0.00018476190476190478, |
| "loss": 1.0625, |
| "step": 75 |
| }, |
| { |
| "epoch": 0.9467455621301775, |
| "grad_norm": 0.5225853323936462, |
| "learning_rate": 0.00018357142857142858, |
| "loss": 0.9734, |
| "step": 80 |
| }, |
| { |
| "epoch": 1.0, |
| "grad_norm": 0.7854479551315308, |
| "learning_rate": 0.0001823809523809524, |
| "loss": 0.9452, |
| "step": 85 |
| }, |
| { |
| "epoch": 1.0591715976331362, |
| "grad_norm": 0.47304806113243103, |
| "learning_rate": 0.0001811904761904762, |
| "loss": 0.8473, |
| "step": 90 |
| }, |
| { |
| "epoch": 1.1183431952662721, |
| "grad_norm": 0.6840282678604126, |
| "learning_rate": 0.00018, |
| "loss": 0.6189, |
| "step": 95 |
| }, |
| { |
| "epoch": 1.1775147928994083, |
| "grad_norm": 0.5173171162605286, |
| "learning_rate": 0.00017880952380952382, |
| "loss": 0.9301, |
| "step": 100 |
| }, |
| { |
| "epoch": 1.2366863905325443, |
| "grad_norm": 0.4892285466194153, |
| "learning_rate": 0.00017761904761904763, |
| "loss": 0.7659, |
| "step": 105 |
| }, |
| { |
| "epoch": 1.2958579881656804, |
| "grad_norm": 0.5754849910736084, |
| "learning_rate": 0.00017642857142857144, |
| "loss": 0.7756, |
| "step": 110 |
| }, |
| { |
| "epoch": 1.3550295857988166, |
| "grad_norm": 0.47277289628982544, |
| "learning_rate": 0.00017523809523809525, |
| "loss": 0.7844, |
| "step": 115 |
| }, |
| { |
| "epoch": 1.4142011834319526, |
| "grad_norm": 0.8198840022087097, |
| "learning_rate": 0.00017404761904761906, |
| "loss": 0.8969, |
| "step": 120 |
| }, |
| { |
| "epoch": 1.4733727810650887, |
| "grad_norm": 0.5040334463119507, |
| "learning_rate": 0.00017285714285714287, |
| "loss": 0.8727, |
| "step": 125 |
| }, |
| { |
| "epoch": 1.532544378698225, |
| "grad_norm": 0.5382494926452637, |
| "learning_rate": 0.00017166666666666667, |
| "loss": 0.7677, |
| "step": 130 |
| }, |
| { |
| "epoch": 1.5917159763313609, |
| "grad_norm": 0.631537914276123, |
| "learning_rate": 0.00017047619047619048, |
| "loss": 0.8364, |
| "step": 135 |
| }, |
| { |
| "epoch": 1.650887573964497, |
| "grad_norm": 0.5718739628791809, |
| "learning_rate": 0.0001692857142857143, |
| "loss": 0.7447, |
| "step": 140 |
| }, |
| { |
| "epoch": 1.7100591715976332, |
| "grad_norm": 0.557224452495575, |
| "learning_rate": 0.0001680952380952381, |
| "loss": 0.8828, |
| "step": 145 |
| }, |
| { |
| "epoch": 1.7692307692307692, |
| "grad_norm": 0.6206871271133423, |
| "learning_rate": 0.0001669047619047619, |
| "loss": 0.5674, |
| "step": 150 |
| }, |
| { |
| "epoch": 1.8284023668639053, |
| "grad_norm": 0.6297276616096497, |
| "learning_rate": 0.00016571428571428575, |
| "loss": 0.7956, |
| "step": 155 |
| }, |
| { |
| "epoch": 1.8875739644970415, |
| "grad_norm": 0.6178033351898193, |
| "learning_rate": 0.00016452380952380953, |
| "loss": 0.5527, |
| "step": 160 |
| }, |
| { |
| "epoch": 1.9467455621301775, |
| "grad_norm": 0.6269710063934326, |
| "learning_rate": 0.00016333333333333334, |
| "loss": 0.9635, |
| "step": 165 |
| }, |
| { |
| "epoch": 2.0, |
| "grad_norm": 0.7290076613426208, |
| "learning_rate": 0.00016214285714285715, |
| "loss": 0.8048, |
| "step": 170 |
| }, |
| { |
| "epoch": 2.059171597633136, |
| "grad_norm": 0.5376546382904053, |
| "learning_rate": 0.00016095238095238096, |
| "loss": 0.602, |
| "step": 175 |
| }, |
| { |
| "epoch": 2.1183431952662723, |
| "grad_norm": 0.720078706741333, |
| "learning_rate": 0.0001597619047619048, |
| "loss": 0.5738, |
| "step": 180 |
| }, |
| { |
| "epoch": 2.1775147928994083, |
| "grad_norm": 0.5647716522216797, |
| "learning_rate": 0.00015857142857142857, |
| "loss": 0.5445, |
| "step": 185 |
| }, |
| { |
| "epoch": 2.2366863905325443, |
| "grad_norm": 0.7397224307060242, |
| "learning_rate": 0.00015738095238095238, |
| "loss": 0.5383, |
| "step": 190 |
| }, |
| { |
| "epoch": 2.2958579881656807, |
| "grad_norm": 0.8834079504013062, |
| "learning_rate": 0.0001561904761904762, |
| "loss": 0.6575, |
| "step": 195 |
| }, |
| { |
| "epoch": 2.3550295857988166, |
| "grad_norm": 0.6497870683670044, |
| "learning_rate": 0.000155, |
| "loss": 0.6677, |
| "step": 200 |
| }, |
| { |
| "epoch": 2.4142011834319526, |
| "grad_norm": 0.686392605304718, |
| "learning_rate": 0.00015380952380952384, |
| "loss": 0.493, |
| "step": 205 |
| }, |
| { |
| "epoch": 2.4733727810650885, |
| "grad_norm": 0.719688892364502, |
| "learning_rate": 0.00015261904761904762, |
| "loss": 0.5356, |
| "step": 210 |
| }, |
| { |
| "epoch": 2.532544378698225, |
| "grad_norm": 0.6884217262268066, |
| "learning_rate": 0.00015142857142857143, |
| "loss": 0.7167, |
| "step": 215 |
| }, |
| { |
| "epoch": 2.591715976331361, |
| "grad_norm": 0.7767056822776794, |
| "learning_rate": 0.00015023809523809524, |
| "loss": 0.7346, |
| "step": 220 |
| }, |
| { |
| "epoch": 2.6508875739644973, |
| "grad_norm": 0.6508312225341797, |
| "learning_rate": 0.00014904761904761904, |
| "loss": 0.547, |
| "step": 225 |
| }, |
| { |
| "epoch": 2.710059171597633, |
| "grad_norm": 0.6159693598747253, |
| "learning_rate": 0.00014785714285714288, |
| "loss": 0.5539, |
| "step": 230 |
| }, |
| { |
| "epoch": 2.769230769230769, |
| "grad_norm": 0.7028509378433228, |
| "learning_rate": 0.00014666666666666666, |
| "loss": 0.609, |
| "step": 235 |
| }, |
| { |
| "epoch": 2.828402366863905, |
| "grad_norm": 0.6096014976501465, |
| "learning_rate": 0.00014547619047619047, |
| "loss": 0.5913, |
| "step": 240 |
| }, |
| { |
| "epoch": 2.8875739644970415, |
| "grad_norm": 0.8518397212028503, |
| "learning_rate": 0.00014428571428571428, |
| "loss": 0.6356, |
| "step": 245 |
| }, |
| { |
| "epoch": 2.9467455621301775, |
| "grad_norm": 0.6462046504020691, |
| "learning_rate": 0.00014309523809523812, |
| "loss": 0.527, |
| "step": 250 |
| }, |
| { |
| "epoch": 3.0, |
| "grad_norm": 0.8931583762168884, |
| "learning_rate": 0.00014190476190476193, |
| "loss": 0.6505, |
| "step": 255 |
| }, |
| { |
| "epoch": 3.059171597633136, |
| "grad_norm": 0.6089041233062744, |
| "learning_rate": 0.00014071428571428573, |
| "loss": 0.4721, |
| "step": 260 |
| }, |
| { |
| "epoch": 3.1183431952662723, |
| "grad_norm": 0.9845924973487854, |
| "learning_rate": 0.00013952380952380952, |
| "loss": 0.4478, |
| "step": 265 |
| }, |
| { |
| "epoch": 3.1775147928994083, |
| "grad_norm": 0.6962316036224365, |
| "learning_rate": 0.00013833333333333333, |
| "loss": 0.3862, |
| "step": 270 |
| }, |
| { |
| "epoch": 3.2366863905325443, |
| "grad_norm": 0.6963745951652527, |
| "learning_rate": 0.00013714285714285716, |
| "loss": 0.5446, |
| "step": 275 |
| }, |
| { |
| "epoch": 3.2958579881656807, |
| "grad_norm": 0.9289587736129761, |
| "learning_rate": 0.00013595238095238097, |
| "loss": 0.4985, |
| "step": 280 |
| }, |
| { |
| "epoch": 3.3550295857988166, |
| "grad_norm": 0.7913327813148499, |
| "learning_rate": 0.00013476190476190478, |
| "loss": 0.4291, |
| "step": 285 |
| }, |
| { |
| "epoch": 3.4142011834319526, |
| "grad_norm": 0.7623841166496277, |
| "learning_rate": 0.00013357142857142856, |
| "loss": 0.4198, |
| "step": 290 |
| }, |
| { |
| "epoch": 3.4733727810650885, |
| "grad_norm": 1.1334826946258545, |
| "learning_rate": 0.00013238095238095237, |
| "loss": 0.4442, |
| "step": 295 |
| }, |
| { |
| "epoch": 3.532544378698225, |
| "grad_norm": 0.8162091374397278, |
| "learning_rate": 0.0001311904761904762, |
| "loss": 0.4249, |
| "step": 300 |
| }, |
| { |
| "epoch": 3.591715976331361, |
| "grad_norm": 0.7582007646560669, |
| "learning_rate": 0.00013000000000000002, |
| "loss": 0.4166, |
| "step": 305 |
| }, |
| { |
| "epoch": 3.6508875739644973, |
| "grad_norm": 0.8337474465370178, |
| "learning_rate": 0.00012880952380952382, |
| "loss": 0.3552, |
| "step": 310 |
| }, |
| { |
| "epoch": 3.710059171597633, |
| "grad_norm": 0.7497977018356323, |
| "learning_rate": 0.0001276190476190476, |
| "loss": 0.3778, |
| "step": 315 |
| }, |
| { |
| "epoch": 3.769230769230769, |
| "grad_norm": 0.9030293226242065, |
| "learning_rate": 0.00012642857142857144, |
| "loss": 0.4048, |
| "step": 320 |
| }, |
| { |
| "epoch": 3.828402366863905, |
| "grad_norm": 0.8548532128334045, |
| "learning_rate": 0.00012523809523809525, |
| "loss": 0.5433, |
| "step": 325 |
| }, |
| { |
| "epoch": 3.8875739644970415, |
| "grad_norm": 1.1865911483764648, |
| "learning_rate": 0.00012404761904761906, |
| "loss": 0.4465, |
| "step": 330 |
| }, |
| { |
| "epoch": 3.9467455621301775, |
| "grad_norm": 0.6329714059829712, |
| "learning_rate": 0.00012285714285714287, |
| "loss": 0.499, |
| "step": 335 |
| }, |
| { |
| "epoch": 4.0, |
| "grad_norm": 0.8335389494895935, |
| "learning_rate": 0.00012166666666666667, |
| "loss": 0.3774, |
| "step": 340 |
| }, |
| { |
| "epoch": 4.059171597633136, |
| "grad_norm": 0.7739379405975342, |
| "learning_rate": 0.00012047619047619047, |
| "loss": 0.2472, |
| "step": 345 |
| }, |
| { |
| "epoch": 4.118343195266272, |
| "grad_norm": 1.0731953382492065, |
| "learning_rate": 0.00011928571428571428, |
| "loss": 0.3058, |
| "step": 350 |
| }, |
| { |
| "epoch": 4.177514792899408, |
| "grad_norm": 1.051379680633545, |
| "learning_rate": 0.0001180952380952381, |
| "loss": 0.3446, |
| "step": 355 |
| }, |
| { |
| "epoch": 4.236686390532545, |
| "grad_norm": 0.6324198842048645, |
| "learning_rate": 0.00011690476190476191, |
| "loss": 0.2351, |
| "step": 360 |
| }, |
| { |
| "epoch": 4.295857988165681, |
| "grad_norm": 0.9921632409095764, |
| "learning_rate": 0.00011571428571428574, |
| "loss": 0.2795, |
| "step": 365 |
| }, |
| { |
| "epoch": 4.355029585798817, |
| "grad_norm": 0.9360544085502625, |
| "learning_rate": 0.00011452380952380952, |
| "loss": 0.4056, |
| "step": 370 |
| }, |
| { |
| "epoch": 4.414201183431953, |
| "grad_norm": 0.956781268119812, |
| "learning_rate": 0.00011333333333333334, |
| "loss": 0.3001, |
| "step": 375 |
| }, |
| { |
| "epoch": 4.4733727810650885, |
| "grad_norm": 1.0604465007781982, |
| "learning_rate": 0.00011214285714285715, |
| "loss": 0.3972, |
| "step": 380 |
| }, |
| { |
| "epoch": 4.5325443786982245, |
| "grad_norm": 0.8613020181655884, |
| "learning_rate": 0.00011095238095238096, |
| "loss": 0.2828, |
| "step": 385 |
| }, |
| { |
| "epoch": 4.591715976331361, |
| "grad_norm": 0.666599690914154, |
| "learning_rate": 0.00010976190476190478, |
| "loss": 0.3187, |
| "step": 390 |
| }, |
| { |
| "epoch": 4.650887573964497, |
| "grad_norm": 0.7497467398643494, |
| "learning_rate": 0.00010857142857142856, |
| "loss": 0.4278, |
| "step": 395 |
| }, |
| { |
| "epoch": 4.710059171597633, |
| "grad_norm": 0.733259916305542, |
| "learning_rate": 0.00010738095238095239, |
| "loss": 0.2384, |
| "step": 400 |
| }, |
| { |
| "epoch": 4.769230769230769, |
| "grad_norm": 0.7570552229881287, |
| "learning_rate": 0.0001061904761904762, |
| "loss": 0.3938, |
| "step": 405 |
| }, |
| { |
| "epoch": 4.828402366863905, |
| "grad_norm": 0.8109162449836731, |
| "learning_rate": 0.000105, |
| "loss": 0.2729, |
| "step": 410 |
| }, |
| { |
| "epoch": 4.887573964497041, |
| "grad_norm": 0.7985107898712158, |
| "learning_rate": 0.00010380952380952383, |
| "loss": 0.3805, |
| "step": 415 |
| }, |
| { |
| "epoch": 4.946745562130177, |
| "grad_norm": 0.704366147518158, |
| "learning_rate": 0.00010261904761904761, |
| "loss": 0.2468, |
| "step": 420 |
| }, |
| { |
| "epoch": 5.0, |
| "grad_norm": 1.054413080215454, |
| "learning_rate": 0.00010142857142857143, |
| "loss": 0.2892, |
| "step": 425 |
| }, |
| { |
| "epoch": 5.059171597633136, |
| "grad_norm": 0.7603329420089722, |
| "learning_rate": 0.00010023809523809524, |
| "loss": 0.2865, |
| "step": 430 |
| }, |
| { |
| "epoch": 5.118343195266272, |
| "grad_norm": 0.9612646102905273, |
| "learning_rate": 9.904761904761905e-05, |
| "loss": 0.2215, |
| "step": 435 |
| }, |
| { |
| "epoch": 5.177514792899408, |
| "grad_norm": 0.8669071793556213, |
| "learning_rate": 9.785714285714286e-05, |
| "loss": 0.2078, |
| "step": 440 |
| }, |
| { |
| "epoch": 5.236686390532545, |
| "grad_norm": 0.7441051006317139, |
| "learning_rate": 9.666666666666667e-05, |
| "loss": 0.233, |
| "step": 445 |
| }, |
| { |
| "epoch": 5.295857988165681, |
| "grad_norm": 0.5900620818138123, |
| "learning_rate": 9.547619047619049e-05, |
| "loss": 0.2581, |
| "step": 450 |
| }, |
| { |
| "epoch": 5.355029585798817, |
| "grad_norm": 0.990178644657135, |
| "learning_rate": 9.428571428571429e-05, |
| "loss": 0.2603, |
| "step": 455 |
| }, |
| { |
| "epoch": 5.414201183431953, |
| "grad_norm": 0.7644340991973877, |
| "learning_rate": 9.309523809523811e-05, |
| "loss": 0.2021, |
| "step": 460 |
| }, |
| { |
| "epoch": 5.4733727810650885, |
| "grad_norm": 0.5087964534759521, |
| "learning_rate": 9.19047619047619e-05, |
| "loss": 0.1126, |
| "step": 465 |
| }, |
| { |
| "epoch": 5.5325443786982245, |
| "grad_norm": 0.7896738052368164, |
| "learning_rate": 9.071428571428571e-05, |
| "loss": 0.2084, |
| "step": 470 |
| }, |
| { |
| "epoch": 5.591715976331361, |
| "grad_norm": 0.71749347448349, |
| "learning_rate": 8.952380952380953e-05, |
| "loss": 0.1618, |
| "step": 475 |
| }, |
| { |
| "epoch": 5.650887573964497, |
| "grad_norm": 0.8466284871101379, |
| "learning_rate": 8.833333333333333e-05, |
| "loss": 0.2574, |
| "step": 480 |
| }, |
| { |
| "epoch": 5.710059171597633, |
| "grad_norm": 1.0023925304412842, |
| "learning_rate": 8.714285714285715e-05, |
| "loss": 0.1985, |
| "step": 485 |
| }, |
| { |
| "epoch": 5.769230769230769, |
| "grad_norm": 0.8096638321876526, |
| "learning_rate": 8.595238095238096e-05, |
| "loss": 0.285, |
| "step": 490 |
| }, |
| { |
| "epoch": 5.828402366863905, |
| "grad_norm": 0.9154897332191467, |
| "learning_rate": 8.476190476190477e-05, |
| "loss": 0.2051, |
| "step": 495 |
| }, |
| { |
| "epoch": 5.887573964497041, |
| "grad_norm": 0.9506188035011292, |
| "learning_rate": 8.357142857142858e-05, |
| "loss": 0.2664, |
| "step": 500 |
| } |
| ], |
| "logging_steps": 5, |
| "max_steps": 850, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 10, |
| "save_steps": 500, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": false |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 3.524257725218611e+16, |
| "train_batch_size": 2, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|