| { |
| "best_global_step": null, |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 10.0, |
| "eval_steps": 500, |
| "global_step": 850, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.05917159763313609, |
| "grad_norm": 1.1964364051818848, |
| "learning_rate": 8e-05, |
| "loss": 2.1543, |
| "step": 5 |
| }, |
| { |
| "epoch": 0.11834319526627218, |
| "grad_norm": 1.1647157669067383, |
| "learning_rate": 0.00018, |
| "loss": 1.6346, |
| "step": 10 |
| }, |
| { |
| "epoch": 0.17751479289940827, |
| "grad_norm": 0.6181473135948181, |
| "learning_rate": 0.00019904761904761907, |
| "loss": 0.9978, |
| "step": 15 |
| }, |
| { |
| "epoch": 0.23668639053254437, |
| "grad_norm": 0.5694869160652161, |
| "learning_rate": 0.00019785714285714288, |
| "loss": 0.9807, |
| "step": 20 |
| }, |
| { |
| "epoch": 0.2958579881656805, |
| "grad_norm": 0.6708640456199646, |
| "learning_rate": 0.00019666666666666666, |
| "loss": 1.2485, |
| "step": 25 |
| }, |
| { |
| "epoch": 0.35502958579881655, |
| "grad_norm": 0.7021521925926208, |
| "learning_rate": 0.00019547619047619047, |
| "loss": 1.1107, |
| "step": 30 |
| }, |
| { |
| "epoch": 0.41420118343195267, |
| "grad_norm": 0.5067740082740784, |
| "learning_rate": 0.0001942857142857143, |
| "loss": 1.1293, |
| "step": 35 |
| }, |
| { |
| "epoch": 0.47337278106508873, |
| "grad_norm": 0.5455656051635742, |
| "learning_rate": 0.0001930952380952381, |
| "loss": 1.1371, |
| "step": 40 |
| }, |
| { |
| "epoch": 0.5325443786982249, |
| "grad_norm": 0.6190764307975769, |
| "learning_rate": 0.00019190476190476192, |
| "loss": 1.0131, |
| "step": 45 |
| }, |
| { |
| "epoch": 0.591715976331361, |
| "grad_norm": 0.544291615486145, |
| "learning_rate": 0.00019071428571428573, |
| "loss": 0.969, |
| "step": 50 |
| }, |
| { |
| "epoch": 0.650887573964497, |
| "grad_norm": 0.600204348564148, |
| "learning_rate": 0.0001895238095238095, |
| "loss": 1.0509, |
| "step": 55 |
| }, |
| { |
| "epoch": 0.7100591715976331, |
| "grad_norm": 0.5397897958755493, |
| "learning_rate": 0.00018833333333333335, |
| "loss": 0.9839, |
| "step": 60 |
| }, |
| { |
| "epoch": 0.7692307692307693, |
| "grad_norm": 0.571107804775238, |
| "learning_rate": 0.00018714285714285716, |
| "loss": 0.9281, |
| "step": 65 |
| }, |
| { |
| "epoch": 0.8284023668639053, |
| "grad_norm": 0.5789744257926941, |
| "learning_rate": 0.00018595238095238097, |
| "loss": 0.9543, |
| "step": 70 |
| }, |
| { |
| "epoch": 0.8875739644970414, |
| "grad_norm": 0.6334110498428345, |
| "learning_rate": 0.00018476190476190478, |
| "loss": 1.0625, |
| "step": 75 |
| }, |
| { |
| "epoch": 0.9467455621301775, |
| "grad_norm": 0.5225853323936462, |
| "learning_rate": 0.00018357142857142858, |
| "loss": 0.9734, |
| "step": 80 |
| }, |
| { |
| "epoch": 1.0, |
| "grad_norm": 0.7854479551315308, |
| "learning_rate": 0.0001823809523809524, |
| "loss": 0.9452, |
| "step": 85 |
| }, |
| { |
| "epoch": 1.0591715976331362, |
| "grad_norm": 0.47304806113243103, |
| "learning_rate": 0.0001811904761904762, |
| "loss": 0.8473, |
| "step": 90 |
| }, |
| { |
| "epoch": 1.1183431952662721, |
| "grad_norm": 0.6840282678604126, |
| "learning_rate": 0.00018, |
| "loss": 0.6189, |
| "step": 95 |
| }, |
| { |
| "epoch": 1.1775147928994083, |
| "grad_norm": 0.5173171162605286, |
| "learning_rate": 0.00017880952380952382, |
| "loss": 0.9301, |
| "step": 100 |
| }, |
| { |
| "epoch": 1.2366863905325443, |
| "grad_norm": 0.4892285466194153, |
| "learning_rate": 0.00017761904761904763, |
| "loss": 0.7659, |
| "step": 105 |
| }, |
| { |
| "epoch": 1.2958579881656804, |
| "grad_norm": 0.5754849910736084, |
| "learning_rate": 0.00017642857142857144, |
| "loss": 0.7756, |
| "step": 110 |
| }, |
| { |
| "epoch": 1.3550295857988166, |
| "grad_norm": 0.47277289628982544, |
| "learning_rate": 0.00017523809523809525, |
| "loss": 0.7844, |
| "step": 115 |
| }, |
| { |
| "epoch": 1.4142011834319526, |
| "grad_norm": 0.8198840022087097, |
| "learning_rate": 0.00017404761904761906, |
| "loss": 0.8969, |
| "step": 120 |
| }, |
| { |
| "epoch": 1.4733727810650887, |
| "grad_norm": 0.5040334463119507, |
| "learning_rate": 0.00017285714285714287, |
| "loss": 0.8727, |
| "step": 125 |
| }, |
| { |
| "epoch": 1.532544378698225, |
| "grad_norm": 0.5382494926452637, |
| "learning_rate": 0.00017166666666666667, |
| "loss": 0.7677, |
| "step": 130 |
| }, |
| { |
| "epoch": 1.5917159763313609, |
| "grad_norm": 0.631537914276123, |
| "learning_rate": 0.00017047619047619048, |
| "loss": 0.8364, |
| "step": 135 |
| }, |
| { |
| "epoch": 1.650887573964497, |
| "grad_norm": 0.5718739628791809, |
| "learning_rate": 0.0001692857142857143, |
| "loss": 0.7447, |
| "step": 140 |
| }, |
| { |
| "epoch": 1.7100591715976332, |
| "grad_norm": 0.557224452495575, |
| "learning_rate": 0.0001680952380952381, |
| "loss": 0.8828, |
| "step": 145 |
| }, |
| { |
| "epoch": 1.7692307692307692, |
| "grad_norm": 0.6206871271133423, |
| "learning_rate": 0.0001669047619047619, |
| "loss": 0.5674, |
| "step": 150 |
| }, |
| { |
| "epoch": 1.8284023668639053, |
| "grad_norm": 0.6297276616096497, |
| "learning_rate": 0.00016571428571428575, |
| "loss": 0.7956, |
| "step": 155 |
| }, |
| { |
| "epoch": 1.8875739644970415, |
| "grad_norm": 0.6178033351898193, |
| "learning_rate": 0.00016452380952380953, |
| "loss": 0.5527, |
| "step": 160 |
| }, |
| { |
| "epoch": 1.9467455621301775, |
| "grad_norm": 0.6269710063934326, |
| "learning_rate": 0.00016333333333333334, |
| "loss": 0.9635, |
| "step": 165 |
| }, |
| { |
| "epoch": 2.0, |
| "grad_norm": 0.7290076613426208, |
| "learning_rate": 0.00016214285714285715, |
| "loss": 0.8048, |
| "step": 170 |
| }, |
| { |
| "epoch": 2.059171597633136, |
| "grad_norm": 0.5376546382904053, |
| "learning_rate": 0.00016095238095238096, |
| "loss": 0.602, |
| "step": 175 |
| }, |
| { |
| "epoch": 2.1183431952662723, |
| "grad_norm": 0.720078706741333, |
| "learning_rate": 0.0001597619047619048, |
| "loss": 0.5738, |
| "step": 180 |
| }, |
| { |
| "epoch": 2.1775147928994083, |
| "grad_norm": 0.5647716522216797, |
| "learning_rate": 0.00015857142857142857, |
| "loss": 0.5445, |
| "step": 185 |
| }, |
| { |
| "epoch": 2.2366863905325443, |
| "grad_norm": 0.7397224307060242, |
| "learning_rate": 0.00015738095238095238, |
| "loss": 0.5383, |
| "step": 190 |
| }, |
| { |
| "epoch": 2.2958579881656807, |
| "grad_norm": 0.8834079504013062, |
| "learning_rate": 0.0001561904761904762, |
| "loss": 0.6575, |
| "step": 195 |
| }, |
| { |
| "epoch": 2.3550295857988166, |
| "grad_norm": 0.6497870683670044, |
| "learning_rate": 0.000155, |
| "loss": 0.6677, |
| "step": 200 |
| }, |
| { |
| "epoch": 2.4142011834319526, |
| "grad_norm": 0.686392605304718, |
| "learning_rate": 0.00015380952380952384, |
| "loss": 0.493, |
| "step": 205 |
| }, |
| { |
| "epoch": 2.4733727810650885, |
| "grad_norm": 0.719688892364502, |
| "learning_rate": 0.00015261904761904762, |
| "loss": 0.5356, |
| "step": 210 |
| }, |
| { |
| "epoch": 2.532544378698225, |
| "grad_norm": 0.6884217262268066, |
| "learning_rate": 0.00015142857142857143, |
| "loss": 0.7167, |
| "step": 215 |
| }, |
| { |
| "epoch": 2.591715976331361, |
| "grad_norm": 0.7767056822776794, |
| "learning_rate": 0.00015023809523809524, |
| "loss": 0.7346, |
| "step": 220 |
| }, |
| { |
| "epoch": 2.6508875739644973, |
| "grad_norm": 0.6508312225341797, |
| "learning_rate": 0.00014904761904761904, |
| "loss": 0.547, |
| "step": 225 |
| }, |
| { |
| "epoch": 2.710059171597633, |
| "grad_norm": 0.6159693598747253, |
| "learning_rate": 0.00014785714285714288, |
| "loss": 0.5539, |
| "step": 230 |
| }, |
| { |
| "epoch": 2.769230769230769, |
| "grad_norm": 0.7028509378433228, |
| "learning_rate": 0.00014666666666666666, |
| "loss": 0.609, |
| "step": 235 |
| }, |
| { |
| "epoch": 2.828402366863905, |
| "grad_norm": 0.6096014976501465, |
| "learning_rate": 0.00014547619047619047, |
| "loss": 0.5913, |
| "step": 240 |
| }, |
| { |
| "epoch": 2.8875739644970415, |
| "grad_norm": 0.8518397212028503, |
| "learning_rate": 0.00014428571428571428, |
| "loss": 0.6356, |
| "step": 245 |
| }, |
| { |
| "epoch": 2.9467455621301775, |
| "grad_norm": 0.6462046504020691, |
| "learning_rate": 0.00014309523809523812, |
| "loss": 0.527, |
| "step": 250 |
| }, |
| { |
| "epoch": 3.0, |
| "grad_norm": 0.8931583762168884, |
| "learning_rate": 0.00014190476190476193, |
| "loss": 0.6505, |
| "step": 255 |
| }, |
| { |
| "epoch": 3.059171597633136, |
| "grad_norm": 0.6089041233062744, |
| "learning_rate": 0.00014071428571428573, |
| "loss": 0.4721, |
| "step": 260 |
| }, |
| { |
| "epoch": 3.1183431952662723, |
| "grad_norm": 0.9845924973487854, |
| "learning_rate": 0.00013952380952380952, |
| "loss": 0.4478, |
| "step": 265 |
| }, |
| { |
| "epoch": 3.1775147928994083, |
| "grad_norm": 0.6962316036224365, |
| "learning_rate": 0.00013833333333333333, |
| "loss": 0.3862, |
| "step": 270 |
| }, |
| { |
| "epoch": 3.2366863905325443, |
| "grad_norm": 0.6963745951652527, |
| "learning_rate": 0.00013714285714285716, |
| "loss": 0.5446, |
| "step": 275 |
| }, |
| { |
| "epoch": 3.2958579881656807, |
| "grad_norm": 0.9289587736129761, |
| "learning_rate": 0.00013595238095238097, |
| "loss": 0.4985, |
| "step": 280 |
| }, |
| { |
| "epoch": 3.3550295857988166, |
| "grad_norm": 0.7913327813148499, |
| "learning_rate": 0.00013476190476190478, |
| "loss": 0.4291, |
| "step": 285 |
| }, |
| { |
| "epoch": 3.4142011834319526, |
| "grad_norm": 0.7623841166496277, |
| "learning_rate": 0.00013357142857142856, |
| "loss": 0.4198, |
| "step": 290 |
| }, |
| { |
| "epoch": 3.4733727810650885, |
| "grad_norm": 1.1334826946258545, |
| "learning_rate": 0.00013238095238095237, |
| "loss": 0.4442, |
| "step": 295 |
| }, |
| { |
| "epoch": 3.532544378698225, |
| "grad_norm": 0.8162091374397278, |
| "learning_rate": 0.0001311904761904762, |
| "loss": 0.4249, |
| "step": 300 |
| }, |
| { |
| "epoch": 3.591715976331361, |
| "grad_norm": 0.7582007646560669, |
| "learning_rate": 0.00013000000000000002, |
| "loss": 0.4166, |
| "step": 305 |
| }, |
| { |
| "epoch": 3.6508875739644973, |
| "grad_norm": 0.8337474465370178, |
| "learning_rate": 0.00012880952380952382, |
| "loss": 0.3552, |
| "step": 310 |
| }, |
| { |
| "epoch": 3.710059171597633, |
| "grad_norm": 0.7497977018356323, |
| "learning_rate": 0.0001276190476190476, |
| "loss": 0.3778, |
| "step": 315 |
| }, |
| { |
| "epoch": 3.769230769230769, |
| "grad_norm": 0.9030293226242065, |
| "learning_rate": 0.00012642857142857144, |
| "loss": 0.4048, |
| "step": 320 |
| }, |
| { |
| "epoch": 3.828402366863905, |
| "grad_norm": 0.8548532128334045, |
| "learning_rate": 0.00012523809523809525, |
| "loss": 0.5433, |
| "step": 325 |
| }, |
| { |
| "epoch": 3.8875739644970415, |
| "grad_norm": 1.1865911483764648, |
| "learning_rate": 0.00012404761904761906, |
| "loss": 0.4465, |
| "step": 330 |
| }, |
| { |
| "epoch": 3.9467455621301775, |
| "grad_norm": 0.6329714059829712, |
| "learning_rate": 0.00012285714285714287, |
| "loss": 0.499, |
| "step": 335 |
| }, |
| { |
| "epoch": 4.0, |
| "grad_norm": 0.8335389494895935, |
| "learning_rate": 0.00012166666666666667, |
| "loss": 0.3774, |
| "step": 340 |
| }, |
| { |
| "epoch": 4.059171597633136, |
| "grad_norm": 0.7739379405975342, |
| "learning_rate": 0.00012047619047619047, |
| "loss": 0.2472, |
| "step": 345 |
| }, |
| { |
| "epoch": 4.118343195266272, |
| "grad_norm": 1.0731953382492065, |
| "learning_rate": 0.00011928571428571428, |
| "loss": 0.3058, |
| "step": 350 |
| }, |
| { |
| "epoch": 4.177514792899408, |
| "grad_norm": 1.051379680633545, |
| "learning_rate": 0.0001180952380952381, |
| "loss": 0.3446, |
| "step": 355 |
| }, |
| { |
| "epoch": 4.236686390532545, |
| "grad_norm": 0.6324198842048645, |
| "learning_rate": 0.00011690476190476191, |
| "loss": 0.2351, |
| "step": 360 |
| }, |
| { |
| "epoch": 4.295857988165681, |
| "grad_norm": 0.9921632409095764, |
| "learning_rate": 0.00011571428571428574, |
| "loss": 0.2795, |
| "step": 365 |
| }, |
| { |
| "epoch": 4.355029585798817, |
| "grad_norm": 0.9360544085502625, |
| "learning_rate": 0.00011452380952380952, |
| "loss": 0.4056, |
| "step": 370 |
| }, |
| { |
| "epoch": 4.414201183431953, |
| "grad_norm": 0.956781268119812, |
| "learning_rate": 0.00011333333333333334, |
| "loss": 0.3001, |
| "step": 375 |
| }, |
| { |
| "epoch": 4.4733727810650885, |
| "grad_norm": 1.0604465007781982, |
| "learning_rate": 0.00011214285714285715, |
| "loss": 0.3972, |
| "step": 380 |
| }, |
| { |
| "epoch": 4.5325443786982245, |
| "grad_norm": 0.8613020181655884, |
| "learning_rate": 0.00011095238095238096, |
| "loss": 0.2828, |
| "step": 385 |
| }, |
| { |
| "epoch": 4.591715976331361, |
| "grad_norm": 0.666599690914154, |
| "learning_rate": 0.00010976190476190478, |
| "loss": 0.3187, |
| "step": 390 |
| }, |
| { |
| "epoch": 4.650887573964497, |
| "grad_norm": 0.7497467398643494, |
| "learning_rate": 0.00010857142857142856, |
| "loss": 0.4278, |
| "step": 395 |
| }, |
| { |
| "epoch": 4.710059171597633, |
| "grad_norm": 0.733259916305542, |
| "learning_rate": 0.00010738095238095239, |
| "loss": 0.2384, |
| "step": 400 |
| }, |
| { |
| "epoch": 4.769230769230769, |
| "grad_norm": 0.7570552229881287, |
| "learning_rate": 0.0001061904761904762, |
| "loss": 0.3938, |
| "step": 405 |
| }, |
| { |
| "epoch": 4.828402366863905, |
| "grad_norm": 0.8109162449836731, |
| "learning_rate": 0.000105, |
| "loss": 0.2729, |
| "step": 410 |
| }, |
| { |
| "epoch": 4.887573964497041, |
| "grad_norm": 0.7985107898712158, |
| "learning_rate": 0.00010380952380952383, |
| "loss": 0.3805, |
| "step": 415 |
| }, |
| { |
| "epoch": 4.946745562130177, |
| "grad_norm": 0.704366147518158, |
| "learning_rate": 0.00010261904761904761, |
| "loss": 0.2468, |
| "step": 420 |
| }, |
| { |
| "epoch": 5.0, |
| "grad_norm": 1.054413080215454, |
| "learning_rate": 0.00010142857142857143, |
| "loss": 0.2892, |
| "step": 425 |
| }, |
| { |
| "epoch": 5.059171597633136, |
| "grad_norm": 0.7603329420089722, |
| "learning_rate": 0.00010023809523809524, |
| "loss": 0.2865, |
| "step": 430 |
| }, |
| { |
| "epoch": 5.118343195266272, |
| "grad_norm": 0.9612646102905273, |
| "learning_rate": 9.904761904761905e-05, |
| "loss": 0.2215, |
| "step": 435 |
| }, |
| { |
| "epoch": 5.177514792899408, |
| "grad_norm": 0.8669071793556213, |
| "learning_rate": 9.785714285714286e-05, |
| "loss": 0.2078, |
| "step": 440 |
| }, |
| { |
| "epoch": 5.236686390532545, |
| "grad_norm": 0.7441051006317139, |
| "learning_rate": 9.666666666666667e-05, |
| "loss": 0.233, |
| "step": 445 |
| }, |
| { |
| "epoch": 5.295857988165681, |
| "grad_norm": 0.5900620818138123, |
| "learning_rate": 9.547619047619049e-05, |
| "loss": 0.2581, |
| "step": 450 |
| }, |
| { |
| "epoch": 5.355029585798817, |
| "grad_norm": 0.990178644657135, |
| "learning_rate": 9.428571428571429e-05, |
| "loss": 0.2603, |
| "step": 455 |
| }, |
| { |
| "epoch": 5.414201183431953, |
| "grad_norm": 0.7644340991973877, |
| "learning_rate": 9.309523809523811e-05, |
| "loss": 0.2021, |
| "step": 460 |
| }, |
| { |
| "epoch": 5.4733727810650885, |
| "grad_norm": 0.5087964534759521, |
| "learning_rate": 9.19047619047619e-05, |
| "loss": 0.1126, |
| "step": 465 |
| }, |
| { |
| "epoch": 5.5325443786982245, |
| "grad_norm": 0.7896738052368164, |
| "learning_rate": 9.071428571428571e-05, |
| "loss": 0.2084, |
| "step": 470 |
| }, |
| { |
| "epoch": 5.591715976331361, |
| "grad_norm": 0.71749347448349, |
| "learning_rate": 8.952380952380953e-05, |
| "loss": 0.1618, |
| "step": 475 |
| }, |
| { |
| "epoch": 5.650887573964497, |
| "grad_norm": 0.8466284871101379, |
| "learning_rate": 8.833333333333333e-05, |
| "loss": 0.2574, |
| "step": 480 |
| }, |
| { |
| "epoch": 5.710059171597633, |
| "grad_norm": 1.0023925304412842, |
| "learning_rate": 8.714285714285715e-05, |
| "loss": 0.1985, |
| "step": 485 |
| }, |
| { |
| "epoch": 5.769230769230769, |
| "grad_norm": 0.8096638321876526, |
| "learning_rate": 8.595238095238096e-05, |
| "loss": 0.285, |
| "step": 490 |
| }, |
| { |
| "epoch": 5.828402366863905, |
| "grad_norm": 0.9154897332191467, |
| "learning_rate": 8.476190476190477e-05, |
| "loss": 0.2051, |
| "step": 495 |
| }, |
| { |
| "epoch": 5.887573964497041, |
| "grad_norm": 0.9506188035011292, |
| "learning_rate": 8.357142857142858e-05, |
| "loss": 0.2664, |
| "step": 500 |
| }, |
| { |
| "epoch": 5.946745562130177, |
| "grad_norm": 0.8935821056365967, |
| "learning_rate": 8.238095238095238e-05, |
| "loss": 0.1717, |
| "step": 505 |
| }, |
| { |
| "epoch": 6.0, |
| "grad_norm": 1.2274423837661743, |
| "learning_rate": 8.11904761904762e-05, |
| "loss": 0.2529, |
| "step": 510 |
| }, |
| { |
| "epoch": 6.059171597633136, |
| "grad_norm": 0.6341638565063477, |
| "learning_rate": 8e-05, |
| "loss": 0.1795, |
| "step": 515 |
| }, |
| { |
| "epoch": 6.118343195266272, |
| "grad_norm": 1.273710012435913, |
| "learning_rate": 7.880952380952382e-05, |
| "loss": 0.1612, |
| "step": 520 |
| }, |
| { |
| "epoch": 6.177514792899408, |
| "grad_norm": 1.065499186515808, |
| "learning_rate": 7.761904761904762e-05, |
| "loss": 0.1839, |
| "step": 525 |
| }, |
| { |
| "epoch": 6.236686390532545, |
| "grad_norm": 0.5382740497589111, |
| "learning_rate": 7.642857142857143e-05, |
| "loss": 0.1628, |
| "step": 530 |
| }, |
| { |
| "epoch": 6.295857988165681, |
| "grad_norm": 0.6181464791297913, |
| "learning_rate": 7.523809523809524e-05, |
| "loss": 0.141, |
| "step": 535 |
| }, |
| { |
| "epoch": 6.355029585798817, |
| "grad_norm": 0.7450206875801086, |
| "learning_rate": 7.404761904761905e-05, |
| "loss": 0.1453, |
| "step": 540 |
| }, |
| { |
| "epoch": 6.414201183431953, |
| "grad_norm": 0.9426142573356628, |
| "learning_rate": 7.285714285714286e-05, |
| "loss": 0.1403, |
| "step": 545 |
| }, |
| { |
| "epoch": 6.4733727810650885, |
| "grad_norm": 0.9675353169441223, |
| "learning_rate": 7.166666666666667e-05, |
| "loss": 0.1216, |
| "step": 550 |
| }, |
| { |
| "epoch": 6.5325443786982245, |
| "grad_norm": 0.5108327269554138, |
| "learning_rate": 7.047619047619048e-05, |
| "loss": 0.1484, |
| "step": 555 |
| }, |
| { |
| "epoch": 6.591715976331361, |
| "grad_norm": 0.6549590229988098, |
| "learning_rate": 6.928571428571429e-05, |
| "loss": 0.1338, |
| "step": 560 |
| }, |
| { |
| "epoch": 6.650887573964497, |
| "grad_norm": 0.843664288520813, |
| "learning_rate": 6.80952380952381e-05, |
| "loss": 0.1586, |
| "step": 565 |
| }, |
| { |
| "epoch": 6.710059171597633, |
| "grad_norm": 0.8650611639022827, |
| "learning_rate": 6.69047619047619e-05, |
| "loss": 0.1517, |
| "step": 570 |
| }, |
| { |
| "epoch": 6.769230769230769, |
| "grad_norm": 0.7471471428871155, |
| "learning_rate": 6.571428571428571e-05, |
| "loss": 0.1966, |
| "step": 575 |
| }, |
| { |
| "epoch": 6.828402366863905, |
| "grad_norm": 0.7219163775444031, |
| "learning_rate": 6.452380952380954e-05, |
| "loss": 0.1368, |
| "step": 580 |
| }, |
| { |
| "epoch": 6.887573964497041, |
| "grad_norm": 0.911191463470459, |
| "learning_rate": 6.333333333333333e-05, |
| "loss": 0.1681, |
| "step": 585 |
| }, |
| { |
| "epoch": 6.946745562130177, |
| "grad_norm": 0.6017201542854309, |
| "learning_rate": 6.214285714285714e-05, |
| "loss": 0.1675, |
| "step": 590 |
| }, |
| { |
| "epoch": 7.0, |
| "grad_norm": 1.0746937990188599, |
| "learning_rate": 6.0952380952380964e-05, |
| "loss": 0.1944, |
| "step": 595 |
| }, |
| { |
| "epoch": 7.059171597633136, |
| "grad_norm": 0.8227368593215942, |
| "learning_rate": 5.9761904761904766e-05, |
| "loss": 0.076, |
| "step": 600 |
| }, |
| { |
| "epoch": 7.118343195266272, |
| "grad_norm": 0.6826525330543518, |
| "learning_rate": 5.8571428571428575e-05, |
| "loss": 0.1048, |
| "step": 605 |
| }, |
| { |
| "epoch": 7.177514792899408, |
| "grad_norm": 1.0038989782333374, |
| "learning_rate": 5.738095238095238e-05, |
| "loss": 0.0998, |
| "step": 610 |
| }, |
| { |
| "epoch": 7.236686390532545, |
| "grad_norm": 0.8537135720252991, |
| "learning_rate": 5.619047619047619e-05, |
| "loss": 0.1035, |
| "step": 615 |
| }, |
| { |
| "epoch": 7.295857988165681, |
| "grad_norm": 0.7067388892173767, |
| "learning_rate": 5.500000000000001e-05, |
| "loss": 0.1186, |
| "step": 620 |
| }, |
| { |
| "epoch": 7.355029585798817, |
| "grad_norm": 0.7003813982009888, |
| "learning_rate": 5.380952380952381e-05, |
| "loss": 0.0803, |
| "step": 625 |
| }, |
| { |
| "epoch": 7.414201183431953, |
| "grad_norm": 0.9923582673072815, |
| "learning_rate": 5.261904761904763e-05, |
| "loss": 0.1121, |
| "step": 630 |
| }, |
| { |
| "epoch": 7.4733727810650885, |
| "grad_norm": 0.4991615116596222, |
| "learning_rate": 5.142857142857143e-05, |
| "loss": 0.1056, |
| "step": 635 |
| }, |
| { |
| "epoch": 7.5325443786982245, |
| "grad_norm": 0.7406235337257385, |
| "learning_rate": 5.023809523809524e-05, |
| "loss": 0.1135, |
| "step": 640 |
| }, |
| { |
| "epoch": 7.591715976331361, |
| "grad_norm": 0.5871267318725586, |
| "learning_rate": 4.904761904761905e-05, |
| "loss": 0.1141, |
| "step": 645 |
| }, |
| { |
| "epoch": 7.650887573964497, |
| "grad_norm": 0.5495700836181641, |
| "learning_rate": 4.785714285714286e-05, |
| "loss": 0.07, |
| "step": 650 |
| }, |
| { |
| "epoch": 7.710059171597633, |
| "grad_norm": 0.9295830130577087, |
| "learning_rate": 4.666666666666667e-05, |
| "loss": 0.1293, |
| "step": 655 |
| }, |
| { |
| "epoch": 7.769230769230769, |
| "grad_norm": 0.9041563272476196, |
| "learning_rate": 4.547619047619048e-05, |
| "loss": 0.1034, |
| "step": 660 |
| }, |
| { |
| "epoch": 7.828402366863905, |
| "grad_norm": 0.6490697264671326, |
| "learning_rate": 4.428571428571428e-05, |
| "loss": 0.0898, |
| "step": 665 |
| }, |
| { |
| "epoch": 7.887573964497041, |
| "grad_norm": 0.5583420991897583, |
| "learning_rate": 4.30952380952381e-05, |
| "loss": 0.0891, |
| "step": 670 |
| }, |
| { |
| "epoch": 7.946745562130177, |
| "grad_norm": 0.7829737663269043, |
| "learning_rate": 4.190476190476191e-05, |
| "loss": 0.112, |
| "step": 675 |
| }, |
| { |
| "epoch": 8.0, |
| "grad_norm": 0.48943546414375305, |
| "learning_rate": 4.0714285714285717e-05, |
| "loss": 0.115, |
| "step": 680 |
| }, |
| { |
| "epoch": 8.059171597633137, |
| "grad_norm": 0.4989611506462097, |
| "learning_rate": 3.9523809523809526e-05, |
| "loss": 0.0645, |
| "step": 685 |
| }, |
| { |
| "epoch": 8.118343195266272, |
| "grad_norm": 0.5723634362220764, |
| "learning_rate": 3.8333333333333334e-05, |
| "loss": 0.0603, |
| "step": 690 |
| }, |
| { |
| "epoch": 8.177514792899409, |
| "grad_norm": 0.5361748933792114, |
| "learning_rate": 3.7142857142857143e-05, |
| "loss": 0.0626, |
| "step": 695 |
| }, |
| { |
| "epoch": 8.236686390532544, |
| "grad_norm": 0.8497764468193054, |
| "learning_rate": 3.595238095238095e-05, |
| "loss": 0.0856, |
| "step": 700 |
| }, |
| { |
| "epoch": 8.29585798816568, |
| "grad_norm": 0.48423126339912415, |
| "learning_rate": 3.476190476190476e-05, |
| "loss": 0.0568, |
| "step": 705 |
| }, |
| { |
| "epoch": 8.355029585798816, |
| "grad_norm": 0.30722182989120483, |
| "learning_rate": 3.357142857142857e-05, |
| "loss": 0.0564, |
| "step": 710 |
| }, |
| { |
| "epoch": 8.414201183431953, |
| "grad_norm": 0.637298047542572, |
| "learning_rate": 3.2380952380952386e-05, |
| "loss": 0.0792, |
| "step": 715 |
| }, |
| { |
| "epoch": 8.47337278106509, |
| "grad_norm": 1.1620301008224487, |
| "learning_rate": 3.1190476190476195e-05, |
| "loss": 0.0848, |
| "step": 720 |
| }, |
| { |
| "epoch": 8.532544378698224, |
| "grad_norm": 0.9842550158500671, |
| "learning_rate": 3e-05, |
| "loss": 0.0695, |
| "step": 725 |
| }, |
| { |
| "epoch": 8.591715976331361, |
| "grad_norm": 0.5429280996322632, |
| "learning_rate": 2.880952380952381e-05, |
| "loss": 0.0691, |
| "step": 730 |
| }, |
| { |
| "epoch": 8.650887573964496, |
| "grad_norm": 0.4466063380241394, |
| "learning_rate": 2.7619047619047622e-05, |
| "loss": 0.0427, |
| "step": 735 |
| }, |
| { |
| "epoch": 8.710059171597633, |
| "grad_norm": 0.7048435807228088, |
| "learning_rate": 2.642857142857143e-05, |
| "loss": 0.066, |
| "step": 740 |
| }, |
| { |
| "epoch": 8.76923076923077, |
| "grad_norm": 0.71544349193573, |
| "learning_rate": 2.523809523809524e-05, |
| "loss": 0.048, |
| "step": 745 |
| }, |
| { |
| "epoch": 8.828402366863905, |
| "grad_norm": 0.27939409017562866, |
| "learning_rate": 2.404761904761905e-05, |
| "loss": 0.047, |
| "step": 750 |
| }, |
| { |
| "epoch": 8.887573964497042, |
| "grad_norm": 0.30130356550216675, |
| "learning_rate": 2.2857142857142858e-05, |
| "loss": 0.0715, |
| "step": 755 |
| }, |
| { |
| "epoch": 8.946745562130177, |
| "grad_norm": 0.6010681986808777, |
| "learning_rate": 2.1666666666666667e-05, |
| "loss": 0.0478, |
| "step": 760 |
| }, |
| { |
| "epoch": 9.0, |
| "grad_norm": 0.5269991159439087, |
| "learning_rate": 2.0476190476190476e-05, |
| "loss": 0.048, |
| "step": 765 |
| }, |
| { |
| "epoch": 9.059171597633137, |
| "grad_norm": 0.30882084369659424, |
| "learning_rate": 1.928571428571429e-05, |
| "loss": 0.0508, |
| "step": 770 |
| }, |
| { |
| "epoch": 9.118343195266272, |
| "grad_norm": 0.3261071741580963, |
| "learning_rate": 1.8095238095238094e-05, |
| "loss": 0.0439, |
| "step": 775 |
| }, |
| { |
| "epoch": 9.177514792899409, |
| "grad_norm": 0.33494994044303894, |
| "learning_rate": 1.6904761904761906e-05, |
| "loss": 0.0394, |
| "step": 780 |
| }, |
| { |
| "epoch": 9.236686390532544, |
| "grad_norm": 0.470735639333725, |
| "learning_rate": 1.5714285714285715e-05, |
| "loss": 0.0465, |
| "step": 785 |
| }, |
| { |
| "epoch": 9.29585798816568, |
| "grad_norm": 0.4816909432411194, |
| "learning_rate": 1.4523809523809526e-05, |
| "loss": 0.0461, |
| "step": 790 |
| }, |
| { |
| "epoch": 9.355029585798816, |
| "grad_norm": 0.332380086183548, |
| "learning_rate": 1.3333333333333333e-05, |
| "loss": 0.0438, |
| "step": 795 |
| }, |
| { |
| "epoch": 9.414201183431953, |
| "grad_norm": 1.0364826917648315, |
| "learning_rate": 1.2142857142857144e-05, |
| "loss": 0.0575, |
| "step": 800 |
| }, |
| { |
| "epoch": 9.47337278106509, |
| "grad_norm": 0.31686267256736755, |
| "learning_rate": 1.0952380952380953e-05, |
| "loss": 0.0376, |
| "step": 805 |
| }, |
| { |
| "epoch": 9.532544378698224, |
| "grad_norm": 0.19316697120666504, |
| "learning_rate": 9.761904761904762e-06, |
| "loss": 0.0546, |
| "step": 810 |
| }, |
| { |
| "epoch": 9.591715976331361, |
| "grad_norm": 0.3316756784915924, |
| "learning_rate": 8.571428571428573e-06, |
| "loss": 0.0303, |
| "step": 815 |
| }, |
| { |
| "epoch": 9.650887573964496, |
| "grad_norm": 0.38312825560569763, |
| "learning_rate": 7.380952380952382e-06, |
| "loss": 0.0407, |
| "step": 820 |
| }, |
| { |
| "epoch": 9.710059171597633, |
| "grad_norm": 0.1672104001045227, |
| "learning_rate": 6.190476190476191e-06, |
| "loss": 0.0406, |
| "step": 825 |
| }, |
| { |
| "epoch": 9.76923076923077, |
| "grad_norm": 0.30679717659950256, |
| "learning_rate": 5e-06, |
| "loss": 0.0316, |
| "step": 830 |
| }, |
| { |
| "epoch": 9.828402366863905, |
| "grad_norm": 0.3245919346809387, |
| "learning_rate": 3.8095238095238102e-06, |
| "loss": 0.0361, |
| "step": 835 |
| }, |
| { |
| "epoch": 9.887573964497042, |
| "grad_norm": 0.25631895661354065, |
| "learning_rate": 2.6190476190476192e-06, |
| "loss": 0.0351, |
| "step": 840 |
| }, |
| { |
| "epoch": 9.946745562130177, |
| "grad_norm": 0.2532467842102051, |
| "learning_rate": 1.4285714285714286e-06, |
| "loss": 0.0397, |
| "step": 845 |
| }, |
| { |
| "epoch": 10.0, |
| "grad_norm": 0.7708420157432556, |
| "learning_rate": 2.3809523809523814e-07, |
| "loss": 0.0416, |
| "step": 850 |
| }, |
| { |
| "epoch": 10.0, |
| "step": 850, |
| "total_flos": 5.971841440128e+16, |
| "train_loss": 0.3865027742876726, |
| "train_runtime": 1732.5271, |
| "train_samples_per_second": 3.902, |
| "train_steps_per_second": 0.491 |
| } |
| ], |
| "logging_steps": 5, |
| "max_steps": 850, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 10, |
| "save_steps": 500, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": true |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 5.971841440128e+16, |
| "train_batch_size": 2, |
| "trial_name": null, |
| "trial_params": null |
| } |