{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 10.0, "eval_steps": 500, "global_step": 850, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.05917159763313609, "grad_norm": 1.1964364051818848, "learning_rate": 8e-05, "loss": 2.1543, "step": 5 }, { "epoch": 0.11834319526627218, "grad_norm": 1.1647157669067383, "learning_rate": 0.00018, "loss": 1.6346, "step": 10 }, { "epoch": 0.17751479289940827, "grad_norm": 0.6181473135948181, "learning_rate": 0.00019904761904761907, "loss": 0.9978, "step": 15 }, { "epoch": 0.23668639053254437, "grad_norm": 0.5694869160652161, "learning_rate": 0.00019785714285714288, "loss": 0.9807, "step": 20 }, { "epoch": 0.2958579881656805, "grad_norm": 0.6708640456199646, "learning_rate": 0.00019666666666666666, "loss": 1.2485, "step": 25 }, { "epoch": 0.35502958579881655, "grad_norm": 0.7021521925926208, "learning_rate": 0.00019547619047619047, "loss": 1.1107, "step": 30 }, { "epoch": 0.41420118343195267, "grad_norm": 0.5067740082740784, "learning_rate": 0.0001942857142857143, "loss": 1.1293, "step": 35 }, { "epoch": 0.47337278106508873, "grad_norm": 0.5455656051635742, "learning_rate": 0.0001930952380952381, "loss": 1.1371, "step": 40 }, { "epoch": 0.5325443786982249, "grad_norm": 0.6190764307975769, "learning_rate": 0.00019190476190476192, "loss": 1.0131, "step": 45 }, { "epoch": 0.591715976331361, "grad_norm": 0.544291615486145, "learning_rate": 0.00019071428571428573, "loss": 0.969, "step": 50 }, { "epoch": 0.650887573964497, "grad_norm": 0.600204348564148, "learning_rate": 0.0001895238095238095, "loss": 1.0509, "step": 55 }, { "epoch": 0.7100591715976331, "grad_norm": 0.5397897958755493, "learning_rate": 0.00018833333333333335, "loss": 0.9839, "step": 60 }, { "epoch": 0.7692307692307693, "grad_norm": 0.571107804775238, "learning_rate": 0.00018714285714285716, "loss": 0.9281, "step": 65 }, { "epoch": 0.8284023668639053, "grad_norm": 0.5789744257926941, "learning_rate": 0.00018595238095238097, "loss": 0.9543, "step": 70 }, { "epoch": 0.8875739644970414, "grad_norm": 0.6334110498428345, "learning_rate": 0.00018476190476190478, "loss": 1.0625, "step": 75 }, { "epoch": 0.9467455621301775, "grad_norm": 0.5225853323936462, "learning_rate": 0.00018357142857142858, "loss": 0.9734, "step": 80 }, { "epoch": 1.0, "grad_norm": 0.7854479551315308, "learning_rate": 0.0001823809523809524, "loss": 0.9452, "step": 85 }, { "epoch": 1.0591715976331362, "grad_norm": 0.47304806113243103, "learning_rate": 0.0001811904761904762, "loss": 0.8473, "step": 90 }, { "epoch": 1.1183431952662721, "grad_norm": 0.6840282678604126, "learning_rate": 0.00018, "loss": 0.6189, "step": 95 }, { "epoch": 1.1775147928994083, "grad_norm": 0.5173171162605286, "learning_rate": 0.00017880952380952382, "loss": 0.9301, "step": 100 }, { "epoch": 1.2366863905325443, "grad_norm": 0.4892285466194153, "learning_rate": 0.00017761904761904763, "loss": 0.7659, "step": 105 }, { "epoch": 1.2958579881656804, "grad_norm": 0.5754849910736084, "learning_rate": 0.00017642857142857144, "loss": 0.7756, "step": 110 }, { "epoch": 1.3550295857988166, "grad_norm": 0.47277289628982544, "learning_rate": 0.00017523809523809525, "loss": 0.7844, "step": 115 }, { "epoch": 1.4142011834319526, "grad_norm": 0.8198840022087097, "learning_rate": 0.00017404761904761906, "loss": 0.8969, "step": 120 }, { "epoch": 1.4733727810650887, "grad_norm": 0.5040334463119507, "learning_rate": 0.00017285714285714287, "loss": 0.8727, "step": 125 }, { "epoch": 1.532544378698225, "grad_norm": 0.5382494926452637, "learning_rate": 0.00017166666666666667, "loss": 0.7677, "step": 130 }, { "epoch": 1.5917159763313609, "grad_norm": 0.631537914276123, "learning_rate": 0.00017047619047619048, "loss": 0.8364, "step": 135 }, { "epoch": 1.650887573964497, "grad_norm": 0.5718739628791809, "learning_rate": 0.0001692857142857143, "loss": 0.7447, "step": 140 }, { "epoch": 1.7100591715976332, "grad_norm": 0.557224452495575, "learning_rate": 0.0001680952380952381, "loss": 0.8828, "step": 145 }, { "epoch": 1.7692307692307692, "grad_norm": 0.6206871271133423, "learning_rate": 0.0001669047619047619, "loss": 0.5674, "step": 150 }, { "epoch": 1.8284023668639053, "grad_norm": 0.6297276616096497, "learning_rate": 0.00016571428571428575, "loss": 0.7956, "step": 155 }, { "epoch": 1.8875739644970415, "grad_norm": 0.6178033351898193, "learning_rate": 0.00016452380952380953, "loss": 0.5527, "step": 160 }, { "epoch": 1.9467455621301775, "grad_norm": 0.6269710063934326, "learning_rate": 0.00016333333333333334, "loss": 0.9635, "step": 165 }, { "epoch": 2.0, "grad_norm": 0.7290076613426208, "learning_rate": 0.00016214285714285715, "loss": 0.8048, "step": 170 }, { "epoch": 2.059171597633136, "grad_norm": 0.5376546382904053, "learning_rate": 0.00016095238095238096, "loss": 0.602, "step": 175 }, { "epoch": 2.1183431952662723, "grad_norm": 0.720078706741333, "learning_rate": 0.0001597619047619048, "loss": 0.5738, "step": 180 }, { "epoch": 2.1775147928994083, "grad_norm": 0.5647716522216797, "learning_rate": 0.00015857142857142857, "loss": 0.5445, "step": 185 }, { "epoch": 2.2366863905325443, "grad_norm": 0.7397224307060242, "learning_rate": 0.00015738095238095238, "loss": 0.5383, "step": 190 }, { "epoch": 2.2958579881656807, "grad_norm": 0.8834079504013062, "learning_rate": 0.0001561904761904762, "loss": 0.6575, "step": 195 }, { "epoch": 2.3550295857988166, "grad_norm": 0.6497870683670044, "learning_rate": 0.000155, "loss": 0.6677, "step": 200 }, { "epoch": 2.4142011834319526, "grad_norm": 0.686392605304718, "learning_rate": 0.00015380952380952384, "loss": 0.493, "step": 205 }, { "epoch": 2.4733727810650885, "grad_norm": 0.719688892364502, "learning_rate": 0.00015261904761904762, "loss": 0.5356, "step": 210 }, { "epoch": 2.532544378698225, "grad_norm": 0.6884217262268066, "learning_rate": 0.00015142857142857143, "loss": 0.7167, "step": 215 }, { "epoch": 2.591715976331361, "grad_norm": 0.7767056822776794, "learning_rate": 0.00015023809523809524, "loss": 0.7346, "step": 220 }, { "epoch": 2.6508875739644973, "grad_norm": 0.6508312225341797, "learning_rate": 0.00014904761904761904, "loss": 0.547, "step": 225 }, { "epoch": 2.710059171597633, "grad_norm": 0.6159693598747253, "learning_rate": 0.00014785714285714288, "loss": 0.5539, "step": 230 }, { "epoch": 2.769230769230769, "grad_norm": 0.7028509378433228, "learning_rate": 0.00014666666666666666, "loss": 0.609, "step": 235 }, { "epoch": 2.828402366863905, "grad_norm": 0.6096014976501465, "learning_rate": 0.00014547619047619047, "loss": 0.5913, "step": 240 }, { "epoch": 2.8875739644970415, "grad_norm": 0.8518397212028503, "learning_rate": 0.00014428571428571428, "loss": 0.6356, "step": 245 }, { "epoch": 2.9467455621301775, "grad_norm": 0.6462046504020691, "learning_rate": 0.00014309523809523812, "loss": 0.527, "step": 250 }, { "epoch": 3.0, "grad_norm": 0.8931583762168884, "learning_rate": 0.00014190476190476193, "loss": 0.6505, "step": 255 }, { "epoch": 3.059171597633136, "grad_norm": 0.6089041233062744, "learning_rate": 0.00014071428571428573, "loss": 0.4721, "step": 260 }, { "epoch": 3.1183431952662723, "grad_norm": 0.9845924973487854, "learning_rate": 0.00013952380952380952, "loss": 0.4478, "step": 265 }, { "epoch": 3.1775147928994083, "grad_norm": 0.6962316036224365, "learning_rate": 0.00013833333333333333, "loss": 0.3862, "step": 270 }, { "epoch": 3.2366863905325443, "grad_norm": 0.6963745951652527, "learning_rate": 0.00013714285714285716, "loss": 0.5446, "step": 275 }, { "epoch": 3.2958579881656807, "grad_norm": 0.9289587736129761, "learning_rate": 0.00013595238095238097, "loss": 0.4985, "step": 280 }, { "epoch": 3.3550295857988166, "grad_norm": 0.7913327813148499, "learning_rate": 0.00013476190476190478, "loss": 0.4291, "step": 285 }, { "epoch": 3.4142011834319526, "grad_norm": 0.7623841166496277, "learning_rate": 0.00013357142857142856, "loss": 0.4198, "step": 290 }, { "epoch": 3.4733727810650885, "grad_norm": 1.1334826946258545, "learning_rate": 0.00013238095238095237, "loss": 0.4442, "step": 295 }, { "epoch": 3.532544378698225, "grad_norm": 0.8162091374397278, "learning_rate": 0.0001311904761904762, "loss": 0.4249, "step": 300 }, { "epoch": 3.591715976331361, "grad_norm": 0.7582007646560669, "learning_rate": 0.00013000000000000002, "loss": 0.4166, "step": 305 }, { "epoch": 3.6508875739644973, "grad_norm": 0.8337474465370178, "learning_rate": 0.00012880952380952382, "loss": 0.3552, "step": 310 }, { "epoch": 3.710059171597633, "grad_norm": 0.7497977018356323, "learning_rate": 0.0001276190476190476, "loss": 0.3778, "step": 315 }, { "epoch": 3.769230769230769, "grad_norm": 0.9030293226242065, "learning_rate": 0.00012642857142857144, "loss": 0.4048, "step": 320 }, { "epoch": 3.828402366863905, "grad_norm": 0.8548532128334045, "learning_rate": 0.00012523809523809525, "loss": 0.5433, "step": 325 }, { "epoch": 3.8875739644970415, "grad_norm": 1.1865911483764648, "learning_rate": 0.00012404761904761906, "loss": 0.4465, "step": 330 }, { "epoch": 3.9467455621301775, "grad_norm": 0.6329714059829712, "learning_rate": 0.00012285714285714287, "loss": 0.499, "step": 335 }, { "epoch": 4.0, "grad_norm": 0.8335389494895935, "learning_rate": 0.00012166666666666667, "loss": 0.3774, "step": 340 }, { "epoch": 4.059171597633136, "grad_norm": 0.7739379405975342, "learning_rate": 0.00012047619047619047, "loss": 0.2472, "step": 345 }, { "epoch": 4.118343195266272, "grad_norm": 1.0731953382492065, "learning_rate": 0.00011928571428571428, "loss": 0.3058, "step": 350 }, { "epoch": 4.177514792899408, "grad_norm": 1.051379680633545, "learning_rate": 0.0001180952380952381, "loss": 0.3446, "step": 355 }, { "epoch": 4.236686390532545, "grad_norm": 0.6324198842048645, "learning_rate": 0.00011690476190476191, "loss": 0.2351, "step": 360 }, { "epoch": 4.295857988165681, "grad_norm": 0.9921632409095764, "learning_rate": 0.00011571428571428574, "loss": 0.2795, "step": 365 }, { "epoch": 4.355029585798817, "grad_norm": 0.9360544085502625, "learning_rate": 0.00011452380952380952, "loss": 0.4056, "step": 370 }, { "epoch": 4.414201183431953, "grad_norm": 0.956781268119812, "learning_rate": 0.00011333333333333334, "loss": 0.3001, "step": 375 }, { "epoch": 4.4733727810650885, "grad_norm": 1.0604465007781982, "learning_rate": 0.00011214285714285715, "loss": 0.3972, "step": 380 }, { "epoch": 4.5325443786982245, "grad_norm": 0.8613020181655884, "learning_rate": 0.00011095238095238096, "loss": 0.2828, "step": 385 }, { "epoch": 4.591715976331361, "grad_norm": 0.666599690914154, "learning_rate": 0.00010976190476190478, "loss": 0.3187, "step": 390 }, { "epoch": 4.650887573964497, "grad_norm": 0.7497467398643494, "learning_rate": 0.00010857142857142856, "loss": 0.4278, "step": 395 }, { "epoch": 4.710059171597633, "grad_norm": 0.733259916305542, "learning_rate": 0.00010738095238095239, "loss": 0.2384, "step": 400 }, { "epoch": 4.769230769230769, "grad_norm": 0.7570552229881287, "learning_rate": 0.0001061904761904762, "loss": 0.3938, "step": 405 }, { "epoch": 4.828402366863905, "grad_norm": 0.8109162449836731, "learning_rate": 0.000105, "loss": 0.2729, "step": 410 }, { "epoch": 4.887573964497041, "grad_norm": 0.7985107898712158, "learning_rate": 0.00010380952380952383, "loss": 0.3805, "step": 415 }, { "epoch": 4.946745562130177, "grad_norm": 0.704366147518158, "learning_rate": 0.00010261904761904761, "loss": 0.2468, "step": 420 }, { "epoch": 5.0, "grad_norm": 1.054413080215454, "learning_rate": 0.00010142857142857143, "loss": 0.2892, "step": 425 }, { "epoch": 5.059171597633136, "grad_norm": 0.7603329420089722, "learning_rate": 0.00010023809523809524, "loss": 0.2865, "step": 430 }, { "epoch": 5.118343195266272, "grad_norm": 0.9612646102905273, "learning_rate": 9.904761904761905e-05, "loss": 0.2215, "step": 435 }, { "epoch": 5.177514792899408, "grad_norm": 0.8669071793556213, "learning_rate": 9.785714285714286e-05, "loss": 0.2078, "step": 440 }, { "epoch": 5.236686390532545, "grad_norm": 0.7441051006317139, "learning_rate": 9.666666666666667e-05, "loss": 0.233, "step": 445 }, { "epoch": 5.295857988165681, "grad_norm": 0.5900620818138123, "learning_rate": 9.547619047619049e-05, "loss": 0.2581, "step": 450 }, { "epoch": 5.355029585798817, "grad_norm": 0.990178644657135, "learning_rate": 9.428571428571429e-05, "loss": 0.2603, "step": 455 }, { "epoch": 5.414201183431953, "grad_norm": 0.7644340991973877, "learning_rate": 9.309523809523811e-05, "loss": 0.2021, "step": 460 }, { "epoch": 5.4733727810650885, "grad_norm": 0.5087964534759521, "learning_rate": 9.19047619047619e-05, "loss": 0.1126, "step": 465 }, { "epoch": 5.5325443786982245, "grad_norm": 0.7896738052368164, "learning_rate": 9.071428571428571e-05, "loss": 0.2084, "step": 470 }, { "epoch": 5.591715976331361, "grad_norm": 0.71749347448349, "learning_rate": 8.952380952380953e-05, "loss": 0.1618, "step": 475 }, { "epoch": 5.650887573964497, "grad_norm": 0.8466284871101379, "learning_rate": 8.833333333333333e-05, "loss": 0.2574, "step": 480 }, { "epoch": 5.710059171597633, "grad_norm": 1.0023925304412842, "learning_rate": 8.714285714285715e-05, "loss": 0.1985, "step": 485 }, { "epoch": 5.769230769230769, "grad_norm": 0.8096638321876526, "learning_rate": 8.595238095238096e-05, "loss": 0.285, "step": 490 }, { "epoch": 5.828402366863905, "grad_norm": 0.9154897332191467, "learning_rate": 8.476190476190477e-05, "loss": 0.2051, "step": 495 }, { "epoch": 5.887573964497041, "grad_norm": 0.9506188035011292, "learning_rate": 8.357142857142858e-05, "loss": 0.2664, "step": 500 }, { "epoch": 5.946745562130177, "grad_norm": 0.8935821056365967, "learning_rate": 8.238095238095238e-05, "loss": 0.1717, "step": 505 }, { "epoch": 6.0, "grad_norm": 1.2274423837661743, "learning_rate": 8.11904761904762e-05, "loss": 0.2529, "step": 510 }, { "epoch": 6.059171597633136, "grad_norm": 0.6341638565063477, "learning_rate": 8e-05, "loss": 0.1795, "step": 515 }, { "epoch": 6.118343195266272, "grad_norm": 1.273710012435913, "learning_rate": 7.880952380952382e-05, "loss": 0.1612, "step": 520 }, { "epoch": 6.177514792899408, "grad_norm": 1.065499186515808, "learning_rate": 7.761904761904762e-05, "loss": 0.1839, "step": 525 }, { "epoch": 6.236686390532545, "grad_norm": 0.5382740497589111, "learning_rate": 7.642857142857143e-05, "loss": 0.1628, "step": 530 }, { "epoch": 6.295857988165681, "grad_norm": 0.6181464791297913, "learning_rate": 7.523809523809524e-05, "loss": 0.141, "step": 535 }, { "epoch": 6.355029585798817, "grad_norm": 0.7450206875801086, "learning_rate": 7.404761904761905e-05, "loss": 0.1453, "step": 540 }, { "epoch": 6.414201183431953, "grad_norm": 0.9426142573356628, "learning_rate": 7.285714285714286e-05, "loss": 0.1403, "step": 545 }, { "epoch": 6.4733727810650885, "grad_norm": 0.9675353169441223, "learning_rate": 7.166666666666667e-05, "loss": 0.1216, "step": 550 }, { "epoch": 6.5325443786982245, "grad_norm": 0.5108327269554138, "learning_rate": 7.047619047619048e-05, "loss": 0.1484, "step": 555 }, { "epoch": 6.591715976331361, "grad_norm": 0.6549590229988098, "learning_rate": 6.928571428571429e-05, "loss": 0.1338, "step": 560 }, { "epoch": 6.650887573964497, "grad_norm": 0.843664288520813, "learning_rate": 6.80952380952381e-05, "loss": 0.1586, "step": 565 }, { "epoch": 6.710059171597633, "grad_norm": 0.8650611639022827, "learning_rate": 6.69047619047619e-05, "loss": 0.1517, "step": 570 }, { "epoch": 6.769230769230769, "grad_norm": 0.7471471428871155, "learning_rate": 6.571428571428571e-05, "loss": 0.1966, "step": 575 }, { "epoch": 6.828402366863905, "grad_norm": 0.7219163775444031, "learning_rate": 6.452380952380954e-05, "loss": 0.1368, "step": 580 }, { "epoch": 6.887573964497041, "grad_norm": 0.911191463470459, "learning_rate": 6.333333333333333e-05, "loss": 0.1681, "step": 585 }, { "epoch": 6.946745562130177, "grad_norm": 0.6017201542854309, "learning_rate": 6.214285714285714e-05, "loss": 0.1675, "step": 590 }, { "epoch": 7.0, "grad_norm": 1.0746937990188599, "learning_rate": 6.0952380952380964e-05, "loss": 0.1944, "step": 595 }, { "epoch": 7.059171597633136, "grad_norm": 0.8227368593215942, "learning_rate": 5.9761904761904766e-05, "loss": 0.076, "step": 600 }, { "epoch": 7.118343195266272, "grad_norm": 0.6826525330543518, "learning_rate": 5.8571428571428575e-05, "loss": 0.1048, "step": 605 }, { "epoch": 7.177514792899408, "grad_norm": 1.0038989782333374, "learning_rate": 5.738095238095238e-05, "loss": 0.0998, "step": 610 }, { "epoch": 7.236686390532545, "grad_norm": 0.8537135720252991, "learning_rate": 5.619047619047619e-05, "loss": 0.1035, "step": 615 }, { "epoch": 7.295857988165681, "grad_norm": 0.7067388892173767, "learning_rate": 5.500000000000001e-05, "loss": 0.1186, "step": 620 }, { "epoch": 7.355029585798817, "grad_norm": 0.7003813982009888, "learning_rate": 5.380952380952381e-05, "loss": 0.0803, "step": 625 }, { "epoch": 7.414201183431953, "grad_norm": 0.9923582673072815, "learning_rate": 5.261904761904763e-05, "loss": 0.1121, "step": 630 }, { "epoch": 7.4733727810650885, "grad_norm": 0.4991615116596222, "learning_rate": 5.142857142857143e-05, "loss": 0.1056, "step": 635 }, { "epoch": 7.5325443786982245, "grad_norm": 0.7406235337257385, "learning_rate": 5.023809523809524e-05, "loss": 0.1135, "step": 640 }, { "epoch": 7.591715976331361, "grad_norm": 0.5871267318725586, "learning_rate": 4.904761904761905e-05, "loss": 0.1141, "step": 645 }, { "epoch": 7.650887573964497, "grad_norm": 0.5495700836181641, "learning_rate": 4.785714285714286e-05, "loss": 0.07, "step": 650 }, { "epoch": 7.710059171597633, "grad_norm": 0.9295830130577087, "learning_rate": 4.666666666666667e-05, "loss": 0.1293, "step": 655 }, { "epoch": 7.769230769230769, "grad_norm": 0.9041563272476196, "learning_rate": 4.547619047619048e-05, "loss": 0.1034, "step": 660 }, { "epoch": 7.828402366863905, "grad_norm": 0.6490697264671326, "learning_rate": 4.428571428571428e-05, "loss": 0.0898, "step": 665 }, { "epoch": 7.887573964497041, "grad_norm": 0.5583420991897583, "learning_rate": 4.30952380952381e-05, "loss": 0.0891, "step": 670 }, { "epoch": 7.946745562130177, "grad_norm": 0.7829737663269043, "learning_rate": 4.190476190476191e-05, "loss": 0.112, "step": 675 }, { "epoch": 8.0, "grad_norm": 0.48943546414375305, "learning_rate": 4.0714285714285717e-05, "loss": 0.115, "step": 680 }, { "epoch": 8.059171597633137, "grad_norm": 0.4989611506462097, "learning_rate": 3.9523809523809526e-05, "loss": 0.0645, "step": 685 }, { "epoch": 8.118343195266272, "grad_norm": 0.5723634362220764, "learning_rate": 3.8333333333333334e-05, "loss": 0.0603, "step": 690 }, { "epoch": 8.177514792899409, "grad_norm": 0.5361748933792114, "learning_rate": 3.7142857142857143e-05, "loss": 0.0626, "step": 695 }, { "epoch": 8.236686390532544, "grad_norm": 0.8497764468193054, "learning_rate": 3.595238095238095e-05, "loss": 0.0856, "step": 700 }, { "epoch": 8.29585798816568, "grad_norm": 0.48423126339912415, "learning_rate": 3.476190476190476e-05, "loss": 0.0568, "step": 705 }, { "epoch": 8.355029585798816, "grad_norm": 0.30722182989120483, "learning_rate": 3.357142857142857e-05, "loss": 0.0564, "step": 710 }, { "epoch": 8.414201183431953, "grad_norm": 0.637298047542572, "learning_rate": 3.2380952380952386e-05, "loss": 0.0792, "step": 715 }, { "epoch": 8.47337278106509, "grad_norm": 1.1620301008224487, "learning_rate": 3.1190476190476195e-05, "loss": 0.0848, "step": 720 }, { "epoch": 8.532544378698224, "grad_norm": 0.9842550158500671, "learning_rate": 3e-05, "loss": 0.0695, "step": 725 }, { "epoch": 8.591715976331361, "grad_norm": 0.5429280996322632, "learning_rate": 2.880952380952381e-05, "loss": 0.0691, "step": 730 }, { "epoch": 8.650887573964496, "grad_norm": 0.4466063380241394, "learning_rate": 2.7619047619047622e-05, "loss": 0.0427, "step": 735 }, { "epoch": 8.710059171597633, "grad_norm": 0.7048435807228088, "learning_rate": 2.642857142857143e-05, "loss": 0.066, "step": 740 }, { "epoch": 8.76923076923077, "grad_norm": 0.71544349193573, "learning_rate": 2.523809523809524e-05, "loss": 0.048, "step": 745 }, { "epoch": 8.828402366863905, "grad_norm": 0.27939409017562866, "learning_rate": 2.404761904761905e-05, "loss": 0.047, "step": 750 }, { "epoch": 8.887573964497042, "grad_norm": 0.30130356550216675, "learning_rate": 2.2857142857142858e-05, "loss": 0.0715, "step": 755 }, { "epoch": 8.946745562130177, "grad_norm": 0.6010681986808777, "learning_rate": 2.1666666666666667e-05, "loss": 0.0478, "step": 760 }, { "epoch": 9.0, "grad_norm": 0.5269991159439087, "learning_rate": 2.0476190476190476e-05, "loss": 0.048, "step": 765 }, { "epoch": 9.059171597633137, "grad_norm": 0.30882084369659424, "learning_rate": 1.928571428571429e-05, "loss": 0.0508, "step": 770 }, { "epoch": 9.118343195266272, "grad_norm": 0.3261071741580963, "learning_rate": 1.8095238095238094e-05, "loss": 0.0439, "step": 775 }, { "epoch": 9.177514792899409, "grad_norm": 0.33494994044303894, "learning_rate": 1.6904761904761906e-05, "loss": 0.0394, "step": 780 }, { "epoch": 9.236686390532544, "grad_norm": 0.470735639333725, "learning_rate": 1.5714285714285715e-05, "loss": 0.0465, "step": 785 }, { "epoch": 9.29585798816568, "grad_norm": 0.4816909432411194, "learning_rate": 1.4523809523809526e-05, "loss": 0.0461, "step": 790 }, { "epoch": 9.355029585798816, "grad_norm": 0.332380086183548, "learning_rate": 1.3333333333333333e-05, "loss": 0.0438, "step": 795 }, { "epoch": 9.414201183431953, "grad_norm": 1.0364826917648315, "learning_rate": 1.2142857142857144e-05, "loss": 0.0575, "step": 800 }, { "epoch": 9.47337278106509, "grad_norm": 0.31686267256736755, "learning_rate": 1.0952380952380953e-05, "loss": 0.0376, "step": 805 }, { "epoch": 9.532544378698224, "grad_norm": 0.19316697120666504, "learning_rate": 9.761904761904762e-06, "loss": 0.0546, "step": 810 }, { "epoch": 9.591715976331361, "grad_norm": 0.3316756784915924, "learning_rate": 8.571428571428573e-06, "loss": 0.0303, "step": 815 }, { "epoch": 9.650887573964496, "grad_norm": 0.38312825560569763, "learning_rate": 7.380952380952382e-06, "loss": 0.0407, "step": 820 }, { "epoch": 9.710059171597633, "grad_norm": 0.1672104001045227, "learning_rate": 6.190476190476191e-06, "loss": 0.0406, "step": 825 }, { "epoch": 9.76923076923077, "grad_norm": 0.30679717659950256, "learning_rate": 5e-06, "loss": 0.0316, "step": 830 }, { "epoch": 9.828402366863905, "grad_norm": 0.3245919346809387, "learning_rate": 3.8095238095238102e-06, "loss": 0.0361, "step": 835 }, { "epoch": 9.887573964497042, "grad_norm": 0.25631895661354065, "learning_rate": 2.6190476190476192e-06, "loss": 0.0351, "step": 840 }, { "epoch": 9.946745562130177, "grad_norm": 0.2532467842102051, "learning_rate": 1.4285714285714286e-06, "loss": 0.0397, "step": 845 }, { "epoch": 10.0, "grad_norm": 0.7708420157432556, "learning_rate": 2.3809523809523814e-07, "loss": 0.0416, "step": 850 }, { "epoch": 10.0, "step": 850, "total_flos": 5.971841440128e+16, "train_loss": 0.3865027742876726, "train_runtime": 1732.5271, "train_samples_per_second": 3.902, "train_steps_per_second": 0.491 } ], "logging_steps": 5, "max_steps": 850, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 5.971841440128e+16, "train_batch_size": 2, "trial_name": null, "trial_params": null }