| { |
| "best_global_step": null, |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 2.339553604098061, |
| "eval_steps": 500, |
| "global_step": 800, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "entropy": 0.5426861481741071, |
| "epoch": 0.029271862422246615, |
| "grad_norm": 25.375, |
| "learning_rate": 1.8000000000000001e-06, |
| "loss": 0.8153, |
| "mean_token_accuracy": 0.859016589075327, |
| "num_tokens": 178219.0, |
| "step": 10 |
| }, |
| { |
| "entropy": 0.5261224403977394, |
| "epoch": 0.05854372484449323, |
| "grad_norm": 15.75, |
| "learning_rate": 3.8000000000000005e-06, |
| "loss": 0.683, |
| "mean_token_accuracy": 0.8744383942335844, |
| "num_tokens": 364483.0, |
| "step": 20 |
| }, |
| { |
| "entropy": 0.5709817312657833, |
| "epoch": 0.08781558726673985, |
| "grad_norm": 7.75, |
| "learning_rate": 5.8e-06, |
| "loss": 0.5136, |
| "mean_token_accuracy": 0.8928901240229606, |
| "num_tokens": 544552.0, |
| "step": 30 |
| }, |
| { |
| "entropy": 0.713307585567236, |
| "epoch": 0.11708744968898646, |
| "grad_norm": 12.5625, |
| "learning_rate": 7.800000000000002e-06, |
| "loss": 0.4313, |
| "mean_token_accuracy": 0.9048704490065574, |
| "num_tokens": 717853.0, |
| "step": 40 |
| }, |
| { |
| "entropy": 0.6436625245958567, |
| "epoch": 0.14635931211123307, |
| "grad_norm": 6.9375, |
| "learning_rate": 9.800000000000001e-06, |
| "loss": 0.221, |
| "mean_token_accuracy": 0.949518696218729, |
| "num_tokens": 892769.0, |
| "step": 50 |
| }, |
| { |
| "entropy": 0.6698052477091551, |
| "epoch": 0.1756311745334797, |
| "grad_norm": 5.625, |
| "learning_rate": 9.997902051783373e-06, |
| "loss": 0.2292, |
| "mean_token_accuracy": 0.9477126337587833, |
| "num_tokens": 1062544.0, |
| "step": 60 |
| }, |
| { |
| "entropy": 0.6264983955770731, |
| "epoch": 0.20490303695572631, |
| "grad_norm": 2.25, |
| "learning_rate": 9.990652145366113e-06, |
| "loss": 0.1432, |
| "mean_token_accuracy": 0.9651672072708607, |
| "num_tokens": 1238635.0, |
| "step": 70 |
| }, |
| { |
| "entropy": 0.5786459453403949, |
| "epoch": 0.23417489937797292, |
| "grad_norm": 7.375, |
| "learning_rate": 9.978231889316302e-06, |
| "loss": 0.1347, |
| "mean_token_accuracy": 0.9673161715269089, |
| "num_tokens": 1422527.0, |
| "step": 80 |
| }, |
| { |
| "entropy": 0.5798539651557804, |
| "epoch": 0.26344676180021953, |
| "grad_norm": 5.65625, |
| "learning_rate": 9.960654151103846e-06, |
| "loss": 0.1386, |
| "mean_token_accuracy": 0.9657714806497097, |
| "num_tokens": 1604412.0, |
| "step": 90 |
| }, |
| { |
| "entropy": 0.6099357772618532, |
| "epoch": 0.29271862422246614, |
| "grad_norm": 2.640625, |
| "learning_rate": 9.937937141385323e-06, |
| "loss": 0.1463, |
| "mean_token_accuracy": 0.9633365988731384, |
| "num_tokens": 1785808.0, |
| "step": 100 |
| }, |
| { |
| "entropy": 0.6332811841741204, |
| "epoch": 0.32199048664471275, |
| "grad_norm": 2.734375, |
| "learning_rate": 9.91010439513761e-06, |
| "loss": 0.1918, |
| "mean_token_accuracy": 0.9552642989903688, |
| "num_tokens": 1963662.0, |
| "step": 110 |
| }, |
| { |
| "entropy": 0.6065963115543127, |
| "epoch": 0.3512623490669594, |
| "grad_norm": 3.484375, |
| "learning_rate": 9.87718474727549e-06, |
| "loss": 0.1421, |
| "mean_token_accuracy": 0.965429300814867, |
| "num_tokens": 2138861.0, |
| "step": 120 |
| }, |
| { |
| "entropy": 0.6128268184140324, |
| "epoch": 0.380534211489206, |
| "grad_norm": 4.75, |
| "learning_rate": 9.839212302778493e-06, |
| "loss": 0.1156, |
| "mean_token_accuracy": 0.9703474834561348, |
| "num_tokens": 2314376.0, |
| "step": 130 |
| }, |
| { |
| "entropy": 0.6441192388534546, |
| "epoch": 0.40980607391145263, |
| "grad_norm": 4.6875, |
| "learning_rate": 9.796226401357884e-06, |
| "loss": 0.1499, |
| "mean_token_accuracy": 0.9633081808686257, |
| "num_tokens": 2494288.0, |
| "step": 140 |
| }, |
| { |
| "entropy": 0.6087658466771245, |
| "epoch": 0.43907793633369924, |
| "grad_norm": 2.828125, |
| "learning_rate": 9.748271576700476e-06, |
| "loss": 0.1374, |
| "mean_token_accuracy": 0.9672404788434505, |
| "num_tokens": 2682554.0, |
| "step": 150 |
| }, |
| { |
| "entropy": 0.5889677032828331, |
| "epoch": 0.46834979875594585, |
| "grad_norm": 2.78125, |
| "learning_rate": 9.69539751033141e-06, |
| "loss": 0.124, |
| "mean_token_accuracy": 0.9677505977451801, |
| "num_tokens": 2855437.0, |
| "step": 160 |
| }, |
| { |
| "entropy": 0.6262755762785673, |
| "epoch": 0.49762166117819245, |
| "grad_norm": 3.75, |
| "learning_rate": 9.637658980143771e-06, |
| "loss": 0.1143, |
| "mean_token_accuracy": 0.9720054470002651, |
| "num_tokens": 3034970.0, |
| "step": 170 |
| }, |
| { |
| "entropy": 0.626539696007967, |
| "epoch": 0.5268935236004391, |
| "grad_norm": 3.953125, |
| "learning_rate": 9.575115803648303e-06, |
| "loss": 0.1188, |
| "mean_token_accuracy": 0.9688047252595424, |
| "num_tokens": 3208412.0, |
| "step": 180 |
| }, |
| { |
| "entropy": 0.6318832565099001, |
| "epoch": 0.5561653860226857, |
| "grad_norm": 2.03125, |
| "learning_rate": 9.507832776002069e-06, |
| "loss": 0.1235, |
| "mean_token_accuracy": 0.9680231802165509, |
| "num_tokens": 3395494.0, |
| "step": 190 |
| }, |
| { |
| "entropy": 0.6783206924796105, |
| "epoch": 0.5854372484449323, |
| "grad_norm": 2.0625, |
| "learning_rate": 9.43587960288023e-06, |
| "loss": 0.162, |
| "mean_token_accuracy": 0.9603818170726299, |
| "num_tokens": 3573674.0, |
| "step": 200 |
| }, |
| { |
| "entropy": 0.6365317944437265, |
| "epoch": 0.6147091108671789, |
| "grad_norm": 2.421875, |
| "learning_rate": 9.359330828260477e-06, |
| "loss": 0.1693, |
| "mean_token_accuracy": 0.9591907132416964, |
| "num_tokens": 3749797.0, |
| "step": 210 |
| }, |
| { |
| "entropy": 0.6506655000150203, |
| "epoch": 0.6439809732894255, |
| "grad_norm": 3.515625, |
| "learning_rate": 9.278265757194983e-06, |
| "loss": 0.1203, |
| "mean_token_accuracy": 0.9689317874610424, |
| "num_tokens": 3927226.0, |
| "step": 220 |
| }, |
| { |
| "entropy": 0.6768201310187578, |
| "epoch": 0.6732528357116722, |
| "grad_norm": 2.390625, |
| "learning_rate": 9.1927683736498e-06, |
| "loss": 0.1644, |
| "mean_token_accuracy": 0.9586910635232926, |
| "num_tokens": 4107295.0, |
| "step": 230 |
| }, |
| { |
| "entropy": 0.638230774179101, |
| "epoch": 0.7025246981339188, |
| "grad_norm": 3.5625, |
| "learning_rate": 9.102927253496926e-06, |
| "loss": 0.1498, |
| "mean_token_accuracy": 0.9630043372511864, |
| "num_tokens": 4286908.0, |
| "step": 240 |
| }, |
| { |
| "entropy": 0.6300325348973275, |
| "epoch": 0.7317965605561654, |
| "grad_norm": 3.125, |
| "learning_rate": 9.008835472749085e-06, |
| "loss": 0.113, |
| "mean_token_accuracy": 0.9695447482168674, |
| "num_tokens": 4467461.0, |
| "step": 250 |
| }, |
| { |
| "entropy": 0.614183092303574, |
| "epoch": 0.761068422978412, |
| "grad_norm": 3.015625, |
| "learning_rate": 8.910590511132339e-06, |
| "loss": 0.1014, |
| "mean_token_accuracy": 0.9731229566037655, |
| "num_tokens": 4646737.0, |
| "step": 260 |
| }, |
| { |
| "entropy": 0.646520059555769, |
| "epoch": 0.7903402854006586, |
| "grad_norm": 3.21875, |
| "learning_rate": 8.808294151096436e-06, |
| "loss": 0.119, |
| "mean_token_accuracy": 0.9697787493467331, |
| "num_tokens": 4821010.0, |
| "step": 270 |
| }, |
| { |
| "entropy": 0.6479289051145315, |
| "epoch": 0.8196121478229053, |
| "grad_norm": 5.15625, |
| "learning_rate": 8.702052372367496e-06, |
| "loss": 0.1092, |
| "mean_token_accuracy": 0.972802146524191, |
| "num_tokens": 4999440.0, |
| "step": 280 |
| }, |
| { |
| "entropy": 0.6846032023429871, |
| "epoch": 0.8488840102451518, |
| "grad_norm": 4.1875, |
| "learning_rate": 8.591975242152293e-06, |
| "loss": 0.1371, |
| "mean_token_accuracy": 0.9650875002145767, |
| "num_tokens": 5171265.0, |
| "step": 290 |
| }, |
| { |
| "entropy": 0.6806027568876744, |
| "epoch": 0.8781558726673985, |
| "grad_norm": 2.421875, |
| "learning_rate": 8.478176801107872e-06, |
| "loss": 0.1554, |
| "mean_token_accuracy": 0.9597282513976098, |
| "num_tokens": 5338761.0, |
| "step": 300 |
| }, |
| { |
| "entropy": 0.6907825466245413, |
| "epoch": 0.907427735089645, |
| "grad_norm": 2.796875, |
| "learning_rate": 8.360774945194666e-06, |
| "loss": 0.1265, |
| "mean_token_accuracy": 0.9672806590795517, |
| "num_tokens": 5515062.0, |
| "step": 310 |
| }, |
| { |
| "entropy": 0.6767658580094575, |
| "epoch": 0.9366995975118917, |
| "grad_norm": 2.765625, |
| "learning_rate": 8.239891303535457e-06, |
| "loss": 0.1201, |
| "mean_token_accuracy": 0.9692090585827827, |
| "num_tokens": 5696911.0, |
| "step": 320 |
| }, |
| { |
| "entropy": 0.6577182695269584, |
| "epoch": 0.9659714599341384, |
| "grad_norm": 4.3125, |
| "learning_rate": 8.1156511124068e-06, |
| "loss": 0.1023, |
| "mean_token_accuracy": 0.9728448636829853, |
| "num_tokens": 5869301.0, |
| "step": 330 |
| }, |
| { |
| "entropy": 0.6421828601509333, |
| "epoch": 0.9952433223563849, |
| "grad_norm": 4.65625, |
| "learning_rate": 7.988183085493362e-06, |
| "loss": 0.1263, |
| "mean_token_accuracy": 0.9664340995252132, |
| "num_tokens": 6044424.0, |
| "step": 340 |
| }, |
| { |
| "entropy": 0.663570727620806, |
| "epoch": 1.0234174899377972, |
| "grad_norm": 1.53125, |
| "learning_rate": 7.85761928053969e-06, |
| "loss": 0.106, |
| "mean_token_accuracy": 0.971852526262209, |
| "num_tokens": 6217116.0, |
| "step": 350 |
| }, |
| { |
| "entropy": 0.6997070100158453, |
| "epoch": 1.0526893523600438, |
| "grad_norm": 2.984375, |
| "learning_rate": 7.72409496253747e-06, |
| "loss": 0.119, |
| "mean_token_accuracy": 0.9707389414310456, |
| "num_tokens": 6387512.0, |
| "step": 360 |
| }, |
| { |
| "entropy": 0.628655917569995, |
| "epoch": 1.0819612147822906, |
| "grad_norm": 2.96875, |
| "learning_rate": 7.5877484635900876e-06, |
| "loss": 0.1217, |
| "mean_token_accuracy": 0.9687882207334042, |
| "num_tokens": 6573367.0, |
| "step": 370 |
| }, |
| { |
| "entropy": 0.6477397110313177, |
| "epoch": 1.1112330772045371, |
| "grad_norm": 2.765625, |
| "learning_rate": 7.448721039599616e-06, |
| "loss": 0.131, |
| "mean_token_accuracy": 0.9669771507382393, |
| "num_tokens": 6751810.0, |
| "step": 380 |
| }, |
| { |
| "entropy": 0.61260422822088, |
| "epoch": 1.1405049396267837, |
| "grad_norm": 5.6875, |
| "learning_rate": 7.307156723924742e-06, |
| "loss": 0.1079, |
| "mean_token_accuracy": 0.9710600562393665, |
| "num_tokens": 6940710.0, |
| "step": 390 |
| }, |
| { |
| "entropy": 0.6250462032854557, |
| "epoch": 1.1697768020490305, |
| "grad_norm": 2.625, |
| "learning_rate": 7.1632021781612305e-06, |
| "loss": 0.1033, |
| "mean_token_accuracy": 0.9722936369478703, |
| "num_tokens": 7120616.0, |
| "step": 400 |
| }, |
| { |
| "entropy": 0.6539317451417446, |
| "epoch": 1.199048664471277, |
| "grad_norm": 2.484375, |
| "learning_rate": 7.017006540199501e-06, |
| "loss": 0.1275, |
| "mean_token_accuracy": 0.9674237795174122, |
| "num_tokens": 7295346.0, |
| "step": 410 |
| }, |
| { |
| "entropy": 0.7194237690418959, |
| "epoch": 1.2283205268935236, |
| "grad_norm": 7.84375, |
| "learning_rate": 6.8687212697167685e-06, |
| "loss": 0.1547, |
| "mean_token_accuracy": 0.9618344724178314, |
| "num_tokens": 7473599.0, |
| "step": 420 |
| }, |
| { |
| "entropy": 0.658033623546362, |
| "epoch": 1.25759238931577, |
| "grad_norm": 2.703125, |
| "learning_rate": 6.718499991263776e-06, |
| "loss": 0.1033, |
| "mean_token_accuracy": 0.9726057484745979, |
| "num_tokens": 7666324.0, |
| "step": 430 |
| }, |
| { |
| "entropy": 0.6714454110711813, |
| "epoch": 1.2868642517380169, |
| "grad_norm": 3.046875, |
| "learning_rate": 6.566498335108719e-06, |
| "loss": 0.1742, |
| "mean_token_accuracy": 0.9564896896481514, |
| "num_tokens": 7842308.0, |
| "step": 440 |
| }, |
| { |
| "entropy": 0.6405692713335156, |
| "epoch": 1.3161361141602634, |
| "grad_norm": 1.6015625, |
| "learning_rate": 6.412873776003224e-06, |
| "loss": 0.1023, |
| "mean_token_accuracy": 0.9734247334301471, |
| "num_tokens": 8027201.0, |
| "step": 450 |
| }, |
| { |
| "entropy": 0.6420178588479757, |
| "epoch": 1.34540797658251, |
| "grad_norm": 2.4375, |
| "learning_rate": 6.2577854700374326e-06, |
| "loss": 0.0912, |
| "mean_token_accuracy": 0.973945663869381, |
| "num_tokens": 8198350.0, |
| "step": 460 |
| }, |
| { |
| "entropy": 0.6285538610070944, |
| "epoch": 1.3746798390047568, |
| "grad_norm": 2.578125, |
| "learning_rate": 6.101394089753215e-06, |
| "loss": 0.1143, |
| "mean_token_accuracy": 0.9697877489030361, |
| "num_tokens": 8384460.0, |
| "step": 470 |
| }, |
| { |
| "entropy": 0.6119630802422762, |
| "epoch": 1.4039517014270033, |
| "grad_norm": 2.71875, |
| "learning_rate": 5.9438616576863085e-06, |
| "loss": 0.1016, |
| "mean_token_accuracy": 0.9731581903994083, |
| "num_tokens": 8555391.0, |
| "step": 480 |
| }, |
| { |
| "entropy": 0.6343989443033934, |
| "epoch": 1.4332235638492499, |
| "grad_norm": 3.40625, |
| "learning_rate": 5.785351378509875e-06, |
| "loss": 0.112, |
| "mean_token_accuracy": 0.970611660182476, |
| "num_tokens": 8734253.0, |
| "step": 490 |
| }, |
| { |
| "entropy": 0.674339522048831, |
| "epoch": 1.4624954262714964, |
| "grad_norm": 6.125, |
| "learning_rate": 5.626027469953345e-06, |
| "loss": 0.1395, |
| "mean_token_accuracy": 0.9643946584314108, |
| "num_tokens": 8910777.0, |
| "step": 500 |
| }, |
| { |
| "entropy": 0.665541959553957, |
| "epoch": 1.4917672886937432, |
| "grad_norm": 6.9375, |
| "learning_rate": 5.466054992671736e-06, |
| "loss": 0.1391, |
| "mean_token_accuracy": 0.9648671910166741, |
| "num_tokens": 9079044.0, |
| "step": 510 |
| }, |
| { |
| "entropy": 0.6814688537269831, |
| "epoch": 1.5210391511159898, |
| "grad_norm": 3.96875, |
| "learning_rate": 5.3055996792416795e-06, |
| "loss": 0.1071, |
| "mean_token_accuracy": 0.9721961826086044, |
| "num_tokens": 9256503.0, |
| "step": 520 |
| }, |
| { |
| "entropy": 0.699323944374919, |
| "epoch": 1.5503110135382365, |
| "grad_norm": 2.375, |
| "learning_rate": 5.14482776246135e-06, |
| "loss": 0.1457, |
| "mean_token_accuracy": 0.9631756335496903, |
| "num_tokens": 9427931.0, |
| "step": 530 |
| }, |
| { |
| "entropy": 0.6563603695482015, |
| "epoch": 1.579582875960483, |
| "grad_norm": 5.8125, |
| "learning_rate": 4.9839058031321454e-06, |
| "loss": 0.0932, |
| "mean_token_accuracy": 0.9753052346408367, |
| "num_tokens": 9605378.0, |
| "step": 540 |
| }, |
| { |
| "entropy": 0.6394913710653782, |
| "epoch": 1.6088547383827296, |
| "grad_norm": 3.015625, |
| "learning_rate": 4.8230005175005765e-06, |
| "loss": 0.0978, |
| "mean_token_accuracy": 0.9747575528919696, |
| "num_tokens": 9790877.0, |
| "step": 550 |
| }, |
| { |
| "entropy": 0.6210989141836762, |
| "epoch": 1.6381266008049762, |
| "grad_norm": 4.71875, |
| "learning_rate": 4.66227860453908e-06, |
| "loss": 0.1128, |
| "mean_token_accuracy": 0.9707565441727638, |
| "num_tokens": 9979162.0, |
| "step": 560 |
| }, |
| { |
| "entropy": 0.6766301516443491, |
| "epoch": 1.6673984632272227, |
| "grad_norm": 3.65625, |
| "learning_rate": 4.5019065732447596e-06, |
| "loss": 0.1219, |
| "mean_token_accuracy": 0.9683363229036331, |
| "num_tokens": 10153683.0, |
| "step": 570 |
| }, |
| { |
| "entropy": 0.6057208560407161, |
| "epoch": 1.6966703256494693, |
| "grad_norm": 6.625, |
| "learning_rate": 4.342050570134933e-06, |
| "loss": 0.1118, |
| "mean_token_accuracy": 0.9711062803864479, |
| "num_tokens": 10343239.0, |
| "step": 580 |
| }, |
| { |
| "entropy": 0.6616282057017088, |
| "epoch": 1.725942188071716, |
| "grad_norm": 3.328125, |
| "learning_rate": 4.1828762071181924e-06, |
| "loss": 0.1256, |
| "mean_token_accuracy": 0.967843408882618, |
| "num_tokens": 10510567.0, |
| "step": 590 |
| }, |
| { |
| "entropy": 0.6358083071187138, |
| "epoch": 1.7552140504939628, |
| "grad_norm": 2.828125, |
| "learning_rate": 4.02454838991936e-06, |
| "loss": 0.1111, |
| "mean_token_accuracy": 0.9699119538068771, |
| "num_tokens": 10693481.0, |
| "step": 600 |
| }, |
| { |
| "entropy": 0.5980179835110903, |
| "epoch": 1.7844859129162094, |
| "grad_norm": 5.03125, |
| "learning_rate": 3.86723114723601e-06, |
| "loss": 0.0939, |
| "mean_token_accuracy": 0.9751992784440517, |
| "num_tokens": 10872219.0, |
| "step": 610 |
| }, |
| { |
| "entropy": 0.6339624278247357, |
| "epoch": 1.813757775338456, |
| "grad_norm": 2.609375, |
| "learning_rate": 3.7110874608036375e-06, |
| "loss": 0.1029, |
| "mean_token_accuracy": 0.972338755428791, |
| "num_tokens": 11041833.0, |
| "step": 620 |
| }, |
| { |
| "entropy": 0.6484420213848352, |
| "epoch": 1.8430296377607025, |
| "grad_norm": 3.9375, |
| "learning_rate": 3.556279096545467e-06, |
| "loss": 0.0949, |
| "mean_token_accuracy": 0.9737608321011066, |
| "num_tokens": 11214369.0, |
| "step": 630 |
| }, |
| { |
| "entropy": 0.640478839725256, |
| "epoch": 1.872301500182949, |
| "grad_norm": 2.9375, |
| "learning_rate": 3.4029664369818426e-06, |
| "loss": 0.1418, |
| "mean_token_accuracy": 0.9651787281036377, |
| "num_tokens": 11386576.0, |
| "step": 640 |
| }, |
| { |
| "entropy": 0.631079326197505, |
| "epoch": 1.9015733626051956, |
| "grad_norm": 2.125, |
| "learning_rate": 3.251308315072862e-06, |
| "loss": 0.1073, |
| "mean_token_accuracy": 0.9708538435399532, |
| "num_tokens": 11560738.0, |
| "step": 650 |
| }, |
| { |
| "entropy": 0.6620105486363173, |
| "epoch": 1.9308452250274424, |
| "grad_norm": 2.5625, |
| "learning_rate": 3.1014618496663298e-06, |
| "loss": 0.1672, |
| "mean_token_accuracy": 0.959955221414566, |
| "num_tokens": 11742855.0, |
| "step": 660 |
| }, |
| { |
| "entropy": 0.6634238740429282, |
| "epoch": 1.960117087449689, |
| "grad_norm": 5.3125, |
| "learning_rate": 2.9535822827215686e-06, |
| "loss": 0.094, |
| "mean_token_accuracy": 0.974735701829195, |
| "num_tokens": 11920135.0, |
| "step": 670 |
| }, |
| { |
| "entropy": 0.6778149709105492, |
| "epoch": 1.9893889498719357, |
| "grad_norm": 3.8125, |
| "learning_rate": 2.8078228184776974e-06, |
| "loss": 0.1222, |
| "mean_token_accuracy": 0.9682413943111896, |
| "num_tokens": 12088155.0, |
| "step": 680 |
| }, |
| { |
| "entropy": 0.624413115637643, |
| "epoch": 2.017563117453348, |
| "grad_norm": 2.0625, |
| "learning_rate": 2.6643344647329784e-06, |
| "loss": 0.1305, |
| "mean_token_accuracy": 0.965500292839942, |
| "num_tokens": 12273212.0, |
| "step": 690 |
| }, |
| { |
| "entropy": 0.6615780189633369, |
| "epoch": 2.0468349798755945, |
| "grad_norm": 3.0625, |
| "learning_rate": 2.523265876399731e-06, |
| "loss": 0.1302, |
| "mean_token_accuracy": 0.9653139889240265, |
| "num_tokens": 12446204.0, |
| "step": 700 |
| }, |
| { |
| "entropy": 0.6647442825138569, |
| "epoch": 2.076106842297841, |
| "grad_norm": 3.125, |
| "learning_rate": 2.384763201496809e-06, |
| "loss": 0.1186, |
| "mean_token_accuracy": 0.9683853723108768, |
| "num_tokens": 12619520.0, |
| "step": 710 |
| }, |
| { |
| "entropy": 0.6487895751371979, |
| "epoch": 2.1053787047200876, |
| "grad_norm": 5.625, |
| "learning_rate": 2.248969929739273e-06, |
| "loss": 0.1201, |
| "mean_token_accuracy": 0.9684486784040928, |
| "num_tokens": 12790301.0, |
| "step": 720 |
| }, |
| { |
| "entropy": 0.6803991423919797, |
| "epoch": 2.1346505671423346, |
| "grad_norm": 2.828125, |
| "learning_rate": 2.1160267438820585e-06, |
| "loss": 0.1352, |
| "mean_token_accuracy": 0.9661958761513233, |
| "num_tokens": 12962085.0, |
| "step": 730 |
| }, |
| { |
| "entropy": 0.640974473580718, |
| "epoch": 2.163922429564581, |
| "grad_norm": 2.109375, |
| "learning_rate": 1.9860713739716846e-06, |
| "loss": 0.0973, |
| "mean_token_accuracy": 0.9733841702342033, |
| "num_tokens": 13136263.0, |
| "step": 740 |
| }, |
| { |
| "entropy": 0.6351502992212772, |
| "epoch": 2.1931942919868277, |
| "grad_norm": 2.78125, |
| "learning_rate": 1.8592384546569897e-06, |
| "loss": 0.1025, |
| "mean_token_accuracy": 0.9729525096714496, |
| "num_tokens": 13322241.0, |
| "step": 750 |
| }, |
| { |
| "entropy": 0.6530712179839611, |
| "epoch": 2.2224661544090742, |
| "grad_norm": 3.0625, |
| "learning_rate": 1.7356593857067161e-06, |
| "loss": 0.1037, |
| "mean_token_accuracy": 0.9716600969433784, |
| "num_tokens": 13495808.0, |
| "step": 760 |
| }, |
| { |
| "entropy": 0.6783195801079274, |
| "epoch": 2.251738016831321, |
| "grad_norm": 2.078125, |
| "learning_rate": 1.6154621958784522e-06, |
| "loss": 0.1281, |
| "mean_token_accuracy": 0.9669503092765808, |
| "num_tokens": 13663184.0, |
| "step": 770 |
| }, |
| { |
| "entropy": 0.6459674347192049, |
| "epoch": 2.2810098792535674, |
| "grad_norm": 4.125, |
| "learning_rate": 1.4987714102799755e-06, |
| "loss": 0.1225, |
| "mean_token_accuracy": 0.9684602275490761, |
| "num_tokens": 13846527.0, |
| "step": 780 |
| }, |
| { |
| "entropy": 0.6219800597056746, |
| "epoch": 2.310281741675814, |
| "grad_norm": 2.28125, |
| "learning_rate": 1.385707921360379e-06, |
| "loss": 0.1018, |
| "mean_token_accuracy": 0.9725745670497418, |
| "num_tokens": 14040306.0, |
| "step": 790 |
| }, |
| { |
| "entropy": 0.6727650195360184, |
| "epoch": 2.339553604098061, |
| "grad_norm": 5.875, |
| "learning_rate": 1.2763888636646838e-06, |
| "loss": 0.1039, |
| "mean_token_accuracy": 0.9730930998921394, |
| "num_tokens": 14211656.0, |
| "step": 800 |
| } |
| ], |
| "logging_steps": 10, |
| "max_steps": 1026, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 3, |
| "save_steps": 100, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": false |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 1.546803296916603e+17, |
| "train_batch_size": 1, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|