{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 2.339553604098061, "eval_steps": 500, "global_step": 800, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "entropy": 0.5426861481741071, "epoch": 0.029271862422246615, "grad_norm": 25.375, "learning_rate": 1.8000000000000001e-06, "loss": 0.8153, "mean_token_accuracy": 0.859016589075327, "num_tokens": 178219.0, "step": 10 }, { "entropy": 0.5261224403977394, "epoch": 0.05854372484449323, "grad_norm": 15.75, "learning_rate": 3.8000000000000005e-06, "loss": 0.683, "mean_token_accuracy": 0.8744383942335844, "num_tokens": 364483.0, "step": 20 }, { "entropy": 0.5709817312657833, "epoch": 0.08781558726673985, "grad_norm": 7.75, "learning_rate": 5.8e-06, "loss": 0.5136, "mean_token_accuracy": 0.8928901240229606, "num_tokens": 544552.0, "step": 30 }, { "entropy": 0.713307585567236, "epoch": 0.11708744968898646, "grad_norm": 12.5625, "learning_rate": 7.800000000000002e-06, "loss": 0.4313, "mean_token_accuracy": 0.9048704490065574, "num_tokens": 717853.0, "step": 40 }, { "entropy": 0.6436625245958567, "epoch": 0.14635931211123307, "grad_norm": 6.9375, "learning_rate": 9.800000000000001e-06, "loss": 0.221, "mean_token_accuracy": 0.949518696218729, "num_tokens": 892769.0, "step": 50 }, { "entropy": 0.6698052477091551, "epoch": 0.1756311745334797, "grad_norm": 5.625, "learning_rate": 9.997902051783373e-06, "loss": 0.2292, "mean_token_accuracy": 0.9477126337587833, "num_tokens": 1062544.0, "step": 60 }, { "entropy": 0.6264983955770731, "epoch": 0.20490303695572631, "grad_norm": 2.25, "learning_rate": 9.990652145366113e-06, "loss": 0.1432, "mean_token_accuracy": 0.9651672072708607, "num_tokens": 1238635.0, "step": 70 }, { "entropy": 0.5786459453403949, "epoch": 0.23417489937797292, "grad_norm": 7.375, "learning_rate": 9.978231889316302e-06, "loss": 0.1347, "mean_token_accuracy": 0.9673161715269089, "num_tokens": 1422527.0, "step": 80 }, { "entropy": 0.5798539651557804, "epoch": 0.26344676180021953, "grad_norm": 5.65625, "learning_rate": 9.960654151103846e-06, "loss": 0.1386, "mean_token_accuracy": 0.9657714806497097, "num_tokens": 1604412.0, "step": 90 }, { "entropy": 0.6099357772618532, "epoch": 0.29271862422246614, "grad_norm": 2.640625, "learning_rate": 9.937937141385323e-06, "loss": 0.1463, "mean_token_accuracy": 0.9633365988731384, "num_tokens": 1785808.0, "step": 100 }, { "entropy": 0.6332811841741204, "epoch": 0.32199048664471275, "grad_norm": 2.734375, "learning_rate": 9.91010439513761e-06, "loss": 0.1918, "mean_token_accuracy": 0.9552642989903688, "num_tokens": 1963662.0, "step": 110 }, { "entropy": 0.6065963115543127, "epoch": 0.3512623490669594, "grad_norm": 3.484375, "learning_rate": 9.87718474727549e-06, "loss": 0.1421, "mean_token_accuracy": 0.965429300814867, "num_tokens": 2138861.0, "step": 120 }, { "entropy": 0.6128268184140324, "epoch": 0.380534211489206, "grad_norm": 4.75, "learning_rate": 9.839212302778493e-06, "loss": 0.1156, "mean_token_accuracy": 0.9703474834561348, "num_tokens": 2314376.0, "step": 130 }, { "entropy": 0.6441192388534546, "epoch": 0.40980607391145263, "grad_norm": 4.6875, "learning_rate": 9.796226401357884e-06, "loss": 0.1499, "mean_token_accuracy": 0.9633081808686257, "num_tokens": 2494288.0, "step": 140 }, { "entropy": 0.6087658466771245, "epoch": 0.43907793633369924, "grad_norm": 2.828125, "learning_rate": 9.748271576700476e-06, "loss": 0.1374, "mean_token_accuracy": 0.9672404788434505, "num_tokens": 2682554.0, "step": 150 }, { "entropy": 0.5889677032828331, "epoch": 0.46834979875594585, "grad_norm": 2.78125, "learning_rate": 9.69539751033141e-06, "loss": 0.124, "mean_token_accuracy": 0.9677505977451801, "num_tokens": 2855437.0, "step": 160 }, { "entropy": 0.6262755762785673, "epoch": 0.49762166117819245, "grad_norm": 3.75, "learning_rate": 9.637658980143771e-06, "loss": 0.1143, "mean_token_accuracy": 0.9720054470002651, "num_tokens": 3034970.0, "step": 170 }, { "entropy": 0.626539696007967, "epoch": 0.5268935236004391, "grad_norm": 3.953125, "learning_rate": 9.575115803648303e-06, "loss": 0.1188, "mean_token_accuracy": 0.9688047252595424, "num_tokens": 3208412.0, "step": 180 }, { "entropy": 0.6318832565099001, "epoch": 0.5561653860226857, "grad_norm": 2.03125, "learning_rate": 9.507832776002069e-06, "loss": 0.1235, "mean_token_accuracy": 0.9680231802165509, "num_tokens": 3395494.0, "step": 190 }, { "entropy": 0.6783206924796105, "epoch": 0.5854372484449323, "grad_norm": 2.0625, "learning_rate": 9.43587960288023e-06, "loss": 0.162, "mean_token_accuracy": 0.9603818170726299, "num_tokens": 3573674.0, "step": 200 }, { "entropy": 0.6365317944437265, "epoch": 0.6147091108671789, "grad_norm": 2.421875, "learning_rate": 9.359330828260477e-06, "loss": 0.1693, "mean_token_accuracy": 0.9591907132416964, "num_tokens": 3749797.0, "step": 210 }, { "entropy": 0.6506655000150203, "epoch": 0.6439809732894255, "grad_norm": 3.515625, "learning_rate": 9.278265757194983e-06, "loss": 0.1203, "mean_token_accuracy": 0.9689317874610424, "num_tokens": 3927226.0, "step": 220 }, { "entropy": 0.6768201310187578, "epoch": 0.6732528357116722, "grad_norm": 2.390625, "learning_rate": 9.1927683736498e-06, "loss": 0.1644, "mean_token_accuracy": 0.9586910635232926, "num_tokens": 4107295.0, "step": 230 }, { "entropy": 0.638230774179101, "epoch": 0.7025246981339188, "grad_norm": 3.5625, "learning_rate": 9.102927253496926e-06, "loss": 0.1498, "mean_token_accuracy": 0.9630043372511864, "num_tokens": 4286908.0, "step": 240 }, { "entropy": 0.6300325348973275, "epoch": 0.7317965605561654, "grad_norm": 3.125, "learning_rate": 9.008835472749085e-06, "loss": 0.113, "mean_token_accuracy": 0.9695447482168674, "num_tokens": 4467461.0, "step": 250 }, { "entropy": 0.614183092303574, "epoch": 0.761068422978412, "grad_norm": 3.015625, "learning_rate": 8.910590511132339e-06, "loss": 0.1014, "mean_token_accuracy": 0.9731229566037655, "num_tokens": 4646737.0, "step": 260 }, { "entropy": 0.646520059555769, "epoch": 0.7903402854006586, "grad_norm": 3.21875, "learning_rate": 8.808294151096436e-06, "loss": 0.119, "mean_token_accuracy": 0.9697787493467331, "num_tokens": 4821010.0, "step": 270 }, { "entropy": 0.6479289051145315, "epoch": 0.8196121478229053, "grad_norm": 5.15625, "learning_rate": 8.702052372367496e-06, "loss": 0.1092, "mean_token_accuracy": 0.972802146524191, "num_tokens": 4999440.0, "step": 280 }, { "entropy": 0.6846032023429871, "epoch": 0.8488840102451518, "grad_norm": 4.1875, "learning_rate": 8.591975242152293e-06, "loss": 0.1371, "mean_token_accuracy": 0.9650875002145767, "num_tokens": 5171265.0, "step": 290 }, { "entropy": 0.6806027568876744, "epoch": 0.8781558726673985, "grad_norm": 2.421875, "learning_rate": 8.478176801107872e-06, "loss": 0.1554, "mean_token_accuracy": 0.9597282513976098, "num_tokens": 5338761.0, "step": 300 }, { "entropy": 0.6907825466245413, "epoch": 0.907427735089645, "grad_norm": 2.796875, "learning_rate": 8.360774945194666e-06, "loss": 0.1265, "mean_token_accuracy": 0.9672806590795517, "num_tokens": 5515062.0, "step": 310 }, { "entropy": 0.6767658580094575, "epoch": 0.9366995975118917, "grad_norm": 2.765625, "learning_rate": 8.239891303535457e-06, "loss": 0.1201, "mean_token_accuracy": 0.9692090585827827, "num_tokens": 5696911.0, "step": 320 }, { "entropy": 0.6577182695269584, "epoch": 0.9659714599341384, "grad_norm": 4.3125, "learning_rate": 8.1156511124068e-06, "loss": 0.1023, "mean_token_accuracy": 0.9728448636829853, "num_tokens": 5869301.0, "step": 330 }, { "entropy": 0.6421828601509333, "epoch": 0.9952433223563849, "grad_norm": 4.65625, "learning_rate": 7.988183085493362e-06, "loss": 0.1263, "mean_token_accuracy": 0.9664340995252132, "num_tokens": 6044424.0, "step": 340 }, { "entropy": 0.663570727620806, "epoch": 1.0234174899377972, "grad_norm": 1.53125, "learning_rate": 7.85761928053969e-06, "loss": 0.106, "mean_token_accuracy": 0.971852526262209, "num_tokens": 6217116.0, "step": 350 }, { "entropy": 0.6997070100158453, "epoch": 1.0526893523600438, "grad_norm": 2.984375, "learning_rate": 7.72409496253747e-06, "loss": 0.119, "mean_token_accuracy": 0.9707389414310456, "num_tokens": 6387512.0, "step": 360 }, { "entropy": 0.628655917569995, "epoch": 1.0819612147822906, "grad_norm": 2.96875, "learning_rate": 7.5877484635900876e-06, "loss": 0.1217, "mean_token_accuracy": 0.9687882207334042, "num_tokens": 6573367.0, "step": 370 }, { "entropy": 0.6477397110313177, "epoch": 1.1112330772045371, "grad_norm": 2.765625, "learning_rate": 7.448721039599616e-06, "loss": 0.131, "mean_token_accuracy": 0.9669771507382393, "num_tokens": 6751810.0, "step": 380 }, { "entropy": 0.61260422822088, "epoch": 1.1405049396267837, "grad_norm": 5.6875, "learning_rate": 7.307156723924742e-06, "loss": 0.1079, "mean_token_accuracy": 0.9710600562393665, "num_tokens": 6940710.0, "step": 390 }, { "entropy": 0.6250462032854557, "epoch": 1.1697768020490305, "grad_norm": 2.625, "learning_rate": 7.1632021781612305e-06, "loss": 0.1033, "mean_token_accuracy": 0.9722936369478703, "num_tokens": 7120616.0, "step": 400 }, { "entropy": 0.6539317451417446, "epoch": 1.199048664471277, "grad_norm": 2.484375, "learning_rate": 7.017006540199501e-06, "loss": 0.1275, "mean_token_accuracy": 0.9674237795174122, "num_tokens": 7295346.0, "step": 410 }, { "entropy": 0.7194237690418959, "epoch": 1.2283205268935236, "grad_norm": 7.84375, "learning_rate": 6.8687212697167685e-06, "loss": 0.1547, "mean_token_accuracy": 0.9618344724178314, "num_tokens": 7473599.0, "step": 420 }, { "entropy": 0.658033623546362, "epoch": 1.25759238931577, "grad_norm": 2.703125, "learning_rate": 6.718499991263776e-06, "loss": 0.1033, "mean_token_accuracy": 0.9726057484745979, "num_tokens": 7666324.0, "step": 430 }, { "entropy": 0.6714454110711813, "epoch": 1.2868642517380169, "grad_norm": 3.046875, "learning_rate": 6.566498335108719e-06, "loss": 0.1742, "mean_token_accuracy": 0.9564896896481514, "num_tokens": 7842308.0, "step": 440 }, { "entropy": 0.6405692713335156, "epoch": 1.3161361141602634, "grad_norm": 1.6015625, "learning_rate": 6.412873776003224e-06, "loss": 0.1023, "mean_token_accuracy": 0.9734247334301471, "num_tokens": 8027201.0, "step": 450 }, { "entropy": 0.6420178588479757, "epoch": 1.34540797658251, "grad_norm": 2.4375, "learning_rate": 6.2577854700374326e-06, "loss": 0.0912, "mean_token_accuracy": 0.973945663869381, "num_tokens": 8198350.0, "step": 460 }, { "entropy": 0.6285538610070944, "epoch": 1.3746798390047568, "grad_norm": 2.578125, "learning_rate": 6.101394089753215e-06, "loss": 0.1143, "mean_token_accuracy": 0.9697877489030361, "num_tokens": 8384460.0, "step": 470 }, { "entropy": 0.6119630802422762, "epoch": 1.4039517014270033, "grad_norm": 2.71875, "learning_rate": 5.9438616576863085e-06, "loss": 0.1016, "mean_token_accuracy": 0.9731581903994083, "num_tokens": 8555391.0, "step": 480 }, { "entropy": 0.6343989443033934, "epoch": 1.4332235638492499, "grad_norm": 3.40625, "learning_rate": 5.785351378509875e-06, "loss": 0.112, "mean_token_accuracy": 0.970611660182476, "num_tokens": 8734253.0, "step": 490 }, { "entropy": 0.674339522048831, "epoch": 1.4624954262714964, "grad_norm": 6.125, "learning_rate": 5.626027469953345e-06, "loss": 0.1395, "mean_token_accuracy": 0.9643946584314108, "num_tokens": 8910777.0, "step": 500 }, { "entropy": 0.665541959553957, "epoch": 1.4917672886937432, "grad_norm": 6.9375, "learning_rate": 5.466054992671736e-06, "loss": 0.1391, "mean_token_accuracy": 0.9648671910166741, "num_tokens": 9079044.0, "step": 510 }, { "entropy": 0.6814688537269831, "epoch": 1.5210391511159898, "grad_norm": 3.96875, "learning_rate": 5.3055996792416795e-06, "loss": 0.1071, "mean_token_accuracy": 0.9721961826086044, "num_tokens": 9256503.0, "step": 520 }, { "entropy": 0.699323944374919, "epoch": 1.5503110135382365, "grad_norm": 2.375, "learning_rate": 5.14482776246135e-06, "loss": 0.1457, "mean_token_accuracy": 0.9631756335496903, "num_tokens": 9427931.0, "step": 530 }, { "entropy": 0.6563603695482015, "epoch": 1.579582875960483, "grad_norm": 5.8125, "learning_rate": 4.9839058031321454e-06, "loss": 0.0932, "mean_token_accuracy": 0.9753052346408367, "num_tokens": 9605378.0, "step": 540 }, { "entropy": 0.6394913710653782, "epoch": 1.6088547383827296, "grad_norm": 3.015625, "learning_rate": 4.8230005175005765e-06, "loss": 0.0978, "mean_token_accuracy": 0.9747575528919696, "num_tokens": 9790877.0, "step": 550 }, { "entropy": 0.6210989141836762, "epoch": 1.6381266008049762, "grad_norm": 4.71875, "learning_rate": 4.66227860453908e-06, "loss": 0.1128, "mean_token_accuracy": 0.9707565441727638, "num_tokens": 9979162.0, "step": 560 }, { "entropy": 0.6766301516443491, "epoch": 1.6673984632272227, "grad_norm": 3.65625, "learning_rate": 4.5019065732447596e-06, "loss": 0.1219, "mean_token_accuracy": 0.9683363229036331, "num_tokens": 10153683.0, "step": 570 }, { "entropy": 0.6057208560407161, "epoch": 1.6966703256494693, "grad_norm": 6.625, "learning_rate": 4.342050570134933e-06, "loss": 0.1118, "mean_token_accuracy": 0.9711062803864479, "num_tokens": 10343239.0, "step": 580 }, { "entropy": 0.6616282057017088, "epoch": 1.725942188071716, "grad_norm": 3.328125, "learning_rate": 4.1828762071181924e-06, "loss": 0.1256, "mean_token_accuracy": 0.967843408882618, "num_tokens": 10510567.0, "step": 590 }, { "entropy": 0.6358083071187138, "epoch": 1.7552140504939628, "grad_norm": 2.828125, "learning_rate": 4.02454838991936e-06, "loss": 0.1111, "mean_token_accuracy": 0.9699119538068771, "num_tokens": 10693481.0, "step": 600 }, { "entropy": 0.5980179835110903, "epoch": 1.7844859129162094, "grad_norm": 5.03125, "learning_rate": 3.86723114723601e-06, "loss": 0.0939, "mean_token_accuracy": 0.9751992784440517, "num_tokens": 10872219.0, "step": 610 }, { "entropy": 0.6339624278247357, "epoch": 1.813757775338456, "grad_norm": 2.609375, "learning_rate": 3.7110874608036375e-06, "loss": 0.1029, "mean_token_accuracy": 0.972338755428791, "num_tokens": 11041833.0, "step": 620 }, { "entropy": 0.6484420213848352, "epoch": 1.8430296377607025, "grad_norm": 3.9375, "learning_rate": 3.556279096545467e-06, "loss": 0.0949, "mean_token_accuracy": 0.9737608321011066, "num_tokens": 11214369.0, "step": 630 }, { "entropy": 0.640478839725256, "epoch": 1.872301500182949, "grad_norm": 2.9375, "learning_rate": 3.4029664369818426e-06, "loss": 0.1418, "mean_token_accuracy": 0.9651787281036377, "num_tokens": 11386576.0, "step": 640 }, { "entropy": 0.631079326197505, "epoch": 1.9015733626051956, "grad_norm": 2.125, "learning_rate": 3.251308315072862e-06, "loss": 0.1073, "mean_token_accuracy": 0.9708538435399532, "num_tokens": 11560738.0, "step": 650 }, { "entropy": 0.6620105486363173, "epoch": 1.9308452250274424, "grad_norm": 2.5625, "learning_rate": 3.1014618496663298e-06, "loss": 0.1672, "mean_token_accuracy": 0.959955221414566, "num_tokens": 11742855.0, "step": 660 }, { "entropy": 0.6634238740429282, "epoch": 1.960117087449689, "grad_norm": 5.3125, "learning_rate": 2.9535822827215686e-06, "loss": 0.094, "mean_token_accuracy": 0.974735701829195, "num_tokens": 11920135.0, "step": 670 }, { "entropy": 0.6778149709105492, "epoch": 1.9893889498719357, "grad_norm": 3.8125, "learning_rate": 2.8078228184776974e-06, "loss": 0.1222, "mean_token_accuracy": 0.9682413943111896, "num_tokens": 12088155.0, "step": 680 }, { "entropy": 0.624413115637643, "epoch": 2.017563117453348, "grad_norm": 2.0625, "learning_rate": 2.6643344647329784e-06, "loss": 0.1305, "mean_token_accuracy": 0.965500292839942, "num_tokens": 12273212.0, "step": 690 }, { "entropy": 0.6615780189633369, "epoch": 2.0468349798755945, "grad_norm": 3.0625, "learning_rate": 2.523265876399731e-06, "loss": 0.1302, "mean_token_accuracy": 0.9653139889240265, "num_tokens": 12446204.0, "step": 700 }, { "entropy": 0.6647442825138569, "epoch": 2.076106842297841, "grad_norm": 3.125, "learning_rate": 2.384763201496809e-06, "loss": 0.1186, "mean_token_accuracy": 0.9683853723108768, "num_tokens": 12619520.0, "step": 710 }, { "entropy": 0.6487895751371979, "epoch": 2.1053787047200876, "grad_norm": 5.625, "learning_rate": 2.248969929739273e-06, "loss": 0.1201, "mean_token_accuracy": 0.9684486784040928, "num_tokens": 12790301.0, "step": 720 }, { "entropy": 0.6803991423919797, "epoch": 2.1346505671423346, "grad_norm": 2.828125, "learning_rate": 2.1160267438820585e-06, "loss": 0.1352, "mean_token_accuracy": 0.9661958761513233, "num_tokens": 12962085.0, "step": 730 }, { "entropy": 0.640974473580718, "epoch": 2.163922429564581, "grad_norm": 2.109375, "learning_rate": 1.9860713739716846e-06, "loss": 0.0973, "mean_token_accuracy": 0.9733841702342033, "num_tokens": 13136263.0, "step": 740 }, { "entropy": 0.6351502992212772, "epoch": 2.1931942919868277, "grad_norm": 2.78125, "learning_rate": 1.8592384546569897e-06, "loss": 0.1025, "mean_token_accuracy": 0.9729525096714496, "num_tokens": 13322241.0, "step": 750 }, { "entropy": 0.6530712179839611, "epoch": 2.2224661544090742, "grad_norm": 3.0625, "learning_rate": 1.7356593857067161e-06, "loss": 0.1037, "mean_token_accuracy": 0.9716600969433784, "num_tokens": 13495808.0, "step": 760 }, { "entropy": 0.6783195801079274, "epoch": 2.251738016831321, "grad_norm": 2.078125, "learning_rate": 1.6154621958784522e-06, "loss": 0.1281, "mean_token_accuracy": 0.9669503092765808, "num_tokens": 13663184.0, "step": 770 }, { "entropy": 0.6459674347192049, "epoch": 2.2810098792535674, "grad_norm": 4.125, "learning_rate": 1.4987714102799755e-06, "loss": 0.1225, "mean_token_accuracy": 0.9684602275490761, "num_tokens": 13846527.0, "step": 780 }, { "entropy": 0.6219800597056746, "epoch": 2.310281741675814, "grad_norm": 2.28125, "learning_rate": 1.385707921360379e-06, "loss": 0.1018, "mean_token_accuracy": 0.9725745670497418, "num_tokens": 14040306.0, "step": 790 }, { "entropy": 0.6727650195360184, "epoch": 2.339553604098061, "grad_norm": 5.875, "learning_rate": 1.2763888636646838e-06, "loss": 0.1039, "mean_token_accuracy": 0.9730930998921394, "num_tokens": 14211656.0, "step": 800 } ], "logging_steps": 10, "max_steps": 1026, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.546803296916603e+17, "train_batch_size": 1, "trial_name": null, "trial_params": null }