| { |
| "best_global_step": null, |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 1.4624954262714964, |
| "eval_steps": 500, |
| "global_step": 500, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "entropy": 0.7402557095512747, |
| "epoch": 0.029271862422246615, |
| "grad_norm": 29.125, |
| "learning_rate": 1.8000000000000001e-06, |
| "loss": 0.8764, |
| "mean_token_accuracy": 0.8633198849856853, |
| "num_tokens": 178219.0, |
| "step": 10 |
| }, |
| { |
| "entropy": 0.6707062933593988, |
| "epoch": 0.05854372484449323, |
| "grad_norm": 29.625, |
| "learning_rate": 3.8000000000000005e-06, |
| "loss": 0.7056, |
| "mean_token_accuracy": 0.8833413228392601, |
| "num_tokens": 364483.0, |
| "step": 20 |
| }, |
| { |
| "entropy": 0.6742518592625857, |
| "epoch": 0.08781558726673985, |
| "grad_norm": 8.4375, |
| "learning_rate": 5.8e-06, |
| "loss": 0.4525, |
| "mean_token_accuracy": 0.9048154704272747, |
| "num_tokens": 544552.0, |
| "step": 30 |
| }, |
| { |
| "entropy": 0.7069467414170504, |
| "epoch": 0.11708744968898646, |
| "grad_norm": 10.0, |
| "learning_rate": 7.800000000000002e-06, |
| "loss": 0.29, |
| "mean_token_accuracy": 0.9340683862566947, |
| "num_tokens": 717853.0, |
| "step": 40 |
| }, |
| { |
| "entropy": 0.6330054465681314, |
| "epoch": 0.14635931211123307, |
| "grad_norm": 2.453125, |
| "learning_rate": 9.800000000000001e-06, |
| "loss": 0.1453, |
| "mean_token_accuracy": 0.9656811647117138, |
| "num_tokens": 892769.0, |
| "step": 50 |
| }, |
| { |
| "entropy": 0.6911797292530537, |
| "epoch": 0.1756311745334797, |
| "grad_norm": 4.71875, |
| "learning_rate": 9.997902051783373e-06, |
| "loss": 0.1719, |
| "mean_token_accuracy": 0.9569758839905262, |
| "num_tokens": 1062544.0, |
| "step": 60 |
| }, |
| { |
| "entropy": 0.658262687176466, |
| "epoch": 0.20490303695572631, |
| "grad_norm": 8.9375, |
| "learning_rate": 9.990652145366113e-06, |
| "loss": 0.1144, |
| "mean_token_accuracy": 0.971483013778925, |
| "num_tokens": 1238635.0, |
| "step": 70 |
| }, |
| { |
| "entropy": 0.6544046986848115, |
| "epoch": 0.23417489937797292, |
| "grad_norm": 11.125, |
| "learning_rate": 9.978231889316302e-06, |
| "loss": 0.117, |
| "mean_token_accuracy": 0.9697644971311092, |
| "num_tokens": 1422527.0, |
| "step": 80 |
| }, |
| { |
| "entropy": 0.6321761136874556, |
| "epoch": 0.26344676180021953, |
| "grad_norm": 5.8125, |
| "learning_rate": 9.960654151103846e-06, |
| "loss": 0.1172, |
| "mean_token_accuracy": 0.9699941977858544, |
| "num_tokens": 1604412.0, |
| "step": 90 |
| }, |
| { |
| "entropy": 0.6520156674087048, |
| "epoch": 0.29271862422246614, |
| "grad_norm": 2.75, |
| "learning_rate": 9.937937141385323e-06, |
| "loss": 0.127, |
| "mean_token_accuracy": 0.9669150829315185, |
| "num_tokens": 1785808.0, |
| "step": 100 |
| }, |
| { |
| "entropy": 0.7009117918089032, |
| "epoch": 0.32199048664471275, |
| "grad_norm": 4.5625, |
| "learning_rate": 9.91010439513761e-06, |
| "loss": 0.166, |
| "mean_token_accuracy": 0.9596201993525029, |
| "num_tokens": 1963662.0, |
| "step": 110 |
| }, |
| { |
| "entropy": 0.699804374948144, |
| "epoch": 0.3512623490669594, |
| "grad_norm": 4.5, |
| "learning_rate": 9.87718474727549e-06, |
| "loss": 0.1242, |
| "mean_token_accuracy": 0.9688111513853073, |
| "num_tokens": 2138861.0, |
| "step": 120 |
| }, |
| { |
| "entropy": 0.6593301335349679, |
| "epoch": 0.380534211489206, |
| "grad_norm": 3.828125, |
| "learning_rate": 9.839212302778493e-06, |
| "loss": 0.0984, |
| "mean_token_accuracy": 0.9735250800848008, |
| "num_tokens": 2314376.0, |
| "step": 130 |
| }, |
| { |
| "entropy": 0.6733343230560422, |
| "epoch": 0.40980607391145263, |
| "grad_norm": 4.03125, |
| "learning_rate": 9.796226401357884e-06, |
| "loss": 0.1292, |
| "mean_token_accuracy": 0.9677626974880695, |
| "num_tokens": 2494288.0, |
| "step": 140 |
| }, |
| { |
| "entropy": 0.6483910661190748, |
| "epoch": 0.43907793633369924, |
| "grad_norm": 9.3125, |
| "learning_rate": 9.748271576700476e-06, |
| "loss": 0.1171, |
| "mean_token_accuracy": 0.9706188321113587, |
| "num_tokens": 2682554.0, |
| "step": 150 |
| }, |
| { |
| "entropy": 0.6520246665924787, |
| "epoch": 0.46834979875594585, |
| "grad_norm": 3.15625, |
| "learning_rate": 9.69539751033141e-06, |
| "loss": 0.1038, |
| "mean_token_accuracy": 0.9712590672075748, |
| "num_tokens": 2855437.0, |
| "step": 160 |
| }, |
| { |
| "entropy": 0.6615954734385013, |
| "epoch": 0.49762166117819245, |
| "grad_norm": 7.0625, |
| "learning_rate": 9.637658980143771e-06, |
| "loss": 0.0976, |
| "mean_token_accuracy": 0.9749271534383297, |
| "num_tokens": 3034970.0, |
| "step": 170 |
| }, |
| { |
| "entropy": 0.6598344139754773, |
| "epoch": 0.5268935236004391, |
| "grad_norm": 3.59375, |
| "learning_rate": 9.575115803648303e-06, |
| "loss": 0.1018, |
| "mean_token_accuracy": 0.9722783856093884, |
| "num_tokens": 3208412.0, |
| "step": 180 |
| }, |
| { |
| "entropy": 0.681413133814931, |
| "epoch": 0.5561653860226857, |
| "grad_norm": 3.78125, |
| "learning_rate": 9.507832776002069e-06, |
| "loss": 0.106, |
| "mean_token_accuracy": 0.9714638628065586, |
| "num_tokens": 3395494.0, |
| "step": 190 |
| }, |
| { |
| "entropy": 0.7187394430860877, |
| "epoch": 0.5854372484449323, |
| "grad_norm": 2.65625, |
| "learning_rate": 9.43587960288023e-06, |
| "loss": 0.1434, |
| "mean_token_accuracy": 0.9640420243144036, |
| "num_tokens": 3573674.0, |
| "step": 200 |
| }, |
| { |
| "entropy": 0.7018351562321186, |
| "epoch": 0.6147091108671789, |
| "grad_norm": 4.9375, |
| "learning_rate": 9.359330828260477e-06, |
| "loss": 0.1488, |
| "mean_token_accuracy": 0.9625116638839245, |
| "num_tokens": 3749797.0, |
| "step": 210 |
| }, |
| { |
| "entropy": 0.6656203528866171, |
| "epoch": 0.6439809732894255, |
| "grad_norm": 22.0, |
| "learning_rate": 9.278265757194983e-06, |
| "loss": 0.1026, |
| "mean_token_accuracy": 0.9728666849434375, |
| "num_tokens": 3927226.0, |
| "step": 220 |
| }, |
| { |
| "entropy": 0.7618660530075431, |
| "epoch": 0.6732528357116722, |
| "grad_norm": 7.3125, |
| "learning_rate": 9.1927683736498e-06, |
| "loss": 0.1411, |
| "mean_token_accuracy": 0.962882998585701, |
| "num_tokens": 4107295.0, |
| "step": 230 |
| }, |
| { |
| "entropy": 0.6730721700936556, |
| "epoch": 0.7025246981339188, |
| "grad_norm": 11.0, |
| "learning_rate": 9.102927253496926e-06, |
| "loss": 0.1263, |
| "mean_token_accuracy": 0.9679374843835831, |
| "num_tokens": 4286908.0, |
| "step": 240 |
| }, |
| { |
| "entropy": 0.6370995994657278, |
| "epoch": 0.7317965605561654, |
| "grad_norm": 6.21875, |
| "learning_rate": 9.008835472749085e-06, |
| "loss": 0.0971, |
| "mean_token_accuracy": 0.9734279833734035, |
| "num_tokens": 4467461.0, |
| "step": 250 |
| }, |
| { |
| "entropy": 0.626228180155158, |
| "epoch": 0.761068422978412, |
| "grad_norm": 2.90625, |
| "learning_rate": 8.910590511132339e-06, |
| "loss": 0.0872, |
| "mean_token_accuracy": 0.976653154194355, |
| "num_tokens": 4646737.0, |
| "step": 260 |
| }, |
| { |
| "entropy": 0.6526400525122881, |
| "epoch": 0.7903402854006586, |
| "grad_norm": 8.5625, |
| "learning_rate": 8.808294151096436e-06, |
| "loss": 0.1004, |
| "mean_token_accuracy": 0.9733942933380604, |
| "num_tokens": 4821010.0, |
| "step": 270 |
| }, |
| { |
| "entropy": 0.5949247144162655, |
| "epoch": 0.8196121478229053, |
| "grad_norm": 4.625, |
| "learning_rate": 8.702052372367496e-06, |
| "loss": 0.0946, |
| "mean_token_accuracy": 0.9757313847541809, |
| "num_tokens": 4999440.0, |
| "step": 280 |
| }, |
| { |
| "entropy": 0.6587195005267859, |
| "epoch": 0.8488840102451518, |
| "grad_norm": 9.75, |
| "learning_rate": 8.591975242152293e-06, |
| "loss": 0.1173, |
| "mean_token_accuracy": 0.9692915640771389, |
| "num_tokens": 5171265.0, |
| "step": 290 |
| }, |
| { |
| "entropy": 0.6832986019551754, |
| "epoch": 0.8781558726673985, |
| "grad_norm": 9.75, |
| "learning_rate": 8.478176801107872e-06, |
| "loss": 0.132, |
| "mean_token_accuracy": 0.9648732647299767, |
| "num_tokens": 5338761.0, |
| "step": 300 |
| }, |
| { |
| "entropy": 0.6479978006333112, |
| "epoch": 0.907427735089645, |
| "grad_norm": 6.59375, |
| "learning_rate": 8.360774945194666e-06, |
| "loss": 0.1102, |
| "mean_token_accuracy": 0.9708870485424995, |
| "num_tokens": 5515062.0, |
| "step": 310 |
| }, |
| { |
| "entropy": 0.6245086956769228, |
| "epoch": 0.9366995975118917, |
| "grad_norm": 33.75, |
| "learning_rate": 8.239891303535457e-06, |
| "loss": 0.1039, |
| "mean_token_accuracy": 0.9724696420133114, |
| "num_tokens": 5696911.0, |
| "step": 320 |
| }, |
| { |
| "entropy": 0.6312450472265482, |
| "epoch": 0.9659714599341384, |
| "grad_norm": 6.3125, |
| "learning_rate": 8.1156511124068e-06, |
| "loss": 0.0879, |
| "mean_token_accuracy": 0.9753462255001069, |
| "num_tokens": 5869301.0, |
| "step": 330 |
| }, |
| { |
| "entropy": 0.6530843697488308, |
| "epoch": 0.9952433223563849, |
| "grad_norm": 5.40625, |
| "learning_rate": 7.988183085493362e-06, |
| "loss": 0.1105, |
| "mean_token_accuracy": 0.969614065438509, |
| "num_tokens": 6044424.0, |
| "step": 340 |
| }, |
| { |
| "entropy": 0.6368226534747458, |
| "epoch": 1.0234174899377972, |
| "grad_norm": 2.8125, |
| "learning_rate": 7.85761928053969e-06, |
| "loss": 0.0885, |
| "mean_token_accuracy": 0.9756075329594798, |
| "num_tokens": 6217116.0, |
| "step": 350 |
| }, |
| { |
| "entropy": 0.6533633038401604, |
| "epoch": 1.0526893523600438, |
| "grad_norm": 4.09375, |
| "learning_rate": 7.72409496253747e-06, |
| "loss": 0.1008, |
| "mean_token_accuracy": 0.9739545792341232, |
| "num_tokens": 6387512.0, |
| "step": 360 |
| }, |
| { |
| "entropy": 0.6239726161584258, |
| "epoch": 1.0819612147822906, |
| "grad_norm": 3.421875, |
| "learning_rate": 7.5877484635900876e-06, |
| "loss": 0.1032, |
| "mean_token_accuracy": 0.9725366532802582, |
| "num_tokens": 6573367.0, |
| "step": 370 |
| }, |
| { |
| "entropy": 0.6464782979339361, |
| "epoch": 1.1112330772045371, |
| "grad_norm": 2.640625, |
| "learning_rate": 7.448721039599616e-06, |
| "loss": 0.1105, |
| "mean_token_accuracy": 0.9710234113037586, |
| "num_tokens": 6751810.0, |
| "step": 380 |
| }, |
| { |
| "entropy": 0.6139115165919066, |
| "epoch": 1.1405049396267837, |
| "grad_norm": 7.84375, |
| "learning_rate": 7.307156723924742e-06, |
| "loss": 0.0907, |
| "mean_token_accuracy": 0.9750210918486119, |
| "num_tokens": 6940710.0, |
| "step": 390 |
| }, |
| { |
| "entropy": 0.6495381936430931, |
| "epoch": 1.1697768020490305, |
| "grad_norm": 2.578125, |
| "learning_rate": 7.1632021781612305e-06, |
| "loss": 0.0888, |
| "mean_token_accuracy": 0.9754416085779667, |
| "num_tokens": 7120616.0, |
| "step": 400 |
| }, |
| { |
| "entropy": 0.6580816496163606, |
| "epoch": 1.199048664471277, |
| "grad_norm": 4.53125, |
| "learning_rate": 7.017006540199501e-06, |
| "loss": 0.1054, |
| "mean_token_accuracy": 0.9714232549071312, |
| "num_tokens": 7295346.0, |
| "step": 410 |
| }, |
| { |
| "entropy": 0.6860831430181861, |
| "epoch": 1.2283205268935236, |
| "grad_norm": 39.75, |
| "learning_rate": 6.8687212697167685e-06, |
| "loss": 0.1315, |
| "mean_token_accuracy": 0.9658499620854855, |
| "num_tokens": 7473599.0, |
| "step": 420 |
| }, |
| { |
| "entropy": 0.6426566727459431, |
| "epoch": 1.25759238931577, |
| "grad_norm": 2.640625, |
| "learning_rate": 6.718499991263776e-06, |
| "loss": 0.0855, |
| "mean_token_accuracy": 0.9763770438730717, |
| "num_tokens": 7666324.0, |
| "step": 430 |
| }, |
| { |
| "entropy": 0.6955473996698857, |
| "epoch": 1.2868642517380169, |
| "grad_norm": 51.75, |
| "learning_rate": 6.566498335108719e-06, |
| "loss": 0.148, |
| "mean_token_accuracy": 0.9621038816869258, |
| "num_tokens": 7842308.0, |
| "step": 440 |
| }, |
| { |
| "entropy": 0.6305533852428198, |
| "epoch": 1.3161361141602634, |
| "grad_norm": 6.84375, |
| "learning_rate": 6.412873776003224e-06, |
| "loss": 0.0829, |
| "mean_token_accuracy": 0.9776231050491333, |
| "num_tokens": 8027201.0, |
| "step": 450 |
| }, |
| { |
| "entropy": 0.6824685353785753, |
| "epoch": 1.34540797658251, |
| "grad_norm": 11.0, |
| "learning_rate": 6.2577854700374326e-06, |
| "loss": 0.0748, |
| "mean_token_accuracy": 0.9776533439755439, |
| "num_tokens": 8198350.0, |
| "step": 460 |
| }, |
| { |
| "entropy": 0.6491625333204866, |
| "epoch": 1.3746798390047568, |
| "grad_norm": 2.9375, |
| "learning_rate": 6.101394089753215e-06, |
| "loss": 0.096, |
| "mean_token_accuracy": 0.9740288414061069, |
| "num_tokens": 8384460.0, |
| "step": 470 |
| }, |
| { |
| "entropy": 0.6950589299201966, |
| "epoch": 1.4039517014270033, |
| "grad_norm": 2.265625, |
| "learning_rate": 5.9438616576863085e-06, |
| "loss": 0.0866, |
| "mean_token_accuracy": 0.9760893404483795, |
| "num_tokens": 8555391.0, |
| "step": 480 |
| }, |
| { |
| "entropy": 0.681521375477314, |
| "epoch": 1.4332235638492499, |
| "grad_norm": 8.375, |
| "learning_rate": 5.785351378509875e-06, |
| "loss": 0.0922, |
| "mean_token_accuracy": 0.9748925127089023, |
| "num_tokens": 8734253.0, |
| "step": 490 |
| }, |
| { |
| "entropy": 0.7467833517119289, |
| "epoch": 1.4624954262714964, |
| "grad_norm": 5.125, |
| "learning_rate": 5.626027469953345e-06, |
| "loss": 0.1173, |
| "mean_token_accuracy": 0.9686291612684726, |
| "num_tokens": 8910777.0, |
| "step": 500 |
| } |
| ], |
| "logging_steps": 10, |
| "max_steps": 1026, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 3, |
| "save_steps": 100, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": false |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 2.1634477360256102e+17, |
| "train_batch_size": 1, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|