{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 4.0, "eval_steps": 160, "global_step": 800, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.05, "grad_norm": 0.43827545642852783, "learning_rate": 0.0001, "loss": 0.9667, "mean_token_accuracy": 0.7421795547008514, "num_tokens": 18527.0, "step": 10 }, { "epoch": 0.1, "grad_norm": 0.39240607619285583, "learning_rate": 0.0001, "loss": 1.0018, "mean_token_accuracy": 0.728947314620018, "num_tokens": 37412.0, "step": 20 }, { "epoch": 0.15, "grad_norm": 0.38427555561065674, "learning_rate": 0.0001, "loss": 1.0093, "mean_token_accuracy": 0.7267461150884629, "num_tokens": 56239.0, "step": 30 }, { "epoch": 0.2, "grad_norm": 0.42408332228660583, "learning_rate": 0.0001, "loss": 0.9559, "mean_token_accuracy": 0.7414480298757553, "num_tokens": 75307.0, "step": 40 }, { "epoch": 0.25, "grad_norm": 0.46401479840278625, "learning_rate": 0.0001, "loss": 1.0349, "mean_token_accuracy": 0.7188403815031051, "num_tokens": 93699.0, "step": 50 }, { "epoch": 0.3, "grad_norm": 0.4213346838951111, "learning_rate": 0.0001, "loss": 0.9546, "mean_token_accuracy": 0.7375499308109283, "num_tokens": 112724.0, "step": 60 }, { "epoch": 0.35, "grad_norm": 0.40055790543556213, "learning_rate": 0.0001, "loss": 0.9743, "mean_token_accuracy": 0.7378433406352997, "num_tokens": 131831.0, "step": 70 }, { "epoch": 0.4, "grad_norm": 0.45163631439208984, "learning_rate": 0.0001, "loss": 1.0189, "mean_token_accuracy": 0.7263641938567161, "num_tokens": 151163.0, "step": 80 }, { "epoch": 0.45, "grad_norm": 0.4101933538913727, "learning_rate": 0.0001, "loss": 0.9672, "mean_token_accuracy": 0.7346487194299698, "num_tokens": 169973.0, "step": 90 }, { "epoch": 0.5, "grad_norm": 0.44324934482574463, "learning_rate": 0.0001, "loss": 1.0263, "mean_token_accuracy": 0.7225876390933991, "num_tokens": 188998.0, "step": 100 }, { "epoch": 0.55, "grad_norm": 0.47263675928115845, "learning_rate": 0.0001, "loss": 0.9865, "mean_token_accuracy": 0.7291134670376778, "num_tokens": 207403.0, "step": 110 }, { "epoch": 0.6, "grad_norm": 0.418999046087265, "learning_rate": 0.0001, "loss": 0.9624, "mean_token_accuracy": 0.7337665259838104, "num_tokens": 225944.0, "step": 120 }, { "epoch": 0.65, "grad_norm": 0.48299098014831543, "learning_rate": 0.0001, "loss": 1.0097, "mean_token_accuracy": 0.7260663375258446, "num_tokens": 244001.0, "step": 130 }, { "epoch": 0.7, "grad_norm": 0.45383280515670776, "learning_rate": 0.0001, "loss": 0.9367, "mean_token_accuracy": 0.7450008347630501, "num_tokens": 262380.0, "step": 140 }, { "epoch": 0.75, "grad_norm": 0.3950103521347046, "learning_rate": 0.0001, "loss": 0.9511, "mean_token_accuracy": 0.7413627594709397, "num_tokens": 281319.0, "step": 150 }, { "epoch": 0.8, "grad_norm": 0.5174494981765747, "learning_rate": 0.0001, "loss": 0.8982, "mean_token_accuracy": 0.7503622397780418, "num_tokens": 301270.0, "step": 160 }, { "epoch": 0.8, "eval_loss": 1.0890034437179565, "eval_mean_token_accuracy": 0.7177740011364222, "eval_num_tokens": 301270.0, "eval_runtime": 18.3264, "eval_samples_per_second": 17.461, "eval_steps_per_second": 8.731, "step": 160 }, { "epoch": 0.85, "grad_norm": 0.5073578953742981, "learning_rate": 0.0001, "loss": 0.9498, "mean_token_accuracy": 0.7435929998755455, "num_tokens": 320325.0, "step": 170 }, { "epoch": 0.9, "grad_norm": 0.4688466489315033, "learning_rate": 0.0001, "loss": 0.957, "mean_token_accuracy": 0.7374263137578965, "num_tokens": 339313.0, "step": 180 }, { "epoch": 0.95, "grad_norm": 0.5929768085479736, "learning_rate": 0.0001, "loss": 1.02, "mean_token_accuracy": 0.7251972034573555, "num_tokens": 357224.0, "step": 190 }, { "epoch": 1.0, "grad_norm": 0.5086683630943298, "learning_rate": 0.0001, "loss": 0.992, "mean_token_accuracy": 0.7353695958852768, "num_tokens": 376643.0, "step": 200 }, { "epoch": 1.05, "grad_norm": 0.4679737985134125, "learning_rate": 0.0001, "loss": 0.8852, "mean_token_accuracy": 0.7536615505814552, "num_tokens": 395121.0, "step": 210 }, { "epoch": 1.1, "grad_norm": 0.4789438843727112, "learning_rate": 0.0001, "loss": 0.9377, "mean_token_accuracy": 0.7422916859388351, "num_tokens": 413895.0, "step": 220 }, { "epoch": 1.15, "grad_norm": 0.5487397313117981, "learning_rate": 0.0001, "loss": 0.8488, "mean_token_accuracy": 0.7590507015585899, "num_tokens": 432763.0, "step": 230 }, { "epoch": 1.2, "grad_norm": 0.5337480306625366, "learning_rate": 0.0001, "loss": 0.8666, "mean_token_accuracy": 0.7579806029796601, "num_tokens": 451608.0, "step": 240 }, { "epoch": 1.25, "grad_norm": 0.49510490894317627, "learning_rate": 0.0001, "loss": 0.9104, "mean_token_accuracy": 0.7497919097542762, "num_tokens": 470554.0, "step": 250 }, { "epoch": 1.3, "grad_norm": 0.657990038394928, "learning_rate": 0.0001, "loss": 0.8951, "mean_token_accuracy": 0.7483552470803261, "num_tokens": 488654.0, "step": 260 }, { "epoch": 1.35, "grad_norm": 0.6424926519393921, "learning_rate": 0.0001, "loss": 0.8812, "mean_token_accuracy": 0.7524763226509095, "num_tokens": 507895.0, "step": 270 }, { "epoch": 1.4, "grad_norm": 0.556791365146637, "learning_rate": 0.0001, "loss": 0.8923, "mean_token_accuracy": 0.753830075263977, "num_tokens": 526416.0, "step": 280 }, { "epoch": 1.45, "grad_norm": 0.5518726110458374, "learning_rate": 0.0001, "loss": 0.919, "mean_token_accuracy": 0.7403171643614769, "num_tokens": 545947.0, "step": 290 }, { "epoch": 1.5, "grad_norm": 0.5887808799743652, "learning_rate": 0.0001, "loss": 0.9512, "mean_token_accuracy": 0.741435107588768, "num_tokens": 564070.0, "step": 300 }, { "epoch": 1.55, "grad_norm": 0.4931485056877136, "learning_rate": 0.0001, "loss": 0.8012, "mean_token_accuracy": 0.775481553375721, "num_tokens": 583519.0, "step": 310 }, { "epoch": 1.6, "grad_norm": 0.5791897177696228, "learning_rate": 0.0001, "loss": 0.8834, "mean_token_accuracy": 0.7559772610664368, "num_tokens": 602601.0, "step": 320 }, { "epoch": 1.6, "eval_loss": 1.0932101011276245, "eval_mean_token_accuracy": 0.7169803511351347, "eval_num_tokens": 602601.0, "eval_runtime": 18.3278, "eval_samples_per_second": 17.46, "eval_steps_per_second": 8.73, "step": 320 }, { "epoch": 1.65, "grad_norm": 0.5467931032180786, "learning_rate": 0.0001, "loss": 0.9429, "mean_token_accuracy": 0.7412704437971115, "num_tokens": 621579.0, "step": 330 }, { "epoch": 1.7, "grad_norm": 0.5784661173820496, "learning_rate": 0.0001, "loss": 0.8529, "mean_token_accuracy": 0.7641629755496979, "num_tokens": 640332.0, "step": 340 }, { "epoch": 1.75, "grad_norm": 0.4568150043487549, "learning_rate": 0.0001, "loss": 0.8716, "mean_token_accuracy": 0.7545537993311882, "num_tokens": 659812.0, "step": 350 }, { "epoch": 1.8, "grad_norm": 0.5498734712600708, "learning_rate": 0.0001, "loss": 0.8971, "mean_token_accuracy": 0.752374118566513, "num_tokens": 678799.0, "step": 360 }, { "epoch": 1.85, "grad_norm": 0.5936706066131592, "learning_rate": 0.0001, "loss": 0.9918, "mean_token_accuracy": 0.7298657700419426, "num_tokens": 696438.0, "step": 370 }, { "epoch": 1.9, "grad_norm": 0.5029641389846802, "learning_rate": 0.0001, "loss": 0.9371, "mean_token_accuracy": 0.7457024067640304, "num_tokens": 715459.0, "step": 380 }, { "epoch": 1.95, "grad_norm": 0.4978192448616028, "learning_rate": 0.0001, "loss": 0.9429, "mean_token_accuracy": 0.7435312032699585, "num_tokens": 733927.0, "step": 390 }, { "epoch": 2.0, "grad_norm": 0.5036607384681702, "learning_rate": 0.0001, "loss": 0.9169, "mean_token_accuracy": 0.7488818317651749, "num_tokens": 753286.0, "step": 400 }, { "epoch": 2.05, "grad_norm": 0.6137449741363525, "learning_rate": 0.0001, "loss": 0.7911, "mean_token_accuracy": 0.7788830995559692, "num_tokens": 772442.0, "step": 410 }, { "epoch": 2.1, "grad_norm": 0.5912206172943115, "learning_rate": 0.0001, "loss": 0.7946, "mean_token_accuracy": 0.7781713724136352, "num_tokens": 790693.0, "step": 420 }, { "epoch": 2.15, "grad_norm": 0.6016007661819458, "learning_rate": 0.0001, "loss": 0.7889, "mean_token_accuracy": 0.7735364139080048, "num_tokens": 809966.0, "step": 430 }, { "epoch": 2.2, "grad_norm": 0.5786302089691162, "learning_rate": 0.0001, "loss": 0.8027, "mean_token_accuracy": 0.7732229202985763, "num_tokens": 829851.0, "step": 440 }, { "epoch": 2.25, "grad_norm": 0.5368697047233582, "learning_rate": 0.0001, "loss": 0.8208, "mean_token_accuracy": 0.7645123034715653, "num_tokens": 848464.0, "step": 450 }, { "epoch": 2.3, "grad_norm": 0.712200403213501, "learning_rate": 0.0001, "loss": 0.8047, "mean_token_accuracy": 0.7703322827816009, "num_tokens": 867916.0, "step": 460 }, { "epoch": 2.35, "grad_norm": 0.7731253504753113, "learning_rate": 0.0001, "loss": 0.8328, "mean_token_accuracy": 0.7687120333313942, "num_tokens": 886785.0, "step": 470 }, { "epoch": 2.4, "grad_norm": 0.629067599773407, "learning_rate": 0.0001, "loss": 0.8092, "mean_token_accuracy": 0.7728568613529205, "num_tokens": 905795.0, "step": 480 }, { "epoch": 2.4, "eval_loss": 1.125347375869751, "eval_mean_token_accuracy": 0.711500994861126, "eval_num_tokens": 905795.0, "eval_runtime": 18.3351, "eval_samples_per_second": 17.453, "eval_steps_per_second": 8.726, "step": 480 }, { "epoch": 2.45, "grad_norm": 0.6760151982307434, "learning_rate": 0.0001, "loss": 0.8256, "mean_token_accuracy": 0.7666228592395783, "num_tokens": 925208.0, "step": 490 }, { "epoch": 2.5, "grad_norm": 0.6286083459854126, "learning_rate": 0.0001, "loss": 0.7471, "mean_token_accuracy": 0.7834655106067657, "num_tokens": 944333.0, "step": 500 }, { "epoch": 2.55, "grad_norm": 0.6746655702590942, "learning_rate": 0.0001, "loss": 0.7931, "mean_token_accuracy": 0.7782243847846985, "num_tokens": 962570.0, "step": 510 }, { "epoch": 2.6, "grad_norm": 0.5208497643470764, "learning_rate": 0.0001, "loss": 0.8246, "mean_token_accuracy": 0.7665972113609314, "num_tokens": 981133.0, "step": 520 }, { "epoch": 2.65, "grad_norm": 0.7782342433929443, "learning_rate": 0.0001, "loss": 0.9398, "mean_token_accuracy": 0.743607884645462, "num_tokens": 997998.0, "step": 530 }, { "epoch": 2.7, "grad_norm": 0.6813472509384155, "learning_rate": 0.0001, "loss": 0.777, "mean_token_accuracy": 0.7756537094712257, "num_tokens": 1016188.0, "step": 540 }, { "epoch": 2.75, "grad_norm": 0.6890157461166382, "learning_rate": 0.0001, "loss": 0.8493, "mean_token_accuracy": 0.7614479854702949, "num_tokens": 1035174.0, "step": 550 }, { "epoch": 2.8, "grad_norm": 0.6942604780197144, "learning_rate": 0.0001, "loss": 0.8153, "mean_token_accuracy": 0.7713455215096474, "num_tokens": 1054954.0, "step": 560 }, { "epoch": 2.85, "grad_norm": 0.6722708344459534, "learning_rate": 0.0001, "loss": 0.8811, "mean_token_accuracy": 0.7551201656460762, "num_tokens": 1073025.0, "step": 570 }, { "epoch": 2.9, "grad_norm": 0.6637946963310242, "learning_rate": 0.0001, "loss": 0.7876, "mean_token_accuracy": 0.7747491240501404, "num_tokens": 1092124.0, "step": 580 }, { "epoch": 2.95, "grad_norm": 0.6677461862564087, "learning_rate": 0.0001, "loss": 0.8539, "mean_token_accuracy": 0.762774932384491, "num_tokens": 1110204.0, "step": 590 }, { "epoch": 3.0, "grad_norm": 0.4877650737762451, "learning_rate": 0.0001, "loss": 0.8402, "mean_token_accuracy": 0.7657597422599792, "num_tokens": 1129929.0, "step": 600 }, { "epoch": 3.05, "grad_norm": 0.6816550493240356, "learning_rate": 0.0001, "loss": 0.7319, "mean_token_accuracy": 0.7943529888987542, "num_tokens": 1149086.0, "step": 610 }, { "epoch": 3.1, "grad_norm": 0.7251542210578918, "learning_rate": 0.0001, "loss": 0.7057, "mean_token_accuracy": 0.7959758445620537, "num_tokens": 1168048.0, "step": 620 }, { "epoch": 3.15, "grad_norm": 0.7146134376525879, "learning_rate": 0.0001, "loss": 0.7221, "mean_token_accuracy": 0.7933965891599655, "num_tokens": 1186753.0, "step": 630 }, { "epoch": 3.2, "grad_norm": 0.6925667524337769, "learning_rate": 0.0001, "loss": 0.7238, "mean_token_accuracy": 0.7958982989192009, "num_tokens": 1205471.0, "step": 640 }, { "epoch": 3.2, "eval_loss": 1.1755249500274658, "eval_mean_token_accuracy": 0.7095268920063973, "eval_num_tokens": 1205471.0, "eval_runtime": 18.3277, "eval_samples_per_second": 17.46, "eval_steps_per_second": 8.73, "step": 640 }, { "epoch": 3.25, "grad_norm": 0.6561554074287415, "learning_rate": 0.0001, "loss": 0.7076, "mean_token_accuracy": 0.7985713094472885, "num_tokens": 1225099.0, "step": 650 }, { "epoch": 3.3, "grad_norm": 0.6188570857048035, "learning_rate": 0.0001, "loss": 0.7698, "mean_token_accuracy": 0.7828378319740296, "num_tokens": 1244074.0, "step": 660 }, { "epoch": 3.35, "grad_norm": 0.7147638201713562, "learning_rate": 0.0001, "loss": 0.6592, "mean_token_accuracy": 0.8094674125313759, "num_tokens": 1263781.0, "step": 670 }, { "epoch": 3.4, "grad_norm": 0.761284589767456, "learning_rate": 0.0001, "loss": 0.7407, "mean_token_accuracy": 0.7847389072179795, "num_tokens": 1282554.0, "step": 680 }, { "epoch": 3.45, "grad_norm": 0.7286498546600342, "learning_rate": 0.0001, "loss": 0.7481, "mean_token_accuracy": 0.7902553915977478, "num_tokens": 1300680.0, "step": 690 }, { "epoch": 3.5, "grad_norm": 0.8410418033599854, "learning_rate": 0.0001, "loss": 0.7742, "mean_token_accuracy": 0.7788675397634506, "num_tokens": 1318486.0, "step": 700 }, { "epoch": 3.55, "grad_norm": 0.7173243165016174, "learning_rate": 0.0001, "loss": 0.7424, "mean_token_accuracy": 0.784093214571476, "num_tokens": 1337891.0, "step": 710 }, { "epoch": 3.6, "grad_norm": 0.7642121911048889, "learning_rate": 0.0001, "loss": 0.7925, "mean_token_accuracy": 0.7764820963144302, "num_tokens": 1355920.0, "step": 720 }, { "epoch": 3.65, "grad_norm": 0.5757165551185608, "learning_rate": 0.0001, "loss": 0.7781, "mean_token_accuracy": 0.7775384098291397, "num_tokens": 1374735.0, "step": 730 }, { "epoch": 3.7, "grad_norm": 0.8661559820175171, "learning_rate": 0.0001, "loss": 0.7351, "mean_token_accuracy": 0.7928005129098892, "num_tokens": 1393708.0, "step": 740 }, { "epoch": 3.75, "grad_norm": 0.6611236333847046, "learning_rate": 0.0001, "loss": 0.7353, "mean_token_accuracy": 0.7899313852190971, "num_tokens": 1412985.0, "step": 750 }, { "epoch": 3.8, "grad_norm": 0.787530779838562, "learning_rate": 0.0001, "loss": 0.758, "mean_token_accuracy": 0.7841869756579399, "num_tokens": 1431040.0, "step": 760 }, { "epoch": 3.85, "grad_norm": 0.8499215841293335, "learning_rate": 0.0001, "loss": 0.6919, "mean_token_accuracy": 0.8019102051854133, "num_tokens": 1450564.0, "step": 770 }, { "epoch": 3.9, "grad_norm": 0.6646876335144043, "learning_rate": 0.0001, "loss": 0.7811, "mean_token_accuracy": 0.7774537593126297, "num_tokens": 1468469.0, "step": 780 }, { "epoch": 3.95, "grad_norm": 0.692534863948822, "learning_rate": 0.0001, "loss": 0.6728, "mean_token_accuracy": 0.8075501427054406, "num_tokens": 1487203.0, "step": 790 }, { "epoch": 4.0, "grad_norm": 0.673222541809082, "learning_rate": 0.0001, "loss": 0.6851, "mean_token_accuracy": 0.7982731312513351, "num_tokens": 1506572.0, "step": 800 }, { "epoch": 4.0, "eval_loss": 1.1678171157836914, "eval_mean_token_accuracy": 0.708535186573863, "eval_num_tokens": 1506572.0, "eval_runtime": 18.3587, "eval_samples_per_second": 17.43, "eval_steps_per_second": 8.715, "step": 800 } ], "logging_steps": 10, "max_steps": 800, "num_input_tokens_seen": 0, "num_train_epochs": 4, "save_steps": 160, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 4794283939332096.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }