| { | |
| "best_global_step": null, | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 4.0, | |
| "eval_steps": 160, | |
| "global_step": 800, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.05, | |
| "grad_norm": 0.43827545642852783, | |
| "learning_rate": 0.0001, | |
| "loss": 0.9667, | |
| "mean_token_accuracy": 0.7421795547008514, | |
| "num_tokens": 18527.0, | |
| "step": 10 | |
| }, | |
| { | |
| "epoch": 0.1, | |
| "grad_norm": 0.39240607619285583, | |
| "learning_rate": 0.0001, | |
| "loss": 1.0018, | |
| "mean_token_accuracy": 0.728947314620018, | |
| "num_tokens": 37412.0, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 0.15, | |
| "grad_norm": 0.38427555561065674, | |
| "learning_rate": 0.0001, | |
| "loss": 1.0093, | |
| "mean_token_accuracy": 0.7267461150884629, | |
| "num_tokens": 56239.0, | |
| "step": 30 | |
| }, | |
| { | |
| "epoch": 0.2, | |
| "grad_norm": 0.42408332228660583, | |
| "learning_rate": 0.0001, | |
| "loss": 0.9559, | |
| "mean_token_accuracy": 0.7414480298757553, | |
| "num_tokens": 75307.0, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 0.25, | |
| "grad_norm": 0.46401479840278625, | |
| "learning_rate": 0.0001, | |
| "loss": 1.0349, | |
| "mean_token_accuracy": 0.7188403815031051, | |
| "num_tokens": 93699.0, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.3, | |
| "grad_norm": 0.4213346838951111, | |
| "learning_rate": 0.0001, | |
| "loss": 0.9546, | |
| "mean_token_accuracy": 0.7375499308109283, | |
| "num_tokens": 112724.0, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 0.35, | |
| "grad_norm": 0.40055790543556213, | |
| "learning_rate": 0.0001, | |
| "loss": 0.9743, | |
| "mean_token_accuracy": 0.7378433406352997, | |
| "num_tokens": 131831.0, | |
| "step": 70 | |
| }, | |
| { | |
| "epoch": 0.4, | |
| "grad_norm": 0.45163631439208984, | |
| "learning_rate": 0.0001, | |
| "loss": 1.0189, | |
| "mean_token_accuracy": 0.7263641938567161, | |
| "num_tokens": 151163.0, | |
| "step": 80 | |
| }, | |
| { | |
| "epoch": 0.45, | |
| "grad_norm": 0.4101933538913727, | |
| "learning_rate": 0.0001, | |
| "loss": 0.9672, | |
| "mean_token_accuracy": 0.7346487194299698, | |
| "num_tokens": 169973.0, | |
| "step": 90 | |
| }, | |
| { | |
| "epoch": 0.5, | |
| "grad_norm": 0.44324934482574463, | |
| "learning_rate": 0.0001, | |
| "loss": 1.0263, | |
| "mean_token_accuracy": 0.7225876390933991, | |
| "num_tokens": 188998.0, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.55, | |
| "grad_norm": 0.47263675928115845, | |
| "learning_rate": 0.0001, | |
| "loss": 0.9865, | |
| "mean_token_accuracy": 0.7291134670376778, | |
| "num_tokens": 207403.0, | |
| "step": 110 | |
| }, | |
| { | |
| "epoch": 0.6, | |
| "grad_norm": 0.418999046087265, | |
| "learning_rate": 0.0001, | |
| "loss": 0.9624, | |
| "mean_token_accuracy": 0.7337665259838104, | |
| "num_tokens": 225944.0, | |
| "step": 120 | |
| }, | |
| { | |
| "epoch": 0.65, | |
| "grad_norm": 0.48299098014831543, | |
| "learning_rate": 0.0001, | |
| "loss": 1.0097, | |
| "mean_token_accuracy": 0.7260663375258446, | |
| "num_tokens": 244001.0, | |
| "step": 130 | |
| }, | |
| { | |
| "epoch": 0.7, | |
| "grad_norm": 0.45383280515670776, | |
| "learning_rate": 0.0001, | |
| "loss": 0.9367, | |
| "mean_token_accuracy": 0.7450008347630501, | |
| "num_tokens": 262380.0, | |
| "step": 140 | |
| }, | |
| { | |
| "epoch": 0.75, | |
| "grad_norm": 0.3950103521347046, | |
| "learning_rate": 0.0001, | |
| "loss": 0.9511, | |
| "mean_token_accuracy": 0.7413627594709397, | |
| "num_tokens": 281319.0, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 0.8, | |
| "grad_norm": 0.5174494981765747, | |
| "learning_rate": 0.0001, | |
| "loss": 0.8982, | |
| "mean_token_accuracy": 0.7503622397780418, | |
| "num_tokens": 301270.0, | |
| "step": 160 | |
| }, | |
| { | |
| "epoch": 0.8, | |
| "eval_loss": 1.0890034437179565, | |
| "eval_mean_token_accuracy": 0.7177740011364222, | |
| "eval_num_tokens": 301270.0, | |
| "eval_runtime": 18.3264, | |
| "eval_samples_per_second": 17.461, | |
| "eval_steps_per_second": 8.731, | |
| "step": 160 | |
| }, | |
| { | |
| "epoch": 0.85, | |
| "grad_norm": 0.5073578953742981, | |
| "learning_rate": 0.0001, | |
| "loss": 0.9498, | |
| "mean_token_accuracy": 0.7435929998755455, | |
| "num_tokens": 320325.0, | |
| "step": 170 | |
| }, | |
| { | |
| "epoch": 0.9, | |
| "grad_norm": 0.4688466489315033, | |
| "learning_rate": 0.0001, | |
| "loss": 0.957, | |
| "mean_token_accuracy": 0.7374263137578965, | |
| "num_tokens": 339313.0, | |
| "step": 180 | |
| }, | |
| { | |
| "epoch": 0.95, | |
| "grad_norm": 0.5929768085479736, | |
| "learning_rate": 0.0001, | |
| "loss": 1.02, | |
| "mean_token_accuracy": 0.7251972034573555, | |
| "num_tokens": 357224.0, | |
| "step": 190 | |
| }, | |
| { | |
| "epoch": 1.0, | |
| "grad_norm": 0.5086683630943298, | |
| "learning_rate": 0.0001, | |
| "loss": 0.992, | |
| "mean_token_accuracy": 0.7353695958852768, | |
| "num_tokens": 376643.0, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 1.05, | |
| "grad_norm": 0.4679737985134125, | |
| "learning_rate": 0.0001, | |
| "loss": 0.8852, | |
| "mean_token_accuracy": 0.7536615505814552, | |
| "num_tokens": 395121.0, | |
| "step": 210 | |
| }, | |
| { | |
| "epoch": 1.1, | |
| "grad_norm": 0.4789438843727112, | |
| "learning_rate": 0.0001, | |
| "loss": 0.9377, | |
| "mean_token_accuracy": 0.7422916859388351, | |
| "num_tokens": 413895.0, | |
| "step": 220 | |
| }, | |
| { | |
| "epoch": 1.15, | |
| "grad_norm": 0.5487397313117981, | |
| "learning_rate": 0.0001, | |
| "loss": 0.8488, | |
| "mean_token_accuracy": 0.7590507015585899, | |
| "num_tokens": 432763.0, | |
| "step": 230 | |
| }, | |
| { | |
| "epoch": 1.2, | |
| "grad_norm": 0.5337480306625366, | |
| "learning_rate": 0.0001, | |
| "loss": 0.8666, | |
| "mean_token_accuracy": 0.7579806029796601, | |
| "num_tokens": 451608.0, | |
| "step": 240 | |
| }, | |
| { | |
| "epoch": 1.25, | |
| "grad_norm": 0.49510490894317627, | |
| "learning_rate": 0.0001, | |
| "loss": 0.9104, | |
| "mean_token_accuracy": 0.7497919097542762, | |
| "num_tokens": 470554.0, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 1.3, | |
| "grad_norm": 0.657990038394928, | |
| "learning_rate": 0.0001, | |
| "loss": 0.8951, | |
| "mean_token_accuracy": 0.7483552470803261, | |
| "num_tokens": 488654.0, | |
| "step": 260 | |
| }, | |
| { | |
| "epoch": 1.35, | |
| "grad_norm": 0.6424926519393921, | |
| "learning_rate": 0.0001, | |
| "loss": 0.8812, | |
| "mean_token_accuracy": 0.7524763226509095, | |
| "num_tokens": 507895.0, | |
| "step": 270 | |
| }, | |
| { | |
| "epoch": 1.4, | |
| "grad_norm": 0.556791365146637, | |
| "learning_rate": 0.0001, | |
| "loss": 0.8923, | |
| "mean_token_accuracy": 0.753830075263977, | |
| "num_tokens": 526416.0, | |
| "step": 280 | |
| }, | |
| { | |
| "epoch": 1.45, | |
| "grad_norm": 0.5518726110458374, | |
| "learning_rate": 0.0001, | |
| "loss": 0.919, | |
| "mean_token_accuracy": 0.7403171643614769, | |
| "num_tokens": 545947.0, | |
| "step": 290 | |
| }, | |
| { | |
| "epoch": 1.5, | |
| "grad_norm": 0.5887808799743652, | |
| "learning_rate": 0.0001, | |
| "loss": 0.9512, | |
| "mean_token_accuracy": 0.741435107588768, | |
| "num_tokens": 564070.0, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 1.55, | |
| "grad_norm": 0.4931485056877136, | |
| "learning_rate": 0.0001, | |
| "loss": 0.8012, | |
| "mean_token_accuracy": 0.775481553375721, | |
| "num_tokens": 583519.0, | |
| "step": 310 | |
| }, | |
| { | |
| "epoch": 1.6, | |
| "grad_norm": 0.5791897177696228, | |
| "learning_rate": 0.0001, | |
| "loss": 0.8834, | |
| "mean_token_accuracy": 0.7559772610664368, | |
| "num_tokens": 602601.0, | |
| "step": 320 | |
| }, | |
| { | |
| "epoch": 1.6, | |
| "eval_loss": 1.0932101011276245, | |
| "eval_mean_token_accuracy": 0.7169803511351347, | |
| "eval_num_tokens": 602601.0, | |
| "eval_runtime": 18.3278, | |
| "eval_samples_per_second": 17.46, | |
| "eval_steps_per_second": 8.73, | |
| "step": 320 | |
| }, | |
| { | |
| "epoch": 1.65, | |
| "grad_norm": 0.5467931032180786, | |
| "learning_rate": 0.0001, | |
| "loss": 0.9429, | |
| "mean_token_accuracy": 0.7412704437971115, | |
| "num_tokens": 621579.0, | |
| "step": 330 | |
| }, | |
| { | |
| "epoch": 1.7, | |
| "grad_norm": 0.5784661173820496, | |
| "learning_rate": 0.0001, | |
| "loss": 0.8529, | |
| "mean_token_accuracy": 0.7641629755496979, | |
| "num_tokens": 640332.0, | |
| "step": 340 | |
| }, | |
| { | |
| "epoch": 1.75, | |
| "grad_norm": 0.4568150043487549, | |
| "learning_rate": 0.0001, | |
| "loss": 0.8716, | |
| "mean_token_accuracy": 0.7545537993311882, | |
| "num_tokens": 659812.0, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 1.8, | |
| "grad_norm": 0.5498734712600708, | |
| "learning_rate": 0.0001, | |
| "loss": 0.8971, | |
| "mean_token_accuracy": 0.752374118566513, | |
| "num_tokens": 678799.0, | |
| "step": 360 | |
| }, | |
| { | |
| "epoch": 1.85, | |
| "grad_norm": 0.5936706066131592, | |
| "learning_rate": 0.0001, | |
| "loss": 0.9918, | |
| "mean_token_accuracy": 0.7298657700419426, | |
| "num_tokens": 696438.0, | |
| "step": 370 | |
| }, | |
| { | |
| "epoch": 1.9, | |
| "grad_norm": 0.5029641389846802, | |
| "learning_rate": 0.0001, | |
| "loss": 0.9371, | |
| "mean_token_accuracy": 0.7457024067640304, | |
| "num_tokens": 715459.0, | |
| "step": 380 | |
| }, | |
| { | |
| "epoch": 1.95, | |
| "grad_norm": 0.4978192448616028, | |
| "learning_rate": 0.0001, | |
| "loss": 0.9429, | |
| "mean_token_accuracy": 0.7435312032699585, | |
| "num_tokens": 733927.0, | |
| "step": 390 | |
| }, | |
| { | |
| "epoch": 2.0, | |
| "grad_norm": 0.5036607384681702, | |
| "learning_rate": 0.0001, | |
| "loss": 0.9169, | |
| "mean_token_accuracy": 0.7488818317651749, | |
| "num_tokens": 753286.0, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 2.05, | |
| "grad_norm": 0.6137449741363525, | |
| "learning_rate": 0.0001, | |
| "loss": 0.7911, | |
| "mean_token_accuracy": 0.7788830995559692, | |
| "num_tokens": 772442.0, | |
| "step": 410 | |
| }, | |
| { | |
| "epoch": 2.1, | |
| "grad_norm": 0.5912206172943115, | |
| "learning_rate": 0.0001, | |
| "loss": 0.7946, | |
| "mean_token_accuracy": 0.7781713724136352, | |
| "num_tokens": 790693.0, | |
| "step": 420 | |
| }, | |
| { | |
| "epoch": 2.15, | |
| "grad_norm": 0.6016007661819458, | |
| "learning_rate": 0.0001, | |
| "loss": 0.7889, | |
| "mean_token_accuracy": 0.7735364139080048, | |
| "num_tokens": 809966.0, | |
| "step": 430 | |
| }, | |
| { | |
| "epoch": 2.2, | |
| "grad_norm": 0.5786302089691162, | |
| "learning_rate": 0.0001, | |
| "loss": 0.8027, | |
| "mean_token_accuracy": 0.7732229202985763, | |
| "num_tokens": 829851.0, | |
| "step": 440 | |
| }, | |
| { | |
| "epoch": 2.25, | |
| "grad_norm": 0.5368697047233582, | |
| "learning_rate": 0.0001, | |
| "loss": 0.8208, | |
| "mean_token_accuracy": 0.7645123034715653, | |
| "num_tokens": 848464.0, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 2.3, | |
| "grad_norm": 0.712200403213501, | |
| "learning_rate": 0.0001, | |
| "loss": 0.8047, | |
| "mean_token_accuracy": 0.7703322827816009, | |
| "num_tokens": 867916.0, | |
| "step": 460 | |
| }, | |
| { | |
| "epoch": 2.35, | |
| "grad_norm": 0.7731253504753113, | |
| "learning_rate": 0.0001, | |
| "loss": 0.8328, | |
| "mean_token_accuracy": 0.7687120333313942, | |
| "num_tokens": 886785.0, | |
| "step": 470 | |
| }, | |
| { | |
| "epoch": 2.4, | |
| "grad_norm": 0.629067599773407, | |
| "learning_rate": 0.0001, | |
| "loss": 0.8092, | |
| "mean_token_accuracy": 0.7728568613529205, | |
| "num_tokens": 905795.0, | |
| "step": 480 | |
| }, | |
| { | |
| "epoch": 2.4, | |
| "eval_loss": 1.125347375869751, | |
| "eval_mean_token_accuracy": 0.711500994861126, | |
| "eval_num_tokens": 905795.0, | |
| "eval_runtime": 18.3351, | |
| "eval_samples_per_second": 17.453, | |
| "eval_steps_per_second": 8.726, | |
| "step": 480 | |
| }, | |
| { | |
| "epoch": 2.45, | |
| "grad_norm": 0.6760151982307434, | |
| "learning_rate": 0.0001, | |
| "loss": 0.8256, | |
| "mean_token_accuracy": 0.7666228592395783, | |
| "num_tokens": 925208.0, | |
| "step": 490 | |
| }, | |
| { | |
| "epoch": 2.5, | |
| "grad_norm": 0.6286083459854126, | |
| "learning_rate": 0.0001, | |
| "loss": 0.7471, | |
| "mean_token_accuracy": 0.7834655106067657, | |
| "num_tokens": 944333.0, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 2.55, | |
| "grad_norm": 0.6746655702590942, | |
| "learning_rate": 0.0001, | |
| "loss": 0.7931, | |
| "mean_token_accuracy": 0.7782243847846985, | |
| "num_tokens": 962570.0, | |
| "step": 510 | |
| }, | |
| { | |
| "epoch": 2.6, | |
| "grad_norm": 0.5208497643470764, | |
| "learning_rate": 0.0001, | |
| "loss": 0.8246, | |
| "mean_token_accuracy": 0.7665972113609314, | |
| "num_tokens": 981133.0, | |
| "step": 520 | |
| }, | |
| { | |
| "epoch": 2.65, | |
| "grad_norm": 0.7782342433929443, | |
| "learning_rate": 0.0001, | |
| "loss": 0.9398, | |
| "mean_token_accuracy": 0.743607884645462, | |
| "num_tokens": 997998.0, | |
| "step": 530 | |
| }, | |
| { | |
| "epoch": 2.7, | |
| "grad_norm": 0.6813472509384155, | |
| "learning_rate": 0.0001, | |
| "loss": 0.777, | |
| "mean_token_accuracy": 0.7756537094712257, | |
| "num_tokens": 1016188.0, | |
| "step": 540 | |
| }, | |
| { | |
| "epoch": 2.75, | |
| "grad_norm": 0.6890157461166382, | |
| "learning_rate": 0.0001, | |
| "loss": 0.8493, | |
| "mean_token_accuracy": 0.7614479854702949, | |
| "num_tokens": 1035174.0, | |
| "step": 550 | |
| }, | |
| { | |
| "epoch": 2.8, | |
| "grad_norm": 0.6942604780197144, | |
| "learning_rate": 0.0001, | |
| "loss": 0.8153, | |
| "mean_token_accuracy": 0.7713455215096474, | |
| "num_tokens": 1054954.0, | |
| "step": 560 | |
| }, | |
| { | |
| "epoch": 2.85, | |
| "grad_norm": 0.6722708344459534, | |
| "learning_rate": 0.0001, | |
| "loss": 0.8811, | |
| "mean_token_accuracy": 0.7551201656460762, | |
| "num_tokens": 1073025.0, | |
| "step": 570 | |
| }, | |
| { | |
| "epoch": 2.9, | |
| "grad_norm": 0.6637946963310242, | |
| "learning_rate": 0.0001, | |
| "loss": 0.7876, | |
| "mean_token_accuracy": 0.7747491240501404, | |
| "num_tokens": 1092124.0, | |
| "step": 580 | |
| }, | |
| { | |
| "epoch": 2.95, | |
| "grad_norm": 0.6677461862564087, | |
| "learning_rate": 0.0001, | |
| "loss": 0.8539, | |
| "mean_token_accuracy": 0.762774932384491, | |
| "num_tokens": 1110204.0, | |
| "step": 590 | |
| }, | |
| { | |
| "epoch": 3.0, | |
| "grad_norm": 0.4877650737762451, | |
| "learning_rate": 0.0001, | |
| "loss": 0.8402, | |
| "mean_token_accuracy": 0.7657597422599792, | |
| "num_tokens": 1129929.0, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 3.05, | |
| "grad_norm": 0.6816550493240356, | |
| "learning_rate": 0.0001, | |
| "loss": 0.7319, | |
| "mean_token_accuracy": 0.7943529888987542, | |
| "num_tokens": 1149086.0, | |
| "step": 610 | |
| }, | |
| { | |
| "epoch": 3.1, | |
| "grad_norm": 0.7251542210578918, | |
| "learning_rate": 0.0001, | |
| "loss": 0.7057, | |
| "mean_token_accuracy": 0.7959758445620537, | |
| "num_tokens": 1168048.0, | |
| "step": 620 | |
| }, | |
| { | |
| "epoch": 3.15, | |
| "grad_norm": 0.7146134376525879, | |
| "learning_rate": 0.0001, | |
| "loss": 0.7221, | |
| "mean_token_accuracy": 0.7933965891599655, | |
| "num_tokens": 1186753.0, | |
| "step": 630 | |
| }, | |
| { | |
| "epoch": 3.2, | |
| "grad_norm": 0.6925667524337769, | |
| "learning_rate": 0.0001, | |
| "loss": 0.7238, | |
| "mean_token_accuracy": 0.7958982989192009, | |
| "num_tokens": 1205471.0, | |
| "step": 640 | |
| }, | |
| { | |
| "epoch": 3.2, | |
| "eval_loss": 1.1755249500274658, | |
| "eval_mean_token_accuracy": 0.7095268920063973, | |
| "eval_num_tokens": 1205471.0, | |
| "eval_runtime": 18.3277, | |
| "eval_samples_per_second": 17.46, | |
| "eval_steps_per_second": 8.73, | |
| "step": 640 | |
| }, | |
| { | |
| "epoch": 3.25, | |
| "grad_norm": 0.6561554074287415, | |
| "learning_rate": 0.0001, | |
| "loss": 0.7076, | |
| "mean_token_accuracy": 0.7985713094472885, | |
| "num_tokens": 1225099.0, | |
| "step": 650 | |
| }, | |
| { | |
| "epoch": 3.3, | |
| "grad_norm": 0.6188570857048035, | |
| "learning_rate": 0.0001, | |
| "loss": 0.7698, | |
| "mean_token_accuracy": 0.7828378319740296, | |
| "num_tokens": 1244074.0, | |
| "step": 660 | |
| }, | |
| { | |
| "epoch": 3.35, | |
| "grad_norm": 0.7147638201713562, | |
| "learning_rate": 0.0001, | |
| "loss": 0.6592, | |
| "mean_token_accuracy": 0.8094674125313759, | |
| "num_tokens": 1263781.0, | |
| "step": 670 | |
| }, | |
| { | |
| "epoch": 3.4, | |
| "grad_norm": 0.761284589767456, | |
| "learning_rate": 0.0001, | |
| "loss": 0.7407, | |
| "mean_token_accuracy": 0.7847389072179795, | |
| "num_tokens": 1282554.0, | |
| "step": 680 | |
| }, | |
| { | |
| "epoch": 3.45, | |
| "grad_norm": 0.7286498546600342, | |
| "learning_rate": 0.0001, | |
| "loss": 0.7481, | |
| "mean_token_accuracy": 0.7902553915977478, | |
| "num_tokens": 1300680.0, | |
| "step": 690 | |
| }, | |
| { | |
| "epoch": 3.5, | |
| "grad_norm": 0.8410418033599854, | |
| "learning_rate": 0.0001, | |
| "loss": 0.7742, | |
| "mean_token_accuracy": 0.7788675397634506, | |
| "num_tokens": 1318486.0, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 3.55, | |
| "grad_norm": 0.7173243165016174, | |
| "learning_rate": 0.0001, | |
| "loss": 0.7424, | |
| "mean_token_accuracy": 0.784093214571476, | |
| "num_tokens": 1337891.0, | |
| "step": 710 | |
| }, | |
| { | |
| "epoch": 3.6, | |
| "grad_norm": 0.7642121911048889, | |
| "learning_rate": 0.0001, | |
| "loss": 0.7925, | |
| "mean_token_accuracy": 0.7764820963144302, | |
| "num_tokens": 1355920.0, | |
| "step": 720 | |
| }, | |
| { | |
| "epoch": 3.65, | |
| "grad_norm": 0.5757165551185608, | |
| "learning_rate": 0.0001, | |
| "loss": 0.7781, | |
| "mean_token_accuracy": 0.7775384098291397, | |
| "num_tokens": 1374735.0, | |
| "step": 730 | |
| }, | |
| { | |
| "epoch": 3.7, | |
| "grad_norm": 0.8661559820175171, | |
| "learning_rate": 0.0001, | |
| "loss": 0.7351, | |
| "mean_token_accuracy": 0.7928005129098892, | |
| "num_tokens": 1393708.0, | |
| "step": 740 | |
| }, | |
| { | |
| "epoch": 3.75, | |
| "grad_norm": 0.6611236333847046, | |
| "learning_rate": 0.0001, | |
| "loss": 0.7353, | |
| "mean_token_accuracy": 0.7899313852190971, | |
| "num_tokens": 1412985.0, | |
| "step": 750 | |
| }, | |
| { | |
| "epoch": 3.8, | |
| "grad_norm": 0.787530779838562, | |
| "learning_rate": 0.0001, | |
| "loss": 0.758, | |
| "mean_token_accuracy": 0.7841869756579399, | |
| "num_tokens": 1431040.0, | |
| "step": 760 | |
| }, | |
| { | |
| "epoch": 3.85, | |
| "grad_norm": 0.8499215841293335, | |
| "learning_rate": 0.0001, | |
| "loss": 0.6919, | |
| "mean_token_accuracy": 0.8019102051854133, | |
| "num_tokens": 1450564.0, | |
| "step": 770 | |
| }, | |
| { | |
| "epoch": 3.9, | |
| "grad_norm": 0.6646876335144043, | |
| "learning_rate": 0.0001, | |
| "loss": 0.7811, | |
| "mean_token_accuracy": 0.7774537593126297, | |
| "num_tokens": 1468469.0, | |
| "step": 780 | |
| }, | |
| { | |
| "epoch": 3.95, | |
| "grad_norm": 0.692534863948822, | |
| "learning_rate": 0.0001, | |
| "loss": 0.6728, | |
| "mean_token_accuracy": 0.8075501427054406, | |
| "num_tokens": 1487203.0, | |
| "step": 790 | |
| }, | |
| { | |
| "epoch": 4.0, | |
| "grad_norm": 0.673222541809082, | |
| "learning_rate": 0.0001, | |
| "loss": 0.6851, | |
| "mean_token_accuracy": 0.7982731312513351, | |
| "num_tokens": 1506572.0, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 4.0, | |
| "eval_loss": 1.1678171157836914, | |
| "eval_mean_token_accuracy": 0.708535186573863, | |
| "eval_num_tokens": 1506572.0, | |
| "eval_runtime": 18.3587, | |
| "eval_samples_per_second": 17.43, | |
| "eval_steps_per_second": 8.715, | |
| "step": 800 | |
| } | |
| ], | |
| "logging_steps": 10, | |
| "max_steps": 800, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 4, | |
| "save_steps": 160, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 4794283939332096.0, | |
| "train_batch_size": 2, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |